Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor: 유튜브 댓글 크롤링 API response 값 수정(시간, 총 댓글 수 추가) #38

Merged
merged 5 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 53 additions & 49 deletions .platform/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,53 +11,57 @@ events {
}

http {
include /etc/nginx/mime.types;
default_type application/octet-stream;


log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';

include conf.d/*.conf;

map $http_upgrade $connection_upgrade {
default "upgrade";
}

upstream springboot {
server 127.0.0.1:8080;
keepalive 1024;
}

# Default HTTP server (port 80, IPv4 and IPv6) that proxies every request to the `springboot` upstream.
server {
listen 80 default_server;
listen [::]:80 default_server;

location / {
proxy_pass http://springboot;
# Add CORS-related headers
add_header 'Access-Control-Allow-Origin' '*';
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS';
add_header 'Access-Control-Allow-Headers' 'Authorization, Content-Type';
# Use HTTP/1.1 towards the upstream and forward the Upgrade / mapped Connection headers.
proxy_http_version 1.1;
proxy_set_header Connection $connection_upgrade;
proxy_set_header Upgrade $http_upgrade;

# Pass the original host and client address information to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}

access_log /var/log/nginx/access.log main;

client_header_timeout 60;
client_body_timeout 60;
keepalive_timeout 60;
gzip off;
gzip_comp_level 4;

# Include the Elastic Beanstalk generated locations
include conf.d/elasticbeanstalk/healthd.conf;
}
include /etc/nginx/mime.types;
default_type application/octet-stream;

log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';

include conf.d/*.conf;

map $http_upgrade $connection_upgrade {
default "upgrade";
}

upstream springboot {
server 127.0.0.1:8080;
keepalive 1024;
}

# Default HTTP server (port 80, IPv4 and IPv6) that proxies every request to the `springboot` upstream,
# with extended proxy timeouts for long-running requests.
server {
listen 80 default_server;
listen [::]:80 default_server;

location / {
proxy_pass http://springboot;
# Add CORS-related headers
# NOTE(review): without the `always` parameter nginx only adds these headers for a fixed set of
# success/redirect status codes — confirm that error responses do not need CORS headers.
add_header 'Access-Control-Allow-Origin' '*';
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS';
add_header 'Access-Control-Allow-Headers' 'Authorization, Content-Type';
# Use HTTP/1.1 towards the upstream and forward the Upgrade / mapped Connection headers.
proxy_http_version 1.1;
proxy_set_header Connection $connection_upgrade;
proxy_set_header Upgrade $http_upgrade;

# Pass the original host and client address information to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

# Added timeout settings (900s = 15 minutes)
# NOTE(review): the nginx documentation states that proxy_connect_timeout usually cannot
# exceed 75 seconds — confirm that 900s is intended for the connect timeout.
proxy_read_timeout 900s; # Time to wait for a response from the backend server
proxy_connect_timeout 900s; # Time allowed for establishing a connection to the backend server
proxy_send_timeout 900s; # Time allowed for Nginx to transmit the request to the backend server
}

access_log /var/log/nginx/access.log main;

client_header_timeout 60;
client_body_timeout 60;
keepalive_timeout 60;
gzip off;
gzip_comp_level 4;

# Include the Elastic Beanstalk generated locations
include conf.d/elasticbeanstalk/healthd.conf;
}
}
5 changes: 2 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,9 @@ dependencies {
implementation 'io.jsonwebtoken:jjwt-jackson:0.12.2'

//selenium
implementation 'org.seleniumhq.selenium:selenium-java:4.1.4'
implementation 'io.github.bonigarcia:webdrivermanager:5.0.3'
implementation 'org.seleniumhq.selenium:selenium-java:4.22.0'
implementation 'io.github.bonigarcia:webdrivermanager:5.4.0'
implementation 'org.jsoup:jsoup:1.13.1'
testImplementation 'org.seleniumhq.selenium:selenium-java:4.22.0'

//Google Firebase
implementation 'com.google.firebase:firebase-admin:9.2.0'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.cmc.suppin.event.crawl.controller;

import com.cmc.suppin.event.crawl.controller.dto.CrawlResponseDTO;
import com.cmc.suppin.event.crawl.service.CrawlService;
import com.cmc.suppin.global.response.ApiResponse;
import com.cmc.suppin.global.response.ResponseCode;
Expand Down Expand Up @@ -51,9 +52,9 @@ public ResponseEntity<ApiResponse<String>> checkExistingComments(@RequestParam("
"크롤링하려는 URL이 중복되지 않았을 때의 요청이기 때문에, 새로운 댓글을 크롤링합니다. <br>" +
"- DB에 기존 댓글이 존재하는 경우: 크롤링을 중지하고 예외를 던집니다. <br>" +
"- DB에 기존 댓글이 존재하지 않는 경우: 새로운 댓글을 크롤링하고 이를 DB에 저장합니다.")
public ResponseEntity<ApiResponse<String>> crawlYoutubeComments(@RequestParam("url") String url, @RequestParam("eventId") Long eventId, @RequestParam("forceUpdate") boolean forceUpdate, @CurrentAccount Account account) {
crawlService.crawlYoutubeComments(url, eventId, account.userId(), forceUpdate);
return ResponseEntity.ok(ApiResponse.of(ResponseCode.SUCCESS, "댓글 수집이 완료되었습니다."));
public ResponseEntity<ApiResponse<CrawlResponseDTO.CrawlResultDTO>> crawlYoutubeComments(@RequestParam("url") String url, @RequestParam("eventId") Long eventId, @RequestParam("forceUpdate") boolean forceUpdate, @CurrentAccount Account account) {
CrawlResponseDTO.CrawlResultDTO crawlResultDTO = crawlService.crawlYoutubeComments(url, eventId, account.userId(), forceUpdate);
return ResponseEntity.ok(ApiResponse.of(ResponseCode.SUCCESS, crawlResultDTO));
}

// @GetMapping("/count")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ public class CrawlResponseDTO {
@NoArgsConstructor
@AllArgsConstructor
public static class CrawlResultDTO {
private String author;
private String commentText;
private String date;
private String crawlingDate;
private int totalCommentCount;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.cmc.suppin.event.crawl.controller.dto.CommentRequestDTO;
import com.cmc.suppin.event.crawl.controller.dto.CommentResponseDTO;
import com.cmc.suppin.event.crawl.controller.dto.CrawlResponseDTO;
import com.cmc.suppin.event.crawl.domain.Comment;
import com.cmc.suppin.event.events.domain.Event;

Expand Down Expand Up @@ -56,5 +57,12 @@ public static CommentResponseDTO.WinnerResponseDTO toWinnerResponseDTO(List<Comm
.winners(winnerDetails)
.build();
}

/**
 * Builds the crawl result payload from the crawl timestamp and the comment count.
 *
 * @param crawlingDate      moment the crawl finished; rendered as {@code yyyy-MM-dd HH:mm}
 * @param totalCommentCount number of comments to report in the result
 * @return a {@code CrawlResultDTO} carrying the formatted timestamp and the count
 */
public static CrawlResponseDTO.CrawlResultDTO toCrawlResultDTO(LocalDateTime crawlingDate, int totalCommentCount) {
    // Minute precision is all the response exposes; seconds are dropped by the pattern.
    DateTimeFormatter minutePrecision = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm");
    String formattedCrawlTime = crawlingDate.format(minutePrecision);

    return CrawlResponseDTO.CrawlResultDTO.builder()
            .totalCommentCount(totalCommentCount)
            .crawlingDate(formattedCrawlTime)
            .build();
}
}

20 changes: 17 additions & 3 deletions src/main/java/com/cmc/suppin/event/crawl/service/CrawlService.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.cmc.suppin.event.crawl.service;

import com.cmc.suppin.event.crawl.controller.dto.CrawlResponseDTO;
import com.cmc.suppin.event.crawl.converter.CommentConverter;
import com.cmc.suppin.event.crawl.converter.DateConverter;
import com.cmc.suppin.event.crawl.domain.Comment;
Expand Down Expand Up @@ -52,7 +53,7 @@ public String checkExistingComments(String url, String userId) {
return null;
}

public void crawlYoutubeComments(String url, Long eventId, String userId, boolean forceUpdate) {
public CrawlResponseDTO.CrawlResultDTO crawlYoutubeComments(String url, Long eventId, String userId, boolean forceUpdate) {
Member member = memberRepository.findByUserIdAndStatusNot(userId, UserStatus.DELETED)
.orElseThrow(() -> new IllegalArgumentException("Member not found"));

Expand Down Expand Up @@ -100,18 +101,23 @@ public void crawlYoutubeComments(String url, Long eventId, String userId, boolea
try {
Thread.sleep(5000); // 초기 로딩 대기

long endTime = System.currentTimeMillis() + 300000; // 스크롤 시간 조정 (필요에 따라 조정)
long endTime = System.currentTimeMillis() + 600000; // 스크롤 시간을 10분으로 설정 (600,000ms)
JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;

int previousCommentCount = 0;
int currentCommentCount;

while (System.currentTimeMillis() < endTime) {
jsExecutor.executeScript("window.scrollTo(0, document.documentElement.scrollHeight);");

Thread.sleep(1000);
Thread.sleep(1000); // 1초 대기

String pageSource = driver.getPageSource();
Document doc = Jsoup.parse(pageSource);
Elements comments = doc.select("ytd-comment-thread-renderer");

currentCommentCount = comments.size();

for (Element commentElement : comments) {
String author = commentElement.select("#author-text span").text();
String text = commentElement.select("#content yt-attributed-string#content-text").text();
Expand All @@ -126,12 +132,20 @@ public void crawlYoutubeComments(String url, Long eventId, String userId, boolea
commentRepository.save(comment);
}
}

// 더 이상 새로운 댓글이 없을 때, 크롤링 종료
if (currentCommentCount == previousCommentCount) {
break; // 새로운 댓글이 로드되지 않으면 루프를 종료합니다.
}

previousCommentCount = currentCommentCount;
}
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
driver.quit();
}
return CommentConverter.toCrawlResultDTO(LocalDateTime.now(), uniqueComments.size());
}
}

4 changes: 4 additions & 0 deletions src/main/resources/application.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
spring:
server:
port: 8080
connection-timeout: 15m
jackson:
time-zone: Asia/Seoul
datasource:
Expand Down Expand Up @@ -38,6 +39,9 @@ spring:
protocol: smtp
default-encoding: UTF-8
test-connection: false
mvc:
async:
request-timeout: 15m

jwt:
token:
Expand Down
Loading