From ec8a6e6cbdde3e60b5126dd2d57d4b6b5f109815 Mon Sep 17 00:00:00 2001 From: Wonderson Chideya Date: Tue, 2 Aug 2022 13:55:59 +0100 Subject: [PATCH] page crawler update --- pom.xml | 2 +- src/main/java/SpiderCrawler.java | 9 ++++----- src/test/java/PageCrawlerTest.java | 8 +++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index ce81963..525a04a 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ https://nexus.olcs.dev-dvsacloud.uk/repository/maven-releases https://nexus.olcs.dev-dvsacloud.uk/repository/maven-snapshots 3.8.1 - 1.6.3 + 1.6.5 2.17.0 1.8 5.6.0 diff --git a/src/main/java/SpiderCrawler.java b/src/main/java/SpiderCrawler.java index f64a432..5604c4e 100644 --- a/src/main/java/SpiderCrawler.java +++ b/src/main/java/SpiderCrawler.java @@ -15,7 +15,7 @@ public class SpiderCrawler{ private static final Logger LOGGER = LogManager.getLogger(SpiderCrawler.class); public static Document request(String url, ArrayList visitedURL) { - Document doc = null; + Document doc; try { if (isUrlValid(url)) { doc = Jsoup.connect(url).get(); @@ -27,21 +27,20 @@ public static Document request(String url, ArrayList visitedURL) { return doc; } } catch (IOException e) { - return doc; + return null; } return null; } public static void crawler(int level, String url, ArrayList visited) { - if (level <= 5) { + if (level <= 3) { Document doc = request(url, visited); if (doc != null) { for (Element link : doc.select("a[href]")) { String formattedLink = link.absUrl("href"); - if (!visited.contains(formattedLink)) { + if (!visited.contains(formattedLink)) crawler(level++, formattedLink, visited); - } } } } diff --git a/src/test/java/PageCrawlerTest.java b/src/test/java/PageCrawlerTest.java index 4d30386..019316f 100644 --- a/src/test/java/PageCrawlerTest.java +++ b/src/test/java/PageCrawlerTest.java @@ -1,20 +1,22 @@ +import activesupport.driver.Browser; import org.junit.Before; import org.junit.Test; +import org.openqa.selenium.By; import java.util.ArrayList; public class PageCrawlerTest { - String baseURL = System.getProperty("url"); + String baseURL = "https://ssweb.qa.olcs.dev-dvsacloud.uk/auth/login/"; @Before public void setUp() { - System.setProperty("browser", "chrome"); + System.setProperty("browser", "headless"); } @Test - public void someTest(){ + public void someTest() { SpiderCrawler.crawler(1, baseURL, new ArrayList<>()); } }