From 04a7f14f4b27824b8d0d66b98a03e5bd24224cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Wed, 15 Nov 2017 22:32:33 -0500 Subject: [PATCH 1/5] Initial skeleton on HTML SAX parser based on nekohtml (issue #80) --- .../util/parser/HtmlSaxParser.java | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java diff --git a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java new file mode 100644 index 000000000..7fb9994c6 --- /dev/null +++ b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java @@ -0,0 +1,220 @@ +package focusedCrawler.util.parser; + +import java.io.IOException; +import java.io.StringReader; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +import org.cyberneko.html.parsers.SAXParser; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +public class HtmlSaxParser extends SAXParser implements ContentHandler { + + public static void main(String[] args) throws Exception { + + String url = "http://example.com"; + String html = + "

My heading 1!

My Paragraph.

"; + html = "Hello World!"; + html = new String(Files.readAllBytes(Paths.get( + "src/test/resources/focusedCrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex"))); + HtmlSaxParser parser = new HtmlSaxParser(url, html); + parser.print(); + } + + static class Anchor { + + private String href; + private int textStart; + private int textEnd; + private String anchorText; + + Anchor(String href, int textStart, int textEnd, String anchorText) { + this.href = href; + this.textStart = textStart; + this.textEnd = textEnd; + this.anchorText = anchorText; + } + + @Override + public String toString() { + return "Anchor[href=" + href + ", textStart=" + textStart + ", textEnd=" + textEnd + + ", text=" + anchorText + "]"; + } + + } + + enum TextType { + TITLE, TEXT, ANCHOR_TEXT, IGNORE + } + + private TextType textState = TextType.TEXT; + private List anchors = new ArrayList<>(); + private List images; + private String baseUrl; + private StringBuilder title = new StringBuilder(); + private StringBuilder text = new StringBuilder(); + private StringBuilder anchorText = new StringBuilder(); + private String currentHref = null; + private int currentHrefTextStart = 0; + + public HtmlSaxParser(String url, String html) throws SAXException, IOException { + this.baseUrl = url; + // super.setContentHandler(new BoilerpipeHTMLContentHandler()); + setContentHandler(this); + InputSource input = new InputSource(new StringReader(html)); + this.parse(input); + } + + private void print() { + System.out.println("---"); + System.out.println("TEXT: " + text.toString()); + System.out.println("ANCHORS: "); + for (Anchor anchor : anchors) { + System.out.println("> " + anchor); + } + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + String tagName = localName; + System.out.println("(" + localName + " " + qName + " " + uri); + switch (tagName) { + case "BASE": { + String href = atts.getValue("href"); + if (href != null && !href.isEmpty()) { + // All extracted links should be relative to the href of tag + this.baseUrl = href; + } + break; + } + case "A": { + this.textState = TextType.ANCHOR_TEXT; + String href = atts.getValue("href"); + if (href != null && !href.isEmpty()) { + this.currentHref = href; + this.currentHrefTextStart = text.length(); + } + break; + } + case "IMG": { + String href = atts.getValue("href"); + if (href != null && !href.isEmpty()) { + images.add(href); + } + break; + } + case "NOSCRIPT": + case "SCRIPT": + case "STYLE": + this.textState = TextType.IGNORE; + break; + case "TITLE": + this.textState = TextType.TITLE; + break; + // default: + // this.textState = TextType.TEXT; + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + String tagName = localName; + System.out.println(")" + localName); + switch (tagName) { + case "A": + if (currentHref != null && !currentHref.isEmpty()) { + // TODO: validate href? unescape? + anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(), + anchorText.toString().trim())); + currentHref = null; + } + anchorText = new StringBuilder(); + textState = TextType.TEXT; + break; + case "TITLE": + break; + case "P": + case "H1": + case "H2": + case "H3": + case "H4": + case "H5": + case "H6": + text.append("\n\n"); + break; + case "BR": + text.append('\n'); + break; + default: + text.append(' '); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + System.out.println(new String(ch, start, length)); + switch (textState) { + case IGNORE: + break; + case TEXT: + text.append(ch, start, length); + break; + case ANCHOR_TEXT: + text.append(ch, start, length); + anchorText.append(ch, start, length); + break; + case TITLE: + title.append(ch, start, length); + break; + } + } + + @Override + public void setDocumentLocator(Locator locator) { + // System.out.println("NekoHtmlSaxParser.setDocumentLocator()"); + } + + @Override + public void startDocument() throws SAXException { + // System.out.println("NekoHtmlSaxParser.startDocument()"); + } + + @Override + public void endDocument() throws SAXException { + // System.out.println("NekoHtmlSaxParser.startElement()"); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + System.out.println("NekoHtmlSaxParser.ignorableWhitespace()"); + } + + @Override + public void processingInstruction(String target, String data) throws SAXException { + System.out.println("NekoHtmlSaxParser.processingInstruction()"); + } + + @Override + public void skippedEntity(String name) throws SAXException { + System.out.println("NekoHtmlSaxParser.skippedEntity()"); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + System.out.println("NekoHtmlSaxParser.startPrefixMapping()"); + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + System.out.println("NekoHtmlSaxParser.endPrefixMapping()"); + } + +} From c0f59420c2958e66557c07dea634db1e743b5f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Fri, 17 Nov 2017 16:15:10 -0500 Subject: [PATCH 2/5] Added unit tests and implemented URL normalization and validation --- .../util/parser/HtmlSaxParser.java | 114 ++++++++- .../util/parser/HtmlSaxParserTest.java | 242 ++++++++++++++++++ 2 files changed, 350 insertions(+), 6 deletions(-) create mode 100644 src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java diff --git a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java index 7fb9994c6..18487a589 100644 --- a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java +++ b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java @@ -2,18 +2,28 @@ import java.io.IOException; import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.TreeSet; +import java.util.regex.Pattern; +import org.apache.commons.validator.routines.UrlValidator; import org.cyberneko.html.parsers.SAXParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; +import focusedCrawler.crawler.crawlercommons.filters.basic.BasicURLNormalizer; + public class HtmlSaxParser extends SAXParser implements ContentHandler { public static void main(String[] args) throws Exception { @@ -54,6 +64,21 @@ enum TextType { TITLE, TEXT, ANCHOR_TEXT, IGNORE } + public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class); + + private final String[] schemes = {"http", "https"}; + private final UrlValidator urlValidator = new UrlValidator(schemes); + + // ONION links aren't accepted by the validator + // Regex ".[^.]+" --> any string of at least 1 char without dot + private Pattern onionRegex = Pattern.compile("https?://.[^.]+\\.onion.*"); + + private static final List invalidParameters = + Arrays.asList("sid", "phpsessid", "sessionid", "jsessionid"); + private static final BasicURLNormalizer urlNormalizer = + new BasicURLNormalizer(new TreeSet<>(invalidParameters), false); + + private TextType textState = TextType.TEXT; private List anchors = new ArrayList<>(); private List images; @@ -64,12 +89,20 @@ enum TextType { private String currentHref = null; private int currentHrefTextStart = 0; - public HtmlSaxParser(String url, String html) throws SAXException, IOException { + public HtmlSaxParser(URL url, String html) { + this(url.toString(), html); + } + + public HtmlSaxParser(String url, String html) { this.baseUrl = url; // super.setContentHandler(new BoilerpipeHTMLContentHandler()); setContentHandler(this); InputSource input = new InputSource(new StringReader(html)); - this.parse(input); + try { + this.parse(input); + } catch (SAXException | IOException e) { + throw new RuntimeException("Failed to parse page: " + url, e); + } } private void print() { @@ -98,10 +131,7 @@ public void startElement(String uri, String localName, String qName, Attributes case "A": { this.textState = TextType.ANCHOR_TEXT; String href = atts.getValue("href"); - if (href != null && !href.isEmpty()) { - this.currentHref = href; - this.currentHrefTextStart = text.length(); - } + createLink(href); break; } case "IMG": { @@ -124,6 +154,37 @@ public void startElement(String uri, String localName, String qName, Attributes } } + private void createLink(String href) { + String url = null; + if (href == null || href.isEmpty()) + return; + else + url = href.trim(); + + if (url.startsWith("javacript:")) + return; + + if (url.startsWith("mailto:")) { + // TODO store email + return; + } + + if (url.startsWith("tel:")) { + // TODO store phone number + return; + } + + String absoluteUrl = resolveRelativeHref(href, baseUrl); + if (absoluteUrl == null || absoluteUrl.isEmpty()) + return; + + if (!(urlValidator.isValid(absoluteUrl) || onionRegex.matcher(absoluteUrl).matches())) + return; + + this.currentHref = urlNormalizer.filter(absoluteUrl); + this.currentHrefTextStart = text.length(); + } + @Override public void endElement(String uri, String localName, String qName) throws SAXException { String tagName = localName; @@ -177,6 +238,19 @@ public void characters(char[] ch, int start, int length) throws SAXException { } } + private String resolveRelativeHref(String href, String baseUrl) { + URL absoluteUrl = resolveRelativeHrefToUrl(href, baseUrl); + return absoluteUrl == null ? null : absoluteUrl.toString(); + } + + private URL resolveRelativeHrefToUrl(String href, String baseUrl) { + try { + return new URL(new URL(baseUrl), href); + } catch (MalformedURLException e) { + throw new RuntimeException("Invalid URL: " + baseUrl + " - " + href, e); + } + } + @Override public void setDocumentLocator(Locator locator) { // System.out.println("NekoHtmlSaxParser.setDocumentLocator()"); @@ -217,4 +291,32 @@ public void endPrefixMapping(String prefix) throws SAXException { System.out.println("NekoHtmlSaxParser.endPrefixMapping()"); } + public URL[] links() { + List links = new ArrayList<>(); + for (Anchor anchor : anchors) { + URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl); + links.add(absoluteUrl); + } + return (URL[]) links.toArray(new URL[links.size()]); + } + + + public LinkNeighborhood[] getLinkNeighboor() { + List links = new ArrayList<>(); + for (Anchor anchor : anchors) { + URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl); + LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl); + links.add(ln); + } + return (LinkNeighborhood[]) links.toArray(new LinkNeighborhood[links.size()]); + } + + public URL getURL() { + try { + return new URL(baseUrl); + } catch (MalformedURLException e) { + throw new RuntimeException("Invalid URL: " + baseUrl, e); + } + } + } diff --git a/src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java b/src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java new file mode 100644 index 000000000..1215c9984 --- /dev/null +++ b/src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java @@ -0,0 +1,242 @@ +package focusedCrawler.util.parser; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.notNullValue; +import static org.junit.Assert.assertThat; + +import java.net.MalformedURLException; +import java.net.URL; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class HtmlSaxParserTest { + + @Before + public void setUp() throws Exception {} + + @After + public void tearDown() throws Exception {} + + @Test + public void htmlEncodedLinksShouldBeEscaped() throws Exception { + // given + String testString = new HtmlBuilder() + .withBody("Anchor text.") + .build(); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); + URL[] extractedLinks = pageParser.links(); + LinkNeighborhood[] neighborhood = pageParser.getLinkNeighboor(); + + // then + assertThat(extractedLinks[0].toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer")); + assertThat(neighborhood[0].getLink().toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer")); + } + + @Test + public void linksShouldNotContainFragments() throws Exception { + // given + String testString = new HtmlBuilder() + .appendToBody("

My First Heading

") + .appendToBody("Mouse") + .build(); + URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document"); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser(url, testString); + URL[] extractedLinks = pageParser.links(); + + // then + assertThat(extractedLinks.length, is(1)); + assertThat(extractedLinks[0].toString(), is("https://en.wikipedia.org/wiki/Mouse_(computing)")); + } + + @Test + public void constructorsShouldWork() throws MalformedURLException { + // given + URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document"); + String testPage = createTestPage(); + // when + HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage); + // then + assertThat(pageParser.getURL(), is(notNullValue())); + } + + @Test + public void shouldExtractOnionLinks() throws MalformedURLException { + // given + URL url = new URL("http://example.com/test.html"); + String testPage = new HtmlBuilder() + .appendToBody("link 1") + .appendToBody("link 1") + .build(); + // when + HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString()); + URL[] links = pageParser.links(); + + // then + assertThat(links.length, is(2)); + assertThat(links[0].toString(), is("http://3g2asl4qw6kufc5m.onion/")); + assertThat(links[1].toString(), is("http://3g2asl4qw6kufc5m.onion/test.html")); + } + +// @Test +// public void shouldParseText() throws MalformedURLException { +// // given +// URL url = new URL("http://example.com/"); +// StringBuilder testPage = new StringBuilder(); +// testPage.append(""); +// testPage.append(""); +// testPage.append(""); +// testPage.append("

My First paragraph. My second second paragraph.

"); +// testPage.append(""); +// testPage.append(""); +// +// // when +// HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString()); +// String[] links = pageParser.palavras(); +// int[] ocorrencias = pageParser.ocorrencias(); +// System.out.println(testPage.toString()); +// System.out.println(Arrays.deepToString(links)); +// System.out.println(Ints.asList(ocorrencias)); +//// // then +//// assertThat(links.length, is(1)); +//// assertThat(links[0].toString(), is("http://example.com/asdf.html")); +//// +//// assertThat(lns.length, is(1)); +//// assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html")); +// } + + @Test + public void shouldExtractAnchoTextAndTextAroundLink() throws MalformedURLException { + // given + String url = "http://www.example.com"; + String testPage = HtmlBuilder.newBuilder() + .appendToBody("

My First Heading

") + .appendToBody("My first paragraph.") + .build(); + // when + HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage); + LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor(); + // then + assertThat(neighborhoods.length, is(1)); + + assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading")); + assertThat(neighborhoods[0].getAround()[0], is("my")); + assertThat(neighborhoods[0].getAround()[1], is("first")); + assertThat(neighborhoods[0].getAround()[2], is("heading")); + + assertThat(neighborhoods[0].getAnchorString().trim(), is("my first paragraph")); + assertThat(neighborhoods[0].getAnchor()[0], is("my")); + assertThat(neighborhoods[0].getAnchor()[1], is("first")); + assertThat(neighborhoods[0].getAnchor()[2], is("paragraph")); + } + + @Test + public void shouldNotExtractInvalidLinks() throws MalformedURLException { + // given + URL url = new URL("http://example.com/test.html"); + String testPage = new HtmlBuilder() + .withBody( + "

My First Heading

" + + "link 0" + + "link 1" + + "link 2" + ) + .build(); + // when + HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString()); + URL[] links = pageParser.links(); + LinkNeighborhood[] lns = pageParser.getLinkNeighboor(); + + // then + assertThat(links.length, is(1)); + assertThat(links[0].toString(), is("http://example.com/asdf.html")); + + assertThat(lns.length, is(1)); + assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html")); + } + + @Test + public void shouldNormalizeLinks() throws MalformedURLException { + // given + URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document"); + String testPage = HtmlBuilder.newBuilder() + .appendToBody("

My First Heading

") + .appendToBody("Link 1.") + .appendToBody("Link 2.") + .appendToBody("Link 3.") + .build(); + // when + HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage); + LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor(); + URL[] links = pageParser.links(); + + // then + assertThat(neighborhoods.length, is(3)); + assertThat(links.length, is(3)); + + assertThat(neighborhoods[0].getLink().toString(), is("http://example.com/post.php?")); + assertThat(links[0].toString(), is("http://example.com/post.php?")); + + assertThat(neighborhoods[1].getLink().toString(), is("http://example.com/post.php?a=1&b=2")); + assertThat(links[1].toString(), is("http://example.com/post.php?a=1&b=2")); + + assertThat(neighborhoods[2].getLink().toString(), is("http://example.com/")); + assertThat(links[2].toString(), is("http://example.com/")); + } + + private String createTestPage() { + return HtmlBuilder.newBuilder() + .appendToBody("

My First Heading

") + .appendToBody("My first paragraph.") + .build(); + } + + public static class HtmlBuilder { + + private String header = ""; + private String body = ""; + + public static HtmlBuilder newBuilder() { + return new HtmlBuilder(); + } + + public HtmlBuilder appendToBody(String body) { + this.body += body; + return this; + } + + public HtmlBuilder withHeader(String header) { + this.header = header; + return this; + } + + public HtmlBuilder withBody(String body) { + this.body = body; + return this; + } + + public String build() { + StringBuilder html = new StringBuilder(); + html.append(""); + html.append(""); + if(header != null && !header.isEmpty()) { + html.append(header); + } + html.append(""); + if(body != null && !body.isEmpty()) { + html.append(body); + } + html.append(""); + html.append(""); + return html.toString(); + } + + } + + +} From 60b5582d9038960e1890a0b0c8a358691ab226a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Wed, 9 May 2018 19:33:24 -0400 Subject: [PATCH 3/5] Merge with master changes --- src/main/java/focusedCrawler/util/Urls.java | 7 +- .../util/parser/HtmlSaxParser.java | 162 ++++++++---------- .../focusedCrawler/util/parser/PaginaURL.java | 6 +- 3 files changed, 84 insertions(+), 91 deletions(-) diff --git a/src/main/java/focusedCrawler/util/Urls.java b/src/main/java/focusedCrawler/util/Urls.java index 66188df8a..df9e6bed3 100644 --- a/src/main/java/focusedCrawler/util/Urls.java +++ b/src/main/java/focusedCrawler/util/Urls.java @@ -55,7 +55,7 @@ public static String removeFragmentsIfAny(String url) { return url; } - public static String resolveHttpLink(HttpUrl base, String link) { + public static HttpUrl resolveHttpLink(HttpUrl base, String link) { HttpUrl resolvedUrl; try { if (base == null) { @@ -67,6 +67,11 @@ public static String resolveHttpLink(HttpUrl base, String link) { // The link is invalid or malformed resolvedUrl = null; } + return resolvedUrl; + } + + public static String resolveHttpLinkAsString(HttpUrl base, String link) { + HttpUrl resolvedUrl = resolveHttpLink(base, link); if (resolvedUrl == null) { return null; } else { diff --git a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java index 18487a589..8384e7858 100644 --- a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java +++ b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java @@ -1,5 +1,12 @@ package focusedCrawler.util.parser; +import focusedCrawler.util.Urls; +import okhttp3.HttpUrl; +import org.cyberneko.html.parsers.SAXParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.*; + import java.io.IOException; import java.io.StringReader; import java.net.MalformedURLException; @@ -7,25 +14,11 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; -import java.util.TreeSet; -import java.util.regex.Pattern; - -import org.apache.commons.validator.routines.UrlValidator; -import org.cyberneko.html.parsers.SAXParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.InputSource; -import org.xml.sax.Locator; -import org.xml.sax.SAXException; - -import focusedCrawler.crawler.crawlercommons.filters.basic.BasicURLNormalizer; public class HtmlSaxParser extends SAXParser implements ContentHandler { + public static void main(String[] args) throws Exception { String url = "http://example.com"; @@ -66,23 +59,12 @@ enum TextType { public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class); - private final String[] schemes = {"http", "https"}; - private final UrlValidator urlValidator = new UrlValidator(schemes); - - // ONION links aren't accepted by the validator - // Regex ".[^.]+" --> any string of at least 1 char without dot - private Pattern onionRegex = Pattern.compile("https?://.[^.]+\\.onion.*"); - - private static final List invalidParameters = - Arrays.asList("sid", "phpsessid", "sessionid", "jsessionid"); - private static final BasicURLNormalizer urlNormalizer = - new BasicURLNormalizer(new TreeSet<>(invalidParameters), false); - private TextType textState = TextType.TEXT; private List anchors = new ArrayList<>(); private List images; - private String baseUrl; + private HttpUrl base; + private StringBuilder title = new StringBuilder(); private StringBuilder text = new StringBuilder(); private StringBuilder anchorText = new StringBuilder(); @@ -94,7 +76,7 @@ public HtmlSaxParser(URL url, String html) { } public HtmlSaxParser(String url, String html) { - this.baseUrl = url; + this.base = HttpUrl.parse(url); // super.setContentHandler(new BoilerpipeHTMLContentHandler()); setContentHandler(this); InputSource input = new InputSource(new StringReader(html)); @@ -121,23 +103,38 @@ public void startElement(String uri, String localName, String qName, Attributes System.out.println("(" + localName + " " + qName + " " + uri); switch (tagName) { case "BASE": { + // + // Handles the BASE tag which sets the URL that should be used for resolving + // relative links + // String href = atts.getValue("href"); if (href != null && !href.isEmpty()) { // All extracted links should be relative to the href of tag - this.baseUrl = href; + try { + HttpUrl newBase = Urls.resolveHttpLink(this.base, href); + if (newBase != null) { + this.base = newBase; + } + } catch (Exception e) { + // ignore invalid URLs + } } break; } case "A": { this.textState = TextType.ANCHOR_TEXT; String href = atts.getValue("href"); - createLink(href); + String link = createLink(this.base, href); + if (link != null) { + this.currentHref = link; + this.currentHrefTextStart = text.length(); + } break; } case "IMG": { String href = atts.getValue("href"); if (href != null && !href.isEmpty()) { - images.add(href); + images.add(createLink(this.base, href)); } break; } @@ -154,35 +151,43 @@ public void startElement(String uri, String localName, String qName, Attributes } } - private void createLink(String href) { - String url = null; - if (href == null || href.isEmpty()) - return; - else + private String createLink(HttpUrl base, String href) { + if (href == null || href.isEmpty()) { + return null; + } + + String url = href; + + if (url.startsWith(" ") || url.endsWith(" ")) { url = href.trim(); + } - if (url.startsWith("javacript:")) - return; + if (url.startsWith("javascript:")) { + return null; + } if (url.startsWith("mailto:")) { - // TODO store email - return; + return null; } if (url.startsWith("tel:")) { - // TODO store phone number - return; + return null; } - String absoluteUrl = resolveRelativeHref(href, baseUrl); - if (absoluteUrl == null || absoluteUrl.isEmpty()) - return; + if (url.startsWith("data:")) { + return null; + } + + String absoluteUrl = Urls.resolveHttpLinkAsString(base, href); + if (absoluteUrl == null || absoluteUrl.isEmpty()) { + return null; + } - if (!(urlValidator.isValid(absoluteUrl) || onionRegex.matcher(absoluteUrl).matches())) - return; + if (!Urls.isValid(absoluteUrl)) { + return null; + } - this.currentHref = urlNormalizer.filter(absoluteUrl); - this.currentHrefTextStart = text.length(); + return Urls.normalize(absoluteUrl); } @Override @@ -192,7 +197,6 @@ public void endElement(String uri, String localName, String qName) throws SAXExc switch (tagName) { case "A": if (currentHref != null && !currentHref.isEmpty()) { - // TODO: validate href? unescape? anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(), anchorText.toString().trim())); currentHref = null; @@ -238,17 +242,29 @@ public void characters(char[] ch, int start, int length) throws SAXException { } } - private String resolveRelativeHref(String href, String baseUrl) { - URL absoluteUrl = resolveRelativeHrefToUrl(href, baseUrl); - return absoluteUrl == null ? null : absoluteUrl.toString(); + public URL[] links() { + List links = new ArrayList<>(); + for (Anchor anchor : anchors) { + URL absoluteUrl = Urls.toJavaURL(anchor.href); + if (absoluteUrl != null) { + links.add(absoluteUrl); + } + } + return (URL[]) links.toArray(new URL[links.size()]); } - private URL resolveRelativeHrefToUrl(String href, String baseUrl) { - try { - return new URL(new URL(baseUrl), href); - } catch (MalformedURLException e) { - throw new RuntimeException("Invalid URL: " + baseUrl + " - " + href, e); + public LinkNeighborhood[] getLinkNeighboor() { + List links = new ArrayList<>(); + for (Anchor anchor : anchors) { + URL absoluteUrl = Urls.toJavaURL(anchor.href); + LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl); + links.add(ln); } + return (LinkNeighborhood[]) links.toArray(new LinkNeighborhood[links.size()]); + } + + public URL getURL() { + return base != null ? base.url() : null; } @Override @@ -291,32 +307,4 @@ public void endPrefixMapping(String prefix) throws SAXException { System.out.println("NekoHtmlSaxParser.endPrefixMapping()"); } - public URL[] links() { - List links = new ArrayList<>(); - for (Anchor anchor : anchors) { - URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl); - links.add(absoluteUrl); - } - return (URL[]) links.toArray(new URL[links.size()]); - } - - - public LinkNeighborhood[] getLinkNeighboor() { - List links = new ArrayList<>(); - for (Anchor anchor : anchors) { - URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl); - LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl); - links.add(ln); - } - return (LinkNeighborhood[]) links.toArray(new LinkNeighborhood[links.size()]); - } - - public URL getURL() { - try { - return new URL(baseUrl); - } catch (MalformedURLException e) { - throw new RuntimeException("Invalid URL: " + baseUrl, e); - } - } - } diff --git a/src/main/java/focusedCrawler/util/parser/PaginaURL.java b/src/main/java/focusedCrawler/util/parser/PaginaURL.java index a94b38b9f..0263d1c71 100644 --- a/src/main/java/focusedCrawler/util/parser/PaginaURL.java +++ b/src/main/java/focusedCrawler/util/parser/PaginaURL.java @@ -900,7 +900,7 @@ protected void separadorTextoCodigo(String arquivo) { // arquivo equivale ao ln.setImgSource(str); } try { - imagens.add(Urls.resolveHttpLink(base,str).toString()); + imagens.add(Urls.resolveHttpLinkAsString(base,str).toString()); } catch (Exception e) { // TODO: handle exception } @@ -1022,7 +1022,7 @@ else if (tagName.equals("frame") && atributo.equals("src")) { } else if (tagName.equals("base") && atributo.equals("href")) { try { HttpUrl oldBase = (baseUrl == null) ? null : HttpUrl.get(baseUrl); - String newBase = Urls.resolveHttpLink(oldBase, str); + String newBase = Urls.resolveHttpLinkAsString(oldBase, str); base = (newBase == null) ? null : HttpUrl.parse(newBase); } catch (Exception e) { // ignore invalid URLs @@ -1237,7 +1237,7 @@ protected String addLink(String link, HttpUrl base) { return ""; } link = link.trim(); - link = Urls.resolveHttpLink(base, link); + link = Urls.resolveHttpLinkAsString(base, link); if (link == null) { return ""; } From 27e12d1628c9fa37ebe2662d70443fa91883f14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Fri, 16 Jul 2021 23:45:44 -0400 Subject: [PATCH 4/5] Implemented extraction of the text around links --- .../util/parser/HtmlSaxParser.java | 373 +++++++++++------- .../util/parser/HtmlSaxParserTest.java | 39 +- 2 files changed, 262 insertions(+), 150 deletions(-) diff --git a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java index 01e826865..6910a8268 100644 --- a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java +++ b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java @@ -2,6 +2,11 @@ import achecrawler.util.Urls; import okhttp3.HttpUrl; +import org.apache.commons.io.input.CharSequenceReader; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.SimpleAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.cyberneko.html.parsers.SAXParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,60 +20,27 @@ import java.util.ArrayList; import java.util.List; -public class HtmlSaxParser extends SAXParser implements ContentHandler { - - - public static void main(String[] args) throws Exception { - - String url = "http://example.com"; - String html = - "

My heading 1!

My Paragraph.

"; - html = "Hello World!"; - html = new String(Files.readAllBytes(Paths.get( - "ache-tools/src/test/resources/achecrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex"))); - HtmlSaxParser parser = new HtmlSaxParser(url, html); - parser.print(); - } - - static class Anchor { - - private String href; - private int textStart; - private int textEnd; - private String anchorText; - - Anchor(String href, int textStart, int textEnd, String anchorText) { - this.href = href; - this.textStart = textStart; - this.textEnd = textEnd; - this.anchorText = anchorText; - } - @Override - public String toString() { - return "Anchor[href=" + href + ", textStart=" + textStart + ", textEnd=" + textEnd - + ", text=" + anchorText + "]"; - } +public class HtmlSaxParser extends SAXParser implements ContentHandler { - } + public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class); - enum TextType { - TITLE, TEXT, ANCHOR_TEXT, IGNORE - } + public static final int AROUND_WORDS = 10; - public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class); + private final List anchors = new ArrayList<>(); + private final List images = new ArrayList<>(); + private final List tokens = new ArrayList<>(); + private final StringBuilder title = new StringBuilder(); + private final StringBuilder text = new StringBuilder(); + private final SimpleTokenizer tokenizer = new SimpleTokenizer(new CharSequenceReader(text)); - private TextType textState = TextType.TEXT; - private List anchors = new ArrayList<>(); - private List images; private HttpUrl base; - - private StringBuilder title = new StringBuilder(); - private StringBuilder text = new StringBuilder(); - private StringBuilder anchorText = new StringBuilder(); + private TextType textState = TextType.TEXT; private String currentHref = null; private int currentHrefTextStart = 0; + private int currentHrefTokenStart = 0; + private StringBuilder anchorText = new StringBuilder(); public HtmlSaxParser(URL url, String html) { this(url.toString(), html); @@ -86,38 +58,11 @@ public HtmlSaxParser(String url, String html) { } } - private void print() { - System.out.println("---"); - System.out.println("TEXT: " + text.toString()); - System.out.println("ANCHORS: "); - for (Anchor anchor : anchors) { - System.out.println("> " + anchor); - } - } - @Override - public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { - String tagName = localName; - System.out.println("(" + localName + " " + qName + " " + uri); + public void startElement(String uri, String tagName, String qName, Attributes atts) { switch (tagName) { case "BASE": { - // - // Handles the BASE tag which sets the URL that should be used for resolving - // relative links - // - String href = atts.getValue("href"); - if (href != null && !href.isEmpty()) { - // All extracted links should be relative to the href of tag - try { - HttpUrl newBase = Urls.resolveHttpLink(this.base, href); - if (newBase != null) { - this.base = newBase; - } - } catch (Exception e) { - // ignore invalid URLs - } - } + handleBaseTag(atts); break; } case "A": { @@ -127,6 +72,9 @@ public void startElement(String uri, String localName, String qName, Attributes if (link != null) { this.currentHref = link; this.currentHrefTextStart = text.length(); + + this.tokenizer.tokenize(); + this.currentHrefTokenStart = this.tokens.size(); } break; } @@ -150,60 +98,21 @@ public void startElement(String uri, String localName, String qName, Attributes } } - private String createLink(HttpUrl base, String href) { - if (href == null || href.isEmpty()) { - return null; - } - - String url = href; - - if (url.startsWith(" ") || url.endsWith(" ")) { - url = href.trim(); - } - - if (url.startsWith("javascript:")) { - return null; - } - - if (url.startsWith("mailto:")) { - return null; - } - - if (url.startsWith("tel:")) { - return null; - } - - if (url.startsWith("data:")) { - return null; - } - - String absoluteUrl = Urls.resolveHttpLinkAsString(base, href); - if (absoluteUrl == null || absoluteUrl.isEmpty()) { - return null; - } - - if (!Urls.isValid(absoluteUrl)) { - return null; - } - - return Urls.normalize(absoluteUrl); - } - @Override - public void endElement(String uri, String localName, String qName) throws SAXException { - String tagName = localName; - System.out.println(")" + localName); + public void endElement(String uri, String tagName, String qName) { switch (tagName) { case "A": if (currentHref != null && !currentHref.isEmpty()) { + tokenizer.tokenize(); anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(), - anchorText.toString().trim())); + anchorText.toString().trim(), currentHrefTokenStart, tokens.size())); currentHref = null; } anchorText = new StringBuilder(); textState = TextType.TEXT; break; case "TITLE": + textState = TextType.IGNORE; break; case "P": case "H1": @@ -217,17 +126,37 @@ public void endElement(String uri, String localName, String qName) throws SAXExc case "BR": text.append('\n'); break; + case "NOSCRIPT": + case "SCRIPT": + case "STYLE": + this.textState = TextType.TEXT; + break; default: text.append(' '); } } + /* + * Handles the BASE tag which sets the URL that should be used for resolving + */ + private void handleBaseTag(Attributes attributes) { + String href = attributes.getValue("href"); + if (href != null && !href.isEmpty()) { + // All extracted links should be relative to the href of tag + try { + HttpUrl newBase = Urls.resolveHttpLink(this.base, href); + if (newBase != null) { + this.base = newBase; + } + } catch (Exception e) { + // ignore invalid URLs + } + } + } + @Override - public void characters(char[] ch, int start, int length) throws SAXException { - System.out.println(new String(ch, start, length)); + public void characters(char[] ch, int start, int length) { switch (textState) { - case IGNORE: - break; case TEXT: text.append(ch, start, length); break; @@ -238,6 +167,8 @@ public void characters(char[] ch, int start, int length) throws SAXException { case TITLE: title.append(ch, start, length); break; + case IGNORE: + break; } } @@ -252,61 +183,223 @@ public URL[] links() { return links.toArray(new URL[links.size()]); } - public LinkNeighborhood[] getLinkNeighboor() { + public LinkNeighborhood[] getLinkNeighborhood() { List links = new ArrayList<>(); for (Anchor anchor : anchors) { URL absoluteUrl = Urls.toJavaURL(anchor.href); LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl); - // TODO: -// ln.setAround(); -// ln.setAnchor(); + ln.setAround(createAroundText(anchor)); + ln.setAnchor(createAnchorText(anchor)); links.add(ln); } return links.toArray(new LinkNeighborhood[links.size()]); } + private String[] createAnchorText(Anchor anchor) { + List aroundTemp = new ArrayList<>(); + for (int i = anchor.tokenStart; i < anchor.tokenEnd; i++) { + aroundTemp.add(tokens.get(i)); + } + return aroundTemp.toArray(new String[aroundTemp.size()]); + } + + private String[] createAroundText(Anchor anchor) { + List aroundTemp = new ArrayList(); + final int begin = Math.max(0, anchor.tokenStart - AROUND_WORDS); + for (int i = begin; i < anchor.tokenStart; i++) { + aroundTemp.add(tokens.get(i)); + } + int end = Math.min(tokens.size(), anchor.tokenEnd + AROUND_WORDS); + for (int i = anchor.tokenEnd; i < end; i++) { + aroundTemp.add(tokens.get(i)); + } + return aroundTemp.toArray(new String[aroundTemp.size()]); + } + + private String createLink(HttpUrl base, String href) { + if (href == null || href.isEmpty()) { + return null; + } + String url = href; + if (url.startsWith(" ") || url.endsWith(" ")) { + url = href.trim(); + } + if (url.startsWith("javascript:")) { + return null; + } + if (url.startsWith("mailto:")) { + return null; + } + if (url.startsWith("tel:")) { + return null; + } + if (url.startsWith("data:")) { + return null; + } + String absoluteUrl = Urls.resolveHttpLinkAsString(base, href); + if (absoluteUrl == null || absoluteUrl.isEmpty()) { + return null; + } + if (!Urls.isValid(absoluteUrl)) { + return null; + } + return Urls.normalize(absoluteUrl); + } + public URL getURL() { return base != null ? base.url() : null; } + public List getTokens() { + return this.tokens; + } + + public String title() { + return this.title.toString(); + } + + private void print() { + // TODO: Clean up + System.out.println("---"); + System.out.println("TEXT: " + text.toString()); + System.out.println("ANCHORS: "); + for (Anchor anchor : anchors) { + System.out.println("> " + anchor); + } + } + @Override public void setDocumentLocator(Locator locator) { - // System.out.println("NekoHtmlSaxParser.setDocumentLocator()"); } @Override - public void startDocument() throws SAXException { - // System.out.println("NekoHtmlSaxParser.startDocument()"); + public void startDocument() { } @Override - public void endDocument() throws SAXException { - // System.out.println("NekoHtmlSaxParser.startElement()"); + public void endDocument() { + // Finish tokenization of text left over + this.tokenizer.tokenize(); } @Override - public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { - System.out.println("NekoHtmlSaxParser.ignorableWhitespace()"); + public void ignorableWhitespace(char[] ch, int start, int length) { } @Override - public void processingInstruction(String target, String data) throws SAXException { - System.out.println("NekoHtmlSaxParser.processingInstruction()"); + public void processingInstruction(String target, String data) { } @Override - public void skippedEntity(String name) throws SAXException { - System.out.println("NekoHtmlSaxParser.skippedEntity()"); + public void skippedEntity(String name) { } @Override - public void startPrefixMapping(String prefix, String uri) throws SAXException { - System.out.println("NekoHtmlSaxParser.startPrefixMapping()"); + public void startPrefixMapping(String prefix, String uri) { } @Override - public void endPrefixMapping(String prefix) throws SAXException { - System.out.println("NekoHtmlSaxParser.endPrefixMapping()"); + public void endPrefixMapping(String prefix) { + } + + enum TextType { + TITLE, TEXT, ANCHOR_TEXT, IGNORE + } + + static class Anchor { + + private final String href; + private final int textStart; + private final int textEnd; + private final String anchorText; + private final int tokenStart; + private final int tokenEnd; + + Anchor(String href, int textStart, int textEnd, String anchorText, int tokenStart, int tokenEnd) { + this.href = href; + this.textStart = textStart; + this.textEnd = textEnd; + this.anchorText = anchorText; + this.tokenStart = tokenStart; + this.tokenEnd = tokenEnd; + } + + @Override + public String toString() { + return "Anchor[href=" + href + + ", textStart=" + textStart + + ", textEnd=" + textEnd + + ", text=" + anchorText + + "]"; + } } + public class SimpleTokenizer { + + private final TokenStream ts; + private final CharTermAttribute cattr; + + public SimpleTokenizer(CharSequenceReader cleanText) { + // TODO: setup a good general tokenizer + Analyzer analyzer = new SimpleAnalyzer(); +// this.analyzer = new StandardAnalyzer(StandardAnalyzer.ENGLISH_STOP_WORDS_SET); +// this.analyzer = new Analyzer() { +// @Override +// protected TokenStreamComponents createComponents(final String fieldName) { +// final StandardTokenizer src = new StandardTokenizer(); +// src.setMaxTokenLength(255); +// // return new TokenStreamComponents(src); +//// TokenStream tok = new StandardFilter(src); +//// tok = new LowerCaseFilter(tok); +////// tok = new StopFilter(tok, stopwords); +//// return new TokenStreamComponents(src, tok) { +//// @Override +//// protected void setReader(final Reader reader) { +//// // So that if maxTokenLength was changed, the change takes +//// // effect next time tokenStream is called: +//// src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength); +//// super.setReader(reader); +//// } +//// }; +// } +// }; + ts = analyzer.tokenStream("cleanText", cleanText); + cattr = ts.addAttribute(CharTermAttribute.class); + try { + ts.reset(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public void tokenize() { + try { + while (ts.incrementToken()) { + String token = cattr.toString(); + HtmlSaxParser.this.tokens.add(token); + } + } catch (IOException e) { + throw new RuntimeException("Tokenization failed", e); + } + } + } + + // TODO: Clean up + public static void main(String[] args) throws Exception { + + String url = "http://www.darpa.mil/program/memex"; + String html = + "

My heading 1!

My Paragraph.

"; + html = new String(Files.readAllBytes(Paths.get( + "ache-tools/src/test/resources/achecrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex"))); + HtmlSaxParser parser = new HtmlSaxParser(url, html); +// parser.print(); +// PaginaURL parser = new PaginaURL(new URL(url), html); + + final LinkNeighborhood[] neighborhoods = parser.getLinkNeighborhood(); + for (LinkNeighborhood n : neighborhoods) { + System.out.println("> Around: " + n.getLink().toString()); + System.out.println(n.getAroundString()); + } + } } diff --git a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java index ea1c48b57..20791657a 100644 --- a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java +++ b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java @@ -7,12 +7,25 @@ import java.net.MalformedURLException; import java.net.URL; -import org.junit.After; -import org.junit.Before; import org.junit.Test; public class HtmlSaxParserTest { + @Test + public void shouldExtractTitle() throws Exception { + // given + String testString = new HtmlBuilder() + .withHeader("ACHE Crawler \n \t") + .withBody("

My text

") + .build(); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); + + // then + assertThat(pageParser.title().trim(), is("ACHE Crawler")); + } + @Test public void htmlEncodedLinksShouldBeEscaped() throws Exception { // given @@ -23,7 +36,7 @@ public void htmlEncodedLinksShouldBeEscaped() throws Exception { // when HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); URL[] extractedLinks = pageParser.links(); - LinkNeighborhood[] neighborhood = pageParser.getLinkNeighboor(); + LinkNeighborhood[] neighborhood = pageParser.getLinkNeighborhood(); // then assertThat(extractedLinks[0].toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer")); @@ -110,25 +123,31 @@ public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLExcept String url = "http://www.example.com"; String testPage = HtmlBuilder.newBuilder() .appendToBody("

My First Heading

") - .appendToBody("My first paragraph.") + .appendToBody("My first anchor text.") +// .appendToBody("my second anchor text.") + .appendToBody("

my paragraph.

") + .appendToBody("free text") .build(); // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage); - LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor(); + LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood(); // PaginaURL pageParser = new PaginaURL(new URL(url), testPage); // LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor(); +// System.out.println("tokens = " + pageParser.getTokens()); + // then assertThat(neighborhoods.length, is(1)); - assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading")); + assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading my paragraph free text")); assertThat(neighborhoods[0].getAround()[0], is("my")); assertThat(neighborhoods[0].getAround()[1], is("first")); assertThat(neighborhoods[0].getAround()[2], is("heading")); - assertThat(neighborhoods[0].getAnchorString().trim(), is("my first paragraph")); + assertThat(neighborhoods[0].getAnchorString().trim(), is("my first anchor text")); assertThat(neighborhoods[0].getAnchor()[0], is("my")); assertThat(neighborhoods[0].getAnchor()[1], is("first")); - assertThat(neighborhoods[0].getAnchor()[2], is("paragraph")); + assertThat(neighborhoods[0].getAnchor()[2], is("anchor")); + assertThat(neighborhoods[0].getAnchor()[3], is("text")); } @Test @@ -146,7 +165,7 @@ public void shouldNotExtractInvalidLinks() throws MalformedURLException { // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString()); URL[] links = pageParser.links(); - LinkNeighborhood[] lns = pageParser.getLinkNeighboor(); + LinkNeighborhood[] lns = pageParser.getLinkNeighborhood(); // then assertThat(links.length, is(1)); @@ -168,7 +187,7 @@ public void shouldNormalizeLinks() throws MalformedURLException { .build(); // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage); - LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor(); + LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood(); URL[] links = pageParser.links(); // then From de8fe4ca8fb3937ede6ec7e378e0fc96b4056bf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Sat, 17 Jul 2021 12:56:28 -0400 Subject: [PATCH 5/5] Add HTML entity test and clean code --- .../util/parser/HtmlSaxParser.java | 9 +- .../util/parser/HtmlSaxParserTest.java | 98 +++++++++++-------- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java index 6910a8268..f458dac34 100644 --- a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java +++ b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java @@ -100,6 +100,7 @@ public void startElement(String uri, String tagName, String qName, Attributes at @Override public void endElement(String uri, String tagName, String qName) { + // TODO: extract data from tags (e.g., description, keywords, noindex, nofollow) switch (tagName) { case "A": if (currentHref != null && !currentHref.isEmpty()) { @@ -112,7 +113,7 @@ public void endElement(String uri, String tagName, String qName) { textState = TextType.TEXT; break; case "TITLE": - textState = TextType.IGNORE; + textState = TextType.TEXT; break; case "P": case "H1": @@ -250,7 +251,7 @@ public URL getURL() { return base != null ? base.url() : null; } - public List getTokens() { + public List tokens() { return this.tokens; } @@ -258,6 +259,10 @@ public String title() { return this.title.toString(); } + public String text() { + return this.text.toString(); + } + private void print() { // TODO: Clean up System.out.println("---"); diff --git a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java index 20791657a..96fb4e665 100644 --- a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java +++ b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java @@ -12,7 +12,7 @@ public class HtmlSaxParserTest { @Test - public void shouldExtractTitle() throws Exception { + public void shouldExtractTitle() { // given String testString = new HtmlBuilder() .withHeader("ACHE Crawler \n \t") @@ -27,12 +27,28 @@ public void shouldExtractTitle() throws Exception { } @Test - public void htmlEncodedLinksShouldBeEscaped() throws Exception { + public void shouldCleanHtmlEntities() { + // given + String testString = new HtmlBuilder() + .withHeader("ACHE > domain specific search ©") + .withBody("

My text & me. €

") + .build(); + + // when + HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); + + // then + assertThat(pageParser.title(), is("ACHE > domain specific search ©")); + assertThat(pageParser.text().trim(), is("My\u00A0text & me. €")); + } + + @Test + public void htmlEncodedLinksShouldBeEscaped() { // given String testString = new HtmlBuilder() .withBody("Anchor text.") .build(); - + // when HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString); URL[] extractedLinks = pageParser.links(); @@ -44,23 +60,23 @@ public void htmlEncodedLinksShouldBeEscaped() throws Exception { } @Test - public void linksShouldNotContainFragments() throws Exception { + public void linksShouldNotContainFragments() throws MalformedURLException { // given String testString = new HtmlBuilder() .appendToBody("

My First Heading

") .appendToBody("Mouse") .build(); URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document"); - + // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testString); URL[] extractedLinks = pageParser.links(); - + // then assertThat(extractedLinks.length, is(1)); assertThat(extractedLinks[0].toString(), is("https://en.wikipedia.org/wiki/Mouse_(computing)")); } - + @Test public void constructorsShouldWork() throws MalformedURLException { // given @@ -71,7 +87,7 @@ public void constructorsShouldWork() throws MalformedURLException { // then assertThat(pageParser.getURL(), is(notNullValue())); } - + @Test public void shouldExtractOnionLinks() throws MalformedURLException { // given @@ -79,17 +95,17 @@ public void shouldExtractOnionLinks() throws MalformedURLException { String testPage = new HtmlBuilder() .appendToBody("link 1") .appendToBody("link 1") - .build(); + .build(); // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString()); URL[] links = pageParser.links(); - + // then assertThat(links.length, is(2)); assertThat(links[0].toString(), is("http://3g2asl4qw6kufc5m.onion/")); assertThat(links[1].toString(), is("http://3g2asl4qw6kufc5m.onion/test.html")); } - + // @Test // public void shouldParseText() throws MalformedURLException { // // given @@ -116,7 +132,7 @@ public void shouldExtractOnionLinks() throws MalformedURLException { //// assertThat(lns.length, is(1)); //// assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html")); // } - + @Test public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLException { // given @@ -137,40 +153,40 @@ public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLExcept // then assertThat(neighborhoods.length, is(1)); - + assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading my paragraph free text")); assertThat(neighborhoods[0].getAround()[0], is("my")); assertThat(neighborhoods[0].getAround()[1], is("first")); assertThat(neighborhoods[0].getAround()[2], is("heading")); - + assertThat(neighborhoods[0].getAnchorString().trim(), is("my first anchor text")); assertThat(neighborhoods[0].getAnchor()[0], is("my")); assertThat(neighborhoods[0].getAnchor()[1], is("first")); assertThat(neighborhoods[0].getAnchor()[2], is("anchor")); assertThat(neighborhoods[0].getAnchor()[3], is("text")); } - + @Test public void shouldNotExtractInvalidLinks() throws MalformedURLException { // given URL url = new URL("http://example.com/test.html"); String testPage = new HtmlBuilder() .withBody( - "

My First Heading

" - + "link 0" - + "link 1" - + "link 2" + "

My First Heading

" + + "link 0" + + "link 1" + + "link 2" ) - .build(); + .build(); // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString()); URL[] links = pageParser.links(); - LinkNeighborhood[] lns = pageParser.getLinkNeighborhood(); - + LinkNeighborhood[] lns = pageParser.getLinkNeighborhood(); + // then assertThat(links.length, is(1)); assertThat(links[0].toString(), is("http://example.com/asdf.html")); - + assertThat(lns.length, is(1)); assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html")); } @@ -180,11 +196,11 @@ public void shouldNormalizeLinks() throws MalformedURLException { // given URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document"); String testPage = HtmlBuilder.newBuilder() - .appendToBody("

My First Heading

") - .appendToBody("Link 1.") - .appendToBody("Link 2.") - .appendToBody("Link 3.") - .build(); + .appendToBody("

My First Heading

") + .appendToBody("Link 1.") + .appendToBody("Link 2.") + .appendToBody("Link 3.") + .build(); // when HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage); LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood(); @@ -193,10 +209,10 @@ public void shouldNormalizeLinks() throws MalformedURLException { // then assertThat(neighborhoods.length, is(3)); assertThat(links.length, is(3)); - + assertThat(neighborhoods[0].getLink().toString(), is("http://example.com/post.php?")); assertThat(links[0].toString(), is("http://example.com/post.php?")); - + assertThat(neighborhoods[1].getLink().toString(), is("http://example.com/post.php?a=1&b=2")); assertThat(links[1].toString(), is("http://example.com/post.php?a=1&b=2")); @@ -206,20 +222,20 @@ public void shouldNormalizeLinks() throws MalformedURLException { private String createTestPage() { return HtmlBuilder.newBuilder() - .appendToBody("

My First Heading

") - .appendToBody("My first paragraph.") - .build(); + .appendToBody("

My First Heading

") + .appendToBody("My first paragraph.") + .build(); } - + public static class HtmlBuilder { - + private String header = ""; private String body = ""; - + public static HtmlBuilder newBuilder() { return new HtmlBuilder(); } - + public HtmlBuilder appendToBody(String body) { this.body += body; return this; @@ -234,16 +250,16 @@ public HtmlBuilder withBody(String body) { this.body = body; return this; } - + public String build() { StringBuilder html = new StringBuilder(); html.append(""); html.append(""); - if(header != null && !header.isEmpty()) { + if (header != null && !header.isEmpty()) { html.append(header); } html.append(""); - if(body != null && !body.isEmpty()) { + if (body != null && !body.isEmpty()) { html.append(body); } html.append(""); @@ -252,6 +268,6 @@ public String build() { } } - + }