From 04a7f14f4b27824b8d0d66b98a03e5bd24224cea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Wed, 15 Nov 2017 22:32:33 -0500
Subject: [PATCH 1/5] Initial skeleton of HTML SAX parser based on nekohtml
 (issue #80)

---
 .../util/parser/HtmlSaxParser.java            | 220 ++++++++++++++++++
 1 file changed, 220 insertions(+)
 create mode 100644 src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
diff --git a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
new file mode 100644
index 000000000..7fb9994c6
--- /dev/null
+++ b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
@@ -0,0 +1,220 @@
+package focusedCrawler.util.parser;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.cyberneko.html.parsers.SAXParser;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+public class HtmlSaxParser extends SAXParser implements ContentHandler {
+
+    public static void main(String[] args) throws Exception {
+
+        String url = "http://example.com";
+        String html =
+                "<html><meta /><body><h1><!-- my comment --><a HREF=\"index.html\">My heading 1!</a></h1><div><p>My Paragraph.</p></body></html>";
+        html = "Hello World!";
+        html = new String(Files.readAllBytes(Paths.get(
+                "src/test/resources/focusedCrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex")));
+        HtmlSaxParser parser = new HtmlSaxParser(url, html);
+        parser.print();
+    }
+
+    static class Anchor {
+
+        private String href;
+        private int textStart;
+        private int textEnd;
+        private String anchorText;
+
+        Anchor(String href, int textStart, int textEnd, String anchorText) {
+            this.href = href;
+            this.textStart = textStart;
+            this.textEnd = textEnd;
+            this.anchorText = anchorText;
+        }
+
+        @Override
+        public String toString() {
+            return "Anchor[href=" + href + ", textStart=" + textStart + ", textEnd=" + textEnd
+                    + ", text=" + anchorText + "]";
+        }
+
+    }
+
+    enum TextType {
+        TITLE, TEXT, ANCHOR_TEXT, IGNORE
+    }
+
+    private TextType textState = TextType.TEXT;
+    private List<Anchor> anchors = new ArrayList<>();
+    private List<String> images;
+    private String baseUrl;
+    private StringBuilder title = new StringBuilder();
+    private StringBuilder text = new StringBuilder();
+    private StringBuilder anchorText = new StringBuilder();
+    private String currentHref = null;
+    private int currentHrefTextStart = 0;
+
+    public HtmlSaxParser(String url, String html) throws SAXException, IOException {
+        this.baseUrl = url;
+        // super.setContentHandler(new BoilerpipeHTMLContentHandler());
+        setContentHandler(this);
+        InputSource input = new InputSource(new StringReader(html));
+        this.parse(input);
+    }
+
+    private void print() {
+        System.out.println("---");
+        System.out.println("TEXT: " + text.toString());
+        System.out.println("ANCHORS: ");
+        for (Anchor anchor : anchors) {
+            System.out.println("> " + anchor);
+        }
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts)
+            throws SAXException {
+        String tagName = localName;
+        System.out.println("(" + localName + " " + qName + " " + uri);
+        switch (tagName) {
+            case "BASE": {
+                String href = atts.getValue("href");
+                if (href != null && !href.isEmpty()) {
+                    // All extracted links should be relative to the href of <base> tag
+                    this.baseUrl = href;
+                }
+                break;
+            }
+            case "A": {
+                this.textState = TextType.ANCHOR_TEXT;
+                String href = atts.getValue("href");
+                if (href != null && !href.isEmpty()) {
+                    this.currentHref = href;
+                    this.currentHrefTextStart = text.length();
+                }
+                break;
+            }
+            case "IMG": {
+                String href = atts.getValue("href");
+                if (href != null && !href.isEmpty()) {
+                    images.add(href);
+                }
+                break;
+            }
+            case "NOSCRIPT":
+            case "SCRIPT":
+            case "STYLE":
+                this.textState = TextType.IGNORE;
+                break;
+            case "TITLE":
+                this.textState = TextType.TITLE;
+                break;
+            // default:
+            // this.textState = TextType.TEXT;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        String tagName = localName;
+        System.out.println(")" + localName);
+        switch (tagName) {
+            case "A":
+                if (currentHref != null && !currentHref.isEmpty()) {
+                    // TODO: validate href? unescape?
+                    anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(),
+                            anchorText.toString().trim()));
+                    currentHref = null;
+                }
+                anchorText = new StringBuilder();
+                textState = TextType.TEXT;
+                break;
+            case "TITLE":
+                break;
+            case "P":
+            case "H1":
+            case "H2":
+            case "H3":
+            case "H4":
+            case "H5":
+            case "H6":
+                text.append("\n\n");
+                break;
+            case "BR":
+                text.append('\n');
+                break;
+            default:
+                text.append(' ');
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        System.out.println(new String(ch, start, length));
+        switch (textState) {
+            case IGNORE:
+                break;
+            case TEXT:
+                text.append(ch, start, length);
+                break;
+            case ANCHOR_TEXT:
+                text.append(ch, start, length);
+                anchorText.append(ch, start, length);
+                break;
+            case TITLE:
+                title.append(ch, start, length);
+                break;
+        }
+    }
+
+    @Override
+    public void setDocumentLocator(Locator locator) {
+        // System.out.println("NekoHtmlSaxParser.setDocumentLocator()");
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        // System.out.println("NekoHtmlSaxParser.startDocument()");
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        // System.out.println("NekoHtmlSaxParser.startElement()");
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        System.out.println("NekoHtmlSaxParser.ignorableWhitespace()");
+    }
+
+    @Override
+    public void processingInstruction(String target, String data) throws SAXException {
+        System.out.println("NekoHtmlSaxParser.processingInstruction()");
+    }
+
+    @Override
+    public void skippedEntity(String name) throws SAXException {
+        System.out.println("NekoHtmlSaxParser.skippedEntity()");
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        System.out.println("NekoHtmlSaxParser.startPrefixMapping()");
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        System.out.println("NekoHtmlSaxParser.endPrefixMapping()");
+    }
+
+}

From c0f59420c2958e66557c07dea634db1e743b5f20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Fri, 17 Nov 2017 16:15:10 -0500
Subject: [PATCH 2/5] Added unit tests and implemented URL normalization and
 validation

---
 .../util/parser/HtmlSaxParser.java            | 114 ++++++++-
 .../util/parser/HtmlSaxParserTest.java        | 242 ++++++++++++++++++
 2 files changed, 350 insertions(+), 6 deletions(-)
 create mode 100644 src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java

diff --git a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
index 7fb9994c6..18487a589 100644
--- a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
+++ b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
@@ -2,18 +2,28 @@
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
 
+import org.apache.commons.validator.routines.UrlValidator;
 import org.cyberneko.html.parsers.SAXParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
 
+import focusedCrawler.crawler.crawlercommons.filters.basic.BasicURLNormalizer;
+
 public class HtmlSaxParser extends SAXParser implements ContentHandler {
 
     public static void main(String[] args) throws Exception {
@@ -54,6 +64,21 @@ enum TextType {
         TITLE, TEXT, ANCHOR_TEXT, IGNORE
     }
 
+    public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class);
+
+    private final String[] schemes = {"http", "https"};
+    private final UrlValidator urlValidator = new UrlValidator(schemes);
+
+    // ONION links aren't accepted by the validator
+    // Regex ".[^.]+" --> any string of at least 1 char without dot
+    private Pattern onionRegex = Pattern.compile("https?://.[^.]+\\.onion.*");
+
+    private static final List<String> invalidParameters =
+            Arrays.asList("sid", "phpsessid", "sessionid", "jsessionid");
+    private static final BasicURLNormalizer urlNormalizer =
+            new BasicURLNormalizer(new TreeSet<>(invalidParameters), false);
+
+
     private TextType textState = TextType.TEXT;
     private List<Anchor> anchors = new ArrayList<>();
     private List<String> images;
@@ -64,12 +89,20 @@ enum TextType {
     private String currentHref = null;
     private int currentHrefTextStart = 0;
 
-    public HtmlSaxParser(String url, String html) throws SAXException, IOException {
+    public HtmlSaxParser(URL url, String html) {
+        this(url.toString(), html);
+    }
+
+    public HtmlSaxParser(String url, String html) {
         this.baseUrl = url;
         // super.setContentHandler(new BoilerpipeHTMLContentHandler());
         setContentHandler(this);
         InputSource input = new InputSource(new StringReader(html));
-        this.parse(input);
+        try {
+            this.parse(input);
+        } catch (SAXException | IOException e) {
+            throw new RuntimeException("Failed to parse page: " + url, e);
+        }
     }
 
     private void print() {
@@ -98,10 +131,7 @@ public void startElement(String uri, String localName, String qName, Attributes
             case "A": {
                 this.textState = TextType.ANCHOR_TEXT;
                 String href = atts.getValue("href");
-                if (href != null && !href.isEmpty()) {
-                    this.currentHref = href;
-                    this.currentHrefTextStart = text.length();
-                }
+                createLink(href);
                 break;
             }
             case "IMG": {
@@ -124,6 +154,37 @@ public void startElement(String uri, String localName, String qName, Attributes
         }
     }
 
+    private void createLink(String href) {
+        String url = null;
+        if (href == null || href.isEmpty())
+            return;
+        else
+            url = href.trim();
+
+        if (url.startsWith("javacript:"))
+            return;
+
+        if (url.startsWith("mailto:")) {
+            // TODO store email
+            return;
+        }
+
+        if (url.startsWith("tel:")) {
+            // TODO store phone number
+            return;
+        }
+
+        String absoluteUrl = resolveRelativeHref(href, baseUrl);
+        if (absoluteUrl == null || absoluteUrl.isEmpty())
+            return;
+
+        if (!(urlValidator.isValid(absoluteUrl) || onionRegex.matcher(absoluteUrl).matches()))
+            return;
+
+        this.currentHref = urlNormalizer.filter(absoluteUrl);
+        this.currentHrefTextStart = text.length();
+    }
+
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
         String tagName = localName;
@@ -177,6 +238,19 @@ public void characters(char[] ch, int start, int length) throws SAXException {
         }
     }
 
+    private String resolveRelativeHref(String href, String baseUrl) {
+        URL absoluteUrl = resolveRelativeHrefToUrl(href, baseUrl);
+        return absoluteUrl == null ? null : absoluteUrl.toString();
+    }
+
+    private URL resolveRelativeHrefToUrl(String href, String baseUrl) {
+        try {
+            return new URL(new URL(baseUrl), href);
+        } catch (MalformedURLException e) {
+            throw new RuntimeException("Invalid URL: " + baseUrl + " - " + href, e);
+        }
+    }
+
     @Override
     public void setDocumentLocator(Locator locator) {
         // System.out.println("NekoHtmlSaxParser.setDocumentLocator()");
@@ -217,4 +291,32 @@ public void endPrefixMapping(String prefix) throws SAXException {
         System.out.println("NekoHtmlSaxParser.endPrefixMapping()");
     }
 
+    public URL[] links() {
+        List<URL> links = new ArrayList<>();
+        for (Anchor anchor : anchors) {
+            URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl);
+            links.add(absoluteUrl);
+        }
+        return (URL[]) links.toArray(new URL[links.size()]);
+    }
+
+
+    public LinkNeighborhood[] getLinkNeighboor() {
+        List<LinkNeighborhood> links = new ArrayList<>();
+        for (Anchor anchor : anchors) {
+            URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl);
+            LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl);
+            links.add(ln);
+        }
+        return (LinkNeighborhood[]) links.toArray(new LinkNeighborhood[links.size()]);
+    }
+
+    public URL getURL() {
+        try {
+            return new URL(baseUrl);
+        } catch (MalformedURLException e) {
+            throw new RuntimeException("Invalid URL: " + baseUrl, e);
+        }
+    }
+
 }
diff --git a/src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java b/src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java
new file mode 100644
index 000000000..1215c9984
--- /dev/null
+++ b/src/test/java/focusedCrawler/util/parser/HtmlSaxParserTest.java
@@ -0,0 +1,242 @@
+package focusedCrawler.util.parser;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.CoreMatchers.notNullValue;
+import static org.junit.Assert.assertThat;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class HtmlSaxParserTest {
+
+    @Before
+    public void setUp() throws Exception {}
+
+    @After
+    public void tearDown() throws Exception {}
+
+    @Test
+    public void htmlEncodedLinksShouldBeEscaped() throws Exception {
+        // given
+        String testString = new HtmlBuilder()
+                .withBody("<a href=\"http://ex.com/index.php?p1=asdf&amp;p2=qwer\">Anchor text.</a>")
+                .build();
+        
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString);
+        URL[] extractedLinks = pageParser.links();
+        LinkNeighborhood[] neighborhood = pageParser.getLinkNeighboor();
+
+        // then
+        assertThat(extractedLinks[0].toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer"));
+        assertThat(neighborhood[0].getLink().toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer"));
+    }
+
+    @Test
+    public void linksShouldNotContainFragments() throws Exception {
+        // given
+        String testString = new HtmlBuilder()
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<a href=\"https://en.wikipedia.org/wiki/Mouse_(computing)#Mechanical_mice\">Mouse</a>")
+                .build();
+        URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
+        
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testString);
+        URL[] extractedLinks = pageParser.links();
+        
+        // then
+        assertThat(extractedLinks.length, is(1));
+        assertThat(extractedLinks[0].toString(), is("https://en.wikipedia.org/wiki/Mouse_(computing)"));
+    }
+    
+    @Test
+    public void constructorsShouldWork() throws MalformedURLException {
+        // given
+        URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
+        String testPage = createTestPage();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        // then
+        assertThat(pageParser.getURL(), is(notNullValue()));
+    }
+    
+    @Test
+    public void shouldExtractOnionLinks() throws MalformedURLException {
+        // given
+        URL url = new URL("http://example.com/test.html");
+        String testPage = new HtmlBuilder()
+                .appendToBody("<a href = \"http://3g2asl4qw6kufc5m.onion/\">link 1</a>")
+                .appendToBody("<a href = \"http://3g2asl4qw6kufc5m.onion/test.html\">link 1</a>")
+                .build();        
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString());
+        URL[] links = pageParser.links();
+        
+        // then
+        assertThat(links.length, is(2));
+        assertThat(links[0].toString(), is("http://3g2asl4qw6kufc5m.onion/"));
+        assertThat(links[1].toString(), is("http://3g2asl4qw6kufc5m.onion/test.html"));
+    }
+    
+//    @Test
+//    public void shouldParseText() throws MalformedURLException {
+//        // given
+//        URL url = new URL("http://example.com/");
+//        StringBuilder testPage = new StringBuilder();
+//        testPage.append("<!DOCTYPE html>");
+//        testPage.append("<html>");
+//        testPage.append("<body>");
+//        testPage.append("<p>My First paragraph. My second second paragraph.</p>");
+//        testPage.append("</body>");
+//        testPage.append("</html>");
+//
+//        // when
+//        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString());
+//        String[] links = pageParser.palavras();
+//        int[] ocorrencias = pageParser.ocorrencias();
+//        System.out.println(testPage.toString());
+//        System.out.println(Arrays.deepToString(links));
+//        System.out.println(Ints.asList(ocorrencias));
+////        // then
+////        assertThat(links.length, is(1));
+////        assertThat(links[0].toString(), is("http://example.com/asdf.html"));
+////
+////        assertThat(lns.length, is(1));
+////        assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html"));
+//    }
+    
+    @Test
+    public void shouldExtractAnchoTextAndTextAroundLink() throws MalformedURLException {
+        // given
+        String url = "http://www.example.com";
+        String testPage = HtmlBuilder.newBuilder()
+                .appendToBody("<p>My First Heading</p>")
+                .appendToBody("<a href=\"http://example.com/about.html\">My first paragraph.</a>")
+                .build();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor();
+        // then
+        assertThat(neighborhoods.length, is(1));
+        
+        assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading"));
+        assertThat(neighborhoods[0].getAround()[0], is("my"));
+        assertThat(neighborhoods[0].getAround()[1], is("first"));
+        assertThat(neighborhoods[0].getAround()[2], is("heading"));
+        
+        assertThat(neighborhoods[0].getAnchorString().trim(), is("my first paragraph"));
+        assertThat(neighborhoods[0].getAnchor()[0], is("my"));
+        assertThat(neighborhoods[0].getAnchor()[1], is("first"));
+        assertThat(neighborhoods[0].getAnchor()[2], is("paragraph"));
+    }
+    
+    @Test
+    public void shouldNotExtractInvalidLinks() throws MalformedURLException {
+        // given
+        URL url = new URL("http://example.com/test.html");
+        String testPage = new HtmlBuilder()
+                .withBody(
+                          "<h1>My First Heading</h1>"
+                        + "<a href = \"http://None/\">link 0</a>"
+                        + "<a href = \"http://12324/\">link 1</a>"
+                        + "<a href = \"/asdf.html\">link 2</a>"
+                )
+                .build();        
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString());
+        URL[] links = pageParser.links();
+        LinkNeighborhood[] lns  = pageParser.getLinkNeighboor();
+        
+        // then
+        assertThat(links.length, is(1));
+        assertThat(links[0].toString(), is("http://example.com/asdf.html"));
+    
+        assertThat(lns.length, is(1));
+        assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html"));
+    }
+
+    @Test
+    public void shouldNormalizeLinks() throws MalformedURLException {
+        // given
+        URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
+        String testPage = HtmlBuilder.newBuilder()
+            .appendToBody("<h1>My First Heading</h1>")
+            .appendToBody("<a href = \"http://Example.com:80/post.php?\">Link 1.</a>")
+            .appendToBody("<a href = \"HTTP://EXAMPLE.com/post.php?b=2&a=1\">Link 2.</a>")
+            .appendToBody("<a href = \"HTTP://EXAMPLE.com\">Link 3.</a>")
+            .build();
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
+        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor();
+        URL[] links = pageParser.links();
+
+        // then
+        assertThat(neighborhoods.length, is(3));
+        assertThat(links.length, is(3));
+        
+        assertThat(neighborhoods[0].getLink().toString(), is("http://example.com/post.php?"));
+        assertThat(links[0].toString(), is("http://example.com/post.php?"));
+        
+        assertThat(neighborhoods[1].getLink().toString(), is("http://example.com/post.php?a=1&b=2"));
+        assertThat(links[1].toString(), is("http://example.com/post.php?a=1&b=2"));
+
+        assertThat(neighborhoods[2].getLink().toString(), is("http://example.com/"));
+        assertThat(links[2].toString(), is("http://example.com/"));
+    }
+
+    private String createTestPage() {
+        return HtmlBuilder.newBuilder()
+            .appendToBody("<h1>My First Heading</h1>")
+            .appendToBody("<a href=\"https://en.wikipedia.org/wiki/Mouse_(computing)#Mechanical_mice\">My first paragraph.</a>")
+            .build();
+    }
+    
+    public static class HtmlBuilder {
+        
+        private String header = "";
+        private String body = "";
+        
+        public static HtmlBuilder newBuilder() {
+            return new HtmlBuilder();
+        }
+        
+        public HtmlBuilder appendToBody(String body) {
+            this.body += body;
+            return this;
+        }
+
+        public HtmlBuilder withHeader(String header) {
+            this.header = header;
+            return this;
+        }
+
+        public HtmlBuilder withBody(String body) {
+            this.body = body;
+            return this;
+        }
+        
+        public String build() {
+            StringBuilder html = new StringBuilder();
+            html.append("<!DOCTYPE html>");
+            html.append("<html>");
+            if(header != null && !header.isEmpty()) {
+                html.append(header);
+            }
+            html.append("<body>");
+            if(body != null && !body.isEmpty()) {
+                html.append(body);
+            }
+            html.append("</body>");
+            html.append("</html>");
+            return html.toString();
+        }
+
+    }
+    
+
+}

From 60b5582d9038960e1890a0b0c8a358691ab226a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Wed, 9 May 2018 19:33:24 -0400
Subject: [PATCH 3/5] Merge with master changes

---
 src/main/java/focusedCrawler/util/Urls.java   |   7 +-
 .../util/parser/HtmlSaxParser.java            | 162 ++++++++----------
 .../focusedCrawler/util/parser/PaginaURL.java |   6 +-
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/src/main/java/focusedCrawler/util/Urls.java b/src/main/java/focusedCrawler/util/Urls.java
index 66188df8a..df9e6bed3 100644
--- a/src/main/java/focusedCrawler/util/Urls.java
+++ b/src/main/java/focusedCrawler/util/Urls.java
@@ -55,7 +55,7 @@ public static String removeFragmentsIfAny(String url) {
         return url;
     }
 
-    public static String resolveHttpLink(HttpUrl base, String link) {
+    public static HttpUrl resolveHttpLink(HttpUrl base, String link) {
         HttpUrl resolvedUrl;
         try {
             if (base == null) {
@@ -67,6 +67,11 @@ public static String resolveHttpLink(HttpUrl base, String link) {
             // The link is invalid or malformed
             resolvedUrl = null;
         }
+        return resolvedUrl;
+    }
+
+    public static String resolveHttpLinkAsString(HttpUrl base, String link) {
+        HttpUrl resolvedUrl = resolveHttpLink(base, link);
         if (resolvedUrl == null) {
             return null;
         } else {
diff --git a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
index 18487a589..8384e7858 100644
--- a/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
+++ b/src/main/java/focusedCrawler/util/parser/HtmlSaxParser.java
@@ -1,5 +1,12 @@
 package focusedCrawler.util.parser;
 
+import focusedCrawler.util.Urls;
+import okhttp3.HttpUrl;
+import org.cyberneko.html.parsers.SAXParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.*;
+
 import java.io.IOException;
 import java.io.StringReader;
 import java.net.MalformedURLException;
@@ -7,25 +14,11 @@
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
-import java.util.TreeSet;
-import java.util.regex.Pattern;
-
-import org.apache.commons.validator.routines.UrlValidator;
-import org.cyberneko.html.parsers.SAXParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.Locator;
-import org.xml.sax.SAXException;
-
-import focusedCrawler.crawler.crawlercommons.filters.basic.BasicURLNormalizer;
 
 public class HtmlSaxParser extends SAXParser implements ContentHandler {
 
+
     public static void main(String[] args) throws Exception {
 
         String url = "http://example.com";
@@ -66,23 +59,12 @@ enum TextType {
 
     public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class);
 
-    private final String[] schemes = {"http", "https"};
-    private final UrlValidator urlValidator = new UrlValidator(schemes);
-
-    // ONION links aren't accepted by the validator
-    // Regex ".[^.]+" --> any string of at least 1 char without dot
-    private Pattern onionRegex = Pattern.compile("https?://.[^.]+\\.onion.*");
-
-    private static final List<String> invalidParameters =
-            Arrays.asList("sid", "phpsessid", "sessionid", "jsessionid");
-    private static final BasicURLNormalizer urlNormalizer =
-            new BasicURLNormalizer(new TreeSet<>(invalidParameters), false);
-
 
     private TextType textState = TextType.TEXT;
     private List<Anchor> anchors = new ArrayList<>();
     private List<String> images;
-    private String baseUrl;
+    private HttpUrl base;
+
     private StringBuilder title = new StringBuilder();
     private StringBuilder text = new StringBuilder();
     private StringBuilder anchorText = new StringBuilder();
@@ -94,7 +76,7 @@ public HtmlSaxParser(URL url, String html) {
     }
 
     public HtmlSaxParser(String url, String html) {
-        this.baseUrl = url;
+        this.base = HttpUrl.parse(url);
         // super.setContentHandler(new BoilerpipeHTMLContentHandler());
         setContentHandler(this);
         InputSource input = new InputSource(new StringReader(html));
@@ -121,23 +103,38 @@ public void startElement(String uri, String localName, String qName, Attributes
         System.out.println("(" + localName + " " + qName + " " + uri);
         switch (tagName) {
             case "BASE": {
+                //
+                // Handles the BASE tag which sets the URL that should be used for resolving
+                // relative links
+                //
                 String href = atts.getValue("href");
                 if (href != null && !href.isEmpty()) {
                     // All extracted links should be relative to the href of <base> tag
-                    this.baseUrl = href;
+                    try {
+                        HttpUrl newBase = Urls.resolveHttpLink(this.base, href);
+                        if (newBase != null) {
+                            this.base = newBase;
+                        }
+                    } catch (Exception e) {
+                        // ignore invalid URLs
+                    }
                 }
                 break;
             }
             case "A": {
                 this.textState = TextType.ANCHOR_TEXT;
                 String href = atts.getValue("href");
-                createLink(href);
+                String link = createLink(this.base, href);
+                if (link != null) {
+                    this.currentHref = link;
+                    this.currentHrefTextStart = text.length();
+                }
                 break;
             }
             case "IMG": {
                 String href = atts.getValue("href");
                 if (href != null && !href.isEmpty()) {
-                    images.add(href);
+                    images.add(createLink(this.base, href));
                 }
                 break;
             }
@@ -154,35 +151,43 @@ public void startElement(String uri, String localName, String qName, Attributes
         }
     }
 
-    private void createLink(String href) {
-        String url = null;
-        if (href == null || href.isEmpty())
-            return;
-        else
+    private String createLink(HttpUrl base, String href) {
+        if (href == null || href.isEmpty()) {
+            return null;
+        }
+
+        String url = href;
+
+        if (url.startsWith(" ") || url.endsWith(" ")) {
             url = href.trim();
+        }
 
-        if (url.startsWith("javacript:"))
-            return;
+        if (url.startsWith("javascript:")) {
+            return null;
+        }
 
         if (url.startsWith("mailto:")) {
-            // TODO store email
-            return;
+            return null;
         }
 
         if (url.startsWith("tel:")) {
-            // TODO store phone number
-            return;
+            return null;
         }
 
-        String absoluteUrl = resolveRelativeHref(href, baseUrl);
-        if (absoluteUrl == null || absoluteUrl.isEmpty())
-            return;
+        if (url.startsWith("data:")) {
+            return null;
+        }
+
+        String absoluteUrl = Urls.resolveHttpLinkAsString(base, href);
+        if (absoluteUrl == null || absoluteUrl.isEmpty()) {
+            return null;
+        }
 
-        if (!(urlValidator.isValid(absoluteUrl) || onionRegex.matcher(absoluteUrl).matches()))
-            return;
+        if (!Urls.isValid(absoluteUrl)) {
+            return null;
+        }
 
-        this.currentHref = urlNormalizer.filter(absoluteUrl);
-        this.currentHrefTextStart = text.length();
+        return Urls.normalize(absoluteUrl);
     }
 
     @Override
@@ -192,7 +197,6 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
         switch (tagName) {
             case "A":
                 if (currentHref != null && !currentHref.isEmpty()) {
-                    // TODO: validate href? unescape?
                     anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(),
                             anchorText.toString().trim()));
                     currentHref = null;
@@ -238,17 +242,29 @@ public void characters(char[] ch, int start, int length) throws SAXException {
         }
     }
 
-    private String resolveRelativeHref(String href, String baseUrl) {
-        URL absoluteUrl = resolveRelativeHrefToUrl(href, baseUrl);
-        return absoluteUrl == null ? null : absoluteUrl.toString();
+    public URL[] links() {
+        List<URL> links = new ArrayList<>();
+        for (Anchor anchor : anchors) {
+            URL absoluteUrl = Urls.toJavaURL(anchor.href);
+            if (absoluteUrl != null) {
+                links.add(absoluteUrl);
+            }
+        }
+        return (URL[]) links.toArray(new URL[links.size()]);
     }
 
-    private URL resolveRelativeHrefToUrl(String href, String baseUrl) {
-        try {
-            return new URL(new URL(baseUrl), href);
-        } catch (MalformedURLException e) {
-            throw new RuntimeException("Invalid URL: " + baseUrl + " - " + href, e);
+    public LinkNeighborhood[] getLinkNeighboor() {
+        List<LinkNeighborhood> links = new ArrayList<>();
+        for (Anchor anchor : anchors) {
+            URL absoluteUrl = Urls.toJavaURL(anchor.href);
+            LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl);
+            links.add(ln);
         }
+        return (LinkNeighborhood[]) links.toArray(new LinkNeighborhood[links.size()]);
+    }
+
+    public URL getURL() {
+        return base != null ? base.url() : null;
     }
 
     @Override
@@ -291,32 +307,4 @@ public void endPrefixMapping(String prefix) throws SAXException {
         System.out.println("NekoHtmlSaxParser.endPrefixMapping()");
     }
 
-    public URL[] links() {
-        List<URL> links = new ArrayList<>();
-        for (Anchor anchor : anchors) {
-            URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl);
-            links.add(absoluteUrl);
-        }
-        return (URL[]) links.toArray(new URL[links.size()]);
-    }
-
-
-    public LinkNeighborhood[] getLinkNeighboor() {
-        List<LinkNeighborhood> links = new ArrayList<>();
-        for (Anchor anchor : anchors) {
-            URL absoluteUrl = resolveRelativeHrefToUrl(anchor.href, baseUrl);
-            LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl);
-            links.add(ln);
-        }
-        return (LinkNeighborhood[]) links.toArray(new LinkNeighborhood[links.size()]);
-    }
-
-    public URL getURL() {
-        try {
-            return new URL(baseUrl);
-        } catch (MalformedURLException e) {
-            throw new RuntimeException("Invalid URL: " + baseUrl, e);
-        }
-    }
-
 }
diff --git a/src/main/java/focusedCrawler/util/parser/PaginaURL.java b/src/main/java/focusedCrawler/util/parser/PaginaURL.java
index a94b38b9f..0263d1c71 100644
--- a/src/main/java/focusedCrawler/util/parser/PaginaURL.java
+++ b/src/main/java/focusedCrawler/util/parser/PaginaURL.java
@@ -900,7 +900,7 @@ protected void separadorTextoCodigo(String arquivo) {    // arquivo equivale ao
                             		ln.setImgSource(str);
                             	}
                             	try {
-                            		imagens.add(Urls.resolveHttpLink(base,str).toString());	
+                            		imagens.add(Urls.resolveHttpLinkAsString(base,str).toString());
 								} catch (Exception e) {
 									// TODO: handle exception
 								}
@@ -1022,7 +1022,7 @@ else if (tagName.equals("frame") && atributo.equals("src")) {
                             } else if (tagName.equals("base") && atributo.equals("href")) {
                                 try {
                                     HttpUrl oldBase = (baseUrl == null) ? null : HttpUrl.get(baseUrl);
-                                    String newBase = Urls.resolveHttpLink(oldBase, str);
+                                    String newBase = Urls.resolveHttpLinkAsString(oldBase, str);
                                     base = (newBase == null) ? null : HttpUrl.parse(newBase);
                                 } catch (Exception e) {
                                     // ignore invalid URLs
@@ -1237,7 +1237,7 @@ protected String addLink(String link, HttpUrl base) {
             return "";
         }
         link = link.trim();
-        link = Urls.resolveHttpLink(base, link);
+        link = Urls.resolveHttpLinkAsString(base, link);
         if (link == null) {
             return "";
         }

From 27e12d1628c9fa37ebe2662d70443fa91883f14b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Fri, 16 Jul 2021 23:45:44 -0400
Subject: [PATCH 4/5] Implemented extraction of the text around links

---
 .../util/parser/HtmlSaxParser.java            | 373 +++++++++++-------
 .../util/parser/HtmlSaxParserTest.java        |  39 +-
 2 files changed, 262 insertions(+), 150 deletions(-)

diff --git a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
index 01e826865..6910a8268 100644
--- a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
+++ b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
@@ -2,6 +2,11 @@
 
 import achecrawler.util.Urls;
 import okhttp3.HttpUrl;
+import org.apache.commons.io.input.CharSequenceReader;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.cyberneko.html.parsers.SAXParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -15,60 +20,27 @@
 import java.util.ArrayList;
 import java.util.List;
 
-public class HtmlSaxParser extends SAXParser implements ContentHandler {
-
-
-    public static void main(String[] args) throws Exception {
-
-        String url = "http://example.com";
-        String html =
-                "<html><meta /><body><h1><!-- my comment --><a HREF=\"index.html\">My heading 1!</a></h1><div><p>My Paragraph.</p></body></html>";
-        html = "Hello World!";
-        html = new String(Files.readAllBytes(Paths.get(
-                "ache-tools/src/test/resources/achecrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex")));
-        HtmlSaxParser parser = new HtmlSaxParser(url, html);
-        parser.print();
-    }
-
-    static class Anchor {
-
-        private String href;
-        private int textStart;
-        private int textEnd;
-        private String anchorText;
-
-        Anchor(String href, int textStart, int textEnd, String anchorText) {
-            this.href = href;
-            this.textStart = textStart;
-            this.textEnd = textEnd;
-            this.anchorText = anchorText;
-        }
 
-        @Override
-        public String toString() {
-            return "Anchor[href=" + href + ", textStart=" + textStart + ", textEnd=" + textEnd
-                    + ", text=" + anchorText + "]";
-        }
+public class HtmlSaxParser extends SAXParser implements ContentHandler {
 
-    }
+    public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class);
 
-    enum TextType {
-        TITLE, TEXT, ANCHOR_TEXT, IGNORE
-    }
+    public static final int AROUND_WORDS = 10;
 
-    public static final Logger logger = LoggerFactory.getLogger(HtmlSaxParser.class);
+    private final List<Anchor> anchors = new ArrayList<>();
+    private final List<String> images = new ArrayList<>();
+    private final List<String> tokens = new ArrayList<>();
 
+    private final StringBuilder title = new StringBuilder();
+    private final StringBuilder text = new StringBuilder();
+    private final SimpleTokenizer tokenizer = new SimpleTokenizer(new CharSequenceReader(text));
 
-    private TextType textState = TextType.TEXT;
-    private List<Anchor> anchors = new ArrayList<>();
-    private List<String> images;
     private HttpUrl base;
-
-    private StringBuilder title = new StringBuilder();
-    private StringBuilder text = new StringBuilder();
-    private StringBuilder anchorText = new StringBuilder();
+    private TextType textState = TextType.TEXT;
     private String currentHref = null;
     private int currentHrefTextStart = 0;
+    private int currentHrefTokenStart = 0;
+    private StringBuilder anchorText = new StringBuilder();
 
     public HtmlSaxParser(URL url, String html) {
         this(url.toString(), html);
@@ -86,38 +58,11 @@ public HtmlSaxParser(String url, String html) {
         }
     }
 
-    private void print() {
-        System.out.println("---");
-        System.out.println("TEXT: " + text.toString());
-        System.out.println("ANCHORS: ");
-        for (Anchor anchor : anchors) {
-            System.out.println("> " + anchor);
-        }
-    }
-
     @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts)
-            throws SAXException {
-        String tagName = localName;
-        System.out.println("(" + localName + " " + qName + " " + uri);
+    public void startElement(String uri, String tagName, String qName, Attributes atts) {
         switch (tagName) {
             case "BASE": {
-                //
-                // Handles the BASE tag which sets the URL that should be used for resolving
-                // relative links
-                //
-                String href = atts.getValue("href");
-                if (href != null && !href.isEmpty()) {
-                    // All extracted links should be relative to the href of <base> tag
-                    try {
-                        HttpUrl newBase = Urls.resolveHttpLink(this.base, href);
-                        if (newBase != null) {
-                            this.base = newBase;
-                        }
-                    } catch (Exception e) {
-                        // ignore invalid URLs
-                    }
-                }
+                handleBaseTag(atts);
                 break;
             }
             case "A": {
@@ -127,6 +72,9 @@ public void startElement(String uri, String localName, String qName, Attributes
                 if (link != null) {
                     this.currentHref = link;
                     this.currentHrefTextStart = text.length();
+
+                    this.tokenizer.tokenize();
+                    this.currentHrefTokenStart = this.tokens.size();
                 }
                 break;
             }
@@ -150,60 +98,21 @@ public void startElement(String uri, String localName, String qName, Attributes
         }
     }
 
-    private String createLink(HttpUrl base, String href) {
-        if (href == null || href.isEmpty()) {
-            return null;
-        }
-
-        String url = href;
-
-        if (url.startsWith(" ") || url.endsWith(" ")) {
-            url = href.trim();
-        }
-
-        if (url.startsWith("javascript:")) {
-            return null;
-        }
-
-        if (url.startsWith("mailto:")) {
-            return null;
-        }
-
-        if (url.startsWith("tel:")) {
-            return null;
-        }
-
-        if (url.startsWith("data:")) {
-            return null;
-        }
-
-        String absoluteUrl = Urls.resolveHttpLinkAsString(base, href);
-        if (absoluteUrl == null || absoluteUrl.isEmpty()) {
-            return null;
-        }
-
-        if (!Urls.isValid(absoluteUrl)) {
-            return null;
-        }
-
-        return Urls.normalize(absoluteUrl);
-    }
-
     @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        String tagName = localName;
-        System.out.println(")" + localName);
+    public void endElement(String uri, String tagName, String qName) {
         switch (tagName) {
             case "A":
                 if (currentHref != null && !currentHref.isEmpty()) {
+                    tokenizer.tokenize();
                     anchors.add(new Anchor(currentHref, currentHrefTextStart, text.length(),
-                            anchorText.toString().trim()));
+                            anchorText.toString().trim(), currentHrefTokenStart, tokens.size()));
                     currentHref = null;
                 }
                 anchorText = new StringBuilder();
                 textState = TextType.TEXT;
                 break;
             case "TITLE":
+                textState = TextType.IGNORE;
                 break;
             case "P":
             case "H1":
@@ -217,17 +126,37 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
             case "BR":
                 text.append('\n');
                 break;
+            case "NOSCRIPT":
+            case "SCRIPT":
+            case "STYLE":
+                this.textState = TextType.TEXT;
+                break;
             default:
                 text.append(' ');
         }
     }
 
+    /*
+     * Handles the BASE tag, which sets the URL that should be used for resolving relative links.
+     */
+    private void handleBaseTag(Attributes attributes) {
+        String href = attributes.getValue("href");
+        if (href != null && !href.isEmpty()) {
+            // The <base> href is itself resolved against the current base before replacing it.
+            try {
+                HttpUrl newBase = Urls.resolveHttpLink(this.base, href);
+                if (newBase != null) {
+                    this.base = newBase;
+                }
+            } catch (Exception e) {
+                // Best effort: an unresolvable <base> href leaves the current base unchanged.
+            }
+        }
+    }
+
     @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        System.out.println(new String(ch, start, length));
+    public void characters(char[] ch, int start, int length) {
         switch (textState) {
-            case IGNORE:
-                break;
             case TEXT:
                 text.append(ch, start, length);
                 break;
@@ -238,6 +167,8 @@ public void characters(char[] ch, int start, int length) throws SAXException {
             case TITLE:
                 title.append(ch, start, length);
                 break;
+            case IGNORE:
+                break;
         }
     }
 
@@ -252,61 +183,223 @@ public URL[] links() {
         return links.toArray(new URL[links.size()]);
     }
 
-    public LinkNeighborhood[] getLinkNeighboor() {
+    public LinkNeighborhood[] getLinkNeighborhood() {
         List<LinkNeighborhood> links = new ArrayList<>();
         for (Anchor anchor : anchors) {
             URL absoluteUrl = Urls.toJavaURL(anchor.href);
             LinkNeighborhood ln = new LinkNeighborhood(absoluteUrl);
-            // TODO:
-//            ln.setAround();
-//            ln.setAnchor();
+            ln.setAround(createAroundText(anchor));
+            ln.setAnchor(createAnchorText(anchor));
             links.add(ln);
         }
         return links.toArray(new LinkNeighborhood[links.size()]);
     }
 
+    private String[] createAnchorText(Anchor anchor) { // tokens in [tokenStart, tokenEnd)
+        final String[] anchorTokens = new String[Math.max(0, anchor.tokenEnd - anchor.tokenStart)];
+        for (int offset = 0; offset < anchorTokens.length; offset++) {
+            anchorTokens[offset] = tokens.get(anchor.tokenStart + offset);
+        }
+        return anchorTokens;
+    }
+
+    private String[] createAroundText(Anchor anchor) { // up to AROUND_WORDS tokens on each side
+        final List<String> context = new ArrayList<>();
+        int idx = Math.max(0, anchor.tokenStart - AROUND_WORDS);
+        while (idx < anchor.tokenStart) {
+            context.add(tokens.get(idx++));
+        }
+        final int limit = Math.min(tokens.size(), anchor.tokenEnd + AROUND_WORDS);
+        for (idx = anchor.tokenEnd; idx < limit; idx++) {
+            context.add(tokens.get(idx));
+        }
+        return context.toArray(new String[0]);
+    }
+
+    /**
+     * Turns the raw href of a link into a normalized absolute URL. The href is trimmed
+     * before it is inspected and resolved.
+     *
+     * @param base URL against which a relative href is resolved
+     * @param href raw attribute value; may be null and may carry surrounding whitespace
+     * @return the normalized absolute URL, or null when href is null or empty, starts
+     *         with javascript:, mailto:, tel: or data:, cannot be resolved, or is
+     *         rejected by Urls.isValid
+     */
+    private String createLink(HttpUrl base, String href) {
+        if (href == null || href.isEmpty()) {
+            return null;
+        }
+        // Trim once and hand the same trimmed value to both the scheme checks and the
+        // resolver, instead of checking one string and resolving another.
+        final String url = href.trim();
+        // These schemes are filtered out before any resolution is attempted.
+        if (url.startsWith("javascript:") || url.startsWith("mailto:")
+                || url.startsWith("tel:") || url.startsWith("data:")) {
+            return null;
+        }
+        // Resolve the trimmed href against the base.
+        final String absoluteUrl = Urls.resolveHttpLinkAsString(base, url);
+        if (absoluteUrl == null || absoluteUrl.isEmpty() || !Urls.isValid(absoluteUrl)) {
+            return null;
+        }
+        return Urls.normalize(absoluteUrl);
+    }
+
     public URL getURL() {
         return base != null ? base.url() : null;
     }
 
+    public List<String> getTokens() {
+        return this.tokens;
+    }
+
+    public String title() {
+        return this.title.toString();
+    }
+
+    private void print() {
+        // TODO: Clean up
+        System.out.println("---");
+        System.out.println("TEXT: " + text.toString());
+        System.out.println("ANCHORS: ");
+        for (Anchor anchor : anchors) {
+            System.out.println("> " + anchor);
+        }
+    }
+
     @Override
     public void setDocumentLocator(Locator locator) {
-        // System.out.println("NekoHtmlSaxParser.setDocumentLocator()");
     }
 
     @Override
-    public void startDocument() throws SAXException {
-        // System.out.println("NekoHtmlSaxParser.startDocument()");
+    public void startDocument() {
     }
 
     @Override
-    public void endDocument() throws SAXException {
-        // System.out.println("NekoHtmlSaxParser.startElement()");
+    public void endDocument() {
+        // Finish tokenization of text left over
+        this.tokenizer.tokenize();
     }
 
     @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-        System.out.println("NekoHtmlSaxParser.ignorableWhitespace()");
+    public void ignorableWhitespace(char[] ch, int start, int length) {
     }
 
     @Override
-    public void processingInstruction(String target, String data) throws SAXException {
-        System.out.println("NekoHtmlSaxParser.processingInstruction()");
+    public void processingInstruction(String target, String data) {
     }
 
     @Override
-    public void skippedEntity(String name) throws SAXException {
-        System.out.println("NekoHtmlSaxParser.skippedEntity()");
+    public void skippedEntity(String name) {
     }
 
     @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-        System.out.println("NekoHtmlSaxParser.startPrefixMapping()");
+    public void startPrefixMapping(String prefix, String uri) {
     }
 
     @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-        System.out.println("NekoHtmlSaxParser.endPrefixMapping()");
+    public void endPrefixMapping(String prefix) {
+    }
+
+    enum TextType { // value of textState; characters() switches on it to route incoming text
+        TITLE, TEXT, ANCHOR_TEXT, IGNORE
+    }
+
+    static class Anchor { // one extracted link plus its position in the text and token list
+
+        private final String href; // link returned by createLink() for the <a> href
+        private final int textStart; // text.length() recorded when the <a> element started
+        private final int textEnd; // text.length() recorded when the <a> element ended
+        private final String anchorText; // trimmed contents of the anchorText buffer
+        private final int tokenStart; // tokens.size() recorded when the <a> element started
+        private final int tokenEnd; // tokens.size() recorded when the <a> element ended
+
+        Anchor(String href, int textStart, int textEnd, String anchorText, int tokenStart, int tokenEnd) {
+            this.href = href;
+            this.textStart = textStart;
+            this.textEnd = textEnd;
+            this.anchorText = anchorText;
+            this.tokenStart = tokenStart;
+            this.tokenEnd = tokenEnd;
+        }
+
+        @Override
+        public String toString() { // NOTE: tokenStart/tokenEnd are not part of the output
+            return "Anchor[href=" + href +
+                    ", textStart=" + textStart +
+                    ", textEnd=" + textEnd +
+                    ", text=" + anchorText +
+                    "]";
+        }
     }
 
+    /**
+     * Incremental tokenizer backed by a Lucene SimpleAnalyzer token stream. Each call to
+     * tokenize() drains whatever tokens the stream can currently produce and appends
+     * them to the enclosing parser's token list, so it is called repeatedly while the
+     * document is parsed rather than once at the end.
+     * NOTE(review): this relies on the stream yielding more tokens after text is appended
+     * to the underlying reader's source; confirm against the Lucene TokenStream contract.
+     */
+    public class SimpleTokenizer {
+
+        private final TokenStream ts;
+        private final CharTermAttribute cattr;
+
+        /**
+         * Opens a SimpleAnalyzer token stream over the given reader and resets it.
+         *
+         * @param cleanText reader over the text to tokenize
+         * @throws RuntimeException if resetting the stream fails with an IOException
+         */
+        public SimpleTokenizer(CharSequenceReader cleanText) {
+            // TODO: setup a good general tokenizer. Alternatives sketched earlier:
+            // StandardAnalyzer with ENGLISH_STOP_WORDS_SET, or a custom Analyzer built on
+            // StandardTokenizer (max token length 255) with LowerCaseFilter/StopFilter.
+            Analyzer analyzer = new SimpleAnalyzer();
+            ts = analyzer.tokenStream("cleanText", cleanText);
+            cattr = ts.addAttribute(CharTermAttribute.class);
+            try {
+                ts.reset();
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        /**
+         * Appends every token the stream can currently produce to the parser's tokens.
+         *
+         * @throws RuntimeException wrapping the IOException if reading the stream fails
+         */
+        public void tokenize() {
+            try {
+                while (ts.incrementToken()) {
+                    String token = cattr.toString();
+                    HtmlSaxParser.this.tokens.add(token);
+                }
+            } catch (IOException e) {
+                throw new RuntimeException("Tokenization failed", e);
+            }
+        }
+    }
+
+    // TODO: Clean up
+    /**
+     * Manual smoke test: parses a saved copy of the DARPA Memex page and prints the
+     * tokens found around every extracted link.
+     */
+    public static void main(String[] args) throws Exception {
+        String url = "http://www.darpa.mil/program/memex";
+        // Decode explicitly (sample assumed UTF-8, TODO confirm), not with the platform default.
+        String html = new String(Files.readAllBytes(Paths.get(
+                "ache-tools/src/test/resources/achecrawler/memex/cdr/http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex")),
+                java.nio.charset.StandardCharsets.UTF_8);
+        HtmlSaxParser parser = new HtmlSaxParser(url, html);
+        final LinkNeighborhood[] neighborhoods = parser.getLinkNeighborhood();
+        for (LinkNeighborhood n : neighborhoods) {
+            System.out.println("> Around: " + n.getLink().toString());
+            System.out.println(n.getAroundString());
+        }
+    }
 }
diff --git a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
index ea1c48b57..20791657a 100644
--- a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
+++ b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
@@ -7,12 +7,25 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 
-import org.junit.After;
-import org.junit.Before;
 import org.junit.Test;
 
 public class HtmlSaxParserTest {
 
+    @Test
+    public void shouldExtractTitle() throws Exception {
+        // given
+        String testString = new HtmlBuilder()
+                .withHeader("<title>ACHE Crawler \n \t</title>")
+                .withBody("<p>My text</p>")
+                .build();
+
+        // when
+        HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString);
+
+        // then
+        assertThat(pageParser.title().trim(), is("ACHE Crawler"));
+    }
+
     @Test
     public void htmlEncodedLinksShouldBeEscaped() throws Exception {
         // given
@@ -23,7 +36,7 @@ public void htmlEncodedLinksShouldBeEscaped() throws Exception {
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString);
         URL[] extractedLinks = pageParser.links();
-        LinkNeighborhood[] neighborhood = pageParser.getLinkNeighboor();
+        LinkNeighborhood[] neighborhood = pageParser.getLinkNeighborhood();
 
         // then
         assertThat(extractedLinks[0].toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer"));
@@ -110,25 +123,31 @@ public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLExcept
         String url = "http://www.example.com";
         String testPage = HtmlBuilder.newBuilder()
                 .appendToBody("<p>My First Heading</p>")
-                .appendToBody("<a href=\"http://example.com/about.html\">My first paragraph.</a>")
+                .appendToBody("<a href=\"http://example.com/about.html\">My first anchor text.</a>")
+//                .appendToBody("<a href=\"http://example.com/about.html\">my second anchor text.</a>")
+                .appendToBody("<p>my paragraph.</p>")
+                .appendToBody("free text")
                 .build();
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
-        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor();
+        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood();
 //        PaginaURL pageParser = new PaginaURL(new URL(url), testPage);
 //        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor();
+//        System.out.println("tokens = " + pageParser.getTokens());
+
         // then
         assertThat(neighborhoods.length, is(1));
         
-        assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading"));
+        assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading my paragraph free text"));
         assertThat(neighborhoods[0].getAround()[0], is("my"));
         assertThat(neighborhoods[0].getAround()[1], is("first"));
         assertThat(neighborhoods[0].getAround()[2], is("heading"));
         
-        assertThat(neighborhoods[0].getAnchorString().trim(), is("my first paragraph"));
+        assertThat(neighborhoods[0].getAnchorString().trim(), is("my first anchor text"));
         assertThat(neighborhoods[0].getAnchor()[0], is("my"));
         assertThat(neighborhoods[0].getAnchor()[1], is("first"));
-        assertThat(neighborhoods[0].getAnchor()[2], is("paragraph"));
+        assertThat(neighborhoods[0].getAnchor()[2], is("anchor"));
+        assertThat(neighborhoods[0].getAnchor()[3], is("text"));
     }
     
     @Test
@@ -146,7 +165,7 @@ public void shouldNotExtractInvalidLinks() throws MalformedURLException {
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString());
         URL[] links = pageParser.links();
-        LinkNeighborhood[] lns  = pageParser.getLinkNeighboor();
+        LinkNeighborhood[] lns  = pageParser.getLinkNeighborhood();
         
         // then
         assertThat(links.length, is(1));
@@ -168,7 +187,7 @@ public void shouldNormalizeLinks() throws MalformedURLException {
             .build();
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
-        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighboor();
+        LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood();
         URL[] links = pageParser.links();
 
         // then

From de8fe4ca8fb3937ede6ec7e378e0fc96b4056bf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Sat, 17 Jul 2021 12:56:28 -0400
Subject: [PATCH 5/5] Add HTML entity test and clean code

---
 .../util/parser/HtmlSaxParser.java            |  9 +-
 .../util/parser/HtmlSaxParserTest.java        | 98 +++++++++++--------
 2 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
index 6910a8268..f458dac34 100644
--- a/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
+++ b/ache/src/main/java/achecrawler/util/parser/HtmlSaxParser.java
@@ -100,6 +100,7 @@ public void startElement(String uri, String tagName, String qName, Attributes at
 
     @Override
     public void endElement(String uri, String tagName, String qName) {
+        // TODO: extract data from <meta> tags (e.g., description, keywords, noindex, nofollow)
         switch (tagName) {
             case "A":
                 if (currentHref != null && !currentHref.isEmpty()) {
@@ -112,7 +113,7 @@ public void endElement(String uri, String tagName, String qName) {
                 textState = TextType.TEXT;
                 break;
             case "TITLE":
-                textState = TextType.IGNORE;
+                textState = TextType.TEXT;
                 break;
             case "P":
             case "H1":
@@ -250,7 +251,7 @@ public URL getURL() {
         return base != null ? base.url() : null;
     }
 
-    public List<String> getTokens() {
+    public List<String> tokens() {
         return this.tokens;
     }
 
@@ -258,6 +259,10 @@ public String title() {
         return this.title.toString();
     }
 
+    public String text() { // current contents of the text buffer, as a String
+        return this.text.toString();
+    }
+
     private void print() {
         // TODO: Clean up
         System.out.println("---");
diff --git a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
index 20791657a..96fb4e665 100644
--- a/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
+++ b/ache/src/test/java/achecrawler/util/parser/HtmlSaxParserTest.java
@@ -12,7 +12,7 @@
 public class HtmlSaxParserTest {
 
     @Test
-    public void shouldExtractTitle() throws Exception {
+    public void shouldExtractTitle() {
         // given
         String testString = new HtmlBuilder()
                 .withHeader("<title>ACHE Crawler \n \t</title>")
@@ -27,12 +27,28 @@ public void shouldExtractTitle() throws Exception {
     }
 
     @Test
-    public void htmlEncodedLinksShouldBeEscaped() throws Exception {
+    public void shouldCleanHtmlEntities() {
+        // given: a title and a paragraph with named (&gt; &nbsp; &amp; &euro;) and numeric (&#169;) entities
+        String testString = new HtmlBuilder()
+                .withHeader("<title>ACHE &gt; domain specific search &#169;</title>")
+                .withBody("<p>My&nbsp;text &amp; me. &euro;</p>")
+                .build();
+
+        // when: the page is parsed
+        HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString);
+
+        // then: entities are decoded; non-ASCII expectations use Unicode escapes so the source encoding is irrelevant
+        assertThat(pageParser.title(), is("ACHE > domain specific search \u00A9"));
+        assertThat(pageParser.text().trim(), is("My\u00A0text & me. \u20AC"));
+    }
+
+    @Test
+    public void htmlEncodedLinksShouldBeEscaped() {
         // given
         String testString = new HtmlBuilder()
                 .withBody("<a href=\"http://ex.com/index.php?p1=asdf&amp;p2=qwer\">Anchor text.</a>")
                 .build();
-        
+
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser("http://ex.com/index.html", testString);
         URL[] extractedLinks = pageParser.links();
@@ -44,23 +60,23 @@ public void htmlEncodedLinksShouldBeEscaped() throws Exception {
     }
 
     @Test
-    public void linksShouldNotContainFragments() throws Exception {
+    public void linksShouldNotContainFragments() throws MalformedURLException {
         // given
         String testString = new HtmlBuilder()
                 .appendToBody("<h1>My First Heading</h1>")
                 .appendToBody("<a href=\"https://en.wikipedia.org/wiki/Mouse_(computing)#Mechanical_mice\">Mouse</a>")
                 .build();
         URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
-        
+
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testString);
         URL[] extractedLinks = pageParser.links();
-        
+
         // then
         assertThat(extractedLinks.length, is(1));
         assertThat(extractedLinks[0].toString(), is("https://en.wikipedia.org/wiki/Mouse_(computing)"));
     }
-    
+
     @Test
     public void constructorsShouldWork() throws MalformedURLException {
         // given
@@ -71,7 +87,7 @@ public void constructorsShouldWork() throws MalformedURLException {
         // then
         assertThat(pageParser.getURL(), is(notNullValue()));
     }
-    
+
     @Test
     public void shouldExtractOnionLinks() throws MalformedURLException {
         // given
@@ -79,17 +95,17 @@ public void shouldExtractOnionLinks() throws MalformedURLException {
         String testPage = new HtmlBuilder()
                 .appendToBody("<a href = \"http://3g2asl4qw6kufc5m.onion/\">link 1</a>")
                 .appendToBody("<a href = \"http://3g2asl4qw6kufc5m.onion/test.html\">link 1</a>")
-                .build();        
+                .build();
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString());
         URL[] links = pageParser.links();
-        
+
         // then
         assertThat(links.length, is(2));
         assertThat(links[0].toString(), is("http://3g2asl4qw6kufc5m.onion/"));
         assertThat(links[1].toString(), is("http://3g2asl4qw6kufc5m.onion/test.html"));
     }
-    
+
 //    @Test
 //    public void shouldParseText() throws MalformedURLException {
 //        // given
@@ -116,7 +132,7 @@ public void shouldExtractOnionLinks() throws MalformedURLException {
 ////        assertThat(lns.length, is(1));
 ////        assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html"));
 //    }
-    
+
     @Test
     public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLException {
         // given
@@ -137,40 +153,40 @@ public void shouldExtractAnchorTextAndTextAroundLink() throws MalformedURLExcept
 
         // then
         assertThat(neighborhoods.length, is(1));
-        
+
         assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading my paragraph free text"));
         assertThat(neighborhoods[0].getAround()[0], is("my"));
         assertThat(neighborhoods[0].getAround()[1], is("first"));
         assertThat(neighborhoods[0].getAround()[2], is("heading"));
-        
+
         assertThat(neighborhoods[0].getAnchorString().trim(), is("my first anchor text"));
         assertThat(neighborhoods[0].getAnchor()[0], is("my"));
         assertThat(neighborhoods[0].getAnchor()[1], is("first"));
         assertThat(neighborhoods[0].getAnchor()[2], is("anchor"));
         assertThat(neighborhoods[0].getAnchor()[3], is("text"));
     }
-    
+
     @Test
     public void shouldNotExtractInvalidLinks() throws MalformedURLException {
         // given
         URL url = new URL("http://example.com/test.html");
         String testPage = new HtmlBuilder()
                 .withBody(
-                          "<h1>My First Heading</h1>"
-                        + "<a href = \"http://None/\">link 0</a>"
-                        + "<a href = \"http://12324/\">link 1</a>"
-                        + "<a href = \"/asdf.html\">link 2</a>"
+                        "<h1>My First Heading</h1>"
+                                + "<a href = \"http://None/\">link 0</a>"
+                                + "<a href = \"http://12324/\">link 1</a>"
+                                + "<a href = \"/asdf.html\">link 2</a>"
                 )
-                .build();        
+                .build();
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage.toString());
         URL[] links = pageParser.links();
-        LinkNeighborhood[] lns  = pageParser.getLinkNeighborhood();
-        
+        LinkNeighborhood[] lns = pageParser.getLinkNeighborhood();
+
         // then
         assertThat(links.length, is(1));
         assertThat(links[0].toString(), is("http://example.com/asdf.html"));
-    
+
         assertThat(lns.length, is(1));
         assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html"));
     }
@@ -180,11 +196,11 @@ public void shouldNormalizeLinks() throws MalformedURLException {
         // given
         URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
         String testPage = HtmlBuilder.newBuilder()
-            .appendToBody("<h1>My First Heading</h1>")
-            .appendToBody("<a href = \"http://Example.com:80/post.php?\">Link 1.</a>")
-            .appendToBody("<a href = \"HTTP://EXAMPLE.com/post.php?b=2&a=1\">Link 2.</a>")
-            .appendToBody("<a href = \"HTTP://EXAMPLE.com\">Link 3.</a>")
-            .build();
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<a href = \"http://Example.com:80/post.php?\">Link 1.</a>")
+                .appendToBody("<a href = \"HTTP://EXAMPLE.com/post.php?b=2&a=1\">Link 2.</a>")
+                .appendToBody("<a href = \"HTTP://EXAMPLE.com\">Link 3.</a>")
+                .build();
         // when
         HtmlSaxParser pageParser = new HtmlSaxParser(url, testPage);
         LinkNeighborhood[] neighborhoods = pageParser.getLinkNeighborhood();
@@ -193,10 +209,10 @@ public void shouldNormalizeLinks() throws MalformedURLException {
         // then
         assertThat(neighborhoods.length, is(3));
         assertThat(links.length, is(3));
-        
+
         assertThat(neighborhoods[0].getLink().toString(), is("http://example.com/post.php?"));
         assertThat(links[0].toString(), is("http://example.com/post.php?"));
-        
+
         assertThat(neighborhoods[1].getLink().toString(), is("http://example.com/post.php?a=1&b=2"));
         assertThat(links[1].toString(), is("http://example.com/post.php?a=1&b=2"));
 
@@ -206,20 +222,20 @@ public void shouldNormalizeLinks() throws MalformedURLException {
 
     private String createTestPage() {
         return HtmlBuilder.newBuilder()
-            .appendToBody("<h1>My First Heading</h1>")
-            .appendToBody("<a href=\"https://en.wikipedia.org/wiki/Mouse_(computing)#Mechanical_mice\">My first paragraph.</a>")
-            .build();
+                .appendToBody("<h1>My First Heading</h1>")
+                .appendToBody("<a href=\"https://en.wikipedia.org/wiki/Mouse_(computing)#Mechanical_mice\">My first paragraph.</a>")
+                .build();
     }
-    
+
     public static class HtmlBuilder {
-        
+
         private String header = "";
         private String body = "";
-        
+
         public static HtmlBuilder newBuilder() {
             return new HtmlBuilder();
         }
-        
+
         public HtmlBuilder appendToBody(String body) {
             this.body += body;
             return this;
@@ -234,16 +250,16 @@ public HtmlBuilder withBody(String body) {
             this.body = body;
             return this;
         }
-        
+
         public String build() {
             StringBuilder html = new StringBuilder();
             html.append("<!DOCTYPE html>");
             html.append("<html>");
-            if(header != null && !header.isEmpty()) {
+            if (header != null && !header.isEmpty()) {
                 html.append(header);
             }
             html.append("<body>");
-            if(body != null && !body.isEmpty()) {
+            if (body != null && !body.isEmpty()) {
                 html.append(body);
             }
             html.append("</body>");
@@ -252,6 +268,6 @@ public String build() {
         }
 
     }
-    
+
 
 }