diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml index 83e71c03d..8aa9f7ab6 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml @@ -306,6 +306,7 @@ http://example.example/example + diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy index e55785585..71f3e5fb5 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy @@ -281,6 +281,7 @@ http://example.example/example // maxElementLength = 1024 // maxAttributeNameLength = 1024 // maxAttributeValueLength = 16384 + // obeyRelNofollow = false } extractorCss(ExtractorCSS) extractorJs(ExtractorJS) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index 88827d678..41fbe2c8d 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -329,6 +329,19 @@ public boolean getIgnoreUnexpectedHtml() { public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) { kp.put("ignoreUnexpectedHtml",ignoreUnexpectedHtml); } + + { + setObeyRelNofollow(false); + } + public boolean getObeyRelNofollow() { + return (Boolean) kp.get("obeyRelNofollow"); + } + /** + * If true, links whose "rel" attribute contains the "nofollow" directive will not be extracted. 
+ */ + public void setObeyRelNofollow(boolean obeyRelNofollow) { + kp.put("obeyRelNofollow", obeyRelNofollow); + } /** * CrawlMetadata provides the robots honoring policy to use when @@ -397,9 +410,10 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence valueContext = null; CharSequence nameVal = null; - // Just in case it's a LINK tag + // Just in case it's an A or LINK tag CharSequence linkHref = null; CharSequence linkRel = null; + CharSequence linkContext = null; final boolean framesAsEmbeds = getTreatFramesAsEmbedLinks(); @@ -409,7 +423,7 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element, final boolean extractValueAttributes = getExtractValueAttributes(); - + final String elementStr = element.toString(); while (attr.find()) { @@ -430,10 +444,13 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element, } else { context = elementContext(element, attr.group(2)); } + - if (elementStr.equalsIgnoreCase(LINK)) { - // delay handling LINK until the end as we need both HREF and REL + if ((elementStr.equalsIgnoreCase(LINK) || elementStr.equalsIgnoreCase("a")) + && linkHref == null) { + // delay handling A and LINK until the end as we need both HREF and REL linkHref = value; + linkContext = context; } else if ("a[data-remote='true']/@href".equals(context)) { processEmbed(curi, value, context); } else { @@ -595,8 +612,19 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element, } // finish handling LINK now both HREF and REL should be available - if (linkHref != null && linkRel != null) { - processLinkTagWithRel(curi, linkHref, linkRel); + if (linkHref != null) { + if (elementStr.equalsIgnoreCase(LINK)) { + if (linkRel != null) { + processLinkTagWithRel(curi, linkHref, linkRel); + } + } else { + if (linkRel != null && getObeyRelNofollow() + && TextUtils.matches("(?i).*\\bnofollow\\b.*", linkRel)) { + if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + linkHref); 
+ } else { + processLink(curi, linkHref, linkContext); + } + } } // finish handling form action, now method is available diff --git a/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java index 17cad0e84..a24ef5163 100644 --- a/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java @@ -142,6 +142,13 @@ protected void processGeneralTag(CrawlURI curi, Element element, if (rel != null) { processLinkTagWithRel(curi, attrValue, rel); } + } else if ("a".equals(elementName)) { + String rel = attributes.getValue("rel"); + if (rel != null && getObeyRelNofollow() && TextUtils.matches("(?i).*\\bnofollow\\b.*", rel)) { + if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + attrValue); + } else { + processLink(curi, attrValue, context); + } } else { // other HREFs treated as links processLink(curi, attrValue, context); diff --git a/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java b/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java index 25f1609a1..b687c0e86 100644 --- a/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java +++ b/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java @@ -19,11 +19,8 @@ package org.archive.modules.extractor; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.Predicate; @@ -723,6 +720,31 @@ public void testLinkRel() throws URIException { assertEquals(expectedLinks, actualLinks); } + public void testDisobeyRelNofollow() throws URIException { + String html = ""; + CrawlURI curi = new 
CrawlURI(UURIFactory.getInstance("https://www.example.org/")); + getExtractor().setObeyRelNofollow(false); + getExtractor().extract(curi, html); + Set links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet()); + assertEquals(Set.of("https://www.example.org/both", + "https://www.example.org/normal", + "https://www.example.org/nofollow"), links); + } + + public void testRelNofollow() throws URIException { + String html = "" + + "" + + "" + + "" + + ""; + CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/")); + getExtractor().setObeyRelNofollow(true); + getExtractor().extract(curi, html); + Set links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet()); + assertEquals(Set.of("https://www.example.org/both", + "https://www.example.org/normal"), links); + } + private void genericCrawl(CrawlURI curi, CharSequence cs,String[] dest){ getExtractor().extract(curi, cs);