diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
index 83e71c03d..8aa9f7ab6 100644
--- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
+++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
@@ -306,6 +306,7 @@ http://example.example/example
+  <!-- <property name="obeyRelNofollow" value="false" /> -->
diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy
index e55785585..71f3e5fb5 100644
--- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy
+++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.groovy
@@ -281,6 +281,7 @@ http://example.example/example
// maxElementLength = 1024
// maxAttributeNameLength = 1024
// maxAttributeValueLength = 16384
+ // obeyRelNofollow = false
}
extractorCss(ExtractorCSS)
extractorJs(ExtractorJS)
diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java
index 88827d678..41fbe2c8d 100644
--- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java
+++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java
@@ -329,6 +329,19 @@ public boolean getIgnoreUnexpectedHtml() {
public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
kp.put("ignoreUnexpectedHtml",ignoreUnexpectedHtml);
}
+
+ { // instance initializer: registers the default value for the obeyRelNofollow setting
+ setObeyRelNofollow(false); // default: rel="nofollow" is not obeyed
+ }
+ public boolean getObeyRelNofollow() { // reads the "obeyRelNofollow" entry stored in kp
+ return (Boolean) kp.get("obeyRelNofollow");
+ }
+ /**
+ * If true, links in A elements whose rel attribute contains "nofollow" will not be extracted (default: false).
+ */
+ public void setObeyRelNofollow(boolean obeyRelNofollow) {
+ kp.put("obeyRelNofollow", obeyRelNofollow);
+ }
/**
* CrawlMetadata provides the robots honoring policy to use when
@@ -397,9 +410,10 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
CharSequence valueContext = null;
CharSequence nameVal = null;
- // Just in case it's a LINK tag
+ // Just in case it's an A or LINK tag
CharSequence linkHref = null;
CharSequence linkRel = null;
+ CharSequence linkContext = null;
final boolean framesAsEmbeds =
getTreatFramesAsEmbedLinks();
@@ -409,7 +423,7 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
final boolean extractValueAttributes =
getExtractValueAttributes();
-
+
final String elementStr = element.toString();
while (attr.find()) {
@@ -430,10 +444,13 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
} else {
context = elementContext(element, attr.group(2));
}
+
- if (elementStr.equalsIgnoreCase(LINK)) {
- // delay handling LINK until the end as we need both HREF and REL
+ if ((elementStr.equalsIgnoreCase(LINK) || elementStr.equalsIgnoreCase("a"))
+ && linkHref == null) {
+ // delay handling A and LINK until the end as we need both HREF and REL
linkHref = value;
+ linkContext = context;
} else if ("a[data-remote='true']/@href".equals(context)) {
processEmbed(curi, value, context);
} else {
@@ -595,8 +612,19 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
}
// finish handling LINK now both HREF and REL should be available
- if (linkHref != null && linkRel != null) {
- processLinkTagWithRel(curi, linkHref, linkRel);
+ if (linkHref != null) {
+ if (elementStr.equalsIgnoreCase(LINK)) {
+ if (linkRel != null) {
+ processLinkTagWithRel(curi, linkHref, linkRel);
+ }
+ } else {
+ if (linkRel != null && getObeyRelNofollow()
+ && TextUtils.matches("(?i).*\\bnofollow\\b.*", linkRel)) {
+ if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + linkHref);
+ } else {
+ processLink(curi, linkHref, linkContext);
+ }
+ }
}
// finish handling form action, now method is available
diff --git a/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java
index 17cad0e84..a24ef5163 100644
--- a/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java
+++ b/modules/src/main/java/org/archive/modules/extractor/JerichoExtractorHTML.java
@@ -142,6 +142,13 @@ protected void processGeneralTag(CrawlURI curi, Element element,
if (rel != null) {
processLinkTagWithRel(curi, attrValue, rel);
}
+ } else if ("a".equals(elementName)) {
+ String rel = attributes.getValue("rel");
+ if (rel != null && getObeyRelNofollow() && TextUtils.matches("(?i).*\\bnofollow\\b.*", rel)) {
+ if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + attrValue);
+ } else {
+ processLink(curi, attrValue, context);
+ }
} else {
// other HREFs treated as links
processLink(curi, attrValue, context);
diff --git a/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java b/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java
index 25f1609a1..b687c0e86 100644
--- a/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java
+++ b/modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java
@@ -19,11 +19,8 @@
package org.archive.modules.extractor;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
@@ -723,6 +720,31 @@ public void testLinkRel() throws URIException {
assertEquals(expectedLinks, actualLinks);
}
+    public void testDisobeyRelNofollow() throws URIException { // option off: rel=nofollow links are still extracted
+        String html = "<a href='/normal'>normal</a><a href='/nofollow' rel='nofollow'>nofollow</a><a href='/both'>both</a><a href='/both' rel='nofollow'>both</a>";
+        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));
+        getExtractor().setObeyRelNofollow(false);
+        getExtractor().extract(curi, html);
+        Set<String> links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet());
+        assertEquals(Set.of("https://www.example.org/both",
+                "https://www.example.org/normal",
+                "https://www.example.org/nofollow"), links);
+    }
+
+    public void testRelNofollow() throws URIException { // option on: only links with at least one non-nofollow anchor survive
+        String html = "<a href='/normal'>normal</a>" +
+                "<a href='/nofollow' rel='nofollow'>nofollow</a>" +
+                "<a href='/both'>both</a>" + // /both is linked once plainly and once with nofollow, so it is kept
+                "<a href='/both' rel='nofollow'>both</a>" +
+                "<a href='/nofollow2' rel='noopener NoFollow'>nofollow2</a>"; // multi-token, mixed-case rel
+        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));
+        getExtractor().setObeyRelNofollow(true);
+        getExtractor().extract(curi, html);
+        Set<String> links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet());
+        assertEquals(Set.of("https://www.example.org/both",
+                "https://www.example.org/normal"), links);
+    }
+
private void genericCrawl(CrawlURI curi, CharSequence cs,String[] dest){
getExtractor().extract(curi, cs);