ExtractorHTML: Add obeyRelNofollow option
When enabled, this option causes regular links annotated with rel=nofollow not to be extracted. This is useful for sites that use rel=nofollow to hint at crawler traps.
ato committed Jan 13, 2025
1 parent 4c4510a commit 4e8bda1
Showing 5 changed files with 70 additions and 11 deletions.
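
The XML and Groovy configuration snippets below gain a commented-out obeyRelNofollow property; uncommenting it and setting the value to true turns the behaviour on. As a complement, here is a minimal sketch (not part of the commit) of toggling the option programmatically through the accessors this change introduces, assuming direct access to an ExtractorHTML instance:

import org.archive.modules.extractor.ExtractorHTML;

public class NofollowConfigSketch {
    public static void main(String[] args) {
        // Constructing the extractor directly here only to illustrate the accessors;
        // in a crawl it is normally defined as a Spring bean.
        ExtractorHTML extractor = new ExtractorHTML();
        System.out.println(extractor.getObeyRelNofollow()); // default is false
        extractor.setObeyRelNofollow(true); // <a> links carrying rel=nofollow are now skipped
    }
}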
@@ -306,6 +306,7 @@ http://example.example/example
<!-- <property name="maxElementLength" value="1024" /> -->
<!-- <property name="maxAttributeNameLength" value="1024" /> -->
<!-- <property name="maxAttributeValueLength" value="16384" /> -->
<!-- <property name="obeyRelNofollow" value="false" /> -->
</bean>
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
</bean>
@@ -281,6 +281,7 @@ http://example.example/example
// maxElementLength = 1024
// maxAttributeNameLength = 1024
// maxAttributeValueLength = 16384
// obeyRelNofollow = false
}
extractorCss(ExtractorCSS)
extractorJs(ExtractorJS)
@@ -329,6 +329,19 @@ public boolean getIgnoreUnexpectedHtml() {
public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
kp.put("ignoreUnexpectedHtml",ignoreUnexpectedHtml);
}

{
setObeyRelNofollow(false);
}
public boolean getObeyRelNofollow() {
return (Boolean) kp.get("obeyRelNofollow");
}
/**
 * If true, links containing the "rel=nofollow" directive will not be extracted.
*/
public void setObeyRelNofollow(boolean obeyRelNofollow) {
kp.put("obeyRelNofollow", obeyRelNofollow);
}

/**
* CrawlMetadata provides the robots honoring policy to use when
@@ -397,9 +410,10 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
CharSequence valueContext = null;
CharSequence nameVal = null;

// Just in case it's a LINK tag
// Just in case it's an A or LINK tag
CharSequence linkHref = null;
CharSequence linkRel = null;
CharSequence linkContext = null;

final boolean framesAsEmbeds =
getTreatFramesAsEmbedLinks();
@@ -409,7 +423,7 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,

final boolean extractValueAttributes =
getExtractValueAttributes();

final String elementStr = element.toString();

while (attr.find()) {
@@ -430,10 +444,13 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
} else {
context = elementContext(element, attr.group(2));
}


if (elementStr.equalsIgnoreCase(LINK)) {
// delay handling LINK until the end as we need both HREF and REL
if ((elementStr.equalsIgnoreCase(LINK) || elementStr.equalsIgnoreCase("a"))
&& linkHref == null) {
// delay handling A and LINK until the end as we need both HREF and REL
linkHref = value;
linkContext = context;
} else if ("a[data-remote='true']/@href".equals(context)) {
processEmbed(curi, value, context);
} else {
@@ -595,8 +612,19 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
}

// finish handling LINK now both HREF and REL should be available
if (linkHref != null && linkRel != null) {
processLinkTagWithRel(curi, linkHref, linkRel);
if (linkHref != null) {
if (elementStr.equalsIgnoreCase(LINK)) {
if (linkRel != null) {
processLinkTagWithRel(curi, linkHref, linkRel);
}
} else {
if (linkRel != null && getObeyRelNofollow()
&& TextUtils.matches("(?i).*\\bnofollow\\b.*", linkRel)) {
if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + linkHref);
} else {
processLink(curi, linkHref, linkContext);
}
}
}

// finish handling form action, now method is available
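
In the change above, the rel attribute is tested against the case-insensitive, word-boundary pattern (?i).*\bnofollow\b.* (via TextUtils.matches), so nofollow is recognised anywhere inside a space-separated rel value but not as a substring of a longer token. A small illustrative sketch (not part of the commit) using the JDK regex API directly to show which values match:

import java.util.regex.Pattern;

public class NofollowPatternSketch {
    public static void main(String[] args) {
        Pattern nofollow = Pattern.compile("(?i).*\\bnofollow\\b.*");
        System.out.println(nofollow.matcher("nofollow").matches());          // true
        System.out.println(nofollow.matcher("noopener NOFOLLOW").matches()); // true: case-insensitive, any position
        System.out.println(nofollow.matcher("nofollowed").matches());        // false: \b requires a whole token
    }
}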
@@ -142,6 +142,13 @@ protected void processGeneralTag(CrawlURI curi, Element element,
if (rel != null) {
processLinkTagWithRel(curi, attrValue, rel);
}
} else if ("a".equals(elementName)) {
String rel = attributes.getValue("rel");
if (rel != null && getObeyRelNofollow() && TextUtils.matches("(?i).*\\bnofollow\\b.*", rel)) {
if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + attrValue);
} else {
processLink(curi, attrValue, context);
}
} else {
// other HREFs treated as links
processLink(curi, attrValue, context);
@@ -19,11 +19,8 @@

package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
@@ -723,6 +720,31 @@ public void testLinkRel() throws URIException {
assertEquals(expectedLinks, actualLinks);
}

public void testDisobeyRelNofollow() throws URIException {
String html = "<a href=/normal><a href=/nofollow rel=nofollow><a href=/both><a href=/both rel=nofollow>";
CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));
getExtractor().setObeyRelNofollow(false);
getExtractor().extract(curi, html);
Set<String> links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet());
assertEquals(Set.of("https://www.example.org/both",
"https://www.example.org/normal",
"https://www.example.org/nofollow"), links);
}

public void testRelNofollow() throws URIException {
String html = "<a href=/normal></a><a href=/nofollow rel=nofollow></a><a href=/both></a>" +
"<a href=/both rel=nofollow></a>" +
"<a href=/multi1 rel='noopener nofollow'></a>" +
"<a href=/multi2 rel=\"nofollow nopener\"></a>" +
"<a href=/multi3 rel='noopener nofollow noentry'></a>";
CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));
getExtractor().setObeyRelNofollow(true);
getExtractor().extract(curi, html);
Set<String> links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet());
assertEquals(Set.of("https://www.example.org/both",
"https://www.example.org/normal"), links);
}

private void genericCrawl(CrawlURI curi, CharSequence cs,String[] dest){
getExtractor().extract(curi, cs);
