ExtractorHTML: Add obeyRelNofollow option
When enabled, this option causes regular links annotated with rel=nofollow not to be extracted. This is useful for sites that use rel=nofollow to hint at crawler traps.
ato committed Jan 13, 2025
1 parent 4c4510a commit 4e8bda1
Showing 5 changed files with 70 additions and 11 deletions.
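
The XML and Groovy configuration snippets below gain a commented-out obeyRelNofollow property; uncommenting it and setting the value to true turns the behaviour on. As a complement, here is a minimal sketch (not part of the commit) of toggling the option programmatically through the accessors this change introduces, assuming direct access to an ExtractorHTML instance:

import org.archive.modules.extractor.ExtractorHTML;

public class NofollowConfigSketch {
    public static void main(String[] args) {
        // Constructing the extractor directly here only to illustrate the accessors;
        // in a crawl it is normally defined as a Spring bean.
        ExtractorHTML extractor = new ExtractorHTML();
        System.out.println(extractor.getObeyRelNofollow()); // default is false
        extractor.setObeyRelNofollow(true); // <a> links carrying rel=nofollow are now skipped
    }
}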
@@ -306,6 +306,7 @@ http://example.example/example
<!-- <property name="maxElementLength" value="1024" /> -->
<!-- <property name="maxAttributeNameLength" value="1024" /> -->
<!-- <property name="maxAttributeValueLength" value="16384" /> -->
<!-- <property name="obeyRelNofollow" value="false" /> -->
</bean>
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
</bean>
@@ -281,6 +281,7 @@ http://example.example/example
// maxElementLength = 1024
// maxAttributeNameLength = 1024
// maxAttributeValueLength = 16384
// obeyRelNofollow = false
}
extractorCss(ExtractorCSS)
extractorJs(ExtractorJS)
@@ -329,6 +329,19 @@ public boolean getIgnoreUnexpectedHtml() {
public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
kp.put("ignoreUnexpectedHtml",ignoreUnexpectedHtml);
}

{
setObeyRelNofollow(false);
}
public boolean getObeyRelNofollow() {
return (Boolean) kp.get("obeyRelNofollow");
}
/**
 * If true, links containing the "rel=nofollow" directive will not be extracted.
*/
public void setObeyRelNofollow(boolean obeyRelNofollow) {
kp.put("obeyRelNofollow", obeyRelNofollow);
}

/**
* CrawlMetadata provides the robots honoring policy to use when
@@ -397,9 +410,10 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
CharSequence valueContext = null;
CharSequence nameVal = null;

// Just in case it's a LINK tag
// Just in case it's an A or LINK tag
CharSequence linkHref = null;
CharSequence linkRel = null;
CharSequence linkContext = null;

final boolean framesAsEmbeds =
getTreatFramesAsEmbedLinks();
@@ -409,7 +423,7 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,

final boolean extractValueAttributes =
getExtractValueAttributes();

final String elementStr = element.toString();

while (attr.find()) {
@@ -430,10 +444,13 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
} else {
context = elementContext(element, attr.group(2));
}


if (elementStr.equalsIgnoreCase(LINK)) {
// delay handling LINK until the end as we need both HREF and REL
if ((elementStr.equalsIgnoreCase(LINK) || elementStr.equalsIgnoreCase("a"))
&& linkHref == null) {
// delay handling A and LINK until the end as we need both HREF and REL
linkHref = value;
linkContext = context;
} else if ("a[data-remote='true']/@href".equals(context)) {
processEmbed(curi, value, context);
} else {
@@ -595,8 +612,19 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
}

// finish handling LINK now both HREF and REL should be available
if (linkHref != null && linkRel != null) {
processLinkTagWithRel(curi, linkHref, linkRel);
if (linkHref != null) {
if (elementStr.equalsIgnoreCase(LINK)) {
if (linkRel != null) {
processLinkTagWithRel(curi, linkHref, linkRel);
}
} else {
if (linkRel != null && getObeyRelNofollow()
&& TextUtils.matches("(?i).*\\bnofollow\\b.*", linkRel)) {
if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + linkHref);
} else {
processLink(curi, linkHref, linkContext);
}
}
}

// finish handling form action, now method is available
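
In the change above, the rel attribute is tested against the case-insensitive, word-boundary pattern (?i).*\bnofollow\b.* (via TextUtils.matches), so nofollow is recognised anywhere inside a space-separated rel value but not as a substring of a longer token. A small illustrative sketch (not part of the commit) using the JDK regex API directly to show which values match:

import java.util.regex.Pattern;

public class NofollowPatternSketch {
    public static void main(String[] args) {
        Pattern nofollow = Pattern.compile("(?i).*\\bnofollow\\b.*");
        System.out.println(nofollow.matcher("nofollow").matches());          // true
        System.out.println(nofollow.matcher("noopener NOFOLLOW").matches()); // true: case-insensitive, any position
        System.out.println(nofollow.matcher("nofollowed").matches());        // false: \b requires a whole token
    }
}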
@@ -142,6 +142,13 @@ protected void processGeneralTag(CrawlURI curi, Element element,
if (rel != null) {
processLinkTagWithRel(curi, attrValue, rel);
}
} else if ("a".equals(elementName)) {
String rel = attributes.getValue("rel");
if (rel != null && getObeyRelNofollow() && TextUtils.matches("(?i).*\\bnofollow\\b.*", rel)) {
if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + attrValue);
} else {
processLink(curi, attrValue, context);
}
} else {
// other HREFs treated as links
processLink(curi, attrValue, context);
@@ -19,11 +19,8 @@

package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
@@ -723,6 +720,31 @@ public void testLinkRel() throws URIException {
assertEquals(expectedLinks, actualLinks);
}

public void testDisobeyRelNofollow() throws URIException {
String html = "<a href=/normal><a href=/nofollow rel=nofollow><a href=/both><a href=/both rel=nofollow>";
CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));
getExtractor().setObeyRelNofollow(false);
getExtractor().extract(curi, html);
Set<String> links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet());
assertEquals(Set.of("https://www.example.org/both",
"https://www.example.org/normal",
"https://www.example.org/nofollow"), links);
}

public void testRelNofollow() throws URIException {
String html = "<a href=/normal></a><a href=/nofollow rel=nofollow></a><a href=/both></a>" +
"<a href=/both rel=nofollow></a>" +
"<a href=/multi1 rel='noopener nofollow'></a>" +
"<a href=/multi2 rel=\"nofollow nopener\"></a>" +
"<a href=/multi3 rel='noopener nofollow noentry'></a>";
CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));
getExtractor().setObeyRelNofollow(true);
getExtractor().extract(curi, html);
Set<String> links = curi.getOutLinks().stream().map(CrawlURI::getURI).collect(Collectors.toSet());
assertEquals(Set.of("https://www.example.org/both",
"https://www.example.org/normal"), links);
}

private void genericCrawl(CrawlURI curi, CharSequence cs,String[] dest){
getExtractor().extract(curi, cs);
