Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ExtractorHTML: Add obeyRelNofollow option #638

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ http://example.example/example
<!-- <property name="maxElementLength" value="1024" /> -->
<!-- <property name="maxAttributeNameLength" value="1024" /> -->
<!-- <property name="maxAttributeValueLength" value="16384" /> -->
<!-- <property name="obeyRelNofollow" value="false" /> -->
</bean>
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
</bean>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ http://example.example/example
// maxElementLength = 1024
// maxAttributeNameLength = 1024
// maxAttributeValueLength = 16384
// obeyRelNofollow = false
}
extractorCss(ExtractorCSS)
extractorJs(ExtractorJS)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,19 @@ public boolean getIgnoreUnexpectedHtml() {
public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
kp.put("ignoreUnexpectedHtml",ignoreUnexpectedHtml);
}

// Instance initializer: registers the default for the obeyRelNofollow option
// (false) through its setter.
{
setObeyRelNofollow(false);
}
/**
 * Reads the current obeyRelNofollow setting.
 *
 * @return the boolean stored under the "obeyRelNofollow" key
 */
public boolean getObeyRelNofollow() {
    final Boolean obey = (Boolean) kp.get("obeyRelNofollow");
    return obey;
}
/**
 * If true, links carrying the "rel=nofollow" directive will not be extracted.
 *
 * @param obeyRelNofollow value to store under the "obeyRelNofollow" key
 */
public void setObeyRelNofollow(boolean obeyRelNofollow) {
kp.put("obeyRelNofollow", obeyRelNofollow);
}

/**
* CrawlMetadata provides the robots honoring policy to use when
Expand Down Expand Up @@ -397,9 +410,10 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
CharSequence valueContext = null;
CharSequence nameVal = null;

// Just in case it's a LINK tag
// Just in case it's an A or LINK tag
CharSequence linkHref = null;
CharSequence linkRel = null;
CharSequence linkContext = null;

final boolean framesAsEmbeds =
getTreatFramesAsEmbedLinks();
Expand All @@ -409,7 +423,7 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,

final boolean extractValueAttributes =
getExtractValueAttributes();

final String elementStr = element.toString();

while (attr.find()) {
Expand All @@ -430,10 +444,13 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
} else {
context = elementContext(element, attr.group(2));
}


if (elementStr.equalsIgnoreCase(LINK)) {
// delay handling LINK until the end as we need both HREF and REL
if ((elementStr.equalsIgnoreCase(LINK) || elementStr.equalsIgnoreCase("a"))
&& linkHref == null) {
// delay handling A and LINK until the end as we need both HREF and REL
linkHref = value;
linkContext = context;
} else if ("a[data-remote='true']/@href".equals(context)) {
processEmbed(curi, value, context);
} else {
Expand Down Expand Up @@ -595,8 +612,19 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element,
}

// finish handling A and LINK now that both HREF and REL should be available
if (linkHref != null && linkRel != null) {
processLinkTagWithRel(curi, linkHref, linkRel);
if (linkHref != null) {
if (elementStr.equalsIgnoreCase(LINK)) {
if (linkRel != null) {
processLinkTagWithRel(curi, linkHref, linkRel);
}
} else {
if (linkRel != null && getObeyRelNofollow()
&& TextUtils.matches("(?i).*\\bnofollow\\b.*", linkRel)) {
if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + linkHref);
} else {
processLink(curi, linkHref, linkContext);
}
}
}

// finish handling form action, now method is available
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,13 @@ protected void processGeneralTag(CrawlURI curi, Element element,
if (rel != null) {
processLinkTagWithRel(curi, attrValue, rel);
}
} else if ("a".equals(elementName)) {
String rel = attributes.getValue("rel");
if (rel != null && getObeyRelNofollow() && TextUtils.matches("(?i).*\\bnofollow\\b.*", rel)) {
if (logger.isLoggable(Level.FINEST)) logger.finest("ignoring nofollow link: " + attrValue);
} else {
processLink(curi, attrValue, context);
}
} else {
// other HREFs treated as links
processLink(curi, attrValue, context);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@

package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
Expand Down Expand Up @@ -723,6 +720,31 @@ public void testLinkRel() throws URIException {
assertEquals(expectedLinks, actualLinks);
}

/**
 * With obeyRelNofollow switched off, every anchor href is expected among the
 * outlinks, whether or not the anchor carries rel=nofollow.
 */
public void testDisobeyRelNofollow() throws URIException {
    final String markup = "<a href=/normal><a href=/nofollow rel=nofollow><a href=/both><a href=/both rel=nofollow>";
    final CrawlURI page = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));

    getExtractor().setObeyRelNofollow(false);
    getExtractor().extract(page, markup);

    final Set<String> expected = Set.of(
            "https://www.example.org/both",
            "https://www.example.org/normal",
            "https://www.example.org/nofollow");
    final Set<String> extracted = page.getOutLinks()
            .stream()
            .map(outLink -> outLink.getURI())
            .collect(Collectors.toSet());
    assertEquals(expected, extracted);
}

/**
 * With obeyRelNofollow switched on, an href is expected among the outlinks
 * only if at least one anchor references it without a nofollow token in its
 * rel attribute (single- and multi-token rel values are covered).
 */
public void testRelNofollow() throws URIException {
    final String markup = "<a href=/normal></a><a href=/nofollow rel=nofollow></a><a href=/both></a>"
            + "<a href=/both rel=nofollow></a>"
            + "<a href=/multi1 rel='noopener nofollow'></a>"
            + "<a href=/multi2 rel=\"nofollow nopener\"></a>"
            + "<a href=/multi3 rel='noopener nofollow noentry'></a>";
    final CrawlURI page = new CrawlURI(UURIFactory.getInstance("https://www.example.org/"));

    getExtractor().setObeyRelNofollow(true);
    getExtractor().extract(page, markup);

    final Set<String> expected = Set.of(
            "https://www.example.org/both",
            "https://www.example.org/normal");
    final Set<String> extracted = page.getOutLinks()
            .stream()
            .map(outLink -> outLink.getURI())
            .collect(Collectors.toSet());
    assertEquals(expected, extracted);
}

private void genericCrawl(CrawlURI curi, CharSequence cs,String[] dest){
getExtractor().extract(curi, cs);

Expand Down
Loading