Skip to content

Commit

Permalink
Merge pull request #42 from commoncrawl/40-ignore-metadata-in-body
Browse files Browse the repository at this point in the history
WAT extractor: do not add <meta itemprop="..." > from body as metadata
  • Loading branch information
sebastian-nagel authored Dec 10, 2024
2 parents 1d94164 + b474f5d commit 48e46d6
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,24 @@ private static class MetaTagExtractor implements TagExtractor {
/**
 * Collects the "name", "rel", "content", "http-equiv" and "property"
 * attributes of the given node and records them as one metadata entry.
 * Nothing is recorded when no attribute list is returned or when the list
 * holds only a single key-value pair.
 */
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
    ArrayList<String> attrs = getAttrList(node,"name","rel","content","http-equiv","property");
    if (attrs == null) {
        // no attribute list available for this node
        return;
    }
    if (attrs.size() != 2) {
        // anything but a lone key-value pair is kept as metadata
        data.addMeta(attrs);
        return;
    }
    if (attrs.get(0).equals("content")) {
        /*
         * A lone "content" attribute very likely stems from a
         * <meta itemprop="..." content="..."> schema.org annotation
         * embedded in the HTML body and is dropped, see
         * https://github.com/commoncrawl/ia-web-commons/issues/40
         */
        return;
    }
    /*
     * Any other single key-value pair, e.g. <meta name="..."/> without
     * "content": either there is no value or something went wrong with
     * attribute parsing. It is dropped as well.
     */
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,24 @@ private void checkTitle(Resource resource, String title) {
}
}

/**
 * Asserts that one entry of the "Metas" array found under "Head" in the
 * resource's metadata consists of exactly the expected attributes.
 *
 * @param resource         the resource to inspect, must be an HTMLResource
 * @param metaElements     expected length of the "Metas" array; a negative
 *                         value skips this check
 * @param metaElementIndex index of the "Metas" entry to verify
 * @param attributes       expected content of the entry as alternating
 *                         key, value, key, value, ...
 * @throws JSONException if the expected JSON structure or a key is missing
 */
private void checkExtractedAttributes(Resource resource, int metaElements, int metaElementIndex,
        String... attributes) throws JSONException {
    // An odd number of arguments would otherwise only surface as an
    // ArrayIndexOutOfBoundsException when reading attributes[i + 1] below.
    assertEquals("Expected attributes must be passed as key-value pairs", 0, attributes.length % 2);
    assertNotNull(resource);
    assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
    JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
    assertNotNull(metas);
    if (metaElements > -1) {
        assertEquals(metaElements, metas.length());
    }
    // Report a missing element as a readable assertion failure rather than
    // as an exception thrown by the JSON lookup.
    assertTrue("No meta element at index " + metaElementIndex + ", number of meta elements: " + metas.length(),
            metaElementIndex >= 0 && metaElementIndex < metas.length());
    JSONObject meta = metas.getJSONObject(metaElementIndex);
    // the entry must not contain attributes beyond the expected ones
    assertEquals(attributes.length / 2, meta.length());
    for (int i = 0; i < attributes.length; i += 2) {
        String key = attributes[i];
        assertNotNull(meta.get(key));
        assertEquals("Wrong value of attribute " + key, attributes[i + 1], meta.get(key));
    }
}

private void checkLinks(Resource resource, String[][] expectedLinks) {
assertNotNull(resource);
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
Expand Down Expand Up @@ -241,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
}
}

/**
 * Asserts that the first entry of the "Metas" array found under "Head" in
 * the resource's metadata contains the given attributes.
 *
 * @param resource       the resource to inspect, must be an HTMLResource
 * @param langAttributes expected attributes as alternating key, value,
 *                       key, value, ...
 * @throws JSONException if the expected JSON structure or a key is missing
 */
private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
        throws JSONException {
    assertNotNull(resource);
    assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
    JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
    assertNotNull(metas);
    JSONObject meta = metas.getJSONObject(0);
    for (int i = 0; i < langAttributes.length; i += 2) {
        String key = langAttributes[i];
        assertNotNull(meta.get(key));
        // expected value first, actual value second, so that a failure
        // message reports the two the right way round
        assertEquals(langAttributes[i + 1], meta.get(key));
    }
}

public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
Expand Down Expand Up @@ -434,11 +438,21 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "en");
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "zh-CN");
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "cs-cz");
checkExtractedAttributes(extractor.getNext(), 2, 0, "name", "HTML@/lang", "content", "en");
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/xml:lang", "content", "es-MX");
}

/**
 * Extracts the first record of "meta-itemprop.warc" and verifies that
 * exactly two meta entries are reported: the HTML language attribute and
 * the "robots" meta element. The fixture also carries
 * &lt;meta itemprop="..." content="..."&gt; elements in its body, which
 * must not add further entries.
 */
public void testBodyMetaElements() throws ResourceParseException, IOException {
    String warcName = "meta-itemprop.warc";
    ResourceProducer warcProducer = ProducerUtils.getProducer(getClass().getResource(warcName).getPath());
    ResourceFactoryMapper factoryMapper = new ExtractingResourceFactoryMapper();
    ExtractingResourceProducer extractor = new ExtractingResourceProducer(warcProducer, factoryMapper);
    Resource firstRecord = extractor.getNext();
    // expected meta entries in document order, each as key, value, key, value
    String[][] expectedMetas = {
            { "name", "HTML@/lang", "content", "en" },
            { "name", "robots", "content", "index,follow" }
    };
    for (int idx = 0; idx < expectedMetas.length; idx++) {
        checkExtractedAttributes(firstRecord, expectedMetas.length, idx, expectedMetas[idx]);
    }
}

public void testHtmlParserEntityDecoding() {
Expand Down
35 changes: 35 additions & 0 deletions src/test/resources/org/archive/resource/html/meta-itemprop.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 710
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name=robots content="index,follow">
<title>Test</title>
</head>
<body>
<!-- from https://schema.org/docs/gs.html#advanced_missing -->
<div itemscope itemtype="https://schema.org/Offer">
<span itemprop="name">Blend-O-Matic</span>
<span itemprop="price">$19.95</span>
<div itemprop="reviews" itemscope itemtype="https://schema.org/AggregateRating">
<img src="four-stars.jpg" />
<meta itemprop="ratingValue" content="4" />
<meta itemprop="bestRating" content="5" />
Based on <span itemprop="ratingCount">25</span> user ratings
</div>
</div>
</body>
</html>



0 comments on commit 48e46d6

Please sign in to comment.