diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index b0b37f4a..c230440a 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -669,6 +669,24 @@ private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property"); if(l != null) { + if (l.size() == 2) { + if (l.get(0).equals("content")) { + /* + * drop single "content" attributes very likely stemming + * from schema.org + * annotations embedded in the HTML body, see + * https://github.com/commoncrawl/ia-web-commons/issues/40 + */ + return; + } else { + /* + * Single key-value metadata pair, e.g. only "name" (no "content") - no value or something + * went wrong with attribute parsing. + */ + return; + } + } data.addMeta(l); } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index d6e5e802..65b263c7 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -166,6 +166,24 @@ private void checkTitle(Resource resource, String title) { } } + private void checkExtractedAttributes(Resource resource, int metaElements, int metaElementIndex, + String... 
attributes) throws JSONException { + assertNotNull(resource); + assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); + JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas"); + assertNotNull(metas); + if (metaElements > -1) { + assertEquals(metaElements, metas.length()); + } + JSONObject meta = metas.getJSONObject(metaElementIndex); + assertEquals(attributes.length / 2, meta.length()); + for (int i = 0; i < attributes.length; i += 2) { + String key = attributes[i]; + assertNotNull(meta.get(key)); + assertEquals(attributes[i + 1], meta.get(key)); + } + } + private void checkLinks(Resource resource, String[][] expectedLinks) { assertNotNull(resource); assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); @@ -241,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) { } } - private void checkExtractHtmlLangAttribute(Resource resource, String... 
langAttributes) - throws JSONException { - assertNotNull(resource); - assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); - JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas"); - assertNotNull(metas); - JSONObject meta = metas.getJSONObject(0); - for (int i = 0; i < langAttributes.length; i += 2) { - String key = langAttributes[i]; - assertNotNull(meta.get(key)); - assertEquals(meta.get(key), langAttributes[i+1]); - } - } - public void testLinkExtraction() throws ResourceParseException, IOException { String testFileName = "link-extraction-test.warc"; ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); @@ -434,11 +438,21 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX"); + checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "en"); + checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "zh-CN"); + checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "cs-cz"); + checkExtractedAttributes(extractor.getNext(), 2, 0, "name", "HTML@/lang", "content", "en"); + 
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/xml:lang", "content", "es-MX"); + } + + public void testBodyMetaElements() throws ResourceParseException, IOException { + String testFileName = "meta-itemprop.warc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); + Resource resource = extractor.getNext(); + checkExtractedAttributes(resource, 2, 0, "name", "HTML@/lang", "content", "en"); + checkExtractedAttributes(resource, 2, 1, "name", "robots", "content", "index,follow"); } public void testHtmlParserEntityDecoding() { diff --git a/src/test/resources/org/archive/resource/html/meta-itemprop.warc b/src/test/resources/org/archive/resource/html/meta-itemprop.warc new file mode 100644 index 00000000..e0545b7f --- /dev/null +++ b/src/test/resources/org/archive/resource/html/meta-itemprop.warc @@ -0,0 +1,35 @@ +WARC/1.0 +WARC-Type: response +WARC-Date: 2024-12-05T10:47:02Z +Content-Length: 710 +Content-Type: application/http; msgtype=response +WARC-Target-URI: https://www.example.org/ +WARC-Identified-Payload-Type: text/html + +HTTP/1.1 200 +content-type: text/html; charset=UTF-8 + + + + + + + Test + + + +
+ Blend-O-Matic + $19.95 +
+ + + + Based on 25 user ratings +
+
+ + + + +