From febb13f761dad4b195c013fe6792a0452c676a70 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 5 Dec 2024 19:38:53 +0100 Subject: [PATCH 1/2] WAT extractor: do not add <meta> from body as metadata --- .../html/ExtractingParseObserver.java | 18 ++++++++++ .../html/ExtractingParseObserverTest.java | 22 ++++++++++++ .../archive/resource/html/meta-itemprop.warc | 35 +++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 src/test/resources/org/archive/resource/html/meta-itemprop.warc diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index b0b37f4a..c230440a 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -669,6 +669,24 @@ private static class MetaTagExtractor implements TagExtractor { public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) { ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property"); if(l != null) { + if (l.size() == 2) { + if (l.get(0).equals("content")) { + /* + * drop single "content" attributes very likely stemming + * from schema.org + * annotations embedded in the HTML body, see + * https://github.com/commoncrawl/ia-web-commons/issues/40 + */ + return; + } else { + /* + * Single key-value metadata pair, e.g. <meta name="robots"> (no "content") - no value or something + * went wrong with attribute parsing. 
+ */ + return; + } + } data.addMeta(l); } } diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index d6e5e802..a5aea5e1 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -166,6 +166,20 @@ private void checkTitle(Resource resource, String title) { } } + private void checkExtractedAttributes(Resource resource, String... attributes) throws JSONException { + assertNotNull(resource); + assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); + JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas"); + assertNotNull(metas); + JSONObject meta = metas.getJSONObject(0); + assertEquals(attributes.length / 2, meta.length()); + for (int i = 0; i < attributes.length; i += 2) { + String key = attributes[i]; + assertNotNull(meta.get(key)); + assertEquals(meta.get(key), attributes[i + 1]); + } + } + private void checkLinks(Resource resource, String[][] expectedLinks) { assertNotNull(resource); assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); @@ -441,6 +455,14 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX"); } + public void testBodyMetaElements() throws ResourceParseException, IOException { + String testFileName = "meta-itemprop.warc"; + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); + checkExtractedAttributes(extractor.getNext(), "name", "robots", "content", "index,follow"); + } + 
public void testHtmlParserEntityDecoding() { String[][] entities = { // /* ampersand */ diff --git a/src/test/resources/org/archive/resource/html/meta-itemprop.warc b/src/test/resources/org/archive/resource/html/meta-itemprop.warc new file mode 100644 index 00000000..e0545b7f --- /dev/null +++ b/src/test/resources/org/archive/resource/html/meta-itemprop.warc @@ -0,0 +1,35 @@ +WARC/1.0 +WARC-Type: response +WARC-Date: 2024-12-05T10:47:02Z +Content-Length: 710 +Content-Type: application/http; msgtype=response +WARC-Target-URI: https://www.example.org/ +WARC-Identified-Payload-Type: text/html + +HTTP/1.1 200 +content-type: text/html; charset=UTF-8 + + + + + + + Test + + + +
+ Blend-O-Matic + $19.95 +
+ + + + Based on 25 user ratings +
+
+ + + + + From b474f5d57bbd9ebbebadc80dc102a3d898a29f41 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 10 Dec 2024 09:58:57 +0100 Subject: [PATCH 2/2] WAT extractor: do not add <meta> from body as metadata - rebase to recent head / master - unit test: merge methods to verify any kind of metadata attributes --- .../html/ExtractingParseObserverTest.java | 38 ++++++++----------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index a5aea5e1..65b263c7 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -166,17 +166,21 @@ private void checkTitle(Resource resource, String title) { } } - private void checkExtractedAttributes(Resource resource, String... attributes) throws JSONException { + private void checkExtractedAttributes(Resource resource, int metaElements, int metaElementIndex, + String... attributes) throws JSONException { assertNotNull(resource); assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas"); assertNotNull(metas); - JSONObject meta = metas.getJSONObject(0); + if (metaElements > -1) { + assertEquals(metaElements, metas.length()); + } + JSONObject meta = metas.getJSONObject(metaElementIndex); assertEquals(attributes.length / 2, meta.length()); for (int i = 0; i < attributes.length; i += 2) { String key = attributes[i]; assertNotNull(meta.get(key)); - assertEquals(meta.get(key), attributes[i + 1]); + assertEquals(attributes[i + 1], meta.get(key)); } } @@ -255,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) { } } - private void checkExtractHtmlLangAttribute(Resource resource, String... 
langAttributes) - throws JSONException { - assertNotNull(resource); - assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); - JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas"); - assertNotNull(metas); - JSONObject meta = metas.getJSONObject(0); - for (int i = 0; i < langAttributes.length; i += 2) { - String key = langAttributes[i]; - assertNotNull(meta.get(key)); - assertEquals(meta.get(key), langAttributes[i+1]); - } - } - public void testLinkExtraction() throws ResourceParseException, IOException { String testFileName = "link-extraction-test.warc"; ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); @@ -448,11 +438,11 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en"); - checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX"); + checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "en"); + checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "zh-CN"); + checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "cs-cz"); + checkExtractedAttributes(extractor.getNext(), 2, 0, "name", "HTML@/lang", "content", "en"); + 
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/xml:lang", "content", "es-MX"); } public void testBodyMetaElements() throws ResourceParseException, IOException { @@ -460,7 +450,9 @@ public void testBodyMetaElements() throws ResourceParseException, IOException { ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); - checkExtractedAttributes(extractor.getNext(), "name", "robots", "content", "index,follow"); + Resource resource = extractor.getNext(); + checkExtractedAttributes(resource, 2, 0, "name", "HTML@/lang", "content", "en"); + checkExtractedAttributes(resource, 2, 1, "name", "robots", "content", "index,follow"); } public void testHtmlParserEntityDecoding() {