Merge pull request #37 from commoncrawl/ia-web-commons-36-title-embedded-svg

WAT extractor: do not extract page title from embedded SVG images, fixes #36
commoncrawl · Oct 18, 2024 · da324f9 · da324f9
2 parents ea6cafd + e36c876
commit da324f9
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 15 deletions.
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -24,8 +24,10 @@ public class ExtractingParseObserver implements ParseObserver {
 	Stack<StringBuilder> openAnchorTexts;
 	StringBuilder textExtract;
 	String title = null;
+	boolean inHead = false;
 	boolean inTitle = false;
 	boolean inPre = false;
+	boolean inSVG = false;
 
 	protected static String cssUrlPatString = 
 		"url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
@@ -59,7 +61,7 @@ public class ExtractingParseObserver implements ParseObserver {
 			"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
 			"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
 			"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
-			"textarea", "tfoot", "th", "thead", "tr", "ul", "video" };
+			"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
 	private static final Set<String> blockElements;
 	/* inline elements which content is not melted with surrounding words */
 	private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
@@ -144,11 +146,17 @@ public void handleTagEmpty(TagNode tag) {
 	@Override
 	public void handleTagOpen(TagNode tag) {
 		String name = tag.getTagName();
-		if(name.equals("TITLE")) {
+		if (name.equals("HEAD")) {
+			inHead = true;
+		} else if (name.equals("TITLE")) {
 			inTitle = !tag.isEmptyXmlTag();
 			return;
 		} else if (name.equals("PRE")) {
 			inPre = true;
+		} else if (name.equals("SVG")) {
+			inSVG = true;
+		} else if (name.equals("BODY")) {
+			inHead = false;
 		}
 
 		if (blockElements.contains(name)) {
@@ -183,9 +191,11 @@ public void handleTagOpen(TagNode tag) {
 	public void handleTagClose(TagNode tag) {
 		String name = tag.getTagName();
 
-		if(inTitle) {
+		if (inTitle) {
 			inTitle = false;
-			data.setTitle(title);
+			if (!inSVG && (inHead || !data.hasTitle())) {
+				data.setTitle(title);
+			}
 			title = null;
 		}
 
@@ -222,8 +232,12 @@ public void handleTagClose(TagNode tag) {
 					data.addHref(vals);
 				}
 			}
+		} else if (tag.getTagName().equals("HEAD")) {
+			inHead = false;
 		} else if (tag.getTagName().equals("PRE")) {
 			inPre = false;
+		} else if (tag.getTagName().equals("SVG")) {
+			inSVG = false;
 		}
 	}
 

diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java
@@ -31,9 +31,15 @@ private JSONObject getHeader() {
 	public void setBaseHref(String href) {
 		putUnlessNull(getHeader(),HTML_BASE, href);
 	}
+
 	public void setTitle(String title) {
 		putUnlessNull(getHeader(),HTML_TITLE, title);
 	}
+
+	/**
+	 * Tells whether a title is already stored in the header object.
+	 *
+	 * @return {@code true} if the header object exists and has an entry
+	 *         for {@code HTML_TITLE}, {@code false} otherwise
+	 */
+	public boolean hasTitle() {
+		if (header == null) {
+			return false;
+		}
+		return header.has(HTML_TITLE);
+	}
+
 	private void putUnlessNull(JSONObject o, String k, String v) {
 		if(o != null) {
 			try {
@@ -43,6 +49,7 @@ private void putUnlessNull(JSONObject o, String k, String v) {
 			}
 		}
 	}
+
 	public String[] LtoA(List<String> l) {
 		String[] a = new String[l.size()];
 		l.toArray(a);

diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -152,6 +152,19 @@ private void checkAnchor(Multimap<String,String> anchors, String url, String anc
 		assertTrue("Wrong anchor text " + anchor + " for " + url, anchors.get(url).contains(anchor));
 	}
 
+	/**
+	 * Asserts that the metadata extracted for {@code resource} holds the
+	 * expected HTML title.
+	 *
+	 * @param resource extracted resource, must be an {@link HTMLResource}
+	 * @param title    expected title, or {@code null} if no title must have
+	 *                 been extracted
+	 */
+	private void checkTitle(Resource resource, String title) {
+		assertNotNull(resource);
+		assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+		JSONObject head = resource.getMetaData().optJSONObject(ResourceConstants.HTML_HEAD);
+		if (title == null) {
+			// optJSONObject() returns null if there is no head object at all:
+			// this also counts as "no title" and must not raise a
+			// NullPointerException
+			assertTrue("Unexpected title found",
+					head == null || !head.has(ResourceConstants.HTML_TITLE));
+			return;
+		}
+		assertNotNull("No head metadata found", head);
+		assertTrue("No title found", head.has(ResourceConstants.HTML_TITLE));
+		assertEquals("Wrong title", title, head.get(ResourceConstants.HTML_TITLE));
+	}
+
 	private void checkLinks(Resource resource, String[][] expectedLinks) {
 		assertNotNull(resource);
 		assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
@@ -247,7 +260,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"http://www.example.com/shakespeare.html", "Q@/cite"},
 				{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
 		};
-		checkLinks(extractor.getNext(), html4links);
+		Resource resource = extractor.getNext();
+		checkTitle(resource, "Test XHTML Link Extraction");
+		checkLinks(resource, html4links);
 		String[][] html5links = {
 				{"http:///www.example.com/video.html", "LINK@/href", null, "canonical"},
 				{"video.rss", "LINK@/href", null, "alternate"},
@@ -256,18 +271,24 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
 				{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
 		};
-		checkLinks(extractor.getNext(), html5links);
+		resource = extractor.getNext();
+		checkTitle(resource, "Test HTML5 Video Tag");
+		checkLinks(resource, html5links);
 		String[][] html5links2 = {
 				{"http://www.example.com/", "A@/href"},
 		};
-		checkLinks(extractor.getNext(), html5links2);
+		resource = extractor.getNext();
+		checkTitle(resource, "Testing poor HTML5");
+		checkLinks(resource, html5links2);
 		String[][] fbVideoLinks = {
 				{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
 				{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
 				{"https://www.facebook.com/facebook/", "A@/href"},
 				{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
 		};
-		checkLinks(extractor.getNext(), fbVideoLinks);
+		resource = extractor.getNext();
+		checkTitle(resource, "fb-video - Embedded Videos - Social Plugins");
+		checkLinks(resource, fbVideoLinks);
 		String[][] dataHrefLinks = {
 				{"standard.css", "LINK@/href", null, "stylesheet"},
 				{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
@@ -293,7 +314,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"#", "A@/href"},
 				{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
 		};
-		checkLinks(extractor.getNext(), dataHrefLinks);
+		resource = extractor.getNext();
+		checkTitle(resource, null); // empty title!
+		checkLinks(resource, dataHrefLinks);
 		String[][] fbSocialLinks = {
 				{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
 				{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
@@ -305,7 +328,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"https://www.facebook.com/facebook", "A@/href"},
 				{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
 		};
-		checkLinks(extractor.getNext(), fbSocialLinks);
+		resource = extractor.getNext();
+		// fragment without head and no title
+		checkLinks(resource, fbSocialLinks);
 		String[][] onClickLinks = {
 				{"webpage.html", "DIV@/onclick"},
 				{"index.html", "INPUT@/onclick"},
@@ -315,7 +340,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"http://example.com/location/href/1.html", "INPUT@/onclick"},
 				{"http://example.com/location/href/2.html", "INPUT@/onclick"}
 		};
-		checkLinks(extractor.getNext(), onClickLinks);
+		resource = extractor.getNext();
+		checkTitle(resource, "Test Extraction of URLs from INPUT onClick Attributes");
+		checkLinks(resource, onClickLinks);
 		String[][] escapedEntitiesLinks = {
 				{"http://www.example.com/", "__base__"},
 				{"http://www.example.com/redirected.html", "__meta_refresh__"},
@@ -325,12 +352,11 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"https://img.example.org/view?id=867&res=10x16", "IMG@/src",
 					"image URL containing escaped ampersand (\"&amp;\")" }
 		};
-		Resource resource = extractor.getNext();
+		resource = extractor.getNext();
 		assertNotNull(resource);
+		checkTitle(resource, "Title – \"Title\" written using character entities");
 		checkLinks(resource, escapedEntitiesLinks);
 		MetaData md = resource.getMetaData();
-		assertEquals("Wrong title", "Title – \"Title\" written using character entities",
-				md.getJSONObject(ResourceConstants.HTML_HEAD).getString(ResourceConstants.HTML_TITLE));
 		JSONArray metas = md.getJSONObject(ResourceConstants.HTML_HEAD).getJSONArray(ResourceConstants.HTML_META_TAGS);
 		for (int i = 0; i < metas.length(); i++) {
 			JSONObject o = metas.optJSONObject(i);
@@ -344,7 +370,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				"Anchor text with white space character entities and HTML block elements" } };
 		resource = extractor.getNext();
 		assertNotNull(resource);
-		System.out.println(resource);
+		checkTitle(resource, "Test Anchor Text Extraction With Whitespace");
 		checkLinks(resource, exampleLinks);
 	}
 
@@ -357,6 +383,7 @@ public void testTextExtraction() throws ResourceParseException, IOException {
 		Resource resource = extractor.getNext();
 		assertNotNull(resource);
 		assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+		checkTitle(resource, "White space and paragraph breaks when converting HTML to text");
 		String text = resource.getMetaData().getString(ResourceConstants.HTML_TEXT);
 		System.out.println(text);
 		assertTrue(text.contains("text\nThere should be a paragraph break after <h1-h6>"));
@@ -377,6 +404,16 @@ public void testTextExtraction() throws ResourceParseException, IOException {
 		// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
 	}
 
+	/**
+	 * Verifies that the title of the HTML page is extracted from the
+	 * &lt;head&gt; element and is not replaced by the &lt;title&gt; of an
+	 * SVG image embedded in the page body. The test WARC file contains a
+	 * single HTML page with both titles.
+	 */
+	public void testTitleExtraction() throws ResourceParseException, IOException {
+		String warcPath = getClass().getResource("title-extraction-embedded-SVG.warc").getPath();
+		ExtractingResourceProducer extractor = new ExtractingResourceProducer(
+				ProducerUtils.getProducer(warcPath), new ExtractingResourceFactoryMapper());
+		checkTitle(extractor.getNext(), "Testing title extraction with embedded SVG");
+	}
+
 	public void testHtmlParserEntityDecoding() {
 		String[][] entities = { //
 				/* ampersand */

diff --git a/src/test/resources/org/archive/resource/html/title-extraction-embedded-SVG.warc b/src/test/resources/org/archive/resource/html/title-extraction-embedded-SVG.warc
@@ -0,0 +1,45 @@
+WARC/1.0
+WARC-Type: response
+WARC-Record-ID: <urn:uuid:9043ba74-5d11-4dad-97c1-d7454f8b7358>
+WARC-Target-URI: https://www.example.org/testEmbeddedSVG.html
+WARC-Date: 2024-10-14T10:05:41Z
+WARC-IP-Address: 127.0.0.1
+WARC-Block-Digest: sha1:XNN4JA3QDUN4DDEGTIPH5ZRORHYL657F
+WARC-Payload-Digest: sha1:4FUACFTG3WCL26OITZNMEPRKFP6WAAHN
+Content-Type: application/http;msgtype=response
+Content-Length: 856
+
+HTTP/1.1 200 OK
+Date: Mon, 14 Oct 2024 10:05:41 GMT
+Server: Apache/2.4.58 (Ubuntu)
+Upgrade: h2,h2c
+Connection: Upgrade, Keep-Alive
+Last-Modified: Mon, 14 Oct 2024 10:04:25 GMT
+ETag: "20a-6246cf6287f50"
+Accept-Ranges: bytes
+Content-Length: 522
+Vary: Accept-Encoding
+Keep-Alive: timeout=5, max=100
+Content-Type: text/html
+
+<!DOCTYPE html>
+<html>
+<head>
+<title>Testing title extraction with embedded SVG</title>
+<meta charset="utf-8">
+</head>
+<body>
+  <div>
+    <header>Testing title extraction with embedded SVG</header>
+    <p>This is body text...</p>
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400" fill="currentColor" width="1em">
+      <title>Embedded SVG</title>
+      <rect x="0" y="0" width="100%" height="100%" fill="lightblue"/>
+      <circle cx="100" cy="100" r="50" fill="red"/>
+    </svg>
+  </div>
+</body>
+</html>
+
+
+