Skip to content

Commit

Permalink
Merge pull request #37 from commoncrawl/ia-web-commons-36-title-embed…
Browse files Browse the repository at this point in the history
…ded-svg

WAT extractor: do not extract page title from embedded SVG images, fixes #36
  • Loading branch information
sebastian-nagel authored Oct 18, 2024
2 parents ea6cafd + e36c876 commit da324f9
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ public class ExtractingParseObserver implements ParseObserver {
Stack<StringBuilder> openAnchorTexts;
StringBuilder textExtract;
// Title text pending for the current document; handed to data.setTitle(...) in
// handleTagClose and then reset to null. (Where it is filled is outside this view.)
String title = null;
// True after an opening HEAD tag; cleared by an opening BODY tag or a closing HEAD tag.
boolean inHead = false;
// Set by an opening TITLE tag (false if the tag is an empty XML tag); cleared again
// when handleTagClose processes the next closing tag.
boolean inTitle = false;
// True between an opening and a closing PRE tag.
boolean inPre = false;
// True between an opening and a closing SVG tag; titles closed while this is set
// are not stored as the page title.
boolean inSVG = false;

protected static String cssUrlPatString =
"url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
Expand Down Expand Up @@ -59,7 +61,7 @@ public class ExtractingParseObserver implements ParseObserver {
"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
"textarea", "tfoot", "th", "thead", "tr", "ul", "video" };
"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
private static final Set<String> blockElements;
/* inline elements whose content is not merged with surrounding words */
private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
Expand Down Expand Up @@ -144,11 +146,17 @@ public void handleTagEmpty(TagNode tag) {
@Override
public void handleTagOpen(TagNode tag) {
String name = tag.getTagName();
if(name.equals("TITLE")) {
if (name.equals("HEAD")) {
inHead = true;
} else if (name.equals("TITLE")) {
inTitle = !tag.isEmptyXmlTag();
return;
} else if (name.equals("PRE")) {
inPre = true;
} else if (name.equals("SVG")) {
inSVG = true;
} else if (name.equals("BODY")) {
inHead = false;
}

if (blockElements.contains(name)) {
Expand Down Expand Up @@ -183,9 +191,11 @@ public void handleTagOpen(TagNode tag) {
public void handleTagClose(TagNode tag) {
String name = tag.getTagName();

if(inTitle) {
if (inTitle) {
inTitle = false;
data.setTitle(title);
if (!inSVG && (inHead || !data.hasTitle())) {
data.setTitle(title);
}
title = null;
}

Expand Down Expand Up @@ -222,8 +232,12 @@ public void handleTagClose(TagNode tag) {
data.addHref(vals);
}
}
} else if (tag.getTagName().equals("HEAD")) {
inHead = false;
} else if (tag.getTagName().equals("PRE")) {
inPre = false;
} else if (tag.getTagName().equals("SVG")) {
inSVG = false;
}
}

Expand Down
7 changes: 7 additions & 0 deletions src/main/java/org/archive/resource/html/HTMLMetaData.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,15 @@ private JSONObject getHeader() {
/**
 * Stores the given base href under the {@code HTML_BASE} key of the header
 * object, delegating to {@code putUnlessNull}.
 *
 * @param href base URL taken from the document
 */
public void setBaseHref(String href) {
	JSONObject head = getHeader();
	putUnlessNull(head, HTML_BASE, href);
}

/**
 * Stores the given title under the {@code HTML_TITLE} key of the header
 * object, delegating to {@code putUnlessNull}.
 *
 * @param title page title to record
 */
public void setTitle(String title) {
	JSONObject head = getHeader();
	putUnlessNull(head, HTML_TITLE, title);
}

/**
 * Tells whether a title has already been recorded in the header object.
 * Reads the {@code header} field directly rather than going through
 * {@code getHeader()}.
 *
 * @return {@code true} if the header object exists and contains an
 *         {@code HTML_TITLE} entry, {@code false} otherwise
 */
public boolean hasTitle() {
	if (header == null) {
		return false;
	}
	return header.has(HTML_TITLE);
}

private void putUnlessNull(JSONObject o, String k, String v) {
if(o != null) {
try {
Expand All @@ -43,6 +49,7 @@ private void putUnlessNull(JSONObject o, String k, String v) {
}
}
}

public String[] LtoA(List<String> l) {
String[] a = new String[l.size()];
l.toArray(a);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,19 @@ private void checkAnchor(Multimap<String,String> anchors, String url, String anc
assertTrue("Wrong anchor text " + anchor + " for " + url, anchors.get(url).contains(anchor));
}

/**
 * Asserts that the title extracted for a resource matches the expectation.
 *
 * @param resource the parsed resource; must be non-null and an {@link HTMLResource}
 * @param title the expected title, or {@code null} if no title must have
 *        been extracted (a resource without any head metadata also
 *        satisfies this)
 */
private void checkTitle(Resource resource, String title) {
	assertNotNull(resource);
	assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
	// optJSONObject yields null (instead of throwing) when there is no head object
	JSONObject head = resource.getMetaData().optJSONObject(ResourceConstants.HTML_HEAD);
	if (title == null) {
		// a missing head object means "no title" as well; do not dereference null
		assertTrue("Unexpected title found", head == null || !head.has(ResourceConstants.HTML_TITLE));
		return;
	}
	assertNotNull("No head metadata found", head);
	assertTrue("No title found", head.has(ResourceConstants.HTML_TITLE));
	assertEquals("Wrong title", title, head.get(ResourceConstants.HTML_TITLE));
}

private void checkLinks(Resource resource, String[][] expectedLinks) {
assertNotNull(resource);
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
Expand Down Expand Up @@ -247,7 +260,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://www.example.com/shakespeare.html", "Q@/cite"},
{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
};
checkLinks(extractor.getNext(), html4links);
Resource resource = extractor.getNext();
checkTitle(resource, "Test XHTML Link Extraction");
checkLinks(resource, html4links);
String[][] html5links = {
{"http:///www.example.com/video.html", "LINK@/href", null, "canonical"},
{"video.rss", "LINK@/href", null, "alternate"},
Expand All @@ -256,18 +271,24 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
};
checkLinks(extractor.getNext(), html5links);
resource = extractor.getNext();
checkTitle(resource, "Test HTML5 Video Tag");
checkLinks(resource, html5links);
String[][] html5links2 = {
{"http://www.example.com/", "A@/href"},
};
checkLinks(extractor.getNext(), html5links2);
resource = extractor.getNext();
checkTitle(resource, "Testing poor HTML5");
checkLinks(resource, html5links2);
String[][] fbVideoLinks = {
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
{"https://www.facebook.com/facebook/", "A@/href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbVideoLinks);
resource = extractor.getNext();
checkTitle(resource, "fb-video - Embedded Videos - Social Plugins");
checkLinks(resource, fbVideoLinks);
String[][] dataHrefLinks = {
{"standard.css", "LINK@/href", null, "stylesheet"},
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
Expand All @@ -293,7 +314,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"#", "A@/href"},
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
};
checkLinks(extractor.getNext(), dataHrefLinks);
resource = extractor.getNext();
checkTitle(resource, null); // empty title!
checkLinks(resource, dataHrefLinks);
String[][] fbSocialLinks = {
{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
Expand All @@ -305,7 +328,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"https://www.facebook.com/facebook", "A@/href"},
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
resource = extractor.getNext();
// fragment without head and no title
checkLinks(resource, fbSocialLinks);
String[][] onClickLinks = {
{"webpage.html", "DIV@/onclick"},
{"index.html", "INPUT@/onclick"},
Expand All @@ -315,7 +340,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"http://example.com/location/href/1.html", "INPUT@/onclick"},
{"http://example.com/location/href/2.html", "INPUT@/onclick"}
};
checkLinks(extractor.getNext(), onClickLinks);
resource = extractor.getNext();
checkTitle(resource, "Test Extraction of URLs from INPUT onClick Attributes");
checkLinks(resource, onClickLinks);
String[][] escapedEntitiesLinks = {
{"http://www.example.com/", "__base__"},
{"http://www.example.com/redirected.html", "__meta_refresh__"},
Expand All @@ -325,12 +352,11 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"https://img.example.org/view?id=867&res=10x16", "IMG@/src",
"image URL containing escaped ampersand (\"&amp;\")" }
};
Resource resource = extractor.getNext();
resource = extractor.getNext();
assertNotNull(resource);
checkTitle(resource, "Title – \"Title\" written using character entities");
checkLinks(resource, escapedEntitiesLinks);
MetaData md = resource.getMetaData();
assertEquals("Wrong title", "Title – \"Title\" written using character entities",
md.getJSONObject(ResourceConstants.HTML_HEAD).getString(ResourceConstants.HTML_TITLE));
JSONArray metas = md.getJSONObject(ResourceConstants.HTML_HEAD).getJSONArray(ResourceConstants.HTML_META_TAGS);
for (int i = 0; i < metas.length(); i++) {
JSONObject o = metas.optJSONObject(i);
Expand All @@ -344,7 +370,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
"Anchor text with white space character entities and HTML block elements" } };
resource = extractor.getNext();
assertNotNull(resource);
System.out.println(resource);
checkTitle(resource, "Test Anchor Text Extraction With Whitespace");
checkLinks(resource, exampleLinks);
}

Expand All @@ -357,6 +383,7 @@ public void testTextExtraction() throws ResourceParseException, IOException {
Resource resource = extractor.getNext();
assertNotNull(resource);
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
checkTitle(resource, "White space and paragraph breaks when converting HTML to text");
String text = resource.getMetaData().getString(ResourceConstants.HTML_TEXT);
System.out.println(text);
assertTrue(text.contains("text\nThere should be a paragraph break after <h1-h6>"));
Expand All @@ -377,6 +404,16 @@ public void testTextExtraction() throws ResourceParseException, IOException {
// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
}

/**
 * Verifies that the page title is taken from the HTML {@code <title>} in the
 * document head and not from the {@code <title>} of an SVG image embedded in
 * the body (see the fixture {@code title-extraction-embedded-SVG.warc}).
 */
public void testTitleExtraction() throws ResourceParseException, IOException {
	String testFileName = "title-extraction-embedded-SVG.warc";
	// fail with a clear message rather than a NullPointerException if the fixture is missing
	assertNotNull("Test resource not found: " + testFileName, getClass().getResource(testFileName));
	ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
	ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
	ExtractingResourceProducer extractor =
			new ExtractingResourceProducer(producer, mapper);
	Resource resource = extractor.getNext();
	checkTitle(resource, "Testing title extraction with embedded SVG");
}

public void testHtmlParserEntityDecoding() {
String[][] entities = { //
/* ampersand */
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
WARC/1.0
WARC-Type: response
WARC-Record-ID: <urn:uuid:9043ba74-5d11-4dad-97c1-d7454f8b7358>
WARC-Target-URI: https://www.example.org/testEmbeddedSVG.html
WARC-Date: 2024-10-14T10:05:41Z
WARC-IP-Address: 127.0.0.1
WARC-Block-Digest: sha1:XNN4JA3QDUN4DDEGTIPH5ZRORHYL657F
WARC-Payload-Digest: sha1:4FUACFTG3WCL26OITZNMEPRKFP6WAAHN
Content-Type: application/http;msgtype=response
Content-Length: 856

HTTP/1.1 200 OK
Date: Mon, 14 Oct 2024 10:05:41 GMT
Server: Apache/2.4.58 (Ubuntu)
Upgrade: h2,h2c
Connection: Upgrade, Keep-Alive
Last-Modified: Mon, 14 Oct 2024 10:04:25 GMT
ETag: "20a-6246cf6287f50"
Accept-Ranges: bytes
Content-Length: 522
Vary: Accept-Encoding
Keep-Alive: timeout=5, max=100
Content-Type: text/html

<!DOCTYPE html>
<html>
<head>
<title>Testing title extraction with embedded SVG</title>
<meta charset="utf-8">
</head>
<body>
<div>
<header>Testing title extraction with embedded SVG</header>
<p>This is body text...</p>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400" fill="currentColor" width="1em">
<title>Embedded SVG</title>
<rect x="0" y="0" width="100%" height="100%" fill="lightblue"/>
<circle cx="100" cy="100" r="50" fill="red"/>
</svg>
</div>
</body>
</html>



0 comments on commit da324f9

Please sign in to comment.