Skip to content

Commit

Permalink
Make MetaData multi-valued to preserve values of repeating WARC and H…
Browse files Browse the repository at this point in the history
…TTP headers
  • Loading branch information
sebastian-nagel committed Nov 9, 2024
1 parent da324f9 commit f310468
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 3 deletions.
67 changes: 64 additions & 3 deletions src/main/java/org/archive/resource/MetaData.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@
import com.github.openjson.JSONObject;
import com.github.openjson.JSONTokener;

/**
* A nested structure of {@linkplain JSONObject}s to hold the metadata of
* content in nested containers, e.g. an HTML page as payload of an HTTP response
* in a WARC record stored as gzip "member".
*
* MetaData is multi-valued: if a second value is added under the same "key"
* ("name"), both values are stored in a {@linkplain JSONArray} as value. This
* makes it possible to hold all values of repeating WARC or HTTP headers.
*/
public class MetaData extends JSONObject {

private static final Logger LOG =
Expand Down Expand Up @@ -67,6 +76,18 @@ public int getInt(String key) {
}
}

/**
 * Returns the value stored under {@code key} as an {@code int}, or
 * {@code defaultValue} if the key is absent or the parent's
 * {@code getInt} throws a {@link JSONException}. In the latter case the
 * exception message is logged at severe level.
 *
 * @param key name of the value to look up
 * @param defaultValue value returned when no int can be obtained
 * @return the int value, or {@code defaultValue}
 */
@Override
public int optInt(String key, int defaultValue) {
    if (!has(key)) {
        return defaultValue;
    }
    try {
        return super.getInt(key);
    } catch (JSONException conversionFailure) {
        LOG.severe(conversionFailure.getMessage());
        return defaultValue;
    }
}

@Override
public long getLong(String key) {
try {
Expand All @@ -77,6 +98,18 @@ public long getLong(String key) {
}
}

/**
 * Returns the value stored under {@code key} as a {@code long}, or
 * {@code defaultValue} if the key is absent or the parent's
 * {@code getLong} throws a {@link JSONException}. In the latter case the
 * exception message is logged at severe level.
 *
 * @param key name of the value to look up
 * @param defaultValue value returned when no long can be obtained
 * @return the long value, or {@code defaultValue}
 */
@Override
public long optLong(String key, long defaultValue) {
    if (!has(key)) {
        return defaultValue;
    }
    try {
        return super.getLong(key);
    } catch (JSONException conversionFailure) {
        LOG.severe(conversionFailure.getMessage());
        return defaultValue;
    }
}

@Override
public String getString(String key) {
try {
Expand All @@ -102,9 +135,37 @@ public void setTopMetaData(MetaData topMetaData) {
this.topMetaData = topMetaData;
}

/**
 * Adds a boolean value under {@code name}. The value is handed to
 * {@code accumulate} rather than replacing an existing entry, in line with
 * the multi-valued behavior of this class.
 *
 * @param name key to store the value under
 * @param value boolean value to add
 * @return the result of the underlying {@code accumulate} call
 * @throws JSONException if the underlying {@code accumulate} call fails
 */
@Override
public JSONObject put(String name, boolean value) throws JSONException {
    final Boolean boxed = Boolean.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Adds a double value under {@code name}. The value is handed to
 * {@code accumulate} rather than replacing an existing entry, in line with
 * the multi-valued behavior of this class.
 *
 * @param name key to store the value under
 * @param value double value to add
 * @return the result of the underlying {@code accumulate} call
 * @throws JSONException if the underlying {@code accumulate} call fails
 */
@Override
public JSONObject put(String name, double value) throws JSONException {
    final Double boxed = Double.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Adds an int value under {@code name}. The value is handed to
 * {@code accumulate} rather than replacing an existing entry, in line with
 * the multi-valued behavior of this class.
 *
 * @param name key to store the value under
 * @param value int value to add
 * @return the result of the underlying {@code accumulate} call
 * @throws JSONException if the underlying {@code accumulate} call fails
 */
@Override
public JSONObject put(String name, int value) throws JSONException {
    final Integer boxed = Integer.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Adds a long value under {@code name}. The value is handed to
 * {@code accumulate} rather than replacing an existing entry, in line with
 * the multi-valued behavior of this class.
 *
 * @param name key to store the value under
 * @param value long value to add
 * @return the result of the underlying {@code accumulate} call
 * @throws JSONException if the underlying {@code accumulate} call fails
 */
@Override
public JSONObject put(String name, long value) throws JSONException {
    final Long boxed = Long.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Maps {@code key} to {@code value} without overwriting an existing entry:
 * if the key is already present, the new value is accumulated alongside the
 * existing one; otherwise it is stored as a plain value.
 *
 * A {@code null} value is always passed to the parent's {@code put}, so that
 * the base-class handling of {@code null} (documented for JSONObject as
 * removing the mapping) applies instead of a null element being appended to
 * the accumulated values.
 *
 * @param key name to store the value under
 * @param value value to add, or {@code null}
 * @return the result of the underlying {@code accumulate} or {@code put} call
 */
@Override
public JSONObject put(String key, Object value) {
    // Only non-null values are accumulated; null keeps the inherited put() semantics.
    if (value != null && has(key)) {
        return super.accumulate(key, value);
    }
    return super.put(key, value);
}

public JSONObject putString(String key, String val) {
try {
return super.put(key,val);
return super.accumulate(key,val);
} catch(JSONException e) {
LOG.severe(e.getMessage());
return null;
Expand All @@ -113,7 +174,7 @@ public JSONObject putString(String key, String val) {

public JSONObject putLong(String key, long val) {
try {
return super.put(key,String.valueOf(val));
return super.accumulate(key,String.valueOf(val));
} catch(JSONException e) {
LOG.severe(e.getMessage());
return null;
Expand All @@ -122,7 +183,7 @@ public JSONObject putLong(String key, long val) {

public JSONObject putBoolean(String key, boolean val) {
try {
return super.put(key,val);
return super.accumulate(key,val);
} catch(JSONException e) {
LOG.severe(e.getMessage());
return null;
Expand Down
186 changes: 186 additions & 0 deletions src/test/java/org/archive/resource/MetaDataTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package org.archive.resource;

import java.io.IOException;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.format.json.JSONUtils;

import com.github.openjson.JSONArray;
import com.github.openjson.JSONObject;

import junit.framework.TestCase;

public class MetaDataTest extends TestCase {

/**
 * WARC files read by the header tests: index 0 is used by
 * testSingleHeaders, index 1 by testMultipleHeaders. The spelling
 * "mutliple" (sic) matches the name of the resource file.
 */
private static final String[] testFilePaths = {
    "src/test/resources/org/archive/format/warc/IAH-urls-wget.warc",
    "src/test/resources/org/archive/format/warc/mutliple-headers.warc"
};

/** Sample JSON object stored as a metadata value in the multi-value tests. */
private static final JSONObject obj = new JSONObject("{\"foo\":\"bar\",\"hello\":\"world\"}");

/**
 * Fills the given MetaData with values of every supported type. For each
 * type the key with suffix "-1" is written twice, all other keys once.
 *
 * @param m the MetaData object to fill
 * @return the same object passed in as {@code m}
 */
private MetaData putMetaData(MetaData m) {
    // booleans
    m.putBoolean("boolean-1", false);
    m.putBoolean("boolean-2", true);
    m.put("boolean-3", true);
    m.put("boolean-1", true); // second value for "boolean-1"

    // doubles
    m.put("double-1", 0.5d);
    m.put("double-2", 2.5d);
    m.put("double-3", 3.5d);
    m.put("double-1", 1.5d); // second value for "double-1"

    // ints
    m.put("int-1", 0);
    m.put("int-2", 2);
    m.put("int-3", 3);
    m.put("int-1", 1); // second value for "int-1"

    // longs: the base value is larger than Integer.MAX_VALUE, so these
    // JSON "numbers" cannot be represented as a Java int
    final long beyondInt = 0xffffffffL;
    m.putLong("long-1", beyondInt);
    m.putLong("long-2", beyondInt + 2L);
    m.put("long-3", beyondInt + 3L);
    m.put("long-1", beyondInt + 1L); // second value for "long-1"

    // strings
    m.putString("string-1", "0");
    m.putString("string-2", "2");
    m.put("string-3", "3");
    m.put("string-1", "1"); // second value for "string-1"

    // JSON objects
    m.putOpt("obj-1", obj);
    m.put("obj-1", obj); // second value for "obj-1"
    m.put("obj-2", obj);
    m.putOpt("obj-2", null); // expected to do nothing because the value is null

    return m;
}

/**
 * Asserts that the given MetaData holds the values written by
 * putMetaData: keys with suffix "-1" must hold a JSONArray with both
 * values, all other keys a single value of the expected type.
 *
 * @param m the MetaData object to verify
 */
private void verifyMultiValuedMetaData(MetaData m) {
    // boolean
    assertEquals(JSONArray.class, m.get("boolean-1").getClass());
    JSONArray booleans = (JSONArray) m.get("boolean-1");
    assertEquals(false, booleans.getBoolean(0));
    assertEquals(true, booleans.getBoolean(1));
    assertEquals(true, m.getBoolean("boolean-2"));
    assertEquals(true, m.getBoolean("boolean-3"));
    assertEquals(Boolean.class, m.get("boolean-3").getClass());
    assertEquals(true, m.optBoolean("boolean-3", false));
    assertEquals(false, m.optBoolean("boolean-99", false));

    // double
    assertEquals(JSONArray.class, m.get("double-1").getClass());
    JSONArray doubles = (JSONArray) m.get("double-1");
    assertEquals(0.5d, doubles.getDouble(0));
    assertEquals(1.5d, doubles.getDouble(1));
    assertEquals(2.5d, m.getDouble("double-2"));
    assertEquals(3.5d, m.getDouble("double-3"));
    assertEquals(Double.class, m.get("double-3").getClass());
    assertEquals(3.5d, m.optDouble("double-3"));
    assertEquals(99.5d, m.optDouble("double-99", 99.5d));

    // int
    assertEquals(JSONArray.class, m.get("int-1").getClass());
    JSONArray ints = (JSONArray) m.get("int-1");
    assertEquals(0, ints.getInt(0));
    assertEquals(1, ints.getInt(1));
    assertEquals(2, m.getInt("int-2"));
    assertEquals(3, m.getInt("int-3"));
    assertEquals(Integer.class, m.get("int-3").getClass());
    assertEquals(3, m.optInt("int-3"));
    assertEquals(99, m.optInt("int-99", 99));

    // long
    final long beyondInt = 0xffffffffL;
    assertEquals(JSONArray.class, m.get("long-1").getClass());
    JSONArray longs = (JSONArray) m.get("long-1");
    assertEquals(beyondInt + 0L, longs.getLong(0));
    assertEquals(beyondInt + 1L, longs.getLong(1));
    assertEquals(beyondInt + 2L, m.getLong("long-2"));
    assertEquals(beyondInt + 3L, m.getLong("long-3"));
    assertEquals(Long.class, m.get("long-3").getClass());
    assertEquals(beyondInt + 3L, m.optLong("long-3"));
    assertEquals(beyondInt + 99L, m.optLong("long-99", beyondInt + 99L));

    // String
    assertEquals(JSONArray.class, m.get("string-1").getClass());
    JSONArray strings = (JSONArray) m.get("string-1");
    assertEquals("0", strings.getString(0));
    assertEquals("1", strings.getString(1));
    assertEquals("2", m.getString("string-2"));
    assertEquals("3", m.getString("string-3"));
    assertEquals(String.class, m.get("string-3").getClass());
    assertEquals("3", m.optString("string-3"));
    assertEquals("99", m.optString("string-99", "99"));

    // Object
    assertEquals(JSONArray.class, m.get("obj-1").getClass());
    JSONArray objects = (JSONArray) m.get("obj-1");
    assertEquals(JSONObject.class, objects.get(0).getClass());
    assertEquals(JSONObject.class, objects.get(1).getClass());
    JSONObject firstObject = (JSONObject) objects.get(0);
    assertEquals("bar", firstObject.get("foo"));
    assertEquals("world", firstObject.get("hello"));
    JSONObject secondObject = (JSONObject) objects.get(1);
    assertEquals("bar", secondObject.get("foo"));
    assertEquals("world", secondObject.get("hello"));
    assertEquals(JSONObject.class, m.get("obj-2").getClass());
    JSONObject singleObject = (JSONObject) m.get("obj-2");
    assertEquals("bar", singleObject.get("foo"));
    assertEquals("world", singleObject.get("hello"));
}

/**
 * Fills a fresh MetaData object, verifies its content, then verifies the
 * content again after a round trip through its string serialization.
 */
public void testMultiValued() {
    MetaData original = putMetaData(new MetaData());
    verifyMultiValuedMetaData(original);

    // test (de)serialization: serialize with toString(2) and parse the result
    String serialized = original.toString(2);
    MetaData roundTripped = new MetaData(serialized);
    verifyMultiValuedMetaData(roundTripped);
}

/**
 * Reads resources from the given file until one is found whose
 * "Envelope.WARC-Header-Metadata" has a "WARC-Type" equal to "response".
 *
 * @param filePath path of the file to read
 * @return the top-level MetaData of the first matching resource, or
 *         {@code null} if the producer is exhausted without a match
 */
private MetaData readNextWARCResponseAsMetaData(String filePath) throws IOException, ResourceParseException {
    ResourceProducer source = ProducerUtils.getProducer(filePath);
    ResourceFactoryMapper factoryMapper = new ExtractingResourceFactoryMapper();
    ExtractingResourceProducer records = new ExtractingResourceProducer(source, factoryMapper);

    for (Resource record = records.getNext(); record != null; record = records.getNext()) {
        MetaData top = record.getMetaData().getTopMetaData();
        JSONObject warcHeaders = JSONUtils.extractObject(top, "Envelope.WARC-Header-Metadata");
        boolean isResponse = warcHeaders.has("WARC-Type")
                && "response".equals(warcHeaders.getString("WARC-Type"));
        if (isResponse) {
            return top;
        }
    }
    return null;
}

/**
 * Verify that in the legacy test file all WARC and HTTP headers are
 * single-valued, i.e. {@linkplain String}s.
 */
public void testSingleHeaders() throws IOException, ResourceParseException {
    MetaData m = readNextWARCResponseAsMetaData(testFilePaths[0]);
    // The helper returns null when no response record is found: fail with a
    // clear message instead of a NullPointerException further down.
    assertNotNull("no WARC response record found in " + testFilePaths[0], m);

    JSONObject warcHeaders = JSONUtils.extractObject(m, "Envelope.WARC-Header-Metadata");
    JSONObject httpHeaders = JSONUtils.extractObject(m, "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers");

    // The header name is part of the assertion message so that a failure
    // identifies the offending header.
    for (Object header : warcHeaders.keySet()) {
        assertEquals("WARC header " + header, String.class, warcHeaders.get(header.toString()).getClass());
    }

    for (Object header : httpHeaders.keySet()) {
        assertEquals("HTTP header " + header, String.class, httpHeaders.get(header.toString()).getClass());
    }
}

/**
 * Verify that repeating WARC and HTTP headers in the second test file are
 * held as {@linkplain JSONArray}s containing every value in order, while
 * headers occurring once remain plain strings.
 */
public void testMultipleHeaders() throws IOException, ResourceParseException {
    MetaData m = readNextWARCResponseAsMetaData(testFilePaths[1]);
    // The helper returns null when no response record is found: fail with a
    // clear message instead of a NullPointerException further down.
    assertNotNull("no WARC response record found in " + testFilePaths[1], m);

    JSONObject warcHeaders = JSONUtils.extractObject(m, "Envelope.WARC-Header-Metadata");
    JSONObject httpHeaders = JSONUtils.extractObject(m, "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers");

    assertEquals("https://www.example.com/index.html/", warcHeaders.getString("WARC-Target-URI"));
    assertEquals(JSONArray.class, warcHeaders.get("WARC-Protocol").getClass());
    JSONArray warcProtocols = (JSONArray) warcHeaders.get("WARC-Protocol");
    assertEquals(2, warcProtocols.length());
    assertEquals("h2", warcProtocols.get(0));
    // the test file repeats WARC-Protocol; check the second value as well
    assertEquals("tls/1.3", warcProtocols.get(1));

    assertEquals("108", httpHeaders.getString("Content-Length"));
    assertEquals(JSONArray.class, httpHeaders.get("x-powered-by").getClass());
    JSONArray poweredBy = (JSONArray) httpHeaders.get("x-powered-by");
    assertEquals(2, poweredBy.length());
    assertEquals("PHP/8.3.11", poweredBy.get(0));
    assertEquals("PleskLin", poweredBy.get(1));
}
}
47 changes: 47 additions & 0 deletions src/test/resources/org/archive/format/warc/mutliple-headers.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
WARC/1.0
WARC-Type: response
WARC-Date: 2024-09-27T10:47:02Z
WARC-Record-ID: <urn:uuid:7a10b628-4d3b-6f2e-8b73-c65d80646310>
Content-Length: 971
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:824d10d3-4f67-131a-9cbf-e40ecb5f0fa5>
WARC-Concurrent-To: <urn:uuid:51776b84-429e-53cb-a335-b53cf855c57a>
WARC-IP-Address: 172.67.184.105
WARC-Target-URI: https://www.example.com/index.html/
WARC-Protocol: h2
WARC-Protocol: tls/1.3
WARC-Cipher-Suite: TLS_AES_256_GCM_SHA384
WARC-Payload-Digest: sha1:70FB81039DCE25916E0E0CB48CF6662E3F27FFFC
WARC-Block-Digest: sha1:80573371A8271BE6B3AA26FD9DB72E9AD9F316D9
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
date: Fri, 27 Sep 2024 10:47:02 GMT
content-type: text/html; charset=UTF-8
x-powered-by: PHP/8.3.11
x-powered-by: PleskLin
x-pingback: https://www.example.com/xmlrpc.php
link: <https://www.example.com/wp-json/>; rel="https://api.w.org/"
link: <https://www.example.com/wp-json/wp/v2/posts/00000>; rel="alternate"; title="JSON"; type="application/json"
link: <https://www.example.com/?p=00000>; rel=shortlink
x-litespeed-cache: miss
vary: Accept-Encoding
x-turbo-charged-by: LiteSpeed
cf-cache-status: DYNAMIC
report-to: {"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=XXtestYY"}],"group":"cf-nel","max_age":604800}
nel: {"success_fraction":0,"report_to":"cf-nel","max_age":604800}
server: cloudflare
cf-ray: 8bf61e4afb9e7f9e-IAD
X-Crawler-content-encoding: br
alt-svc: h3=":443"; ma=86400
Content-Length: 108

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body/>
</html>

0 comments on commit f310468

Please sign in to comment.