From c7fafcdf975d84c7e450bf67ec10acc6edd013e3 Mon Sep 17 00:00:00 2001 From: Jannis Tsiroyannis Date: Fri, 13 Dec 2024 11:44:15 +0100 Subject: [PATCH 1/2] Add profile negotiation for EMM dump data. --- emm/src/main/java/whelk/Dump.java | 48 ++++++++++++++++--------- emm/src/main/java/whelk/EmmServlet.java | 5 ++- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/emm/src/main/java/whelk/Dump.java b/emm/src/main/java/whelk/Dump.java index f0c1e82f2e..9d6dc34a36 100644 --- a/emm/src/main/java/whelk/Dump.java +++ b/emm/src/main/java/whelk/Dump.java @@ -71,7 +71,7 @@ public class Dump { private static final int GZIP_BUF_SIZE = 64 * 1024; private static final String ND_JSON_LD_GZ_EXT = ".ndjsonld.gz"; - public static void sendDumpResponse(Whelk whelk, String apiBaseUrl, HttpServletRequest req, HttpServletResponse res) throws IOException, SQLException { + public static void sendDumpResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String apiBaseUrl, HttpServletRequest req, HttpServletResponse res) throws IOException { String selection = req.getParameter("selection"); if (selection == null) { @@ -86,6 +86,8 @@ public static void sendDumpResponse(Whelk whelk, String apiBaseUrl, HttpServletR return; } + String profile = req.getParameter("profile"); // May be null, meaning default (kbv) + String tmpDir = System.getProperty("java.io.tmpdir"); Path dumpsPath = Paths.get(tmpDir, "dumps"); Files.createDirectories(dumpsPath); @@ -97,10 +99,10 @@ public static void sendDumpResponse(Whelk whelk, String apiBaseUrl, HttpServletR } if (isDownload) { - sendDumpDownloadResponse(whelk, dumpFilePath, res); + sendDumpDownloadResponse(whelk, targetVocabMapper, profile, dumpFilePath, res); } else { long offsetNumeric = Long.parseLong(offset); - sendDumpPageResponse(whelk, apiBaseUrl, selection, dumpFilePath, offsetNumeric, res); + sendDumpPageResponse(whelk, targetVocabMapper, profile, apiBaseUrl, selection, dumpFilePath, offsetNumeric, res); } } @@ -149,7 +151,7 @@ private static void sendDumpIndexResponse(String apiBaseUrl, HttpServletResponse HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE); } - private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String dump, Path dumpFilePath, long offsetLines, HttpServletResponse res) throws IOException { + private static void sendDumpPageResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, String apiBaseUrl, String dump, Path dumpFilePath, long offsetLines, HttpServletResponse res) throws IOException { ArrayList recordIdsOnPage = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE); Long totalEntityCount = null; @@ -212,10 +214,10 @@ private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class); Instant dumpCreationTime = attributes.creationTime().toInstant(); - sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime); + sendFormattedResponse(whelk, targetVocabMapper, profile, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime); } - private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{ + private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, String apiBaseUrl, String dump, ArrayList recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{ var responseObject = new LinkedHashMap<>(); responseObject.put(JsonLd.CONTEXT_KEY, "https://www.w3.org/ns/activitystreams"); @@ -272,11 +274,11 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String itemOfPath.add("@graph"); itemOfPath.add(1); itemOfPath.add("itemOf"); // unggh.. doc._set(itemOfPath, instance.getThing(), doc.data); - items.add(wrapDoc(doc, contextDoc)); + items.add(formatDoc(doc, contextDoc, whelk, targetVocabMapper, profile)); } // For normal categories else { - items.add(wrapDoc(doc, contextDoc)); + items.add(formatDoc(doc, contextDoc, whelk, targetVocabMapper, profile)); } } @@ -284,7 +286,7 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE); } - private static void sendDumpDownloadResponse(Whelk whelk, Path dumpFilePath, HttpServletResponse res) { + private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Path dumpFilePath, HttpServletResponse res) { String filename = Unicode.stripSuffix(dumpFilePath.getFileName().toString(), ".dump") + ND_JSON_LD_GZ_EXT; res.setHeader("Content-Disposition", "attachment; filename=" + filename); res.setHeader("Content-Type", "application/octet-stream"); @@ -325,11 +327,11 @@ private static void sendDumpDownloadResponse(Whelk whelk, Path dumpFilePath, Htt batch.add(line.trim()); if (batch.size() >= batchSize) { - writeJsonLdLines(whelk, batch, contextDoc, os); + writeJsonLdLines(whelk, targetVocabMapper, profile, batch, contextDoc, os); batch = new ArrayList<>(batchSize); } } - writeJsonLdLines(whelk, batch, contextDoc, os); + writeJsonLdLines(whelk, targetVocabMapper, profile, batch, contextDoc, os); res.flushBuffer(); } } catch (Exception e) { @@ -337,14 +339,14 @@ private static void sendDumpDownloadResponse(Whelk whelk, Path dumpFilePath, Htt } } - private static void writeJsonLdLines(Whelk whelk, Collection ids, Document contextDoc, OutputStream os) throws IOException { + private static void writeJsonLdLines(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Collection ids, Document contextDoc, OutputStream os) throws IOException { Map idsAndRecords = whelk.bulkLoad(ids); for (Document doc : idsAndRecords.values()) { if (doc.getDeleted()) { continue; } - writeJsonLdLine(wrapDoc(doc, contextDoc), os); + writeJsonLdLine(formatDoc(doc, contextDoc, whelk, targetVocabMapper, profile), os); } os.flush(); } @@ -357,15 +359,29 @@ private static void writeJsonLdLine(Object object, OutputStream os) throws IOExc os.write("\n".getBytes(StandardCharsets.UTF_8)); } - private static Object wrapDoc(Document doc, Document contextDoc) { + private static Object formatDoc(Document doc, Document contextDoc, Whelk whelk, TargetVocabMapper targetVocabMapper, String profile) { var context = new ArrayList<>(); context.add(null); context.add(contextDoc.getRecordIdentifiers().getFirst()); - return Map.of( + + Document formattedDoc = doc; // Will be replaced if there's a profile + if (profile != null) { + Document profileDoc = whelk.getStorage().getDocumentByIri(profile); + if (profileDoc != null) { + formattedDoc = new Document((Map) targetVocabMapper.applyTargetVocabularyMap(profile, profileDoc.data, doc.data)); + } + else { + logger.info("Bad profile requested for EMM dump: {}", profile); + } + } + + Map data = Map.of( JsonLd.ID_KEY, doc.getRecordIdentifiers().getFirst(), JsonLd.CONTEXT_KEY, context, - JsonLd.GRAPH_KEY, doc.data.get(JsonLd.GRAPH_KEY) + JsonLd.GRAPH_KEY, formattedDoc.data.get(JsonLd.GRAPH_KEY) ); + + return data; } private static Object wrapContextDoc(Document contextDoc) { diff --git a/emm/src/main/java/whelk/EmmServlet.java b/emm/src/main/java/whelk/EmmServlet.java index 04751bc655..e96654cacc 100644 --- a/emm/src/main/java/whelk/EmmServlet.java +++ b/emm/src/main/java/whelk/EmmServlet.java @@ -13,11 +13,14 @@ public class EmmServlet extends HttpServlet { private final Logger logger = LogManager.getLogger(this.getClass()); private final Whelk whelk; + private final TargetVocabMapper targetVocabMapper; public static final String AS2_CONTENT_TYPE = "application/activity+json"; public EmmServlet() { whelk = Whelk.createLoadedCoreWhelk(); + Document contextDocument = whelk.getStorage().getDocumentByIri(whelk.getSystemContextUri()); + targetVocabMapper = new TargetVocabMapper(whelk.getJsonld(), contextDocument.data); } public void init() { @@ -31,7 +34,7 @@ public void doGet(HttpServletRequest req, HttpServletResponse res) { String apiBaseUrl = req.getRequestURL().toString(); if (req.getServletPath() != null && req.getServletPath().endsWith("/full")) { - Dump.sendDumpResponse(whelk, apiBaseUrl, req, res); + Dump.sendDumpResponse(whelk, targetVocabMapper, apiBaseUrl, req, res); return; } String until = req.getParameter("until"); From 6e382d455f185c23164c640a8b8f102c648f7618 Mon Sep 17 00:00:00 2001 From: Jannis Tsiroyannis Date: Fri, 20 Dec 2024 12:29:31 +0100 Subject: [PATCH 2/2] EMM: Don't load the profile document over and over, and use the correct context for profiles. --- emm/src/main/java/whelk/Dump.java | 60 ++++++++++++++++++------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/emm/src/main/java/whelk/Dump.java b/emm/src/main/java/whelk/Dump.java index 9d6dc34a36..a65c86553c 100644 --- a/emm/src/main/java/whelk/Dump.java +++ b/emm/src/main/java/whelk/Dump.java @@ -87,6 +87,14 @@ public static void sendDumpResponse(Whelk whelk, TargetVocabMapper targetVocabMa } String profile = req.getParameter("profile"); // May be null, meaning default (kbv) + Document profileDoc = null; + if (profile != null) { + profileDoc = whelk.getStorage().getDocumentByIri(profile); + if (profileDoc == null) { + logger.info("Bad profile requested for EMM dump: {}", profile); + profile = null; + } + } String tmpDir = System.getProperty("java.io.tmpdir"); Path dumpsPath = Paths.get(tmpDir, "dumps"); @@ -99,10 +107,10 @@ public static void sendDumpResponse(Whelk whelk, TargetVocabMapper targetVocabMa } if (isDownload) { - sendDumpDownloadResponse(whelk, targetVocabMapper, profile, dumpFilePath, res); + sendDumpDownloadResponse(whelk, targetVocabMapper, profile, profileDoc, dumpFilePath, res); } else { long offsetNumeric = Long.parseLong(offset); - sendDumpPageResponse(whelk, targetVocabMapper, profile, apiBaseUrl, selection, dumpFilePath, offsetNumeric, res); + sendDumpPageResponse(whelk, targetVocabMapper, profile, profileDoc, apiBaseUrl, selection, dumpFilePath, offsetNumeric, res); } } @@ -151,7 +159,7 @@ private static void sendDumpIndexResponse(String apiBaseUrl, HttpServletResponse HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE); } - private static void sendDumpPageResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, String apiBaseUrl, String dump, Path dumpFilePath, long offsetLines, HttpServletResponse res) throws IOException { + private static void sendDumpPageResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, String apiBaseUrl, String dump, Path dumpFilePath, long offsetLines, HttpServletResponse res) throws IOException { ArrayList recordIdsOnPage = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE); Long totalEntityCount = null; @@ -214,10 +222,10 @@ private static void sendDumpPageResponse(Whelk whelk, TargetVocabMapper targetVo BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class); Instant dumpCreationTime = attributes.creationTime().toInstant(); - sendFormattedResponse(whelk, targetVocabMapper, profile, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime); + sendFormattedResponse(whelk, targetVocabMapper, profile, profileDoc, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime); } - private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, String apiBaseUrl, String dump, ArrayList recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{ + private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, String apiBaseUrl, String dump, ArrayList recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{ var responseObject = new LinkedHashMap<>(); responseObject.put(JsonLd.CONTEXT_KEY, "https://www.w3.org/ns/activitystreams"); @@ -244,7 +252,12 @@ private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetV var items = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE); responseObject.put("items", items); - var contextDoc = contextDoc(whelk); + Document contextDoc = null; + if (profileDoc != null) + contextDoc = profileDoc; + else { + contextDoc = contextDoc(whelk); + } if (offset == 0) { items.add(wrapContextDoc(contextDoc)); } @@ -274,11 +287,11 @@ private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetV itemOfPath.add("@graph"); itemOfPath.add(1); itemOfPath.add("itemOf"); // unggh.. doc._set(itemOfPath, instance.getThing(), doc.data); - items.add(formatDoc(doc, contextDoc, whelk, targetVocabMapper, profile)); + items.add(formatDoc(doc, contextDoc, targetVocabMapper, profile, profileDoc)); } // For normal categories else { - items.add(formatDoc(doc, contextDoc, whelk, targetVocabMapper, profile)); + items.add(formatDoc(doc, contextDoc, targetVocabMapper, profile, profileDoc)); } } @@ -286,7 +299,7 @@ private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetV HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE); } - private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Path dumpFilePath, HttpServletResponse res) { + private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, Path dumpFilePath, HttpServletResponse res) { String filename = Unicode.stripSuffix(dumpFilePath.getFileName().toString(), ".dump") + ND_JSON_LD_GZ_EXT; res.setHeader("Content-Disposition", "attachment; filename=" + filename); res.setHeader("Content-Type", "application/octet-stream"); @@ -294,8 +307,13 @@ private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targ int batchSize = EmmChangeSet.TARGET_HITS_PER_PAGE; try (GZIPOutputStream os = new GZIPOutputStream(new BufferedOutputStream(res.getOutputStream()), GZIP_BUF_SIZE)) { res.flushBuffer(); - - var contextDoc = contextDoc(whelk); + + Document contextDoc = null; + if (profileDoc != null) + contextDoc = profileDoc; + else { + contextDoc = contextDoc(whelk); + } writeJsonLdLine(wrapContextDoc(contextDoc), os); // Has the dump not begun being written yet ? @@ -327,11 +345,11 @@ private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targ batch.add(line.trim()); if (batch.size() >= batchSize) { - writeJsonLdLines(whelk, targetVocabMapper, profile, batch, contextDoc, os); + writeJsonLdLines(whelk, targetVocabMapper, profile, profileDoc, batch, contextDoc, os); batch = new ArrayList<>(batchSize); } } - writeJsonLdLines(whelk, targetVocabMapper, profile, batch, contextDoc, os); + writeJsonLdLines(whelk, targetVocabMapper, profile, profileDoc, batch, contextDoc, os); res.flushBuffer(); } } catch (Exception e) { @@ -339,14 +357,14 @@ private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targ } } - private static void writeJsonLdLines(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Collection ids, Document contextDoc, OutputStream os) throws IOException { + private static void writeJsonLdLines(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, Collection ids, Document contextDoc, OutputStream os) throws IOException { Map idsAndRecords = whelk.bulkLoad(ids); for (Document doc : idsAndRecords.values()) { if (doc.getDeleted()) { continue; } - writeJsonLdLine(formatDoc(doc, contextDoc, whelk, targetVocabMapper, profile), os); + writeJsonLdLine(formatDoc(doc, contextDoc, targetVocabMapper, profile, profileDoc), os); } os.flush(); } @@ -359,20 +377,14 @@ private static void writeJsonLdLine(Object object, OutputStream os) throws IOExc os.write("\n".getBytes(StandardCharsets.UTF_8)); } - private static Object formatDoc(Document doc, Document contextDoc, Whelk whelk, TargetVocabMapper targetVocabMapper, String profile) { + private static Object formatDoc(Document doc, Document contextDoc, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc) { var context = new ArrayList<>(); context.add(null); context.add(contextDoc.getRecordIdentifiers().getFirst()); Document formattedDoc = doc; // Will be replaced if there's a profile - if (profile != null) { - Document profileDoc = whelk.getStorage().getDocumentByIri(profile); - if (profileDoc != null) { - formattedDoc = new Document((Map) targetVocabMapper.applyTargetVocabularyMap(profile, profileDoc.data, doc.data)); - } - else { - logger.info("Bad profile requested for EMM dump: {}", profile); - } + if (profile != null && profileDoc != null) { + formattedDoc = new Document((Map) targetVocabMapper.applyTargetVocabularyMap(profile, profileDoc.data, doc.data)); } Map data = Map.of(