From b1c398108657414178002b992e8ad647d37a9d66 Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Sun, 14 Aug 2022 22:47:19 -0400 Subject: [PATCH 1/3] Remove high-memory method in AnswerValue and use ID allocation batching to fix dataset upload performance --- .../gusdb/wdk/model/answer/AnswerValue.java | 88 ++------ .../single/SingleRecordAnswerValue.java | 28 ++- .../wdk/model/dataset/DatasetContents.java | 7 + .../wdk/model/dataset/DatasetFactory.java | 38 +++- .../model/dataset/DatasetFileContents.java | 22 ++ .../model/dataset/DatasetListContents.java | 7 + .../model/dataset/DatasetStringContents.java | 6 + .../gusdb/wdk/model/user/BasketFactory.java | 5 - .../DatasetRequestProcessor.java | 192 +++++++----------- .../service/service/user/DatasetService.java | 4 +- 10 files changed, 190 insertions(+), 207 deletions(-) rename Service/src/main/java/org/gusdb/wdk/service/request/user/{ => dataset}/DatasetRequestProcessor.java (70%) diff --git a/Model/src/main/java/org/gusdb/wdk/model/answer/AnswerValue.java b/Model/src/main/java/org/gusdb/wdk/model/answer/AnswerValue.java index 92c697bdad..b7a066c5ae 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/answer/AnswerValue.java +++ b/Model/src/main/java/org/gusdb/wdk/model/answer/AnswerValue.java @@ -51,6 +51,9 @@ import org.gusdb.wdk.model.query.spec.QueryInstanceSpec; import org.gusdb.wdk.model.question.Question; import org.gusdb.wdk.model.record.Field; +import org.gusdb.wdk.model.record.PrimaryKeyDefinition; +import org.gusdb.wdk.model.record.PrimaryKeyIterator; +import org.gusdb.wdk.model.record.ResultSetPrimaryKeyIterator; import org.gusdb.wdk.model.record.RecordClass; import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.record.TableField; @@ -886,33 +889,6 @@ public void setSortingMap(Map sortingMap) { _sortedIdSql = null; } - /** - * This method is redundant with getAllIds(), consider deprecate either one of them. - * - * @return returns a list of all primary key values. - */ - public Object[][] getPrimaryKeyValues() throws WdkModelException { - String[] columns = _answerSpec.getQuestion().getRecordClass().getPrimaryKeyDefinition().getColumnRefs(); - List buffer = new ArrayList<>(); - - Optional legacyFilter = _answerSpec.getLegacyFilter(); - try (ResultList resultList = - legacyFilter.isPresent() ? - legacyFilter.get().getResults(this) : - _idsQueryInstance.getResults()) { - while (resultList.next()) { - Object[] pkValues = new String[columns.length]; - for (int columnIndex = 0; columnIndex < columns.length; columnIndex++) { - pkValues[columnIndex] = resultList.get(columns[columnIndex]); - } - buffer.add(pkValues); - } - Object[][] ids = new String[buffer.size()][columns.length]; - buffer.toArray(ids); - return ids; - } - } - private void reset() { _sortedIdSql = null; _checksum = null; @@ -920,36 +896,24 @@ private void reset() { } /** - * Get a list of all the primary key tuples of all the records in the answer. It is a shortcut of iterating - * through all the pages and get the primary keys. + * Creates a closable iterator of IDs for this answer + * + * NOTE! caller must close the return value to avoid resource leaks. * - * This method is redundant with getPrimaryKeyValues(), consider deprecate either one of them. + * @return an iterator of all the primary key tuples of all the records in the answer + * @throws WdkModelException if unable to execute ID query */ - public List getAllIds() throws WdkModelException { - String idSql = getSortedIdSql(); - String[] pkColumns = _answerSpec.getQuestion().getRecordClass().getPrimaryKeyDefinition().getColumnRefs(); - List pkValues = new ArrayList<>(); - WdkModel wdkModel = _answerSpec.getQuestion().getWdkModel(); - DataSource dataSource = wdkModel.getAppDb().getDataSource(); - ResultSet resultSet = null; + public PrimaryKeyIterator getAllIds() throws WdkModelException { try { - resultSet = SqlUtils.executeQuery(dataSource, idSql, _idsQueryInstance.getQuery().getFullName() + "__all-ids"); - while (resultSet.next()) { - String[] values = new String[pkColumns.length]; - for (int i = 0; i < pkColumns.length; i++) { - Object value = resultSet.getObject(pkColumns[i]); - values[i] = (value == null) ? null : value.toString(); - } - pkValues.add(values); - } - } - catch (SQLException ex) { - throw new WdkModelException(ex); + PrimaryKeyDefinition pkDef = _answerSpec.getQuestion().getRecordClass().getPrimaryKeyDefinition(); + DataSource dataSource = _wdkModel.getAppDb().getDataSource(); + String idSql = getSortedIdSql(); + String queryDescriptor = _idsQueryInstance.getQuery().getFullName() + "__all-ids"; + return new ResultSetPrimaryKeyIterator(pkDef, SqlUtils.executeQuery(dataSource, idSql, queryDescriptor)); } - finally { - SqlUtils.closeResultSetAndStatement(resultSet, null); + catch (SQLException e) { + throw new WdkModelException("Unable to execute ID query", e); } - return pkValues; } public void setPageIndex(int startIndex, int endIndex) { @@ -977,26 +941,6 @@ public JSONObject getFilterSummaryJson(String filterName) throws WdkUserExceptio } } - /** - * Returns one big string containing all IDs in this answer value's result in - * the following format: each '\n'-delimited line contains one record, whose - * primary keys are joined and delimited by a comma. - * - * @return list of all record IDs - */ - public String getAllIdsAsString() throws WdkModelException { - List pkValues = getAllIds(); - StringBuilder buffer = new StringBuilder(); - for (String[] pkValue : pkValues) { - if (buffer.length() > 0) buffer.append("\n"); - for (int i = 0; i < pkValue.length; i++) { - if (i > 0) buffer.append(", "); - buffer.append(pkValue[i]); - } - } - return buffer.toString(); - } - private final static String ID_QUERY_HANDLE = "pidq"; private final static String QUERY_HANDLE = "inq"; diff --git a/Model/src/main/java/org/gusdb/wdk/model/answer/single/SingleRecordAnswerValue.java b/Model/src/main/java/org/gusdb/wdk/model/answer/single/SingleRecordAnswerValue.java index 630c64abab..1316915b20 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/answer/single/SingleRecordAnswerValue.java +++ b/Model/src/main/java/org/gusdb/wdk/model/answer/single/SingleRecordAnswerValue.java @@ -4,12 +4,11 @@ import static org.gusdb.fgputil.functional.Functions.mapToList; import java.util.Collections; -import java.util.List; import java.util.Map; +import java.util.NoSuchElementException; import org.gusdb.fgputil.EncryptionUtil; import org.gusdb.fgputil.FormatUtil; -import org.gusdb.fgputil.ListBuilder; import org.gusdb.fgputil.MapBuilder; import org.gusdb.fgputil.db.platform.DBPlatform; import org.gusdb.fgputil.validation.ValidObjectFactory.RunnableObj; @@ -20,6 +19,7 @@ import org.gusdb.wdk.model.answer.ResultSizeFactory; import org.gusdb.wdk.model.answer.spec.AnswerSpec; import org.gusdb.wdk.model.record.DynamicRecordInstance; +import org.gusdb.wdk.model.record.PrimaryKeyIterator; import org.gusdb.wdk.model.record.RecordClass; import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.user.User; @@ -98,7 +98,7 @@ public String getChecksum() throws WdkModelException { } @Override - public List getAllIds() throws WdkModelException { + public PrimaryKeyIterator getAllIds() throws WdkModelException { String[] pkArray = new String[_pkMap.size()]; String[] pkColNames = _recordClass.getPrimaryKeyDefinition().getColumnRefs(); if (pkArray.length != pkColNames.length) @@ -106,7 +106,27 @@ public List getAllIds() throws WdkModelException { for (int i = 0; i < pkColNames.length; i++) { pkArray[i] = (String)_pkMap.get(pkColNames[i]); } - return new ListBuilder().add(pkArray).toList(); + return new PrimaryKeyIterator() { + + private boolean valueReturned = false; + + @Override + public boolean hasNext() { + return !valueReturned; + } + + @Override + public String[] next() { + if (valueReturned) throw new NoSuchElementException(); + valueReturned = true; + return pkArray; + } + + @Override + public void close() { + // nothing to do here + } + }; } @Override diff --git a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetContents.java b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetContents.java index b18dfb7705..898d04032c 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetContents.java +++ b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetContents.java @@ -7,8 +7,13 @@ import java.io.Reader; public abstract class DatasetContents { + private static final int BUF_SIZE = 8192; + // constants used to estimate number of records + public static final int ESTIMATED_CHARS_PER_ID = 10; + public static final int ESTIMATED_BYTES_PER_ID = 15; + protected final String fileName; protected DatasetContents(String fileName) { @@ -19,6 +24,8 @@ public String getUploadFileName() { return fileName; } + public abstract long getEstimatedRowCount(); + @SuppressWarnings("ThrowFromFinallyBlock") public String truncate(final int len) throws WdkModelException { Reader reader = null; diff --git a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFactory.java b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFactory.java index f3ad8fc393..2203f837c7 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFactory.java +++ b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFactory.java @@ -13,7 +13,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Date; +import java.util.LinkedList; import java.util.List; +import java.util.Queue; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -296,7 +298,7 @@ private Dataset createOrGetDataset( ); insertDatasetValues(connection, datasetId, parser.iterator(content), - parser.datasetContentWidth(content)); + parser.datasetContentWidth(content), content.getEstimatedRowCount()); connection.commit(); // create and insert user dataset. @@ -533,24 +535,33 @@ private void insertDatasetValues( final Connection connection, final long datasetId, final DatasetIterator data, - final int length + final int numDataColumns, + final long estimatedRowCount ) throws SQLException, WdkModelException, WdkUserException { - String sql = buildDatasetValuesInsertQuery(length); + String sql = buildDatasetValuesInsertQuery(numDataColumns); LOG.info("Built the following insert SQL: " + sql); + int idAllocationBatchSize = calculateIdAllocationBatchSize(estimatedRowCount); + Queue datasetValueIdQueue = new LinkedList<>(); try (PreparedStatement psInsert = connection.prepareStatement(sql)) { long batchRow = 0; long rowOrderNumber = 1; while (data.hasNext()) { String[] value = data.next(); - // get a new value id. - long datasetValueId = _userDb.getPlatform() - .getNextId(_userDb.getDataSource(), _userSchema, TABLE_DATASET_VALUES); + // get a new value id + if (datasetValueIdQueue.isEmpty()) { + datasetValueIdQueue.addAll( + _userDb.getPlatform().getNextNIds( + _userDb.getDataSource(), + _userSchema, + TABLE_DATASET_VALUES, + idAllocationBatchSize)); + } - psInsert.setLong(1, datasetValueId); + psInsert.setLong(1, datasetValueIdQueue.poll()); psInsert.setLong(2, datasetId); psInsert.setLong(3, rowOrderNumber); - for (int j = 0; j < length; j++) { + for (int j = 0; j < numDataColumns; j++) { psInsert.setString(j + 4, value[j]); } psInsert.addBatch(); @@ -567,6 +578,17 @@ private void insertDatasetValues( } } + private int calculateIdAllocationBatchSize(long estimatedRowCount) { + // (0,10] = 1 + if (estimatedRowCount <= 10) return 1; + // (10,100] = 10 + if (estimatedRowCount <= 100) return 10; + // (100,1000] = 25 + if (estimatedRowCount <= 1000) return 25; + // (1000,Inf) = 250 + return 250; + } + private void validateValue(final String[] row) throws WdkUserException { // check the number of columns if (row.length > MAX_VALUE_COLUMNS) { diff --git a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFileContents.java b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFileContents.java index 964e4ea003..694addb5eb 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFileContents.java +++ b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetFileContents.java @@ -33,14 +33,28 @@ public class DatasetFileContents extends DatasetContents { */ private String checksum; + /** + * Number of records expected in this dataset file + * (some files are written knowing how many records are contained within) + */ + private final Long numRecords; + public DatasetFileContents( final String fileName, final File contents ) { + this(fileName, contents, null); + } + + public DatasetFileContents( + final String fileName, + final File contents, + final Long numRecords) { super(fileName); LOG.info("Created new DatasetFileContents object pointing at file: " + contents.getAbsolutePath()); this.contents = contents; this.owned = false; + this.numRecords = numRecords; } DatasetFileContents( @@ -63,6 +77,7 @@ public DatasetFileContents( tmp.deleteOnExit(); this.owned = true; this.contents = tmp; + this.numRecords = null; } /** @@ -126,4 +141,11 @@ private static String genChecksum(final File file) { throw new WdkRuntimeException(e); } } + + @Override + public long getEstimatedRowCount() { + return numRecords != null + ? numRecords + : (contents.length() / ESTIMATED_BYTES_PER_ID) + 1; // round up + } } diff --git a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java index ab4557cf5d..93cc79b4b9 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java +++ b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java @@ -7,6 +7,7 @@ import java.util.List; public class DatasetListContents extends DatasetContents { + private final List idList; private String checksum; @@ -15,6 +16,7 @@ public DatasetListContents(final List idList) { this.idList = idList; } + @Override public String getChecksum() { if (checksum != null) @@ -113,4 +115,9 @@ private void inc() { done = true; } } + + @Override + public long getEstimatedRowCount() { + return idList.size(); + } } diff --git a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetStringContents.java b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetStringContents.java index c6de110e4d..39e8c61fa4 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetStringContents.java +++ b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetStringContents.java @@ -6,6 +6,7 @@ import org.gusdb.fgputil.EncryptionUtil; public class DatasetStringContents extends DatasetContents { + private final String contents; public DatasetStringContents(final String fileName, final String contents) { @@ -22,4 +23,9 @@ public String getChecksum() { public Reader getContentReader() { return new StringReader(contents); } + + @Override + public long getEstimatedRowCount() { + return (contents.length() / ESTIMATED_CHARS_PER_ID) + 1; // round up + } } diff --git a/Model/src/main/java/org/gusdb/wdk/model/user/BasketFactory.java b/Model/src/main/java/org/gusdb/wdk/model/user/BasketFactory.java index edb2305b9d..ed88db99a0 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/user/BasketFactory.java +++ b/Model/src/main/java/org/gusdb/wdk/model/user/BasketFactory.java @@ -85,11 +85,6 @@ public void addEntireResultToBasket(User user, RunnableObj spec) thr } } - public void removeEntireResultFromBasket(User user, RunnableObj spec) throws WdkModelException { - List pkValues = AnswerValueFactory.makeAnswer(user, spec).getAllIds(); - removeFromBasket(user, spec.get().getQuestion().getRecordClass(), pkValues); - } - public void addPksToBasket(User user, RecordClass recordClass, Collection recordsToAdd) throws WdkModelException { addToBasket(user, recordClass, recordsToAdd.size(), new PrimaryKeyRecordStream(user, recordClass, recordsToAdd)); } diff --git a/Service/src/main/java/org/gusdb/wdk/service/request/user/DatasetRequestProcessor.java b/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequestProcessor.java similarity index 70% rename from Service/src/main/java/org/gusdb/wdk/service/request/user/DatasetRequestProcessor.java rename to Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequestProcessor.java index f2f90b6e28..4f6e04c803 100644 --- a/Service/src/main/java/org/gusdb/wdk/service/request/user/DatasetRequestProcessor.java +++ b/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequestProcessor.java @@ -1,29 +1,27 @@ -package org.gusdb.wdk.service.request.user; +package org.gusdb.wdk.service.request.user.dataset; -import static org.gusdb.fgputil.FormatUtil.join; import static org.gusdb.fgputil.json.JsonIterators.arrayStream; import java.io.BufferedWriter; +import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; +import java.util.Iterator; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.ws.rs.core.MediaType; import org.apache.log4j.Logger; -import org.gusdb.fgputil.FormatUtil; import org.gusdb.fgputil.client.ClientUtil; -import org.gusdb.fgputil.functional.Functions; +import org.gusdb.fgputil.iterator.IteratorUtil; import org.gusdb.fgputil.json.JsonType; import org.gusdb.fgputil.json.JsonType.ValueType; -import org.gusdb.fgputil.json.JsonUtil; import org.gusdb.fgputil.validation.ValidObjectFactory.RunnableObj; import org.gusdb.fgputil.validation.ValidationLevel; import org.gusdb.fgputil.web.SessionProxy; @@ -45,89 +43,22 @@ import org.gusdb.wdk.model.query.param.Param; import org.gusdb.wdk.model.query.spec.ParameterContainerInstanceSpecBuilder.FillStrategy; import org.gusdb.wdk.model.question.Question; +import org.gusdb.wdk.model.record.PrimaryKeyIterator; import org.gusdb.wdk.model.record.PrimaryKeyValue; +import org.gusdb.wdk.model.record.RecordClass; import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.user.BasketFactory; import org.gusdb.wdk.model.user.Strategy; import org.gusdb.wdk.model.user.User; import org.gusdb.wdk.service.request.exception.DataValidationException; import org.gusdb.wdk.service.request.exception.RequestMisformatException; import org.gusdb.wdk.service.service.TemporaryFileService; import org.json.JSONArray; -import org.json.JSONObject; public class DatasetRequestProcessor { private static Logger LOG = Logger.getLogger(DatasetRequestProcessor.class); - public enum DatasetSourceType { - - ID_LIST("idList", "ids", ValueType.ARRAY), - BASKET("basket", "basketName", ValueType.STRING), - FILE("file", "temporaryFileId", ValueType.STRING), - STRATEGY("strategy", JsonKeys.STRATEGY_ID, ValueType.NUMBER), - URL("url", "url", ValueType.STRING); - - private final String _typeIndicator; - private final String _configJsonKey; - private final ValueType _configValueType; - - DatasetSourceType(String typeIndicator, String configJsonKey, ValueType configValueType) { - _typeIndicator = typeIndicator; - _configJsonKey = configJsonKey; - _configValueType = configValueType; - } - - public String getTypeIndicator() { - return _typeIndicator; - } - - public String getConfigJsonKey() { - return _configJsonKey; - } - - public ValueType getConfigType() { - return _configValueType; - } - - public static DatasetSourceType getFromTypeIndicator(String typeIndicator) throws RequestMisformatException { - return Arrays.stream(values()) - .filter(val -> val._typeIndicator.equals(typeIndicator)) - .findFirst() - .orElseThrow(() -> new RequestMisformatException( - "Invalid source type. Only [" + FormatUtil.join(values(), ", ") + "] allowed.")); - } - } - - public static class DatasetRequest { - - private final DatasetSourceType _sourceType; - private final JsonType _configValue; - private final Optional _displayName; - private final Map _additionalConfig; - - public DatasetRequest(JSONObject input) throws RequestMisformatException { - _sourceType = DatasetSourceType.getFromTypeIndicator(input.getString(JsonKeys.SOURCE_TYPE)); - JSONObject sourceContent = input.getJSONObject(JsonKeys.SOURCE_CONTENT); - _configValue = new JsonType(sourceContent.get(_sourceType.getConfigJsonKey())); - if (!_configValue.getType().equals(_sourceType.getConfigType())) { - throw new RequestMisformatException("Value of '" + - _sourceType.getConfigJsonKey() + "' must be a " + _sourceType.getConfigType()); - } - _additionalConfig = Functions.getMapFromKeys( - JsonUtil.getKeys(sourceContent).stream() - .filter(key -> !key.equals(_sourceType.getConfigJsonKey())) - .collect(Collectors.toSet()), - key -> new JsonType(sourceContent.get(key))); - _displayName = Optional.ofNullable(JsonUtil.getStringOrDefault(input, JsonKeys.DISPLAY_NAME, null)); - } - - public DatasetSourceType getSourceType() { return _sourceType; } - public JsonType getConfigValue() { return _configValue; } - public Optional getDisplayName() { return _displayName; } - public Map getAdditionalConfig() { return _additionalConfig; } - - } - public static Dataset createFromRequest( DatasetRequest request, User user, @@ -175,52 +106,72 @@ private static Dataset createFromBasket( final User user, final DatasetFactory factory ) throws WdkModelException, DataValidationException { - var recordClass = factory.getWdkModel() + + RecordClass recordClass = factory.getWdkModel() .getRecordClassByUrlSegment(recordClassName) .orElseThrow(() -> new DataValidationException( "No record class exists with name '" + recordClassName + "'.")); - var basketFactory = factory.getWdkModel().getBasketFactory(); - var wasEmpty = true; + BasketFactory basketFactory = factory.getWdkModel().getBasketFactory(); - try { - var file = Files.createTempFile("dataset-", - "-" + user.getStableId() + "-" + recordClassName).toFile(); + long basketSize = basketFactory.getBasketCounts(user).get(recordClass); + if (basketSize == 0) + throw new DataValidationException("Basket '" + recordClassName + "' does " + + "not contain any records. No dataset can be made."); - file.deleteOnExit(); + // write basket records to file (just to parse again :() + File file = null; + try (Stream basketStream = basketFactory.getBasket(user, recordClass)) { - try ( - var write = new BufferedWriter(new FileWriter(file)); - var stream = basketFactory.getBasket(user, recordClass) - ) { - var it = stream + Iterator recordIterator = basketStream .map(RecordInstance::getPrimaryKey) .map(PrimaryKeyValue::getValues) .map(Map::values) .map(c -> c.toArray(new String[0])) - .map(a -> join(a, ListDatasetParser.DATASET_COLUMN_DIVIDER)) .iterator(); - if (it.hasNext()) { - wasEmpty = false; - while (it.hasNext()) { - write.write(it.next()); - write.write('\n'); - } + file = createTempFile(user, recordClassName); - write.flush(); - } + writeRecordsToFile(file, recordIterator); + + return createDataset(user, new ListDatasetParser(), + new DatasetFileContents(null, file, basketSize), factory); + } + catch (IOException e) { + throw new WdkModelException("Could not create dataset from basket", e); + } + finally { + deleteFile(file); + } + } + + private static void deleteFile(File file) { + if (file != null) { + try { + Files.delete(file.toPath()); } + catch (IOException e) { + LOG.warn("Unable to delete file after use: " + file.getAbsolutePath()); + } + } + } - if (wasEmpty) - throw new DataValidationException("Basket '" + recordClassName + "' does " - + "not contain any records. No dataset can be made."); + private static File createTempFile(User user, String recordClassName) throws IOException { + return Files.createTempFile(user.getWdkModel().getModelConfig().getWdkTempDir(), + "dataset-", "-" + user.getStableId() + "-" + recordClassName).toFile(); + } - return createDataset(user, new ListDatasetParser(), - new DatasetFileContents(null, file), factory); + private static void writeRecordsToFile(File file, Iterator rows) throws IOException { - } catch (IOException e) { - throw new WdkModelException(e); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) { + + for (String[] rowArray : IteratorUtil.toIterable(rows)) { + String row = String.join(ListDatasetParser.DATASET_COLUMN_DIVIDER, rowArray); + writer.write(row); + writer.write('\n'); + } + + writer.flush(); } } @@ -240,20 +191,29 @@ private static Dataset createFromStrategy( AnswerValue answerValue = AnswerValueFactory.makeAnswer( Strategy.getRunnableStep(strategy, strategy.get().getRootStepId()).get()); - List ids = answerValue.getAllIds(); - if (ids.isEmpty()) - throw new DataValidationException("Strategy '" + strategyId + "' does not" - + " contain any records. No dataset can be made."); + long resultSize = answerValue.getResultSizeFactory().getResultSize(); + if (resultSize == 0) + throw new DataValidationException("Strategy '" + strategyId + "' does " + + "not contain any records. No dataset can be made."); - return createDataset(user, new ListDatasetParser(), - new DatasetListContents(joinIds(ids)), factory); - } + // write records to file (just to parse again :() + File file = null; + try (PrimaryKeyIterator pkIterator = answerValue.getAllIds()) { + + file = createTempFile(user, strategy.get().getRecordClass().get().getUrlSegment()); - private static List joinIds(List ids) { - return ids.stream() - .map(idArray -> join(idArray, ListDatasetParser.DATASET_COLUMN_DIVIDER)) - .collect(Collectors.toList()); + writeRecordsToFile(file, pkIterator); + + return createDataset(user, new ListDatasetParser(), + new DatasetFileContents(null, file, resultSize), factory); + } + catch (Exception e) { + throw new WdkModelException("Could not create dataset from basket", e); + } + finally { + deleteFile(file); + } } private static Dataset createDataset( diff --git a/Service/src/main/java/org/gusdb/wdk/service/service/user/DatasetService.java b/Service/src/main/java/org/gusdb/wdk/service/service/user/DatasetService.java index 7228e9ee02..ca4f7b1346 100644 --- a/Service/src/main/java/org/gusdb/wdk/service/service/user/DatasetService.java +++ b/Service/src/main/java/org/gusdb/wdk/service/service/user/DatasetService.java @@ -25,8 +25,8 @@ import org.gusdb.wdk.service.annotation.OutSchema; import org.gusdb.wdk.service.request.exception.DataValidationException; import org.gusdb.wdk.service.request.exception.RequestMisformatException; -import org.gusdb.wdk.service.request.user.DatasetRequestProcessor; -import org.gusdb.wdk.service.request.user.DatasetRequestProcessor.DatasetRequest; +import org.gusdb.wdk.service.request.user.dataset.DatasetRequestProcessor; +import org.gusdb.wdk.service.request.user.dataset.DatasetRequest; import org.json.JSONException; import org.json.JSONObject; From ed269c65e287ec91a94f87d692038bc739184109 Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Sun, 14 Aug 2022 22:47:44 -0400 Subject: [PATCH 2/3] Remove high-memory method in AnswerValue and use ID allocation batching to fix dataset upload performance #2 --- .../wdk/model/record/PrimaryKeyIterator.java | 9 ++++ .../record/ResultSetPrimaryKeyIterator.java | 34 +++++++++++++ .../request/user/dataset/DatasetRequest.java | 42 +++++++++++++++ .../user/dataset/DatasetSourceType.java | 51 +++++++++++++++++++ 4 files changed, 136 insertions(+) create mode 100644 Model/src/main/java/org/gusdb/wdk/model/record/PrimaryKeyIterator.java create mode 100644 Model/src/main/java/org/gusdb/wdk/model/record/ResultSetPrimaryKeyIterator.java create mode 100644 Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequest.java create mode 100644 Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetSourceType.java diff --git a/Model/src/main/java/org/gusdb/wdk/model/record/PrimaryKeyIterator.java b/Model/src/main/java/org/gusdb/wdk/model/record/PrimaryKeyIterator.java new file mode 100644 index 0000000000..a6c82c0029 --- /dev/null +++ b/Model/src/main/java/org/gusdb/wdk/model/record/PrimaryKeyIterator.java @@ -0,0 +1,9 @@ +package org.gusdb.wdk.model.record; + +import java.util.Iterator; + +public interface PrimaryKeyIterator extends Iterator, AutoCloseable { + + // no additional methods + +} diff --git a/Model/src/main/java/org/gusdb/wdk/model/record/ResultSetPrimaryKeyIterator.java b/Model/src/main/java/org/gusdb/wdk/model/record/ResultSetPrimaryKeyIterator.java new file mode 100644 index 0000000000..796660b5d5 --- /dev/null +++ b/Model/src/main/java/org/gusdb/wdk/model/record/ResultSetPrimaryKeyIterator.java @@ -0,0 +1,34 @@ +package org.gusdb.wdk.model.record; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Optional; + +import org.gusdb.fgputil.db.stream.ResultSetIterator; + +public class ResultSetPrimaryKeyIterator extends ResultSetIterator implements PrimaryKeyIterator { + + private static class PrimaryKeyRowConverter implements RowConverter { + + private final String[] _pkColumns; + + public PrimaryKeyRowConverter(PrimaryKeyDefinition pkDef) { + _pkColumns = pkDef.getColumnRefs(); + } + + @Override + public Optional convert(ResultSet rs) throws SQLException { + String[] values = new String[_pkColumns.length]; + for (int i = 0; i < _pkColumns.length; i++) { + Object value = rs.getObject(_pkColumns[i]); + values[i] = (value == null) ? null : value.toString(); + } + return Optional.of(values); + } + } + + public ResultSetPrimaryKeyIterator(PrimaryKeyDefinition pkDef, ResultSet rs) { + super(rs, new PrimaryKeyRowConverter(pkDef)); + } + +} diff --git a/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequest.java b/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequest.java new file mode 100644 index 0000000000..e4939c185a --- /dev/null +++ b/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetRequest.java @@ -0,0 +1,42 @@ +package org.gusdb.wdk.service.request.user.dataset; + +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.gusdb.fgputil.functional.Functions; +import org.gusdb.fgputil.json.JsonType; +import org.gusdb.fgputil.json.JsonUtil; +import org.gusdb.wdk.core.api.JsonKeys; +import org.gusdb.wdk.service.request.exception.RequestMisformatException; +import org.json.JSONObject; + +public class DatasetRequest { + + private final DatasetSourceType _sourceType; + private final JsonType _configValue; + private final Optional _displayName; + private final Map _additionalConfig; + + public DatasetRequest(JSONObject input) throws RequestMisformatException { + _sourceType = DatasetSourceType.getFromTypeIndicator(input.getString(JsonKeys.SOURCE_TYPE)); + JSONObject sourceContent = input.getJSONObject(JsonKeys.SOURCE_CONTENT); + _configValue = new JsonType(sourceContent.get(_sourceType.getConfigJsonKey())); + if (!_configValue.getType().equals(_sourceType.getConfigType())) { + throw new RequestMisformatException("Value of '" + + _sourceType.getConfigJsonKey() + "' must be a " + _sourceType.getConfigType()); + } + _additionalConfig = Functions.getMapFromKeys( + JsonUtil.getKeys(sourceContent).stream() + .filter(key -> !key.equals(_sourceType.getConfigJsonKey())) + .collect(Collectors.toSet()), + key -> new JsonType(sourceContent.get(key))); + _displayName = Optional.ofNullable(JsonUtil.getStringOrDefault(input, JsonKeys.DISPLAY_NAME, null)); + } + + public DatasetSourceType getSourceType() { return _sourceType; } + public JsonType getConfigValue() { return _configValue; } + public Optional getDisplayName() { return _displayName; } + public Map getAdditionalConfig() { return _additionalConfig; } + +} \ No newline at end of file diff --git a/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetSourceType.java b/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetSourceType.java new file mode 100644 index 0000000000..770fc2e3bc --- /dev/null +++ b/Service/src/main/java/org/gusdb/wdk/service/request/user/dataset/DatasetSourceType.java @@ -0,0 +1,51 @@ +package org.gusdb.wdk.service.request.user.dataset; + +import java.util.Arrays; + +import org.gusdb.fgputil.FormatUtil; +import org.gusdb.fgputil.json.JsonType.ValueType; +import org.gusdb.wdk.core.api.JsonKeys; +import org.gusdb.wdk.service.request.exception.RequestMisformatException; + +/** + * Contains the possible ways a user can submit a dataset (for use as a dataset param), + * along with how to parse the config JSON for the submission. + */ +public enum DatasetSourceType { + + ID_LIST("idList", "ids", ValueType.ARRAY), + BASKET("basket", "basketName", ValueType.STRING), + FILE("file", "temporaryFileId", ValueType.STRING), + STRATEGY("strategy", JsonKeys.STRATEGY_ID, ValueType.NUMBER), + URL("url", "url", ValueType.STRING); + + private final String _typeIndicator; + private final String _configJsonKey; + private final ValueType _configValueType; + + DatasetSourceType(String typeIndicator, String configJsonKey, ValueType configValueType) { + _typeIndicator = typeIndicator; + _configJsonKey = configJsonKey; + _configValueType = configValueType; + } + + public String getTypeIndicator() { + return _typeIndicator; + } + + public String getConfigJsonKey() { + return _configJsonKey; + } + + public ValueType getConfigType() { + return _configValueType; + } + + public static DatasetSourceType getFromTypeIndicator(String typeIndicator) throws RequestMisformatException { + return Arrays.stream(values()) + .filter(val -> val._typeIndicator.equals(typeIndicator)) + .findFirst() + .orElseThrow(() -> new RequestMisformatException( + "Invalid source type. Only [" + FormatUtil.join(values(), ", ") + "] allowed.")); + } +} From 7968d3a664989879980d66495ce29fe1986edd78 Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Mon, 15 Aug 2022 11:46:25 -0400 Subject: [PATCH 3/3] Fix whitespace --- .../java/org/gusdb/wdk/model/dataset/DatasetListContents.java | 1 - 1 file changed, 1 deletion(-) diff --git a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java index 93cc79b4b9..f51b9d6b63 100644 --- a/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java +++ b/Model/src/main/java/org/gusdb/wdk/model/dataset/DatasetListContents.java @@ -16,7 +16,6 @@ public DatasetListContents(final List idList) { this.idList = idList; } - @Override public String getChecksum() { if (checksum != null)