From 49bfd1273282a6b5a35229244570985bdf432336 Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Wed, 22 Jan 2025 07:52:10 -0800 Subject: [PATCH 1/3] chore: Bump base images (#6575) --- docker/registry/cpp-clients-multi-base/gradle.properties | 2 +- docker/registry/go/gradle.properties | 2 +- docker/registry/manylinux2014_x86_64/gradle.properties | 2 +- docker/registry/minio/gradle.properties | 2 +- docker/registry/protoc-base/gradle.properties | 2 +- docker/registry/python/gradle.properties | 2 +- docker/registry/selenium/gradle.properties | 2 +- docker/registry/server-base/gradle.properties | 2 +- docker/registry/slim-base/gradle.properties | 2 +- docker/server-jetty/src/main/server-jetty/requirements.txt | 2 +- docker/server/src/main/server-netty/requirements.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docker/registry/cpp-clients-multi-base/gradle.properties b/docker/registry/cpp-clients-multi-base/gradle.properties index 2f3feddc403..fe9877b0b33 100644 --- a/docker/registry/cpp-clients-multi-base/gradle.properties +++ b/docker/registry/cpp-clients-multi-base/gradle.properties @@ -1,5 +1,5 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=ghcr.io/deephaven/cpp-clients-multi-base:latest # When the image version changes, remember to update cpp-client/build-dependencies.sh to match the image's content -deephaven.registry.imageId=ghcr.io/deephaven/cpp-clients-multi-base@sha256:9a5efc669bf239d9c9f2ad6f934201219efa0112b93864f1afaa8bf510dedf49 +deephaven.registry.imageId=ghcr.io/deephaven/cpp-clients-multi-base@sha256:5d7dce6defbd30b3181a42d05988b763750467f2258e341b7431b378303cca4b deephaven.registry.platform=linux/amd64 diff --git a/docker/registry/go/gradle.properties b/docker/registry/go/gradle.properties index b10ac79995a..c4cc8f2a67f 100644 --- a/docker/registry/go/gradle.properties +++ b/docker/registry/go/gradle.properties @@ -1,3 +1,3 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=golang:1 -deephaven.registry.imageId=golang@sha256:73f06be4578c9987ce560087e2e2ea6485fb605e3910542cadd8fa09fc5f3e31 +deephaven.registry.imageId=golang@sha256:51a6466e8dbf3e00e422eb0f7a97ac450b2d57b33617bbe8d2ee0bddcd9d0d37 diff --git a/docker/registry/manylinux2014_x86_64/gradle.properties b/docker/registry/manylinux2014_x86_64/gradle.properties index 591603c5813..4e2e6edf8b5 100644 --- a/docker/registry/manylinux2014_x86_64/gradle.properties +++ b/docker/registry/manylinux2014_x86_64/gradle.properties @@ -1,4 +1,4 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=quay.io/pypa/manylinux2014_x86_64:latest -deephaven.registry.imageId=quay.io/pypa/manylinux2014_x86_64@sha256:24858373a047b97fd3a8fe2ee28709479a828fa0ed88719158b728947eb53270 +deephaven.registry.imageId=quay.io/pypa/manylinux2014_x86_64@sha256:52199398003bb163291b6822eea54457db8dd0550381b6b90a506f8f1cfb96b9 deephaven.registry.platform=linux/amd64 diff --git a/docker/registry/minio/gradle.properties b/docker/registry/minio/gradle.properties index 81eee96edf9..4b4922c2198 100644 --- a/docker/registry/minio/gradle.properties +++ b/docker/registry/minio/gradle.properties @@ -1,3 +1,3 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=minio/minio:latest -deephaven.registry.imageId=minio/minio@sha256:ac591851803a79aee64bc37f66d77c56b0a4b6e12d9e5356380f4105510f2332 +deephaven.registry.imageId=minio/minio@sha256:1dce27c494a16bae114774f1cec295493f3613142713130c2d22dd5696be6ad3 diff --git 
a/docker/registry/protoc-base/gradle.properties b/docker/registry/protoc-base/gradle.properties index 50feb88c208..9995e8df95c 100644 --- a/docker/registry/protoc-base/gradle.properties +++ b/docker/registry/protoc-base/gradle.properties @@ -1,5 +1,5 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=ghcr.io/deephaven/protoc-base:latest -deephaven.registry.imageId=ghcr.io/deephaven/protoc-base@sha256:3cb04f768cfdb66b47154568a23772d42dcc55076c7cbb94ae950309d350a82a +deephaven.registry.imageId=ghcr.io/deephaven/protoc-base@sha256:90f2a29c3b28d5e5a6cdc37d58d36fb413e4e6ce7964bd00f1d4e696fca46e18 # TODO(deephaven-base-images#54): arm64 native image for cpp-client-base deephaven.registry.platform=linux/amd64 diff --git a/docker/registry/python/gradle.properties b/docker/registry/python/gradle.properties index eb6c4cf53ef..1281252f951 100644 --- a/docker/registry/python/gradle.properties +++ b/docker/registry/python/gradle.properties @@ -1,3 +1,3 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=python:3.10 -deephaven.registry.imageId=python@sha256:941b0bfddbf17d809fd1f457acbf55dfca014e3e0e3d592b1c9070491681bc02 +deephaven.registry.imageId=python@sha256:eb7df628043d68aa30019fed02052bd27f1431c3a0abe9299d1e4d804d4b11e0 diff --git a/docker/registry/selenium/gradle.properties b/docker/registry/selenium/gradle.properties index b930e284623..8292f490c6b 100644 --- a/docker/registry/selenium/gradle.properties +++ b/docker/registry/selenium/gradle.properties @@ -1,3 +1,3 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=selenium/standalone-firefox:4.27.0 -deephaven.registry.imageId=selenium/standalone-firefox@sha256:287cd73a59d241b7264fe5fab7dbd31f92d56b8ec81ba74f53ecc4997f218dff +deephaven.registry.imageId=selenium/standalone-firefox@sha256:29f5f0e96c32b219ad1101d259374346cc96d3b9b1820ca700dfdcac5bc332b9 diff --git a/docker/registry/server-base/gradle.properties b/docker/registry/server-base/gradle.properties index 7ea66725aa2..2ff42e7080a 100644 --- a/docker/registry/server-base/gradle.properties +++ b/docker/registry/server-base/gradle.properties @@ -1,3 +1,3 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=ghcr.io/deephaven/server-base:edge -deephaven.registry.imageId=ghcr.io/deephaven/server-base@sha256:a904aba9cd5f91b6e2f19c4e024702700dfa37f549b73d24e9821f8f02061b45 +deephaven.registry.imageId=ghcr.io/deephaven/server-base@sha256:270e1a42772d5c68d0c49af1078cfa1bdeb659de786f8d04f7e78deb6fc85836 diff --git a/docker/registry/slim-base/gradle.properties b/docker/registry/slim-base/gradle.properties index 7625b2beb98..516473d39f0 100644 --- a/docker/registry/slim-base/gradle.properties +++ b/docker/registry/slim-base/gradle.properties @@ -1,3 +1,3 @@ io.deephaven.project.ProjectType=DOCKER_REGISTRY deephaven.registry.imageName=ghcr.io/deephaven/server-slim-base:edge -deephaven.registry.imageId=ghcr.io/deephaven/server-slim-base@sha256:941e783bde091416f4b0359cc2a9df893f8340ada9bf4681f84875a91dfcebd1 +deephaven.registry.imageId=ghcr.io/deephaven/server-slim-base@sha256:14c6a88b60463f256496e3c582d7a00ce3ab7672912216582dde9adf3e33eeac diff --git a/docker/server-jetty/src/main/server-jetty/requirements.txt b/docker/server-jetty/src/main/server-jetty/requirements.txt index 763f86295a3..4b5e00f54fe 100644 --- a/docker/server-jetty/src/main/server-jetty/requirements.txt +++ b/docker/server-jetty/src/main/server-jetty/requirements.txt @@ -12,7 +12,7 @@ numba==0.60.0 numpy==2.0.2 
pandas==2.2.3 parso==0.8.4 -pyarrow==18.1.0 +pyarrow==19.0.0 python-dateutil==2.9.0.post0 pytz==2024.2 six==1.17.0 diff --git a/docker/server/src/main/server-netty/requirements.txt b/docker/server/src/main/server-netty/requirements.txt index 763f86295a3..4b5e00f54fe 100644 --- a/docker/server/src/main/server-netty/requirements.txt +++ b/docker/server/src/main/server-netty/requirements.txt @@ -12,7 +12,7 @@ numba==0.60.0 numpy==2.0.2 pandas==2.2.3 parso==0.8.4 -pyarrow==18.1.0 +pyarrow==19.0.0 python-dateutil==2.9.0.post0 pytz==2024.2 six==1.17.0 From 4b3ea4ba7b6bcae27efc2012548e5d1fc5cf9aaa Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Wed, 22 Jan 2025 10:07:29 -0800 Subject: [PATCH 2/3] feat: DH-18399: Add ParquetColumnResolver (#6558) --- .../util/annotations/InternalUseOnly.java | 7 +- .../iceberg/junit5/SqliteCatalogBase.java | 54 +-- extensions/parquet/base/build.gradle | 13 +- .../parquet/base/ColumnChunkReaderImpl.java | 5 +- .../parquet/base/ColumnWriterImpl.java | 9 +- .../parquet/base/RowGroupWriterImpl.java | 24 +- .../parquet/impl/ColumnDescriptorUtil.java | 24 + .../parquet/impl/ParquetSchemaUtil.java | 190 ++++++++ .../deephaven/parquet/impl/package-info.java | 4 + .../parquet/base/TestParquetTimeUtils.java | 31 +- .../parquet/impl/ParquetSchemaUtilTest.java | 163 +++++++ .../parquet/table/ParquetInstructions.java | 52 ++- .../parquet/table/ParquetSchemaReader.java | 5 +- .../table/location/ParquetColumnResolver.java | 46 ++ .../location/ParquetColumnResolverMap.java | 58 +++ .../ParquetFieldIdColumnResolverFactory.java | 157 +++++++ .../table/location/ParquetTableLocation.java | 30 +- .../location/ParquetTableLocationKey.java | 28 +- .../table/ParquetInstructionsTest.java | 56 +++ .../table/ParquetTableReadWriteTest.java | 60 +-- .../parquet/table/TestParquetTools.java | 413 +++++++++++------- ...rquetFieldIdColumnResolverFactoryTest.java | 192 ++++++++ 22 files changed, 1354 insertions(+), 267 deletions(-) create mode 100644 extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ColumnDescriptorUtil.java create mode 100644 extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ParquetSchemaUtil.java create mode 100644 extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/package-info.java create mode 100644 extensions/parquet/base/src/test/java/io/deephaven/parquet/impl/ParquetSchemaUtilTest.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolver.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolverMap.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactory.java create mode 100644 extensions/parquet/table/src/test/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactoryTest.java diff --git a/Util/src/main/java/io/deephaven/util/annotations/InternalUseOnly.java b/Util/src/main/java/io/deephaven/util/annotations/InternalUseOnly.java index 029b30267d3..f7966473bf5 100644 --- a/Util/src/main/java/io/deephaven/util/annotations/InternalUseOnly.java +++ b/Util/src/main/java/io/deephaven/util/annotations/InternalUseOnly.java @@ -9,10 +9,11 @@ import java.lang.annotation.Target; /** - * Indicates that a particular method is for internal use only and should not be used by client code. It is subject to - * change/removal at any time. 
+ * Indicates that a particular {@link ElementType#METHOD method}, {@link ElementType#CONSTRUCTOR constructor}, + * {@link ElementType#TYPE type}, or {@link ElementType#PACKAGE package} is for internal use only and should not be used + * by client code. It is subject to change/removal at any time. */ -@Target({ElementType.METHOD, ElementType.CONSTRUCTOR, ElementType.TYPE}) +@Target({ElementType.METHOD, ElementType.CONSTRUCTOR, ElementType.TYPE, ElementType.PACKAGE}) @Inherited @Documented public @interface InternalUseOnly { diff --git a/extensions/iceberg/src/test/java/io/deephaven/iceberg/junit5/SqliteCatalogBase.java b/extensions/iceberg/src/test/java/io/deephaven/iceberg/junit5/SqliteCatalogBase.java index 11962bf71ec..3bb708fb715 100644 --- a/extensions/iceberg/src/test/java/io/deephaven/iceberg/junit5/SqliteCatalogBase.java +++ b/extensions/iceberg/src/test/java/io/deephaven/iceberg/junit5/SqliteCatalogBase.java @@ -34,8 +34,9 @@ import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.types.Types; -import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; import org.jetbrains.annotations.Nullable; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -53,6 +54,11 @@ import java.util.List; import java.util.stream.Collectors; import static io.deephaven.engine.testutil.TstUtils.assertTableEquals; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.Types.buildMessage; +import static org.apache.parquet.schema.Types.optional; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.failBecauseExceptionWasNotThrown; @@ -416,8 +422,12 @@ void testColumnRenameWhileWriting() throws URISyntaxException { { final List parquetFiles = getAllParquetFilesFromDataFiles(tableIdentifier); assertThat(parquetFiles).hasSize(1); - verifyFieldIdsFromParquetFile(parquetFiles.get(0), originalDefinition.getColumnNames(), - nameToFieldIdFromSchema); + final MessageType expectedSchema = buildMessage() + .addFields( + optional(INT32).id(1).as(intType(32, true)).named("intCol"), + optional(DOUBLE).id(2).named("doubleCol")) + .named("root"); + verifySchema(parquetFiles.get(0), expectedSchema); } final Table moreData = TableTools.emptyTable(5) @@ -442,10 +452,18 @@ void testColumnRenameWhileWriting() throws URISyntaxException { final List parquetFiles = getAllParquetFilesFromDataFiles(tableIdentifier); assertThat(parquetFiles).hasSize(2); - verifyFieldIdsFromParquetFile(parquetFiles.get(0), moreData.getDefinition().getColumnNames(), - newNameToFieldId); - verifyFieldIdsFromParquetFile(parquetFiles.get(1), originalDefinition.getColumnNames(), - nameToFieldIdFromSchema); + final MessageType expectedSchema0 = buildMessage() + .addFields( + optional(INT32).id(1).as(intType(32, true)).named("newIntCol"), + optional(DOUBLE).id(2).named("newDoubleCol")) + .named("root"); + final MessageType expectedSchema1 = buildMessage() + .addFields( + optional(INT32).id(1).as(intType(32, true)).named("intCol"), + optional(DOUBLE).id(2).named("doubleCol")) + .named("root"); + verifySchema(parquetFiles.get(0), expectedSchema0); 
+ verifySchema(parquetFiles.get(1), expectedSchema1); } // TODO: This is failing because we don't map columns based on the column ID when reading. Uncomment this @@ -455,31 +473,13 @@ void testColumnRenameWhileWriting() throws URISyntaxException { // moreData.renameColumns("intCol = newIntCol", "doubleCol = newDoubleCol")), fromIceberg); } - /** - * Verify that the schema of the parquet file read from the provided path has the provided column and corresponding - * field IDs. - */ - private void verifyFieldIdsFromParquetFile( - final String path, - final List columnNames, - final Map nameToFieldId) throws URISyntaxException { + private void verifySchema(String path, MessageType expectedSchema) throws URISyntaxException { final ParquetMetadata metadata = new ParquetTableLocationKey(new URI(path), 0, null, ParquetInstructions.builder() .setSpecialInstructions(dataInstructions()) .build()) .getMetadata(); - final List columnsMetadata = metadata.getFileMetaData().getSchema().getColumns(); - - final int numColumns = columnNames.size(); - for (int colIdx = 0; colIdx < numColumns; colIdx++) { - final String columnName = columnNames.get(colIdx); - final String columnNameFromParquetFile = columnsMetadata.get(colIdx).getPath()[0]; - assertThat(columnName).isEqualTo(columnNameFromParquetFile); - - final int expectedFieldId = nameToFieldId.get(columnName); - final int fieldIdFromParquetFile = columnsMetadata.get(colIdx).getPrimitiveType().getId().intValue(); - assertThat(fieldIdFromParquetFile).isEqualTo(expectedFieldId); - } + assertThat(metadata.getFileMetaData().getSchema()).isEqualTo(expectedSchema); } /** diff --git a/extensions/parquet/base/build.gradle b/extensions/parquet/base/build.gradle index 76bcfd2f7ed..9aff6b1022f 100644 --- a/extensions/parquet/base/build.gradle +++ b/extensions/parquet/base/build.gradle @@ -22,5 +22,16 @@ dependencies { implementation libs.guava compileOnly libs.jetbrains.annotations - testImplementation libs.junit4 + + testImplementation libs.assertj + + testImplementation platform(libs.junit.bom) + testImplementation libs.junit.jupiter + testRuntimeOnly libs.junit.jupiter.engine + testRuntimeOnly libs.junit.platform.launcher +} + +tasks.withType(Test).configureEach { + useJUnitPlatform { + } } diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java index 47126bc595a..3c51f99e47d 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java @@ -4,6 +4,7 @@ package io.deephaven.parquet.base; import io.deephaven.UncheckedDeephavenException; +import io.deephaven.parquet.impl.ParquetSchemaUtil; import io.deephaven.util.channel.SeekableChannelContext; import io.deephaven.util.channel.SeekableChannelsProvider; import io.deephaven.parquet.compress.CompressorAdapter; @@ -68,8 +69,8 @@ final class ColumnChunkReaderImpl implements ColumnChunkReader { this.columnName = columnName; this.channelsProvider = channelsProvider; this.columnChunk = columnChunk; - this.path = type - .getColumnDescription(columnChunk.meta_data.getPath_in_schema().toArray(new String[0])); + this.path = + ParquetSchemaUtil.columnDescriptor(type, columnChunk.meta_data.getPath_in_schema()).orElseThrow(); if (columnChunk.getMeta_data().isSetCodec()) { decompressor = DeephavenCompressorAdapterFactory.getInstance() 
.getByName(columnChunk.getMeta_data().getCodec().name()); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java index 53e8f47103c..34334df51b2 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnWriterImpl.java @@ -32,6 +32,7 @@ import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; import java.util.EnumSet; +import java.util.Objects; import java.util.Set; import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; @@ -76,11 +77,11 @@ final class ColumnWriterImpl implements ColumnWriter { final CompressorAdapter compressorAdapter, final int targetPageSize, final ByteBufferAllocator allocator) { - this.countingOutput = countingOutput; - this.column = column; - this.compressorAdapter = compressorAdapter; + this.countingOutput = Objects.requireNonNull(countingOutput); + this.column = Objects.requireNonNull(column); + this.compressorAdapter = Objects.requireNonNull(compressorAdapter); this.targetPageSize = targetPageSize; - this.allocator = allocator; + this.allocator = Objects.requireNonNull(allocator); dlEncoder = column.getMaxDefinitionLevel() == 0 ? null : new RunLengthBitPackingHybridEncoder( getWidthFromMaxInt(column.getMaxDefinitionLevel()), MIN_SLAB_SIZE, targetPageSize, allocator); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java index 6d387228866..07e80a62505 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RowGroupWriterImpl.java @@ -5,6 +5,7 @@ import com.google.common.io.CountingOutputStream; import io.deephaven.parquet.compress.CompressorAdapter; +import io.deephaven.parquet.impl.ParquetSchemaUtil; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; @@ -16,10 +17,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Objects; final class RowGroupWriterImpl implements RowGroupWriter { private final CountingOutputStream countingOutput; - private final MessageType type; + private final MessageType schema; private final int targetPageSize; private final ByteBufferAllocator allocator; private ColumnWriterImpl activeWriter; @@ -28,33 +30,33 @@ final class RowGroupWriterImpl implements RowGroupWriter { private final CompressorAdapter compressorAdapter; RowGroupWriterImpl(CountingOutputStream countingOutput, - MessageType type, + MessageType schema, int targetPageSize, ByteBufferAllocator allocator, CompressorAdapter compressorAdapter) { - this(countingOutput, type, targetPageSize, allocator, new BlockMetaData(), compressorAdapter); + this(countingOutput, schema, targetPageSize, allocator, new BlockMetaData(), compressorAdapter); } private RowGroupWriterImpl(CountingOutputStream countingOutput, - MessageType type, + MessageType schema, int targetPageSize, ByteBufferAllocator allocator, BlockMetaData blockMetaData, CompressorAdapter compressorAdapter) { - this.countingOutput = countingOutput; - this.type = type; + this.countingOutput = Objects.requireNonNull(countingOutput); + 
this.schema = Objects.requireNonNull(schema); this.targetPageSize = targetPageSize; - this.allocator = allocator; - this.blockMetaData = blockMetaData; - this.compressorAdapter = compressorAdapter; + this.allocator = Objects.requireNonNull(allocator); + this.blockMetaData = Objects.requireNonNull(blockMetaData); + this.compressorAdapter = Objects.requireNonNull(compressorAdapter); } String[] getPrimitivePath(String columnName) { String[] result = {columnName}; Type rollingType; - while (!(rollingType = type.getType(result)).isPrimitive()) { + while (!(rollingType = schema.getType(result)).isPrimitive()) { GroupType groupType = rollingType.asGroupType(); if (groupType.getFieldCount() != 1) { throw new UnsupportedOperationException("Encountered struct at:" + Arrays.toString(result)); @@ -74,7 +76,7 @@ public ColumnWriter addColumn(String columnName) { } activeWriter = new ColumnWriterImpl(this, countingOutput, - type.getColumnDescription(getPrimitivePath(columnName)), + ParquetSchemaUtil.columnDescriptor(schema, getPrimitivePath(columnName)).orElseThrow(), compressorAdapter, targetPageSize, allocator); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ColumnDescriptorUtil.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ColumnDescriptorUtil.java new file mode 100644 index 00000000000..0eb346c3e12 --- /dev/null +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ColumnDescriptorUtil.java @@ -0,0 +1,24 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.impl; + +import org.apache.parquet.column.ColumnDescriptor; +import org.jetbrains.annotations.NotNull; + +import javax.annotation.Nullable; + +public final class ColumnDescriptorUtil { + /** + * A more thorough check of {@link ColumnDescriptor} equality. In addition to + * {@link ColumnDescriptor#equals(Object)} which only checks the {@link ColumnDescriptor#getPath()}, this also + * checks for the equality of {@link ColumnDescriptor#getPrimitiveType()}, + * {@link ColumnDescriptor#getMaxRepetitionLevel()}, and {@link ColumnDescriptor#getMaxDefinitionLevel()}. 
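As a hedged illustration of the difference (the descriptors below are hypothetical, not taken from this patch): two descriptors that share a path compare equal under ColumnDescriptor#equals, but not under this helper when their primitive types differ.

    import io.deephaven.parquet.impl.ColumnDescriptorUtil;
    import org.apache.parquet.column.ColumnDescriptor;
    import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
    import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
    import static org.apache.parquet.schema.Types.required;

    class ColumnDescriptorEqualityExample {
        public static void main(String[] args) {
            // Same path, different primitive types, no repetition/definition levels.
            final ColumnDescriptor a = new ColumnDescriptor(new String[] {"Foo"}, required(INT32).named("Foo"), 0, 0);
            final ColumnDescriptor b = new ColumnDescriptor(new String[] {"Foo"}, required(INT64).named("Foo"), 0, 0);
            System.out.println(a.equals(b)); // true: ColumnDescriptor#equals only compares the path
            System.out.println(ColumnDescriptorUtil.equals(a, b)); // false: the primitive types disagree
        }
    }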
+ */ + public static boolean equals(@NotNull ColumnDescriptor x, @Nullable ColumnDescriptor y) { + return x == y || (x.equals(y) + && x.getPrimitiveType().equals(y.getPrimitiveType()) + && x.getMaxRepetitionLevel() == y.getMaxRepetitionLevel() + && x.getMaxDefinitionLevel() == y.getMaxDefinitionLevel()); + } +} diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ParquetSchemaUtil.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ParquetSchemaUtil.java new file mode 100644 index 00000000000..5b24b9db8f5 --- /dev/null +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/ParquetSchemaUtil.java @@ -0,0 +1,190 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.impl; + +import io.deephaven.base.verify.Assert; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Type.Repetition; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Deque; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Consumer; +import java.util.function.Predicate; + +/** + * Various improved ways of traversing {@link MessageType}. + */ +public final class ParquetSchemaUtil { + + public interface Visitor { + + /** + * Accept a Parquet column. + * + *

+ * This represents the constituent parts of a {@link ColumnDescriptor} in an easier-to-consume fashion. In + * particular, it is useful when the consumer wants to iterate the Typed-path from MessageType root to leaf + * without needing to resort to extraneous allocation of {@link MessageType#getType(String...)} or state + * management needed via {@link GroupType#getType(String)}. The arguments of this method can be made into a + * {@link ColumnDescriptor} using {@link ParquetSchemaUtil#makeColumnDescriptor(Collection, PrimitiveType)}. + * + * @param typePath the fully typed path + * @param primitiveType the leaf primitiveType, guaranteed to be the last element of path + */ + void accept(Collection typePath, PrimitiveType primitiveType); + } + + /** + * A more efficient implementation of {@link MessageType#getColumns()}. + */ + public static List columns(MessageType schema) { + final List out = new ArrayList<>(); + walkColumnDescriptors(schema, out::add); + return out; + } + + /** + * A more efficient implementation of {@link MessageType#getPaths()}. + */ + public static List paths(MessageType schema) { + final List out = new ArrayList<>(); + walk(schema, (typePath, primitiveType) -> out.add(makePath(typePath))); + return out; + } + + /** + * An alternative interface for traversing the column descriptors of a Parquet {@code schema}. + */ + public static void walkColumnDescriptors(MessageType schema, Consumer consumer) { + walk(schema, new ColumnDescriptorVisitor(consumer)); + } + + /** + * An alternative interface for traversing the leaf fields of a Parquet {@code schema}. + */ + public static void walk(MessageType schema, Visitor visitor) { + walk(schema, visitor, new ArrayDeque<>()); + } + + /** + * A more efficient implementation of {@link MessageType#getColumnDescription(String[])}. + */ + public static Optional columnDescriptor(MessageType schema, String[] path) { + if (path.length == 0) { + return Optional.empty(); + } + int repeatedCount = 0; + int notRequiredCount = 0; + GroupType current = schema; + for (int i = 0; i < path.length - 1; ++i) { + if (!current.containsField(path[i])) { + return Optional.empty(); + } + final Type field = current.getFields().get(current.getFieldIndex(path[i])); + if (field == null || field.isPrimitive()) { + return Optional.empty(); + } + current = field.asGroupType(); + if (isRepeated(current)) { + ++repeatedCount; + } + if (!isRequired(current)) { + ++notRequiredCount; + } + } + final PrimitiveType primitiveType; + { + if (!current.containsField(path[path.length - 1])) { + return Optional.empty(); + } + final Type field = current.getFields().get(current.getFieldIndex(path[path.length - 1])); + if (field == null || !field.isPrimitive()) { + return Optional.empty(); + } + primitiveType = field.asPrimitiveType(); + if (isRepeated(primitiveType)) { + ++repeatedCount; + } + if (!isRequired(primitiveType)) { + ++notRequiredCount; + } + } + return Optional.of(new ColumnDescriptor(path, primitiveType, repeatedCount, notRequiredCount)); + } + + /** + * A more efficient implementation of {@link MessageType#getColumnDescription(String[])}. 
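As a usage sketch (a hypothetical caller, assuming only the walk and Visitor APIs declared above), the traversal can enumerate every leaf column without materializing ColumnDescriptor objects:

    import io.deephaven.parquet.impl.ParquetSchemaUtil;
    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.Type;
    import java.util.stream.Collectors;

    class SchemaWalkExample {
        // Prints one line per leaf: the dotted path from the root, and the leaf's primitive type.
        static void dumpLeaves(MessageType schema) {
            ParquetSchemaUtil.walk(schema, (typePath, primitiveType) -> {
                final String path = typePath.stream()
                        .map(Type::getName)
                        .collect(Collectors.joining("."));
                System.out.println(path + " -> " + primitiveType.getPrimitiveTypeName());
            });
        }
    }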
+ */ + public static Optional columnDescriptor(MessageType schema, List path) { + return columnDescriptor(schema, path.toArray(new String[0])); + } + + public static ColumnDescriptor makeColumnDescriptor(Collection typePath, PrimitiveType primitiveType) { + final String[] path = makePath(typePath); + final int maxRep = (int) typePath.stream().filter(ParquetSchemaUtil::isRepeated).count(); + final int maxDef = (int) typePath.stream().filter(Predicate.not(ParquetSchemaUtil::isRequired)).count(); + return new ColumnDescriptor(path, primitiveType, maxRep, maxDef); + } + + /** + * Checks if {@code schema} contains {@code descriptor} based on + * {@link ColumnDescriptorUtil#equals(ColumnDescriptor, ColumnDescriptor)}. + */ + public static boolean contains(MessageType schema, ColumnDescriptor descriptor) { + return columnDescriptor(schema, descriptor.getPath()) + .filter(cd -> ColumnDescriptorUtil.equals(descriptor, cd)) + .isPresent(); + } + + private static String[] makePath(Collection typePath) { + return typePath.stream().map(Type::getName).toArray(String[]::new); + } + + private static void walk(Type type, Visitor visitor, Deque stack) { + if (type.isPrimitive()) { + visitor.accept(stack, type.asPrimitiveType()); + return; + } + walk(type.asGroupType(), visitor, stack); + } + + private static void walk(GroupType type, Visitor visitor, Deque stack) { + for (final Type field : type.getFields()) { + Assert.eqTrue(stack.offerLast(field), "stack.offerLast(field)"); + walk(field, visitor, stack); + Assert.eq(stack.pollLast(), "stack.pollLast()", field, "field"); + } + } + + private static boolean isRepeated(Type x) { + return x.isRepetition(Repetition.REPEATED); + } + + private static boolean isRequired(Type x) { + return x.isRepetition(Repetition.REQUIRED); + } + + private static class ColumnDescriptorVisitor implements Visitor { + + private final Consumer consumer; + + public ColumnDescriptorVisitor(Consumer consumer) { + this.consumer = Objects.requireNonNull(consumer); + } + + @Override + public void accept(Collection typePath, PrimitiveType primitiveType) { + consumer.accept(makeColumnDescriptor(typePath, primitiveType)); + } + } +} diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/package-info.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/package-info.java new file mode 100644 index 00000000000..d28eb95bb22 --- /dev/null +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/impl/package-info.java @@ -0,0 +1,4 @@ +@InternalUseOnly +package io.deephaven.parquet.impl; + +import io.deephaven.util.annotations.InternalUseOnly; diff --git a/extensions/parquet/base/src/test/java/io/deephaven/parquet/base/TestParquetTimeUtils.java b/extensions/parquet/base/src/test/java/io/deephaven/parquet/base/TestParquetTimeUtils.java index f32f9975cfe..cf2c395c96d 100644 --- a/extensions/parquet/base/src/test/java/io/deephaven/parquet/base/TestParquetTimeUtils.java +++ b/extensions/parquet/base/src/test/java/io/deephaven/parquet/base/TestParquetTimeUtils.java @@ -5,52 +5,53 @@ import io.deephaven.time.DateTimeUtils; import io.deephaven.util.QueryConstants; -import junit.framework.TestCase; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneId; -public class TestParquetTimeUtils { +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +class TestParquetTimeUtils { @Test - public void testEpochNanosUTC() { + void testEpochNanosUTC() { final long 
nanos = 123456789123456789L; final Instant dt2 = Instant.ofEpochSecond(0, nanos); final LocalDateTime ldt = LocalDateTime.ofInstant(dt2, ZoneId.of("UTC")); - TestCase.assertEquals(nanos, ParquetTimeUtils.epochNanosUTC(ldt)); - TestCase.assertEquals(QueryConstants.NULL_LONG, ParquetTimeUtils.epochNanosUTC(null)); + assertThat(ParquetTimeUtils.epochNanosUTC(ldt)).isEqualTo(nanos); + assertThat(ParquetTimeUtils.epochNanosUTC(null)).isEqualTo(QueryConstants.NULL_LONG); } @Test - public void testEpochNanosTo() { + void testEpochNanosTo() { final long nanos = 123456789123456789L; final Instant dt2 = Instant.ofEpochSecond(0, nanos); final LocalDateTime ldt = LocalDateTime.ofInstant(dt2, ZoneId.of("UTC")); - TestCase.assertEquals(ldt, ParquetTimeUtils.epochNanosToLocalDateTimeUTC(nanos)); - TestCase.assertNull(ParquetTimeUtils.epochNanosToLocalDateTimeUTC(QueryConstants.NULL_LONG)); + assertThat(ParquetTimeUtils.epochNanosToLocalDateTimeUTC(nanos)).isEqualTo(ldt); + assertThat(ParquetTimeUtils.epochNanosToLocalDateTimeUTC(QueryConstants.NULL_LONG)).isNull(); } @Test - public void testEpochMicrosTo() { + void testEpochMicrosTo() { long nanos = 123456789123456789L; final long micros = DateTimeUtils.nanosToMicros(nanos); nanos = DateTimeUtils.microsToNanos(micros); final Instant dt2 = Instant.ofEpochSecond(0, nanos); final LocalDateTime ldt = LocalDateTime.ofInstant(dt2, ZoneId.of("UTC")); - TestCase.assertEquals(ldt, ParquetTimeUtils.epochMicrosToLocalDateTimeUTC(micros)); - TestCase.assertNull(ParquetTimeUtils.epochMicrosToLocalDateTimeUTC(QueryConstants.NULL_LONG)); + assertThat(ParquetTimeUtils.epochMicrosToLocalDateTimeUTC(micros)).isEqualTo(ldt); + assertThat(ParquetTimeUtils.epochMicrosToLocalDateTimeUTC(QueryConstants.NULL_LONG)).isNull(); } @Test - public void testEpochMillisTo() { + void testEpochMillisTo() { long nanos = 123456789123456789L; final long millis = DateTimeUtils.nanosToMillis(nanos); nanos = DateTimeUtils.millisToNanos(millis); final Instant dt2 = Instant.ofEpochSecond(0, nanos); final LocalDateTime ldt = LocalDateTime.ofInstant(dt2, ZoneId.of("UTC")); - TestCase.assertEquals(ldt, ParquetTimeUtils.epochMillisToLocalDateTimeUTC(millis)); - TestCase.assertNull(ParquetTimeUtils.epochMillisToLocalDateTimeUTC(QueryConstants.NULL_LONG)); + assertThat(ParquetTimeUtils.epochMillisToLocalDateTimeUTC(millis)).isEqualTo(ldt); + assertThat(ParquetTimeUtils.epochMillisToLocalDateTimeUTC(QueryConstants.NULL_LONG)).isNull(); } } diff --git a/extensions/parquet/base/src/test/java/io/deephaven/parquet/impl/ParquetSchemaUtilTest.java b/extensions/parquet/base/src/test/java/io/deephaven/parquet/impl/ParquetSchemaUtilTest.java new file mode 100644 index 00000000000..de6cfaccce3 --- /dev/null +++ b/extensions/parquet/base/src/test/java/io/deephaven/parquet/impl/ParquetSchemaUtilTest.java @@ -0,0 +1,163 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.impl; + +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.InvalidRecordException; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; +import org.junit.jupiter.api.Test; + +import java.util.Comparator; +import java.util.List; +import java.util.function.BiPredicate; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Types.optional; +import static org.apache.parquet.schema.Types.repeated; +import static org.apache.parquet.schema.Types.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.failBecauseExceptionWasNotThrown; + +class ParquetSchemaUtilTest { + + private static final MessageType SCHEMA; + + private static final ColumnDescriptor[] NON_EXISTENT_COLUMNS = { + new ColumnDescriptor(new String[] {"Required"}, optional(INT32).named("Required"), 0, 0), + new ColumnDescriptor(new String[] {"Required"}, repeated(INT32).named("Required"), 0, 0), + new ColumnDescriptor(new String[] {"Required"}, required(INT32).id(42).named("Required"), 0, 0), + new ColumnDescriptor(new String[] {"Required2"}, required(INT32).named("Required"), 0, 0), + new ColumnDescriptor(new String[] {"Required"}, required(INT32).named("Required2"), 0, 0), + new ColumnDescriptor(new String[] {"Required"}, required(INT32).named("Required"), 1, 0), + new ColumnDescriptor(new String[] {"Required"}, required(INT32).named("Required"), 0, 1), + new ColumnDescriptor(new String[] {"Required"}, required(INT64).named("Required"), 0, 0), + new ColumnDescriptor(new String[] {"Required"}, required(INT32).as(intType(16)).named("Required"), 0, + 0), + new ColumnDescriptor(new String[] {"Required"}, optional(INT32).named("Required"), 0, 0), + new ColumnDescriptor(new String[] {}, repeated(INT32).named("Required"), 0, 0) + }; + + private static final String[][] NON_EXISTENT_LEAF_PATHS = { + {}, + {""}, + {"root"}, + {"required"}, + {"repeated"}, + {"optional"}, + {"REQUIRED"}, + {"REPEATED"}, + {"OPTIONAL"}, + {"RequiredGroup"}, + {"RepeatedGroup"}, + {"OptionalGroup"}, + {"RequiredGroup2"}, + {"RepeatedGroup2"}, + {"OptionalGroup2"}, + {"RequiredGroup", ""}, + {"RequiredGroup", "REQUIRED"}, + {"RequiredGroup2", "REQUIRED"}, + {"RequiredGroup2", "RequiredGroup", "REQUIRED"}, + {"RequiredGroup2", "REQUIREDGROUP", "Required"}, + {"REQUIREDGROUP2", "RequiredGroup", "Required"}, + {"foo"}, + {"foo", "bar"}, + {"foo", "bar", "baz"}, + {"foo", "bar", "baz", "zip"}, + }; + + static { + final PrimitiveType required = required(INT32).named("Required"); + final PrimitiveType repeated = repeated(INT32).named("Repeated"); + final PrimitiveType optional = optional(INT32).named("Optional"); + final GroupType l1 = Types.requiredList().element(required).named("L1"); + final GroupType l2 = Types.optionalList().element(required).named("L2"); + final GroupType requiredGroup = Types.requiredGroup() + .addFields(required, repeated, optional, l1, l2) + .named("RequiredGroup"); + final GroupType repeatedGroup = Types.repeatedGroup() + .addFields(required, repeated, optional, l1, l2) + .named("RepeatedGroup"); + final GroupType optionalGroup = Types.optionalGroup() + .addFields(required, repeated, optional, l1, l2) + .named("OptionalGroup"); + final GroupType requiredGroup2 = Types.requiredGroup() + .addFields(required, repeated, optional, l1, l2, requiredGroup, repeatedGroup, optionalGroup) + .named("RequiredGroup2"); + final GroupType repeatedGroup2 = Types.repeatedGroup() + .addFields(required, repeated, optional, l1, l2, requiredGroup, repeatedGroup, optionalGroup) + .named("RepeatedGroup2"); + final GroupType optionalGroup2 = Types.optionalGroup() + .addFields(required, repeated, optional, l1, l2, requiredGroup, repeatedGroup, optionalGroup) + .named("OptionalGroup2"); + SCHEMA = Types.buildMessage() + 
.addFields(required, repeated, optional, l1, l2, requiredGroup, repeatedGroup, optionalGroup, + requiredGroup2, + repeatedGroup2, optionalGroup2) + .named("root"); + } + + @Test + void columnsEmpty() { + final MessageType schema = Types.buildMessage().named("root"); + final List columns = ParquetSchemaUtil.columns(schema); + assertThat(columns) + .usingElementComparator(equalityMethod(ColumnDescriptorUtil::equals)) + .isEqualTo(schema.getColumns()); + } + + @Test + void columns() { + final List columns = ParquetSchemaUtil.columns(SCHEMA); + assertThat(columns) + .usingElementComparator(equalityMethod(ColumnDescriptorUtil::equals)) + .isEqualTo(SCHEMA.getColumns()); + } + + @Test + void columnDescriptor() { + for (ColumnDescriptor expected : ParquetSchemaUtil.columns(SCHEMA)) { + assertThat(ParquetSchemaUtil.columnDescriptor(SCHEMA, expected.getPath())) + .usingValueComparator(equalityMethod(ColumnDescriptorUtil::equals)) + .hasValue(expected); + // verify Parquet library has same behavior + assertThat(SCHEMA.getColumnDescription(expected.getPath())) + .usingComparator(equalityMethod(ColumnDescriptorUtil::equals)) + .isEqualTo(expected); + } + for (String[] nonExistentPath : NON_EXISTENT_LEAF_PATHS) { + assertThat(ParquetSchemaUtil.columnDescriptor(SCHEMA, nonExistentPath)).isEmpty(); + // verify Parquet library has similar behavior + try { + SCHEMA.getColumnDescription(nonExistentPath); + failBecauseExceptionWasNotThrown(Throwable.class); + } catch (InvalidRecordException | ClassCastException e) { + // good enough + } + } + } + + @Test + void contains() { + for (ColumnDescriptor column : ParquetSchemaUtil.columns(SCHEMA)) { + assertThat(ParquetSchemaUtil.contains(SCHEMA, column)).isTrue(); + } + for (ColumnDescriptor column : NON_EXISTENT_COLUMNS) { + assertThat(ParquetSchemaUtil.contains(SCHEMA, column)).isFalse(); + } + } + + /** + * This is not a valid comparator; it may only be used with assertJ for equality purposes and not comparison + * purposes. See Support specialized equality methods + */ + private static Comparator equalityMethod(BiPredicate predicate) { + // noinspection ComparatorMethodParameterNotUsed + return (o1, o2) -> predicate.test(o1, o2) ? 0 : -1; + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java index d8fa3ff9aa7..e8493fc2241 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java @@ -12,8 +12,10 @@ import io.deephaven.hash.KeyedObjectKey; import io.deephaven.hash.KeyedObjectKey.Basic; import io.deephaven.parquet.base.ParquetUtils; +import io.deephaven.parquet.table.location.ParquetColumnResolver; import io.deephaven.util.annotations.VisibleForTesting; import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.FileMetaData; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -167,6 +169,8 @@ public final String getColumnNameFromParquetColumnNameOrDefault(final String par public abstract Optional>> getIndexColumns(); + public abstract Optional getColumnResolverFactory(); + /** * Creates a new {@link ParquetInstructions} object with the same properties as the current object but definition * set as the provided {@link TableDefinition}. 
@@ -316,6 +320,11 @@ public Optional>> getIndexColumns() { return Optional.empty(); } + @Override + public Optional getColumnResolverFactory() { + return Optional.empty(); + } + @Override public ParquetInstructions withTableDefinition(@Nullable final TableDefinition useDefinition) { return withTableDefinitionAndLayout(useDefinition, null); @@ -333,7 +342,7 @@ public ParquetInstructions withTableDefinitionAndLayout( return new ReadOnly(null, null, getCompressionCodecName(), getMaximumDictionaryKeys(), getMaximumDictionarySize(), isLegacyParquet(), getTargetPageSize(), isRefreshing(), getSpecialInstructions(), generateMetadataFiles(), baseNameForPartitionedParquetData(), - useLayout, useDefinition, null, null); + useLayout, useDefinition, null, null, null); } @Override @@ -341,7 +350,7 @@ ParquetInstructions withIndexColumns(final Collection> indexColumns return new ReadOnly(null, null, getCompressionCodecName(), getMaximumDictionaryKeys(), getMaximumDictionarySize(), isLegacyParquet(), getTargetPageSize(), isRefreshing(), getSpecialInstructions(), generateMetadataFiles(), baseNameForPartitionedParquetData(), - null, null, indexColumns, null); + null, null, indexColumns, null, null); } @Override @@ -458,6 +467,7 @@ private static final class ReadOnly extends ParquetInstructions { private final TableDefinition tableDefinition; private final Collection> indexColumns; private final OnWriteCompleted onWriteCompleted; + private final ParquetColumnResolver.Factory columnResolver; private ReadOnly( final KeyedObjectHashMap columnNameToInstructions, @@ -474,7 +484,8 @@ private ReadOnly( final ParquetFileLayout fileLayout, final TableDefinition tableDefinition, final Collection> indexColumns, - final OnWriteCompleted onWriteCompleted) { + final OnWriteCompleted onWriteCompleted, + final ParquetColumnResolver.Factory columnResolver) { this.columnNameToInstructions = columnNameToInstructions; this.parquetColumnNameToInstructions = parquetColumnNameToColumnName; this.compressionCodecName = compressionCodecName; @@ -493,6 +504,12 @@ private ReadOnly( .map(List::copyOf) .collect(Collectors.toUnmodifiableList()); this.onWriteCompleted = onWriteCompleted; + this.columnResolver = columnResolver; + if (columnResolver != null) { + if (tableDefinition == null) { + throw new IllegalArgumentException("When setting columnResolver, tableDefinition must be provided"); + } + } } private T getOrDefault(final String columnName, final T defaultValue, @@ -617,6 +634,11 @@ public Optional>> getIndexColumns() { return Optional.ofNullable(indexColumns); } + @Override + public Optional getColumnResolverFactory() { + return Optional.ofNullable(columnResolver); + } + @Override public ParquetInstructions withTableDefinition(@Nullable final TableDefinition useDefinition) { return withTableDefinitionAndLayout(useDefinition, fileLayout); @@ -635,7 +657,7 @@ public ParquetInstructions withTableDefinitionAndLayout( getCompressionCodecName(), getMaximumDictionaryKeys(), getMaximumDictionarySize(), isLegacyParquet(), getTargetPageSize(), isRefreshing(), getSpecialInstructions(), generateMetadataFiles(), baseNameForPartitionedParquetData(), useLayout, useDefinition, - indexColumns, onWriteCompleted); + indexColumns, onWriteCompleted, columnResolver); } @Override @@ -644,7 +666,7 @@ ParquetInstructions withIndexColumns(final Collection> useIndexColu getCompressionCodecName(), getMaximumDictionaryKeys(), getMaximumDictionarySize(), isLegacyParquet(), getTargetPageSize(), isRefreshing(), getSpecialInstructions(), generateMetadataFiles(), 
baseNameForPartitionedParquetData(), fileLayout, - tableDefinition, useIndexColumns, onWriteCompleted); + tableDefinition, useIndexColumns, onWriteCompleted, columnResolver); } @Override @@ -709,6 +731,7 @@ public static class Builder { private TableDefinition tableDefinition; private Collection> indexColumns; private OnWriteCompleted onWriteCompleted; + private ParquetColumnResolver.Factory columnResolverFactory; /** * For each additional field added, make sure to update the copy constructor builder @@ -737,6 +760,7 @@ public Builder(final ParquetInstructions parquetInstructions) { tableDefinition = readOnlyParquetInstructions.getTableDefinition().orElse(null); indexColumns = readOnlyParquetInstructions.getIndexColumns().orElse(null); onWriteCompleted = readOnlyParquetInstructions.onWriteCompleted().orElse(null); + columnResolverFactory = readOnlyParquetInstructions.getColumnResolverFactory().orElse(null); } public Builder addColumnNameMapping(final String parquetColumnName, final String columnName) { @@ -974,6 +998,22 @@ public Builder setOnWriteCompleted(final OnWriteCompleted onWriteCompleted) { return this; } + /** + * Sets the column resolver factory to allow higher-level managers (such as Iceberg) to use advanced column + * resolution logic based on the table key and table location key. When set, + * {@link #setTableDefinition(TableDefinition)} must also be set. As such, the factory is not used for + * inference purposes. + * + *

+ * This is not typically set by end-users. + * + * @param columnResolverFactory the column resolver factory + */ + public Builder setColumnResolverFactory(ParquetColumnResolver.Factory columnResolverFactory) { + this.columnResolverFactory = columnResolverFactory; + return this; + } + public ParquetInstructions build() { final KeyedObjectHashMap columnNameToInstructionsOut = columnNameToInstructions; columnNameToInstructions = null; @@ -983,7 +1023,7 @@ public ParquetInstructions build() { return new ReadOnly(columnNameToInstructionsOut, parquetColumnNameToColumnNameOut, compressionCodecName, maximumDictionaryKeys, maximumDictionarySize, isLegacyParquet, targetPageSize, isRefreshing, specialInstructions, generateMetadataFiles, baseNameForPartitionedParquetData, fileLayout, - tableDefinition, indexColumns, onWriteCompleted); + tableDefinition, indexColumns, onWriteCompleted, columnResolverFactory); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java index 658d37bc20a..811a480a186 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java @@ -8,12 +8,12 @@ import io.deephaven.base.ClassUtil; import io.deephaven.base.Pair; import io.deephaven.engine.table.ColumnDefinition; +import io.deephaven.parquet.impl.ParquetSchemaUtil; import io.deephaven.stringset.StringSet; import io.deephaven.engine.table.impl.locations.TableDataException; import io.deephaven.parquet.table.metadata.CodecInfo; import io.deephaven.parquet.table.metadata.ColumnTypeInfo; import io.deephaven.parquet.table.metadata.TableInfo; -import io.deephaven.parquet.base.ParquetFileReader; import io.deephaven.util.SimpleTypeMap; import io.deephaven.vector.ByteVector; import io.deephaven.vector.CharVector; @@ -23,7 +23,6 @@ import io.deephaven.vector.LongVector; import io.deephaven.vector.ObjectVector; import io.deephaven.vector.ShortVector; -import org.apache.parquet.format.converter.ParquetMetadataConverter; import io.deephaven.util.codec.SimpleByteArrayCodec; import io.deephaven.util.codec.UTF8StringAsByteArrayCodec; import org.apache.commons.lang3.mutable.MutableObject; @@ -142,7 +141,7 @@ public static ParquetInstructions readParquetSchema( }; final ParquetMessageDefinition colDef = new ParquetMessageDefinition(); final Map parquetColumnNameToFirstPath = new HashMap<>(); - for (final ColumnDescriptor column : schema.getColumns()) { + for (final ColumnDescriptor column : ParquetSchemaUtil.columns(schema)) { if (column.getMaxRepetitionLevel() > 1) { // TODO (https://github.com/deephaven/deephaven-core/issues/871): Support this throw new UnsupportedOperationException("Unsupported maximum repetition level " diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolver.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolver.java new file mode 100644 index 00000000000..446b118e4a7 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolver.java @@ -0,0 +1,46 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.table.location; + +import io.deephaven.engine.table.impl.locations.TableKey; +import io.deephaven.parquet.table.ParquetInstructions; 
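For an end-to-end picture of the new hook, a hedged usage sketch follows; the file path, column names, and field ids below are hypothetical, and only setTableDefinition, setColumnResolverFactory, and ParquetFieldIdColumnResolverFactory.of come from this change:

    import io.deephaven.engine.table.ColumnDefinition;
    import io.deephaven.engine.table.Table;
    import io.deephaven.engine.table.TableDefinition;
    import io.deephaven.parquet.table.ParquetInstructions;
    import io.deephaven.parquet.table.ParquetTools;
    import io.deephaven.parquet.table.location.ParquetFieldIdColumnResolverFactory;
    import java.util.Map;

    class FieldIdReadExample {
        static Table read() {
            // Resolve Deephaven columns by Parquet field id instead of by column name.
            final ParquetFieldIdColumnResolverFactory factory =
                    ParquetFieldIdColumnResolverFactory.of(Map.of("MyInt", 42, "MyString", 43));
            final ParquetInstructions instructions = ParquetInstructions.builder()
                    .setTableDefinition(TableDefinition.of(
                            ColumnDefinition.ofInt("MyInt"),
                            ColumnDefinition.ofString("MyString")))
                    .setColumnResolverFactory(factory)
                    .build();
            return ParquetTools.readTable("/path/to/example.parquet", instructions);
        }
    }

As the builder javadoc notes, the table definition is required here because no schema inference is performed when a resolver factory is supplied.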
+import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.MessageType; + +import java.util.List; +import java.util.Optional; + +/** + * A resolver from Deephaven column names to Parquet paths. + */ +public interface ParquetColumnResolver { + + /** + * A factory for creating Parquet column resolvers. This may be useful in situations where the mapping from a + * Deephaven column name to a Parquet column is not derived directly from the Deephaven column name. + * + * @see ParquetInstructions.Builder#setColumnResolverFactory(Factory) + */ + interface Factory { + + /** + * Create a Parquet column resolver. + * + * @param tableKey the table key + * @param tableLocationKey the Parquet TLK + * @return the Parquet column resolver + */ + ParquetColumnResolver of(TableKey tableKey, ParquetTableLocationKey tableLocationKey); + } + + /** + * The path to the leaf field in the Parquet schema corresponding to the Deephaven {@code columnName}. + * + * @param columnName the column name + * @return the path to the leaf field in the Parquet schema + * @see ColumnDescriptor#getPath() + * @see MessageType#getColumnDescription(String[]) + */ + Optional> of(String columnName); +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolverMap.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolverMap.java new file mode 100644 index 00000000000..3c92740cc26 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnResolverMap.java @@ -0,0 +1,58 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.table.location; + +import io.deephaven.annotations.BuildableStyle; +import io.deephaven.util.annotations.InternalUseOnly; +import org.immutables.value.Value; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +/** + * A simple {@link ParquetColumnResolver} implementation from a {@link Map}. + */ +@Value.Immutable +@BuildableStyle +public abstract class ParquetColumnResolverMap implements ParquetColumnResolver { + + public static Builder builder() { + return ImmutableParquetColumnResolverMap.builder(); + } + + abstract Map> mapUnsafe(); + + /** + * A map from Deephaven column name to Parquet path. + */ + public final Map> map() { + return mapUnsafe(); + } + + /** + * {@inheritDoc} + * + *

+ * Equivalent to {@code Optional.ofNullable(map().get(columnName))}. + */ + @Override + public Optional> of(String columnName) { + return Optional.ofNullable(map().get(columnName)); + } + + public interface Builder { + + // Ideally, not part of the public interface. + // See https://github.com/immutables/immutables/issues/1534 + @InternalUseOnly + Builder putMapUnsafe(String key, List value); + + default Builder putMap(String key, List value) { + return putMapUnsafe(key, List.copyOf(value)); + } + + ParquetColumnResolverMap build(); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactory.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactory.java new file mode 100644 index 00000000000..0b03216e3f0 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactory.java @@ -0,0 +1,157 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.table.location; + +import io.deephaven.engine.table.impl.locations.TableKey; +import io.deephaven.parquet.impl.ParquetSchemaUtil; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * This {@link ParquetColumnResolver.Factory} resolves Parquet columns via {@link Type#getId() field ids}. The field ids + * are considered for resolution no matter what level in the schema they exist. For example, the following schema has + * field ids at different levels: + * + *

+ * message root {
+ *   required int32 X = 42;
+ *   required group Y (LIST) = 43 {
+ *     repeated group list {
+ *       required int32 element;
+ *     }
+ *   }
+ *   required group Z (LIST) {
+ *     repeated group list {
+ *       required int32 element = 44;
+ *     }
+ *   }
+ * }
+ * 
+ * + * In this example, {@code 42} would be resolvable to {@code [X]}, {@code 43} would be resolvable to + * {@code [Y, list, element]}, and {@code 44} would be resolvable to {@code [Z, list, element]}. + * + *

+ * If a schema has ambiguous field ids (according to this implementation's definition), the resolution will fail if the + * user requests those field ids. For example: + * + *

+ * message root {
+ *   required int32 X = 42;
+ *   required group Y (LIST) = 43 {
+ *     repeated group list {
+ *       required int32 element;
+ *     }
+ *   }
+ *   required group Z (LIST) {
+ *     repeated group list {
+ *       required int32 element = 42;
+ *     }
+ *   }
+ * }
+ * 
+ * + * In this example, if {@code 42} was requested, resolution would fail because it is ambiguous between paths {@code [X]} + * and {@code [Z, list, element]}. If {@code 43} was requested, resolution would succeed. + */ +public final class ParquetFieldIdColumnResolverFactory implements ParquetColumnResolver.Factory { + + /** + * Creates a field id column resolver factory. + * + * @param columnNameToFieldId a map from Deephaven column names to field ids + * @return the column resolver provider + */ + public static ParquetFieldIdColumnResolverFactory of(Map columnNameToFieldId) { + return new ParquetFieldIdColumnResolverFactory(columnNameToFieldId + .entrySet() + .stream() + .collect(Collectors.groupingBy( + Map.Entry::getValue, + Collectors.mapping(Map.Entry::getKey, Collectors.toSet())))); + } + + private final Map> fieldIdsToDhColumnNames; + + private ParquetFieldIdColumnResolverFactory(Map> fieldIdsToDhColumnNames) { + this.fieldIdsToDhColumnNames = Objects.requireNonNull(fieldIdsToDhColumnNames); + } + + /** + * Resolves the requested field ids for {@code schema}. + * + * @param schema the schema + * @return the resolver map + */ + public ParquetColumnResolverMap of(MessageType schema) { + final FieldIdMappingVisitor visitor = new FieldIdMappingVisitor(); + ParquetSchemaUtil.walk(schema, visitor); + return visitor.toResolver(); + } + + /** + * Equivalent to {@code of(tableLocationKey.getFileReader().getSchema())}. + * + * @param tableKey the table key + * @param tableLocationKey the Parquet TLK + * @return the resolver map + * @see #of(MessageType) + */ + @Override + public ParquetColumnResolverMap of(TableKey tableKey, ParquetTableLocationKey tableLocationKey) { + return of(tableLocationKey.getSchema()); + } + + private class FieldIdMappingVisitor implements ParquetSchemaUtil.Visitor { + private final Map> nameToPath = new HashMap<>(); + + public ParquetColumnResolverMap toResolver() { + ParquetColumnResolverMap.Builder builder = ParquetColumnResolverMap.builder(); + for (Map.Entry> e : nameToPath.entrySet()) { + builder.putMap(e.getKey(), e.getValue()); + } + return builder.build(); + } + + @Override + public void accept(Collection typePath, PrimitiveType primitiveType) { + // There are different resolution strategies that could all be reasonable. We could consider using only the + // field id closest to the leaf. This version, however, takes the most general approach and considers field + // ids wherever they appear; ultimately, only being resolvable if the field id mapping is unambiguous. + List path = null; + for (Type type : typePath) { + final Type.ID id = type.getId(); + if (id == null) { + continue; + } + final int fieldId = id.intValue(); + final Set set = fieldIdsToDhColumnNames.get(fieldId); + if (set == null) { + continue; + } + if (path == null) { + path = typePath.stream().map(Type::getName).collect(Collectors.toUnmodifiableList()); + } + for (String columnName : set) { + final List existing = nameToPath.putIfAbsent(columnName, path); + if (existing != null && !existing.equals(path)) { + throw new IllegalArgumentException(String.format( + "Parquet columns can't be unambigously mapped. 
%s -> %d has multiple paths [%s], [%s]", + columnName, fieldId, String.join(", ", existing), + String.join(", ", path))); + } + } + } + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java index 7c8f47636e5..3b11018f7bb 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java @@ -25,6 +25,7 @@ import io.deephaven.parquet.base.ColumnChunkReader; import io.deephaven.parquet.base.ParquetFileReader; import io.deephaven.parquet.base.RowGroupReader; +import io.deephaven.parquet.impl.ParquetSchemaUtil; import io.deephaven.parquet.table.ParquetInstructions; import io.deephaven.parquet.table.ParquetSchemaReader; import io.deephaven.parquet.table.ParquetTools; @@ -34,7 +35,6 @@ import io.deephaven.parquet.table.metadata.SortColumnInfo; import io.deephaven.parquet.table.metadata.TableInfo; import io.deephaven.util.channel.SeekableChannelsProvider; -import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.format.RowGroup; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.jetbrains.annotations.NotNull; @@ -60,6 +60,8 @@ public class ParquetTableLocation extends AbstractTableLocation { private final ParquetFileReader parquetFileReader; private final int[] rowGroupIndices; + private final ParquetColumnResolver resolver; + private final RowGroup[] rowGroups; private final RegionedPageStore.Parameters regionParameters; private final Map parquetColumnNameToPath; @@ -88,7 +90,9 @@ public ParquetTableLocation(@NotNull final TableKey tableKey, parquetMetadata = tableLocationKey.getMetadata(); rowGroupIndices = tableLocationKey.getRowGroupIndices(); } - + resolver = readInstructions.getColumnResolverFactory() + .map(factory -> factory.of(tableKey, tableLocationKey)) + .orElse(null); final int rowGroupCount = rowGroupIndices.length; rowGroups = IntStream.of(rowGroupIndices) .mapToObj(rgi -> parquetFileReader.fileMetaData.getRow_groups().get(rgi)) @@ -99,8 +103,7 @@ public ParquetTableLocation(@NotNull final TableKey tableKey, RegionedColumnSource.ROW_KEY_TO_SUB_REGION_ROW_INDEX_MASK, rowGroupCount, maxRowCount); parquetColumnNameToPath = new HashMap<>(); - for (final ColumnDescriptor column : parquetFileReader.getSchema().getColumns()) { - final String[] path = column.getPath(); + for (String[] path : ParquetSchemaUtil.paths(parquetFileReader.getSchema())) { if (path.length > 1) { parquetColumnNameToPath.put(path[0], path); } @@ -182,16 +185,27 @@ public List getSortedColumns() { @NotNull protected ColumnLocation makeColumnLocation(@NotNull final String columnName) { final String parquetColumnName = readInstructions.getParquetColumnNameFromColumnNameOrDefault(columnName); - final String[] columnPath = parquetColumnNameToPath.get(parquetColumnName); - final List nameList = - columnPath == null ? 
Collections.singletonList(parquetColumnName) : Arrays.asList(columnPath); + final List columnPath = getColumnPath(columnName, parquetColumnName); final ColumnChunkReader[] columnChunkReaders = Arrays.stream(getRowGroupReaders()) - .map(rgr -> rgr.getColumnChunk(columnName, nameList)).toArray(ColumnChunkReader[]::new); + .map(rgr -> rgr.getColumnChunk(columnName, columnPath)) + .toArray(ColumnChunkReader[]::new); final boolean exists = Arrays.stream(columnChunkReaders).anyMatch(ccr -> ccr != null && ccr.numRows() > 0); return new ParquetColumnLocation<>(this, columnName, parquetColumnName, exists ? columnChunkReaders : null); } + private List getColumnPath(@NotNull String columnName, String parquetColumnNameOrDefault) { + if (resolver != null) { + // empty list will result in exists=false + return resolver.of(columnName).orElse(List.of()); + } + final String[] columnPath = parquetColumnNameToPath.get(parquetColumnNameOrDefault); + // noinspection Java9CollectionFactory + return columnPath == null + ? Collections.singletonList(parquetColumnNameOrDefault) + : Collections.unmodifiableList(Arrays.asList(columnPath)); + } + private RowSet computeIndex() { final RowSetBuilderSequential sequentialBuilder = RowSetFactory.builderSequential(); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java index e56054fb859..1f6c07acf78 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java @@ -11,7 +11,9 @@ import io.deephaven.util.channel.SeekableChannelsProviderLoader; import org.apache.commons.io.FilenameUtils; import org.apache.parquet.format.RowGroup; +import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -136,8 +138,8 @@ public synchronized void setFileReader(final ParquetFileReader fileReader) { } /** - * Get a previously-{@link #setMetadata(ParquetMetadata) set} or on-demand created {@link ParquetMetadata} for this - * location key's {@code file}. + * Get a previously-{@link #setMetadata(ParquetMetadata) set} or the {@link ParquetMetadata} for this location key's + * {@code file}. * * @return A {@link ParquetMetadata} for this location key's {@code file}. */ @@ -159,6 +161,28 @@ public synchronized void setMetadata(final ParquetMetadata metadata) { this.metadata = metadata; } + /** + * Equivalent to {@code getMetadata().getFileMetaData()}. + * + * @return the file metadata + * @see #getMetadata() + * @see ParquetMetadata#getFileMetaData() + */ + public FileMetaData getFileMetadata() { + return getMetadata().getFileMetaData(); + } + + /** + * Equivalent to {@code getFileMetadata().getSchema()}. + * + * @return the file metadata + * @see #getFileMetadata() + * @see FileMetaData#getSchema() + */ + public MessageType getSchema() { + return getFileMetadata().getSchema(); + } + /** * Get previously-{@link #setRowGroupIndices(int[]) set} or on-demand created {@link RowGroup} indices for this * location key's current {@link ParquetFileReader}. 
diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetInstructionsTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetInstructionsTest.java index c1a6a4864b8..9188563c368 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetInstructionsTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetInstructionsTest.java @@ -3,6 +3,11 @@ // package io.deephaven.parquet.table; +import io.deephaven.engine.table.ColumnDefinition; +import io.deephaven.engine.table.TableDefinition; +import io.deephaven.engine.table.impl.locations.TableKey; +import io.deephaven.parquet.table.location.ParquetColumnResolver; +import io.deephaven.parquet.table.location.ParquetTableLocationKey; import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; @@ -10,6 +15,27 @@ public class ParquetInstructionsTest { + @Test + public void empty() { + assertThat(ParquetInstructions.EMPTY.getSpecialInstructions()).isNull(); + assertThat(ParquetInstructions.EMPTY.getCompressionCodecName()) + .isEqualTo(ParquetInstructions.DEFAULT_COMPRESSION_CODEC_NAME); + assertThat(ParquetInstructions.EMPTY.getMaximumDictionaryKeys()) + .isEqualTo(ParquetInstructions.DEFAULT_MAXIMUM_DICTIONARY_KEYS); + assertThat(ParquetInstructions.EMPTY.getMaximumDictionarySize()) + .isEqualTo(ParquetInstructions.DEFAULT_MAXIMUM_DICTIONARY_SIZE); + assertThat(ParquetInstructions.EMPTY.isLegacyParquet()).isFalse(); + assertThat(ParquetInstructions.EMPTY.getTargetPageSize()) + .isEqualTo(ParquetInstructions.DEFAULT_TARGET_PAGE_SIZE); + assertThat(ParquetInstructions.EMPTY.isRefreshing()).isFalse(); + assertThat(ParquetInstructions.EMPTY.generateMetadataFiles()).isFalse(); + assertThat(ParquetInstructions.EMPTY.getFileLayout()).isEmpty(); + assertThat(ParquetInstructions.EMPTY.getTableDefinition()).isEmpty(); + assertThat(ParquetInstructions.EMPTY.getIndexColumns()).isEmpty(); + assertThat(ParquetInstructions.EMPTY.getColumnResolverFactory()).isEmpty(); + assertThat(ParquetInstructions.EMPTY.baseNameForPartitionedParquetData()).isEqualTo("{uuid}"); + } + @Test public void setFieldId() { final ParquetInstructions instructions = ParquetInstructions.builder() @@ -119,4 +145,34 @@ public void addColumnNameMappingBadName() { assertThat(e).hasMessageContaining("Invalid column name"); } } + + @Test + public void columnResolver() { + final ParquetInstructions instructions = ParquetInstructions.builder() + .setTableDefinition(TableDefinition.of(ColumnDefinition.ofInt("Foo"))) + .setColumnResolverFactory(ColumnResolverTestImpl.INSTANCE) + .build(); + assertThat(instructions.getColumnResolverFactory()).hasValue(ColumnResolverTestImpl.INSTANCE); + } + + @Test + public void columnResolverNoTableDefinition() { + try { + ParquetInstructions.builder() + .setColumnResolverFactory(ColumnResolverTestImpl.INSTANCE) + .build(); + failBecauseExceptionWasNotThrown(IllegalArgumentException.class); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("When setting columnResolver, tableDefinition must be provided"); + } + } + + private enum ColumnResolverTestImpl implements ParquetColumnResolver.Factory { + INSTANCE; + + @Override + public ParquetColumnResolver of(TableKey tableKey, ParquetTableLocationKey tableLocationKey) { + throw new UnsupportedOperationException(); + } + } } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java 
b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 530be1c5d6b..fc6000f4d3b 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -71,14 +71,14 @@ import org.apache.commons.lang3.mutable.MutableDouble; import org.apache.commons.lang3.mutable.MutableFloat; import org.apache.commons.lang3.mutable.MutableObject; -import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; import org.junit.*; import org.junit.experimental.categories.Category; @@ -131,6 +131,12 @@ import static io.deephaven.parquet.table.ParquetTools.writeTable; import static io.deephaven.parquet.table.ParquetTools.writeTables; import static io.deephaven.util.QueryConstants.*; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Types.optional; import static org.junit.Assert.*; @Category(OutOfBandTest.class) @@ -1969,15 +1975,12 @@ public void decimalLogicalTypeTest() { final ParquetMetadata metadata = new ParquetTableLocationKey(new File(path).toURI(), 0, null, ParquetInstructions.EMPTY) .getMetadata(); - final List columnsMetadata = metadata.getFileMetaData().getSchema().getColumns(); - assertEquals("DECIMAL(7,5)", - columnsMetadata.get(0).getPrimitiveType().getLogicalTypeAnnotation().toString()); - assertEquals(PrimitiveType.PrimitiveTypeName.INT32, - columnsMetadata.get(0).getPrimitiveType().getPrimitiveTypeName()); - assertEquals("DECIMAL(12,8)", - columnsMetadata.get(1).getPrimitiveType().getLogicalTypeAnnotation().toString()); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, - columnsMetadata.get(1).getPrimitiveType().getPrimitiveTypeName()); + final MessageType expectedSchema = Types.buildMessage() + .addFields( + optional(INT32).as(decimalType(5, 7)).named("DecimalIntCol"), + optional(INT64).as(decimalType(8, 12)).named("DecimalLongCol")) + .named("root"); + assertEquals(expectedSchema, metadata.getFileMetaData().getSchema()); assertTableEquals(expected, fromDisk); } @@ -1989,15 +1992,12 @@ public void decimalLogicalTypeTest() { final ParquetMetadata metadata = new ParquetTableLocationKey(new File(path).toURI(), 0, null, ParquetInstructions.EMPTY) .getMetadata(); - final List columnsMetadata = metadata.getFileMetaData().getSchema().getColumns(); - assertEquals("DECIMAL(7,5)", - columnsMetadata.get(0).getPrimitiveType().getLogicalTypeAnnotation().toString()); - assertEquals(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, - columnsMetadata.get(0).getPrimitiveType().getPrimitiveTypeName()); - assertEquals("DECIMAL(12,8)", - 
columnsMetadata.get(1).getPrimitiveType().getLogicalTypeAnnotation().toString()); - assertEquals(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, - columnsMetadata.get(1).getPrimitiveType().getPrimitiveTypeName()); + final MessageType expectedSchema = Types.buildMessage() + .addFields( + optional(FIXED_LEN_BYTE_ARRAY).length(4).as(decimalType(5, 7)).named("DecimalIntCol"), + optional(FIXED_LEN_BYTE_ARRAY).length(6).as(decimalType(8, 12)).named("DecimalLongCol")) + .named("schema"); // note: this file must have been written down with a different schema name + assertEquals(expectedSchema, metadata.getFileMetaData().getSchema()); assertTableEquals(expected, fromDisk); } } @@ -2578,10 +2578,14 @@ public void testReadUintParquetData() { final ParquetMetadata metadata = new ParquetTableLocationKey(new File(path).toURI(), 0, null, ParquetInstructions.EMPTY).getMetadata(); - final List columnsMetadata = metadata.getFileMetaData().getSchema().getColumns(); - assertTrue(columnsMetadata.get(0).toString().contains("int32 uint8Col (INTEGER(8,false))")); - assertTrue(columnsMetadata.get(1).toString().contains("int32 uint16Col (INTEGER(16,false))")); - assertTrue(columnsMetadata.get(2).toString().contains("int32 uint32Col (INTEGER(32,false))")); + + final MessageType expectedSchema = Types.buildMessage() + .addFields( + optional(INT32).as(intType(8, false)).named("uint8Col"), + optional(INT32).as(intType(16, false)).named("uint16Col"), + optional(INT32).as(intType(32, false)).named("uint32Col")) + .named("schema"); + assertEquals(expectedSchema, metadata.getFileMetaData().getSchema()); final Table expected = newTable( charCol("uint8Col", (char) 255, (char) 2, (char) 0, NULL_CHAR), @@ -3704,25 +3708,25 @@ public void readWriteDateTimeTest() { new ParquetTableLocationKey(dest.toURI(), 0, null, ParquetInstructions.EMPTY).getMetadata(); final ColumnChunkMetaData dateColMetadata = metadata.getBlocks().get(0).getColumns().get(0); assertTrue(dateColMetadata.toString().contains("someDateColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT32, dateColMetadata.getPrimitiveType().getPrimitiveTypeName()); + assertEquals(INT32, dateColMetadata.getPrimitiveType().getPrimitiveTypeName()); assertEquals(LogicalTypeAnnotation.dateType(), dateColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); final ColumnChunkMetaData timeColMetadata = metadata.getBlocks().get(0).getColumns().get(1); assertTrue(timeColMetadata.toString().contains("someTimeColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, timeColMetadata.getPrimitiveType().getPrimitiveTypeName()); + assertEquals(INT64, timeColMetadata.getPrimitiveType().getPrimitiveTypeName()); assertEquals(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.NANOS), timeColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); final ColumnChunkMetaData localDateTimeColMetadata = metadata.getBlocks().get(0).getColumns().get(2); assertTrue(localDateTimeColMetadata.toString().contains("someLocalDateTimeColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, + assertEquals(INT64, localDateTimeColMetadata.getPrimitiveType().getPrimitiveTypeName()); assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS), localDateTimeColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); final ColumnChunkMetaData instantColMetadata = metadata.getBlocks().get(0).getColumns().get(3); assertTrue(instantColMetadata.toString().contains("someInstantColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, 
+ assertEquals(INT64, instantColMetadata.getPrimitiveType().getPrimitiveTypeName()); assertEquals(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS), instantColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java index ddb1894d25a..71632046f39 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/TestParquetTools.java @@ -20,6 +20,8 @@ import io.deephaven.engine.util.TableTools; import io.deephaven.parquet.base.InvalidParquetFileException; import io.deephaven.parquet.table.layout.ParquetKeyValuePartitionedLayout; +import io.deephaven.parquet.table.location.ParquetColumnResolver; +import io.deephaven.parquet.table.location.ParquetFieldIdColumnResolverFactory; import io.deephaven.parquet.table.location.ParquetTableLocationKey; import io.deephaven.qst.type.Type; import io.deephaven.stringset.HashStringSet; @@ -52,6 +54,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.function.Function; import java.util.stream.IntStream; import java.util.stream.LongStream; @@ -59,6 +62,8 @@ import static io.deephaven.engine.testutil.TstUtils.assertTableEquals; import static io.deephaven.engine.testutil.TstUtils.tableRangesAreEqual; import static io.deephaven.engine.util.TableTools.*; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.failBecauseExceptionWasNotThrown; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -646,25 +651,21 @@ public void testWriteParquetFieldIds() throws NoSuchAlgorithmException, IOExcept // to know whenever serialization changes in any way. assertEquals("2ea68b0ddaeb432e9c2721f15460b6c42449a479c1960e836f6ebe3b14f33dc1", sha256sum(file.toPath())); - // TODO(deephaven-core#6128): Allow Parquet column access by field_id // This test is a bit circular; but assuming we trust our reading code, we should have relative confidence that // we are writing it down correctly if we can read it correctly. 
- // { - // final ParquetInstructions readInstructions = ParquetInstructions.builder() - // .setFieldId(BAZ, BAZ_ID) - // .setFieldId(ZAP, ZAP_ID) - // .build(); - // { - // final Table actual = ParquetTools.readTable(file.getPath(), readInstructions); - // assertEquals(td, actual.getDefinition()); - // assertTableEquals(expected, actual); - // } - // { - // final Table actual = ParquetTools.readTable(file.getPath(), readInstructions.withTableDefinition(td)); - // assertEquals(td, actual.getDefinition()); - // assertTableEquals(expected, actual); - // } - // } + { + final ParquetInstructions readInstructions = ParquetInstructions.builder() + .setTableDefinition(td) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of( + BAZ, BAZ_ID, + ZAP, ZAP_ID))) + .build(); + { + final Table actual = ParquetTools.readTable(file.getPath(), readInstructions); + assertEquals(td, actual.getDefinition()); + assertTableEquals(expected, actual); + } + } } /** @@ -689,7 +690,6 @@ public void testWriteParquetFieldIds() throws NoSuchAlgorithmException, IOExcept * * @see Arrow Parquet field_id */ - @Ignore("TODO(deephaven-core#6128): Allow Parquet column access by field_id") @Test public void testParquetFieldIds() { final String file = TestParquetTools.class.getResource("/ReferenceSimpleParquetFieldIds.parquet").getFile(); @@ -714,22 +714,12 @@ public void testParquetFieldIds() { final ColumnDefinition bazCol = ColumnDefinition.ofLong(BAZ); final ColumnDefinition zapCol = ColumnDefinition.ofString(ZAP); + final TableDefinition td = TableDefinition.of(bazCol, zapCol); final ParquetInstructions instructions = ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) - .setFieldId(ZAP, ZAP_ID) + .setTableDefinition(td) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of(BAZ, BAZ_ID, ZAP, ZAP_ID))) .build(); - final TableDefinition td = TableDefinition.of(bazCol, zapCol); - - // It's enough to just provide the mapping based on field_id - { - final Table table = ParquetTools.readTable(file, instructions); - assertEquals(td, table.getDefinition()); - assertTableEquals(newTable(td, - longCol(BAZ, 99, 101), - stringCol(ZAP, "Foo", "Bar")), table); - } - // But, the user can still provide a TableDefinition { final Table table = ParquetTools.readTable(file, instructions.withTableDefinition(td)); @@ -754,18 +744,21 @@ public void testParquetFieldIds() { ColumnDefinition.ofLong(BAZ), ColumnDefinition.ofString("column_53f0de5ae06f476eb82aa3f9294fcd05")); final ParquetInstructions partialInstructions = ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) + .setTableDefinition(partialTD) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of(BAZ, BAZ_ID))) .build(); final Table table = ParquetTools.readTable(file, partialInstructions); assertEquals(partialTD, table.getDefinition()); } - // There are no errors if a field ID is configured but not found; it won't be inferred. 
+ // There are no errors if a field ID is configured but not found { final Table table = ParquetTools.readTable(file, ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) - .setFieldId(ZAP, ZAP_ID) - .setFieldId("Fake", 99) + .setTableDefinition(td) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of( + BAZ, BAZ_ID, + ZAP, ZAP_ID, + "Fake", 99))) .build()); assertEquals(td, table.getDefinition()); assertTableEquals(newTable(td, @@ -778,11 +771,12 @@ public void testParquetFieldIds() { final TableDefinition tdWithFake = TableDefinition.of(bazCol, zapCol, ColumnDefinition.ofShort("Fake")); final Table table = ParquetTools.readTable(file, ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) - .setFieldId(ZAP, ZAP_ID) - .setFieldId("Fake", 99) - .build() - .withTableDefinition(tdWithFake)); + .setTableDefinition(tdWithFake) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of( + BAZ, BAZ_ID, + ZAP, ZAP_ID, + "Fake", 99))) + .build()); assertEquals(tdWithFake, table.getDefinition()); assertTableEquals(newTable(tdWithFake, longCol(BAZ, 99, 101), @@ -796,57 +790,33 @@ public void testParquetFieldIds() { final TableDefinition dupeTd = TableDefinition.of(bazCol, zapCol, ColumnDefinition.ofLong(BAZ_DUPE)); final ParquetInstructions dupeInstructions = ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) - .setFieldId(ZAP, ZAP_ID) - .setFieldId(BAZ_DUPE, BAZ_ID) + .setTableDefinition(dupeTd) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of( + BAZ, BAZ_ID, + ZAP, ZAP_ID, + BAZ_DUPE, BAZ_ID))) .build(); { - final Table table = ParquetTools.readTable(file, dupeInstructions.withTableDefinition(dupeTd)); + final Table table = ParquetTools.readTable(file, dupeInstructions); assertEquals(dupeTd, table.getDefinition()); assertTableEquals(newTable(dupeTd, longCol(BAZ, 99, 101), stringCol(ZAP, "Foo", "Bar"), longCol(BAZ_DUPE, 99, 101)), table); } - - // In the case where we have dupe field IDs and don't provide an explicit definition, we are preferring to - // fail during the inference step - { - try { - ParquetTools.readTable(file, dupeInstructions); - Assertions.failBecauseExceptionWasNotThrown(IllegalArgumentException.class); - } catch (IllegalArgumentException e) { - Assertions.assertThat(e).hasMessageContaining("Non-unique Field ID mapping provided"); - } - } } - // If both a field id and parquet column name mapping is provided, they need to map to the same parquet column. 
+ // If both a field id and parquet column name mapping is provided, column resolution wins { final TableDefinition bazTd = TableDefinition.of(bazCol); final ParquetInstructions inconsistent = ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) + .setTableDefinition(bazTd) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of(BAZ, BAZ_ID))) .addColumnNameMapping("53f0de5a-e06f-476e-b82a-a3f9294fcd05", BAZ) .build(); - // In the case where we are inferring the TableDefinition from parquet schema, the inconsistency will be - // noticed up front - try { - ParquetTools.readTable(file, inconsistent); - Assertions.failBecauseExceptionWasNotThrown(IllegalArgumentException.class); - } catch (IllegalArgumentException e) { - Assertions.assertThat(e) - .hasMessageContaining("Supplied ColumnDefinitions include duplicate names [Baz]"); - } - // In the case where we provide a TableDefinition, the inconsistency will be noticed when reading the - // data - try { - // Need to force read of data - ParquetTools.readTable(file, inconsistent.withTableDefinition(bazTd)).select(); - Assertions.failBecauseExceptionWasNotThrown(TableDataException.class); - } catch (TableDataException e) { - Assertions.assertThat(e).getRootCause().hasMessageContaining( - "For columnName=Baz, providing an explicit parquet column name path ([53f0de5a-e06f-476e-b82a-a3f9294fcd05]) and field id (0) mapping, but they are resolving to different columns, byFieldId=[colIx=0, pathKey=[e0cf7927-45dc-4dfc-b4ef-36bf4b6ae463], fieldId=0], byPath=[colIx=1, pathKey=[53f0de5a-e06f-476e-b82a-a3f9294fcd05], fieldId=1]"); - } + final Table table = ParquetTools.readTable(file, inconsistent); + assertEquals(bazTd, table.getDefinition()); + assertTableEquals(newTable(bazTd, longCol(BAZ, 99, 101)), table); } } @@ -877,7 +847,6 @@ public void testParquetFieldIds() { * * @see Arrow Parquet field_id */ - @Ignore("TODO(deephaven-core#6128): Allow Parquet column access by field_id") @Test public void testPartitionedParquetFieldIds() { final String file = TestParquetTools.class.getResource("/ReferencePartitionedFieldIds").getFile(); @@ -890,29 +859,22 @@ public void testPartitionedParquetFieldIds() { final ColumnDefinition partitionColumn = ColumnDefinition.ofInt(PARTITION).withPartitioning(); final ColumnDefinition bazCol = ColumnDefinition.ofLong(BAZ); final ColumnDefinition zapCol = ColumnDefinition.ofString(ZAP); + final TableDefinition expectedTd = TableDefinition.of(partitionColumn, bazCol, zapCol); final ParquetInstructions instructions = ParquetInstructions.builder() - .setFieldId(BAZ, BAZ_ID) - .setFieldId(ZAP, ZAP_ID) + .setTableDefinition(expectedTd) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of( + BAZ, BAZ_ID, + ZAP, ZAP_ID))) .build(); - - final TableDefinition expectedTd = TableDefinition.of(partitionColumn, bazCol, zapCol); - final Table expected = newTable(expectedTd, intCol(PARTITION, 0, 0, 1, 1), longCol(BAZ, 99, 101, 99, 101), stringCol(ZAP, "Foo", "Bar", "Foo", "Bar")); - { final Table actual = ParquetTools.readTable(file, instructions); assertEquals(expectedTd, actual.getDefinition()); assertTableEquals(expected, actual); } - - { - final Table actual = ParquetTools.readTable(file, instructions.withTableDefinition(expectedTd)); - assertEquals(expectedTd, actual.getDefinition()); - assertTableEquals(expected, actual); - } } /** @@ -933,14 +895,14 @@ public void testPartitionedParquetFieldIds() { * * @see Arrow Parquet field_id */ - @Ignore("TODO(deephaven-core#6128): Allow Parquet column 
access by field_id") @Test public void testParquetFieldIdsWithListType() { final String file = TestParquetTools.class.getResource("/ReferenceListParquetFieldIds.parquet").getFile(); final String FOO = "Foo"; final TableDefinition td = TableDefinition.of(ColumnDefinition.of(FOO, Type.intType().arrayType())); final ParquetInstructions instructions = ParquetInstructions.builder() - .setFieldId(FOO, 999) + .setTableDefinition(td) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of(FOO, 999))) .build(); final Table expected = TableTools.newTable(td, new ColumnHolder<>(FOO, int[].class, int.class, false, new int[] {1, 2, 3}, @@ -952,11 +914,6 @@ public void testParquetFieldIdsWithListType() { assertEquals(td, actual.getDefinition()); assertTableEquals(expected, actual); } - { - final Table actual = ParquetTools.readTable(file, instructions.withTableDefinition(td)); - assertEquals(td, actual.getDefinition()); - assertTableEquals(expected, actual); - } } /** @@ -965,73 +922,111 @@ public void testParquetFieldIdsWithListType() { * this column as "FirstName". Both standalone, and in combination with the newer file, we should be able to read it * with the latest schema. */ - @Ignore("TODO(deephaven-core#6128): Allow Parquet column access by field_id") @Test - public void testRenamingResolveViaFieldId() { + public void testRenamingColumnResolver() { final File f1 = new File(testRoot, "testRenamingResolveViaFieldId.00.parquet"); final File f2 = new File(testRoot, "testRenamingResolveViaFieldId.01.parquet"); - final int FIRST_NAME_ID = 15; + final String NAME = "Name"; + final TableDefinition td1 = TableDefinition.of(ColumnDefinition.ofString(NAME)); + final int NAME_ID = 15; + final Table t1 = newTable(td1, stringCol(NAME, "Shivam", "Ryan")); { - final String NAME = "Name"; - final Table t1 = newTable(TableDefinition.of(ColumnDefinition.ofString(NAME)), - stringCol(NAME, "Shivam", "Ryan")); ParquetTools.writeTable(t1, f1.getPath(), ParquetInstructions.builder() - .setFieldId(NAME, FIRST_NAME_ID) + .setFieldId(NAME, NAME_ID) .build()); } - final int LAST_NAME_ID = 16; final String FIRST_NAME = "FirstName"; final String LAST_NAME = "LastName"; - final TableDefinition td = TableDefinition.of( + final TableDefinition td2 = TableDefinition.of( ColumnDefinition.ofString(FIRST_NAME), ColumnDefinition.ofString(LAST_NAME)); - final ParquetInstructions instructions = ParquetInstructions.builder() - .setFieldId(FIRST_NAME, FIRST_NAME_ID) - .setFieldId(LAST_NAME, LAST_NAME_ID) - .build(); + // noinspection UnnecessaryLocalVariable + final int FIRST_NAME_ID = NAME_ID; + final int LAST_NAME_ID = 16; + final Table t2 = newTable(td2, + stringCol(FIRST_NAME, "Pete", "Colin"), + stringCol(LAST_NAME, "Goddard", "Alworth")); { - final Table t = newTable(td, - stringCol(FIRST_NAME, "Pete", "Colin"), - stringCol(LAST_NAME, "Goddard", "Alworth")); - ParquetTools.writeTable(t, f2.getPath(), instructions); + ParquetTools.writeTable(t2, f2.getPath(), ParquetInstructions.builder() + .setFieldId(FIRST_NAME, FIRST_NAME_ID) + .setFieldId(LAST_NAME, LAST_NAME_ID) + .build()); } - // If we read first file without an explicit definition, we should only get the column from the file + final ParquetColumnResolver.Factory resolver = ParquetFieldIdColumnResolverFactory.of(Map.of( + NAME, NAME_ID, + FIRST_NAME, FIRST_NAME_ID, + LAST_NAME, LAST_NAME_ID)); + + // f1 + td1 { - final TableDefinition expectedTd = TableDefinition.of(ColumnDefinition.ofString(FIRST_NAME)); - final Table expected = newTable(expectedTd, 
stringCol(FIRST_NAME, "Shivam", "Ryan")); - final Table actual = ParquetTools.readTable(f1.getPath(), instructions); - assertEquals(expectedTd, actual.getDefinition()); - assertTableEquals(expected, actual); + final Table actual = ParquetTools.readTable(f1.getPath(), ParquetInstructions.builder() + .setTableDefinition(td1) + .setColumnResolverFactory(resolver) + .build()); + assertEquals(td1, actual.getDefinition()); + assertTableEquals(t1, actual); } - // If we read first file with an explicit definition, the new column should return nulls + // f1 + td2 { - final Table expected = newTable(td, + final Table expected = newTable(td2, stringCol(FIRST_NAME, "Shivam", "Ryan"), stringCol(LAST_NAME, null, null)); - final Table actual = ParquetTools.readTable(f1.getPath(), instructions.withTableDefinition(td)); - assertEquals(td, actual.getDefinition()); + final Table actual = ParquetTools.readTable(f1.getPath(), ParquetInstructions.builder() + .setTableDefinition(td2) + .setColumnResolverFactory(resolver) + .build()); + assertEquals(td2, actual.getDefinition()); assertTableEquals(expected, actual); } - // We should be able to read both (flat partitioning) with the latest schema + // f2 + td1 { - final Table expected = newTable(td, + final Table expected = newTable(td1, stringCol(NAME, "Pete", "Colin")); + final Table actual = ParquetTools.readTable(f2.getPath(), ParquetInstructions.builder() + .setTableDefinition(td1) + .setColumnResolverFactory(resolver) + .build()); + assertEquals(td1, actual.getDefinition()); + assertTableEquals(expected, actual); + } + + // f2 + td2 + { + final Table actual = ParquetTools.readTable(f2.getPath(), ParquetInstructions.builder() + .setTableDefinition(td2) + .setColumnResolverFactory(resolver) + .build()); + assertEquals(td2, actual.getDefinition()); + assertTableEquals(t2, actual); + } + + // (f1, f2) + td1 + { + final Table expected = newTable(td1, + stringCol(NAME, "Shivam", "Ryan", "Pete", "Colin")); + final Table actual = ParquetTools.readTable(testRoot, ParquetInstructions.builder() + .setTableDefinition(td1) + .setColumnResolverFactory(resolver) + .build()); + assertEquals(td1, actual.getDefinition()); + assertTableEquals(expected, actual); + } + + // (f1, f2) + td2 + { + final Table expected = newTable(td2, stringCol(FIRST_NAME, "Shivam", "Ryan", "Pete", "Colin"), stringCol(LAST_NAME, null, null, "Goddard", "Alworth")); - { - final Table actual = ParquetTools.readTable(testRoot, instructions); - assertEquals(td, actual.getDefinition()); - assertTableEquals(expected, actual); - } - { - final Table actual = ParquetTools.readTable(testRoot, instructions.withTableDefinition(td)); - assertEquals(td, actual.getDefinition()); - assertTableEquals(expected, actual); - } + final Table actual = ParquetTools.readTable(testRoot, ParquetInstructions.builder() + .setTableDefinition(td2) + .setColumnResolverFactory(resolver) + .build()); + assertEquals(td2, actual.getDefinition()); + assertTableEquals(expected, actual); } } @@ -1055,28 +1050,53 @@ public void parquetWithNonUniqueFieldIds() { } { - final String BAZ = "Baz"; - final ParquetInstructions bazInstructions = ParquetInstructions.builder() - .setFieldId(BAZ, fieldId) - .build(); + final MessageType expectedSchema = Types.buildMessage() + .optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(32)) + .id(fieldId) + .named(FOO) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .id(fieldId) + .named(BAR) + .named(MappedSchema.SCHEMA_NAME); + 
final MessageType actualSchema = readSchema(f); + assertEquals(expectedSchema, actualSchema); + } - // fieldId _won't_ be used to actually create a Baz column since the underlying file has multiple. In this - // case, we just infer the physical parquet column names. - { + // normal + no definition + { + final Table actual = ParquetTools.readTable(f.getPath()); + assertEquals(td, actual.getDefinition()); + assertTableEquals(expected, actual); + } - final Table actual = ParquetTools.readTable(f.getPath(), bazInstructions); - assertEquals(td, actual.getDefinition()); - assertTableEquals(expected, actual); - } + // normal + definition + { + final Table actual = ParquetTools.readTable(f.getPath(), ParquetInstructions.builder() + .setTableDefinition(td) + .build()); + assertEquals(td, actual.getDefinition()); + assertTableEquals(expected, actual); + } - // If the user explicitly asks for a definition with a mapping to a non-unique field id, they will get back - // the column of default (null) values. - { - final TableDefinition bazTd = TableDefinition.of(ColumnDefinition.ofInt(BAZ)); - final Table bazTable = newTable(bazTd, intCol(BAZ, QueryConstants.NULL_INT, QueryConstants.NULL_INT)); - final Table actual = ParquetTools.readTable(f.getPath(), bazInstructions.withTableDefinition(bazTd)); - assertEquals(bazTd, actual.getDefinition()); - assertTableEquals(bazTable, actual); + // ParquetColumnResolverFieldIds will not work given the duplicate field IDs in the file + { + final Table table = ParquetTools.readTable(f.getPath(), ParquetInstructions.builder() + .setTableDefinition(td) + .setColumnResolverFactory(ParquetFieldIdColumnResolverFactory.of(Map.of( + FOO, fieldId, + BAR, fieldId))) + .build()); + + // Only noticed when we build ParquetTableLocation; if necessary, we could refactor the implementation to + // notice this earlier on. + try { + table.select(); + failBecauseExceptionWasNotThrown(IllegalStateException.class); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining( + "Parquet columns can't be unambigously mapped. 
Bar -> 31337 has multiple paths [Foo], [Bar]"); } } } @@ -1124,7 +1144,7 @@ public void nestedMessageEmpty() { ParquetTools.readTable(file); Assertions.failBecauseExceptionWasNotThrown(UnsupportedOperationException.class); } catch (UnsupportedOperationException e) { - Assertions.assertThat(e) + assertThat(e) .hasMessageContaining("Encountered unsupported multi-column field MyStruct"); } } @@ -1173,11 +1193,90 @@ public void nestedMessage1Row() { ParquetTools.readTable(file); Assertions.failBecauseExceptionWasNotThrown(UnsupportedOperationException.class); } catch (UnsupportedOperationException e) { - Assertions.assertThat(e) + assertThat(e) .hasMessageContaining("Encountered unsupported multi-column field MyStruct"); } } + @Test + public void intArray() { + final File f = new File(testRoot, "intArray.parquet"); + final String FOO = "Foo"; + final TableDefinition td = TableDefinition.of(ColumnDefinition.of(FOO, Type.intType().arrayType())); + final Table table = TableTools.newTable(td, new ColumnHolder<>(FOO, int[].class, int.class, false, + new int[] {1, 2, 3}, + null, + new int[0], + new int[] {42})); + { + ParquetTools.writeTable(table, f.getPath()); + } + + { + final MessageType expectedSchema = Types.buildMessage() + .optionalList() + .optionalElement(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(32)) + .named(FOO) + .named(MappedSchema.SCHEMA_NAME); + assertEquals(readSchema(f), expectedSchema); + } + + { + final Table actual = ParquetTools.readTable(f.getPath()); + assertEquals(td, actual.getDefinition()); + assertTableEquals(table, actual); + } + + { + final Table actual = ParquetTools.readTable(f.getPath(), ParquetInstructions.builder() + .setTableDefinition(td) + .build()); + assertEquals(td, actual.getDefinition()); + assertTableEquals(table, actual); + } + } + + @Test + public void stringArray() { + final File f = new File(testRoot, "stringArray.parquet"); + final String FOO = "Foo"; + final TableDefinition td = TableDefinition.of(ColumnDefinition.of(FOO, Type.stringType().arrayType())); + final Table expected = TableTools.newTable(td, new ColumnHolder<>(FOO, String[].class, String.class, false, + new String[] {null, "", "Hello, world!"}, + null, + new String[0], + new String[] {"42"})); + { + ParquetTools.writeTable(expected, f.getPath()); + } + + { + final MessageType expectedSchema = Types.buildMessage() + .optionalList() + .optionalElement(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named(FOO) + .named(MappedSchema.SCHEMA_NAME); + assertEquals(readSchema(f), expectedSchema); + } + + { + final Table actual = ParquetTools.readTable(f.getPath()); + assertEquals(td, actual.getDefinition()); + assertTableEquals(expected, actual); + } + + { + final ParquetInstructions instructions = ParquetInstructions.builder() + .setTableDefinition(td) + .build(); + final Table actual = ParquetTools.readTable(f.getPath(), instructions); + assertEquals(td, actual.getDefinition()); + assertTableEquals(expected, actual); + } + } + private static String sha256sum(Path path) throws NoSuchAlgorithmException, IOException { final MessageDigest digest = MessageDigest.getInstance("SHA-256"); final DigestOutputStream out = new DigestOutputStream(OutputStream.nullOutputStream(), digest); diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactoryTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactoryTest.java 
new file mode 100644 index 00000000000..483d77a75d5 --- /dev/null +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/location/ParquetFieldIdColumnResolverFactoryTest.java @@ -0,0 +1,192 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.parquet.table.location; + +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +import java.util.List; +import java.util.Map; + +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.failBecauseExceptionWasNotThrown; + +public class ParquetFieldIdColumnResolverFactoryTest { + + // foo (42) + private static final PrimitiveType FOO_42 = Types.required(INT32) + .id(42) + .named("foo"); + + // bar (43), list, element + private static final GroupType BAR_43 = Types.requiredList() + .id(43) + .requiredElement(INT32) + .named("bar"); + + // baz, list, element (44) + private static final GroupType BAZ_44 = Types.requiredList() + .requiredElement(INT32) + .id(44) + .named("baz"); + + private static final ParquetFieldIdColumnResolverFactory FACTORY = ParquetFieldIdColumnResolverFactory.of(Map.of( + "DeepFoo", 42, + "DeepBar", 43, + "DeepBaz", 44)); + + private static String[] p(String... path) { + return path; + } + + @Test + public void messageFields() { + final MessageType schema = Types.buildMessage() + .addFields(FOO_42, BAR_43, BAZ_44) + .named("root"); + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("DeepFoo", List.of("foo")) + .putMap("DeepBar", List.of("bar", "list", "element")) + .putMap("DeepBaz", List.of("baz", "list", "element")) + .build(); + assertThat(FACTORY.of(schema)).isEqualTo(expected); + } + + @Test + public void messageGroupFields() { + final MessageType schema = Types.buildMessage() + .addFields(Types.repeatedGroup() + .addFields(FOO_42, BAR_43, BAZ_44) + .named("my_group")) + .named("root"); + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("DeepFoo", List.of("my_group", "foo")) + .putMap("DeepBar", List.of("my_group", "bar", "list", "element")) + .putMap("DeepBaz", List.of("my_group", "baz", "list", "element")) + .build(); + assertThat(FACTORY.of(schema)).isEqualTo(expected); + } + + @Test + public void messageListElements() { + final MessageType schema = Types.buildMessage() + .addFields( + Types.requiredList().element(FOO_42).named("my_list1"), + Types.requiredList().element(BAR_43).named("my_list2"), + Types.requiredList().element(BAZ_44).named("my_list3")) + .named("root"); + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("DeepFoo", List.of("my_list1", "list", "foo")) + .putMap("DeepBar", List.of("my_list2", "list", "bar", "list", "element")) + .putMap("DeepBaz", List.of("my_list3", "list", "baz", "list", "element")) + .build(); + assertThat(FACTORY.of(schema)).isEqualTo(expected); + } + + @Test + public void singleFieldMultipleIdsUnambiguous() { + final ParquetFieldIdColumnResolverFactory factory = ParquetFieldIdColumnResolverFactory.of(Map.of( + "Col1", 1, + "Col2", 2)); + + // BothCols (1), list, element (2) + final MessageType schema = Types.buildMessage().addFields(Types.requiredList() + .id(1) + .requiredElement(INT32) + .id(2) + .named("BothCols")) + 
.named("root"); + + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("Col1", List.of("BothCols", "list", "element")) + .putMap("Col2", List.of("BothCols", "list", "element")) + .build(); + + assertThat(factory.of(schema)).isEqualTo(expected); + } + + @Test + public void singleFieldRepeatedIds() { + final ParquetFieldIdColumnResolverFactory factory = ParquetFieldIdColumnResolverFactory.of(Map.of("Col1", 42)); + // X (42), list, element (42) + final MessageType schema = Types.buildMessage().addFields(Types.requiredList() + .id(42) + .requiredElement(INT32) + .id(42) + .named("X")) + .named("root"); + + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("Col1", List.of("X", "list", "element")) + .build(); + + // This resolution strategy is a little bit questionable... but it is unambiguous. If instead we needed to also + // provide the user with a single resulting Type with said field id, this strategy would not work. + assertThat(factory.of(schema)).isEqualTo(expected); + } + + @Test + public void ambiguousFields() { + // X (1) + // Y (1) + // Z (2) + final MessageType schema = Types.buildMessage() + .addFields( + Types.required(INT32).id(1).named("X"), + Types.required(INT32).id(1).named("Y"), + Types.required(INT32).id(2).named("Z")) + .named("root"); + try { + ParquetFieldIdColumnResolverFactory.of(Map.of("Col1", 1)).of(schema); + failBecauseExceptionWasNotThrown(IllegalArgumentException.class); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("Col1 -> 1 has multiple paths [X], [Y]"); + } + // Does not fail if ambiguous id is not referenced + { + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("ColZ", List.of("Z")) + .build(); + final ParquetColumnResolverMap actual = + ParquetFieldIdColumnResolverFactory.of(Map.of("ColZ", 2)).of(schema); + assertThat(actual).isEqualTo(expected); + } + } + + @Test + public void ambiguousFieldsNested() { + // X (1), list, element (2) + // Y (2), list, element (1) + // Z (3) + final MessageType schema = Types.buildMessage() + .addFields( + Types.requiredList().id(1).requiredElement(INT32).id(2).named("X"), + Types.requiredList().id(2).requiredElement(INT32).id(1).named("Y"), + Types.required(INT32).id(3).named("Z")) + .named("root"); + System.out.println(schema); + // Note: a different implementation _could_ take a different course of action here and proceed without error; + // for example, an implementation could choose to consider the innermost (or outermost) field id for matching + // purposes. + try { + ParquetFieldIdColumnResolverFactory.of(Map.of("Col1", 1)).of(schema); + failBecauseExceptionWasNotThrown(IllegalArgumentException.class); + } catch (IllegalArgumentException e) { + assertThat(e).hasMessageContaining("Col1 -> 1 has multiple paths [X, list, element], [Y, list, element]"); + } + // Does not fail if ambiguous id is not referenced + { + final ParquetColumnResolverMap expected = ParquetColumnResolverMap.builder() + .putMap("ColZ", List.of("Z")) + .build(); + final ParquetColumnResolverMap actual = + ParquetFieldIdColumnResolverFactory.of(Map.of("ColZ", 3)).of(schema); + assertThat(actual).isEqualTo(expected); + } + } +} From 5f62c5bc36d1cad63e3c9ddf33bedc83cb6bf377 Mon Sep 17 00:00:00 2001 From: "Charles P. Wright" Date: Thu, 23 Jan 2025 20:21:32 -0500 Subject: [PATCH 3/3] perf: DH-18300: Improve DataIndex performance. 
(#6585) DataIndex, particularly when used for where() filters had missing parallelization opportunities; and would read more data than strictly necessary to satisfy the filter. --------- Co-authored-by: Ryan Caudy --- .../java/io/deephaven/base/AtomicUtil.java | 41 +++ .../base/stats/ThreadSafeCounter.java | 30 ++ .../deephaven/base/stats/ThreadSafeValue.java | 37 ++ .../java/io/deephaven/base/stats/Value.java | 18 +- .../io/deephaven/base/stats/TestValue.java | 39 +++ CONTRIBUTING.md | 5 + .../engine/table/BasicDataIndex.java | 35 +- .../io/deephaven/engine/table/DataIndex.java | 16 +- .../engine/table/DataIndexOptions.java | 74 ++++ .../benchmark/engine/GroupByBenchmark.java | 14 - .../benchmark/engine/LastByBenchmark.java | 5 - .../table/impl/AbstractColumnSource.java | 201 ++++++++--- .../engine/table/impl/JoinControl.java | 4 + .../engine/table/impl/QueryTable.java | 40 ++- .../impl/dataindex/RemappedDataIndex.java | 9 +- .../impl/dataindex/StandaloneDataIndex.java | 3 +- .../impl/dataindex/TableBackedDataIndex.java | 7 +- .../impl/dataindex/TransformedDataIndex.java | 8 +- .../select/analyzers/SelectColumnLayer.java | 5 +- .../sources/regioned/MergedDataIndex.java | 328 +++++++++++++++--- .../regioned/PartitioningColumnDataIndex.java | 5 +- .../TestRegionedColumnSourceManager.java | 17 +- 22 files changed, 775 insertions(+), 166 deletions(-) create mode 100644 Base/src/main/java/io/deephaven/base/stats/ThreadSafeCounter.java create mode 100644 Base/src/main/java/io/deephaven/base/stats/ThreadSafeValue.java create mode 100644 engine/api/src/main/java/io/deephaven/engine/table/DataIndexOptions.java diff --git a/Base/src/main/java/io/deephaven/base/AtomicUtil.java b/Base/src/main/java/io/deephaven/base/AtomicUtil.java index 43aa96ac37b..866d11f5ee6 100644 --- a/Base/src/main/java/io/deephaven/base/AtomicUtil.java +++ b/Base/src/main/java/io/deephaven/base/AtomicUtil.java @@ -5,6 +5,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicLongFieldUpdater; public abstract class AtomicUtil { @@ -267,4 +268,44 @@ public static int atomicAndNot(AtomicInteger i, int mask) { } while (!i.compareAndSet(expect, update)); return update; } + + /** + * Sets the field to the minimum of the current value and the passed in value + * + * @param o the object to update + * @param fu the field updater + * @param value the value that is a candidate for the minumum + * @return true if the minimum was set + * @param the type of o + */ + public static boolean setMin(final T o, final AtomicLongFieldUpdater fu, final long value) { + long current = fu.get(o); + while (current > value) { + if (fu.compareAndSet(o, current, value)) { + return true; + } + current = fu.get(o); + } + return false; + } + + /** + * Sets the field to the maximum of the current value and the passed in value + * + * @param o the object to update + * @param fu the field updater + * @param value the value that is a candidate for the maximum + * @return true if the maximum was set + * @param the type of o + */ + public static boolean setMax(final T o, final AtomicLongFieldUpdater fu, final long value) { + long current = fu.get(o); + while (value > current) { + if (fu.compareAndSet(o, current, value)) { + return true; + } + current = fu.get(o); + } + return false; + } } diff --git a/Base/src/main/java/io/deephaven/base/stats/ThreadSafeCounter.java b/Base/src/main/java/io/deephaven/base/stats/ThreadSafeCounter.java new file mode 100644 index 
00000000000..d66756187d3 --- /dev/null +++ b/Base/src/main/java/io/deephaven/base/stats/ThreadSafeCounter.java @@ -0,0 +1,30 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.base.stats; + +import java.util.function.LongFunction; + +// -------------------------------------------------------------------- + +/** + * A statistic where each value represents an additive quantity, and thus the sum of the values does have + * meaning. Examples include event counts and processing duration. If the sum of the values does not have a + * useful interpretation, use {@link State} instead. + *
+ * <ul>
+ * <li>{@link #increment} updates the counter, recording a single value. This is the most common usage. ({@link #sample}
+ * does exactly the same thing but is a poor verb to use with a Counter.)
+ * </ul>
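+ *
+ * <p>
+ * A minimal usage sketch (the construction timestamp and the sampled value are illustrative):
+ *
+ * <pre>
+ * Value counter = ThreadSafeCounter.FACTORY.apply(System.currentTimeMillis());
+ * counter.sample(42); // safe to call concurrently from multiple threads
+ * </pre>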
+ */ +public class ThreadSafeCounter extends ThreadSafeValue { + + public ThreadSafeCounter(final long now) { + super(now); + } + + public char getTypeTag() { + return Counter.TYPE_TAG; + } + + public static final LongFunction FACTORY = ThreadSafeCounter::new; +} diff --git a/Base/src/main/java/io/deephaven/base/stats/ThreadSafeValue.java b/Base/src/main/java/io/deephaven/base/stats/ThreadSafeValue.java new file mode 100644 index 00000000000..b0cee5a2d14 --- /dev/null +++ b/Base/src/main/java/io/deephaven/base/stats/ThreadSafeValue.java @@ -0,0 +1,37 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.base.stats; + +import io.deephaven.base.AtomicUtil; + +import java.util.concurrent.atomic.AtomicLongFieldUpdater; + +/** + * A thread-safe extension of the {@link Value} class. + * + *
+ * <p>
+ * The {@link #sample(long)} method is synchronized, so may introduce contention compared to the unsafe Value version of
+ * sample.
+ *
+ */ +public abstract class ThreadSafeValue extends Value { + + public ThreadSafeValue(long now) { + super(now); + } + + protected ThreadSafeValue(History history) { + super(history); + } + + @Override + public synchronized void sample(final long x) { + super.sample(x); + } + + @Override + public synchronized String toString() { + return super.toString(); + } +} diff --git a/Base/src/main/java/io/deephaven/base/stats/Value.java b/Base/src/main/java/io/deephaven/base/stats/Value.java index cb8c098b9b4..dd260221f1d 100644 --- a/Base/src/main/java/io/deephaven/base/stats/Value.java +++ b/Base/src/main/java/io/deephaven/base/stats/Value.java @@ -4,7 +4,10 @@ package io.deephaven.base.stats; public abstract class Value { - + /** + * The sample(long) method is not thread safe; and you can get wrong answers out of it. If you require safety, you + * should instead use a ThreadSafeValue. + */ protected long n = 0; protected long last = 0; protected long sum = 0; @@ -51,7 +54,7 @@ protected Value(History history) { this.history = history; } - public void sample(long x) { + public void sample(final long x) { n++; last = x; sum += x; @@ -99,4 +102,15 @@ public void update(Item item, ItemUpdateListener listener, long logInterval, lon } } } + + @Override + public String toString() { + if (n > 0) { + final double std = Math.sqrt(n > 1 ? (sum2 - ((double) sum * sum / (double) n)) / (n - 1) : Double.NaN); + final double avg = (double) sum / n; + return String.format("Value{n=%,d, sum=%,d, max=%,d, min=%,d, avg=%,.3f, std=%,.3f}", n, sum, max, min, avg, + std); + } + return String.format("Value{n=%,d}", n); + } } diff --git a/Base/src/test/java/io/deephaven/base/stats/TestValue.java b/Base/src/test/java/io/deephaven/base/stats/TestValue.java index 2b841c7ae3e..a1ab7eff36d 100644 --- a/Base/src/test/java/io/deephaven/base/stats/TestValue.java +++ b/Base/src/test/java/io/deephaven/base/stats/TestValue.java @@ -5,6 +5,7 @@ import junit.framework.TestCase; +import java.util.concurrent.Semaphore; import java.util.function.LongFunction; // -------------------------------------------------------------------- @@ -49,6 +50,44 @@ public void testCounter() { checkValue(Counter.FACTORY); } + public void testToString() { + // this is purposefully a heisentest, this should make it fail if it is really broken + for (int ii = 0; ii < 10; ++ii) { + doTestToString(); + } + } + + public void doTestToString() { + // we are testing toString, but also creating a pile of threads to exercise some of the AtomicFieldUpdater + // behavior of the value + final Value counter = ThreadSafeCounter.FACTORY.apply(0L); + + final String emptyString = counter.toString(); + assertEquals("Value{n=0}", emptyString); + + final Semaphore semaphore = new Semaphore(0); + final Semaphore completion = new Semaphore(0); + final int n = 100; + for (int ii = 0; ii < n; ++ii) { + final int fii = ii; + new Thread(() -> { + semaphore.acquireUninterruptibly(); + counter.sample(fii); + completion.release(); + }).start(); + } + semaphore.release(100); + completion.acquireUninterruptibly(100); + + assertEquals(n, counter.getN()); + assertEquals(((n - 1) * n) / 2, counter.getSum()); + assertEquals(0, counter.getMin()); + assertEquals(99, counter.getMax()); + + final String asString = counter.toString(); + assertEquals("Value{n=100, sum=4,950, max=99, min=0, avg=49.500, std=29.011}", asString); + } + // ---------------------------------------------------------------- private void checkValue(LongFunction factory) { Value value = factory.apply(1000L); diff --git 
a/CONTRIBUTING.md b/CONTRIBUTING.md index 39aea6d0072..e82274c805a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -54,6 +54,11 @@ For more information, see: * [gh pr create](https://cli.github.com/manual/gh_pr_create) * [CLI In Use](https://cli.github.com/manual/examples.html) +## Labels + +- Each pull request must have a `ReleaseNotesNeeded` label if a user can perceive the changes. Build or testing-only changes should have a `NoReleaseNotesNeeded` label. +- Each pull request must have a `DocumentationNeeded` label if the user guide should be updated. Pull requests that do not require a user guide update should have a `NoDocumentationNeeded` label. + ## Styleguide The [styleguide](style/README.md) is applied globally to the entire project, except for generated code that gets checked in. To apply the styleguide, run `./gradlew spotlessApply`. diff --git a/engine/api/src/main/java/io/deephaven/engine/table/BasicDataIndex.java b/engine/api/src/main/java/io/deephaven/engine/table/BasicDataIndex.java index 13ff41aa000..f13141073b1 100644 --- a/engine/api/src/main/java/io/deephaven/engine/table/BasicDataIndex.java +++ b/engine/api/src/main/java/io/deephaven/engine/table/BasicDataIndex.java @@ -96,16 +96,46 @@ default ColumnSource[] keyColumns(@NotNull final ColumnSource[] indexedCol @FinalDefault @NotNull default ColumnSource rowSetColumn() { - return table().getColumnSource(rowSetColumnName(), RowSet.class); + return rowSetColumn(DataIndexOptions.DEFAULT); + } + + /** + * Get the {@link RowSet} {@link ColumnSource} of the index {@link #table() table}. + * + * @param options parameters for controlling how the the table will be built (if necessary) in order to retrieve the + * result {@link RowSet} {@link ColumnSource} + * + * @return The {@link RowSet} {@link ColumnSource} + */ + @FinalDefault + @NotNull + default ColumnSource rowSetColumn(final DataIndexOptions options) { + return table(options).getColumnSource(rowSetColumnName(), RowSet.class); } /** * Get the {@link Table} backing this data index. + * + *

<p> + * The returned table is fully in-memory, equivalent to {@link #table(DataIndexOptions)} with default options. + * </p>
* * @return The {@link Table} */ @NotNull - Table table(); + default Table table() { + return table(DataIndexOptions.DEFAULT); + } + + /** + * Get the {@link Table} backing this data index. + * + * @param options parameters to control the returned table + * + * @return The {@link Table} + */ + @NotNull + Table table(DataIndexOptions options); /** * Whether the index {@link #table()} {@link Table#isRefreshing() is refreshing}. Some transformations will force @@ -115,4 +145,5 @@ default ColumnSource rowSetColumn() { * otherwise */ boolean isRefreshing(); + } diff --git a/engine/api/src/main/java/io/deephaven/engine/table/DataIndex.java b/engine/api/src/main/java/io/deephaven/engine/table/DataIndex.java index b257bdbc8d0..f248afb014c 100644 --- a/engine/api/src/main/java/io/deephaven/engine/table/DataIndex.java +++ b/engine/api/src/main/java/io/deephaven/engine/table/DataIndex.java @@ -48,7 +48,21 @@ interface RowKeyLookup { * @return A function that provides map-like lookup of index {@link #table()} row keys from an index lookup key */ @NotNull - RowKeyLookup rowKeyLookup(); + default RowKeyLookup rowKeyLookup() { + return rowKeyLookup(DataIndexOptions.DEFAULT); + } + + /** + * Build a {@link RowKeyLookup lookup function} of row keys for this index. If {@link #isRefreshing()} is + * {@code true}, this lookup function is only guaranteed to be accurate for the current cycle. Lookup keys should be + * in the order of the index's key columns. + * + * @param options parameters for building the table, if required by this RowKeyLookup + * + * @return A function that provides map-like lookup of index {@link #table()} row keys from an index lookup key + */ + @NotNull + RowKeyLookup rowKeyLookup(DataIndexOptions options); /** * Return a {@link RowKeyLookup lookup function} function of index row keys for this index. If diff --git a/engine/api/src/main/java/io/deephaven/engine/table/DataIndexOptions.java b/engine/api/src/main/java/io/deephaven/engine/table/DataIndexOptions.java new file mode 100644 index 00000000000..2498d36966d --- /dev/null +++ b/engine/api/src/main/java/io/deephaven/engine/table/DataIndexOptions.java @@ -0,0 +1,74 @@ +// +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending +// +package io.deephaven.engine.table; + +import io.deephaven.annotations.BuildableStyle; +import io.deephaven.api.filter.Filter; +import org.immutables.value.Value; + +/** + * Options for controlling the function of a {@link DataIndex}. + * + */ +@Value.Immutable +@BuildableStyle +public interface DataIndexOptions { + /** + * Static default options, which expect that operations will use the full table. + */ + DataIndexOptions DEFAULT = DataIndexOptions.builder().build(); + + /** + * Static options for operations that use a partial table instead of the full table. + */ + DataIndexOptions USING_PARTIAL_TABLE = DataIndexOptions.builder().operationUsesPartialTable(true).build(); + + /** + * Does this operation use only a subset of the DataIndex? + * + *

<p> + * The DataIndex implementation may use this hint to defer work for some row sets. + * </p> + * + * <p> + * Presently, this is used for the {@link Table#where(Filter)} operation to hint that work for computing + * {@link io.deephaven.engine.rowset.RowSet RowSets} for non-matching keys should be deferred. + * </p>
+ * + * @return if this operation is only going to use a subset of this data index + */ + @Value.Default + default boolean operationUsesPartialTable() { + return false; + } + + /** + * Create a new builder for a {@link DataIndexOptions}. + * + * @return + */ + static Builder builder() { + return ImmutableDataIndexOptions.builder(); + } + + /** + * The builder interface to construct a {@link DataIndexOptions}. + */ + interface Builder { + /** + * Set whether this operation only uses a subset of the data index. + * + * @param usesPartialTable true if this operation only uses a partial table + * @return this builder + */ + Builder operationUsesPartialTable(boolean usesPartialTable); + + /** + * Build the {@link DataIndexOptions}. + * + * @return an immutable DataIndexOptions structure. + */ + DataIndexOptions build(); + } +} diff --git a/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/GroupByBenchmark.java b/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/GroupByBenchmark.java index 50af9c1fff5..c544aeb41ce 100644 --- a/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/GroupByBenchmark.java +++ b/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/GroupByBenchmark.java @@ -38,9 +38,6 @@ public class GroupByBenchmark { @Param({"Historical"}) private String tableType; - @Param({"operator"}) - private String mode; - @Param({"String", "Int", "Composite"}) private String keyType; @@ -122,17 +119,6 @@ public void setupEnv(BenchmarkParams params) { state = new TableBenchmarkState(BenchmarkTools.stripName(params.getBenchmark()), params.getWarmup().getCount()); table = bmt.getTable().coalesce().dropColumns("PartCol"); - - switch (mode) { - case "chunked": - QueryTable.USE_OLDER_CHUNKED_BY = true; - break; - case "operator": - QueryTable.USE_OLDER_CHUNKED_BY = false; - break; - default: - throw new IllegalArgumentException("Unknown mode " + mode); - } } @TearDown(Level.Trial) diff --git a/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/LastByBenchmark.java b/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/LastByBenchmark.java index c3dcc331c8e..c0ee0ada285 100644 --- a/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/LastByBenchmark.java +++ b/engine/benchmark/src/benchmark/java/io/deephaven/benchmark/engine/LastByBenchmark.java @@ -59,9 +59,6 @@ public class LastByBenchmark { @Param({"1"}) private int valueCount; - @Param({"true"}) - private boolean tracked; - private Table table; private String keyName; @@ -171,8 +168,6 @@ public void setupEnv(BenchmarkParams params) { state = new TableBenchmarkState(BenchmarkTools.stripName(params.getBenchmark()), params.getWarmup().getCount()); table = bmt.getTable().coalesce().dropColumns("PartCol"); - - QueryTable.TRACKED_FIRST_BY = QueryTable.TRACKED_LAST_BY = tracked; } @TearDown(Level.Trial) diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/AbstractColumnSource.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/AbstractColumnSource.java index 65d434d6598..2de486cbd02 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/AbstractColumnSource.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/AbstractColumnSource.java @@ -3,6 +3,9 @@ // package io.deephaven.engine.table.impl; +import io.deephaven.base.stats.Stats; +import io.deephaven.base.stats.ThreadSafeCounter; +import io.deephaven.base.stats.Value; import io.deephaven.base.string.cache.CharSequenceUtils; import 
io.deephaven.base.verify.Assert; import io.deephaven.chunk.Chunk; @@ -10,12 +13,14 @@ import io.deephaven.chunk.ObjectChunk; import io.deephaven.chunk.WritableChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.configuration.Configuration; import io.deephaven.engine.context.ExecutionContext; import io.deephaven.engine.primitive.iterator.CloseableIterator; import io.deephaven.engine.rowset.*; import io.deephaven.engine.rowset.chunkattributes.OrderedRowKeys; import io.deephaven.engine.table.ColumnSource; import io.deephaven.engine.table.DataIndex; +import io.deephaven.engine.table.DataIndexOptions; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.impl.chunkfillers.ChunkFiller; import io.deephaven.engine.table.impl.chunkfilter.ChunkFilter; @@ -34,12 +39,48 @@ import java.time.Instant; import java.time.ZonedDateTime; +import java.util.Arrays; import java.util.Collections; public abstract class AbstractColumnSource implements ColumnSource, DefaultChunkSource.WithPrev { + /** + * For a {@link #match(boolean, boolean, boolean, DataIndex, RowSet, Object...)} call that uses a DataIndex, by + * default we do not force the entire DataIndex to be loaded into memory. This is because many + * {@link io.deephaven.engine.table.impl.select.MatchFilter}s are highly selective and only need to instantiate a + * single RowSet value rather than the complete DataIndex for the entire table. When the Configuration property + * "AbstractColumnSource.usePartialDataIndex" is set to false, the query engine materializes the entire DataIndex + * table for the match call. + */ + public static boolean USE_PARTIAL_TABLE_DATA_INDEX = Configuration.getInstance() + .getBooleanWithDefault("AbstractColumnSource.usePartialDataIndex", true); + /** + * After generating a DataIndex table and identifying which row keys are responsive to the filter, the result RowSet + * can be built in serial or in parallel. By default, the index is built in parallel which may take advantage of + * using more threads for I/O of the index data structure. Parallel builds do require more setup and thread + * synchronization, so they can be disabled by setting the Configuration property + * "AbstractColumnSource.useParallelIndexBuild" to false. + */ + public static boolean USE_PARALLEL_ROWSET_BUILD = Configuration.getInstance() + .getBooleanWithDefault("AbstractColumnSource.useParallelRowSetBuild", true); + + /** + * Duration of match() calls using a DataIndex (also provides the count). + */ + private static final Value INDEX_FILTER_MILLIS = + Stats.makeItem("AbstractColumnSource", "indexFilterMillis", ThreadSafeCounter.FACTORY, + "Duration of match() with a DataIndex in millis") + .getValue(); + /** + * Duration of match() calls using a chunk filter (i.e. no DataIndex). + */ + private static final Value CHUNK_FILTER_MILLIS = + Stats.makeItem("AbstractColumnSource", "chunkFilterMillis", ThreadSafeCounter.FACTORY, + "Duration of match() without a DataIndex in millis") + .getValue(); + /** * Minimum average run length in an {@link RowSequence} that should trigger {@link Chunk}-filling by key ranges * instead of individual keys. @@ -115,82 +156,134 @@ public WritableRowSet match( final boolean usePrev, final boolean caseInsensitive, @Nullable final DataIndex dataIndex, - @NotNull final RowSet mapper, + @NotNull final RowSet rowsetToFilter, final Object... 
keys) { + if (dataIndex == null) { + return doChunkFilter(invertMatch, usePrev, caseInsensitive, rowsetToFilter, keys); + } + final long t0 = System.nanoTime(); + try { + return doDataIndexFilter(invertMatch, usePrev, caseInsensitive, dataIndex, rowsetToFilter, keys); + } finally { + final long t1 = System.nanoTime(); + INDEX_FILTER_MILLIS.sample((t1 - t0) / 1_000_000); + } + } - if (dataIndex != null) { - final Table indexTable = dataIndex.table(); - final RowSet matchingIndexRows; - if (caseInsensitive && type == String.class) { - // Linear scan through the index table, accumulating index row keys for case-insensitive matches - final RowSetBuilderSequential matchingIndexRowsBuilder = RowSetFactory.builderSequential(); - - // noinspection rawtypes - final KeyedObjectHashSet keySet = new KeyedObjectHashSet<>(new CIStringKey()); - // noinspection unchecked - Collections.addAll(keySet, keys); - - final RowSet indexRowSet = usePrev ? indexTable.getRowSet().prev() : indexTable.getRowSet(); - final ColumnSource indexKeySource = - indexTable.getColumnSource(dataIndex.keyColumnNames().get(0), String.class); - - final int chunkSize = (int) Math.min(CHUNK_SIZE, indexRowSet.size()); - try (final RowSequence.Iterator indexRowSetIterator = indexRowSet.getRowSequenceIterator(); - final GetContext indexKeyGetContext = indexKeySource.makeGetContext(chunkSize)) { - while (indexRowSetIterator.hasMore()) { - final RowSequence chunkIndexRows = indexRowSetIterator.getNextRowSequenceWithLength(chunkSize); - final ObjectChunk chunkKeys = (usePrev - ? indexKeySource.getPrevChunk(indexKeyGetContext, chunkIndexRows) - : indexKeySource.getChunk(indexKeyGetContext, chunkIndexRows)).asObjectChunk(); - final LongChunk chunkRowKeys = chunkIndexRows.asRowKeyChunk(); - final int thisChunkSize = chunkKeys.size(); - for (int ii = 0; ii < thisChunkSize; ++ii) { - final String key = chunkKeys.get(ii); - if (keySet.containsKey(key)) { - matchingIndexRowsBuilder.appendKey(chunkRowKeys.get(ii)); - } + private WritableRowSet doDataIndexFilter(final boolean invertMatch, + final boolean usePrev, + final boolean caseInsensitive, + @NotNull final DataIndex dataIndex, + @NotNull final RowSet rowsetToFilter, + final Object[] keys) { + final DataIndexOptions partialOption = + USE_PARTIAL_TABLE_DATA_INDEX ? DataIndexOptions.USING_PARTIAL_TABLE : DataIndexOptions.DEFAULT; + + final Table indexTable = dataIndex.table(partialOption); + + final RowSet matchingIndexRows; + if (caseInsensitive && type == String.class) { + // Linear scan through the index table, accumulating index row keys for case-insensitive matches + final RowSetBuilderSequential matchingIndexRowsBuilder = RowSetFactory.builderSequential(); + + // noinspection rawtypes + final KeyedObjectHashSet keySet = new KeyedObjectHashSet<>(new CIStringKey()); + // noinspection unchecked + Collections.addAll(keySet, keys); + + final RowSet indexRowSet = usePrev ? indexTable.getRowSet().prev() : indexTable.getRowSet(); + final ColumnSource indexKeySource = + indexTable.getColumnSource(dataIndex.keyColumnNames().get(0), String.class); + + final int chunkSize = (int) Math.min(CHUNK_SIZE, indexRowSet.size()); + try (final RowSequence.Iterator indexRowSetIterator = indexRowSet.getRowSequenceIterator(); + final GetContext indexKeyGetContext = indexKeySource.makeGetContext(chunkSize)) { + while (indexRowSetIterator.hasMore()) { + final RowSequence chunkIndexRows = indexRowSetIterator.getNextRowSequenceWithLength(chunkSize); + final ObjectChunk chunkKeys = (usePrev + ? 
indexKeySource.getPrevChunk(indexKeyGetContext, chunkIndexRows) + : indexKeySource.getChunk(indexKeyGetContext, chunkIndexRows)).asObjectChunk(); + final LongChunk chunkRowKeys = chunkIndexRows.asRowKeyChunk(); + final int thisChunkSize = chunkKeys.size(); + for (int ii = 0; ii < thisChunkSize; ++ii) { + final String key = chunkKeys.get(ii); + if (keySet.containsKey(key)) { + matchingIndexRowsBuilder.appendKey(chunkRowKeys.get(ii)); } } } - matchingIndexRows = matchingIndexRowsBuilder.build(); - } else { - // Use the lookup function to get the index row keys for the matching keys - final RowSetBuilderRandom matchingIndexRowsBuilder = RowSetFactory.builderRandom(); - - final DataIndex.RowKeyLookup rowKeyLookup = dataIndex.rowKeyLookup(); - for (Object key : keys) { - final long rowKey = rowKeyLookup.apply(key, usePrev); - if (rowKey != RowSequence.NULL_ROW_KEY) { - matchingIndexRowsBuilder.addKey(rowKey); - } + } + matchingIndexRows = matchingIndexRowsBuilder.build(); + } else { + // Use the lookup function to get the index row keys for the matching keys + final RowSetBuilderRandom matchingIndexRowsBuilder = RowSetFactory.builderRandom(); + + final DataIndex.RowKeyLookup rowKeyLookup = dataIndex.rowKeyLookup(partialOption); + for (final Object key : keys) { + final long rowKey = rowKeyLookup.apply(key, usePrev); + if (rowKey != RowSequence.NULL_ROW_KEY) { + matchingIndexRowsBuilder.addKey(rowKey); } - matchingIndexRows = matchingIndexRowsBuilder.build(); } + matchingIndexRows = matchingIndexRowsBuilder.build(); + } - try (final SafeCloseable ignored = matchingIndexRows) { - final WritableRowSet filtered = invertMatch ? mapper.copy() : RowSetFactory.empty(); - if (matchingIndexRows.isNonempty()) { - final ColumnSource indexRowSetSource = usePrev - ? dataIndex.rowSetColumn().getPrevSource() - : dataIndex.rowSetColumn(); + try (final SafeCloseable ignored = matchingIndexRows) { + final WritableRowSet filtered = invertMatch ? rowsetToFilter.copy() : RowSetFactory.empty(); + if (matchingIndexRows.isNonempty()) { + final ColumnSource indexRowSetSource = usePrev + ? 
dataIndex.rowSetColumn(partialOption).getPrevSource() + : dataIndex.rowSetColumn(partialOption); + + if (USE_PARALLEL_ROWSET_BUILD) { + final long[] rowKeyArray = new long[matchingIndexRows.intSize()]; + matchingIndexRows.toRowKeyArray(rowKeyArray); + Arrays.stream(rowKeyArray).parallel().forEach((final long rowKey) -> { + final RowSet matchingRowSet = indexRowSetSource.get(rowKey); + assert matchingRowSet != null; + if (invertMatch) { + synchronized (filtered) { + filtered.remove(matchingRowSet); + } + } else { + try (final RowSet intersected = matchingRowSet.intersect(rowsetToFilter)) { + synchronized (filtered) { + filtered.insert(intersected); + } + } + } + }); + } else { try (final CloseableIterator matchingIndexRowSetIterator = ChunkedColumnIterator.make(indexRowSetSource, matchingIndexRows)) { matchingIndexRowSetIterator.forEachRemaining((final RowSet matchingRowSet) -> { if (invertMatch) { filtered.remove(matchingRowSet); } else { - try (final RowSet intersected = matchingRowSet.intersect(mapper)) { + try (final RowSet intersected = matchingRowSet.intersect(rowsetToFilter)) { filtered.insert(intersected); } } }); } } - return filtered; } - } else { - return ChunkFilter.applyChunkFilter(mapper, this, usePrev, + return filtered; + } + } + + private WritableRowSet doChunkFilter(final boolean invertMatch, + final boolean usePrev, + final boolean caseInsensitive, + @NotNull final RowSet rowsetToFilter, + final Object[] keys) { + final long t0 = System.nanoTime(); + try { + return ChunkFilter.applyChunkFilter(rowsetToFilter, this, usePrev, ChunkMatchFilterFactory.getChunkFilter(type, caseInsensitive, invertMatch, keys)); + } finally { + final long t1 = System.nanoTime(); + CHUNK_FILTER_MILLIS.sample((t1 - t0) / 1_000_000); } } diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/JoinControl.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/JoinControl.java index 57af02c692f..cebd69e1365 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/JoinControl.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/JoinControl.java @@ -59,6 +59,10 @@ int tableSize(final long expectedEntries) { @Nullable DataIndex dataIndexToUse(Table table, ColumnSource[] sources) { + // Configuration property that serves as an escape hatch + if (!QueryTable.USE_DATA_INDEX_FOR_JOINS) { + return null; + } final DataIndexer indexer = DataIndexer.existingOf(table.getRowSet()); return indexer == null ? null : LivenessScopeStack.computeEnclosed( diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/QueryTable.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/QueryTable.java index a431891f94d..ff642fe80a6 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/QueryTable.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/QueryTable.java @@ -198,12 +198,27 @@ public interface MemoizableOperation ChunkedOperatorAggregationHelper.aggregation( + () -> ChunkedOperatorAggregationHelper.aggregation(aggregationControl, aggregationContextFactory, this, preserveEmpty, initialGroups, groupByColumns)); } } @@ -1237,13 +1244,14 @@ private void initializeAndPrioritizeFilters(@NotNull final WhereFilter... 
filter final int numFilters = filters.length; final BitSet priorityFilterIndexes = new BitSet(numFilters); - final QueryCompilerRequestProcessor.BatchProcessor compilationProcesor = QueryCompilerRequestProcessor.batch(); + final QueryCompilerRequestProcessor.BatchProcessor compilationProcessor = QueryCompilerRequestProcessor.batch(); + // Initialize our filters immediately so we can examine the columns they use. Note that filter // initialization is safe to invoke repeatedly. for (final WhereFilter filter : filters) { - filter.init(getDefinition(), compilationProcesor); + filter.init(getDefinition(), compilationProcessor); } - compilationProcesor.compile(); + compilationProcessor.compile(); for (int fi = 0; fi < numFilters; ++fi) { final WhereFilter filter = filters[fi]; diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/RemappedDataIndex.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/RemappedDataIndex.java index 95311871f96..b7f960eb5f8 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/RemappedDataIndex.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/RemappedDataIndex.java @@ -6,6 +6,7 @@ import io.deephaven.base.verify.Assert; import io.deephaven.engine.table.ColumnSource; import io.deephaven.engine.table.DataIndex; +import io.deephaven.engine.table.DataIndexOptions; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.impl.indexer.DataIndexer; import org.jetbrains.annotations.NotNull; @@ -91,14 +92,14 @@ public List keyColumnNames() { @Override @NotNull - public Table table() { - return sourceIndex.table(); + public Table table(final DataIndexOptions options) { + return sourceIndex.table(options); } @Override @NotNull - public RowKeyLookup rowKeyLookup() { - return sourceIndex.rowKeyLookup(); + public RowKeyLookup rowKeyLookup(final DataIndexOptions options) { + return sourceIndex.rowKeyLookup(options); } @Override diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/StandaloneDataIndex.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/StandaloneDataIndex.java index b496c96171b..cf4eafbf9c7 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/StandaloneDataIndex.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/StandaloneDataIndex.java @@ -6,6 +6,7 @@ import io.deephaven.engine.liveness.LivenessArtifact; import io.deephaven.engine.table.BasicDataIndex; import io.deephaven.engine.table.ColumnSource; +import io.deephaven.engine.table.DataIndexOptions; import io.deephaven.engine.table.Table; import org.jetbrains.annotations.NotNull; @@ -63,7 +64,7 @@ public String rowSetColumnName() { @Override @NotNull - public Table table() { + public Table table(final DataIndexOptions unused) { return table; } diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TableBackedDataIndex.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TableBackedDataIndex.java index 4802143a9a1..d2a9d8659a1 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TableBackedDataIndex.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TableBackedDataIndex.java @@ -9,6 +9,7 @@ import io.deephaven.engine.context.ExecutionContext; import io.deephaven.engine.liveness.LivenessScopeStack; import io.deephaven.engine.table.ColumnSource; +import 
io.deephaven.engine.table.DataIndexOptions; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.impl.QueryTable; import io.deephaven.engine.table.impl.by.*; @@ -83,7 +84,7 @@ public Map, String> keyColumnNamesByIndexedColumn() { @Override @NotNull - public Table table() { + public Table table(final DataIndexOptions unused) { Table localIndexTable; if ((localIndexTable = indexTable) != null) { return localIndexTable; @@ -136,8 +137,8 @@ private Table computeTable() { @Override @NotNull - public RowKeyLookup rowKeyLookup() { - table(); + public RowKeyLookup rowKeyLookup(final DataIndexOptions options) { + table(options); return (final Object key, final boolean usePrev) -> { // Pass the object to the aggregation lookup, then return the resulting row key. This index will be // correct in prev or current space because of the aggregation's hash-based lookup. diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TransformedDataIndex.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TransformedDataIndex.java index 7759adcaa49..c9f923566b4 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TransformedDataIndex.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/dataindex/TransformedDataIndex.java @@ -10,11 +10,7 @@ import io.deephaven.engine.liveness.LivenessScopeStack; import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.rowset.WritableRowSet; -import io.deephaven.engine.table.BasicDataIndex; -import io.deephaven.engine.table.ColumnSource; -import io.deephaven.engine.table.DataIndex; -import io.deephaven.engine.table.DataIndexTransformer; -import io.deephaven.engine.table.Table; +import io.deephaven.engine.table.*; import io.deephaven.engine.table.impl.ForkJoinPoolOperationInitializer; import io.deephaven.engine.table.impl.perf.QueryPerformanceRecorder; import io.deephaven.engine.table.impl.select.FunctionalColumn; @@ -72,7 +68,7 @@ public String rowSetColumnName() { @Override @NotNull - public Table table() { + public Table table(final DataIndexOptions unused) { Table localIndexTable; if ((localIndexTable = indexTable) != null) { return localIndexTable; diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/select/analyzers/SelectColumnLayer.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/select/analyzers/SelectColumnLayer.java index 431d00ee7e7..356bd05f8b7 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/select/analyzers/SelectColumnLayer.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/select/analyzers/SelectColumnLayer.java @@ -191,7 +191,10 @@ public Runnable createUpdateHandler( // If we have shifts, that makes everything nasty; so we do not want to deal with it final boolean hasShifts = upstream.shifted().nonempty(); - final boolean serialTableOperationsSafe = updateGraph.serialTableOperationsSafe() + // liveResultOwner is only null when we are static; in the static case there is no need to + // worry about serial table operation checking + final boolean serialTableOperationsSafe = liveResultOwner == null + || updateGraph.serialTableOperationsSafe() || updateGraph.sharedLock().isHeldByCurrentThread() || updateGraph.exclusiveLock().isHeldByCurrentThread(); diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/MergedDataIndex.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/MergedDataIndex.java index 
881e8a21e88..76d305b8bcc 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/MergedDataIndex.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/MergedDataIndex.java @@ -4,17 +4,20 @@ package io.deephaven.engine.table.impl.sources.regioned; import io.deephaven.UncheckedDeephavenException; +import io.deephaven.api.ColumnName; +import io.deephaven.api.Selectable; +import io.deephaven.base.stats.Stats; +import io.deephaven.base.stats.ThreadSafeCounter; +import io.deephaven.base.stats.Value; import io.deephaven.base.verify.Assert; import io.deephaven.base.verify.Require; +import io.deephaven.configuration.Configuration; import io.deephaven.engine.primitive.iterator.CloseableIterator; import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.rowset.RowSetBuilderSequential; import io.deephaven.engine.rowset.RowSetFactory; -import io.deephaven.engine.table.BasicDataIndex; -import io.deephaven.engine.table.ColumnSource; -import io.deephaven.engine.table.PartitionedTableFactory; -import io.deephaven.engine.table.Table; +import io.deephaven.engine.table.*; import io.deephaven.engine.table.impl.ForkJoinPoolOperationInitializer; import io.deephaven.engine.table.impl.by.AggregationProcessor; import io.deephaven.engine.table.impl.by.AggregationRowLookup; @@ -23,14 +26,16 @@ import io.deephaven.engine.table.impl.locations.TableLocation; import io.deephaven.engine.table.impl.perf.QueryPerformanceRecorder; import io.deephaven.engine.table.impl.select.FunctionalColumn; +import io.deephaven.engine.table.impl.select.MultiSourceFunctionalColumn; import io.deephaven.engine.table.impl.select.SelectColumn; -import io.deephaven.util.SafeCloseable; import io.deephaven.util.annotations.InternalUseOnly; import io.deephaven.vector.ObjectVector; import org.jetbrains.annotations.NotNull; import java.util.*; +import java.util.concurrent.atomic.AtomicReferenceArray; import java.util.stream.IntStream; +import java.util.stream.LongStream; /** * DataIndex that accumulates the individual per-{@link TableLocation} data indexes of a {@link Table} backed by a @@ -45,6 +50,23 @@ */ @InternalUseOnly class MergedDataIndex extends AbstractDataIndex implements DataIndexer.RetainableDataIndex { + /** + * The duration in nanos to build a DataIndex table. + */ + private static final Value BUILD_INDEX_TABLE_MILLIS = Stats + .makeItem("MergedDataIndex", "buildTableMillis", ThreadSafeCounter.FACTORY, + "Duration in millis of building an index") + .getValue(); + + /** + * When merging row sets from multiple component DataIndex structures, reading each individual component can be + * performed in parallel to improve the wall-clock I/O time observed. For cases where the number of individual + * groups to merge is small and I/O bound this can be very beneficial. Setting up the parallelism, however, can add + * additional overhead. Setting the Configuration property "MergedDataIndex.useParallelLazyFetch" to false disables + * this behavior and each key's merged RowSet is computed serially. + */ + public static boolean USE_PARALLEL_LAZY_FETCH = Configuration.getInstance() + .getBooleanWithDefault("MergedDataIndex.useParallelLazyFetch", true); private static final String LOCATION_DATA_INDEX_TABLE_COLUMN_NAME = "__DataIndexTable"; @@ -63,6 +85,17 @@ class MergedDataIndex extends AbstractDataIndex implements DataIndexer.Retainabl */ private volatile Table indexTable; + /** + * A lazy version of the table. 
This value is never set if indexTable is set. + */ + private volatile Table lazyTable; + + /** + * If we have a lazy version of the table, we hold onto the partitioned table, which permits us to select the + * underlying rowsets without needing to reselect the key columns. + */ + private volatile PartitionedTable lazyPartitionedTable; + /** * Whether this index is known to be corrupt. */ @@ -113,19 +146,29 @@ public Map, String> keyColumnNamesByIndexedColumn() { @Override @NotNull - public Table table() { + public Table table(final DataIndexOptions options) { Table localIndexTable; if ((localIndexTable = indexTable) != null) { return localIndexTable; } + final boolean lazyRowSetMerge = options.operationUsesPartialTable(); + if (lazyRowSetMerge) { + if ((localIndexTable = lazyTable) != null) { + return localIndexTable; + } + } synchronized (this) { if ((localIndexTable = indexTable) != null) { return localIndexTable; + } else if (lazyRowSetMerge) { + if ((localIndexTable = lazyTable) != null) { + return localIndexTable; + } } try { return QueryPerformanceRecorder.withNugget( String.format("Merge Data Indexes [%s]", String.join(", ", keyColumnNames)), - ForkJoinPoolOperationInitializer.ensureParallelizable(this::buildTable)); + ForkJoinPoolOperationInitializer.ensureParallelizable(() -> buildTable(lazyRowSetMerge))); } catch (Throwable t) { isCorrupt = true; throw t; @@ -133,78 +176,262 @@ public Table table() { } } - private Table buildTable() { - final Table locationTable = columnSourceManager.locationTable().coalesce(); + /** + * The RowSetCacher is a bit of a hack that allows us to avoid reading actual RowSets from disk until they are + * actually required for a query operation. We are breaking engine rules in that we reference the source + * ColumnSource directly and do not have correct dependencies encoded in a select. MergedDataIndexes are only + * permitted for a static table, so we can get away with this. + * + *

<p> + * Once a RowSet has been written, we write down the result into an AtomicReferenceArray so that it need not be read + * repeatedly. If two threads attempt to concurrently fetch the same RowSet, then we use a placeholder object in the + * array to avoid duplicated work. + * </p>
+ */ + private static class RowSetCacher { + final ColumnSource> source; + final AtomicReferenceArray results; - // Perform a parallelizable update to produce coalesced location index tables with their row sets shifted by - // the appropriate region offset. - // This potentially loads many small row sets into memory, but it avoids the risk of re-materializing row set - // pages during the accumulation phase. - final String[] keyColumnNamesArray = keyColumnNames.toArray(String[]::new); - final Table locationDataIndexes = locationTable - .update(List.of(SelectColumn.ofStateless(new FunctionalColumn<>( - columnSourceManager.locationColumnName(), TableLocation.class, - LOCATION_DATA_INDEX_TABLE_COLUMN_NAME, Table.class, - (final long locationRowKey, final TableLocation location) -> loadIndexTableAndShiftRowSets( - locationRowKey, location, keyColumnNamesArray))))) - .dropColumns(columnSourceManager.locationColumnName()); + private RowSetCacher(final ColumnSource> source, final int capacity) { + this.source = source; + this.results = new AtomicReferenceArray<>(capacity); + } - // Merge all the location index tables into a single table - final Table mergedDataIndexes = PartitionedTableFactory.of(locationDataIndexes).merge(); + RowSet get(final long rowKey) { + if (rowKey < 0 || rowKey >= results.length()) { + return null; + } - // Group the merged data indexes by the keys - final Table groupedByKeyColumns = mergedDataIndexes.groupBy(keyColumnNamesArray); + final int iRowKey = (int) rowKey; + do { + final Object localResult = results.get(iRowKey); + if (localResult instanceof RowSet) { + return (RowSet) localResult; + } + if (localResult instanceof Exception) { + throw new UncheckedDeephavenException("Exception found for cached RowSet", + (Exception) localResult); + } - // Combine the row sets from each group into a single row set - final Table combined = groupedByKeyColumns - .update(List.of(SelectColumn.ofStateless(new FunctionalColumn<>( + if (localResult != null) { + // noinspection EmptySynchronizedStatement + synchronized (localResult) { + // don't care to do anything, we are just waiting for the barrier to be done + } + continue; + } + + // we need to create our own placeholder, and synchronize on it first + final Object placeholder = new Object(); + // noinspection SynchronizationOnLocalVariableOrMethodParameter + synchronized (placeholder) { + if (!results.compareAndSet(iRowKey, null, placeholder)) { + // we must try again, someone else has claimed the placeholder + continue; + } + // We won the race, so it is our responsibility to get the right answer + + final ObjectVector inputRowSets = source.get(rowKey); + Assert.neqNull(inputRowSets, "inputRowSets"); + final RowSet computedResult; + try { + if (USE_PARALLEL_LAZY_FETCH) { + // noinspection DataFlowIssue + computedResult = mergeRowSetsParallel(rowKey, inputRowSets); + } else { + // noinspection DataFlowIssue + computedResult = mergeRowSetsSerial(rowKey, inputRowSets); + } + } catch (Exception e) { + results.set(iRowKey, e); + throw e; + } + results.set(iRowKey, computedResult); + return computedResult; + } + } while (true); + } + } + + private Table buildTable(final boolean lazyRowSetMerge) { + if (lazyTable != null) { + if (lazyRowSetMerge) { + return lazyTable; + } + } + + final long t0 = System.nanoTime(); + try { + final PartitionedTable partitionedTable; + if (lazyPartitionedTable != null) { + // We are synchronized, and can begin processing from the PartitionedTable rather than starting from + // scratch. 
The first step is to force our rowsets into memory, in parallel. + partitionedTable = lazyPartitionedTable.transform(t -> ForkJoinPoolOperationInitializer + .ensureParallelizable(() -> t.update(ROW_SET_COLUMN_NAME)).get()); + } else { + partitionedTable = buildPartitionedTable(lazyRowSetMerge); + } + + // Merge all the location index tables into a single table + final Table mergedDataIndexes = partitionedTable.merge(); + // Group the merged data indexes by the keys + final Table groupedByKeyColumns = mergedDataIndexes.groupBy(keyColumnNames.toArray(String[]::new)); + lookupFunction = AggregationProcessor.getRowLookup(groupedByKeyColumns); + + final Table combined; + if (lazyRowSetMerge) { + final ColumnSource> vectorColumnSource = + groupedByKeyColumns.getColumnSource(ROW_SET_COLUMN_NAME); + + // we are using a regular array here; so we must ensure that we are flat; or we'll have a bad time + Assert.assertion(groupedByKeyColumns.isFlat(), "groupedByKeyColumns.isFlat()"); + final RowSetCacher rowsetCacher = new RowSetCacher(vectorColumnSource, groupedByKeyColumns.intSize()); + combined = groupedByKeyColumns + .view(List.of(SelectColumn.ofStateless(new MultiSourceFunctionalColumn<>(List.of(), + ROW_SET_COLUMN_NAME, RowSet.class, (k, v) -> rowsetCacher.get(k))))); + + lazyPartitionedTable = partitionedTable; + lazyTable = combined; + } else { + // Combine the row sets from each group into a single row set + final List mergeFunction = List.of(SelectColumn.ofStateless(new FunctionalColumn<>( ROW_SET_COLUMN_NAME, ObjectVector.class, ROW_SET_COLUMN_NAME, RowSet.class, - this::mergeRowSets)))); - Assert.assertion(combined.isFlat(), "combined.isFlat()"); - Assert.eq(groupedByKeyColumns.size(), "groupedByKeyColumns.size()", combined.size(), "combined.size()"); + MergedDataIndex::mergeRowSetsSerial))); + + combined = groupedByKeyColumns.update(mergeFunction); + + indexTable = combined; + // if we were built off a lazy table, null it out now that we've setup the indexTable + lazyPartitionedTable = null; + lazyTable = null; + } + Assert.assertion(combined.isFlat(), "combined.isFlat()"); + Assert.eq(groupedByKeyColumns.size(), "groupedByKeyColumns.size()", combined.size(), "combined.size()"); - // Cleanup after ourselves - try (final CloseableIterator rowSets = mergedDataIndexes.objectColumnIterator(ROW_SET_COLUMN_NAME)) { - rowSets.forEachRemaining(SafeCloseable::close); + return combined; + } finally { + final long t1 = System.nanoTime(); + BUILD_INDEX_TABLE_MILLIS.sample((t1 - t0) / 1_000_000); } + } - lookupFunction = AggregationProcessor.getRowLookup(groupedByKeyColumns); - indexTable = combined; - return combined; + /** + * Perform a parallelizable update to produce coalesced location index tables with their row sets shifted by the + * appropriate region offset. The keys are always forced into memory so that the groupBy operation need not read + * from disk serially. + *

* <p> + * If lazyRowSetMerge is true, the rowsets are read into memory as part of the mergeRowSets call and are not forced + * into memory here. If lazyRowSetMerge is false, then the RowSets are also forced into memory. + * </p>
+ */ + private PartitionedTable buildPartitionedTable(final boolean lazyRowSetMerge) { + final String[] keyColumnNamesArray = keyColumnNames.toArray(String[]::new); + final Table locationTable = columnSourceManager.locationTable().coalesce(); + final Table locationDataIndexes = locationTable + .update(List.of(SelectColumn.ofStateless(new FunctionalColumn<>( + columnSourceManager.locationColumnName(), TableLocation.class, + LOCATION_DATA_INDEX_TABLE_COLUMN_NAME, Table.class, + (final long locationRowKey, final TableLocation location) -> loadIndexTableAndShiftRowSets( + locationRowKey, location, keyColumnNamesArray, !lazyRowSetMerge))))) + .dropColumns(columnSourceManager.locationColumnName()); + + return PartitionedTableFactory.of(locationDataIndexes); } private static Table loadIndexTableAndShiftRowSets( final long locationRowKey, @NotNull final TableLocation location, - @NotNull final String[] keyColumnNames) { + @NotNull final String[] keyColumnNames, + final boolean selectRowSets) { final BasicDataIndex dataIndex = location.getDataIndex(keyColumnNames); if (dataIndex == null) { throw new UncheckedDeephavenException(String.format("Failed to load data index [%s] for location %s", String.join(", ", keyColumnNames), location)); } final Table indexTable = dataIndex.table(); - return indexTable.coalesce().update(List.of(SelectColumn.ofStateless(new FunctionalColumn<>( - dataIndex.rowSetColumnName(), RowSet.class, - ROW_SET_COLUMN_NAME, RowSet.class, - (final RowSet rowSet) -> rowSet - .shift(RegionedColumnSource.getFirstRowKey(Math.toIntExact(locationRowKey))))))); + final long shiftAmount = RegionedColumnSource.getFirstRowKey(Math.toIntExact(locationRowKey)); + final Table coalesced = indexTable.coalesce(); + + final Selectable shiftFunction; + if (shiftAmount == 0) { + // A source column would be more convenient, but we are going to close the RowSet after we are done merging + // it and should not allow that close call to pass through to the original table. + shiftFunction = new FunctionalColumn<>( + dataIndex.rowSetColumnName(), RowSet.class, + ROW_SET_COLUMN_NAME, RowSet.class, + RowSet::copy); + } else { + shiftFunction = new FunctionalColumn<>( + dataIndex.rowSetColumnName(), RowSet.class, + ROW_SET_COLUMN_NAME, RowSet.class, + (final RowSet rowSet) -> rowSet.shift(shiftAmount)); + } + if (selectRowSets) { + // pull the key columns into memory while we are parallel; and read all the rowsets (by virtue of the shift) + final List columns = new ArrayList<>(keyColumnNames.length + 1); + Arrays.stream(keyColumnNames).map(ColumnName::of).forEach(columns::add); + columns.add(SelectColumn.ofStateless(shiftFunction)); + return ForkJoinPoolOperationInitializer.ensureParallelizable(() -> coalesced.update(columns)).get(); + } else { + // pull the key columns into memory while we are parallel; but do not read all the RowSets + final Table withInMemoryKeyColumns = + ForkJoinPoolOperationInitializer.ensureParallelizable(() -> coalesced.update(keyColumnNames)).get(); + return withInMemoryKeyColumns.updateView(List.of(SelectColumn.ofStateless(shiftFunction))); + } } - private RowSet mergeRowSets( + /** + * The returned RowSet is owned by the caller. The input RowSets are closed. 
+ */ + private static RowSet mergeRowSetsSerial( @SuppressWarnings("unused") final long unusedRowKey, @NotNull final ObjectVector keyRowSets) { + final long numRowSets = keyRowSets.size(); + + if (numRowSets == 1) { + // we steal the reference, the input is never used again + return keyRowSets.get(0); + } + final RowSetBuilderSequential builder = RowSetFactory.builderSequential(); try (final CloseableIterator rowSets = keyRowSets.iterator()) { - rowSets.forEachRemaining(builder::appendRowSequence); + rowSets.forEachRemaining(rs -> { + builder.appendRowSequence(rs); + rs.close(); + }); } + + return builder.build(); + } + + /** + * The returned RowSet is owned by the caller. The input RowSets are closed. + */ + private static RowSet mergeRowSetsParallel( + @SuppressWarnings("unused") final long unusedRowKey, + @NotNull final ObjectVector keyRowSets) { + final long numRowSets = keyRowSets.size(); + + if (numRowSets == 1) { + // we steal the reference, the input is never used again + return keyRowSets.get(0); + } + final RowSetBuilderSequential builder = RowSetFactory.builderSequential(); + + LongStream.range(0, numRowSets).parallel().mapToObj(keyRowSets::get) + .sorted(Comparator.comparingLong(RowSet::firstRowKey)).forEachOrdered(rs -> { + builder.appendRowSequence(rs); + rs.close(); + }); + return builder.build(); } @Override @NotNull - public RowKeyLookup rowKeyLookup() { - table(); + public RowKeyLookup rowKeyLookup(final DataIndexOptions options) { + table(options); return (final Object key, final boolean usePrev) -> { // Pass the object to the aggregation lookup, then return the resulting row position (which is also the row // key). @@ -232,13 +459,8 @@ public boolean isValid() { final String[] keyColumnNamesArray = keyColumnNames.toArray(String[]::new); try (final CloseableIterator locations = columnSourceManager.locationTable().objectColumnIterator(columnSourceManager.locationColumnName())) { - while (locations.hasNext()) { - if (!locations.next().hasDataIndex(keyColumnNamesArray)) { - return isValid = false; - } - } + return isValid = locations.stream().parallel().allMatch(l -> l.hasDataIndex(keyColumnNamesArray)); } - return isValid = true; } @Override diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/PartitioningColumnDataIndex.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/PartitioningColumnDataIndex.java index 4e7a9bb1a8d..1953470475e 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/PartitioningColumnDataIndex.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/sources/regioned/PartitioningColumnDataIndex.java @@ -8,6 +8,7 @@ import io.deephaven.base.verify.Require; import io.deephaven.engine.rowset.*; import io.deephaven.engine.table.ColumnSource; +import io.deephaven.engine.table.DataIndexOptions; import io.deephaven.engine.table.ModifiedColumnSet; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.TableUpdate; @@ -300,13 +301,13 @@ public Map, String> keyColumnNamesByIndexedColumn() { @Override @NotNull - public Table table() { + public Table table(final DataIndexOptions unused) { return indexTable; } @Override @NotNull - public RowKeyLookup rowKeyLookup() { + public RowKeyLookup rowKeyLookup(final DataIndexOptions unusedOptions) { return (final Object key, final boolean usePrev) -> keyPositionMap.get(key); } diff --git 
a/engine/table/src/test/java/io/deephaven/engine/table/impl/sources/regioned/TestRegionedColumnSourceManager.java b/engine/table/src/test/java/io/deephaven/engine/table/impl/sources/regioned/TestRegionedColumnSourceManager.java index 0aeb1b05a07..7d3d2e7a08b 100644 --- a/engine/table/src/test/java/io/deephaven/engine/table/impl/sources/regioned/TestRegionedColumnSourceManager.java +++ b/engine/table/src/test/java/io/deephaven/engine/table/impl/sources/regioned/TestRegionedColumnSourceManager.java @@ -388,6 +388,15 @@ private static void checkIndex( @Test public void testStaticBasics() { + testStaticBasics(DataIndexOptions.DEFAULT); + } + + @Test + public void testStaticBasicsPartial() { + testStaticBasics(DataIndexOptions.USING_PARTIAL_TABLE); + } + + private void testStaticBasics(final DataIndexOptions options) { SUT = new RegionedColumnSourceManager(false, componentFactory, ColumnToCodecMappings.EMPTY, columnDefinitions); assertEquals(makeColumnSourceMap(), SUT.getColumnSources()); @@ -451,7 +460,11 @@ public void testStaticBasics() { } captureIndexes(SUT.initialize()); - capturedGroupingColumnIndex.table(); // Force us to build the merged index *before* we check satisfaction + + // Force us to build the merged index *before* we check satisfaction + // the checkIndexes method will call table() a second time with the DEFAULT options; which exercises lazy + // conversion + capturedGroupingColumnIndex.table(options); checkIndexes(); assertEquals(Arrays.asList(tableLocation1A, tableLocation1B), SUT.includedLocations()); @@ -482,7 +495,7 @@ private DataIndexImpl(@NotNull final Table table) { } @Override - public @NotNull Table table() { + public @NotNull Table table(DataIndexOptions ignored) { return table; }
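For context, a minimal illustrative sketch (not part of the patch) of how a caller could use the DataIndexOptions API added above to materialize only a single key's RowSet. The class name PartialIndexLookupExample, the method name lookupRowSetForKey, and the null-on-miss convention are assumptions for illustration; the DataIndex, DataIndexOptions, RowKeyLookup, and rowSetColumn calls are the ones introduced in this change, mirroring the pattern used by AbstractColumnSource.doDataIndexFilter.

import io.deephaven.engine.rowset.RowSequence;
import io.deephaven.engine.rowset.RowSet;
import io.deephaven.engine.table.ColumnSource;
import io.deephaven.engine.table.DataIndex;
import io.deephaven.engine.table.DataIndexOptions;

public final class PartialIndexLookupExample {

    /**
     * Fetch the RowSet for a single index key, hinting that only a subset of the DataIndex is needed so that the
     * merged RowSet column may be built lazily. Returns null when the key is not present in the index.
     * (Illustrative helper; not part of the patch.)
     */
    public static RowSet lookupRowSetForKey(final DataIndex dataIndex, final Object key) {
        // Equivalent to DataIndexOptions.USING_PARTIAL_TABLE
        final DataIndexOptions options = DataIndexOptions.builder()
                .operationUsesPartialTable(true)
                .build();

        // Map the lookup key to a row key in the index table (current-space lookup; usePrev == false)
        final DataIndex.RowKeyLookup lookup = dataIndex.rowKeyLookup(options);
        final long rowKey = lookup.apply(key, false);
        if (rowKey == RowSequence.NULL_ROW_KEY) {
            return null;
        }

        // Read only this key's RowSet from the index table's RowSet column
        final ColumnSource<RowSet> rowSetColumn = dataIndex.rowSetColumn(options);
        return rowSetColumn.get(rowKey);
    }
}

Callers that expect to touch most or all keys would instead use DataIndexOptions.DEFAULT (or the existing no-argument overloads), which builds the fully merged index table up front rather than lazily merging RowSets per key.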