From ddbedfc30af5a530c55a15594b724625f1ef4c8e Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Tue, 12 Mar 2024 17:59:09 -0500 Subject: [PATCH 1/6] Added fix for File.toURI regression --- .../channel/SeekableChannelsProvider.java | 51 +++++++++++++++++-- .../parquet/base/ColumnChunkReaderImpl.java | 3 +- .../parquet/base/ParquetFileReader.java | 5 +- .../deephaven/parquet/table/ParquetTools.java | 7 +-- .../location/ParquetTableLocationKey.java | 7 ++- .../table/ParquetTableReadWriteTest.java | 36 +++++++++++-- 6 files changed, 93 insertions(+), 16 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index ea25e8ee712..ed5f5340094 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -18,26 +18,69 @@ public interface SeekableChannelsProvider extends SafeCloseable { /** - * Take the file source path or URI and convert it to a URI object. + * Take the file source path or URI string and convert it to a URI object. * * @param source The file source path or URI * @return The URI object */ static URI convertToURI(final String source) { + if (source.isEmpty()) { + throw new IllegalArgumentException("Cannot convert empty source to URI"); + } final URI uri; try { uri = new URI(source); } catch (final URISyntaxException e) { // If the URI is invalid, assume it's a file path - return new File(source).toURI(); + return convertFileToURI(new File(source)); } if (uri.getScheme() == null) { - // Need to convert to a "file" URI - return new File(source).toURI(); + // Convert to a "file" URI + return convertFileToURI(new File(source)); } return uri; } + /** + * Takes a file and convert it to a URI object. We should use this method instead of {@link File#toURI()} because + * {@link File#toURI()} internally calls {@link File#isDirectory()}, which can lead to disk access. + * + * @param file The file + * @return The URI object + */ + static URI convertFileToURI(final File file) { + String absPath = file.getAbsolutePath(); + if (absPath.isEmpty()) { + throw new IllegalArgumentException("Cannot convert file with empty path to URI: " + file); + } + if (File.separatorChar != '/') { + absPath = absPath.replace(File.separatorChar, '/'); + } + if (absPath.charAt(0) != '/') { + absPath = "/" + absPath; + } + if (absPath.startsWith("//")) { + absPath = "//" + absPath; + } + try { + return new URI("file", null, absPath, null); + } catch (final URISyntaxException e) { + throw new IllegalArgumentException("Failed to convert file to URI: " + file, e); + } + } + + /** + * Takes a path and convert it to a URI object. We should use this method instead of {@link Path#toUri()} because + * {@link Path#toUri()} internally does a {@code stat} system call to extract information whether file is a + * directory. + * + * @param path The path + * @return The URI object + */ + static URI convertPathToURI(final Path path) { + return convertFileToURI(path.toFile()); + } + /** * Wraps {@link SeekableChannelsProvider#getInputStream(SeekableByteChannel)} to ensure the channel's position is * incremented the exact amount that has been consumed from the resulting input stream. To remain valid, the caller diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java index e553331daa7..e5ec4e90c84 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java @@ -33,6 +33,7 @@ import java.util.function.Function; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertPathToURI; import static org.apache.parquet.format.Encoding.PLAIN_DICTIONARY; import static org.apache.parquet.format.Encoding.RLE_DICTIONARY; @@ -120,7 +121,7 @@ private URI getURI() { return uri; } if (columnChunk.isSetFile_path() && FILE_URI_SCHEME.equals(rootURI.getScheme())) { - return uri = Path.of(rootURI).resolve(columnChunk.getFile_path()).toUri(); + return uri = convertPathToURI(Path.of(rootURI).resolve(columnChunk.getFile_path())); } else { // TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs return uri = rootURI; diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java index 3a2971b02fe..ab4b27dcde0 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java @@ -10,14 +10,15 @@ import org.apache.parquet.format.Type; import org.apache.parquet.schema.*; +import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; -import java.nio.file.Path; import java.util.*; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; /** @@ -49,7 +50,7 @@ public ParquetFileReader(final URI parquetFileURI, final SeekableChannelsProvide this.channelsProvider = channelsProvider; if (!parquetFileURI.getRawPath().endsWith(".parquet") && FILE_URI_SCHEME.equals(parquetFileURI.getScheme())) { // Construct a new file URI for the parent directory - rootURI = Path.of(parquetFileURI).getParent().toUri(); + rootURI = convertFileToURI(new File(parquetFileURI).getParentFile()); } else { // TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs rootURI = parquetFileURI; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index 2e5fceb7e3b..be1a0e728f5 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -55,6 +55,7 @@ import java.util.*; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static io.deephaven.parquet.table.ParquetTableWriter.PARQUET_FILE_EXTENSION; import static io.deephaven.util.type.TypeUtils.getUnboxedTypeIfBoxed; @@ -961,7 +962,7 @@ public static Table readFlatPartitionedTable( public static Table readSingleFileTable( @NotNull final File file, @NotNull final ParquetInstructions readInstructions) { - return readSingleFileTable(file.toURI(), readInstructions); + return readSingleFileTable(convertFileToURI(file), readInstructions); } /** @@ -1007,7 +1008,7 @@ public static Table readSingleFileTable( @NotNull final File file, @NotNull final ParquetInstructions readInstructions, @NotNull final TableDefinition tableDefinition) { - return readSingleFileTable(file.toURI(), readInstructions, tableDefinition); + return readSingleFileTable(convertFileToURI(file), readInstructions, tableDefinition); } /** @@ -1145,7 +1146,7 @@ public static ParquetFileReader getParquetFileReader(@NotNull final URI parquetF public static ParquetFileReader getParquetFileReaderChecked( @NotNull final File parquetFile, @NotNull final ParquetInstructions readInstructions) throws IOException { - return getParquetFileReaderChecked(parquetFile.toURI(), readInstructions); + return getParquetFileReaderChecked(convertFileToURI(parquetFile), readInstructions); } /** diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java index 04ab35607c2..3dbdb9f845f 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java @@ -23,6 +23,9 @@ import java.util.Map; import java.util.stream.IntStream; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; + /** * {@link TableLocationKey} implementation for use with data stored in the parquet format. */ @@ -70,7 +73,7 @@ public ParquetTableLocationKey(@NotNull final URI parquetFileUri, final int orde } private static URI validateParquetFile(@NotNull final File file) { - return validateParquetFile(file.toURI()); + return validateParquetFile(convertFileToURI(file)); } private static URI validateParquetFile(@NotNull final URI parquetFileUri) { @@ -189,7 +192,7 @@ public synchronized int[] getRowGroupIndices() { // we're not expecting that in this code path. To support it, discovery tools should figure out // the row groups for a partition themselves and call setRowGroupReaders. final String filePath = rowGroups.get(rgi).getColumns().get(0).getFile_path(); - return filePath == null || new File(filePath).getAbsoluteFile().toURI().equals(uri); + return filePath == null || convertToURI(filePath).equals(uri); }).toArray(); } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index de729369f55..50ae5fba501 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -69,6 +69,7 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.net.URI; +import java.nio.file.Path; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; @@ -108,6 +109,7 @@ import static io.deephaven.parquet.table.ParquetTools.readTable; import static io.deephaven.parquet.table.ParquetTools.writeTable; import static io.deephaven.util.QueryConstants.*; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static org.junit.Assert.*; @Category(OutOfBandTest.class) @@ -874,6 +876,31 @@ private static void verifyFilesInDir(final File parentDir, final String[] expect } } + @Test + public void readFromDirTest() { + final File parentDir = new File(rootFile, "tempDir"); + parentDir.mkdir(); + final Table someTable = TableTools.emptyTable(5).update("A=(int)i"); + final File firstPartition = new File(parentDir, "X=A"); + final File firstDataFile = new File(firstPartition, "data.parquet"); + writeTable(someTable, firstDataFile); + final File secondPartition = new File(parentDir, "X=B"); + final File secondDataFile = new File(secondPartition, "data.parquet"); + writeTable(someTable, secondDataFile); + + final Table expected = readKeyValuePartitionedTable(parentDir, ParquetInstructions.EMPTY); + + String filePath = parentDir.getAbsolutePath(); + Table fromDisk = ParquetTools.readTable(filePath); + assertTableEquals(expected, fromDisk); + + // Read with a trailing slash + assertTrue(!filePath.endsWith("/")); + filePath = filePath + "/"; + fromDisk = ParquetTools.readTable(filePath); + assertTableEquals(expected, fromDisk); + } + /** * These are tests for writing a table to a parquet file and making sure there are no unnecessary files left in the * directory after we finish writing. @@ -927,7 +954,7 @@ public void basicWriteAndReadFromFileURITests() { final String filename = "basicWriteTests.parquet"; final File destFile = new File(rootFile, filename); final String absolutePath = destFile.getAbsolutePath(); - final URI fileURI = destFile.toURI(); + final URI fileURI = convertFileToURI(destFile); ParquetTools.writeTable(tableToSave, absolutePath); // Read from file URI @@ -1204,7 +1231,8 @@ public void partitionedParquetWithDotFilesTest() throws IOException { writeTable(someTable, firstDataFile); writeTable(someTable, secondDataFile); - Table partitionedTable = readKeyValuePartitionedTable(parentDir, EMPTY).select(); + final URI parentURI = convertFileToURI(parentDir); + final Table partitionedTable = readTable(parentURI.toString()).select(); final Set columnsSet = partitionedTable.getDefinition().getColumnNameSet(); assertTrue(columnsSet.size() == 2 && columnsSet.contains("A") && columnsSet.contains("X")); @@ -1214,14 +1242,14 @@ public void partitionedParquetWithDotFilesTest() throws IOException { final File dotDir = new File(firstPartition, ".dotDir"); assertTrue(dotDir.mkdir()); writeTable(someTable, new File(dotDir, "data.parquet")); - Table fromDisk = readKeyValuePartitionedTable(parentDir, EMPTY); + Table fromDisk = readTable(parentURI.toString()); assertTableEquals(fromDisk, partitionedTable); // Add a dot parquet file in one of the partitions directory final Table anotherTable = TableTools.emptyTable(5).update("B=(int)i"); final File pqDotFile = new File(secondPartition, ".dotFile.parquet"); writeTable(anotherTable, pqDotFile); - fromDisk = readKeyValuePartitionedTable(parentDir, EMPTY); + fromDisk = readTable(parentURI.toString()); assertTableEquals(fromDisk, partitionedTable); } From f3e22be37648a5b0f4d180a45b1360ee9556cbac Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Tue, 12 Mar 2024 19:58:29 -0500 Subject: [PATCH 2/6] Code review with Devin --- .../channel/SeekableChannelsProvider.java | 24 ++++++++++++------- .../parquet/base/ColumnChunkReaderImpl.java | 2 +- .../parquet/base/ParquetFileReader.java | 16 +++++++++++-- .../parquet/table/ParquetTableWriter.java | 2 +- .../deephaven/parquet/table/ParquetTools.java | 23 +++++++++--------- .../table/layout/ParquetFileHelper.java | 18 +++++++++++++- .../location/ParquetTableLocationKey.java | 4 ++-- .../table/ParquetTableReadWriteTest.java | 4 ++-- 8 files changed, 64 insertions(+), 29 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index ed5f5340094..fdeeeabc6b5 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -21,9 +21,10 @@ public interface SeekableChannelsProvider extends SafeCloseable { * Take the file source path or URI string and convert it to a URI object. * * @param source The file source path or URI + * @param isDirectory Whether the source is a directory * @return The URI object */ - static URI convertToURI(final String source) { + static URI convertToURI(final String source, final boolean isDirectory) { if (source.isEmpty()) { throw new IllegalArgumentException("Cannot convert empty source to URI"); } @@ -32,11 +33,11 @@ static URI convertToURI(final String source) { uri = new URI(source); } catch (final URISyntaxException e) { // If the URI is invalid, assume it's a file path - return convertFileToURI(new File(source)); + return convertFileToURI(new File(source), isDirectory); } if (uri.getScheme() == null) { // Convert to a "file" URI - return convertFileToURI(new File(source)); + return convertFileToURI(new File(source), isDirectory); } return uri; } @@ -46,9 +47,10 @@ static URI convertToURI(final String source) { * {@link File#toURI()} internally calls {@link File#isDirectory()}, which can lead to disk access. * * @param file The file + * @param isDirectory Whether the source file is a directory * @return The URI object */ - static URI convertFileToURI(final File file) { + static URI convertFileToURI(final File file, final boolean isDirectory) { String absPath = file.getAbsolutePath(); if (absPath.isEmpty()) { throw new IllegalArgumentException("Cannot convert file with empty path to URI: " + file); @@ -59,13 +61,16 @@ static URI convertFileToURI(final File file) { if (absPath.charAt(0) != '/') { absPath = "/" + absPath; } + if (absPath.charAt(absPath.length() - 1) != '/' && isDirectory) { + absPath = absPath + "/"; + } if (absPath.startsWith("//")) { absPath = "//" + absPath; } try { return new URI("file", null, absPath, null); } catch (final URISyntaxException e) { - throw new IllegalArgumentException("Failed to convert file to URI: " + file, e); + throw new IllegalStateException("Failed to convert file to URI: " + file, e); } } @@ -75,17 +80,18 @@ static URI convertFileToURI(final File file) { * directory. * * @param path The path + * @param isDirectory Whether the file is a directory * @return The URI object */ - static URI convertPathToURI(final Path path) { - return convertFileToURI(path.toFile()); + static URI convertPathToURI(final Path path, final boolean isDirectory) { + return convertFileToURI(path.toFile(), isDirectory); } /** * Wraps {@link SeekableChannelsProvider#getInputStream(SeekableByteChannel)} to ensure the channel's position is * incremented the exact amount that has been consumed from the resulting input stream. To remain valid, the caller * must ensure that the resulting input stream isn't re-wrapped by any downstream code in a way that would adversely - * effect the position (such as re-wrapping the resulting input stream with buffering). + * affect the position (such as re-wrapping the resulting input stream with buffering). * *

* Equivalent to {@code ChannelPositionInputStream.of(ch, provider.getInputStream(ch))}. @@ -122,7 +128,7 @@ default SeekableChannelContext makeSingleUseContext() { default SeekableByteChannel getReadChannel(@NotNull SeekableChannelContext channelContext, @NotNull String uriStr) throws IOException { - return getReadChannel(channelContext, convertToURI(uriStr)); + return getReadChannel(channelContext, convertToURI(uriStr, false)); } SeekableByteChannel getReadChannel(@NotNull SeekableChannelContext channelContext, @NotNull URI uri) diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java index e5ec4e90c84..6789d3ff95b 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java @@ -121,7 +121,7 @@ private URI getURI() { return uri; } if (columnChunk.isSetFile_path() && FILE_URI_SCHEME.equals(rootURI.getScheme())) { - return uri = convertPathToURI(Path.of(rootURI).resolve(columnChunk.getFile_path())); + return uri = convertPathToURI(Path.of(rootURI).resolve(columnChunk.getFile_path()), false); } else { // TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs return uri = rootURI; diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java index ab4b27dcde0..0dda95d4c2b 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java @@ -40,17 +40,29 @@ public class ParquetFileReader { private final URI rootURI; private final MessageType type; + /** + * Create a new ParquetFileReader for the provided source. + * + * @param source The source path or URI for the parquet file or the parquet metadata file + * @param channelsProvider The {@link SeekableChannelsProvider} to use for reading the file + */ public ParquetFileReader(final String source, final SeekableChannelsProvider channelsProvider) throws IOException { - this(convertToURI(source), channelsProvider); + this(convertToURI(source, false), channelsProvider); } + /** + * Create a new ParquetFileReader for the provided source. + * + * @param parquetFileURI The URI for the parquet file or the parquet metadata file + * @param channelsProvider The {@link SeekableChannelsProvider} to use for reading the file + */ public ParquetFileReader(final URI parquetFileURI, final SeekableChannelsProvider channelsProvider) throws IOException { this.channelsProvider = channelsProvider; if (!parquetFileURI.getRawPath().endsWith(".parquet") && FILE_URI_SCHEME.equals(parquetFileURI.getScheme())) { // Construct a new file URI for the parent directory - rootURI = convertFileToURI(new File(parquetFileURI).getParentFile()); + rootURI = convertFileToURI(new File(parquetFileURI).getParentFile(), true); } else { // TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs rootURI = parquetFileURI; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index f2b470f9be7..70ed5a095f8 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -314,7 +314,7 @@ private static ParquetFileWriter getParquetFileWriter( final Map extraMetaData = new HashMap<>(tableMeta); extraMetaData.put(METADATA_KEY, tableInfoBuilder.build().serializeToJSON()); return new ParquetFileWriter(path, - SeekableChannelsProviderLoader.getInstance().fromServiceLoader(convertToURI(path), null), + SeekableChannelsProviderLoader.getInstance().fromServiceLoader(convertToURI(path, false), null), writeInstructions.getTargetPageSize(), new HeapByteBufferAllocator(), mappedSchema.getParquetSchema(), writeInstructions.getCompressionCodecName(), extraMetaData); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index be1a0e728f5..e479166293d 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -55,6 +55,7 @@ import java.util.*; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; +import static io.deephaven.parquet.table.layout.ParquetFileHelper.convertParquetSourceToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static io.deephaven.parquet.table.ParquetTableWriter.PARQUET_FILE_EXTENSION; @@ -98,7 +99,7 @@ private ParquetTools() {} * @see ParquetFlatPartitionedLayout */ public static Table readTable(@NotNull final String source) { - return readTableInternal(convertToURI(source), ParquetInstructions.EMPTY); + return readTableInternal(convertParquetSourceToURI(source), ParquetInstructions.EMPTY); } /** @@ -129,7 +130,7 @@ public static Table readTable(@NotNull final String source) { public static Table readTable( @NotNull final String source, @NotNull final ParquetInstructions readInstructions) { - return readTableInternal(convertToURI(source), readInstructions); + return readTableInternal(convertParquetSourceToURI(source), readInstructions); } /** @@ -962,7 +963,7 @@ public static Table readFlatPartitionedTable( public static Table readSingleFileTable( @NotNull final File file, @NotNull final ParquetInstructions readInstructions) { - return readSingleFileTable(convertFileToURI(file), readInstructions); + return readSingleFileTable(convertFileToURI(file, false), readInstructions); } /** @@ -981,7 +982,7 @@ public static Table readSingleFileTable( public static Table readSingleFileTable( @NotNull final String source, @NotNull final ParquetInstructions readInstructions) { - return readSingleFileTable(convertToURI(source), readInstructions); + return readSingleFileTable(convertToURI(source, false), readInstructions); } private static Table readSingleFileTable( @@ -1008,7 +1009,7 @@ public static Table readSingleFileTable( @NotNull final File file, @NotNull final ParquetInstructions readInstructions, @NotNull final TableDefinition tableDefinition) { - return readSingleFileTable(convertFileToURI(file), readInstructions, tableDefinition); + return readSingleFileTable(convertFileToURI(file, false), readInstructions, tableDefinition); } /** @@ -1026,7 +1027,7 @@ public static Table readSingleFileTable( @NotNull final String source, @NotNull final ParquetInstructions readInstructions, @NotNull final TableDefinition tableDefinition) { - return readSingleFileTable(convertToURI(source), readInstructions, tableDefinition); + return readSingleFileTable(convertToURI(source, false), readInstructions, tableDefinition); } private static Table readSingleFileTable( @@ -1106,7 +1107,7 @@ private static ParquetSchemaReader.ColumnDefinitionConsumer makeSchemaReaderCons * Make a {@link ParquetFileReader} for the supplied {@link File}. Wraps {@link IOException} as * {@link TableDataException}. * - * @param parquetFile The {@link File} to read + * @param parquetFile The parquet file or the parquet metadata file * @param readInstructions the instructions for customizations while reading * @return The new {@link ParquetFileReader} */ @@ -1123,7 +1124,7 @@ public static ParquetFileReader getParquetFileReader(@NotNull final File parquet * Make a {@link ParquetFileReader} for the supplied {@link URI}. Wraps {@link IOException} as * {@link TableDataException}. * - * @param parquetFileURI The {@link URI} to read + * @param parquetFileURI The URI for the parquet file or the parquet metadata file * @param readInstructions the instructions for customizations while reading * @return The new {@link ParquetFileReader} */ @@ -1139,20 +1140,20 @@ public static ParquetFileReader getParquetFileReader(@NotNull final URI parquetF /** * Make a {@link ParquetFileReader} for the supplied {@link File}. * - * @param parquetFile The {@link File} to read + * @param parquetFile The parquet file or the parquet metadata file * @return The new {@link ParquetFileReader} * @throws IOException if an IO exception occurs */ public static ParquetFileReader getParquetFileReaderChecked( @NotNull final File parquetFile, @NotNull final ParquetInstructions readInstructions) throws IOException { - return getParquetFileReaderChecked(convertFileToURI(parquetFile), readInstructions); + return getParquetFileReaderChecked(convertFileToURI(parquetFile, false), readInstructions); } /** * Make a {@link ParquetFileReader} for the supplied {@link URI}. * - * @param parquetFileURI The {@link URI} to read + * @param parquetFileURI The URI for the parquet file or the parquet metadata file * @return The new {@link ParquetFileReader} * @throws IOException if an IO exception occurs */ diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java index 1c9c0edd907..21ae5710a57 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java @@ -4,10 +4,13 @@ package io.deephaven.parquet.table.layout; import io.deephaven.parquet.table.ParquetTableWriter; +import io.deephaven.util.channel.SeekableChannelsProvider; +import org.jetbrains.annotations.NotNull; +import java.net.URI; import java.nio.file.Path; -final class ParquetFileHelper { +public final class ParquetFileHelper { /** * Used as a filter to select relevant parquet files while reading all files in a directory. */ @@ -15,4 +18,17 @@ static boolean fileNameMatches(final Path path) { final String fileName = path.getFileName().toString(); return fileName.endsWith(ParquetTableWriter.PARQUET_FILE_EXTENSION) && fileName.charAt(0) != '.'; } + + /** + * Convert a parquet source to a URI. + * + * @param source The path or URI of parquet file or directory to examine + * @return The URI + */ + public static URI convertParquetSourceToURI(@NotNull final String source) { + if (source.endsWith(".parquet")) { + return SeekableChannelsProvider.convertToURI(source, false); + } + return SeekableChannelsProvider.convertToURI(source, true); + } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java index 3dbdb9f845f..fa32327f5e9 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java @@ -73,7 +73,7 @@ public ParquetTableLocationKey(@NotNull final URI parquetFileUri, final int orde } private static URI validateParquetFile(@NotNull final File file) { - return validateParquetFile(convertFileToURI(file)); + return validateParquetFile(convertFileToURI(file, false)); } private static URI validateParquetFile(@NotNull final URI parquetFileUri) { @@ -192,7 +192,7 @@ public synchronized int[] getRowGroupIndices() { // we're not expecting that in this code path. To support it, discovery tools should figure out // the row groups for a partition themselves and call setRowGroupReaders. final String filePath = rowGroups.get(rgi).getColumns().get(0).getFile_path(); - return filePath == null || convertToURI(filePath).equals(uri); + return filePath == null || convertToURI(filePath, false).equals(uri); }).toArray(); } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 50ae5fba501..0c1d66e23a4 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -954,7 +954,7 @@ public void basicWriteAndReadFromFileURITests() { final String filename = "basicWriteTests.parquet"; final File destFile = new File(rootFile, filename); final String absolutePath = destFile.getAbsolutePath(); - final URI fileURI = convertFileToURI(destFile); + final URI fileURI = convertFileToURI(destFile, false); ParquetTools.writeTable(tableToSave, absolutePath); // Read from file URI @@ -1231,7 +1231,7 @@ public void partitionedParquetWithDotFilesTest() throws IOException { writeTable(someTable, firstDataFile); writeTable(someTable, secondDataFile); - final URI parentURI = convertFileToURI(parentDir); + final URI parentURI = convertFileToURI(parentDir, true); final Table partitionedTable = readTable(parentURI.toString()).select(); final Set columnsSet = partitionedTable.getDefinition().getColumnNameSet(); assertTrue(columnsSet.size() == 2 && columnsSet.contains("A") && columnsSet.contains("X")); From a6b73396074ad0887eda08343d42a1660771a550 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Tue, 12 Mar 2024 21:17:24 -0500 Subject: [PATCH 3/6] Review with Devin part 2 --- .../channel/SeekableChannelsProvider.java | 12 +++++------ .../parquet/base/ColumnChunkReaderImpl.java | 4 ++-- .../parquet/base/ParquetFileReader.java | 3 +-- .../deephaven/parquet/table/ParquetTools.java | 21 ++++++++++++++----- .../table/layout/ParquetFileHelper.java | 18 +--------------- .../location/ParquetTableLocationKey.java | 3 +-- .../table/ParquetTableReadWriteTest.java | 7 +++---- 7 files changed, 30 insertions(+), 38 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index fdeeeabc6b5..aee96c4e691 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -33,11 +33,11 @@ static URI convertToURI(final String source, final boolean isDirectory) { uri = new URI(source); } catch (final URISyntaxException e) { // If the URI is invalid, assume it's a file path - return convertFileToURI(new File(source), isDirectory); + return convertToURI(new File(source), isDirectory); } if (uri.getScheme() == null) { // Convert to a "file" URI - return convertFileToURI(new File(source), isDirectory); + return convertToURI(new File(source), isDirectory); } return uri; } @@ -50,7 +50,7 @@ static URI convertToURI(final String source, final boolean isDirectory) { * @param isDirectory Whether the source file is a directory * @return The URI object */ - static URI convertFileToURI(final File file, final boolean isDirectory) { + static URI convertToURI(final File file, final boolean isDirectory) { String absPath = file.getAbsolutePath(); if (absPath.isEmpty()) { throw new IllegalArgumentException("Cannot convert file with empty path to URI: " + file); @@ -61,7 +61,7 @@ static URI convertFileToURI(final File file, final boolean isDirectory) { if (absPath.charAt(0) != '/') { absPath = "/" + absPath; } - if (absPath.charAt(absPath.length() - 1) != '/' && isDirectory) { + if (isDirectory && absPath.charAt(absPath.length() - 1) != '/') { absPath = absPath + "/"; } if (absPath.startsWith("//")) { @@ -83,8 +83,8 @@ static URI convertFileToURI(final File file, final boolean isDirectory) { * @param isDirectory Whether the file is a directory * @return The URI object */ - static URI convertPathToURI(final Path path, final boolean isDirectory) { - return convertFileToURI(path.toFile(), isDirectory); + static URI convertToURI(final Path path, final boolean isDirectory) { + return convertToURI(path.toFile(), isDirectory); } /** diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java index 6789d3ff95b..9f01b6de543 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java @@ -33,7 +33,7 @@ import java.util.function.Function; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertPathToURI; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static org.apache.parquet.format.Encoding.PLAIN_DICTIONARY; import static org.apache.parquet.format.Encoding.RLE_DICTIONARY; @@ -121,7 +121,7 @@ private URI getURI() { return uri; } if (columnChunk.isSetFile_path() && FILE_URI_SCHEME.equals(rootURI.getScheme())) { - return uri = convertPathToURI(Path.of(rootURI).resolve(columnChunk.getFile_path()), false); + return uri = convertToURI(Path.of(rootURI).resolve(columnChunk.getFile_path()), false); } else { // TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs return uri = rootURI; diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java index 0dda95d4c2b..4cd2e83a49a 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java @@ -18,7 +18,6 @@ import java.nio.charset.StandardCharsets; import java.util.*; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; /** @@ -62,7 +61,7 @@ public ParquetFileReader(final URI parquetFileURI, final SeekableChannelsProvide this.channelsProvider = channelsProvider; if (!parquetFileURI.getRawPath().endsWith(".parquet") && FILE_URI_SCHEME.equals(parquetFileURI.getScheme())) { // Construct a new file URI for the parent directory - rootURI = convertFileToURI(new File(parquetFileURI).getParentFile(), true); + rootURI = convertToURI(new File(parquetFileURI).getParentFile(), true); } else { // TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs rootURI = parquetFileURI; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index e479166293d..5f4e5907018 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -55,8 +55,6 @@ import java.util.*; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; -import static io.deephaven.parquet.table.layout.ParquetFileHelper.convertParquetSourceToURI; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static io.deephaven.parquet.table.ParquetTableWriter.PARQUET_FILE_EXTENSION; import static io.deephaven.util.type.TypeUtils.getUnboxedTypeIfBoxed; @@ -188,6 +186,19 @@ public static Table readTable( return readTableInternal(sourceFile, readInstructions); } + /** + * Convert a parquet source to a URI. + * + * @param source The path or URI of parquet file or directory to examine + * @return The URI + */ + private static URI convertParquetSourceToURI(@NotNull final String source) { + if (source.endsWith(".parquet")) { + return convertToURI(source, false); + } + return convertToURI(source, true); + } + /** * Write a table to a file. * @@ -963,7 +974,7 @@ public static Table readFlatPartitionedTable( public static Table readSingleFileTable( @NotNull final File file, @NotNull final ParquetInstructions readInstructions) { - return readSingleFileTable(convertFileToURI(file, false), readInstructions); + return readSingleFileTable(convertToURI(file, false), readInstructions); } /** @@ -1009,7 +1020,7 @@ public static Table readSingleFileTable( @NotNull final File file, @NotNull final ParquetInstructions readInstructions, @NotNull final TableDefinition tableDefinition) { - return readSingleFileTable(convertFileToURI(file, false), readInstructions, tableDefinition); + return readSingleFileTable(convertToURI(file, false), readInstructions, tableDefinition); } /** @@ -1147,7 +1158,7 @@ public static ParquetFileReader getParquetFileReader(@NotNull final URI parquetF public static ParquetFileReader getParquetFileReaderChecked( @NotNull final File parquetFile, @NotNull final ParquetInstructions readInstructions) throws IOException { - return getParquetFileReaderChecked(convertFileToURI(parquetFile, false), readInstructions); + return getParquetFileReaderChecked(convertToURI(parquetFile, false), readInstructions); } /** diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java index 21ae5710a57..1c9c0edd907 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFileHelper.java @@ -4,13 +4,10 @@ package io.deephaven.parquet.table.layout; import io.deephaven.parquet.table.ParquetTableWriter; -import io.deephaven.util.channel.SeekableChannelsProvider; -import org.jetbrains.annotations.NotNull; -import java.net.URI; import java.nio.file.Path; -public final class ParquetFileHelper { +final class ParquetFileHelper { /** * Used as a filter to select relevant parquet files while reading all files in a directory. */ @@ -18,17 +15,4 @@ static boolean fileNameMatches(final Path path) { final String fileName = path.getFileName().toString(); return fileName.endsWith(ParquetTableWriter.PARQUET_FILE_EXTENSION) && fileName.charAt(0) != '.'; } - - /** - * Convert a parquet source to a URI. - * - * @param source The path or URI of parquet file or directory to examine - * @return The URI - */ - public static URI convertParquetSourceToURI(@NotNull final String source) { - if (source.endsWith(".parquet")) { - return SeekableChannelsProvider.convertToURI(source, false); - } - return SeekableChannelsProvider.convertToURI(source, true); - } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java index fa32327f5e9..09a5cdd8d32 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java @@ -23,7 +23,6 @@ import java.util.Map; import java.util.stream.IntStream; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; /** @@ -73,7 +72,7 @@ public ParquetTableLocationKey(@NotNull final URI parquetFileUri, final int orde } private static URI validateParquetFile(@NotNull final File file) { - return validateParquetFile(convertFileToURI(file, false)); + return validateParquetFile(convertToURI(file, false)); } private static URI validateParquetFile(@NotNull final URI parquetFileUri) { diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 0c1d66e23a4..1c5071daf41 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -69,7 +69,6 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.net.URI; -import java.nio.file.Path; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; @@ -109,7 +108,7 @@ import static io.deephaven.parquet.table.ParquetTools.readTable; import static io.deephaven.parquet.table.ParquetTools.writeTable; import static io.deephaven.util.QueryConstants.*; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertFileToURI; +import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static org.junit.Assert.*; @Category(OutOfBandTest.class) @@ -954,7 +953,7 @@ public void basicWriteAndReadFromFileURITests() { final String filename = "basicWriteTests.parquet"; final File destFile = new File(rootFile, filename); final String absolutePath = destFile.getAbsolutePath(); - final URI fileURI = convertFileToURI(destFile, false); + final URI fileURI = convertToURI(destFile, false); ParquetTools.writeTable(tableToSave, absolutePath); // Read from file URI @@ -1231,7 +1230,7 @@ public void partitionedParquetWithDotFilesTest() throws IOException { writeTable(someTable, firstDataFile); writeTable(someTable, secondDataFile); - final URI parentURI = convertFileToURI(parentDir, true); + final URI parentURI = convertToURI(parentDir, true); final Table partitionedTable = readTable(parentURI.toString()).select(); final Set columnsSet = partitionedTable.getDefinition().getColumnNameSet(); assertTrue(columnsSet.size() == 2 && columnsSet.contains("A") && columnsSet.contains("X")); From 7188aa3fe4a9406d0503f08522020e123446bf12 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 13 Mar 2024 09:27:38 -0500 Subject: [PATCH 4/6] Rewording comments --- .../util/channel/SeekableChannelsProvider.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index aee96c4e691..af8c53b66e9 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -43,8 +43,9 @@ static URI convertToURI(final String source, final boolean isDirectory) { } /** - * Takes a file and convert it to a URI object. We should use this method instead of {@link File#toURI()} because - * {@link File#toURI()} internally calls {@link File#isDirectory()}, which can lead to disk access. + * Takes a file and convert it to a URI object. This method is preferred instead of {@link File#toURI()} because + * {@link File#toURI()} internally calls {@link File#isDirectory()}, which typically invokes the {@code stat} system + * call, resulting in filesystem metadata access. * * @param file The file * @param isDirectory Whether the source file is a directory @@ -52,9 +53,6 @@ static URI convertToURI(final String source, final boolean isDirectory) { */ static URI convertToURI(final File file, final boolean isDirectory) { String absPath = file.getAbsolutePath(); - if (absPath.isEmpty()) { - throw new IllegalArgumentException("Cannot convert file with empty path to URI: " + file); - } if (File.separatorChar != '/') { absPath = absPath.replace(File.separatorChar, '/'); } @@ -75,9 +73,8 @@ static URI convertToURI(final File file, final boolean isDirectory) { } /** - * Takes a path and convert it to a URI object. We should use this method instead of {@link Path#toUri()} because - * {@link Path#toUri()} internally does a {@code stat} system call to extract information whether file is a - * directory. + * Takes a path and convert it to a URI object. This method is preferred instead of {@link Path#toUri()} because + * {@link Path#toUri()} internally invokes the {@code stat} system call, resulting in filesystem metadata access. * * @param path The path * @param isDirectory Whether the file is a directory From 12edf83082d30df9b25d88eb256ad4725b39ceaa Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 13 Mar 2024 14:33:11 -0500 Subject: [PATCH 5/6] Moved File->URI methods to FileUtils --- .../java/io/deephaven/base/FileUtils.java | 71 +++++++++++++++++++ .../channel/SeekableChannelsProvider.java | 71 +------------------ .../parquet/base/ColumnChunkReaderImpl.java | 2 +- .../parquet/base/ParquetFileReader.java | 2 +- .../parquet/table/ParquetTableWriter.java | 2 +- .../deephaven/parquet/table/ParquetTools.java | 2 +- .../location/ParquetTableLocationKey.java | 2 +- .../table/ParquetTableReadWriteTest.java | 2 +- 8 files changed, 79 insertions(+), 75 deletions(-) diff --git a/Base/src/main/java/io/deephaven/base/FileUtils.java b/Base/src/main/java/io/deephaven/base/FileUtils.java index 983c7d3a5b7..367c6896a05 100644 --- a/Base/src/main/java/io/deephaven/base/FileUtils.java +++ b/Base/src/main/java/io/deephaven/base/FileUtils.java @@ -8,6 +8,9 @@ import org.jetbrains.annotations.Nullable; import java.io.*; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Path; import java.util.ArrayList; public class FileUtils { @@ -249,4 +252,72 @@ public boolean accept(File pathname) { || (pathname.isFile() && (normalFileFilter == null || normalFileFilter.accept(pathname))); } } + + /** + * Take the file source path or URI string and convert it to a URI object. + * + * @param source The file source path or URI + * @param isDirectory Whether the source is a directory + * @return The URI object + */ + public static URI convertToURI(final String source, final boolean isDirectory) { + if (source.isEmpty()) { + throw new IllegalArgumentException("Cannot convert empty source to URI"); + } + final URI uri; + try { + uri = new URI(source); + } catch (final URISyntaxException e) { + // If the URI is invalid, assume it's a file path + return convertToURI(new File(source), isDirectory); + } + if (uri.getScheme() == null) { + // Convert to a "file" URI + return convertToURI(new File(source), isDirectory); + } + return uri; + } + + /** + * Takes a file and convert it to a URI object with {@code "file"} scheme. This method is preferred instead of + * {@link File#toURI()} because {@link File#toURI()} internally calls {@link File#isDirectory()}, which typically + * invokes the {@code stat} system call, resulting in filesystem metadata access. + * + * @param file The file + * @param isDirectory Whether the source file is a directory + * @return The URI object + */ + public static URI convertToURI(final File file, final boolean isDirectory) { + String absPath = file.getAbsolutePath(); + if (File.separatorChar != '/') { + absPath = absPath.replace(File.separatorChar, '/'); + } + if (absPath.charAt(0) != '/') { + absPath = "/" + absPath; + } + if (isDirectory && absPath.charAt(absPath.length() - 1) != '/') { + absPath = absPath + "/"; + } + if (absPath.startsWith("//")) { + absPath = "//" + absPath; + } + try { + return new URI("file", null, absPath, null); + } catch (final URISyntaxException e) { + throw new IllegalStateException("Failed to convert file to URI: " + file, e); + } + } + + /** + * Takes a path and convert it to a URI object with {@code "file"} scheme. This method is preferred instead of + * {@link Path#toUri()} because {@link Path#toUri()} internally invokes the {@code stat} system call, resulting in + * filesystem metadata access. + * + * @param path The path + * @param isDirectory Whether the file is a directory + * @return The URI object + */ + public static URI convertToURI(final Path path, final boolean isDirectory) { + return convertToURI(path.toFile(), isDirectory); + } } diff --git a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java index af8c53b66e9..ba7bcf49c74 100644 --- a/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java +++ b/Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java @@ -6,83 +6,16 @@ import io.deephaven.util.SafeCloseable; import org.jetbrains.annotations.NotNull; -import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URI; -import java.net.URISyntaxException; import java.nio.channels.SeekableByteChannel; import java.nio.file.Path; import java.nio.file.Paths; -public interface SeekableChannelsProvider extends SafeCloseable { - - /** - * Take the file source path or URI string and convert it to a URI object. - * - * @param source The file source path or URI - * @param isDirectory Whether the source is a directory - * @return The URI object - */ - static URI convertToURI(final String source, final boolean isDirectory) { - if (source.isEmpty()) { - throw new IllegalArgumentException("Cannot convert empty source to URI"); - } - final URI uri; - try { - uri = new URI(source); - } catch (final URISyntaxException e) { - // If the URI is invalid, assume it's a file path - return convertToURI(new File(source), isDirectory); - } - if (uri.getScheme() == null) { - // Convert to a "file" URI - return convertToURI(new File(source), isDirectory); - } - return uri; - } - - /** - * Takes a file and convert it to a URI object. This method is preferred instead of {@link File#toURI()} because - * {@link File#toURI()} internally calls {@link File#isDirectory()}, which typically invokes the {@code stat} system - * call, resulting in filesystem metadata access. - * - * @param file The file - * @param isDirectory Whether the source file is a directory - * @return The URI object - */ - static URI convertToURI(final File file, final boolean isDirectory) { - String absPath = file.getAbsolutePath(); - if (File.separatorChar != '/') { - absPath = absPath.replace(File.separatorChar, '/'); - } - if (absPath.charAt(0) != '/') { - absPath = "/" + absPath; - } - if (isDirectory && absPath.charAt(absPath.length() - 1) != '/') { - absPath = absPath + "/"; - } - if (absPath.startsWith("//")) { - absPath = "//" + absPath; - } - try { - return new URI("file", null, absPath, null); - } catch (final URISyntaxException e) { - throw new IllegalStateException("Failed to convert file to URI: " + file, e); - } - } +import static io.deephaven.base.FileUtils.convertToURI; - /** - * Takes a path and convert it to a URI object. This method is preferred instead of {@link Path#toUri()} because - * {@link Path#toUri()} internally invokes the {@code stat} system call, resulting in filesystem metadata access. - * - * @param path The path - * @param isDirectory Whether the file is a directory - * @return The URI object - */ - static URI convertToURI(final Path path, final boolean isDirectory) { - return convertToURI(path.toFile(), isDirectory); - } +public interface SeekableChannelsProvider extends SafeCloseable { /** * Wraps {@link SeekableChannelsProvider#getInputStream(SeekableByteChannel)} to ensure the channel's position is diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java index 9f01b6de543..1166b3956b4 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ColumnChunkReaderImpl.java @@ -32,8 +32,8 @@ import java.util.NoSuchElementException; import java.util.function.Function; +import static io.deephaven.base.FileUtils.convertToURI; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static org.apache.parquet.format.Encoding.PLAIN_DICTIONARY; import static org.apache.parquet.format.Encoding.RLE_DICTIONARY; diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java index 4cd2e83a49a..e97576f95a7 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java @@ -18,7 +18,7 @@ import java.nio.charset.StandardCharsets; import java.util.*; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; +import static io.deephaven.base.FileUtils.convertToURI; /** * Top level accessor for a parquet file which can read both from a file path string or a CLI style file URI, diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index 70ed5a095f8..66f93a7073b 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -45,7 +45,7 @@ import java.nio.file.Paths; import java.util.*; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; +import static io.deephaven.base.FileUtils.convertToURI; /** * API for writing DH tables in parquet format diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index 5f4e5907018..dbbdfd88ad7 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -54,8 +54,8 @@ import java.nio.file.attribute.BasicFileAttributes; import java.util.*; +import static io.deephaven.base.FileUtils.convertToURI; import static io.deephaven.parquet.base.ParquetFileReader.FILE_URI_SCHEME; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static io.deephaven.parquet.table.ParquetTableWriter.PARQUET_FILE_EXTENSION; import static io.deephaven.util.type.TypeUtils.getUnboxedTypeIfBoxed; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java index 09a5cdd8d32..b8fdcb1c5be 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java @@ -23,7 +23,7 @@ import java.util.Map; import java.util.stream.IntStream; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; +import static io.deephaven.base.FileUtils.convertToURI; /** * {@link TableLocationKey} implementation for use with data stored in the parquet format. diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 1c5071daf41..0dfeec49305 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -88,6 +88,7 @@ import javax.annotation.Nullable; +import static io.deephaven.base.FileUtils.convertToURI; import static io.deephaven.engine.testutil.TstUtils.assertTableEquals; import static io.deephaven.engine.util.TableTools.booleanCol; import static io.deephaven.engine.util.TableTools.byteCol; @@ -108,7 +109,6 @@ import static io.deephaven.parquet.table.ParquetTools.readTable; import static io.deephaven.parquet.table.ParquetTools.writeTable; import static io.deephaven.util.QueryConstants.*; -import static io.deephaven.util.channel.SeekableChannelsProvider.convertToURI; import static org.junit.Assert.*; @Category(OutOfBandTest.class) From 2b873ff89e133b3222de80697f5ca8a8fb06ac77 Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 13 Mar 2024 17:50:27 -0500 Subject: [PATCH 6/6] Trigger CI jobs