From 0b8bcdb8ef041b2e19768517db2807aaab4cf691 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Thu, 6 Feb 2025 17:57:40 -0800 Subject: [PATCH] Remove extraneous padding in plain encoder This fixes two bugs with the padding added by the bytes_to_array function in the plain encoder. This function is used for reading indexes in LanceDB cloud. Problem 1: Prior to this commit, space was reserved based on an incorrect multiplication of the user-supplied buffer's byte length and the bytewidth of the item type. Instead, we should multiply the bytewidth by the element count. The effect of this was previously to pad too much space at the end of parsed arrays. Problem 2: We previously had some logic that was padding a buffer by extending it with 0-valued uint32s, instead of 0-valued uint8. This resulted in a multiplication of the padding added by four. --- rust/lance-arrow/src/lib.rs | 2 +- rust/lance-io/src/encodings/plain.rs | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index d08992bf3b..0f2db893f6 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -822,7 +822,7 @@ impl BufferExt for arrow_buffer::Buffer { let mut buf = MutableBuffer::with_capacity(size_bytes); let to_fill = size_bytes - bytes.len(); buf.extend(bytes); - buf.extend(std::iter::repeat(0).take(to_fill)); + buf.extend(std::iter::repeat(0 as u8).take(to_fill)); Self::from(buf) } } diff --git a/rust/lance-io/src/encodings/plain.rs b/rust/lance-io/src/encodings/plain.rs index 844a4c516c..4de7166db0 100644 --- a/rust/lance-io/src/encodings/plain.rs +++ b/rust/lance-io/src/encodings/plain.rs @@ -199,7 +199,7 @@ pub fn bytes_to_array( { // this code is taken from // https://github.com/apache/arrow-rs/blob/master/arrow-data/src/data.rs#L748-L768 - let len_plus_offset = bytes.len() + offset; + let len_plus_offset = len + offset; let min_buffer_size = len_plus_offset.saturating_mul(*byte_width); // alignment or size isn't right -- just make a copy @@ -634,6 +634,25 @@ mod tests { test_round_trip(arrs.as_slice(), t).await; } + #[tokio::test] + async fn test_bytes_to_array_padding() { + let bytes = Bytes::from_static(&[0x01, 0x00, 0x02, 0x00, 0x03]); + let arr = bytes_to_array(&DataType::UInt16, bytes, 3, 0).unwrap(); + + let expected = UInt16Array::from(vec![1, 2, 3]); + assert_eq!(arr.as_ref(), &expected); + + // Underlying data is padded to the nearest multiple of two bytes (for u16). + let data = arr.to_data(); + let buf = &data.buffers()[0]; + let repr = format!("{:?}", buf); + assert!( + repr.contains("[1, 0, 2, 0, 3, 0]"), + "Underlying buffer contains unexpected data: {}", + repr + ); + } + #[tokio::test] async fn test_encode_decode_nested_fixed_size_list() { // FixedSizeList of FixedSizeList