Skip to content

Commit

Permalink
Fix FFI array offset handling (#5964)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Jun 26, 2024
1 parent 6bc9514 commit 0a4d8a1
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 63 deletions.
24 changes: 12 additions & 12 deletions arrow-array/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -425,32 +425,32 @@ impl<'a> ImportedArrowArray<'a> {
(length + 1) * (bits / 8)
}
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
// the len of the data buffer (buffer 2) equals the difference between the last value
// and the first value of the offset buffer (buffer 1).
if self.array.is_empty() {
return Ok(0);
}

// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
let len = self.buffer_len(1, dt)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = self.array.buffer(1) as *const i32;
// get first offset
let start = (unsafe { *offset_buffer.add(0) }) as usize;
// get last offset
let end = (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize;
end - start
(unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
}
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
// the len of the data buffer (buffer 2) equals the difference between the last value
// and the first value of the offset buffer (buffer 1).
if self.array.is_empty() {
return Ok(0);
}

// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
let len = self.buffer_len(1, dt)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = self.array.buffer(1) as *const i64;
// get first offset
let start = (unsafe { *offset_buffer.add(0) }) as usize;
// get last offset
let end = (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize;
end - start
(unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
}
// buffer len of primitive types
_ => {
Expand Down
55 changes: 4 additions & 51 deletions arrow-data/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,37 +131,6 @@ impl FFI_ArrowArray {
data.buffers().iter().map(|b| Some(b.clone())).collect()
};

// Handle buffer offset for offset buffer.
let offset_offset = match data.data_type() {
DataType::Utf8 | DataType::Binary => {
// Offset buffer is possible a slice of the buffer.
// If we use slice pointer as exported buffer pointer, it will cause
// the consumer to calculate incorrect length of data buffer (buffer 1).
// We need to get the offset of the offset buffer and fill it in
// the `FFI_ArrowArray` offset field.
Some(data.buffers()[0].ptr_offset() / std::mem::size_of::<i32>())
}
DataType::LargeUtf8 | DataType::LargeBinary => {
// Offset buffer is possible a slice of the buffer.
// If we use slice pointer as exported buffer pointer, it will cause
// the consumer to calculate incorrect length of data buffer (buffer 1).
// We need to get the offset of the offset buffer and fill it in
// the `FFI_ArrowArray` offset field.
Some(data.buffers()[0].ptr_offset() / std::mem::size_of::<i64>())
}
_ => None,
};

let offset = if let Some(offset) = offset_offset {
if data.offset() != 0 {
// TODO: Adjust for data offset
panic!("The ArrayData of a slice offset buffer should not have offset");
}
offset
} else {
data.offset()
};

// `n_buffers` is the number of buffers by the spec.
let n_buffers = {
data_layout.buffers.len() + {
Expand All @@ -174,25 +143,9 @@ impl FFI_ArrowArray {

let buffers_ptr = buffers
.iter()
.enumerate()
.flat_map(|(buffer_idx, maybe_buffer)| match maybe_buffer {
Some(b) => {
match (data.data_type(), buffer_idx) {
(
DataType::Utf8
| DataType::LargeUtf8
| DataType::Binary
| DataType::LargeBinary,
1,
) => {
// For offset buffer, take original pointer without offset.
// Buffer offset should be handled by `FFI_ArrowArray` offset field.
Some(b.data_ptr().as_ptr() as *const c_void)
}
// For other buffers, note that `raw_data` takes into account the buffer's offset
_ => Some(b.as_ptr() as *const c_void),
}
}
.flat_map(|maybe_buffer| match maybe_buffer {
// note that `raw_data` takes into account the buffer's offset
Some(b) => Some(b.as_ptr() as *const c_void),
// This is for null buffer. We only put a null pointer for
// null buffer if by spec it can contain null mask.
None if data_layout.can_contain_null_mask => Some(std::ptr::null()),
Expand Down Expand Up @@ -233,7 +186,7 @@ impl FFI_ArrowArray {
Self {
length: data.len() as i64,
null_count: null_count as i64,
offset: offset as i64,
offset: data.offset() as i64,
n_buffers,
n_children,
buffers: private_data.buffers_ptr.as_mut_ptr(),
Expand Down

0 comments on commit 0a4d8a1

Please sign in to comment.