-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-45594: [C++][Parquet] POC: Optimize Parquet DecodeArrow in DeltaLengthByteArray #45622
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -39,6 +39,7 @@ | |||||||||||||
#include "arrow/util/binary_view_util.h" | ||||||||||||||
#include "arrow/util/macros.h" | ||||||||||||||
#include "arrow/util/visibility.h" | ||||||||||||||
#include "arrow/visit_data_inline.h" | ||||||||||||||
|
||||||||||||||
namespace arrow { | ||||||||||||||
|
||||||||||||||
|
@@ -303,6 +304,77 @@ class BaseBinaryBuilder | |||||||||||||
return Status::OK(); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
Status AppendBinaryWithLengths(std::string_view binary, const int32_t* value_lengths, | ||||||||||||||
int64_t length) { | ||||||||||||||
ARROW_RETURN_NOT_OK(Reserve(length)); | ||||||||||||||
UnsafeAppendToBitmap(/*valid_bytes=*/NULLPTR, length); | ||||||||||||||
Comment on lines
+309
to
+310
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you call these after the error checks below? |
||||||||||||||
// All values is valid | ||||||||||||||
int64_t accum_length = 0; | ||||||||||||||
for (int64_t i = 0; i < length; ++i) { | ||||||||||||||
accum_length += value_lengths[i]; | ||||||||||||||
} | ||||||||||||||
if (ARROW_PREDICT_FALSE(binary.size() < static_cast<size_t>(accum_length))) { | ||||||||||||||
return Status::Invalid("Binary data is too short"); | ||||||||||||||
} | ||||||||||||||
Comment on lines
+316
to
+318
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not check equality? It's trivial to call
Suggested change
|
||||||||||||||
if (ARROW_PREDICT_FALSE(binary.size() + value_data_builder_.length() > | ||||||||||||||
std::numeric_limits<int32_t>::max())) { | ||||||||||||||
return Status::Invalid("Append binary data too long"); | ||||||||||||||
} | ||||||||||||||
Comment on lines
+319
to
+322
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's call |
||||||||||||||
std::string_view sub_data = binary.substr(0, accum_length); | ||||||||||||||
ARROW_RETURN_NOT_OK(value_data_builder_.Append( | ||||||||||||||
reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size())); | ||||||||||||||
accum_length = 0; | ||||||||||||||
const int64_t initialize_offset = value_data_builder_.length(); | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||
for (int64_t i = 0; i < length; ++i) { | ||||||||||||||
offsets_builder_.UnsafeAppend( | ||||||||||||||
static_cast<int32_t>(initialize_offset + accum_length)); | ||||||||||||||
accum_length += value_lengths[i]; | ||||||||||||||
} | ||||||||||||||
return Status::OK(); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
Status AppendBinaryWithLengths(std::string_view binary, const int32_t* value_lengths, | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comments below... |
||||||||||||||
int64_t length, int64_t null_count, | ||||||||||||||
const uint8_t* valid_bits, int64_t valid_bits_offset) { | ||||||||||||||
if (valid_bits == NULLPTR || null_count == 0) { | ||||||||||||||
return AppendBinaryWithLengths(binary, value_lengths, length); | ||||||||||||||
} | ||||||||||||||
ARROW_RETURN_NOT_OK(Reserve(length)); | ||||||||||||||
int64_t accum_length = 0; | ||||||||||||||
for (int64_t i = 0; i < length; ++i) { | ||||||||||||||
accum_length += value_lengths[i]; | ||||||||||||||
} | ||||||||||||||
if (ARROW_PREDICT_FALSE(binary.size() < static_cast<size_t>(accum_length))) { | ||||||||||||||
return Status::Invalid("Binary data is too short"); | ||||||||||||||
} | ||||||||||||||
const int64_t original_offset = value_data_builder_.length(); | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use the same naming as above? e.g. |
||||||||||||||
if (ARROW_PREDICT_FALSE(original_offset + accum_length) > | ||||||||||||||
std::numeric_limits<int32_t>::max()) { | ||||||||||||||
return Status::Invalid("Append binary data too long"); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
std::string_view sub_data = binary.substr(0, accum_length); | ||||||||||||||
ARROW_RETURN_NOT_OK(value_data_builder_.Append( | ||||||||||||||
reinterpret_cast<const uint8_t*>(sub_data.data()), sub_data.size())); | ||||||||||||||
int64_t length_idx = 0; | ||||||||||||||
accum_length = 0; | ||||||||||||||
RETURN_NOT_OK(VisitNullBitmapInline( | ||||||||||||||
valid_bits, valid_bits_offset, length, null_count, | ||||||||||||||
[&]() { | ||||||||||||||
offsets_builder_.UnsafeAppend( | ||||||||||||||
static_cast<int32_t>(original_offset + accum_length)); | ||||||||||||||
accum_length += value_lengths[length_idx]; | ||||||||||||||
++length_idx; | ||||||||||||||
return Status::OK(); | ||||||||||||||
}, | ||||||||||||||
[&]() { | ||||||||||||||
offsets_builder_.UnsafeAppend( | ||||||||||||||
static_cast<int32_t>(original_offset + accum_length)); | ||||||||||||||
return Status::OK(); | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, we're not incrementing |
||||||||||||||
})); | ||||||||||||||
return Status::OK(); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
void Reset() override { | ||||||||||||||
ArrayBuilder::Reset(); | ||||||||||||||
offsets_builder_.Reset(); | ||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not call this
AppendValuesWithLengths
?Also, it should probably take the right offset type?
And you need to add a docstring...