Skip to content

Commit

Permalink
[Opt](Compression) Opt zstd block decompression by `ZSTD_decompressDCtx()`.
Browse files Browse the repository at this point in the history
  • Loading branch information
kaka11chen committed Nov 27, 2023
1 parent 5700332 commit 319b804
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 23 deletions.
32 changes: 9 additions & 23 deletions be/src/util/block_compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,42 +816,28 @@ class ZstdBlockCompression : public BlockCompressionCodec {
return Status::OK();
}

// One-shot decompression of a complete ZSTD frame.
// Uses ZSTD_decompressDCtx() instead of the streaming API: block
// decompression always has the whole compressed frame in `input` and a
// caller-provided output buffer large enough for the decompressed data,
// so a single call avoids the per-chunk loop overhead of
// ZSTD_decompressStream().
//
// input:  the complete compressed frame.
// output: in:  buffer (data/size) to decompress into;
//         out: `size` is updated to the actual decompressed byte count.
// Returns Status::OK() on success, Status::InvalidArgument on any ZSTD
// error (including an undersized output buffer, which ZSTD reports as
// ZSTD_error_dstSize_tooSmall).
Status decompress(const Slice& input, Slice* output) override {
    DContext* context;
    bool decompress_failed = false;
    RETURN_IF_ERROR(_acquire_decompression_ctx(&context));
    // On failure the context state is unknown, so destroy it rather than
    // returning a possibly-corrupt context to the pool.
    Defer defer {[&] {
        if (decompress_failed) {
            _delete_decompression_ctx(context);
        } else {
            _release_decompression_ctx(context);
        }
    }};

    size_t ret = ZSTD_decompressDCtx(context->ctx, output->data, output->size, input.data,
                                     input.size);
    if (ZSTD_isError(ret)) {
        decompress_failed = true;
        return Status::InvalidArgument("ZSTD_decompressDCtx error: {}",
                                       ZSTD_getErrorString(ZSTD_getErrorCode(ret)));
    }

    // On success ret is the number of decompressed bytes; report it to
    // the caller through output->size.
    output->size = ret;

    return Status::OK();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1412,6 +1412,88 @@ TBLPROPERTIES (

msck repair table parquet_gzip_all_types;

-- Hive table covering all supported primitive/complex types, stored as
-- Parquet with ZSTD compression; used by the external-catalog regression
-- test test_hive_basic_type (order_qt_42).
CREATE TABLE `parquet_zstd_all_types`(
  `t_null_string` string,
  `t_null_varchar` varchar(65535),
  `t_null_char` char(10),
  `t_null_decimal_precision_2` decimal(2,1),
  `t_null_decimal_precision_4` decimal(4,2),
  `t_null_decimal_precision_8` decimal(8,4),
  `t_null_decimal_precision_17` decimal(17,8),
  `t_null_decimal_precision_18` decimal(18,8),
  `t_null_decimal_precision_38` decimal(38,16),
  `t_empty_string` string,
  `t_string` string,
  `t_empty_varchar` varchar(65535),
  `t_varchar` varchar(65535),
  `t_varchar_max_length` varchar(65535),
  `t_char` char(10),
  `t_int` int,
  `t_bigint` bigint,
  `t_float` float,
  `t_double` double,
  `t_boolean_true` boolean,
  `t_boolean_false` boolean,
  `t_decimal_precision_2` decimal(2,1),
  `t_decimal_precision_4` decimal(4,2),
  `t_decimal_precision_8` decimal(8,4),
  `t_decimal_precision_17` decimal(17,8),
  `t_decimal_precision_18` decimal(18,8),
  `t_decimal_precision_38` decimal(38,16),
  `t_binary` binary,
  `t_map_string` map<string,string>,
  `t_map_varchar` map<varchar(65535),varchar(65535)>,
  `t_map_char` map<char(10),char(10)>,
  `t_map_int` map<int,int>,
  `t_map_bigint` map<bigint,bigint>,
  `t_map_float` map<float,float>,
  `t_map_double` map<double,double>,
  `t_map_boolean` map<boolean,boolean>,
  `t_map_decimal_precision_2` map<decimal(2,1),decimal(2,1)>,
  `t_map_decimal_precision_4` map<decimal(4,2),decimal(4,2)>,
  `t_map_decimal_precision_8` map<decimal(8,4),decimal(8,4)>,
  `t_map_decimal_precision_17` map<decimal(17,8),decimal(17,8)>,
  `t_map_decimal_precision_18` map<decimal(18,8),decimal(18,8)>,
  `t_map_decimal_precision_38` map<decimal(38,16),decimal(38,16)>,
  `t_array_string` array<string>,
  `t_array_int` array<int>,
  `t_array_bigint` array<bigint>,
  `t_array_float` array<float>,
  `t_array_double` array<double>,
  `t_array_boolean` array<boolean>,
  `t_array_varchar` array<varchar(65535)>,
  `t_array_char` array<char(10)>,
  `t_array_decimal_precision_2` array<decimal(2,1)>,
  `t_array_decimal_precision_4` array<decimal(4,2)>,
  `t_array_decimal_precision_8` array<decimal(8,4)>,
  `t_array_decimal_precision_17` array<decimal(17,8)>,
  `t_array_decimal_precision_18` array<decimal(18,8)>,
  `t_array_decimal_precision_38` array<decimal(38,16)>,
  `t_struct_bigint` struct<s_bigint:bigint>,
  `t_complex` map<string,array<struct<s_int:int>>>,
  `t_struct_nested` struct<struct_field:array<string>>,
  `t_struct_null` struct<struct_field_null:string,struct_field_null2:string>,
  `t_struct_non_nulls_after_nulls` struct<struct_non_nulls_after_nulls1:int,struct_non_nulls_after_nulls2:string>,
  `t_nested_struct_non_nulls_after_nulls` struct<struct_field1:int,struct_field2:string,strict_field3:struct<nested_struct_field1:int,nested_struct_field2:string>>,
  `t_map_null_value` map<string,string>,
  `t_array_string_starting_with_nulls` array<string>,
  `t_array_string_with_nulls_in_between` array<string>,
  `t_array_string_ending_with_nulls` array<string>,
  `t_array_string_all_nulls` array<string>
) ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  -- Fixed: previously pointed at .../parquet_gzip_all_types (copy-paste
  -- from the gzip table above), which does not match this table's name or
  -- its ZSTD compression property. Verify the preinstalled data actually
  -- lives at this path.
  '/user/doris/preinstalled_data/parquet_table/parquet_zstd_all_types'
TBLPROPERTIES (
  'transient_lastDdlTime'='1681213018',
  "parquet.compression"="ZSTD");

-- Register the pre-existing HDFS partitions/files with the metastore.
msck repair table parquet_zstd_all_types;

CREATE TABLE `rcbinary_all_types`(
`t_null_string` string,
`t_null_varchar` varchar(65535),
Expand Down
Binary file not shown.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ suite("test_hive_basic_type", "external_docker,hive,external_docker_hive,p0,exte
order_qt_33 """select * from ${catalog_name}.${ex_db_name}.parquet_all_types limit 1;"""

order_qt_36 """select * from ${catalog_name}.${ex_db_name}.parquet_gzip_all_types limit 1;"""
order_qt_42 """select * from ${catalog_name}.${ex_db_name}.parquet_zstd_all_types limit 1;"""

// hive tables of json classes do not necessarily support column separation to identify errors
//order_qt_8 """select * from ${catalog_name}.${ex_db_name}.json_all_types limit 1;"""
Expand Down

0 comments on commit 319b804

Please sign in to comment.