Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove use of deprecated dict_id in datafusion-proto (#14173) #14227

Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f1c1ca9
Remove use of deprecated dict_id in datafusion-proto (#14173)
Jan 22, 2025
02a6a73
Fix issues causing GitHub checks to fail
Jan 24, 2025
f7efa31
Fix issues causing GitHub checks to fail
Jan 25, 2025
efbd98e
Fix issues causing GitHub checks to fail
Jan 25, 2025
50b05d3
Fix issues causing GitHub checks to fail
Jan 26, 2025
4129d2a
Fix issues causing GitHub checks to fail
Jan 26, 2025
23ba251
Fix issues causing GitHub checks to fail
Jan 29, 2025
95f8bac
Fix issues causing GitHub checks to fail
Jan 29, 2025
fc0a2cb
Fix issues causing GitHub checks to fail
Jan 29, 2025
a5e5521
Fix issues causing GitHub checks to fail
Jan 29, 2025
b579e43
Fix issues causing GitHub checks to fail
Jan 30, 2025
67b4548
Fix issues causing GitHub checks to fail
Jan 30, 2025
4d7865d
Fix issues causing GitHub checks to fail
Jan 30, 2025
cf7ead2
Fix issues causing GitHub checks to fail
Jan 30, 2025
e009217
Fix issues causing GitHub checks to fail
Jan 30, 2025
2b38201
Fix issues causing GitHub checks to fail
Jan 30, 2025
9683ec0
Fix issues causing GitHub checks to fail
Jan 30, 2025
f9678b4
Fix issues causing GitHub checks to fail
Jan 31, 2025
1d656cd
Fix issues causing GitHub checks to fail
Jan 31, 2025
d8d5540
Merge branch 'main' into cj-zhukov/Remove-use-of-deprecated-dict_id-i…
cj-zhukov Jan 31, 2025
e9033ed
remove accidental file
alamb Jan 31, 2025
07157a7
undo deletion of test in copy.slt
Feb 3, 2025
662369f
Fix issues causing GitHub checks to fail
Feb 4, 2025
a270c95
Merge remote-tracking branch 'apache/main' into cj-zhukov/Remove-use-…
alamb Feb 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,585 changes: 2,585 additions & 0 deletions datafusion/core/src/physical_optimizer/enforce_sorting.rs

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions datafusion/proto-common/proto/datafusion_common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,7 @@ message Field {
// for complex data types like structs, unions
repeated Field children = 4;
map<string, string> metadata = 5;
int64 dict_id = 6;
bool dict_ordered = 7;
bool dict_ordered = 6;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is dict_ordered still used anywhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure, but we can open PR and handle it. I'm ready to work on it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome -- thanks. Let's get this PR ready to go and then we can work on removing that as a follow-on

}

message Timestamp{
Expand Down
55 changes: 12 additions & 43 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,21 +320,8 @@ impl TryFrom<&protobuf::Field> for Field {
type Error = Error;
fn try_from(field: &protobuf::Field) -> Result<Self, Self::Error> {
let datatype = field.arrow_type.as_deref().required("arrow_type")?;
let field = if field.dict_id != 0 {
// https://github.com/apache/datafusion/issues/14173
#[allow(deprecated)]
Self::new_dict(
field.name.as_str(),
datatype,
field.nullable,
field.dict_id,
field.dict_ordered,
)
.with_metadata(field.metadata.clone())
} else {
Self::new(field.name.as_str(), datatype, field.nullable)
.with_metadata(field.metadata.clone())
};
let field = Self::new(field.name.as_str(), datatype, field.nullable)
.with_metadata(field.metadata.clone());
Ok(field)
}
}
Expand Down Expand Up @@ -436,36 +423,18 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {

let id = dict_batch.id();

let fields_using_this_dictionary = {
// See https://github.com/apache/datafusion/issues/14173
#[allow(deprecated)]
schema.fields_with_dict_id(id)
};
let record_batch = read_record_batch(
&buffer,
dict_batch.data().unwrap(),
Arc::new(schema.clone()),
&Default::default(),
None,
&message.version(),
)?;

let first_field = fields_using_this_dictionary.first().ok_or_else(|| {
Error::General("dictionary id not found in schema while deserializing ScalarValue::List".to_string())
})?;
let values: ArrayRef = Arc::clone(record_batch.column(0));

let values: ArrayRef = match first_field.data_type() {
DataType::Dictionary(_, ref value_type) => {
// Make a fake schema for the dictionary batch.
let value = value_type.as_ref().clone();
let schema = Schema::new(vec![Field::new("", value, true)]);
// Read a single column
let record_batch = read_record_batch(
&buffer,
dict_batch.data().unwrap(),
Arc::new(schema),
&Default::default(),
None,
&message.version(),
)?;
Ok(Arc::clone(record_batch.column(0)))
}
_ => Err(Error::General("dictionary id not found in schema while deserializing ScalarValue::List".to_string())),
}?;

Ok((id,values))
Ok((id, values))
}).collect::<datafusion_common::Result<HashMap<_, _>>>()?;

let record_batch = read_record_batch(
Expand Down
23 changes: 0 additions & 23 deletions datafusion/proto-common/src/generated/pbjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3107,9 +3107,6 @@ impl serde::Serialize for Field {
if !self.metadata.is_empty() {
len += 1;
}
if self.dict_id != 0 {
len += 1;
}
if self.dict_ordered {
len += 1;
}
Expand All @@ -3129,19 +3126,13 @@ impl serde::Serialize for Field {
if !self.metadata.is_empty() {
struct_ser.serialize_field("metadata", &self.metadata)?;
}
if self.dict_id != 0 {
#[allow(clippy::needless_borrow)]
#[allow(clippy::needless_borrows_for_generic_args)]
struct_ser.serialize_field("dictId", ToString::to_string(&self.dict_id).as_str())?;
}
if self.dict_ordered {
struct_ser.serialize_field("dictOrdered", &self.dict_ordered)?;
}
struct_ser.end()
}
}
impl<'de> serde::Deserialize<'de> for Field {
#[allow(deprecated)]
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
Expand All @@ -3153,8 +3144,6 @@ impl<'de> serde::Deserialize<'de> for Field {
"nullable",
"children",
"metadata",
"dict_id",
"dictId",
"dict_ordered",
"dictOrdered",
];
Expand All @@ -3166,7 +3155,6 @@ impl<'de> serde::Deserialize<'de> for Field {
Nullable,
Children,
Metadata,
DictId,
DictOrdered,
}
impl<'de> serde::Deserialize<'de> for GeneratedField {
Expand Down Expand Up @@ -3194,7 +3182,6 @@ impl<'de> serde::Deserialize<'de> for Field {
"nullable" => Ok(GeneratedField::Nullable),
"children" => Ok(GeneratedField::Children),
"metadata" => Ok(GeneratedField::Metadata),
"dictId" | "dict_id" => Ok(GeneratedField::DictId),
"dictOrdered" | "dict_ordered" => Ok(GeneratedField::DictOrdered),
_ => Err(serde::de::Error::unknown_field(value, FIELDS)),
}
Expand All @@ -3220,7 +3207,6 @@ impl<'de> serde::Deserialize<'de> for Field {
let mut nullable__ = None;
let mut children__ = None;
let mut metadata__ = None;
let mut dict_id__ = None;
let mut dict_ordered__ = None;
while let Some(k) = map_.next_key()? {
match k {
Expand Down Expand Up @@ -3256,14 +3242,6 @@ impl<'de> serde::Deserialize<'de> for Field {
map_.next_value::<std::collections::HashMap<_, _>>()?
);
}
GeneratedField::DictId => {
if dict_id__.is_some() {
return Err(serde::de::Error::duplicate_field("dictId"));
}
dict_id__ =
Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
;
}
GeneratedField::DictOrdered => {
if dict_ordered__.is_some() {
return Err(serde::de::Error::duplicate_field("dictOrdered"));
Expand All @@ -3278,7 +3256,6 @@ impl<'de> serde::Deserialize<'de> for Field {
nullable: nullable__.unwrap_or_default(),
children: children__.unwrap_or_default(),
metadata: metadata__.unwrap_or_default(),
dict_id: dict_id__.unwrap_or_default(),
dict_ordered: dict_ordered__.unwrap_or_default(),
})
}
Expand Down
4 changes: 1 addition & 3 deletions datafusion/proto-common/src/generated/prost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,7 @@ pub struct Field {
::prost::alloc::string::String,
::prost::alloc::string::String,
>,
#[prost(int64, tag = "6")]
pub dict_id: i64,
#[prost(bool, tag = "7")]
#[prost(bool, tag = "6")]
pub dict_ordered: bool,
}
#[derive(Clone, PartialEq, ::prost::Message)]
Expand Down
3 changes: 0 additions & 3 deletions datafusion/proto-common/src/to_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,6 @@ impl TryFrom<&Field> for protobuf::Field {
nullable: field.is_nullable(),
children: Vec::new(),
metadata: field.metadata().clone(),
#[allow(deprecated)]
// See https://github.com/apache/datafusion/issues/14173 to remove deprecated dict_id
dict_id: field.dict_id().unwrap_or(0),
dict_ordered: field.dict_is_ordered().unwrap_or(false),
})
}
Expand Down
4 changes: 1 addition & 3 deletions datafusion/proto/src/generated/datafusion_proto_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,7 @@ pub struct Field {
::prost::alloc::string::String,
::prost::alloc::string::String,
>,
#[prost(int64, tag = "6")]
pub dict_id: i64,
#[prost(bool, tag = "7")]
#[prost(bool, tag = "6")]
pub dict_ordered: bool,
}
#[derive(Clone, PartialEq, ::prost::Message)]
Expand Down
72 changes: 0 additions & 72 deletions datafusion/proto/tests/cases/roundtrip_logical_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1494,20 +1494,6 @@ fn round_trip_scalar_values_and_data_types() {
Field::new("b", DataType::Boolean, false),
ScalarValue::from(false),
)
.with_scalar(
Field::new(
"c",
DataType::Dictionary(
Box::new(DataType::UInt16),
Box::new(DataType::Utf8),
),
false,
),
ScalarValue::Dictionary(
Box::new(DataType::UInt16),
Box::new("value".into()),
),
)
.build()
.unwrap(),
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
Expand All @@ -1518,25 +1504,6 @@ fn round_trip_scalar_values_and_data_types() {
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int32, true),
Field::new("b", DataType::Boolean, false),
Field::new(
"c",
DataType::Dictionary(
Box::new(DataType::UInt16),
Box::new(DataType::Binary),
),
false,
),
Field::new(
"d",
DataType::new_list(
DataType::Dictionary(
Box::new(DataType::UInt16),
Box::new(DataType::Binary),
),
false,
),
false,
),
])))
.unwrap(),
ScalarValue::try_from(&DataType::Map(
Expand Down Expand Up @@ -1815,45 +1782,6 @@ fn round_trip_datatype() {
}
}

// See https://github.com/apache/datafusion/issues/14173 to remove deprecated dict_id
#[allow(deprecated)]
#[test]
fn roundtrip_dict_id() -> Result<()> {
let dict_id = 42;
let field = Field::new(
"keys",
DataType::List(Arc::new(Field::new_dict(
"item",
DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
true,
dict_id,
false,
))),
false,
);
let schema = Arc::new(Schema::new(vec![field]));

// encode
let mut buf: Vec<u8> = vec![];
let schema_proto: protobuf::Schema = schema.try_into().unwrap();
schema_proto.encode(&mut buf).unwrap();

// decode
let schema_proto = protobuf::Schema::decode(buf.as_slice()).unwrap();
let decoded: Schema = (&schema_proto).try_into()?;

// assert
let keys = decoded.fields().iter().last().unwrap();
match keys.data_type() {
DataType::List(field) => {
assert_eq!(field.dict_id(), Some(dict_id), "dict_id should be retained");
}
_ => panic!("Invalid type"),
}

Ok(())
}

#[test]
fn roundtrip_null_scalar_values() {
let test_types = vec![
Expand Down
20 changes: 0 additions & 20 deletions datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -538,26 +538,6 @@ select * from validate_arrow_file;
1 Foo
2 Bar

# Copy from dict encoded values to single arrow file
query I
COPY (values
('c', arrow_cast('foo', 'Dictionary(Int32, Utf8)')), ('d', arrow_cast('bar', 'Dictionary(Int32, Utf8)')))
to 'test_files/scratch/copy/table_dict.arrow' STORED AS ARROW;
----
2

# Validate single csv output
statement ok
CREATE EXTERNAL TABLE validate_arrow_file_dict
STORED AS arrow
LOCATION 'test_files/scratch/copy/table_dict.arrow';

query TT
select * from validate_arrow_file_dict;
----
c foo
d bar


# Copy from table to folder of json
query I
Expand Down
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/regexp.slt
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,8 @@ SELECT 'foo\nbar\nbaz' ~ 'bar';
true

statement error
Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata
: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata
: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata: {} })
select [1,2] ~ [3];

query B
Expand Down