From 5096d740e54eb1a428d41400c6863951d8146f06 Mon Sep 17 00:00:00 2001
From: Neil Chao
Date: Mon, 24 Feb 2025 15:46:36 -0800
Subject: [PATCH] Variant arrow extension type

---
 cpp/src/arrow/CMakeLists.txt            |  1 +
 cpp/src/arrow/extension/CMakeLists.txt  |  2 +-
 cpp/src/arrow/extension/variant.cc      | 95 +++++++++++++++++++++++++
 cpp/src/arrow/extension/variant.h       | 68 ++++++++++++++++++
 cpp/src/arrow/extension/variant_test.cc | 48 +++++++++++++
 cpp/src/parquet/types.h                 |  2 +-
 6 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 cpp/src/arrow/extension/variant.cc
 create mode 100644 cpp/src/arrow/extension/variant.h
 create mode 100644 cpp/src/arrow/extension/variant_test.cc

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index eb9860b240f16..d4177e53494f1 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -377,6 +377,7 @@ set(ARROW_SRCS
     extension/bool8.cc
     extension/json.cc
     extension/uuid.cc
+    extension/variant.cc
     pretty_print.cc
     record_batch.cc
     result.cc
diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt
index 4ab6a35b52e4f..5a39526453639 100644
--- a/cpp/src/arrow/extension/CMakeLists.txt
+++ b/cpp/src/arrow/extension/CMakeLists.txt
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc)
+set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc variant_test.cc)
 
 if(ARROW_JSON)
   list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc)
diff --git a/cpp/src/arrow/extension/variant.cc b/cpp/src/arrow/extension/variant.cc
new file mode 100644
index 0000000000000..c6768d05658ec
--- /dev/null
+++ b/cpp/src/arrow/extension/variant.cc
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/variant.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/extension_type.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/logging.h"
+
+namespace arrow::extension {
+
+namespace {
+
+// Returns true for the variable-length binary type ids (BINARY, LARGE_BINARY).
+// Not called yet; kept for the field checks still pending in
+// IsSupportedStorageType.
+[[maybe_unused]] bool IsBinary(Type::type type) {
+  return type == Type::BINARY || type == Type::LARGE_BINARY;
+}
+
+}  // namespace
+
+// Two variant types are equal when the other type reports the same extension
+// name and an equal storage type.
+bool VariantExtensionType::ExtensionEquals(const ExtensionType& other) const {
+  return other.extension_name() == this->extension_name() &&
+         other.storage_type()->Equals(storage_type());
+}
+
+// Serialize() emits no parameters, so the serialized payload is ignored and only
+// the storage type is validated.
+Result<std::shared_ptr<DataType>> VariantExtensionType::Deserialize(
+    std::shared_ptr<DataType> storage_type,
+    const std::string& /*serialized_data*/) const {
+  return VariantExtensionType::Make(std::move(storage_type));
+}
+
+std::string VariantExtensionType::Serialize() const { return ""; }
+
+std::shared_ptr<Array> VariantExtensionType::MakeArray(
+    std::shared_ptr<ArrayData> data) const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  return std::make_shared<ExtensionArray>(std::move(data));
+}
+
+// Accepts any struct type with exactly three fields; a null pointer or a
+// non-struct type is rejected.
+bool VariantExtensionType::IsSupportedStorageType(
+    const std::shared_ptr<DataType>& storage_type) {
+  if (storage_type == nullptr || storage_type->id() != Type::STRUCT) {
+    return false;
+  }
+  // TODO(neilechao) assertions for binary types, and non-nullable first field for
+  // metadata
+  return storage_type->num_fields() == 3;
+}
+
+Result<std::shared_ptr<DataType>> VariantExtensionType::Make(
+    std::shared_ptr<DataType> storage_type) {
+  if (!IsSupportedStorageType(storage_type)) {
+    return Status::Invalid(
+        "Invalid storage type for VariantExtensionType, must be struct with binary "
+        "metadata, value, and typed_value fields: ",
+        // A null storage type has nothing to print.
+        storage_type ? storage_type->ToString() : "(null)");
+  }
+  return std::make_shared<VariantExtensionType>(std::move(storage_type));
+}
+
+// Dies inside ValueOrDie() when Make() rejects the storage type.
+std::shared_ptr<DataType> variant(std::shared_ptr<DataType> storage_type) {
+  return VariantExtensionType::Make(std::move(storage_type)).ValueOrDie();
+}
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/variant.h b/cpp/src/arrow/extension/variant.h
new file mode 100644
index 0000000000000..f9d0b779b8793
--- /dev/null
+++ b/cpp/src/arrow/extension/variant.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::extension {
+
+/// \brief Extension type over a struct storage type holding variant data.
+///
+/// The type has no parameters of its own: Serialize() returns an empty string and
+/// Deserialize() only validates the storage type.
+class ARROW_EXPORT VariantExtensionType : public ExtensionType {
+ public:
+  // The ExtensionType base class keeps the storage type; read it back through
+  // storage_type().
+  explicit VariantExtensionType(const std::shared_ptr<DataType>& storage_type)
+      : ExtensionType(storage_type) {}
+
+  // NOTE(review): "variant.json" looks carried over from the JSON extension type;
+  // confirm the intended registered name before this type is released.
+  std::string extension_name() const override { return "variant.json"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  std::string Serialize() const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  /// \brief Create a VariantExtensionType, or return Invalid when the storage type
+  /// is not accepted by IsSupportedStorageType().
+  static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<DataType> storage_type);
+
+  /// \brief Whether storage_type is a non-null struct type with exactly three fields.
+  static bool IsSupportedStorageType(const std::shared_ptr<DataType>& storage_type);
+};
+
+/// \brief Return a VariantExtensionType instance.
+///
+/// Dies (Result::ValueOrDie) when VariantExtensionType::Make rejects storage_type.
+ARROW_EXPORT std::shared_ptr<DataType> variant(std::shared_ptr<DataType> storage_type);
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/variant_test.cc b/cpp/src/arrow/extension/variant_test.cc
new file mode 100644
index 0000000000000..e87b028ef858d
--- /dev/null
+++ b/cpp/src/arrow/extension/variant_test.cc
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/variant.h"
+
+#include "arrow/array/validate.h"
+#include "arrow/ipc/test_common.h"
+#include "arrow/record_batch.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type.h"
+#include "parquet/exception.h"
+
+namespace arrow {
+
+using arrow::ipc::test::RoundtripBatch;
+
+class TestVariantExtensionType : public ::testing::Test {};
+
+// TODO(neilechao) add the IPC round-trip check (RoundtripBatch) once variant
+// arrays can be constructed; this replaces an unconditional ASSERT_TRUE(false).
+TEST_F(TestVariantExtensionType, MakeChecksStorageType) {
+  auto storage = struct_({field("metadata", binary(), /*nullable=*/false),
+                          field("value", binary()), field("typed_value", binary())});
+  ASSERT_OK_AND_ASSIGN(auto type, extension::VariantExtensionType::Make(storage));
+  ASSERT_EQ(Type::EXTENSION, type->id());
+  ASSERT_TRUE(type->Equals(*extension::variant(storage)));
+
+  // Non-struct types and structs without exactly three fields are rejected.
+  ASSERT_RAISES(Invalid, extension::VariantExtensionType::Make(binary()));
+  auto two_fields = struct_({field("metadata", binary()), field("value", binary())});
+  ASSERT_RAISES(Invalid, extension::VariantExtensionType::Make(two_fields));
+}
+
+}  // namespace arrow
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index c89d02b912e10..1ad6dc9628617 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -449,7 +449,7 @@ class PARQUET_EXPORT Float16LogicalType : public LogicalType {
   Float16LogicalType() = default;
 };
 
-/// \brief Allowed for physical type BYTE_ARRAY.
+/// \brief Allowed for group nodes only.
 class PARQUET_EXPORT VariantLogicalType : public LogicalType {
  public:
   static std::shared_ptr<const LogicalType> Make();