From 993aa7c86f470c427e14b090c319a20d1fffebb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Fri, 14 Feb 2025 20:43:26 +0000 Subject: [PATCH] update datafusion proto --- ballista/core/proto/datafusion.proto | 2501 ++++++++++--------- ballista/core/proto/datafusion_common.proto | 1114 ++++----- 2 files changed, 1819 insertions(+), 1796 deletions(-) diff --git a/ballista/core/proto/datafusion.proto b/ballista/core/proto/datafusion.proto index e27f8af21..f0306f31a 100644 --- a/ballista/core/proto/datafusion.proto +++ b/ballista/core/proto/datafusion.proto @@ -16,1243 +16,1264 @@ * limitations under the License. */ -syntax = "proto3"; - -package datafusion; - -option java_multiple_files = true; -option java_package = "org.apache.arrow.datafusion.protobuf"; -option java_outer_classname = "DatafusionProto"; - -import "datafusion_common.proto"; - -// logical plan -// LogicalPlan is a nested type -message LogicalPlanNode { - oneof LogicalPlanType { - ListingTableScanNode listing_scan = 1; - ProjectionNode projection = 3; - SelectionNode selection = 4; - LimitNode limit = 5; - AggregateNode aggregate = 6; - JoinNode join = 7; - SortNode sort = 8; - RepartitionNode repartition = 9; - EmptyRelationNode empty_relation = 10; - CreateExternalTableNode create_external_table = 11; - ExplainNode explain = 12; - WindowNode window = 13; - AnalyzeNode analyze = 14; - CrossJoinNode cross_join = 15; - ValuesNode values = 16; - LogicalExtensionNode extension = 17; - CreateCatalogSchemaNode create_catalog_schema = 18; - UnionNode union = 19; - CreateCatalogNode create_catalog = 20; - SubqueryAliasNode subquery_alias = 21; - CreateViewNode create_view = 22; - DistinctNode distinct = 23; - ViewTableScanNode view_scan = 24; - CustomTableScanNode custom_scan = 25; - PrepareNode prepare = 26; - DropViewNode drop_view = 27; - DistinctOnNode distinct_on = 28; - CopyToNode copy_to = 29; - UnnestNode unnest = 30; - RecursiveQueryNode recursive_query = 31; - CteWorkTableScanNode cte_work_table_scan = 32; - } -} - -message LogicalExtensionNode { - bytes node = 1; - repeated LogicalPlanNode inputs = 2; -} - -message ProjectionColumns { - repeated string columns = 1; -} - -message LogicalExprNodeCollection { - repeated LogicalExprNode logical_expr_nodes = 1; -} - -message SortExprNodeCollection { - repeated SortExprNode sort_expr_nodes = 1; -} - -message ListingTableScanNode { - reserved 1; // was string table_name - TableReference table_name = 14; - repeated string paths = 2; - string file_extension = 3; - ProjectionColumns projection = 4; - datafusion_common.Schema schema = 5; - repeated LogicalExprNode filters = 6; - repeated string table_partition_cols = 7; - bool collect_stat = 8; - uint32 target_partitions = 9; - oneof FileFormatType { - datafusion_common.CsvFormat csv = 10; - datafusion_common.ParquetFormat parquet = 11; - datafusion_common.AvroFormat avro = 12; - datafusion_common.NdJsonFormat json = 15; - } - repeated SortExprNodeCollection file_sort_order = 13; -} - -message ViewTableScanNode { - reserved 1; // was string table_name - TableReference table_name = 6; - LogicalPlanNode input = 2; - datafusion_common.Schema schema = 3; - ProjectionColumns projection = 4; - string definition = 5; -} - -// Logical Plan to Scan a CustomTableProvider registered at runtime -message CustomTableScanNode { - reserved 1; // was string table_name - TableReference table_name = 6; - ProjectionColumns projection = 2; - datafusion_common.Schema schema = 3; - repeated LogicalExprNode filters = 4; - bytes custom_table_data = 5; -} - -message ProjectionNode { - LogicalPlanNode input = 1; - repeated LogicalExprNode expr = 2; - oneof optional_alias { - string alias = 3; - } -} - -message SelectionNode { - LogicalPlanNode input = 1; - LogicalExprNode expr = 2; -} - -message SortNode { - LogicalPlanNode input = 1; - repeated SortExprNode expr = 2; - // Maximum number of highest/lowest rows to fetch; negative means no limit - int64 fetch = 3; -} - -message RepartitionNode { - LogicalPlanNode input = 1; - oneof partition_method { - uint64 round_robin = 2; - HashRepartition hash = 3; - } -} - -message HashRepartition { - repeated LogicalExprNode hash_expr = 1; - uint64 partition_count = 2; -} - -message EmptyRelationNode { - bool produce_one_row = 1; -} - -message CreateExternalTableNode { - reserved 1; // was string name - TableReference name = 9; - string location = 2; - string file_type = 3; - datafusion_common.DfSchema schema = 4; - repeated string table_partition_cols = 5; - bool if_not_exists = 6; - bool temporary = 14; - string definition = 7; - repeated SortExprNodeCollection order_exprs = 10; - bool unbounded = 11; - map options = 8; - datafusion_common.Constraints constraints = 12; - map column_defaults = 13; -} - -message PrepareNode { - string name = 1; - repeated datafusion_common.ArrowType data_types = 2; - LogicalPlanNode input = 3; -} - -message CreateCatalogSchemaNode { - string schema_name = 1; - bool if_not_exists = 2; - datafusion_common.DfSchema schema = 3; -} - -message CreateCatalogNode { - string catalog_name = 1; - bool if_not_exists = 2; - datafusion_common.DfSchema schema = 3; -} - -message DropViewNode { - TableReference name = 1; - bool if_exists = 2; - datafusion_common.DfSchema schema = 3; -} - -message CreateViewNode { - reserved 1; // was string name - TableReference name = 5; - LogicalPlanNode input = 2; - bool or_replace = 3; - bool temporary = 6; - string definition = 4; -} - -// a node containing data for defining values list. unlike in SQL where it's two dimensional, here -// the list is flattened, and with the field n_cols it can be parsed and partitioned into rows -message ValuesNode { - uint64 n_cols = 1; - repeated LogicalExprNode values_list = 2; -} - -message AnalyzeNode { - LogicalPlanNode input = 1; - bool verbose = 2; -} - -message ExplainNode { - LogicalPlanNode input = 1; - bool verbose = 2; -} - -message AggregateNode { - LogicalPlanNode input = 1; - repeated LogicalExprNode group_expr = 2; - repeated LogicalExprNode aggr_expr = 3; -} - -message WindowNode { - LogicalPlanNode input = 1; - repeated LogicalExprNode window_expr = 2; -} - -message JoinNode { - LogicalPlanNode left = 1; - LogicalPlanNode right = 2; - datafusion_common.JoinType join_type = 3; - datafusion_common.JoinConstraint join_constraint = 4; - repeated LogicalExprNode left_join_key = 5; - repeated LogicalExprNode right_join_key = 6; - bool null_equals_null = 7; - LogicalExprNode filter = 8; -} - -message DistinctNode { - LogicalPlanNode input = 1; -} - -message DistinctOnNode { - repeated LogicalExprNode on_expr = 1; - repeated LogicalExprNode select_expr = 2; - repeated SortExprNode sort_expr = 3; - LogicalPlanNode input = 4; -} - -message CopyToNode { - LogicalPlanNode input = 1; - string output_url = 2; - bytes file_type = 3; - repeated string partition_by = 7; -} - -message UnnestNode { - LogicalPlanNode input = 1; - repeated datafusion_common.Column exec_columns = 2; - repeated ColumnUnnestListItem list_type_columns = 3; - repeated uint64 struct_type_columns = 4; - repeated uint64 dependency_indices = 5; - datafusion_common.DfSchema schema = 6; - UnnestOptions options = 7; -} -message ColumnUnnestListItem { - uint32 input_index = 1; - ColumnUnnestListRecursion recursion = 2; -} - -message ColumnUnnestListRecursions { - repeated ColumnUnnestListRecursion recursions = 2; -} - -message ColumnUnnestListRecursion { - datafusion_common.Column output_column = 1; - uint32 depth = 2; -} - -message UnnestOptions { - bool preserve_nulls = 1; - repeated RecursionUnnestOption recursions = 2; -} - -message RecursionUnnestOption { - datafusion_common.Column output_column = 1; - datafusion_common.Column input_column = 2; - uint32 depth = 3; -} - -message UnionNode { - repeated LogicalPlanNode inputs = 1; -} - -message CrossJoinNode { - LogicalPlanNode left = 1; - LogicalPlanNode right = 2; -} - -message LimitNode { - LogicalPlanNode input = 1; - // The number of rows to skip before fetch; non-positive means don't skip any - int64 skip = 2; - // Maximum number of rows to fetch; negative means no limit - int64 fetch = 3; -} - -message SelectionExecNode { - LogicalExprNode expr = 1; -} - -message SubqueryAliasNode { - reserved 2; // Was string alias - LogicalPlanNode input = 1; - TableReference alias = 3; -} - -// logical expressions -message LogicalExprNode { - oneof ExprType { - // column references - datafusion_common.Column column = 1; - - // alias - AliasNode alias = 2; - - datafusion_common.ScalarValue literal = 3; - - // binary expressions - BinaryExprNode binary_expr = 4; - - - // null checks - IsNull is_null_expr = 6; - IsNotNull is_not_null_expr = 7; - Not not_expr = 8; - - BetweenNode between = 9; - CaseNode case_ = 10; - CastNode cast = 11; - NegativeNode negative = 13; - InListNode in_list = 14; - Wildcard wildcard = 15; - // was ScalarFunctionNode scalar_function = 16; - TryCastNode try_cast = 17; - - // window expressions - WindowExprNode window_expr = 18; - - // AggregateUDF expressions - AggregateUDFExprNode aggregate_udf_expr = 19; - - // Scalar UDF expressions - ScalarUDFExprNode scalar_udf_expr = 20; - - // GetIndexedField get_indexed_field = 21; - - GroupingSetNode grouping_set = 22; - - CubeNode cube = 23; - - RollupNode rollup = 24; - - IsTrue is_true = 25; - IsFalse is_false = 26; - IsUnknown is_unknown = 27; - IsNotTrue is_not_true = 28; - IsNotFalse is_not_false = 29; - IsNotUnknown is_not_unknown = 30; - LikeNode like = 31; - ILikeNode ilike = 32; - SimilarToNode similar_to = 33; - - PlaceholderNode placeholder = 34; - - Unnest unnest = 35; - - } -} - -message Wildcard { - TableReference qualifier = 1; -} - -message PlaceholderNode { - string id = 1; - datafusion_common.ArrowType data_type = 2; -} - -message LogicalExprList { - repeated LogicalExprNode expr = 1; -} - -message GroupingSetNode { - repeated LogicalExprList expr = 1; -} - -message CubeNode { - repeated LogicalExprNode expr = 1; -} - -message RollupNode { - repeated LogicalExprNode expr = 1; -} - -message NamedStructField { - datafusion_common.ScalarValue name = 1; -} - -message ListIndex { - LogicalExprNode key = 1; -} - -message ListRange { - LogicalExprNode start = 1; - LogicalExprNode stop = 2; - LogicalExprNode stride = 3; -} - -message IsNull { - LogicalExprNode expr = 1; -} - -message IsNotNull { - LogicalExprNode expr = 1; -} - -message IsTrue { - LogicalExprNode expr = 1; -} - -message IsFalse { - LogicalExprNode expr = 1; -} - -message IsUnknown { - LogicalExprNode expr = 1; -} - -message IsNotTrue { - LogicalExprNode expr = 1; -} - -message IsNotFalse { - LogicalExprNode expr = 1; -} - -message IsNotUnknown { - LogicalExprNode expr = 1; -} - -message Not { - LogicalExprNode expr = 1; -} - -message AliasNode { - LogicalExprNode expr = 1; - string alias = 2; - repeated TableReference relation = 3; -} - -message BinaryExprNode { - // Represents the operands from the left inner most expression - // to the right outer most expression where each of them are chained - // with the operator 'op'. - repeated LogicalExprNode operands = 1; - string op = 3; -} - -message NegativeNode { - LogicalExprNode expr = 1; -} - -message Unnest { - repeated LogicalExprNode exprs = 1; -} - -message InListNode { - LogicalExprNode expr = 1; - repeated LogicalExprNode list = 2; - bool negated = 3; -} - - -message AggregateUDFExprNode { - string fun_name = 1; - repeated LogicalExprNode args = 2; - bool distinct = 5; - LogicalExprNode filter = 3; - repeated SortExprNode order_by = 4; - optional bytes fun_definition = 6; -} - -message ScalarUDFExprNode { - string fun_name = 1; - repeated LogicalExprNode args = 2; - optional bytes fun_definition = 3; -} - -message WindowExprNode { - oneof window_function { - // BuiltInWindowFunction built_in_function = 2; - string udaf = 3; - string udwf = 9; - } - repeated LogicalExprNode exprs = 4; - repeated LogicalExprNode partition_by = 5; - repeated SortExprNode order_by = 6; - // repeated LogicalExprNode filter = 7; - WindowFrame window_frame = 8; - optional bytes fun_definition = 10; -} - -message BetweenNode { - LogicalExprNode expr = 1; - bool negated = 2; - LogicalExprNode low = 3; - LogicalExprNode high = 4; -} - -message LikeNode { - bool negated = 1; - LogicalExprNode expr = 2; - LogicalExprNode pattern = 3; - string escape_char = 4; -} - -message ILikeNode { - bool negated = 1; - LogicalExprNode expr = 2; - LogicalExprNode pattern = 3; - string escape_char = 4; -} - -message SimilarToNode { - bool negated = 1; - LogicalExprNode expr = 2; - LogicalExprNode pattern = 3; - string escape_char = 4; -} - -message CaseNode { - LogicalExprNode expr = 1; - repeated WhenThen when_then_expr = 2; - LogicalExprNode else_expr = 3; -} - -message WhenThen { - LogicalExprNode when_expr = 1; - LogicalExprNode then_expr = 2; -} - -message CastNode { - LogicalExprNode expr = 1; - datafusion_common.ArrowType arrow_type = 2; -} - -message TryCastNode { - LogicalExprNode expr = 1; - datafusion_common.ArrowType arrow_type = 2; -} - -message SortExprNode { - LogicalExprNode expr = 1; - bool asc = 2; - bool nulls_first = 3; -} - -enum WindowFrameUnits { - ROWS = 0; - RANGE = 1; - GROUPS = 2; -} - -message WindowFrame { - WindowFrameUnits window_frame_units = 1; - WindowFrameBound start_bound = 2; - // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/tokio-rs/prost/issues/430 and https://github.com/tokio-rs/prost/pull/455) - // this syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) - oneof end_bound { - WindowFrameBound bound = 3; - } -} - -enum WindowFrameBoundType { - CURRENT_ROW = 0; - PRECEDING = 1; - FOLLOWING = 2; -} - -message WindowFrameBound { - WindowFrameBoundType window_frame_bound_type = 1; - datafusion_common.ScalarValue bound_value = 2; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Arrow Data Types -/////////////////////////////////////////////////////////////////////////////////////////////////// - -message FixedSizeBinary{ - int32 length = 1; -} - -enum DateUnit{ - Day = 0; - DateMillisecond = 1; -} - -message AnalyzedLogicalPlanType { - string analyzer_name = 1; -} - -message OptimizedLogicalPlanType { - string optimizer_name = 1; -} - -message OptimizedPhysicalPlanType { - string optimizer_name = 1; -} - -message PlanType { - oneof plan_type_enum { - datafusion_common.EmptyMessage InitialLogicalPlan = 1; - AnalyzedLogicalPlanType AnalyzedLogicalPlan = 7; - datafusion_common.EmptyMessage FinalAnalyzedLogicalPlan = 8; - OptimizedLogicalPlanType OptimizedLogicalPlan = 2; - datafusion_common.EmptyMessage FinalLogicalPlan = 3; - datafusion_common.EmptyMessage InitialPhysicalPlan = 4; - datafusion_common.EmptyMessage InitialPhysicalPlanWithStats = 9; - datafusion_common.EmptyMessage InitialPhysicalPlanWithSchema = 11; - OptimizedPhysicalPlanType OptimizedPhysicalPlan = 5; - datafusion_common.EmptyMessage FinalPhysicalPlan = 6; - datafusion_common.EmptyMessage FinalPhysicalPlanWithStats = 10; - datafusion_common.EmptyMessage FinalPhysicalPlanWithSchema = 12; - datafusion_common.EmptyMessage PhysicalPlanError = 13; - } -} - -message StringifiedPlan { - PlanType plan_type = 1; - string plan = 2; -} - -message BareTableReference { - string table = 1; -} - -message PartialTableReference { - string schema = 1; - string table = 2; -} - -message FullTableReference { - string catalog = 1; - string schema = 2; - string table = 3; -} - -message TableReference { - oneof table_reference_enum { - BareTableReference bare = 1; - PartialTableReference partial = 2; - FullTableReference full = 3; - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// PhysicalPlanNode is a nested type -message PhysicalPlanNode { - oneof PhysicalPlanType { - ParquetScanExecNode parquet_scan = 1; - CsvScanExecNode csv_scan = 2; - EmptyExecNode empty = 3; - ProjectionExecNode projection = 4; - GlobalLimitExecNode global_limit = 6; - LocalLimitExecNode local_limit = 7; - AggregateExecNode aggregate = 8; - HashJoinExecNode hash_join = 9; - SortExecNode sort = 10; - CoalesceBatchesExecNode coalesce_batches = 11; - FilterExecNode filter = 12; - CoalescePartitionsExecNode merge = 13; - RepartitionExecNode repartition = 14; - WindowAggExecNode window = 15; - CrossJoinExecNode cross_join = 16; - AvroScanExecNode avro_scan = 17; - PhysicalExtensionNode extension = 18; - UnionExecNode union = 19; - ExplainExecNode explain = 20; - SortPreservingMergeExecNode sort_preserving_merge = 21; - NestedLoopJoinExecNode nested_loop_join = 22; - AnalyzeExecNode analyze = 23; - JsonSinkExecNode json_sink = 24; - SymmetricHashJoinExecNode symmetric_hash_join = 25; - InterleaveExecNode interleave = 26; - PlaceholderRowExecNode placeholder_row = 27; - CsvSinkExecNode csv_sink = 28; - ParquetSinkExecNode parquet_sink = 29; - UnnestExecNode unnest = 30; - } -} - -message PartitionColumn { - string name = 1; - datafusion_common.ArrowType arrow_type = 2; -} - - -message FileSinkConfig { - reserved 6; // writer_mode - reserved 8; // was `overwrite` which has been superseded by `insert_op` - - string object_store_url = 1; - repeated PartitionedFile file_groups = 2; - repeated string table_paths = 3; - datafusion_common.Schema output_schema = 4; - repeated PartitionColumn table_partition_cols = 5; - bool keep_partition_by_columns = 9; - InsertOp insert_op = 10; -} - -enum InsertOp { - Append = 0; - Overwrite = 1; - Replace = 2; -} - -message JsonSink { - FileSinkConfig config = 1; - datafusion_common.JsonWriterOptions writer_options = 2; -} - -message JsonSinkExecNode { - PhysicalPlanNode input = 1; - JsonSink sink = 2; - datafusion_common.Schema sink_schema = 3; - PhysicalSortExprNodeCollection sort_order = 4; -} - -message CsvSink { - FileSinkConfig config = 1; - datafusion_common.CsvWriterOptions writer_options = 2; -} - -message CsvSinkExecNode { - PhysicalPlanNode input = 1; - CsvSink sink = 2; - datafusion_common.Schema sink_schema = 3; - PhysicalSortExprNodeCollection sort_order = 4; -} - -message ParquetSink { - FileSinkConfig config = 1; - datafusion_common.TableParquetOptions parquet_options = 2; -} - -message ParquetSinkExecNode { - PhysicalPlanNode input = 1; - ParquetSink sink = 2; - datafusion_common.Schema sink_schema = 3; - PhysicalSortExprNodeCollection sort_order = 4; -} - -message UnnestExecNode { - PhysicalPlanNode input = 1; - datafusion_common.Schema schema = 2; - repeated ListUnnest list_type_columns = 3; - repeated uint64 struct_type_columns = 4; - UnnestOptions options = 5; -} - -message ListUnnest { - uint32 index_in_input_schema = 1; - uint32 depth = 2; -} - -message PhysicalExtensionNode { - bytes node = 1; - repeated PhysicalPlanNode inputs = 2; -} - -// physical expressions -message PhysicalExprNode { - // Was date_time_interval_expr - reserved 17; - - oneof ExprType { - // column references - PhysicalColumn column = 1; - - datafusion_common.ScalarValue literal = 2; - - // binary expressions - PhysicalBinaryExprNode binary_expr = 3; - - // aggregate expressions - PhysicalAggregateExprNode aggregate_expr = 4; - - // null checks - PhysicalIsNull is_null_expr = 5; - PhysicalIsNotNull is_not_null_expr = 6; - PhysicalNot not_expr = 7; - - PhysicalCaseNode case_ = 8; - PhysicalCastNode cast = 9; - PhysicalSortExprNode sort = 10; - PhysicalNegativeNode negative = 11; - PhysicalInListNode in_list = 12; - // was PhysicalScalarFunctionNode scalar_function = 13; - PhysicalTryCastNode try_cast = 14; - // window expressions - PhysicalWindowExprNode window_expr = 15; - - PhysicalScalarUdfNode scalar_udf = 16; - // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17; - - PhysicalLikeExprNode like_expr = 18; - - PhysicalExtensionExprNode extension = 19; - - UnknownColumn unknown_column = 20; - } -} - -message PhysicalScalarUdfNode { - string name = 1; - repeated PhysicalExprNode args = 2; - optional bytes fun_definition = 3; - datafusion_common.ArrowType return_type = 4; - bool nullable = 5; -} - -message PhysicalAggregateExprNode { - oneof AggregateFunction { - string user_defined_aggr_function = 4; - } - repeated PhysicalExprNode expr = 2; - repeated PhysicalSortExprNode ordering_req = 5; - bool distinct = 3; - bool ignore_nulls = 6; - optional bytes fun_definition = 7; -} - -message PhysicalWindowExprNode { - oneof window_function { - // BuiltInWindowFunction built_in_function = 2; - string user_defined_aggr_function = 3; - string user_defined_window_function = 10; - } - repeated PhysicalExprNode args = 4; - repeated PhysicalExprNode partition_by = 5; - repeated PhysicalSortExprNode order_by = 6; - WindowFrame window_frame = 7; - string name = 8; - optional bytes fun_definition = 9; -} - -message PhysicalIsNull { - PhysicalExprNode expr = 1; -} - -message PhysicalIsNotNull { - PhysicalExprNode expr = 1; -} - -message PhysicalNot { - PhysicalExprNode expr = 1; -} - -message PhysicalAliasNode { - PhysicalExprNode expr = 1; - string alias = 2; -} - -message PhysicalBinaryExprNode { - PhysicalExprNode l = 1; - PhysicalExprNode r = 2; - string op = 3; -} - -message PhysicalDateTimeIntervalExprNode { - PhysicalExprNode l = 1; - PhysicalExprNode r = 2; - string op = 3; -} - -message PhysicalLikeExprNode { - bool negated = 1; - bool case_insensitive = 2; - PhysicalExprNode expr = 3; - PhysicalExprNode pattern = 4; -} - -message PhysicalSortExprNode { - PhysicalExprNode expr = 1; - bool asc = 2; - bool nulls_first = 3; -} - -message PhysicalWhenThen { - PhysicalExprNode when_expr = 1; - PhysicalExprNode then_expr = 2; -} - -message PhysicalInListNode { - PhysicalExprNode expr = 1; - repeated PhysicalExprNode list = 2; - bool negated = 3; -} - -message PhysicalCaseNode { - PhysicalExprNode expr = 1; - repeated PhysicalWhenThen when_then_expr = 2; - PhysicalExprNode else_expr = 3; -} - -message PhysicalTryCastNode { - PhysicalExprNode expr = 1; - datafusion_common.ArrowType arrow_type = 2; -} - -message PhysicalCastNode { - PhysicalExprNode expr = 1; - datafusion_common.ArrowType arrow_type = 2; -} - -message PhysicalNegativeNode { - PhysicalExprNode expr = 1; -} - -message PhysicalExtensionExprNode { - bytes expr = 1; - repeated PhysicalExprNode inputs = 2; -} - -message FilterExecNode { - PhysicalPlanNode input = 1; - PhysicalExprNode expr = 2; - uint32 default_filter_selectivity = 3; - repeated uint32 projection = 9; -} - -message FileGroup { - repeated PartitionedFile files = 1; -} - -message ScanLimit { - // wrap into a message to make it optional - uint32 limit = 1; -} - -message PhysicalSortExprNodeCollection { - repeated PhysicalSortExprNode physical_sort_expr_nodes = 1; -} - -message FileScanExecConf { - // Was repeated ConfigOption options = 10; - reserved 10; - - repeated FileGroup file_groups = 1; - datafusion_common.Schema schema = 2; - repeated uint32 projection = 4; - ScanLimit limit = 5; - datafusion_common.Statistics statistics = 6; - repeated string table_partition_cols = 7; - string object_store_url = 8; - repeated PhysicalSortExprNodeCollection output_ordering = 9; -} - -message ParquetScanExecNode { - FileScanExecConf base_conf = 1; - - // Was pruning predicate based on a logical expr. - reserved 2; - - PhysicalExprNode predicate = 3; -} - -message CsvScanExecNode { - FileScanExecConf base_conf = 1; - bool has_header = 2; - string delimiter = 3; - string quote = 4; - oneof optional_escape { - string escape = 5; - } - oneof optional_comment { - string comment = 6; - } - bool newlines_in_values = 7; -} - -message AvroScanExecNode { - FileScanExecConf base_conf = 1; -} - -enum PartitionMode { - COLLECT_LEFT = 0; - PARTITIONED = 1; - AUTO = 2; -} - -message HashJoinExecNode { - PhysicalPlanNode left = 1; - PhysicalPlanNode right = 2; - repeated JoinOn on = 3; - datafusion_common.JoinType join_type = 4; - PartitionMode partition_mode = 6; - bool null_equals_null = 7; - JoinFilter filter = 8; - repeated uint32 projection = 9; -} - -enum StreamPartitionMode { - SINGLE_PARTITION = 0; - PARTITIONED_EXEC = 1; -} - -message SymmetricHashJoinExecNode { - PhysicalPlanNode left = 1; - PhysicalPlanNode right = 2; - repeated JoinOn on = 3; - datafusion_common.JoinType join_type = 4; - StreamPartitionMode partition_mode = 6; - bool null_equals_null = 7; - JoinFilter filter = 8; - repeated PhysicalSortExprNode left_sort_exprs = 9; - repeated PhysicalSortExprNode right_sort_exprs = 10; -} - -message InterleaveExecNode { - repeated PhysicalPlanNode inputs = 1; -} - -message UnionExecNode { - repeated PhysicalPlanNode inputs = 1; -} - -message ExplainExecNode { - datafusion_common.Schema schema = 1; - repeated StringifiedPlan stringified_plans = 2; - bool verbose = 3; -} - -message AnalyzeExecNode { - bool verbose = 1; - bool show_statistics = 2; - PhysicalPlanNode input = 3; - datafusion_common.Schema schema = 4; -} - -message CrossJoinExecNode { - PhysicalPlanNode left = 1; - PhysicalPlanNode right = 2; -} - -message PhysicalColumn { - string name = 1; - uint32 index = 2; -} - -message UnknownColumn { - string name = 1; -} - -message JoinOn { - PhysicalExprNode left = 1; - PhysicalExprNode right = 2; -} - -message EmptyExecNode { - datafusion_common.Schema schema = 1; -} - -message PlaceholderRowExecNode { - datafusion_common.Schema schema = 1; -} - -message ProjectionExecNode { - PhysicalPlanNode input = 1; - repeated PhysicalExprNode expr = 2; - repeated string expr_name = 3; -} - -enum AggregateMode { - PARTIAL = 0; - FINAL = 1; - FINAL_PARTITIONED = 2; - SINGLE = 3; - SINGLE_PARTITIONED = 4; -} - -message PartiallySortedInputOrderMode { - repeated uint64 columns = 6; -} - -message WindowAggExecNode { - PhysicalPlanNode input = 1; - repeated PhysicalWindowExprNode window_expr = 2; - repeated PhysicalExprNode partition_keys = 5; - // Set optional to `None` for `BoundedWindowAggExec`. - oneof input_order_mode { - datafusion_common.EmptyMessage linear = 7; - PartiallySortedInputOrderMode partially_sorted = 8; - datafusion_common.EmptyMessage sorted = 9; - } -} - -message MaybeFilter { - PhysicalExprNode expr = 1; -} - -message MaybePhysicalSortExprs { - repeated PhysicalSortExprNode sort_expr = 1; -} - -message AggLimit { - // wrap into a message to make it optional - uint64 limit = 1; -} - -message AggregateExecNode { - repeated PhysicalExprNode group_expr = 1; - repeated PhysicalExprNode aggr_expr = 2; - AggregateMode mode = 3; - PhysicalPlanNode input = 4; - repeated string group_expr_name = 5; - repeated string aggr_expr_name = 6; - // we need the input schema to the partial aggregate to pass to the final aggregate - datafusion_common.Schema input_schema = 7; - repeated PhysicalExprNode null_expr = 8; - repeated bool groups = 9; - repeated MaybeFilter filter_expr = 10; - AggLimit limit = 11; -} - -message GlobalLimitExecNode { - PhysicalPlanNode input = 1; - // The number of rows to skip before fetch - uint32 skip = 2; - // Maximum number of rows to fetch; negative means no limit - int64 fetch = 3; -} - -message LocalLimitExecNode { - PhysicalPlanNode input = 1; - uint32 fetch = 2; -} - -message SortExecNode { - PhysicalPlanNode input = 1; - repeated PhysicalExprNode expr = 2; - // Maximum number of highest/lowest rows to fetch; negative means no limit - int64 fetch = 3; - bool preserve_partitioning = 4; -} - -message SortPreservingMergeExecNode { - PhysicalPlanNode input = 1; - repeated PhysicalExprNode expr = 2; - // Maximum number of highest/lowest rows to fetch; negative means no limit - int64 fetch = 3; -} - -message NestedLoopJoinExecNode { - PhysicalPlanNode left = 1; - PhysicalPlanNode right = 2; - datafusion_common.JoinType join_type = 3; - JoinFilter filter = 4; -} - -message CoalesceBatchesExecNode { - PhysicalPlanNode input = 1; - uint32 target_batch_size = 2; - optional uint32 fetch = 3; -} - -message CoalescePartitionsExecNode { - PhysicalPlanNode input = 1; -} - -message PhysicalHashRepartition { - repeated PhysicalExprNode hash_expr = 1; - uint64 partition_count = 2; -} - -message RepartitionExecNode{ - PhysicalPlanNode input = 1; - // oneof partition_method { - // uint64 round_robin = 2; - // PhysicalHashRepartition hash = 3; - // uint64 unknown = 4; - // } - Partitioning partitioning = 5; -} - -message Partitioning { - oneof partition_method { - uint64 round_robin = 1; - PhysicalHashRepartition hash = 2; - uint64 unknown = 3; - } -} - -message JoinFilter{ - PhysicalExprNode expression = 1; - repeated ColumnIndex column_indices = 2; - datafusion_common.Schema schema = 3; -} - -message ColumnIndex{ - uint32 index = 1; - datafusion_common.JoinSide side = 2; -} - -message PartitionedFile { - string path = 1; - uint64 size = 2; - uint64 last_modified_ns = 3; - repeated datafusion_common.ScalarValue partition_values = 4; - FileRange range = 5; - datafusion_common.Statistics statistics = 6; -} - -message FileRange { - int64 start = 1; - int64 end = 2; -} - -message PartitionStats { - int64 num_rows = 1; - int64 num_batches = 2; - int64 num_bytes = 3; - repeated datafusion_common.ColumnStats column_stats = 4; -} - -message RecursiveQueryNode { - string name = 1; - LogicalPlanNode static_term = 2; - LogicalPlanNode recursive_term = 3; - bool is_distinct = 4; -} - -message CteWorkTableScanNode { - string name = 1; - datafusion_common.Schema schema = 2; -} \ No newline at end of file + syntax = "proto3"; + + package datafusion; + + option java_multiple_files = true; + option java_package = "org.apache.arrow.datafusion.protobuf"; + option java_outer_classname = "DatafusionProto"; + + import "datafusion_common.proto"; + + // logical plan + // LogicalPlan is a nested type + message LogicalPlanNode { + oneof LogicalPlanType { + ListingTableScanNode listing_scan = 1; + ProjectionNode projection = 3; + SelectionNode selection = 4; + LimitNode limit = 5; + AggregateNode aggregate = 6; + JoinNode join = 7; + SortNode sort = 8; + RepartitionNode repartition = 9; + EmptyRelationNode empty_relation = 10; + CreateExternalTableNode create_external_table = 11; + ExplainNode explain = 12; + WindowNode window = 13; + AnalyzeNode analyze = 14; + CrossJoinNode cross_join = 15; + ValuesNode values = 16; + LogicalExtensionNode extension = 17; + CreateCatalogSchemaNode create_catalog_schema = 18; + UnionNode union = 19; + CreateCatalogNode create_catalog = 20; + SubqueryAliasNode subquery_alias = 21; + CreateViewNode create_view = 22; + DistinctNode distinct = 23; + ViewTableScanNode view_scan = 24; + CustomTableScanNode custom_scan = 25; + PrepareNode prepare = 26; + DropViewNode drop_view = 27; + DistinctOnNode distinct_on = 28; + CopyToNode copy_to = 29; + UnnestNode unnest = 30; + RecursiveQueryNode recursive_query = 31; + CteWorkTableScanNode cte_work_table_scan = 32; + DmlNode dml = 33; + } + } + + message LogicalExtensionNode { + bytes node = 1; + repeated LogicalPlanNode inputs = 2; + } + + message ProjectionColumns { + repeated string columns = 1; + } + + message LogicalExprNodeCollection { + repeated LogicalExprNode logical_expr_nodes = 1; + } + + message SortExprNodeCollection { + repeated SortExprNode sort_expr_nodes = 1; + } + + message ListingTableScanNode { + reserved 1; // was string table_name + TableReference table_name = 14; + repeated string paths = 2; + string file_extension = 3; + ProjectionColumns projection = 4; + datafusion_common.Schema schema = 5; + repeated LogicalExprNode filters = 6; + repeated string table_partition_cols = 7; + bool collect_stat = 8; + uint32 target_partitions = 9; + oneof FileFormatType { + datafusion_common.CsvFormat csv = 10; + datafusion_common.ParquetFormat parquet = 11; + datafusion_common.AvroFormat avro = 12; + datafusion_common.NdJsonFormat json = 15; + } + repeated SortExprNodeCollection file_sort_order = 13; + } + + message ViewTableScanNode { + reserved 1; // was string table_name + TableReference table_name = 6; + LogicalPlanNode input = 2; + datafusion_common.Schema schema = 3; + ProjectionColumns projection = 4; + string definition = 5; + } + + // Logical Plan to Scan a CustomTableProvider registered at runtime + message CustomTableScanNode { + reserved 1; // was string table_name + TableReference table_name = 6; + ProjectionColumns projection = 2; + datafusion_common.Schema schema = 3; + repeated LogicalExprNode filters = 4; + bytes custom_table_data = 5; + } + + message ProjectionNode { + LogicalPlanNode input = 1; + repeated LogicalExprNode expr = 2; + oneof optional_alias { + string alias = 3; + } + } + + message SelectionNode { + LogicalPlanNode input = 1; + LogicalExprNode expr = 2; + } + + message SortNode { + LogicalPlanNode input = 1; + repeated SortExprNode expr = 2; + // Maximum number of highest/lowest rows to fetch; negative means no limit + int64 fetch = 3; + } + + message RepartitionNode { + LogicalPlanNode input = 1; + oneof partition_method { + uint64 round_robin = 2; + HashRepartition hash = 3; + } + } + + message HashRepartition { + repeated LogicalExprNode hash_expr = 1; + uint64 partition_count = 2; + } + + message EmptyRelationNode { + bool produce_one_row = 1; + } + + message CreateExternalTableNode { + reserved 1; // was string name + TableReference name = 9; + string location = 2; + string file_type = 3; + datafusion_common.DfSchema schema = 4; + repeated string table_partition_cols = 5; + bool if_not_exists = 6; + bool temporary = 14; + string definition = 7; + repeated SortExprNodeCollection order_exprs = 10; + bool unbounded = 11; + map options = 8; + datafusion_common.Constraints constraints = 12; + map column_defaults = 13; + } + + message PrepareNode { + string name = 1; + repeated datafusion_common.ArrowType data_types = 2; + LogicalPlanNode input = 3; + } + + message CreateCatalogSchemaNode { + string schema_name = 1; + bool if_not_exists = 2; + datafusion_common.DfSchema schema = 3; + } + + message CreateCatalogNode { + string catalog_name = 1; + bool if_not_exists = 2; + datafusion_common.DfSchema schema = 3; + } + + message DropViewNode { + TableReference name = 1; + bool if_exists = 2; + datafusion_common.DfSchema schema = 3; + } + + message CreateViewNode { + reserved 1; // was string name + TableReference name = 5; + LogicalPlanNode input = 2; + bool or_replace = 3; + bool temporary = 6; + string definition = 4; + } + + // a node containing data for defining values list. unlike in SQL where it's two dimensional, here + // the list is flattened, and with the field n_cols it can be parsed and partitioned into rows + message ValuesNode { + uint64 n_cols = 1; + repeated LogicalExprNode values_list = 2; + } + + message AnalyzeNode { + LogicalPlanNode input = 1; + bool verbose = 2; + } + + message ExplainNode { + LogicalPlanNode input = 1; + bool verbose = 2; + } + + message AggregateNode { + LogicalPlanNode input = 1; + repeated LogicalExprNode group_expr = 2; + repeated LogicalExprNode aggr_expr = 3; + } + + message WindowNode { + LogicalPlanNode input = 1; + repeated LogicalExprNode window_expr = 2; + } + + message JoinNode { + LogicalPlanNode left = 1; + LogicalPlanNode right = 2; + datafusion_common.JoinType join_type = 3; + datafusion_common.JoinConstraint join_constraint = 4; + repeated LogicalExprNode left_join_key = 5; + repeated LogicalExprNode right_join_key = 6; + bool null_equals_null = 7; + LogicalExprNode filter = 8; + } + + message DistinctNode { + LogicalPlanNode input = 1; + } + + message DistinctOnNode { + repeated LogicalExprNode on_expr = 1; + repeated LogicalExprNode select_expr = 2; + repeated SortExprNode sort_expr = 3; + LogicalPlanNode input = 4; + } + + message CopyToNode { + LogicalPlanNode input = 1; + string output_url = 2; + bytes file_type = 3; + repeated string partition_by = 7; + } + + message DmlNode{ + enum Type { + UPDATE = 0; + DELETE = 1; + CTAS = 2; + INSERT_APPEND = 3; + INSERT_OVERWRITE = 4; + INSERT_REPLACE = 5; + + } + Type dml_type = 1; + LogicalPlanNode input = 2; + TableReference table_name = 3; + datafusion_common.DfSchema schema = 4; + } + + message UnnestNode { + LogicalPlanNode input = 1; + repeated datafusion_common.Column exec_columns = 2; + repeated ColumnUnnestListItem list_type_columns = 3; + repeated uint64 struct_type_columns = 4; + repeated uint64 dependency_indices = 5; + datafusion_common.DfSchema schema = 6; + UnnestOptions options = 7; + } + message ColumnUnnestListItem { + uint32 input_index = 1; + ColumnUnnestListRecursion recursion = 2; + } + + message ColumnUnnestListRecursions { + repeated ColumnUnnestListRecursion recursions = 2; + } + + message ColumnUnnestListRecursion { + datafusion_common.Column output_column = 1; + uint32 depth = 2; + } + + message UnnestOptions { + bool preserve_nulls = 1; + repeated RecursionUnnestOption recursions = 2; + } + + message RecursionUnnestOption { + datafusion_common.Column output_column = 1; + datafusion_common.Column input_column = 2; + uint32 depth = 3; + } + + message UnionNode { + repeated LogicalPlanNode inputs = 1; + } + + message CrossJoinNode { + LogicalPlanNode left = 1; + LogicalPlanNode right = 2; + } + + message LimitNode { + LogicalPlanNode input = 1; + // The number of rows to skip before fetch; non-positive means don't skip any + int64 skip = 2; + // Maximum number of rows to fetch; negative means no limit + int64 fetch = 3; + } + + message SelectionExecNode { + LogicalExprNode expr = 1; + } + + message SubqueryAliasNode { + reserved 2; // Was string alias + LogicalPlanNode input = 1; + TableReference alias = 3; + } + + // logical expressions + message LogicalExprNode { + oneof ExprType { + // column references + datafusion_common.Column column = 1; + + // alias + AliasNode alias = 2; + + datafusion_common.ScalarValue literal = 3; + + // binary expressions + BinaryExprNode binary_expr = 4; + + + // null checks + IsNull is_null_expr = 6; + IsNotNull is_not_null_expr = 7; + Not not_expr = 8; + + BetweenNode between = 9; + CaseNode case_ = 10; + CastNode cast = 11; + NegativeNode negative = 13; + InListNode in_list = 14; + Wildcard wildcard = 15; + // was ScalarFunctionNode scalar_function = 16; + TryCastNode try_cast = 17; + + // window expressions + WindowExprNode window_expr = 18; + + // AggregateUDF expressions + AggregateUDFExprNode aggregate_udf_expr = 19; + + // Scalar UDF expressions + ScalarUDFExprNode scalar_udf_expr = 20; + + // GetIndexedField get_indexed_field = 21; + + GroupingSetNode grouping_set = 22; + + CubeNode cube = 23; + + RollupNode rollup = 24; + + IsTrue is_true = 25; + IsFalse is_false = 26; + IsUnknown is_unknown = 27; + IsNotTrue is_not_true = 28; + IsNotFalse is_not_false = 29; + IsNotUnknown is_not_unknown = 30; + LikeNode like = 31; + ILikeNode ilike = 32; + SimilarToNode similar_to = 33; + + PlaceholderNode placeholder = 34; + + Unnest unnest = 35; + + } + } + + message Wildcard { + TableReference qualifier = 1; + } + + message PlaceholderNode { + string id = 1; + datafusion_common.ArrowType data_type = 2; + } + + message LogicalExprList { + repeated LogicalExprNode expr = 1; + } + + message GroupingSetNode { + repeated LogicalExprList expr = 1; + } + + message CubeNode { + repeated LogicalExprNode expr = 1; + } + + message RollupNode { + repeated LogicalExprNode expr = 1; + } + + message NamedStructField { + datafusion_common.ScalarValue name = 1; + } + + message ListIndex { + LogicalExprNode key = 1; + } + + message ListRange { + LogicalExprNode start = 1; + LogicalExprNode stop = 2; + LogicalExprNode stride = 3; + } + + message IsNull { + LogicalExprNode expr = 1; + } + + message IsNotNull { + LogicalExprNode expr = 1; + } + + message IsTrue { + LogicalExprNode expr = 1; + } + + message IsFalse { + LogicalExprNode expr = 1; + } + + message IsUnknown { + LogicalExprNode expr = 1; + } + + message IsNotTrue { + LogicalExprNode expr = 1; + } + + message IsNotFalse { + LogicalExprNode expr = 1; + } + + message IsNotUnknown { + LogicalExprNode expr = 1; + } + + message Not { + LogicalExprNode expr = 1; + } + + message AliasNode { + LogicalExprNode expr = 1; + string alias = 2; + repeated TableReference relation = 3; + } + + message BinaryExprNode { + // Represents the operands from the left inner most expression + // to the right outer most expression where each of them are chained + // with the operator 'op'. + repeated LogicalExprNode operands = 1; + string op = 3; + } + + message NegativeNode { + LogicalExprNode expr = 1; + } + + message Unnest { + repeated LogicalExprNode exprs = 1; + } + + message InListNode { + LogicalExprNode expr = 1; + repeated LogicalExprNode list = 2; + bool negated = 3; + } + + + message AggregateUDFExprNode { + string fun_name = 1; + repeated LogicalExprNode args = 2; + bool distinct = 5; + LogicalExprNode filter = 3; + repeated SortExprNode order_by = 4; + optional bytes fun_definition = 6; + } + + message ScalarUDFExprNode { + string fun_name = 1; + repeated LogicalExprNode args = 2; + optional bytes fun_definition = 3; + } + + message WindowExprNode { + oneof window_function { + // BuiltInWindowFunction built_in_function = 2; + string udaf = 3; + string udwf = 9; + } + repeated LogicalExprNode exprs = 4; + repeated LogicalExprNode partition_by = 5; + repeated SortExprNode order_by = 6; + // repeated LogicalExprNode filter = 7; + WindowFrame window_frame = 8; + optional bytes fun_definition = 10; + } + + message BetweenNode { + LogicalExprNode expr = 1; + bool negated = 2; + LogicalExprNode low = 3; + LogicalExprNode high = 4; + } + + message LikeNode { + bool negated = 1; + LogicalExprNode expr = 2; + LogicalExprNode pattern = 3; + string escape_char = 4; + } + + message ILikeNode { + bool negated = 1; + LogicalExprNode expr = 2; + LogicalExprNode pattern = 3; + string escape_char = 4; + } + + message SimilarToNode { + bool negated = 1; + LogicalExprNode expr = 2; + LogicalExprNode pattern = 3; + string escape_char = 4; + } + + message CaseNode { + LogicalExprNode expr = 1; + repeated WhenThen when_then_expr = 2; + LogicalExprNode else_expr = 3; + } + + message WhenThen { + LogicalExprNode when_expr = 1; + LogicalExprNode then_expr = 2; + } + + message CastNode { + LogicalExprNode expr = 1; + datafusion_common.ArrowType arrow_type = 2; + } + + message TryCastNode { + LogicalExprNode expr = 1; + datafusion_common.ArrowType arrow_type = 2; + } + + message SortExprNode { + LogicalExprNode expr = 1; + bool asc = 2; + bool nulls_first = 3; + } + + enum WindowFrameUnits { + ROWS = 0; + RANGE = 1; + GROUPS = 2; + } + + message WindowFrame { + WindowFrameUnits window_frame_units = 1; + WindowFrameBound start_bound = 2; + // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/tokio-rs/prost/issues/430 and https://github.com/tokio-rs/prost/pull/455) + // this syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) + oneof end_bound { + WindowFrameBound bound = 3; + } + } + + enum WindowFrameBoundType { + CURRENT_ROW = 0; + PRECEDING = 1; + FOLLOWING = 2; + } + + message WindowFrameBound { + WindowFrameBoundType window_frame_bound_type = 1; + datafusion_common.ScalarValue bound_value = 2; + } + + /////////////////////////////////////////////////////////////////////////////////////////////////// + // Arrow Data Types + /////////////////////////////////////////////////////////////////////////////////////////////////// + + message FixedSizeBinary{ + int32 length = 1; + } + + enum DateUnit{ + Day = 0; + DateMillisecond = 1; + } + + message AnalyzedLogicalPlanType { + string analyzer_name = 1; + } + + message OptimizedLogicalPlanType { + string optimizer_name = 1; + } + + message OptimizedPhysicalPlanType { + string optimizer_name = 1; + } + + message PlanType { + oneof plan_type_enum { + datafusion_common.EmptyMessage InitialLogicalPlan = 1; + AnalyzedLogicalPlanType AnalyzedLogicalPlan = 7; + datafusion_common.EmptyMessage FinalAnalyzedLogicalPlan = 8; + OptimizedLogicalPlanType OptimizedLogicalPlan = 2; + datafusion_common.EmptyMessage FinalLogicalPlan = 3; + datafusion_common.EmptyMessage InitialPhysicalPlan = 4; + datafusion_common.EmptyMessage InitialPhysicalPlanWithStats = 9; + datafusion_common.EmptyMessage InitialPhysicalPlanWithSchema = 11; + OptimizedPhysicalPlanType OptimizedPhysicalPlan = 5; + datafusion_common.EmptyMessage FinalPhysicalPlan = 6; + datafusion_common.EmptyMessage FinalPhysicalPlanWithStats = 10; + datafusion_common.EmptyMessage FinalPhysicalPlanWithSchema = 12; + datafusion_common.EmptyMessage PhysicalPlanError = 13; + } + } + + message StringifiedPlan { + PlanType plan_type = 1; + string plan = 2; + } + + message BareTableReference { + string table = 1; + } + + message PartialTableReference { + string schema = 1; + string table = 2; + } + + message FullTableReference { + string catalog = 1; + string schema = 2; + string table = 3; + } + + message TableReference { + oneof table_reference_enum { + BareTableReference bare = 1; + PartialTableReference partial = 2; + FullTableReference full = 3; + } + } + + ///////////////////////////////////////////////////////////////////////////////////////////////// + + // PhysicalPlanNode is a nested type + message PhysicalPlanNode { + oneof PhysicalPlanType { + ParquetScanExecNode parquet_scan = 1; + CsvScanExecNode csv_scan = 2; + EmptyExecNode empty = 3; + ProjectionExecNode projection = 4; + GlobalLimitExecNode global_limit = 6; + LocalLimitExecNode local_limit = 7; + AggregateExecNode aggregate = 8; + HashJoinExecNode hash_join = 9; + SortExecNode sort = 10; + CoalesceBatchesExecNode coalesce_batches = 11; + FilterExecNode filter = 12; + CoalescePartitionsExecNode merge = 13; + RepartitionExecNode repartition = 14; + WindowAggExecNode window = 15; + CrossJoinExecNode cross_join = 16; + AvroScanExecNode avro_scan = 17; + PhysicalExtensionNode extension = 18; + UnionExecNode union = 19; + ExplainExecNode explain = 20; + SortPreservingMergeExecNode sort_preserving_merge = 21; + NestedLoopJoinExecNode nested_loop_join = 22; + AnalyzeExecNode analyze = 23; + JsonSinkExecNode json_sink = 24; + SymmetricHashJoinExecNode symmetric_hash_join = 25; + InterleaveExecNode interleave = 26; + PlaceholderRowExecNode placeholder_row = 27; + CsvSinkExecNode csv_sink = 28; + ParquetSinkExecNode parquet_sink = 29; + UnnestExecNode unnest = 30; + } + } + + message PartitionColumn { + string name = 1; + datafusion_common.ArrowType arrow_type = 2; + } + + + message FileSinkConfig { + reserved 6; // writer_mode + reserved 8; // was `overwrite` which has been superseded by `insert_op` + + string object_store_url = 1; + repeated PartitionedFile file_groups = 2; + repeated string table_paths = 3; + datafusion_common.Schema output_schema = 4; + repeated PartitionColumn table_partition_cols = 5; + bool keep_partition_by_columns = 9; + InsertOp insert_op = 10; + string file_extension = 11; + } + + enum InsertOp { + Append = 0; + Overwrite = 1; + Replace = 2; + } + + message JsonSink { + FileSinkConfig config = 1; + datafusion_common.JsonWriterOptions writer_options = 2; + } + + message JsonSinkExecNode { + PhysicalPlanNode input = 1; + JsonSink sink = 2; + datafusion_common.Schema sink_schema = 3; + PhysicalSortExprNodeCollection sort_order = 4; + } + + message CsvSink { + FileSinkConfig config = 1; + datafusion_common.CsvWriterOptions writer_options = 2; + } + + message CsvSinkExecNode { + PhysicalPlanNode input = 1; + CsvSink sink = 2; + datafusion_common.Schema sink_schema = 3; + PhysicalSortExprNodeCollection sort_order = 4; + } + + message ParquetSink { + FileSinkConfig config = 1; + datafusion_common.TableParquetOptions parquet_options = 2; + } + + message ParquetSinkExecNode { + PhysicalPlanNode input = 1; + ParquetSink sink = 2; + datafusion_common.Schema sink_schema = 3; + PhysicalSortExprNodeCollection sort_order = 4; + } + + message UnnestExecNode { + PhysicalPlanNode input = 1; + datafusion_common.Schema schema = 2; + repeated ListUnnest list_type_columns = 3; + repeated uint64 struct_type_columns = 4; + UnnestOptions options = 5; + } + + message ListUnnest { + uint32 index_in_input_schema = 1; + uint32 depth = 2; + } + + message PhysicalExtensionNode { + bytes node = 1; + repeated PhysicalPlanNode inputs = 2; + } + + // physical expressions + message PhysicalExprNode { + // Was date_time_interval_expr + reserved 17; + + oneof ExprType { + // column references + PhysicalColumn column = 1; + + datafusion_common.ScalarValue literal = 2; + + // binary expressions + PhysicalBinaryExprNode binary_expr = 3; + + // aggregate expressions + PhysicalAggregateExprNode aggregate_expr = 4; + + // null checks + PhysicalIsNull is_null_expr = 5; + PhysicalIsNotNull is_not_null_expr = 6; + PhysicalNot not_expr = 7; + + PhysicalCaseNode case_ = 8; + PhysicalCastNode cast = 9; + PhysicalSortExprNode sort = 10; + PhysicalNegativeNode negative = 11; + PhysicalInListNode in_list = 12; + // was PhysicalScalarFunctionNode scalar_function = 13; + PhysicalTryCastNode try_cast = 14; + // window expressions + PhysicalWindowExprNode window_expr = 15; + + PhysicalScalarUdfNode scalar_udf = 16; + // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17; + + PhysicalLikeExprNode like_expr = 18; + + PhysicalExtensionExprNode extension = 19; + + UnknownColumn unknown_column = 20; + } + } + + message PhysicalScalarUdfNode { + string name = 1; + repeated PhysicalExprNode args = 2; + optional bytes fun_definition = 3; + datafusion_common.ArrowType return_type = 4; + bool nullable = 5; + } + + message PhysicalAggregateExprNode { + oneof AggregateFunction { + string user_defined_aggr_function = 4; + } + repeated PhysicalExprNode expr = 2; + repeated PhysicalSortExprNode ordering_req = 5; + bool distinct = 3; + bool ignore_nulls = 6; + optional bytes fun_definition = 7; + } + + message PhysicalWindowExprNode { + oneof window_function { + // BuiltInWindowFunction built_in_function = 2; + string user_defined_aggr_function = 3; + string user_defined_window_function = 10; + } + repeated PhysicalExprNode args = 4; + repeated PhysicalExprNode partition_by = 5; + repeated PhysicalSortExprNode order_by = 6; + WindowFrame window_frame = 7; + string name = 8; + optional bytes fun_definition = 9; + } + + message PhysicalIsNull { + PhysicalExprNode expr = 1; + } + + message PhysicalIsNotNull { + PhysicalExprNode expr = 1; + } + + message PhysicalNot { + PhysicalExprNode expr = 1; + } + + message PhysicalAliasNode { + PhysicalExprNode expr = 1; + string alias = 2; + } + + message PhysicalBinaryExprNode { + PhysicalExprNode l = 1; + PhysicalExprNode r = 2; + string op = 3; + } + + message PhysicalDateTimeIntervalExprNode { + PhysicalExprNode l = 1; + PhysicalExprNode r = 2; + string op = 3; + } + + message PhysicalLikeExprNode { + bool negated = 1; + bool case_insensitive = 2; + PhysicalExprNode expr = 3; + PhysicalExprNode pattern = 4; + } + + message PhysicalSortExprNode { + PhysicalExprNode expr = 1; + bool asc = 2; + bool nulls_first = 3; + } + + message PhysicalWhenThen { + PhysicalExprNode when_expr = 1; + PhysicalExprNode then_expr = 2; + } + + message PhysicalInListNode { + PhysicalExprNode expr = 1; + repeated PhysicalExprNode list = 2; + bool negated = 3; + } + + message PhysicalCaseNode { + PhysicalExprNode expr = 1; + repeated PhysicalWhenThen when_then_expr = 2; + PhysicalExprNode else_expr = 3; + } + + message PhysicalTryCastNode { + PhysicalExprNode expr = 1; + datafusion_common.ArrowType arrow_type = 2; + } + + message PhysicalCastNode { + PhysicalExprNode expr = 1; + datafusion_common.ArrowType arrow_type = 2; + } + + message PhysicalNegativeNode { + PhysicalExprNode expr = 1; + } + + message PhysicalExtensionExprNode { + bytes expr = 1; + repeated PhysicalExprNode inputs = 2; + } + + message FilterExecNode { + PhysicalPlanNode input = 1; + PhysicalExprNode expr = 2; + uint32 default_filter_selectivity = 3; + repeated uint32 projection = 9; + } + + message FileGroup { + repeated PartitionedFile files = 1; + } + + message ScanLimit { + // wrap into a message to make it optional + uint32 limit = 1; + } + + message PhysicalSortExprNodeCollection { + repeated PhysicalSortExprNode physical_sort_expr_nodes = 1; + } + + message FileScanExecConf { + repeated FileGroup file_groups = 1; + datafusion_common.Schema schema = 2; + repeated uint32 projection = 4; + ScanLimit limit = 5; + datafusion_common.Statistics statistics = 6; + repeated string table_partition_cols = 7; + string object_store_url = 8; + repeated PhysicalSortExprNodeCollection output_ordering = 9; + + // Was repeated ConfigOption options = 10; + reserved 10; + + datafusion_common.Constraints constraints = 11; + } + + message ParquetScanExecNode { + FileScanExecConf base_conf = 1; + + // Was pruning predicate based on a logical expr. + reserved 2; + + PhysicalExprNode predicate = 3; + } + + message CsvScanExecNode { + FileScanExecConf base_conf = 1; + bool has_header = 2; + string delimiter = 3; + string quote = 4; + oneof optional_escape { + string escape = 5; + } + oneof optional_comment { + string comment = 6; + } + bool newlines_in_values = 7; + } + + message AvroScanExecNode { + FileScanExecConf base_conf = 1; + } + + enum PartitionMode { + COLLECT_LEFT = 0; + PARTITIONED = 1; + AUTO = 2; + } + + message HashJoinExecNode { + PhysicalPlanNode left = 1; + PhysicalPlanNode right = 2; + repeated JoinOn on = 3; + datafusion_common.JoinType join_type = 4; + PartitionMode partition_mode = 6; + bool null_equals_null = 7; + JoinFilter filter = 8; + repeated uint32 projection = 9; + } + + enum StreamPartitionMode { + SINGLE_PARTITION = 0; + PARTITIONED_EXEC = 1; + } + + message SymmetricHashJoinExecNode { + PhysicalPlanNode left = 1; + PhysicalPlanNode right = 2; + repeated JoinOn on = 3; + datafusion_common.JoinType join_type = 4; + StreamPartitionMode partition_mode = 6; + bool null_equals_null = 7; + JoinFilter filter = 8; + repeated PhysicalSortExprNode left_sort_exprs = 9; + repeated PhysicalSortExprNode right_sort_exprs = 10; + } + + message InterleaveExecNode { + repeated PhysicalPlanNode inputs = 1; + } + + message UnionExecNode { + repeated PhysicalPlanNode inputs = 1; + } + + message ExplainExecNode { + datafusion_common.Schema schema = 1; + repeated StringifiedPlan stringified_plans = 2; + bool verbose = 3; + } + + message AnalyzeExecNode { + bool verbose = 1; + bool show_statistics = 2; + PhysicalPlanNode input = 3; + datafusion_common.Schema schema = 4; + } + + message CrossJoinExecNode { + PhysicalPlanNode left = 1; + PhysicalPlanNode right = 2; + } + + message PhysicalColumn { + string name = 1; + uint32 index = 2; + } + + message UnknownColumn { + string name = 1; + } + + message JoinOn { + PhysicalExprNode left = 1; + PhysicalExprNode right = 2; + } + + message EmptyExecNode { + datafusion_common.Schema schema = 1; + } + + message PlaceholderRowExecNode { + datafusion_common.Schema schema = 1; + } + + message ProjectionExecNode { + PhysicalPlanNode input = 1; + repeated PhysicalExprNode expr = 2; + repeated string expr_name = 3; + } + + enum AggregateMode { + PARTIAL = 0; + FINAL = 1; + FINAL_PARTITIONED = 2; + SINGLE = 3; + SINGLE_PARTITIONED = 4; + } + + message PartiallySortedInputOrderMode { + repeated uint64 columns = 6; + } + + message WindowAggExecNode { + PhysicalPlanNode input = 1; + repeated PhysicalWindowExprNode window_expr = 2; + repeated PhysicalExprNode partition_keys = 5; + // Set optional to `None` for `BoundedWindowAggExec`. + oneof input_order_mode { + datafusion_common.EmptyMessage linear = 7; + PartiallySortedInputOrderMode partially_sorted = 8; + datafusion_common.EmptyMessage sorted = 9; + } + } + + message MaybeFilter { + PhysicalExprNode expr = 1; + } + + message MaybePhysicalSortExprs { + repeated PhysicalSortExprNode sort_expr = 1; + } + + message AggLimit { + // wrap into a message to make it optional + uint64 limit = 1; + } + + message AggregateExecNode { + repeated PhysicalExprNode group_expr = 1; + repeated PhysicalExprNode aggr_expr = 2; + AggregateMode mode = 3; + PhysicalPlanNode input = 4; + repeated string group_expr_name = 5; + repeated string aggr_expr_name = 6; + // we need the input schema to the partial aggregate to pass to the final aggregate + datafusion_common.Schema input_schema = 7; + repeated PhysicalExprNode null_expr = 8; + repeated bool groups = 9; + repeated MaybeFilter filter_expr = 10; + AggLimit limit = 11; + } + + message GlobalLimitExecNode { + PhysicalPlanNode input = 1; + // The number of rows to skip before fetch + uint32 skip = 2; + // Maximum number of rows to fetch; negative means no limit + int64 fetch = 3; + } + + message LocalLimitExecNode { + PhysicalPlanNode input = 1; + uint32 fetch = 2; + } + + message SortExecNode { + PhysicalPlanNode input = 1; + repeated PhysicalExprNode expr = 2; + // Maximum number of highest/lowest rows to fetch; negative means no limit + int64 fetch = 3; + bool preserve_partitioning = 4; + } + + message SortPreservingMergeExecNode { + PhysicalPlanNode input = 1; + repeated PhysicalExprNode expr = 2; + // Maximum number of highest/lowest rows to fetch; negative means no limit + int64 fetch = 3; + } + + message NestedLoopJoinExecNode { + PhysicalPlanNode left = 1; + PhysicalPlanNode right = 2; + datafusion_common.JoinType join_type = 3; + JoinFilter filter = 4; + repeated uint32 projection = 5; + } + + message CoalesceBatchesExecNode { + PhysicalPlanNode input = 1; + uint32 target_batch_size = 2; + optional uint32 fetch = 3; + } + + message CoalescePartitionsExecNode { + PhysicalPlanNode input = 1; + } + + message PhysicalHashRepartition { + repeated PhysicalExprNode hash_expr = 1; + uint64 partition_count = 2; + } + + message RepartitionExecNode{ + PhysicalPlanNode input = 1; + // oneof partition_method { + // uint64 round_robin = 2; + // PhysicalHashRepartition hash = 3; + // uint64 unknown = 4; + // } + Partitioning partitioning = 5; + } + + message Partitioning { + oneof partition_method { + uint64 round_robin = 1; + PhysicalHashRepartition hash = 2; + uint64 unknown = 3; + } + } + + message JoinFilter{ + PhysicalExprNode expression = 1; + repeated ColumnIndex column_indices = 2; + datafusion_common.Schema schema = 3; + } + + message ColumnIndex{ + uint32 index = 1; + datafusion_common.JoinSide side = 2; + } + + message PartitionedFile { + string path = 1; + uint64 size = 2; + uint64 last_modified_ns = 3; + repeated datafusion_common.ScalarValue partition_values = 4; + FileRange range = 5; + datafusion_common.Statistics statistics = 6; + } + + message FileRange { + int64 start = 1; + int64 end = 2; + } + + message PartitionStats { + int64 num_rows = 1; + int64 num_batches = 2; + int64 num_bytes = 3; + repeated datafusion_common.ColumnStats column_stats = 4; + } + + message RecursiveQueryNode { + string name = 1; + LogicalPlanNode static_term = 2; + LogicalPlanNode recursive_term = 3; + bool is_distinct = 4; + } + + message CteWorkTableScanNode { + string name = 1; + datafusion_common.Schema schema = 2; + } \ No newline at end of file diff --git a/ballista/core/proto/datafusion_common.proto b/ballista/core/proto/datafusion_common.proto index ec089e43d..9dafbb38b 100644 --- a/ballista/core/proto/datafusion_common.proto +++ b/ballista/core/proto/datafusion_common.proto @@ -16,559 +16,561 @@ * limitations under the License. */ -syntax = "proto3"; - -package datafusion_common; - -message ColumnRelation { - string relation = 1; -} - -message Column { - string name = 1; - ColumnRelation relation = 2; -} - -message DfField{ - Field field = 1; - ColumnRelation qualifier = 2; -} - -message DfSchema { - repeated DfField columns = 1; - map metadata = 2; -} - -message CsvFormat { - CsvOptions options = 5; -} - -message ParquetFormat { - // Used to be bool enable_pruning = 1; - reserved 1; - TableParquetOptions options = 2; -} - -message AvroFormat {} - -message NdJsonFormat { - JsonOptions options = 1; -} - - -message PrimaryKeyConstraint{ - repeated uint64 indices = 1; -} - -message UniqueConstraint{ - repeated uint64 indices = 1; -} - -message Constraint{ - oneof constraint_mode{ - PrimaryKeyConstraint primary_key = 1; - UniqueConstraint unique = 2; - } -} - -message Constraints{ - repeated Constraint constraints = 1; -} - -enum JoinType { - INNER = 0; - LEFT = 1; - RIGHT = 2; - FULL = 3; - LEFTSEMI = 4; - LEFTANTI = 5; - RIGHTSEMI = 6; - RIGHTANTI = 7; - LEFTMARK = 8; -} - -enum JoinConstraint { - ON = 0; - USING = 1; -} - -message AvroOptions {} -message ArrowOptions {} - -message Schema { - repeated Field columns = 1; - map metadata = 2; -} - -message Field { - // name of the field - string name = 1; - ArrowType arrow_type = 2; - bool nullable = 3; - // for complex data types like structs, unions - repeated Field children = 4; - map metadata = 5; - int64 dict_id = 6; - bool dict_ordered = 7; -} - -message Timestamp{ - TimeUnit time_unit = 1; - string timezone = 2; -} - -enum TimeUnit{ - Second = 0; - Millisecond = 1; - Microsecond = 2; - Nanosecond = 3; -} - -enum IntervalUnit{ - YearMonth = 0; - DayTime = 1; - MonthDayNano = 2; -} - -message Decimal{ - reserved 1, 2; - uint32 precision = 3; - int32 scale = 4; -} - -message Decimal256Type{ - reserved 1, 2; - uint32 precision = 3; - int32 scale = 4; -} - -message List{ - Field field_type = 1; -} - -message FixedSizeList{ - Field field_type = 1; - int32 list_size = 2; -} - -message Dictionary{ - ArrowType key = 1; - ArrowType value = 2; -} - -message Struct{ - repeated Field sub_field_types = 1; -} - -message Map { - Field field_type = 1; - bool keys_sorted = 2; -} - -enum UnionMode{ - sparse = 0; - dense = 1; -} - -message Union{ - repeated Field union_types = 1; - UnionMode union_mode = 2; - repeated int32 type_ids = 3; -} - -// Used for List/FixedSizeList/LargeList/Struct/Map -message ScalarNestedValue { - message Dictionary { - bytes ipc_message = 1; - bytes arrow_data = 2; - } - - bytes ipc_message = 1; - bytes arrow_data = 2; - Schema schema = 3; - repeated Dictionary dictionaries = 4; -} - -message ScalarTime32Value { - oneof value { - int32 time32_second_value = 1; - int32 time32_millisecond_value = 2; - }; -} - -message ScalarTime64Value { - oneof value { - int64 time64_microsecond_value = 1; - int64 time64_nanosecond_value = 2; - }; -} - -message ScalarTimestampValue { - oneof value { - int64 time_microsecond_value = 1; - int64 time_nanosecond_value = 2; - int64 time_second_value = 3; - int64 time_millisecond_value = 4; - }; - string timezone = 5; -} - -message ScalarDictionaryValue { - ArrowType index_type = 1; - ScalarValue value = 2; -} - -message IntervalDayTimeValue { - int32 days = 1; - int32 milliseconds = 2; -} - -message IntervalMonthDayNanoValue { - int32 months = 1; - int32 days = 2; - int64 nanos = 3; -} - -message UnionField { - int32 field_id = 1; - Field field = 2; -} - -message UnionValue { - // Note that a null union value must have one or more fields, so we - // encode a null UnionValue as one with value_id == 128 - int32 value_id = 1; - ScalarValue value = 2; - repeated UnionField fields = 3; - UnionMode mode = 4; -} - -message ScalarFixedSizeBinary{ - bytes values = 1; - int32 length = 2; -} - -message ScalarValue{ - // was PrimitiveScalarType null_value = 19; - reserved 19; - - oneof value { - // was PrimitiveScalarType null_value = 19; - // Null value of any type - ArrowType null_value = 33; - - bool bool_value = 1; - string utf8_value = 2; - string large_utf8_value = 3; - string utf8_view_value = 23; - int32 int8_value = 4; - int32 int16_value = 5; - int32 int32_value = 6; - int64 int64_value = 7; - uint32 uint8_value = 8; - uint32 uint16_value = 9; - uint32 uint32_value = 10; - uint64 uint64_value = 11; - float float32_value = 12; - double float64_value = 13; - // Literal Date32 value always has a unit of day - int32 date_32_value = 14; - ScalarTime32Value time32_value = 15; - ScalarNestedValue large_list_value = 16; - ScalarNestedValue list_value = 17; - ScalarNestedValue fixed_size_list_value = 18; - ScalarNestedValue struct_value = 32; - ScalarNestedValue map_value = 41; - - Decimal128 decimal128_value = 20; - Decimal256 decimal256_value = 39; - - int64 date_64_value = 21; - int32 interval_yearmonth_value = 24; - - int64 duration_second_value = 35; - int64 duration_millisecond_value = 36; - int64 duration_microsecond_value = 37; - int64 duration_nanosecond_value = 38; - - ScalarTimestampValue timestamp_value = 26; - ScalarDictionaryValue dictionary_value = 27; - bytes binary_value = 28; - bytes large_binary_value = 29; - bytes binary_view_value = 22; - ScalarTime64Value time64_value = 30; - IntervalDayTimeValue interval_daytime_value = 25; - IntervalMonthDayNanoValue interval_month_day_nano = 31; - ScalarFixedSizeBinary fixed_size_binary_value = 34; - UnionValue union_value = 42; - } -} - -message Decimal128{ - bytes value = 1; - int64 p = 2; - int64 s = 3; -} - -message Decimal256{ - bytes value = 1; - int64 p = 2; - int64 s = 3; -} - -// Serialized data type -message ArrowType{ - oneof arrow_type_enum { - EmptyMessage NONE = 1; // arrow::Type::NA - EmptyMessage BOOL = 2; // arrow::Type::BOOL - EmptyMessage UINT8 = 3; // arrow::Type::UINT8 - EmptyMessage INT8 = 4; // arrow::Type::INT8 - EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h - EmptyMessage INT16 = 6; - EmptyMessage UINT32 = 7; - EmptyMessage INT32 = 8; - EmptyMessage UINT64 = 9; - EmptyMessage INT64 = 10 ; - EmptyMessage FLOAT16 = 11 ; - EmptyMessage FLOAT32 = 12 ; - EmptyMessage FLOAT64 = 13 ; - EmptyMessage UTF8 = 14 ; - EmptyMessage UTF8_VIEW = 35; - EmptyMessage LARGE_UTF8 = 32; - EmptyMessage BINARY = 15 ; - EmptyMessage BINARY_VIEW = 34; - int32 FIXED_SIZE_BINARY = 16 ; - EmptyMessage LARGE_BINARY = 31; - EmptyMessage DATE32 = 17 ; - EmptyMessage DATE64 = 18 ; - TimeUnit DURATION = 19; - Timestamp TIMESTAMP = 20 ; - TimeUnit TIME32 = 21 ; - TimeUnit TIME64 = 22 ; - IntervalUnit INTERVAL = 23 ; - Decimal DECIMAL = 24 ; - Decimal256Type DECIMAL256 = 36; - List LIST = 25; - List LARGE_LIST = 26; - FixedSizeList FIXED_SIZE_LIST = 27; - Struct STRUCT = 28; - Union UNION = 29; - Dictionary DICTIONARY = 30; - Map MAP = 33; - } -} - -//Useful for representing an empty enum variant in rust -// E.G. enum example{One, Two(i32)} -// maps to -// message example{ -// oneof{ -// EmptyMessage One = 1; -// i32 Two = 2; -// } -//} -message EmptyMessage{} - -enum CompressionTypeVariant { - GZIP = 0; - BZIP2 = 1; - XZ = 2; - ZSTD = 3; - UNCOMPRESSED = 4; -} - -message JsonWriterOptions { - CompressionTypeVariant compression = 1; -} - - -message CsvWriterOptions { - // Compression type - CompressionTypeVariant compression = 1; - // Optional column delimiter. Defaults to `b','` - string delimiter = 2; - // Whether to write column names as file headers. Defaults to `true` - bool has_header = 3; - // Optional date format for date arrays - string date_format = 4; - // Optional datetime format for datetime arrays - string datetime_format = 5; - // Optional timestamp format for timestamp arrays - string timestamp_format = 6; - // Optional time format for time arrays - string time_format = 7; - // Optional value to represent null - string null_value = 8; - // Optional quote. Defaults to `b'"'` - string quote = 9; - // Optional escape. Defaults to `'\\'` - string escape = 10; - // Optional flag whether to double quotes, instead of escaping. Defaults to `true` - bool double_quote = 11; -} - -// Options controlling CSV format -message CsvOptions { - bytes has_header = 1; // Indicates if the CSV has a header row - bytes delimiter = 2; // Delimiter character as a byte - bytes quote = 3; // Quote character as a byte - bytes escape = 4; // Optional escape character as a byte - CompressionTypeVariant compression = 5; // Compression type - optional uint64 schema_infer_max_rec = 6; // Optional max records for schema inference - string date_format = 7; // Optional date format - string datetime_format = 8; // Optional datetime format - string timestamp_format = 9; // Optional timestamp format - string timestamp_tz_format = 10; // Optional timestamp with timezone format - string time_format = 11; // Optional time format - string null_value = 12; // Optional representation of null value - string null_regex = 13; // Optional representation of null loading regex - bytes comment = 14; // Optional comment character as a byte - bytes double_quote = 15; // Indicates if quotes are doubled - bytes newlines_in_values = 16; // Indicates if newlines are supported in values - bytes terminator = 17; // Optional terminator character as a byte -} - -// Options controlling CSV format -message JsonOptions { - CompressionTypeVariant compression = 1; // Compression type - optional uint64 schema_infer_max_rec = 2; // Optional max records for schema inference -} - -message TableParquetOptions { - ParquetOptions global = 1; - repeated ParquetColumnSpecificOptions column_specific_options = 2; - map key_value_metadata = 3; -} - -message ParquetColumnSpecificOptions { - string column_name = 1; - ParquetColumnOptions options = 2; -} - -message ParquetColumnOptions { - oneof bloom_filter_enabled_opt { - bool bloom_filter_enabled = 1; - } - - oneof encoding_opt { - string encoding = 2; - } - - oneof dictionary_enabled_opt { - bool dictionary_enabled = 3; - } - - oneof compression_opt { - string compression = 4; - } - - oneof statistics_enabled_opt { - string statistics_enabled = 5; - } - - oneof bloom_filter_fpp_opt { - double bloom_filter_fpp = 6; - } - - oneof bloom_filter_ndv_opt { - uint64 bloom_filter_ndv = 7; - } - - oneof max_statistics_size_opt { - uint32 max_statistics_size = 8; - } -} - -message ParquetOptions { - // Regular fields - bool enable_page_index = 1; // default = true - bool pruning = 2; // default = true - bool skip_metadata = 3; // default = true - bool pushdown_filters = 5; // default = false - bool reorder_filters = 6; // default = false - uint64 data_pagesize_limit = 7; // default = 1024 * 1024 - uint64 write_batch_size = 8; // default = 1024 - string writer_version = 9; // default = "1.0" - // bool bloom_filter_enabled = 20; // default = false - bool allow_single_file_parallelism = 23; // default = true - uint64 maximum_parallel_row_group_writers = 24; // default = 1 - uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2 - bool bloom_filter_on_read = 26; // default = true - bool bloom_filter_on_write = 27; // default = false - bool schema_force_view_types = 28; // default = false - bool binary_as_string = 29; // default = false - - oneof metadata_size_hint_opt { - uint64 metadata_size_hint = 4; - } - - oneof compression_opt { - string compression = 10; - } - - oneof dictionary_enabled_opt { - bool dictionary_enabled = 11; - } - - oneof statistics_enabled_opt { - string statistics_enabled = 13; - } - - oneof max_statistics_size_opt { - uint64 max_statistics_size = 14; - } - - oneof column_index_truncate_length_opt { - uint64 column_index_truncate_length = 17; - } - - oneof encoding_opt { - string encoding = 19; - } - - oneof bloom_filter_fpp_opt { - double bloom_filter_fpp = 21; - } - - oneof bloom_filter_ndv_opt { - uint64 bloom_filter_ndv = 22; - } - - uint64 dictionary_page_size_limit = 12; - - uint64 data_page_row_count_limit = 18; - - uint64 max_row_group_size = 15; - - string created_by = 16; -} - -enum JoinSide { - LEFT_SIDE = 0; - RIGHT_SIDE = 1; - NONE = 2; -} - -message Precision{ - PrecisionInfo precision_info = 1; - ScalarValue val = 2; -} - -enum PrecisionInfo { - EXACT = 0; - INEXACT = 1; - ABSENT = 2; -} - -message Statistics { - Precision num_rows = 1; - Precision total_byte_size = 2; - repeated ColumnStats column_stats = 3; -} - -message ColumnStats { - Precision min_value = 1; - Precision max_value = 2; - Precision null_count = 3; - Precision distinct_count = 4; -} \ No newline at end of file + syntax = "proto3"; + + package datafusion_common; + + message ColumnRelation { + string relation = 1; + } + + message Column { + string name = 1; + ColumnRelation relation = 2; + } + + message DfField{ + Field field = 1; + ColumnRelation qualifier = 2; + } + + message DfSchema { + repeated DfField columns = 1; + map metadata = 2; + } + + message CsvFormat { + CsvOptions options = 5; + } + + message ParquetFormat { + // Used to be bool enable_pruning = 1; + reserved 1; + TableParquetOptions options = 2; + } + + message AvroFormat {} + + message NdJsonFormat { + JsonOptions options = 1; + } + + + message PrimaryKeyConstraint{ + repeated uint64 indices = 1; + } + + message UniqueConstraint{ + repeated uint64 indices = 1; + } + + message Constraint{ + oneof constraint_mode{ + PrimaryKeyConstraint primary_key = 1; + UniqueConstraint unique = 2; + } + } + + message Constraints{ + repeated Constraint constraints = 1; + } + + enum JoinType { + INNER = 0; + LEFT = 1; + RIGHT = 2; + FULL = 3; + LEFTSEMI = 4; + LEFTANTI = 5; + RIGHTSEMI = 6; + RIGHTANTI = 7; + LEFTMARK = 8; + } + + enum JoinConstraint { + ON = 0; + USING = 1; + } + + message AvroOptions {} + message ArrowOptions {} + + message Schema { + repeated Field columns = 1; + map metadata = 2; + } + + message Field { + // name of the field + string name = 1; + ArrowType arrow_type = 2; + bool nullable = 3; + // for complex data types like structs, unions + repeated Field children = 4; + map metadata = 5; + int64 dict_id = 6; + bool dict_ordered = 7; + } + + message Timestamp{ + TimeUnit time_unit = 1; + string timezone = 2; + } + + enum TimeUnit{ + Second = 0; + Millisecond = 1; + Microsecond = 2; + Nanosecond = 3; + } + + enum IntervalUnit{ + YearMonth = 0; + DayTime = 1; + MonthDayNano = 2; + } + + message Decimal{ + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; + } + + message Decimal256Type{ + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; + } + + message List{ + Field field_type = 1; + } + + message FixedSizeList{ + Field field_type = 1; + int32 list_size = 2; + } + + message Dictionary{ + ArrowType key = 1; + ArrowType value = 2; + } + + message Struct{ + repeated Field sub_field_types = 1; + } + + message Map { + Field field_type = 1; + bool keys_sorted = 2; + } + + enum UnionMode{ + sparse = 0; + dense = 1; + } + + message Union{ + repeated Field union_types = 1; + UnionMode union_mode = 2; + repeated int32 type_ids = 3; + } + + // Used for List/FixedSizeList/LargeList/Struct/Map + message ScalarNestedValue { + message Dictionary { + bytes ipc_message = 1; + bytes arrow_data = 2; + } + + bytes ipc_message = 1; + bytes arrow_data = 2; + Schema schema = 3; + repeated Dictionary dictionaries = 4; + } + + message ScalarTime32Value { + oneof value { + int32 time32_second_value = 1; + int32 time32_millisecond_value = 2; + }; + } + + message ScalarTime64Value { + oneof value { + int64 time64_microsecond_value = 1; + int64 time64_nanosecond_value = 2; + }; + } + + message ScalarTimestampValue { + oneof value { + int64 time_microsecond_value = 1; + int64 time_nanosecond_value = 2; + int64 time_second_value = 3; + int64 time_millisecond_value = 4; + }; + string timezone = 5; + } + + message ScalarDictionaryValue { + ArrowType index_type = 1; + ScalarValue value = 2; + } + + message IntervalDayTimeValue { + int32 days = 1; + int32 milliseconds = 2; + } + + message IntervalMonthDayNanoValue { + int32 months = 1; + int32 days = 2; + int64 nanos = 3; + } + + message UnionField { + int32 field_id = 1; + Field field = 2; + } + + message UnionValue { + // Note that a null union value must have one or more fields, so we + // encode a null UnionValue as one with value_id == 128 + int32 value_id = 1; + ScalarValue value = 2; + repeated UnionField fields = 3; + UnionMode mode = 4; + } + + message ScalarFixedSizeBinary{ + bytes values = 1; + int32 length = 2; + } + + message ScalarValue{ + // was PrimitiveScalarType null_value = 19; + reserved 19; + + oneof value { + // was PrimitiveScalarType null_value = 19; + // Null value of any type + ArrowType null_value = 33; + + bool bool_value = 1; + string utf8_value = 2; + string large_utf8_value = 3; + string utf8_view_value = 23; + int32 int8_value = 4; + int32 int16_value = 5; + int32 int32_value = 6; + int64 int64_value = 7; + uint32 uint8_value = 8; + uint32 uint16_value = 9; + uint32 uint32_value = 10; + uint64 uint64_value = 11; + float float32_value = 12; + double float64_value = 13; + // Literal Date32 value always has a unit of day + int32 date_32_value = 14; + ScalarTime32Value time32_value = 15; + ScalarNestedValue large_list_value = 16; + ScalarNestedValue list_value = 17; + ScalarNestedValue fixed_size_list_value = 18; + ScalarNestedValue struct_value = 32; + ScalarNestedValue map_value = 41; + + Decimal128 decimal128_value = 20; + Decimal256 decimal256_value = 39; + + int64 date_64_value = 21; + int32 interval_yearmonth_value = 24; + + int64 duration_second_value = 35; + int64 duration_millisecond_value = 36; + int64 duration_microsecond_value = 37; + int64 duration_nanosecond_value = 38; + + ScalarTimestampValue timestamp_value = 26; + ScalarDictionaryValue dictionary_value = 27; + bytes binary_value = 28; + bytes large_binary_value = 29; + bytes binary_view_value = 22; + ScalarTime64Value time64_value = 30; + IntervalDayTimeValue interval_daytime_value = 25; + IntervalMonthDayNanoValue interval_month_day_nano = 31; + ScalarFixedSizeBinary fixed_size_binary_value = 34; + UnionValue union_value = 42; + } + } + + message Decimal128{ + bytes value = 1; + int64 p = 2; + int64 s = 3; + } + + message Decimal256{ + bytes value = 1; + int64 p = 2; + int64 s = 3; + } + + // Serialized data type + message ArrowType{ + oneof arrow_type_enum { + EmptyMessage NONE = 1; // arrow::Type::NA + EmptyMessage BOOL = 2; // arrow::Type::BOOL + EmptyMessage UINT8 = 3; // arrow::Type::UINT8 + EmptyMessage INT8 = 4; // arrow::Type::INT8 + EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h + EmptyMessage INT16 = 6; + EmptyMessage UINT32 = 7; + EmptyMessage INT32 = 8; + EmptyMessage UINT64 = 9; + EmptyMessage INT64 = 10 ; + EmptyMessage FLOAT16 = 11 ; + EmptyMessage FLOAT32 = 12 ; + EmptyMessage FLOAT64 = 13 ; + EmptyMessage UTF8 = 14 ; + EmptyMessage UTF8_VIEW = 35; + EmptyMessage LARGE_UTF8 = 32; + EmptyMessage BINARY = 15 ; + EmptyMessage BINARY_VIEW = 34; + int32 FIXED_SIZE_BINARY = 16 ; + EmptyMessage LARGE_BINARY = 31; + EmptyMessage DATE32 = 17 ; + EmptyMessage DATE64 = 18 ; + TimeUnit DURATION = 19; + Timestamp TIMESTAMP = 20 ; + TimeUnit TIME32 = 21 ; + TimeUnit TIME64 = 22 ; + IntervalUnit INTERVAL = 23 ; + Decimal DECIMAL = 24 ; + Decimal256Type DECIMAL256 = 36; + List LIST = 25; + List LARGE_LIST = 26; + FixedSizeList FIXED_SIZE_LIST = 27; + Struct STRUCT = 28; + Union UNION = 29; + Dictionary DICTIONARY = 30; + Map MAP = 33; + } + } + + //Useful for representing an empty enum variant in rust + // E.G. enum example{One, Two(i32)} + // maps to + // message example{ + // oneof{ + // EmptyMessage One = 1; + // i32 Two = 2; + // } + //} + message EmptyMessage{} + + enum CompressionTypeVariant { + GZIP = 0; + BZIP2 = 1; + XZ = 2; + ZSTD = 3; + UNCOMPRESSED = 4; + } + + message JsonWriterOptions { + CompressionTypeVariant compression = 1; + } + + + message CsvWriterOptions { + // Compression type + CompressionTypeVariant compression = 1; + // Optional column delimiter. Defaults to `b','` + string delimiter = 2; + // Whether to write column names as file headers. Defaults to `true` + bool has_header = 3; + // Optional date format for date arrays + string date_format = 4; + // Optional datetime format for datetime arrays + string datetime_format = 5; + // Optional timestamp format for timestamp arrays + string timestamp_format = 6; + // Optional time format for time arrays + string time_format = 7; + // Optional value to represent null + string null_value = 8; + // Optional quote. Defaults to `b'"'` + string quote = 9; + // Optional escape. Defaults to `'\\'` + string escape = 10; + // Optional flag whether to double quotes, instead of escaping. Defaults to `true` + bool double_quote = 11; + } + + // Options controlling CSV format + message CsvOptions { + bytes has_header = 1; // Indicates if the CSV has a header row + bytes delimiter = 2; // Delimiter character as a byte + bytes quote = 3; // Quote character as a byte + bytes escape = 4; // Optional escape character as a byte + CompressionTypeVariant compression = 5; // Compression type + optional uint64 schema_infer_max_rec = 6; // Optional max records for schema inference + string date_format = 7; // Optional date format + string datetime_format = 8; // Optional datetime format + string timestamp_format = 9; // Optional timestamp format + string timestamp_tz_format = 10; // Optional timestamp with timezone format + string time_format = 11; // Optional time format + string null_value = 12; // Optional representation of null value + string null_regex = 13; // Optional representation of null loading regex + bytes comment = 14; // Optional comment character as a byte + bytes double_quote = 15; // Indicates if quotes are doubled + bytes newlines_in_values = 16; // Indicates if newlines are supported in values + bytes terminator = 17; // Optional terminator character as a byte + } + + // Options controlling CSV format + message JsonOptions { + CompressionTypeVariant compression = 1; // Compression type + optional uint64 schema_infer_max_rec = 2; // Optional max records for schema inference + } + + message TableParquetOptions { + ParquetOptions global = 1; + repeated ParquetColumnSpecificOptions column_specific_options = 2; + map key_value_metadata = 3; + } + + message ParquetColumnSpecificOptions { + string column_name = 1; + ParquetColumnOptions options = 2; + } + + message ParquetColumnOptions { + oneof bloom_filter_enabled_opt { + bool bloom_filter_enabled = 1; + } + + oneof encoding_opt { + string encoding = 2; + } + + oneof dictionary_enabled_opt { + bool dictionary_enabled = 3; + } + + oneof compression_opt { + string compression = 4; + } + + oneof statistics_enabled_opt { + string statistics_enabled = 5; + } + + oneof bloom_filter_fpp_opt { + double bloom_filter_fpp = 6; + } + + oneof bloom_filter_ndv_opt { + uint64 bloom_filter_ndv = 7; + } + + oneof max_statistics_size_opt { + uint32 max_statistics_size = 8; + } + } + + message ParquetOptions { + // Regular fields + bool enable_page_index = 1; // default = true + bool pruning = 2; // default = true + bool skip_metadata = 3; // default = true + bool pushdown_filters = 5; // default = false + bool reorder_filters = 6; // default = false + uint64 data_pagesize_limit = 7; // default = 1024 * 1024 + uint64 write_batch_size = 8; // default = 1024 + string writer_version = 9; // default = "1.0" + // bool bloom_filter_enabled = 20; // default = false + bool allow_single_file_parallelism = 23; // default = true + uint64 maximum_parallel_row_group_writers = 24; // default = 1 + uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2 + bool bloom_filter_on_read = 26; // default = true + bool bloom_filter_on_write = 27; // default = false + bool schema_force_view_types = 28; // default = false + bool binary_as_string = 29; // default = false + bool skip_arrow_metadata = 30; // default = false + + oneof metadata_size_hint_opt { + uint64 metadata_size_hint = 4; + } + + oneof compression_opt { + string compression = 10; + } + + oneof dictionary_enabled_opt { + bool dictionary_enabled = 11; + } + + oneof statistics_enabled_opt { + string statistics_enabled = 13; + } + + oneof max_statistics_size_opt { + uint64 max_statistics_size = 14; + } + + oneof column_index_truncate_length_opt { + uint64 column_index_truncate_length = 17; + } + + oneof encoding_opt { + string encoding = 19; + } + + oneof bloom_filter_fpp_opt { + double bloom_filter_fpp = 21; + } + + oneof bloom_filter_ndv_opt { + uint64 bloom_filter_ndv = 22; + } + + uint64 dictionary_page_size_limit = 12; + + uint64 data_page_row_count_limit = 18; + + uint64 max_row_group_size = 15; + + string created_by = 16; + } + + enum JoinSide { + LEFT_SIDE = 0; + RIGHT_SIDE = 1; + NONE = 2; + } + + message Precision{ + PrecisionInfo precision_info = 1; + ScalarValue val = 2; + } + + enum PrecisionInfo { + EXACT = 0; + INEXACT = 1; + ABSENT = 2; + } + + message Statistics { + Precision num_rows = 1; + Precision total_byte_size = 2; + repeated ColumnStats column_stats = 3; + } + + message ColumnStats { + Precision min_value = 1; + Precision max_value = 2; + Precision sum_value = 5; + Precision null_count = 3; + Precision distinct_count = 4; + } \ No newline at end of file