From 6d517fd4b703a223a2fd72ba0c9757d224dea45a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 16 Feb 2025 13:52:22 -0500 Subject: [PATCH] Improve SQL Planner docs (#14669) * Improve SQL Planner docs * fix docs * Apply suggestions from code review Co-authored-by: Jonah Gao * Restore builtin term * Add docs to `is_system_variable` * clarify type checking * fix rendering --------- Co-authored-by: Jonah Gao --- datafusion/core/src/lib.rs | 6 +- datafusion/expr/src/planner.rs | 101 +++++++++++++++++----------- datafusion/expr/src/var_provider.rs | 6 ++ datafusion/sql/src/planner.rs | 19 +++++- 4 files changed, 87 insertions(+), 45 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index d6e092cd2ee7..f4aa366500ef 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -229,9 +229,9 @@ //! 1. The query string is parsed to an Abstract Syntax Tree (AST) //! [`Statement`] using [sqlparser]. //! -//! 2. The AST is converted to a [`LogicalPlan`] and logical -//! expressions [`Expr`]s to compute the desired result by the -//! [`SqlToRel`] planner. +//! 2. The AST is converted to a [`LogicalPlan`] and logical expressions +//! [`Expr`]s to compute the desired result by [`SqlToRel`]. This phase +//! also includes name and type resolution ("binding"). //! //! [`Statement`]: https://docs.rs/sqlparser/latest/sqlparser/ast/enum.Statement.html //! diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index 42047e8e6caa..04cc26c910cb 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -29,13 +29,18 @@ use sqlparser::ast; use crate::{AggregateUDF, Expr, GetFieldAccess, ScalarUDF, TableSource, WindowUDF}; -/// Provides the `SQL` query planner meta-data about tables and -/// functions referenced in SQL statements, without a direct dependency on other -/// DataFusion structures +/// Provides the `SQL` query planner meta-data about tables and +/// functions referenced in SQL statements, without a direct dependency on the +/// `datafusion` Catalog structures such as [`TableProvider`] +/// +/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html pub trait ContextProvider { - /// Getter for a datasource + /// Returns a table by reference, if it exists fn get_table_source(&self, name: TableReference) -> Result>; + /// Return the type of a file based on its extension (e.g. `.parquet`) + /// + /// This is used to plan `COPY` statements fn get_file_type(&self, _ext: &str) -> Result> { not_impl_err!("Registered file types are not supported") } @@ -49,11 +54,20 @@ pub trait ContextProvider { not_impl_err!("Table Functions are not supported") } - /// This provides a worktable (an intermediate table that is used to store the results of a CTE during execution) - /// We don't directly implement this in the logical plan's ['SqlToRel`] - /// because the sql code needs access to a table that contains execution-related types that can't be a direct dependency - /// of the sql crate (namely, the `CteWorktable`). + /// Provides an intermediate table that is used to store the results of a CTE during execution + /// + /// CTE stands for "Common Table Expression" + /// + /// # Notes + /// We don't directly implement this in [`SqlToRel`] as implementing this function + /// often requires access to a table that contains + /// execution-related types that can't be a direct dependency + /// of the sql crate (for example [`CteWorkTable`]). + /// /// The [`ContextProvider`] provides a way to "hide" this dependency. + /// + /// [`SqlToRel`]: https://docs.rs/datafusion/latest/datafusion/sql/planner/struct.SqlToRel.html + /// [`CteWorkTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/cte_worktable/struct.CteWorkTable.html fn create_cte_work_table( &self, _name: &str, @@ -62,39 +76,44 @@ pub trait ContextProvider { not_impl_err!("Recursive CTE is not implemented") } - /// Getter for expr planners + /// Return [`ExprPlanner`] extensions for planning expressions fn get_expr_planners(&self) -> &[Arc] { &[] } - /// Getter for the data type planner + /// Return [`TypePlanner`] extensions for planning data types fn get_type_planner(&self) -> Option> { None } - /// Getter for a UDF description + /// Return the scalar function with a given name, if any fn get_function_meta(&self, name: &str) -> Option>; - /// Getter for a UDAF description + + /// Return the aggregate function with a given name, if any fn get_aggregate_meta(&self, name: &str) -> Option>; - /// Getter for a UDWF + + /// Return the window function with a given name, if any fn get_window_meta(&self, name: &str) -> Option>; - /// Getter for system/user-defined variable type + + /// Return the system/user-defined variable type, if any + /// + /// A user defined variable is typically accessed via `@var_name` fn get_variable_type(&self, variable_names: &[String]) -> Option; - /// Get configuration options + /// Return overall configuration options fn options(&self) -> &ConfigOptions; - /// Get all user defined scalar function names + /// Return all scalar function names fn udf_names(&self) -> Vec; - /// Get all user defined aggregate function names + /// Return all aggregate function names fn udaf_names(&self) -> Vec; - /// Get all user defined window function names + /// Return all window function names fn udwf_names(&self) -> Vec; } -/// This trait allows users to customize the behavior of the SQL planner +/// Customize planning of SQL AST expressions to [`Expr`]s pub trait ExprPlanner: Debug + Send + Sync { /// Plan the binary operation between two expressions, returns original /// BinaryExpr if not possible @@ -106,9 +125,9 @@ pub trait ExprPlanner: Debug + Send + Sync { Ok(PlannerResult::Original(expr)) } - /// Plan the field access expression + /// Plan the field access expression, such as `foo.bar` /// - /// returns original FieldAccessExpr if not possible + /// returns original [`RawFieldAccessExpr`] if not possible fn plan_field_access( &self, expr: RawFieldAccessExpr, @@ -117,7 +136,7 @@ pub trait ExprPlanner: Debug + Send + Sync { Ok(PlannerResult::Original(expr)) } - /// Plan the array literal, returns OriginalArray if not possible + /// Plan an array literal, such as `[1, 2, 3]` /// /// Returns origin expression arguments if not possible fn plan_array_literal( @@ -128,13 +147,14 @@ pub trait ExprPlanner: Debug + Send + Sync { Ok(PlannerResult::Original(exprs)) } - // Plan the POSITION expression, e.g., POSITION( in ) - // returns origin expression arguments if not possible + /// Plan a `POSITION` expression, such as `POSITION( in )` + /// + /// returns origin expression arguments if not possible fn plan_position(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } - /// Plan the dictionary literal `{ key: value, ...}` + /// Plan a dictionary literal, such as `{ key: value, ...}` /// /// Returns origin expression arguments if not possible fn plan_dictionary_literal( @@ -145,27 +165,26 @@ pub trait ExprPlanner: Debug + Send + Sync { Ok(PlannerResult::Original(expr)) } - /// Plan an extract expression, e.g., `EXTRACT(month FROM foo)` + /// Plan an extract expression, such as`EXTRACT(month FROM foo)` /// /// Returns origin expression arguments if not possible fn plan_extract(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } - /// Plan an substring expression, e.g., `SUBSTRING( [FROM ] [FOR ])` + /// Plan an substring expression, such as `SUBSTRING( [FROM ] [FOR ])` /// /// Returns origin expression arguments if not possible fn plan_substring(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } - /// Plans a struct `struct(expression1[, ..., expression_n])` - /// literal based on the given input expressions. - /// This function takes a vector of expressions and a boolean flag indicating whether - /// the struct uses the optional name + /// Plans a struct literal, such as `{'field1' : expr1, 'field2' : expr2, ...}` + /// + /// This function takes a vector of expressions and a boolean flag + /// indicating whether the struct uses the optional name /// - /// Returns a `PlannerResult` containing either the planned struct expressions or the original - /// input expressions if planning is not possible. + /// Returns the original input expressions if planning is not possible. fn plan_struct_literal( &self, args: Vec, @@ -174,26 +193,26 @@ pub trait ExprPlanner: Debug + Send + Sync { Ok(PlannerResult::Original(args)) } - /// Plans an overlay expression eg `overlay(str PLACING substr FROM pos [FOR count])` + /// Plans an overlay expression, such as `overlay(str PLACING substr FROM pos [FOR count])` /// /// Returns origin expression arguments if not possible fn plan_overlay(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } - /// Plan a make_map expression, e.g., `make_map(key1, value1, key2, value2, ...)` + /// Plans a `make_map` expression, such as `make_map(key1, value1, key2, value2, ...)` /// /// Returns origin expression arguments if not possible fn plan_make_map(&self, args: Vec) -> Result>> { Ok(PlannerResult::Original(args)) } - /// Plans compound identifier eg `db.schema.table` for non-empty nested names + /// Plans compound identifier such as `db.schema.table` for non-empty nested names /// - /// Note: + /// # Note: /// Currently compound identifier for outer query schema is not supported. /// - /// Returns planned expression + /// Returns original expression if not possible fn plan_compound_identifier( &self, _field: &Field, @@ -205,7 +224,7 @@ pub trait ExprPlanner: Debug + Send + Sync { ) } - /// Plans `ANY` expression, e.g., `expr = ANY(array_expr)` + /// Plans `ANY` expression, such as `expr = ANY(array_expr)` /// /// Returns origin binary expression if not possible fn plan_any(&self, expr: RawBinaryExpr) -> Result> { @@ -256,9 +275,9 @@ pub enum PlannerResult { Original(T), } -/// This trait allows users to customize the behavior of the data type planning +/// Customize planning SQL types to DataFusion (Arrow) types. pub trait TypePlanner: Debug + Send + Sync { - /// Plan SQL type to DataFusion data type + /// Plan SQL [`ast::DataType`] to DataFusion [`DataType`] /// /// Returns None if not possible fn plan_type(&self, _sql_type: &ast::DataType) -> Result> { diff --git a/datafusion/expr/src/var_provider.rs b/datafusion/expr/src/var_provider.rs index e00cf7407237..708cd576c3ff 100644 --- a/datafusion/expr/src/var_provider.rs +++ b/datafusion/expr/src/var_provider.rs @@ -38,6 +38,12 @@ pub trait VarProvider: std::fmt::Debug { fn get_type(&self, var_names: &[String]) -> Option; } +/// Returns true if the specified string is a "system" variable such as +/// `@@version` +/// +/// See [`SessionContext::register_variable`] for more details +/// +/// [`SessionContext::register_variable`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.register_variable pub fn is_system_variables(variable_names: &[String]) -> bool { !variable_names.is_empty() && variable_names[0].get(0..2) == Some("@@") } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index a3ac831a1f78..5fb6ef913d8c 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -224,7 +224,24 @@ impl PlannerContext { } } -/// SQL query planner +/// SQL query planner and binder +/// +/// This struct is used to convert a SQL AST into a [`LogicalPlan`]. +/// +/// You can control the behavior of the planner by providing [`ParserOptions`]. +/// +/// It performs the following tasks: +/// +/// 1. Name and type resolution (called "binding" in other systems). This +/// phase looks up table and column names using the [`ContextProvider`]. +/// 2. Mechanical translation of the AST into a [`LogicalPlan`]. +/// +/// It does not perform type coercion, or perform optimization, which are done +/// by subsequent passes. +/// +/// Key interfaces are: +/// * [`Self::sql_statement_to_plan`]: Convert a statement (e.g. `SELECT ...`) into a [`LogicalPlan`] +/// * [`Self::sql_to_expr`]: Convert an expression (e.g. `1 + 2`) into an [`Expr`] pub struct SqlToRel<'a, S: ContextProvider> { pub(crate) context_provider: &'a S, pub(crate) options: ParserOptions,