Skip to content

Commit

Permalink
Improve SQL Planner docs (#14669)
Browse files Browse the repository at this point in the history
* Improve SQL Planner docs

* fix docs

* Apply suggestions from code review

Co-authored-by: Jonah Gao <[email protected]>

* Restore builtin term

* Add docs to `is_system_variable`

* clarify type checking

* fix rendering

---------

Co-authored-by: Jonah Gao <[email protected]>
  • Loading branch information
alamb and jonahgao authored Feb 16, 2025
1 parent 78e8493 commit 6d517fd
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 45 deletions.
6 changes: 3 additions & 3 deletions datafusion/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,9 @@
//! 1. The query string is parsed to an Abstract Syntax Tree (AST)
//! [`Statement`] using [sqlparser].
//!
//! 2. The AST is converted to a [`LogicalPlan`] and logical
//! expressions [`Expr`]s to compute the desired result by the
//! [`SqlToRel`] planner.
//! 2. The AST is converted to a [`LogicalPlan`] and logical expressions
//! [`Expr`]s to compute the desired result by [`SqlToRel`]. This phase
//! also includes name and type resolution ("binding").
//!
//! [`Statement`]: https://docs.rs/sqlparser/latest/sqlparser/ast/enum.Statement.html
//!
Expand Down
101 changes: 60 additions & 41 deletions datafusion/expr/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,18 @@ use sqlparser::ast;

use crate::{AggregateUDF, Expr, GetFieldAccess, ScalarUDF, TableSource, WindowUDF};

/// Provides the `SQL` query planner meta-data about tables and
/// functions referenced in SQL statements, without a direct dependency on other
/// DataFusion structures
/// Provides the `SQL` query planner meta-data about tables and
/// functions referenced in SQL statements, without a direct dependency on the
/// `datafusion` Catalog structures such as [`TableProvider`]
///
/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html
pub trait ContextProvider {
/// Getter for a datasource
/// Returns a table by reference, if it exists
fn get_table_source(&self, name: TableReference) -> Result<Arc<dyn TableSource>>;

/// Return the type of a file based on its extension (e.g. `.parquet`)
///
/// This is used to plan `COPY` statements
fn get_file_type(&self, _ext: &str) -> Result<Arc<dyn FileType>> {
not_impl_err!("Registered file types are not supported")
}
Expand All @@ -49,11 +54,20 @@ pub trait ContextProvider {
not_impl_err!("Table Functions are not supported")
}

/// This provides a worktable (an intermediate table that is used to store the results of a CTE during execution)
/// We don't directly implement this in the logical plan's ['SqlToRel`]
/// because the sql code needs access to a table that contains execution-related types that can't be a direct dependency
/// of the sql crate (namely, the `CteWorktable`).
/// Provides an intermediate table that is used to store the results of a CTE during execution
///
/// CTE stands for "Common Table Expression"
///
/// # Notes
/// We don't directly implement this in [`SqlToRel`] as implementing this function
/// often requires access to a table that contains
/// execution-related types that can't be a direct dependency
/// of the sql crate (for example [`CteWorkTable`]).
///
/// The [`ContextProvider`] provides a way to "hide" this dependency.
///
/// [`SqlToRel`]: https://docs.rs/datafusion/latest/datafusion/sql/planner/struct.SqlToRel.html
/// [`CteWorkTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/cte_worktable/struct.CteWorkTable.html
fn create_cte_work_table(
&self,
_name: &str,
Expand All @@ -62,39 +76,44 @@ pub trait ContextProvider {
not_impl_err!("Recursive CTE is not implemented")
}

/// Getter for expr planners
/// Return [`ExprPlanner`] extensions for planning expressions
fn get_expr_planners(&self) -> &[Arc<dyn ExprPlanner>] {
&[]
}

/// Getter for the data type planner
/// Return [`TypePlanner`] extensions for planning data types
fn get_type_planner(&self) -> Option<Arc<dyn TypePlanner>> {
None
}

/// Getter for a UDF description
/// Return the scalar function with a given name, if any
fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>>;
/// Getter for a UDAF description

/// Return the aggregate function with a given name, if any
fn get_aggregate_meta(&self, name: &str) -> Option<Arc<AggregateUDF>>;
/// Getter for a UDWF

/// Return the window function with a given name, if any
fn get_window_meta(&self, name: &str) -> Option<Arc<WindowUDF>>;
/// Getter for system/user-defined variable type

/// Return the system/user-defined variable type, if any
///
/// A user defined variable is typically accessed via `@var_name`
fn get_variable_type(&self, variable_names: &[String]) -> Option<DataType>;

/// Get configuration options
/// Return overall configuration options
fn options(&self) -> &ConfigOptions;

/// Get all user defined scalar function names
/// Return all scalar function names
fn udf_names(&self) -> Vec<String>;

/// Get all user defined aggregate function names
/// Return all aggregate function names
fn udaf_names(&self) -> Vec<String>;

/// Get all user defined window function names
/// Return all window function names
fn udwf_names(&self) -> Vec<String>;
}

/// This trait allows users to customize the behavior of the SQL planner
/// Customize planning of SQL AST expressions to [`Expr`]s
pub trait ExprPlanner: Debug + Send + Sync {
/// Plan the binary operation between two expressions, returns original
/// BinaryExpr if not possible
Expand All @@ -106,9 +125,9 @@ pub trait ExprPlanner: Debug + Send + Sync {
Ok(PlannerResult::Original(expr))
}

/// Plan the field access expression
/// Plan the field access expression, such as `foo.bar`
///
/// returns original FieldAccessExpr if not possible
/// returns original [`RawFieldAccessExpr`] if not possible
fn plan_field_access(
&self,
expr: RawFieldAccessExpr,
Expand All @@ -117,7 +136,7 @@ pub trait ExprPlanner: Debug + Send + Sync {
Ok(PlannerResult::Original(expr))
}

/// Plan the array literal, returns OriginalArray if not possible
/// Plan an array literal, such as `[1, 2, 3]`
///
/// Returns original expression arguments if not possible
fn plan_array_literal(
Expand All @@ -128,13 +147,14 @@ pub trait ExprPlanner: Debug + Send + Sync {
Ok(PlannerResult::Original(exprs))
}

// Plan the POSITION expression, e.g., POSITION(<expr> in <expr>)
// returns origin expression arguments if not possible
/// Plan a `POSITION` expression, such as `POSITION(<expr> in <expr>)`
///
/// Returns original expression arguments if not possible
fn plan_position(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
Ok(PlannerResult::Original(args))
}

/// Plan the dictionary literal `{ key: value, ...}`
/// Plan a dictionary literal, such as `{ key: value, ...}`
///
/// Returns original expression arguments if not possible
fn plan_dictionary_literal(
Expand All @@ -145,27 +165,26 @@ pub trait ExprPlanner: Debug + Send + Sync {
Ok(PlannerResult::Original(expr))
}

/// Plan an extract expression, e.g., `EXTRACT(month FROM foo)`
/// Plan an extract expression, such as `EXTRACT(month FROM foo)`
///
/// Returns original expression arguments if not possible
fn plan_extract(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
Ok(PlannerResult::Original(args))
}

/// Plan an substring expression, e.g., `SUBSTRING(<expr> [FROM <expr>] [FOR <expr>])`
/// Plan a substring expression, such as `SUBSTRING(<expr> [FROM <expr>] [FOR <expr>])`
///
/// Returns original expression arguments if not possible
fn plan_substring(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
Ok(PlannerResult::Original(args))
}

/// Plans a struct `struct(expression1[, ..., expression_n])`
/// literal based on the given input expressions.
/// This function takes a vector of expressions and a boolean flag indicating whether
/// the struct uses the optional name
/// Plans a struct literal, such as `{'field1' : expr1, 'field2' : expr2, ...}`
///
/// This function takes a vector of expressions and a boolean flag
/// indicating whether the struct uses the optional name
///
/// Returns a `PlannerResult` containing either the planned struct expressions or the original
/// input expressions if planning is not possible.
/// Returns the original input expressions if planning is not possible.
fn plan_struct_literal(
&self,
args: Vec<Expr>,
Expand All @@ -174,26 +193,26 @@ pub trait ExprPlanner: Debug + Send + Sync {
Ok(PlannerResult::Original(args))
}

/// Plans an overlay expression eg `overlay(str PLACING substr FROM pos [FOR count])`
/// Plans an overlay expression, such as `overlay(str PLACING substr FROM pos [FOR count])`
///
/// Returns original expression arguments if not possible
fn plan_overlay(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
Ok(PlannerResult::Original(args))
}

/// Plan a make_map expression, e.g., `make_map(key1, value1, key2, value2, ...)`
/// Plans a `make_map` expression, such as `make_map(key1, value1, key2, value2, ...)`
///
/// Returns original expression arguments if not possible
fn plan_make_map(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
Ok(PlannerResult::Original(args))
}

/// Plans compound identifier eg `db.schema.table` for non-empty nested names
/// Plans compound identifier such as `db.schema.table` for non-empty nested names
///
/// Note:
/// # Note:
/// Currently compound identifier for outer query schema is not supported.
///
/// Returns planned expression
/// Returns original expression if not possible
fn plan_compound_identifier(
&self,
_field: &Field,
Expand All @@ -205,7 +224,7 @@ pub trait ExprPlanner: Debug + Send + Sync {
)
}

/// Plans `ANY` expression, e.g., `expr = ANY(array_expr)`
/// Plans `ANY` expression, such as `expr = ANY(array_expr)`
///
/// Returns the original binary expression if not possible
fn plan_any(&self, expr: RawBinaryExpr) -> Result<PlannerResult<RawBinaryExpr>> {
Expand Down Expand Up @@ -256,9 +275,9 @@ pub enum PlannerResult<T> {
Original(T),
}

/// This trait allows users to customize the behavior of the data type planning
/// Customize planning SQL types to DataFusion (Arrow) types.
pub trait TypePlanner: Debug + Send + Sync {
/// Plan SQL type to DataFusion data type
/// Plan SQL [`ast::DataType`] to DataFusion [`DataType`]
///
/// Returns None if not possible
fn plan_type(&self, _sql_type: &ast::DataType) -> Result<Option<DataType>> {
Expand Down
6 changes: 6 additions & 0 deletions datafusion/expr/src/var_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ pub trait VarProvider: std::fmt::Debug {
fn get_type(&self, var_names: &[String]) -> Option<DataType>;
}

/// Reports whether a variable reference names a "system" variable, such as
/// `@@version`.
///
/// Only the first element of `variable_names` is inspected: the result is
/// `true` when that element begins with the `@@` prefix, and `false` when the
/// slice is empty or the first element does not begin with `@@`.
///
/// See [`SessionContext::register_variable`] for more details
///
/// [`SessionContext::register_variable`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.register_variable
pub fn is_system_variables(variable_names: &[String]) -> bool {
    variable_names
        .first()
        .is_some_and(|leading| leading.starts_with("@@"))
}
Expand Down
19 changes: 18 additions & 1 deletion datafusion/sql/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,24 @@ impl PlannerContext {
}
}

/// SQL query planner
/// SQL query planner and binder
///
/// This struct is used to convert a SQL AST into a [`LogicalPlan`].
///
/// You can control the behavior of the planner by providing [`ParserOptions`].
///
/// It performs the following tasks:
///
/// 1. Name and type resolution (called "binding" in other systems). This
/// phase looks up table and column names using the [`ContextProvider`].
/// 2. Mechanical translation of the AST into a [`LogicalPlan`].
///
/// It does not perform type coercion, or perform optimization, which are done
/// by subsequent passes.
///
/// Key interfaces are:
/// * [`Self::sql_statement_to_plan`]: Convert a statement (e.g. `SELECT ...`) into a [`LogicalPlan`]
/// * [`Self::sql_to_expr`]: Convert an expression (e.g. `1 + 2`) into an [`Expr`]
pub struct SqlToRel<'a, S: ContextProvider> {
pub(crate) context_provider: &'a S,
pub(crate) options: ParserOptions,
Expand Down

0 comments on commit 6d517fd

Please sign in to comment.