diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 53d1f26..e4440c9 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -23,10 +23,10 @@ jobs:
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- - run: cargo rustdoc -p datafusion-federation -- --cfg docsrs
+ - run: cargo rustdoc -p datafusion-flight-sql-server -- --cfg docsrs
- run: chmod -c -R +rX "target/doc"
- run: touch target/doc/index.html
- - run: echo "" > target/doc/index.html
+ - run: echo "" > target/doc/index.html
- if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: actions/upload-pages-artifact@v3
with:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0792434..5eb06a7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -70,4 +70,4 @@ jobs:
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- run: cargo build --all
- - run: cargo package -p datafusion-federation --allow-dirty
+ - run: cargo package -p datafusion-flight-sql-server --allow-dirty
diff --git a/Cargo.toml b/Cargo.toml
index 4b2cf85..aa6a0d0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,6 @@
resolver = "2"
members = [
- "datafusion-federation",
"datafusion-flight-sql-server",
"datafusion-flight-sql-table-provider",
]
@@ -12,16 +11,15 @@ version = "0.3.5"
edition = "2021"
license = "Apache-2.0"
readme = "README.md"
-repository = "https://github.com/datafusion-contrib/datafusion-federation"
+repository = "https://github.com/datafusion-contrib/datafusion-flight-sql-server"
[workspace.dependencies]
arrow = "53.3"
arrow-flight = { version = "53.3", features = ["flight-sql-experimental"] }
arrow-json = "53.3"
-async-stream = "0.3.5"
async-trait = "0.1.83"
datafusion = "44.0.0"
-datafusion-federation = { path = "./datafusion-federation", version = "0.3.5" }
+datafusion-federation = { version = "0.3.5" }
datafusion-substrait = "44.0.0"
futures = "0.3.31"
tokio = { version = "1.41", features = ["full"] }
diff --git a/README.md b/README.md
index 4891b12..6265e42 100644
--- a/README.md
+++ b/README.md
@@ -1,138 +1,52 @@
-# DataFusion Federation
-
-[![crates.io](https://img.shields.io/crates/v/datafusion-federation.svg)](https://crates.io/crates/datafusion-federation)
-[![docs.rs](https://docs.rs/datafusion-federation/badge.svg)](https://docs.rs/datafusion-federation)
-
-DataFusion Federation allows
-[DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a
-query plan by a remote execution engine.
-
- ┌────────────────┐
- ┌────────────┐ │ Remote DBMS(s) │
- SQL Query ───> │ DataFusion │ ───> │ ( execution │
- └────────────┘ │ happens here ) │
- └────────────────┘
-
-The goal is to allow resolving queries across remote query engines while
-pushing down as much compute as possible to the remote database(s). This allows
-execution to happen as close to the storage as possible. This concept is
-referred to as 'query federation'.
-
-> [!TIP]
-> This repository implements the federation framework itself. If you want to
-> connect to a specific database, check out the compatible providers available
-> in
-> [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/).
-
-## Usage
-
-Check out the [examples](./datafusion-federation/examples/) to get a feel for
-how it works.
-
-For a complete step-by-step example of how federation works, you can check the
-example [here](./datafusion-federation/examples/df-csv-advanced.rs).
-
-## Potential use-cases:
-
-- Querying across SQLite, MySQL, PostgreSQL, ...
-- Pushing down SQL or [Substrait](https://substrait.io/) plans.
-- DataFusion -> Flight SQL -> DataFusion
-- ..
-
-## Design concept
-
-Say you have a query plan as follows:
-
- ┌────────────┐
- │ Join │
- └────────────┘
- ▲
- ┌───────┴────────┐
- ┌────────────┐ ┌────────────┐
- │ Scan A │ │ Join │
- └────────────┘ └────────────┘
- ▲
- ┌───────┴────────┐
- ┌────────────┐ ┌────────────┐
- │ Scan B │ │ Scan C │
- └────────────┘ └────────────┘
-
-DataFusion Federation will identify the largest possible sub-plans that
-can be executed by an external database:
-
- ┌────────────┐ Optimizer recognizes
- │ Join │ that B and C are
- └────────────┘ available in an
- ▲ external database
- ┌──────────────┴────────┐
- │ ┌ ─ ─ ─ ─ ─ ─ ┴ ─ ── ─ ─ ─ ─ ─┐
- ┌────────────┐ ┌────────────┐ │
- │ Scan A │ │ │ Join │
- └────────────┘ └────────────┘ │
- │ ▲
- ┌───────┴────────┐ │
- ┌────────────┐ ┌────────────┐ │
- ││ Scan B │ │ Scan C │
- └────────────┘ └────────────┘ │
- ─ ── ─ ─ ── ─ ─ ─ ─ ─ ─ ─ ── ─ ┘
-
-The sub-plans are cut out and replaced by an opaque federation node in the plan:
-
- ┌────────────┐
- │ Join │
- └────────────┘ Rewritten Plan
- ▲
- ┌────────┴───────────┐
- │ │
- ┌────────────┐ ┏━━━━━━━━━━━━━━━━━━┓
- │ Scan A │ ┃ Scan B+C ┃
- └────────────┘ ┃ (TableProvider ┃
- ┃ that can execute ┃
- ┃ sub-plan in an ┃
- ┃external database)┃
- ┗━━━━━━━━━━━━━━━━━━┛
-
-Different databases may have different query languages and execution
-capabilities. To accommodate for this, we allow each 'federation provider' to
-self-determine what part of a sub-plan it will actually federate. This is done
-by letting each federation provider define its own optimizer rule. When a
-sub-plan is 'cut out' of the overall plan, it is first passed the federation
-provider's optimizer rule. This optimizer rule determines the part of the plan
-that is cut out, based on the execution capabilities of the database it
-represents.
-
-## Implementation
-
-A remote database is represented by the `FederationProvider` trait. To identify
-table scans that are available in the same database, they implement
-`FederatedTableSource` trait. This trait allows lookup of the corresponding
-`FederationProvider`.
-
-Identifying sub-plans to federate is done by the `FederationOptimizerRule`.
-This rule needs to be registered in your DataFusion SessionState. One easy way
-to do this is using `default_session_state`. To do its job, the
-`FederationOptimizerRule` currently requires that all TableProviders that need
-to be federated are `FederatedTableProviderAdaptor`s. The
-`FederatedTableProviderAdaptor` also has a fallback mechanism that allows
-implementations to fallback to a 'vanilla' TableProvider in case the
-`FederationOptimizerRule` isn't registered.
-
-The `FederationProvider` can provide a `compute_context`. This allows it to
-differentiate between multiple remote execution context of the same type. For
-example two different mysql instances, database schemas, access level, etc. The
-`FederationProvider` also returns the `Optimizer` that is allows it to
-self-determine what part of a sub-plan it can federate.
-
-The `sql` module implements a generic `FederationProvider` for SQL execution
-engines. A specific SQL engine implements the `SQLExecutor` trait for its
-engine specific execution. There are a number of compatible providers available
-in
-[datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/).
-
-## Status
-
-The project is in alpha status. Contributions welcome; land a PR = commit
-access.
-
-- [Docs (release)](https://docs.rs/datafusion-federation)
-- [Docs (main)](https://datafusion-contrib.github.io/datafusion-federation/)
+# DataFusion Flight SQL Server
+
+The `datafusion-flight-sql-server` is a Flight SQL server that implements the
+necessary endpoints to use DataFusion as the query engine.
+
+## Getting Started
+
+To use `datafusion-flight-sql-server` in your Rust project, run:
+
+```sh
+$ cargo add datafusion-flight-sql-server
+```
+
+## Example
+
+Here's a basic example of setting up a Flight SQL server:
+
+```rust
+use datafusion_flight_sql_server::service::FlightSqlService;
+use datafusion::{
+ execution::{
+ context::SessionContext,
+ options::CsvReadOptions,
+ },
+};
+
+async {
+ let dsn: String = "0.0.0.0:50051".to_string();
+ let remote_ctx = SessionContext::new();
+ remote_ctx
+ .register_csv("test", "./examples/test.csv", CsvReadOptions::new())
+ .await.expect("Register csv");
+
+ FlightSqlService::new(remote_ctx.state()).serve(dsn.clone())
+ .await
+ .expect("Run flight sql service");
+
+};
+```
+
+This example sets up a Flight SQL server listening on `0.0.0.0:50051`.
+
+
+## Acknowledgments
+
+This crate was originally developed as part of the
+[datafusion-federation](https://github.com/datafusion-contrib/datafusion-federation/)
+repository.
+
+For more details about the original repository, please visit
+[datafusion-federation](https://github.com/datafusion-contrib/datafusion-federation/).
+
diff --git a/datafusion-federation/CHANGELOG.md b/datafusion-federation/CHANGELOG.md
deleted file mode 100644
index ba3d0a6..0000000
--- a/datafusion-federation/CHANGELOG.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Changelog
-
-All notable changes to this project will be documented in this file.
-
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-
-## [Unreleased]
-
-## [0.3.5](https://github.com/datafusion-contrib/datafusion-federation/compare/datafusion-federation-v0.3.4...datafusion-federation-v0.3.5) - 2025-01-20
-
-### Other
-
-- Use the Dialect and Unparser constructor when using the plan_to_sql function. (#105)
-
-## [0.3.4](https://github.com/datafusion-contrib/datafusion-federation/compare/datafusion-federation-v0.3.3...datafusion-federation-v0.3.4) - 2025-01-12
-
-### Other
-
-- upgrade datafusion to 44 (#103)
-
-## [0.3.3](https://github.com/datafusion-contrib/datafusion-federation/compare/datafusion-federation-v0.3.2...datafusion-federation-v0.3.3) - 2025-01-04
-
-### Fixed
-
-- handle `LogicalPlan::Limit` separately to preserve skip and offset in `rewrite_table_scans` (#101)
-
-## [0.3.2](https://github.com/datafusion-contrib/datafusion-federation/compare/datafusion-federation-v0.3.1...datafusion-federation-v0.3.2) - 2024-12-05
-
-### Other
-
-- Release plz action: install required dependencies ([#85](https://github.com/datafusion-contrib/datafusion-federation/pull/85))
diff --git a/datafusion-federation/Cargo.toml b/datafusion-federation/Cargo.toml
deleted file mode 100644
index 8844cd2..0000000
--- a/datafusion-federation/Cargo.toml
+++ /dev/null
@@ -1,43 +0,0 @@
-[package]
-name = "datafusion-federation"
-version.workspace = true
-edition.workspace = true
-license.workspace = true
-readme.workspace = true
-repository.workspace = true
-description = "Datafusion federation."
-
-[lib]
-name = "datafusion_federation"
-path = "src/lib.rs"
-
-[package.metadata.docs.rs]
-# Whether to pass `--all-features` to Cargo (default: false)
-all-features = true
-# Whether to pass `--no-default-features` to Cargo (default: false)
-no-default-features = true
-
-[features]
-sql = []
-
-[dependencies]
-futures.workspace = true
-async-trait.workspace = true
-datafusion.workspace = true
-async-stream.workspace = true
-arrow-json.workspace = true
-
-[dev-dependencies]
-tokio.workspace = true
-tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
-tracing = "0.1.40"
-
-[[example]]
-name = "df-csv"
-path = "examples/df-csv.rs"
-required-features = ["sql"]
-
-[[example]]
-name = "df-csv-advanced"
-path = "examples/df-csv-advanced.rs"
-required-features = ["sql"]
diff --git a/datafusion-federation/examples/data/test.csv b/datafusion-federation/examples/data/test.csv
deleted file mode 100644
index 62d0b11..0000000
--- a/datafusion-federation/examples/data/test.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-foo,bar
-a,1
-b,2
-c,3
diff --git a/datafusion-federation/examples/data/test2.csv b/datafusion-federation/examples/data/test2.csv
deleted file mode 100644
index 7196406..0000000
--- a/datafusion-federation/examples/data/test2.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-foo,bar
-a,1
-b,2
-c,3
-d,4
-e,5
-f,6
diff --git a/datafusion-federation/examples/df-csv-advanced.rs b/datafusion-federation/examples/df-csv-advanced.rs
deleted file mode 100644
index e7b709f..0000000
--- a/datafusion-federation/examples/df-csv-advanced.rs
+++ /dev/null
@@ -1,148 +0,0 @@
-mod shared;
-
-use std::sync::Arc;
-
-use datafusion::{
- execution::{
- context::SessionContext, options::CsvReadOptions, session_state::SessionStateBuilder,
- },
- optimizer::Optimizer,
-};
-
-use datafusion_federation::{
- sql::{MultiSchemaProvider, SQLFederationProvider, SQLSchemaProvider},
- FederatedQueryPlanner, FederationOptimizerRule,
-};
-
-use shared::{overwrite_default_schema, MockPostgresExecutor, MockSqliteExecutor};
-
-const CSV_PATH_SQLITE: &str = "./examples/data/test.csv";
-const CSV_PATH_POSTGRES: &str = "./examples/data/test2.csv";
-const TABLE_NAME_SQLITE: &str = "test_sqlite";
-const TABLE_NAME_POSTGRES: &str = "test_pg";
-
-#[tokio::main]
-async fn main() {
- // This example demonstrates how DataFusion, with federation enabled, to
- // executes a query using two execution engines.
- //
- // The query used in this example is:
- //
- // ```sql
- // SELECT t.*
- // FROM test_pg AS t
- // JOIN test_sqlite AS a
- // ON t.foo = a.foo
- // ```
- //
- // In this query, `test_pg` is a table in a PostgreSQL database, and `test_sqlite` is a table
- // in an SQLite database. DataFusion Federation will identify the sub-plans that can be
- // executed by external databases. In this example, there will be only two sub-plans.
- //
- // ┌────────────┐
- // │ Join │
- // └────────────┘
- // ▲
- // ┌───────┴──────────┐
- // ┌──────────────┐ ┌────────────┐
- // │ test_sqlite │ │ Join │
- // └──────────────┘ └────────────┘
- // ▲
- // |
- // ┌────────────┐
- // │ test_pg │
- // └────────────┘
- //
- // Note: For the purpose of this example, both the SQLite and PostgreSQL engines are dummy
- // engines that use DataFusion SessionContexts with registered CSV files. However, this setup
- // works fine for demonstration purposes. If you'd like to use actual SQLite and PostgreSQL
- // engines, you can check out the table-providers repository at
- // https://github.com/datafusion-contrib/datafusion-table-providers/.
-
- /////////////////////
- // Remote sqlite DB
- /////////////////////
- // Create a datafusion::SessionContext and register a csv file as a table in that context
- // This will be passed to the MockSqliteExecutor and acts as a dummy sqlite engine.
- let sqlite_remote_ctx = Arc::new(SessionContext::new());
- // Registers a CSV file
- sqlite_remote_ctx
- .register_csv(TABLE_NAME_SQLITE, CSV_PATH_SQLITE, CsvReadOptions::new())
- .await
- .expect("Register csv file");
-
- let sqlite_known_tables: Vec = [TABLE_NAME_SQLITE].iter().map(|&x| x.into()).collect();
-
- // Create the federation provider
- let sqlite_executor = Arc::new(MockSqliteExecutor::new(sqlite_remote_ctx));
- let sqlite_federation_provider = Arc::new(SQLFederationProvider::new(sqlite_executor));
- // Create the schema provider
- let sqlite_schema_provider = Arc::new(
- SQLSchemaProvider::new_with_tables(sqlite_federation_provider, sqlite_known_tables)
- .await
- .expect("Create new schema provider with tables"),
- );
-
- /////////////////////
- // Remote postgres DB
- /////////////////////
- // Create a datafusion::SessionContext and register a csv file as a table in that context
- // This will be passed to the MockPostgresExecutor and acts as a dummy postgres engine.
- let postgres_remote_ctx = Arc::new(SessionContext::new());
- // Registers a CSV file
- postgres_remote_ctx
- .register_csv(
- TABLE_NAME_POSTGRES,
- CSV_PATH_POSTGRES,
- CsvReadOptions::new(),
- )
- .await
- .expect("Register csv file");
-
- let postgres_known_tables: Vec =
- [TABLE_NAME_POSTGRES].iter().map(|&x| x.into()).collect();
-
- // Create the federation provider
- let postgres_executor = Arc::new(MockPostgresExecutor::new(postgres_remote_ctx));
- let postgres_federation_provider = Arc::new(SQLFederationProvider::new(postgres_executor));
- // Create the schema provider
- let postgres_schema_provider = Arc::new(
- SQLSchemaProvider::new_with_tables(postgres_federation_provider, postgres_known_tables)
- .await
- .expect("Create new schema provider with tables"),
- );
-
- /////////////////////
- // Main(local) DB
- /////////////////////
- // Get the default optimizer rules
- let mut rules = Optimizer::new().rules;
-
- // Create a new federation optimizer rule and add it to the default rules
- rules.push(Arc::new(FederationOptimizerRule::new()));
-
- // Create a new SessionState with the optimizer rule we created above
- let state = SessionStateBuilder::new()
- .with_optimizer_rules(rules)
- .with_query_planner(Arc::new(FederatedQueryPlanner::new()))
- .build();
-
- // Replace the default schema for the main context with the schema providers
- // from the remote DBs
- let schema_provider =
- MultiSchemaProvider::new(vec![sqlite_schema_provider, postgres_schema_provider]);
- overwrite_default_schema(&state, Arc::new(schema_provider))
- .expect("Overwrite the default schema form the main context");
-
- // Create the session context for the main db
- let ctx = SessionContext::new_with_state(state);
-
- // Run a query
- let query = r#"SELECT t.* FROM test_pg as t join test_sqlite as a ON t.foo = a.foo"#;
- let df = ctx
- .sql(query)
- .await
- .expect("Create a dataframe from sql query");
-
- df.show().await.expect("Execute the dataframe");
-}
diff --git a/datafusion-federation/examples/df-csv.rs b/datafusion-federation/examples/df-csv.rs
deleted file mode 100644
index c71c6ab..0000000
--- a/datafusion-federation/examples/df-csv.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-mod shared;
-
-use std::sync::Arc;
-
-use datafusion::{
- error::Result,
- execution::{context::SessionContext, options::CsvReadOptions},
-};
-use datafusion_federation::sql::{SQLFederationProvider, SQLSchemaProvider};
-
-const CSV_PATH: &str = "./examples/data/test.csv";
-const TABLE_NAME: &str = "test";
-
-use shared::{overwrite_default_schema, MockSqliteExecutor};
-
-#[tokio::main]
-async fn main() -> Result<()> {
- // Create a remote context for the mock sqlite DB
- let remote_ctx = Arc::new(SessionContext::new());
-
- // Registers a CSV file
- remote_ctx
- .register_csv(TABLE_NAME, CSV_PATH, CsvReadOptions::new())
- .await?;
- let known_tables: Vec = [TABLE_NAME].iter().map(|&x| x.into()).collect();
-
- // Create the federation provider
- let executor = Arc::new(MockSqliteExecutor::new(remote_ctx));
- let provider = Arc::new(SQLFederationProvider::new(executor));
-
- // Get the schema
- let schema_provider =
- Arc::new(SQLSchemaProvider::new_with_tables(provider, known_tables).await?);
-
- // Main context
- let state = datafusion_federation::default_session_state();
- overwrite_default_schema(&state, schema_provider)?;
- let ctx = SessionContext::new_with_state(state);
-
- // Run a query
- let query = r#"SELECT * FROM test"#;
- let df = ctx.sql(query).await?;
-
- df.show().await
-}
diff --git a/datafusion-federation/examples/shared/mod.rs b/datafusion-federation/examples/shared/mod.rs
deleted file mode 100644
index 4b19990..0000000
--- a/datafusion-federation/examples/shared/mod.rs
+++ /dev/null
@@ -1,136 +0,0 @@
-use std::sync::Arc;
-
-use async_trait::async_trait;
-use datafusion::{
- arrow::datatypes::SchemaRef,
- catalog::SchemaProvider,
- error::{DataFusionError, Result},
- execution::context::{SessionContext, SessionState},
- physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream},
- sql::unparser::dialect::{DefaultDialect, Dialect},
-};
-use futures::TryStreamExt;
-
-use datafusion_federation::sql::SQLExecutor;
-
-pub fn overwrite_default_schema(
- state: &SessionState,
- schema: Arc,
-) -> Result<()> {
- let options = &state.config().options().catalog;
- let catalog = state
- .catalog_list()
- .catalog(options.default_catalog.as_str())
- .unwrap();
-
- catalog.register_schema(options.default_schema.as_str(), schema)?;
- Ok(())
-}
-
-pub struct MockSqliteExecutor {
- session: Arc,
-}
-
-impl MockSqliteExecutor {
- pub fn new(session: Arc) -> Self {
- Self { session }
- }
-}
-
-#[async_trait]
-impl SQLExecutor for MockSqliteExecutor {
- fn name(&self) -> &str {
- "mock_sqlite_executor"
- }
-
- fn compute_context(&self) -> Option {
- // Don't return None here - it will cause incorrect federation with other providers of the
- // same name that also have a compute_context of None.
- // Instead return a random string that will never match any other provider's context.
- Some("sqlite_exec".to_string())
- }
-
- fn execute(&self, sql: &str, schema: SchemaRef) -> Result {
- // Execute it using the remote datafusion session context
- let future_stream = _execute(self.session.clone(), sql.to_string());
- let stream = futures::stream::once(future_stream).try_flatten();
- Ok(Box::pin(RecordBatchStreamAdapter::new(
- schema.clone(),
- stream,
- )))
- }
-
- async fn table_names(&self) -> Result> {
- Err(DataFusionError::NotImplemented(
- "table inference not implemented".to_string(),
- ))
- }
-
- async fn get_table_schema(&self, table_name: &str) -> Result {
- let sql = format!("select * from {table_name} limit 1");
- let df = self.session.sql(&sql).await?;
- let schema = df.schema().as_arrow().clone();
- Ok(Arc::new(schema))
- }
-
- fn dialect(&self) -> Arc {
- Arc::new(DefaultDialect {})
- }
-}
-
-#[allow(dead_code)]
-pub struct MockPostgresExecutor {
- session: Arc,
-}
-
-#[allow(dead_code)]
-impl MockPostgresExecutor {
- pub fn new(session: Arc) -> Self {
- Self { session }
- }
-}
-
-#[async_trait]
-impl SQLExecutor for MockPostgresExecutor {
- fn name(&self) -> &str {
- "mock_postgres_executor"
- }
-
- fn compute_context(&self) -> Option {
- // Don't return None here - it will cause incorrect federation with other providers of the
- // same name that also have a compute_context of None.
- // Instead return a random string that will never match any other provider's context.
- Some("postgres_exec".to_string())
- }
-
- fn execute(&self, sql: &str, schema: SchemaRef) -> Result {
- // Execute it using the remote datafusion session context
- let future_stream = _execute(self.session.clone(), sql.to_string());
- let stream = futures::stream::once(future_stream).try_flatten();
- Ok(Box::pin(RecordBatchStreamAdapter::new(
- schema.clone(),
- stream,
- )))
- }
-
- async fn table_names(&self) -> Result> {
- Err(DataFusionError::NotImplemented(
- "table inference not implemented".to_string(),
- ))
- }
-
- async fn get_table_schema(&self, table_name: &str) -> Result {
- let sql = format!("select * from {table_name} limit 1");
- let df = self.session.sql(&sql).await?;
- let schema = df.schema().as_arrow().clone();
- Ok(Arc::new(schema))
- }
-
- fn dialect(&self) -> Arc {
- Arc::new(DefaultDialect {})
- }
-}
-
-async fn _execute(ctx: Arc, sql: String) -> Result {
- ctx.sql(&sql).await?.execute_stream().await
-}
diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs
deleted file mode 100644
index 377b469..0000000
--- a/datafusion-federation/src/lib.rs
+++ /dev/null
@@ -1,90 +0,0 @@
-mod optimizer;
-mod plan_node;
-pub mod schema_cast;
-#[cfg(feature = "sql")]
-pub mod sql;
-mod table_provider;
-
-use std::{
- fmt,
- hash::{Hash, Hasher},
- sync::Arc,
-};
-
-use datafusion::{
- execution::session_state::{SessionState, SessionStateBuilder},
- optimizer::{optimizer::Optimizer, OptimizerRule},
-};
-
-pub use optimizer::{get_table_source, FederationOptimizerRule};
-pub use plan_node::{
- FederatedPlanNode, FederatedPlanner, FederatedQueryPlanner, FederationPlanner,
-};
-pub use table_provider::{FederatedTableProviderAdaptor, FederatedTableSource};
-
-pub fn default_session_state() -> SessionState {
- let rules = default_optimizer_rules();
- SessionStateBuilder::new()
- .with_optimizer_rules(rules)
- .with_query_planner(Arc::new(FederatedQueryPlanner::new()))
- .with_default_features()
- .build()
-}
-
-pub fn default_optimizer_rules() -> Vec> {
- // Get the default optimizer
- let df_default = Optimizer::new();
- let mut default_rules = df_default.rules;
-
- // Insert the FederationOptimizerRule after the ScalarSubqueryToJoin.
- // This ensures ScalarSubquery are replaced before we try to federate.
- let Some(pos) = default_rules
- .iter()
- .position(|x| x.name() == "scalar_subquery_to_join")
- else {
- panic!("Could not locate ScalarSubqueryToJoin");
- };
-
- // TODO: check if we should allow other optimizers to run before the federation rule.
-
- let federation_rule = Arc::new(FederationOptimizerRule::new());
- default_rules.insert(pos + 1, federation_rule);
-
- default_rules
-}
-
-pub type FederationProviderRef = Arc;
-pub trait FederationProvider: Send + Sync {
- // Returns the name of the provider, used for comparison.
- fn name(&self) -> &str;
-
- // Returns the compute context in which this federation provider
- // will execute a query. For example: database instance & catalog.
- fn compute_context(&self) -> Option;
-
- // Returns an optimizer that can cut out part of the plan
- // to federate it.
- fn optimizer(&self) -> Option>;
-}
-
-impl fmt::Display for dyn FederationProvider {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "{} {:?}", self.name(), self.compute_context())
- }
-}
-
-impl PartialEq for dyn FederationProvider {
- /// Comparing name, args and return_type
- fn eq(&self, other: &dyn FederationProvider) -> bool {
- self.name() == other.name() && self.compute_context() == other.compute_context()
- }
-}
-
-impl Hash for dyn FederationProvider {
- fn hash(&self, state: &mut H) {
- self.name().hash(state);
- self.compute_context().hash(state);
- }
-}
-
-impl Eq for dyn FederationProvider {}
diff --git a/datafusion-federation/src/optimizer/mod.rs b/datafusion-federation/src/optimizer/mod.rs
deleted file mode 100644
index 9c16dfc..0000000
--- a/datafusion-federation/src/optimizer/mod.rs
+++ /dev/null
@@ -1,368 +0,0 @@
-mod scan_result;
-
-use std::sync::Arc;
-
-use datafusion::{
- common::not_impl_err,
- common::tree_node::{Transformed, TreeNode, TreeNodeRecursion},
- datasource::source_as_provider,
- error::Result,
- logical_expr::{Expr, Extension, LogicalPlan, Projection, TableScan, TableSource},
- optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule},
-};
-
-use crate::{
- FederatedTableProviderAdaptor, FederatedTableSource, FederationProvider, FederationProviderRef,
-};
-
-use scan_result::ScanResult;
-
-/// An optimizer rule to identifying sub-plans to federate
-///
-/// The optimizer logic walks over the plan, look for the largest subtrees that only have
-/// TableScans from the same [`FederationProvider`]. There 'largest sub-trees' are passed to their
-/// respective [`FederationProvider::optimizer`].
-#[derive(Default, Debug)]
-pub struct FederationOptimizerRule {}
-
-impl OptimizerRule for FederationOptimizerRule {
- /// Try to rewrite `plan` to an optimized form, returning `Transformed::yes`
- /// if the plan was rewritten and `Transformed::no` if it was not.
- ///
- /// Note: this function is only called if [`Self::supports_rewrite`] returns
- /// true. Otherwise the Optimizer calls [`Self::try_optimize`]
- fn rewrite(
- &self,
- plan: LogicalPlan,
- config: &dyn OptimizerConfig,
- ) -> Result> {
- match self.optimize_plan_recursively(&plan, true, config)? {
- (Some(optimized_plan), _) => Ok(Transformed::yes(optimized_plan)),
- (None, _) => Ok(Transformed::no(plan)),
- }
- }
-
- /// Does this rule support rewriting owned plans (rather than by reference)?
- fn supports_rewrite(&self) -> bool {
- true
- }
-
- /// A human readable name for this optimizer rule
- fn name(&self) -> &str {
- "federation_optimizer_rule"
- }
-}
-
-impl FederationOptimizerRule {
- /// Creates a new [`FederationOptimizerRule`]
- pub fn new() -> Self {
- Self::default()
- }
-
- /// Scans a plan to see if it belongs to a single [`FederationProvider`].
- fn scan_plan_recursively(&self, plan: &LogicalPlan) -> Result {
- let mut sole_provider: ScanResult = ScanResult::None;
-
- plan.apply(&mut |p: &LogicalPlan| -> Result {
- let exprs_provider = self.scan_plan_exprs(p)?;
- sole_provider.merge(exprs_provider);
-
- if sole_provider.is_ambiguous() {
- return Ok(TreeNodeRecursion::Stop);
- }
-
- let sub_provider = get_leaf_provider(p)?;
- sole_provider.add(sub_provider);
-
- Ok(sole_provider.check_recursion())
- })?;
-
- Ok(sole_provider)
- }
-
- /// Scans a plan's expressions to see if it belongs to a single [`FederationProvider`].
- fn scan_plan_exprs(&self, plan: &LogicalPlan) -> Result {
- let mut sole_provider: ScanResult = ScanResult::None;
-
- let exprs = plan.expressions();
- for expr in &exprs {
- let expr_result = self.scan_expr_recursively(expr)?;
- sole_provider.merge(expr_result);
-
- if sole_provider.is_ambiguous() {
- return Ok(sole_provider);
- }
- }
-
- Ok(sole_provider)
- }
-
- /// scans an expression to see if it belongs to a single [`FederationProvider`]
- fn scan_expr_recursively(&self, expr: &Expr) -> Result {
- let mut sole_provider: ScanResult = ScanResult::None;
-
- expr.apply(&mut |e: &Expr| -> Result {
- // TODO: Support other types of sub-queries
- match e {
- Expr::ScalarSubquery(ref subquery) => {
- let plan_result = self.scan_plan_recursively(&subquery.subquery)?;
-
- sole_provider.merge(plan_result);
- Ok(sole_provider.check_recursion())
- }
- Expr::InSubquery(_) => not_impl_err!("InSubquery"),
- Expr::OuterReferenceColumn(..) => {
- // Subqueries that reference outer columns are not supported
- // for now. We handle this here as ambiguity to force
- // federation lower in the plan tree.
- sole_provider = ScanResult::Ambiguous;
- Ok(TreeNodeRecursion::Stop)
- }
- _ => Ok(TreeNodeRecursion::Continue),
- }
- })?;
-
- Ok(sole_provider)
- }
-
- /// Recursively finds the largest sub-plans that can be federated
- /// to a single FederationProvider.
- ///
- /// Returns a plan if a sub-tree was federated, otherwise None.
- ///
- /// Returns a ScanResult of all FederationProviders in the subtree.
- fn optimize_plan_recursively(
- &self,
- plan: &LogicalPlan,
- is_root: bool,
- _config: &dyn OptimizerConfig,
- ) -> Result<(Option, ScanResult)> {
- let mut sole_provider: ScanResult = ScanResult::None;
-
- if let LogicalPlan::Extension(Extension { ref node }) = plan {
- if node.name() == "Federated" {
- // Avoid attempting double federation
- return Ok((None, ScanResult::Ambiguous));
- }
- }
-
- // Check if this plan node is a leaf that determines the FederationProvider
- let leaf_provider = get_leaf_provider(plan)?;
-
- // Check if the expressions contain, a potentially different, FederationProvider
- let exprs_result = self.scan_plan_exprs(plan)?;
- let optimize_expressions = exprs_result.is_some();
-
- // Return early if this is a leaf and there is no ambiguity with the expressions.
- if leaf_provider.is_some() && (exprs_result.is_none() || exprs_result == leaf_provider) {
- return Ok((None, leaf_provider.into()));
- }
- // Aggregate leaf & expression providers
- sole_provider.add(leaf_provider);
- sole_provider.merge(exprs_result);
-
- let inputs = plan.inputs();
- // Return early if there are no sources.
- if inputs.is_empty() && sole_provider.is_none() {
- return Ok((None, ScanResult::None));
- }
-
- // Recursively optimize inputs
- let input_results = inputs
- .iter()
- .map(|i| self.optimize_plan_recursively(i, false, _config))
- .collect::>>()?;
-
- // Aggregate the input providers
- input_results.iter().for_each(|(_, scan_result)| {
- sole_provider.merge(scan_result.clone());
- });
-
- if sole_provider.is_none() {
- // No providers found
- // TODO: Is/should this be reachable?
- return Ok((None, ScanResult::None));
- }
-
- // If all sources are federated to the same provider
- if let ScanResult::Distinct(provider) = sole_provider {
- if !is_root {
- // The largest sub-plan is higher up.
- return Ok((None, ScanResult::Distinct(provider)));
- }
-
- let Some(optimizer) = provider.optimizer() else {
- // No optimizer provided
- return Ok((None, ScanResult::None));
- };
-
- // If this is the root plan node; federate the entire plan
- let optimized = optimizer.optimize(plan.clone(), _config, |_, _| {})?;
- return Ok((Some(optimized), ScanResult::None));
- }
-
- // The plan is ambiguous; any input that is not yet optimized and has a
- // sole provider represents a largest sub-plan and should be federated.
- //
- // We loop over the input optimization results, federate where needed and
- // return a complete list of new inputs for the optimized plan.
- let new_inputs = input_results
- .into_iter()
- .enumerate()
- .map(|(i, (input_plan, input_result))| {
- if let Some(federated_plan) = input_plan {
- // Already federated deeper in the plan tree
- return Ok(federated_plan);
- }
-
- let original_input = (*inputs.get(i).unwrap()).clone();
- if input_result.is_ambiguous() {
- // Can happen if the input is already federated, so use
- // the original input.
- return Ok(original_input);
- }
-
- let provider = input_result.unwrap();
- let Some(provider) = provider else {
- // No provider for this input; use the original input.
- return Ok(original_input);
- };
-
- let Some(optimizer) = provider.optimizer() else {
- // No optimizer for this input; use the original input.
- return Ok(original_input);
- };
-
- // Replace the input with the federated counterpart
- let wrapped = wrap_projection(original_input)?;
- let optimized = optimizer.optimize(wrapped, _config, |_, _| {})?;
-
- Ok(optimized)
- })
- .collect::>>()?;
-
- // Optimize expressions if needed
- let new_expressions = if optimize_expressions {
- self.optimize_plan_exprs(plan, _config)?
- } else {
- plan.expressions()
- };
-
- // Construct the optimized plan
- let new_plan = plan.with_new_exprs(new_expressions, new_inputs)?;
-
- // Return the federated plan
- Ok((Some(new_plan), ScanResult::Ambiguous))
- }
-
- /// Optimizes all exprs of a plan
- fn optimize_plan_exprs(
- &self,
- plan: &LogicalPlan,
- _config: &dyn OptimizerConfig,
- ) -> Result> {
- plan.expressions()
- .iter()
- .map(|expr| {
- let transformed = expr
- .clone()
- .transform(&|e| self.optimize_expr_recursively(e, _config))?;
- Ok(transformed.data)
- })
- .collect::>>()
- }
-
- /// recursively optimize expressions
- /// Current logic: individually federate every sub-query.
- fn optimize_expr_recursively(
- &self,
- expr: Expr,
- _config: &dyn OptimizerConfig,
- ) -> Result> {
- match expr {
- Expr::ScalarSubquery(ref subquery) => {
- // Optimize as root to force federating the sub-query
- let (new_subquery, _) =
- self.optimize_plan_recursively(&subquery.subquery, true, _config)?;
- let Some(new_subquery) = new_subquery else {
- return Ok(Transformed::no(expr));
- };
- Ok(Transformed::yes(Expr::ScalarSubquery(
- subquery.with_plan(new_subquery.into()),
- )))
- }
- Expr::InSubquery(_) => not_impl_err!("InSubquery"),
- _ => Ok(Transformed::no(expr)),
- }
- }
-}
-
-/// NopFederationProvider is used to represent tables that are not federated, but
-/// are resolved by DataFusion. This simplifies the logic of the optimizer rule.
-struct NopFederationProvider {}
-
-impl FederationProvider for NopFederationProvider {
- fn name(&self) -> &str {
- "nop"
- }
-
- fn compute_context(&self) -> Option {
- None
- }
-
- fn optimizer(&self) -> Option> {
- None
- }
-}
-
-fn get_leaf_provider(plan: &LogicalPlan) -> Result