1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#![doc(
19    html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
20    html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
21)]
22#![cfg_attr(docsrs, feature(doc_cfg))]
23// Make sure fast / cheap clones on Arc are explicit:
24// https://github.com/apache/datafusion/issues/11143
25//
26// Eliminate unnecessary function calls(some may be not cheap) due to `xxx_or`
27// for performance. Also avoid abusing `xxx_or_else` for readability:
28// https://github.com/apache/datafusion/issues/15802
29#![cfg_attr(
30    not(test),
31    deny(
32        clippy::clone_on_ref_ptr,
33        clippy::or_fun_call,
34        clippy::unnecessary_lazy_evaluations
35    )
36)]
37#![warn(missing_docs, clippy::needless_borrow)]
38// Use `allow` instead of `expect` for test configuration to explicitly
39// disable the lint for all test code rather than expecting violations
40#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
41
42//! [DataFusion] is an extensible query engine written in Rust that
43//! uses [Apache Arrow] as its in-memory format. DataFusion's target users are
44//! developers building fast and feature rich database and analytic systems,
45//! customized to particular workloads. Please see the [DataFusion website] for
46//! additional documentation, [use cases] and examples.
47//!
48//! "Out of the box," DataFusion offers [SQL] and [`DataFrame`] APIs,
49//! excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
50//! extensive customization, and a great community.
51//! [Python Bindings] are also available.
52//!
53//! DataFusion features a full query planner, a columnar, streaming, multi-threaded,
54//! vectorized execution engine, and partitioned data sources. You can
55//! customize DataFusion at almost all points including additional data sources,
56//! query languages, functions, custom operators and more.
57//! See the [Architecture] section below for more details.
58//!
59//! [DataFusion]: https://datafusion.apache.org/
60//! [DataFusion website]: https://datafusion.apache.org
61//! [Apache Arrow]: https://arrow.apache.org
62//! [use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases
63//! [SQL]: https://datafusion.apache.org/user-guide/sql/index.html
64//! [`DataFrame`]: dataframe::DataFrame
65//! [performance]: https://benchmark.clickhouse.com/
66//! [Python Bindings]: https://github.com/apache/datafusion-python
67//! [Architecture]: #architecture
68//!
69//! # Examples
70//!
71//! The main entry point for interacting with DataFusion is the
72//! [`SessionContext`]. [`Expr`]s represent expressions such as `a + b`.
73//!
74//! [`SessionContext`]: execution::context::SessionContext
75//!
76//! ## DataFrame
77//!
78//! To execute a query against data stored
79//! in a CSV file using a [`DataFrame`]:
80//!
81//! ```rust
82//! # use datafusion::prelude::*;
83//! # use datafusion::error::Result;
84//! # use datafusion::functions_aggregate::expr_fn::min;
85//! # use datafusion::arrow::array::RecordBatch;
86//!
87//! # #[tokio::main]
88//! # async fn main() -> Result<()> {
89//! let ctx = SessionContext::new();
90//!
91//! // create the dataframe
92//! let df = ctx
93//!     .read_csv("tests/data/example.csv", CsvReadOptions::new())
94//!     .await?;
95//!
96//! // create a plan
97//! let df = df
98//!     .filter(col("a").lt_eq(col("b")))?
99//!     .aggregate(vec![col("a")], vec![min(col("b"))])?
100//!     .limit(0, Some(100))?;
101//!
102//! // execute the plan
103//! let results: Vec<RecordBatch> = df.collect().await?;
104//!
105//! // format the results
106//! let pretty_results =
107//!     arrow::util::pretty::pretty_format_batches(&results)?.to_string();
108//!
109//! let expected = vec![
110//!     "+---+----------------+",
111//!     "| a | min(?table?.b) |",
112//!     "+---+----------------+",
113//!     "| 1 | 2              |",
114//!     "+---+----------------+",
115//! ];
116//!
117//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
118//! # Ok(())
119//! # }
120//! ```
121//!
122//! ## SQL
123//!
124//! To execute a query against a CSV file using [SQL]:
125//!
126//! ```
127//! # use datafusion::prelude::*;
128//! # use datafusion::error::Result;
129//! # use datafusion::arrow::array::RecordBatch;
130//!
131//! # #[tokio::main]
132//! # async fn main() -> Result<()> {
133//! let ctx = SessionContext::new();
134//!
135//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new())
136//!     .await?;
137//!
138//! // create a plan
139//! let df = ctx
140//!     .sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100")
141//!     .await?;
142//!
143//! // execute the plan
144//! let results: Vec<RecordBatch> = df.collect().await?;
145//!
146//! // format the results
147//! let pretty_results =
148//!     arrow::util::pretty::pretty_format_batches(&results)?.to_string();
149//!
150//! let expected = vec![
151//!     "+---+----------------+",
152//!     "| a | min(example.b) |",
153//!     "+---+----------------+",
154//!     "| 1 | 2              |",
155//!     "+---+----------------+",
156//! ];
157//!
158//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
159//! # Ok(())
160//! # }
161//! ```
162//!
163//! ## More Examples
164//!
165//! There are many additional annotated examples of using DataFusion in the [datafusion-examples] directory.
166//!
167//! [datafusion-examples]: https://github.com/apache/datafusion/tree/main/datafusion-examples
168//!
169//! # Architecture
170//!
171//! <!-- NOTE: The goal of this section is to provide a high level
172//! overview of how DataFusion is organized and then link to other
173//! sections of the docs with more details -->
174//!
175//! You can find a formal description of DataFusion's architecture in our
176//! [SIGMOD 2024 Paper].
177//!
178//! [SIGMOD 2024 Paper]: https://dl.acm.org/doi/10.1145/3626246.3653368
179//!
180//! ## Design Goals
181//! DataFusion's Architecture Goals are:
182//!
183//! 1. Work β€œout of the box”: Provide a very fast, world class query engine with
184//!    minimal setup or required configuration.
185//!
186//! 2. Customizable everything: All behavior should be customizable by
187//!    implementing traits.
188//!
189//! 3. Architecturally boring πŸ₯±: Follow industrial best practice rather than
190//!    trying cutting edge, but unproven, techniques.
191//!
192//! With these principles, users start with a basic, high-performance engine
193//! and specialize it over time to suit their needs and available engineering
194//! capacity.
195//!
196//! ## Overview Presentations
197//!
198//! The following presentations offer high level overviews of the
199//! different components and how they interact together.
200//!
201//! - [Apr 2023]: The Apache DataFusion Architecture talks
202//!   - _Query Engine_: [recording](https://youtu.be/NVKujPxwSBA) and [slides](https://docs.google.com/presentation/d/1D3GDVas-8y0sA4c8EOgdCvEjVND4s2E7I6zfs67Y4j8/edit#slide=id.p)
203//!   - _Logical Plan and Expressions_: [recording](https://youtu.be/EzZTLiSJnhY) and [slides](https://docs.google.com/presentation/d/1ypylM3-w60kVDW7Q6S99AHzvlBgciTdjsAfqNP85K30)
204//!   - _Physical Plan and Execution_: [recording](https://youtu.be/2jkWU3_w6z0) and [slides](https://docs.google.com/presentation/d/1cA2WQJ2qg6tx6y4Wf8FH2WVSm9JQ5UgmBWATHdik0hg)
205//! - [July 2022]: DataFusion and Arrow: Supercharge Your Data Analytical Tool with a Rusty Query Engine: [recording](https://www.youtube.com/watch?v=Rii1VTn3seQ) and [slides](https://docs.google.com/presentation/d/1q1bPibvu64k2b7LPi7Yyb0k3gA1BiUYiUbEklqW1Ckc/view#slide=id.g11054eeab4c_0_1165)
206//! - [March 2021]: The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts [~ 15 minutes in](https://www.youtube.com/watch?v=K6eCAVEk4kU&t=875s)) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
207//! - [February 2021]: How DataFusion is used within the Ballista Project is described in _Ballista: Distributed Compute with Rust and Apache Arrow_: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
208//!
209//! ## Customization and Extension
210//!
211//! DataFusion is designed to be highly extensible, so you can
212//! start with a working, full featured engine, and then
213//! specialize any behavior for your use case. For example,
214//! some projects may add custom [`ExecutionPlan`] operators, or create their own
215//! query language that directly creates [`LogicalPlan`] rather than using the
216//! built in SQL planner, [`SqlToRel`].
217//!
218//! In order to achieve this, DataFusion supports extension at many points:
219//!
220//! * read from any datasource ([`TableProvider`])
221//! * define your own catalogs, schemas, and table lists ([`catalog`] and [`CatalogProvider`])
222//! * build your own query language or plans ([`LogicalPlanBuilder`])
223//! * declare and use user-defined functions ([`ScalarUDF`], and [`AggregateUDF`], [`WindowUDF`])
224//! * add custom plan rewrite passes ([`AnalyzerRule`], [`OptimizerRule`]  and [`PhysicalOptimizerRule`])
225//! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`])
226//!
227//! You can find examples of each of them in the [datafusion-examples] directory.
228//!
229//! [`TableProvider`]: crate::datasource::TableProvider
230//! [`CatalogProvider`]: crate::catalog::CatalogProvider
231//! [`LogicalPlanBuilder`]: datafusion_expr::logical_plan::builder::LogicalPlanBuilder
232//! [`ScalarUDF`]: crate::logical_expr::ScalarUDF
233//! [`AggregateUDF`]: crate::logical_expr::AggregateUDF
234//! [`WindowUDF`]: crate::logical_expr::WindowUDF
235//! [`QueryPlanner`]: execution::context::QueryPlanner
236//! [`OptimizerRule`]: datafusion_optimizer::optimizer::OptimizerRule
237//! [`AnalyzerRule`]:  datafusion_optimizer::analyzer::AnalyzerRule
238//! [`PhysicalOptimizerRule`]: datafusion_physical_optimizer::PhysicalOptimizerRule
239//!
240//! ## Query Planning and Execution Overview
241//!
242//! ### SQL
243//!
244//! ```text
245//!                 Parsed with            SqlToRel creates
246//!                 sqlparser              initial plan
247//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”           β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”             β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
248//! β”‚   SELECT *    β”‚           β”‚Query {  β”‚             β”‚Project      β”‚
249//! β”‚   FROM ...    │──────────▢│..       │────────────▢│  TableScan  β”‚
250//! β”‚               β”‚           β”‚}        β”‚             β”‚    ...      β”‚
251//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜           β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜             β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
252//!
253//!   SQL String                 sqlparser               LogicalPlan
254//!                              AST nodes
255//! ```
256//!
257//! 1. The query string is parsed to an Abstract Syntax Tree (AST)
258//!    [`Statement`] using [sqlparser].
259//!
260//! 2. The AST is converted to a [`LogicalPlan`] and logical expressions
261//!    [`Expr`]s to compute the desired result by [`SqlToRel`]. This phase
262//!    also includes name and type resolution ("binding").
263//!
264//! [`Statement`]: https://docs.rs/sqlparser/latest/sqlparser/ast/enum.Statement.html
265//!
266//! ### DataFrame
267//!
268//! When executing plans using the [`DataFrame`] API, the process is
269//! identical as with SQL, except the DataFrame API builds the
270//! [`LogicalPlan`] directly using [`LogicalPlanBuilder`]. Systems
271//! that have their own custom query languages typically also build
272//! [`LogicalPlan`] directly.
273//!
274//! ### Planning
275//!
276//! ```text
277//!             AnalyzerRules and      PhysicalPlanner          PhysicalOptimizerRules
278//!             OptimizerRules         creates ExecutionPlan    improve performance
279//!             rewrite plan
280//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”        β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”      β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”        β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
281//! β”‚Project      β”‚        β”‚Project(x, y)β”‚      β”‚ProjectExec      β”‚        β”‚ProjectExec      β”‚
282//! β”‚  TableScan  │──...──▢│  TableScan  │─────▢│  ...            │──...──▢│  ...            β”‚
283//! β”‚    ...      β”‚        β”‚    ...      β”‚      β”‚   DataSourceExecβ”‚        β”‚   DataSourceExecβ”‚
284//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜        β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜      β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜        β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
285//!
286//!  LogicalPlan            LogicalPlan         ExecutionPlan             ExecutionPlan
287//! ```
288//!
289//! To process large datasets with many rows as efficiently as
290//! possible, significant effort is spent planning and
291//! optimizing, in the following manner:
292//!
293//! 1. The [`LogicalPlan`] is checked and rewritten to enforce
294//!    semantic rules, such as type coercion, by [`AnalyzerRule`]s
295//!
296//! 2. The [`LogicalPlan`] is rewritten by [`OptimizerRule`]s, such as
297//!    projection and filter pushdown, to improve its efficiency.
298//!
299//! 3. The [`LogicalPlan`] is converted to an [`ExecutionPlan`] by a
300//!    [`PhysicalPlanner`]
301//!
302//! 4. The [`ExecutionPlan`] is rewritten by
303//!    [`PhysicalOptimizerRule`]s, such as sort and join selection, to
304//!    improve its efficiency.
305//!
306//! ## Data Sources
307//!
308//! ```text
309//! Planning       β”‚
310//! requests       β”‚            TableProvider::scan
311//! information    β”‚            creates an
312//! such as schema β”‚            ExecutionPlan
313//!                β”‚
314//!                β–Ό
315//!   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”         β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
316//!   β”‚                         β”‚         β”‚               β”‚
317//!   β”‚impl TableProvider       │────────▢│DataSourceExec β”‚
318//!   β”‚                         β”‚         β”‚               β”‚
319//!   β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜         β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
320//!         TableProvider
321//!         (built in or user provided)    ExecutionPlan
322//! ```
323//!
324//! A [`TableProvider`] provides information for planning and
325//! an [`ExecutionPlan`] for execution. DataFusion includes two built-in
326//! table providers that support common file formats and require no runtime services,
327//! [`ListingTable`] and [`MemTable`]. You can add support for any other data
328//! source and/or file formats by implementing the [`TableProvider`] trait.
329//!
330//! See also:
331//!
332//! 1. [`ListingTable`]: Reads data from one or more Parquet, JSON, CSV, or AVRO
333//!    files in one or more local or remote directories. Supports HIVE style
334//!    partitioning, optional compression, directly reading from remote
335//!    object store, file metadata caching, and more.
336//!
337//! 2. [`MemTable`]: Reads data from in memory [`RecordBatch`]es.
338//!
339//! 3. [`StreamingTable`]: Reads data from potentially unbounded inputs.
340//!
341//! [`ListingTable`]: crate::datasource::listing::ListingTable
342//! [`MemTable`]: crate::datasource::memory::MemTable
343//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable
344//!
345//! ## Plan Representations
346//!
347//! ### Logical Plans
348//! Logical planning yields [`LogicalPlan`] nodes and [`Expr`]
349//! representing expressions which are [`Schema`] aware and represent statements
350//! independent of how they are physically executed.
351//! A [`LogicalPlan`] is a Directed Acyclic Graph (DAG) of other
352//! [`LogicalPlan`]s, each potentially containing embedded [`Expr`]s.
353//!
354//! [`LogicalPlan`]s can be rewritten with [`TreeNode`] API, see the
355//! [`tree_node module`] for more details.
356//!
357//! [`Expr`]s can also be rewritten with [`TreeNode`] API and simplified using
358//! [`ExprSimplifier`]. Examples of working with and executing [`Expr`]s can be
359//! found in the [`expr_api`.rs] example
360//!
361//! [`TreeNode`]: datafusion_common::tree_node::TreeNode
362//! [`tree_node module`]: datafusion_expr::logical_plan::tree_node
363//! [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier
364//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs
365//!
366//! ### Physical Plans
367//!
368//! An [`ExecutionPlan`] (sometimes referred to as a "physical plan")
369//! is a plan that can be executed against data. It is a DAG of other
370//! [`ExecutionPlan`]s each potentially containing expressions that implement the
371//! [`PhysicalExpr`] trait.
372//!
373//! Compared to a [`LogicalPlan`], an [`ExecutionPlan`] has additional concrete
374//! information about how to perform calculations (e.g. hash vs merge
375//! join), and how data flows during execution (e.g. partitioning and
376//! sortedness).
377//!
378//! [cp_solver] performs range propagation analysis on [`PhysicalExpr`]s and
379//! [`PruningPredicate`] can prove certain boolean [`PhysicalExpr`]s used for
380//! filtering can never be `true` using additional statistical information.
381//!
382//! [cp_solver]: crate::physical_expr::intervals::cp_solver
383//! [`PruningPredicate`]: datafusion_physical_optimizer::pruning::PruningPredicate
384//! [`PhysicalExpr`]: crate::physical_plan::PhysicalExpr
385//!
386//! ## Execution
387//!
388//! ```text
389//!            ExecutionPlan::execute             Calling next() on the
390//!            produces a stream                  stream produces the data
391//!
392//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”      β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”         β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
393//! β”‚ProjectExec     β”‚      β”‚impl                     β”‚    β”Œβ”€β”€β”€β–Άβ”‚RecordBatch β”‚
394//! β”‚  ...           │─────▢│SendableRecordBatchStream│─────    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
395//! β”‚  DataSourceExecβ”‚      β”‚                         β”‚    β”‚    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
396//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜      β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜    β”œβ”€β”€β”€β–Άβ”‚RecordBatch β”‚
397//!               β–²                                        β”‚    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
398//! ExecutionPlan β”‚                                        β”‚         ...
399//!               β”‚                                        β”‚
400//!               β”‚                                        β”‚    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
401//!             PhysicalOptimizerRules                     β”œβ”€β”€β”€β–Άβ”‚RecordBatch β”‚
402//!             request information                        β”‚    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
403//!             such as partitioning                       β”‚    β”Œ ─ ─ ─ ─ ─ ─
404//!                                                        └───▢ None        β”‚
405//!                                                             β”” ─ ─ ─ ─ ─ ─
406//! ```
407//!
408//! [`ExecutionPlan`]s process data using the [Apache Arrow] memory
409//! format, making heavy use of functions from the [arrow]
410//! crate. Values are represented with [`ColumnarValue`], which are either
411//! [`ScalarValue`] (single constant values) or [`ArrayRef`] (Arrow
412//! Arrays).
413//!
414//! Calling [`execute`] produces 1 or more partitions of data,
415//! as a [`SendableRecordBatchStream`], which implements a pull based execution
416//! API. Calling [`next()`]`.await` will incrementally compute and return the next
417//! [`RecordBatch`]. Balanced parallelism is achieved using [Volcano style]
418//! "Exchange" operations implemented by [`RepartitionExec`].
419//!
420//! While some recent research such as [Morsel-Driven Parallelism] describes challenges
421//! with the pull style Volcano execution model on NUMA architectures, in practice DataFusion achieves
422//! similar scalability as systems that use push driven schedulers [such as DuckDB].
423//! See the [DataFusion paper in SIGMOD 2024] for more details.
424//!
425//! [`execute`]: physical_plan::ExecutionPlan::execute
426//! [`SendableRecordBatchStream`]: crate::physical_plan::SendableRecordBatchStream
427//! [`ColumnarValue`]: datafusion_expr::ColumnarValue
428//! [`ScalarValue`]: crate::scalar::ScalarValue
429//! [`ArrayRef`]: arrow::array::ArrayRef
430//! [`Stream`]: futures::stream::Stream
431//!
432//! See the [implementors of `ExecutionPlan`] for a list of physical operators available.
433//!
434//! [`RepartitionExec`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/repartition/struct.RepartitionExec.html
435//! [Volcano style]: https://doi.org/10.1145/93605.98720
436//! [Morsel-Driven Parallelism]: https://db.in.tum.de/~leis/papers/morsels.pdf
437//! [DataFusion paper in SIGMOD 2024]: https://github.com/apache/datafusion/files/15149988/DataFusion_Query_Engine___SIGMOD_2024-FINAL-mk4.pdf
438//! [such as DuckDB]: https://github.com/duckdb/duckdb/issues/1583
439//! [implementors of `ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#implementors
440//!
441//! ## Streaming Execution
442//!
443//! DataFusion is a "streaming" query engine which means [`ExecutionPlan`]s incrementally
444//! read from their input(s) and compute output one [`RecordBatch`] at a time
445//! by continually polling [`SendableRecordBatchStream`]s. Output and
446//! intermediate [`RecordBatch`]s each have approximately `batch_size` rows,
447//! which amortizes per-batch overhead of execution.
448//!
449//! Note that certain operations, sometimes called "pipeline breakers",
450//! (for example full sorts or hash aggregations) are fundamentally non streaming and
451//! must read their input fully before producing **any** output. As much as possible,
452//! other operators read a single [`RecordBatch`] from their input to produce a
453//! single [`RecordBatch`] as output.
454//!
455//! For example, given this SQL:
456//!
457//! ```sql
458//! SELECT name FROM 'data.parquet' WHERE id > 10
459//! ```
460//!
461//! A simplified DataFusion execution plan is shown below. It first reads
462//! data from the Parquet file, then applies the filter, then the projection,
463//! and finally produces output. Each step processes one [`RecordBatch`] at a
464//! time. Multiple batches are processed concurrently on different CPU cores
465//! for plans with multiple partitions.
466//!
467//! ```text
468//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
469//! β”‚ Parquet     │───▢│ DataSource   │───▢│ FilterExec     │───▢│ ProjectionExec   │───▢│ Results  β”‚
470//! β”‚ File        β”‚    β”‚              β”‚    β”‚                β”‚    β”‚                  β”‚    β”‚          β”‚
471//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜    β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
472//!                    (reads data)        (id > 10)             (keeps "name" col)
473//!                    RecordBatch ───▢    RecordBatch ────▢     RecordBatch ────▢        RecordBatch
474//! ```
475//!
476//! DataFusion uses the classic "pull" based control flow (explained more in the
477//! next section) to implement streaming execution. As an example,
478//! consider the following SQL query:
479//!
480//! ```sql
481//! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30);
482//! ```
483//!
484//! The diagram below shows the call sequence when a consumer calls [`next()`] to
485//! get the next [`RecordBatch`] of output. While it is possible that some
486//! steps run on different threads, typically tokio will use the same thread
487//! that called [`next()`] to read from the input, apply the filter, and
488//! return the results without interleaving any other operations. This results
489//! in excellent cache locality as the same CPU core that produces the data often
490//! consumes it immediately as well.
491//!
492//! ```text
493//!
494//! Step 3: FilterExec calls next()       Step 2: ProjectionExec calls
495//!         on input Stream                  next() on input Stream
496//!         β”Œ ─ ─ ─ ─ ─ ─ ─ ─ ─      β”Œ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐
497//!                            β”‚                                               Step 1: Consumer
498//!         β–Ό                        β–Ό                           β”‚               calls next()
499//! ┏━━━━━━━━━━━━━━━━┓     ┏━━━━━┻━━━━━━━━━━━━━┓      ┏━━━━━━━━━━━━━━━━━━━━━━━━┓
500//! ┃                ┃     ┃                   ┃      ┃                        β—€ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
501//! ┃  DataSource    ┃     ┃                   ┃      ┃                        ┃
502//! ┃    (e.g.       ┃     ┃    FilterExec     ┃      ┃     ProjectionExec     ┃
503//! ┃ ParquetSource) ┃     ┃id IN (10, 20, 30) ┃      ┃date_bin('month', time) ┃
504//! ┃                ┃     ┃                   ┃      ┃                        ┣ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ β–Ά
505//! ┃                ┃     ┃                   ┃      ┃                        ┃
506//! ┗━━━━━━━━━━━━━━━━┛     ┗━━━━━━━━━━━┳━━━━━━━┛      ┗━━━━━━━━━━━━━━━━━━━━━━━━┛
507//!         β”‚                  β–²                                 β–²          Step 6: ProjectionExec
508//!                            β”‚     β”‚                           β”‚        computes date_trunc into a
509//!         β”” ─ ─ ─ ─ ─ ─ ─ ─ ─       ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─          new RecordBatch returned
510//!              β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”                β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”          from client
511//!              β”‚     RecordBatch     β”‚                β”‚ RecordBatch β”‚
512//!              β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜                β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
513//!
514//!           Step 4: DataSource returns a        Step 5: FilterExec returns a new
515//!                single RecordBatch            RecordBatch with only matching rows
516//! ```
517//!
518//! [`next()`]: futures::StreamExt::next
519//!
520//! ## Thread Scheduling, CPU / IO Thread Pools, and [Tokio] [`Runtime`]s
521//!
522//! DataFusion automatically runs each plan with multiple CPU cores using
523//! a [Tokio] [`Runtime`] as a thread pool. While tokio is most commonly used
524//! for asynchronous network I/O, the combination of an efficient, work-stealing
525//! scheduler, and first class compiler support for automatic continuation
526//! generation (`async`) also makes it a compelling choice for CPU intensive
527//! applications as explained in the [Using Rustlang’s Async Tokio
528//! Runtime for CPU-Bound Tasks] blog.
529//!
530//! The number of cores used is determined by the `target_partitions`
531//! configuration setting, which defaults to the number of CPU cores.
532//! While preparing for execution, DataFusion tries to create this many distinct
533//! `async` [`Stream`]s for each [`ExecutionPlan`].
534//! The [`Stream`]s for certain [`ExecutionPlan`]s, such as [`RepartitionExec`]
535//! and [`CoalescePartitionsExec`], spawn [Tokio] [`task`]s, that run on
536//! threads managed by the [`Runtime`].
537//! Many DataFusion [`Stream`]s perform CPU intensive processing.
538//!
539//! ### Cooperative Scheduling
540//!
541//! DataFusion uses cooperative scheduling, which means that each [`Stream`]
542//! is responsible for yielding control back to the [`Runtime`] after
543//! some amount of work is done. Please see the [`coop`] module documentation
544//! for more details.
545//!
546//! [`coop`]: datafusion_physical_plan::coop
547//!
548//! ### Network I/O and CPU intensive tasks
549//!
550//! Using `async` for CPU intensive tasks makes it easy for [`TableProvider`]s
551//! to perform network I/O using standard Rust `async` during execution.
552//! However, this design also makes it very easy to mix CPU intensive and latency
553//! sensitive I/O work on the same thread pool ([`Runtime`]).
554//! Using the same (default) [`Runtime`] is convenient, and often works well for
555//! initial development and processing local files, but it can lead to problems
556//! under load and/or when reading from network sources such as AWS S3.
557//!
558//! ### Optimizing Latency: Throttled CPU / IO under Highly Concurrent Load
559//!
560//! If your system does not fully utilize either the CPU or network bandwidth
561//! during execution, or you see significantly higher tail (e.g. p99) latencies
562//! responding to network requests, **it is likely you need to use a different
563//! [`Runtime`] for DataFusion plans**. The [thread_pools example]
564//! has an example of how to do so.
565//!
566//! As shown below, using the same [`Runtime`] for both CPU intensive processing
567//! and network requests can introduce significant delays in responding to
568//! those network requests. Delays in processing network requests can and does
569//! lead network flow control to throttle the available bandwidth in response.
570//! This effect can be especially pronounced when running multiple queries
571//! concurrently.
572//!
573//! ```text
574//!                                                                          Legend
575//!
576//!                                                                          ┏━━━━━━┓
577//!                            Processing network request                    ┃      ┃  CPU bound work
578//!                            is delayed due to processing                  ┗━━━━━━┛
579//!                            CPU bound work                                β”Œβ”€β”
580//!                                                                          β”‚ β”‚       Network request
581//!                                         β”‚β”‚                               β””β”€β”˜       processing
582//!
583//!                                         β”‚β”‚
584//!                                ─ ─ ─ ─ ─  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
585//!                               β”‚                                            β”‚
586//!
587//!                               β–Ό                                            β–Ό
588//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”           β”Œβ”€β”β”Œβ”€β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”“β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”“β”Œβ”€β”
589//! β”‚             β”‚thread 1   β”‚ β”‚β”‚ │┃     Decoding      ┃┃     Filtering     ┃│ β”‚
590//! β”‚             β”‚           β””β”€β”˜β””β”€β”˜β”—β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”›β”—β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”›β””β”€β”˜
591//! β”‚             β”‚           ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
592//! β”‚Tokio Runtimeβ”‚thread 2   ┃   Decoding   ┃     Filtering     ┃   Decoding   ┃       ...
593//! β”‚(thread pool)β”‚           ┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
594//! β”‚             β”‚     ...                               ...
595//! β”‚             β”‚           β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”³β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”“β”Œβ”€β” ┏━━━━━━━━━━━━━━┓
596//! β”‚             β”‚thread N   ┃     Decoding      ┃     Filtering     ┃│ β”‚ ┃   Decoding   ┃
597//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜           β”—β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”»β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”β”›β””β”€β”˜ ┗━━━━━━━━━━━━━━┛
598//!                           ─────────────────────────────────────────────────────────────▢
599//!                                                                                           time
600//! ```
601//!
602//! The bottleneck resulting from network throttling can be avoided
603//! by using separate [`Runtime`]s for the different types of work, as shown
604//! in the diagram below.
605//!
606//! ```text
607//!                    A separate thread pool processes network       Legend
608//!                    requests, reducing the latency for
609//!                    processing each request                        ┏━━━━━━┓
610//!                                                                   ┃      ┃  CPU bound work
611//!                                         β”‚                         ┗━━━━━━┛
612//!                                          β”‚                        β”Œβ”€β”
613//!                               β”Œ ─ ─ ─ ─ β”˜                         β”‚ β”‚       Network request
614//!                                  β”Œ ─ ─ ─ β”˜                        β””β”€β”˜       processing
615//!                               β”‚
616//!                               β–Ό  β–Ό
617//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”           β”Œβ”€β”β”Œβ”€β”β”Œβ”€β”
618//! β”‚             β”‚thread 1   β”‚ β”‚β”‚ β”‚β”‚ β”‚
619//! β”‚             β”‚           β””β”€β”˜β””β”€β”˜β””β”€β”˜
620//! β”‚Tokio Runtimeβ”‚                                          ...
621//! β”‚(thread pool)β”‚thread 2
622//! β”‚             β”‚
623//! β”‚"IO Runtime" β”‚     ...
624//! β”‚             β”‚                                                   β”Œβ”€β”
625//! β”‚             β”‚thread N                                           β”‚ β”‚
626//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜                                                   β””β”€β”˜
627//!                           ─────────────────────────────────────────────────────────────▢
628//!                                                                                           time
629//!
630//! β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”           ┏━━━━━━━━━━━━━━━━━━━┓┏━━━━━━━━━━━━━━━━━━━┓
631//! β”‚             β”‚thread 1   ┃     Decoding      ┃┃     Filtering     ┃
632//! β”‚             β”‚           ┗━━━━━━━━━━━━━━━━━━━┛┗━━━━━━━━━━━━━━━━━━━┛
633//! β”‚Tokio Runtimeβ”‚           ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
634//! β”‚(thread pool)β”‚thread 2   ┃   Decoding   ┃     Filtering     ┃   Decoding   ┃       ...
635//! β”‚             β”‚           ┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
636//! β”‚ CPU Runtime β”‚     ...                               ...
637//! β”‚             β”‚           ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
638//! β”‚             β”‚thread N   ┃     Decoding      ┃     Filtering     ┃   Decoding   ┃
639//! β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜           ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
640//!                          ─────────────────────────────────────────────────────────────▢
641//!                                                                                           time
642//! ```
643//!
644//! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for
//! CPU-bounded work, because `spawn_blocking` is designed for blocking **IO**,
//! not for CPU bound tasks. Among other challenges, spawned blocking
647//! tasks can't yield waiting for input (can't call `await`) so they
648//! can't be used to limit the number of concurrent CPU bound tasks or
649//! keep the processing pipeline to the same core.
650//!
651//! [Tokio]:  https://tokio.rs
652//! [`Runtime`]: tokio::runtime::Runtime
653//! [thread_pools example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/thread_pools.rs
654//! [`task`]: tokio::task
655//! [Using Rustlang’s Async Tokio Runtime for CPU-Bound Tasks]: https://thenewstack.io/using-rustlangs-async-tokio-runtime-for-cpu-bound-tasks/
656//! [`RepartitionExec`]: physical_plan::repartition::RepartitionExec
657//! [`CoalescePartitionsExec`]: physical_plan::coalesce_partitions::CoalescePartitionsExec
658//!
659//! ## State Management and Configuration
660//!
661//! [`ConfigOptions`] contain options to control DataFusion's
662//! execution.
663//!
664//! [`ConfigOptions`]: datafusion_common::config::ConfigOptions
665//!
666//! The state required to execute queries is managed by the following
667//! structures:
668//!
669//! 1. [`SessionContext`]: State needed to create [`LogicalPlan`]s such
670//!    as the table definitions and the function registries.
671//!
672//! 2. [`TaskContext`]: State needed for execution such as the
673//!    [`MemoryPool`], [`DiskManager`], and [`ObjectStoreRegistry`].
674//!
675//! 3. [`ExecutionProps`]: Per-execution properties and data (such as
676//!    starting timestamps, etc).
677//!
678//! [`SessionContext`]: crate::execution::context::SessionContext
679//! [`TaskContext`]: crate::execution::context::TaskContext
680//! [`ExecutionProps`]: crate::execution::context::ExecutionProps
681//!
682//! ### Resource Management
683//!
684//! The amount of memory and temporary local disk space used by
685//! DataFusion when running a plan can be controlled using the
686//! [`MemoryPool`] and [`DiskManager`]. Other runtime options can be
687//! found on [`RuntimeEnv`].
688//!
689//! [`DiskManager`]: crate::execution::DiskManager
690//! [`MemoryPool`]: crate::execution::memory_pool::MemoryPool
691//! [`RuntimeEnv`]: crate::execution::runtime_env::RuntimeEnv
692//! [`ObjectStoreRegistry`]: crate::datasource::object_store::ObjectStoreRegistry
693//!
694//! ## Crate Organization
695//!
696//! Most users interact with DataFusion via this crate (`datafusion`), which re-exports
697//! all functionality needed to build and execute queries.
698//!
699//! There are three other crates that provide additional functionality that
700//! must be used directly:
701//! * [`datafusion_proto`]: Plan serialization and deserialization
702//! * [`datafusion_substrait`]: Support for the substrait plan serialization format
703//! * [`datafusion_sqllogictest`] : The DataFusion SQL logic test runner
704//!
705//! [`datafusion_proto`]: https://crates.io/crates/datafusion-proto
706//! [`datafusion_substrait`]: https://crates.io/crates/datafusion-substrait
707//! [`datafusion_sqllogictest`]: https://crates.io/crates/datafusion-sqllogictest
708//!
709//! DataFusion is internally split into multiple sub crates to
710//! enforce modularity and improve compilation times. See the
711//! [list of modules](#modules) for all available sub-crates. Major ones are
712//!
713//! * [datafusion_common]: Common traits and types
714//! * [datafusion_catalog]: Catalog APIs such as [`SchemaProvider`] and [`CatalogProvider`]
715//! * [datafusion_datasource]: File and Data IO such as [`FileSource`] and [`DataSink`]
716//! * [datafusion_session]: [`Session`] and related structures
717//! * [datafusion_execution]: State and structures needed for execution
718//! * [datafusion_expr]: [`LogicalPlan`], [`Expr`] and related logical planning structure
719//! * [datafusion_functions]: Scalar function packages
720//! * [datafusion_functions_aggregate]: Aggregate functions such as `MIN`, `MAX`, `SUM`, etc
721//! * [datafusion_functions_nested]: Scalar function packages for `ARRAY`s, `MAP`s and `STRUCT`s
722//! * [datafusion_functions_table]: Table Functions such as `GENERATE_SERIES`
723//! * [datafusion_functions_window]: Window functions such as `ROW_NUMBER`, `RANK`, etc
724//! * [datafusion_optimizer]: [`OptimizerRule`]s and [`AnalyzerRule`]s
725//! * [datafusion_physical_expr]: [`PhysicalExpr`] and related expressions
//! * [datafusion_physical_plan]: [`ExecutionPlan`] and related structures
//! * [datafusion_physical_optimizer]: [`PhysicalOptimizerRule`]s that optimize [`ExecutionPlan`]s
728//! * [datafusion_sql]: SQL planner ([`SqlToRel`])
729//!
730//! [`SchemaProvider`]: datafusion_catalog::SchemaProvider
731//! [`CatalogProvider`]: datafusion_catalog::CatalogProvider
732//! [`Session`]: datafusion_session::Session
733//! [`FileSource`]: datafusion_datasource::file::FileSource
734//! [`DataSink`]: datafusion_datasource::sink::DataSink
735//!
736//! ## Citing DataFusion in Academic Papers
737//!
738//! You can use the following citation to reference DataFusion in academic papers:
739//!
740//! ```text
//! @inproceedings{lamb2024apache,
742//!   title={Apache Arrow DataFusion: A Fast, Embeddable, Modular Analytic Query Engine},
743//!   author={Lamb, Andrew and Shen, Yijie and Heres, Dani{\"e}l and Chakraborty, Jayjeet and Kabak, Mehmet Ozan and Hsieh, Liang-Chi and Sun, Chao},
744//!   booktitle={Companion of the 2024 International Conference on Management of Data},
745//!   pages={5--17},
746//!   year={2024}
747//! }
748//! ```
749//!
750//! [sqlparser]: https://docs.rs/sqlparser/latest/sqlparser
751//! [`SqlToRel`]: sql::planner::SqlToRel
752//! [`Expr`]: datafusion_expr::Expr
753//! [`LogicalPlan`]: datafusion_expr::LogicalPlan
754//! [`AnalyzerRule`]: datafusion_optimizer::analyzer::AnalyzerRule
755//! [`OptimizerRule`]: optimizer::optimizer::OptimizerRule
756//! [`ExecutionPlan`]: physical_plan::ExecutionPlan
757//! [`PhysicalPlanner`]: physical_planner::PhysicalPlanner
758//! [`PhysicalOptimizerRule`]: datafusion_physical_optimizer::PhysicalOptimizerRule
759//! [`Schema`]: arrow::datatypes::Schema
760//! [`PhysicalExpr`]: physical_plan::PhysicalExpr
761//! [`RecordBatch`]: arrow::array::RecordBatch
762//! [`RecordBatchReader`]: arrow::record_batch::RecordBatchReader
763//! [`Array`]: arrow::array::Array
764
/// DataFusion crate version
///
/// Sourced at compile time from this crate's `version` field in `Cargo.toml`
/// via the `CARGO_PKG_VERSION` environment variable set by Cargo.
pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION");
767
768extern crate core;
769
770#[cfg(feature = "sql")]
771extern crate sqlparser;
772
773pub mod dataframe;
774pub mod datasource;
775pub mod error;
776pub mod execution;
777pub mod physical_planner;
778pub mod prelude;
779pub mod scalar;
780
781// Re-export dependencies that are part of DataFusion public API (e.g. via DataFusionError)
782pub use arrow;
783pub use object_store;
784
785#[cfg(feature = "parquet")]
786pub use parquet;
787
788#[cfg(feature = "avro")]
789pub use datafusion_datasource_avro::apache_avro;
790
791// re-export DataFusion sub-crates at the top level. Use `pub use *`
792// so that the contents of the subcrates appears in rustdocs
793// for details, see https://github.com/apache/datafusion/issues/6648
794
/// re-export of [`datafusion_common`] crate: traits and types shared across
/// DataFusion, including `ConfigOptions` (re-exported at the crate root as
/// `config` for backwards compatibility)
pub mod common {
    pub use datafusion_common::*;

    /// re-export of [`datafusion_common_runtime`] crate
    pub mod runtime {
        pub use datafusion_common_runtime::*;
    }
}
804
805// Backwards compatibility
806pub use common::config;
807
808// NB datafusion execution is re-exported in the `execution` module
809
/// re-export of [`datafusion_catalog`] crate: catalog APIs such as
/// `CatalogProvider` and `SchemaProvider`
pub mod catalog {
    pub use datafusion_catalog::*;
}
814
/// re-export of [`datafusion_expr`] crate: logical planning structures such as
/// `LogicalPlan` and `Expr`
pub mod logical_expr {
    pub use datafusion_expr::*;
}
819
/// re-export of [`datafusion_expr_common`] crate; see the `logical_expr`
/// module for the main logical planning API
pub mod logical_expr_common {
    pub use datafusion_expr_common::*;
}
824
/// re-export of [`datafusion_optimizer`] crate: `OptimizerRule`s and
/// `AnalyzerRule`s that rewrite logical plans
pub mod optimizer {
    pub use datafusion_optimizer::*;
}
829
/// re-export of [`datafusion_physical_optimizer`] crate:
/// `PhysicalOptimizerRule`s that rewrite physical (execution) plans
pub mod physical_optimizer {
    pub use datafusion_physical_optimizer::*;
}
834
/// re-export of [`datafusion_physical_expr_common`] crate
pub mod physical_expr_common {
    pub use datafusion_physical_expr_common::*;
}
839
/// re-export of [`datafusion_physical_expr`] crate: `PhysicalExpr` and related
/// expressions
pub mod physical_expr {
    pub use datafusion_physical_expr::*;
}
844
/// re-export of [`datafusion_physical_expr_adapter`] crate; see that crate's
/// documentation for details
pub mod physical_expr_adapter {
    pub use datafusion_physical_expr_adapter::*;
}
849
/// re-export of [`datafusion_physical_plan`] crate: `ExecutionPlan` and
/// related structures
pub mod physical_plan {
    pub use datafusion_physical_plan::*;
}
854
855// Reexport testing macros for compatibility
856pub use datafusion_common::assert_batches_eq;
857pub use datafusion_common::assert_batches_sorted_eq;
858
/// re-export of [`datafusion_sql`] crate: SQL planner (`SqlToRel`); only
/// available when the "sql" feature is enabled
#[cfg(feature = "sql")]
pub mod sql {
    pub use datafusion_sql::*;
}
864
/// re-export of [`datafusion_functions`] crate: scalar function packages
pub mod functions {
    pub use datafusion_functions::*;
}
869
/// re-export of [`datafusion_functions_nested`] crate, if "nested_expressions" feature is enabled
///
/// Note the module itself always exists but is empty when the feature is
/// disabled, because the `cfg` is on the re-export rather than the module.
pub mod functions_nested {
    #[cfg(feature = "nested_expressions")]
    pub use datafusion_functions_nested::*;
}
875
/// re-export of [`datafusion_functions_aggregate`] crate: aggregate functions
/// such as `MIN`, `MAX`, `SUM`
pub mod functions_aggregate {
    pub use datafusion_functions_aggregate::*;
}
880
/// re-export of [`datafusion_functions_window`] crate: window functions such
/// as `ROW_NUMBER` and `RANK`
pub mod functions_window {
    pub use datafusion_functions_window::*;
}
885
/// re-export of [`datafusion_functions_table`] crate: table functions such as
/// `GENERATE_SERIES`
pub mod functions_table {
    pub use datafusion_functions_table::*;
}
890
/// re-export of variable provider for `@name` and `@@name` style runtime values.
///
/// See the `var_provider` module of [`datafusion_expr`] for details.
pub mod variable {
    pub use datafusion_expr::var_provider::{VarProvider, VarType};
}
895
896#[cfg(not(target_arch = "wasm32"))]
897pub mod test;
898
899mod schema_equivalence;
900pub mod test_util;
901
902#[cfg(doctest)]
903doc_comment::doctest!("../../../README.md", readme_example_test);
904
905// Instructions for Documentation Examples
906//
907// The following commands test the examples from the user guide as part of
908// `cargo test --doc`
909//
910// # Adding new tests:
911//
912// Simply add code like this to your .md file and ensure your md file is
913// included in the lists below.
914//
915// ```rust
916// <code here will be tested>
917// ```
918//
919// Note that sometimes it helps to author the doctest as a standalone program
920// first, and then copy it into the user guide.
921//
922// # Debugging Test Failures
923//
924// Unfortunately, the line numbers reported by doctest do not correspond to the
// line numbers in the .md files. Thus, if a doctest fails, use the name of
926// the test to find the relevant file in the list below, and then find the
927// example in that file to fix.
928//
929// For example, if `user_guide_expressions(line 123)` fails,
930// go to `docs/source/user-guide/expressions.md` to find the relevant problem.
931//
932#[cfg(doctest)]
933doc_comment::doctest!(
934    "../../../docs/source/user-guide/arrow-introduction.md",
935    user_guide_arrow_introduction
936);
937
938#[cfg(doctest)]
939doc_comment::doctest!(
940    "../../../docs/source/user-guide/concepts-readings-events.md",
941    user_guide_concepts_readings_events
942);
943
944#[cfg(doctest)]
945doc_comment::doctest!(
946    "../../../docs/source/user-guide/configs.md",
947    user_guide_configs
948);
949
950#[cfg(doctest)]
951doc_comment::doctest!(
952    "../../../docs/source/user-guide/crate-configuration.md",
953    user_guide_crate_configuration
954);
955
956#[cfg(doctest)]
957doc_comment::doctest!(
958    "../../../docs/source/user-guide/dataframe.md",
959    user_guide_dataframe
960);
961
962#[cfg(doctest)]
963doc_comment::doctest!(
964    "../../../docs/source/user-guide/example-usage.md",
965    user_guide_example_usage
966);
967
968#[cfg(doctest)]
969doc_comment::doctest!(
970    "../../../docs/source/user-guide/explain-usage.md",
971    user_guide_explain_usage
972);
973
974#[cfg(doctest)]
975doc_comment::doctest!(
976    "../../../docs/source/user-guide/expressions.md",
977    user_guide_expressions
978);
979
980#[cfg(doctest)]
981doc_comment::doctest!("../../../docs/source/user-guide/faq.md", user_guide_faq);
982
983#[cfg(doctest)]
984doc_comment::doctest!(
985    "../../../docs/source/user-guide/introduction.md",
986    user_guide_introduction
987);
988
989#[cfg(doctest)]
990doc_comment::doctest!(
991    "../../../docs/source/user-guide/cli/datasources.md",
992    user_guide_cli_datasource
993);
994
995#[cfg(doctest)]
996doc_comment::doctest!(
997    "../../../docs/source/user-guide/cli/installation.md",
998    user_guide_cli_installation
999);
1000
1001#[cfg(doctest)]
1002doc_comment::doctest!(
1003    "../../../docs/source/user-guide/cli/overview.md",
1004    user_guide_cli_overview
1005);
1006
1007#[cfg(doctest)]
1008doc_comment::doctest!(
1009    "../../../docs/source/user-guide/cli/usage.md",
1010    user_guide_cli_usage
1011);
1012
1013#[cfg(doctest)]
1014doc_comment::doctest!(
1015    "../../../docs/source/user-guide/features.md",
1016    user_guide_features
1017);
1018
1019#[cfg(doctest)]
1020doc_comment::doctest!(
1021    "../../../docs/source/user-guide/sql/aggregate_functions.md",
1022    user_guide_sql_aggregate_functions
1023);
1024
1025#[cfg(doctest)]
1026doc_comment::doctest!(
1027    "../../../docs/source/user-guide/sql/data_types.md",
1028    user_guide_sql_data_types
1029);
1030
1031#[cfg(doctest)]
1032doc_comment::doctest!(
1033    "../../../docs/source/user-guide/sql/ddl.md",
1034    user_guide_sql_ddl
1035);
1036
1037#[cfg(doctest)]
1038doc_comment::doctest!(
1039    "../../../docs/source/user-guide/sql/dml.md",
1040    user_guide_sql_dml
1041);
1042
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/explain.md",
    // Fixed typo: was `user_guide_sql_exmplain`; now matches the naming
    // convention of the surrounding doctest registrations.
    user_guide_sql_explain
);
1048
1049#[cfg(doctest)]
1050doc_comment::doctest!(
1051    "../../../docs/source/user-guide/sql/information_schema.md",
1052    user_guide_sql_information_schema
1053);
1054
1055#[cfg(doctest)]
1056doc_comment::doctest!(
1057    "../../../docs/source/user-guide/sql/operators.md",
1058    user_guide_sql_operators
1059);
1060
1061#[cfg(doctest)]
1062doc_comment::doctest!(
1063    "../../../docs/source/user-guide/sql/prepared_statements.md",
1064    user_guide_prepared_statements
1065);
1066
1067#[cfg(doctest)]
1068doc_comment::doctest!(
1069    "../../../docs/source/user-guide/sql/scalar_functions.md",
1070    user_guide_sql_scalar_functions
1071);
1072
1073#[cfg(doctest)]
1074doc_comment::doctest!(
1075    "../../../docs/source/user-guide/sql/select.md",
1076    user_guide_sql_select
1077);
1078
1079#[cfg(doctest)]
1080doc_comment::doctest!(
1081    "../../../docs/source/user-guide/sql/special_functions.md",
1082    user_guide_sql_special_functions
1083);
1084
1085#[cfg(doctest)]
1086doc_comment::doctest!(
1087    "../../../docs/source/user-guide/sql/subqueries.md",
1088    user_guide_sql_subqueries
1089);
1090
1091#[cfg(doctest)]
1092doc_comment::doctest!(
1093    "../../../docs/source/user-guide/sql/window_functions.md",
1094    user_guide_sql_window_functions
1095);
1096
1097#[cfg(doctest)]
1098doc_comment::doctest!(
1099    "../../../docs/source/user-guide/sql/format_options.md",
1100    user_guide_sql_format_options
1101);
1102
1103#[cfg(doctest)]
1104doc_comment::doctest!(
1105    "../../../docs/source/library-user-guide/functions/adding-udfs.md",
1106    library_user_guide_functions_adding_udfs
1107);
1108
1109#[cfg(doctest)]
1110doc_comment::doctest!(
1111    "../../../docs/source/library-user-guide/functions/spark.md",
1112    library_user_guide_functions_spark
1113);
1114
1115#[cfg(doctest)]
1116doc_comment::doctest!(
1117    "../../../docs/source/library-user-guide/building-logical-plans.md",
1118    library_user_guide_building_logical_plans
1119);
1120
1121#[cfg(doctest)]
1122doc_comment::doctest!(
1123    "../../../docs/source/library-user-guide/catalogs.md",
1124    library_user_guide_catalogs
1125);
1126
1127#[cfg(doctest)]
1128doc_comment::doctest!(
1129    "../../../docs/source/library-user-guide/custom-table-providers.md",
1130    library_user_guide_custom_table_providers
1131);
1132
1133#[cfg(doctest)]
1134doc_comment::doctest!(
1135    "../../../docs/source/library-user-guide/extending-operators.md",
1136    library_user_guide_extending_operators
1137);
1138
1139#[cfg(doctest)]
1140doc_comment::doctest!(
1141    "../../../docs/source/library-user-guide/extensions.md",
1142    library_user_guide_extensions
1143);
1144
1145#[cfg(doctest)]
1146doc_comment::doctest!(
1147    "../../../docs/source/library-user-guide/index.md",
1148    library_user_guide_index
1149);
1150
1151#[cfg(doctest)]
1152doc_comment::doctest!(
1153    "../../../docs/source/library-user-guide/profiling.md",
1154    library_user_guide_profiling
1155);
1156
1157#[cfg(doctest)]
1158doc_comment::doctest!(
1159    "../../../docs/source/library-user-guide/query-optimizer.md",
1160    library_user_guide_query_optimizer
1161);
1162
1163#[cfg(doctest)]
1164doc_comment::doctest!(
1165    "../../../docs/source/library-user-guide/using-the-dataframe-api.md",
1166    library_user_guide_dataframe_api
1167);
1168
1169#[cfg(doctest)]
1170doc_comment::doctest!(
1171    "../../../docs/source/library-user-guide/using-the-sql-api.md",
1172    library_user_guide_sql_api
1173);
1174
1175#[cfg(doctest)]
1176doc_comment::doctest!(
1177    "../../../docs/source/library-user-guide/working-with-exprs.md",
1178    library_user_guide_working_with_exprs
1179);
1180
1181#[cfg(doctest)]
1182doc_comment::doctest!(
1183    "../../../docs/source/library-user-guide/upgrading/46.0.0.md",
1184    library_user_guide_upgrading_46_0_0
1185);
1186
1187#[cfg(doctest)]
1188doc_comment::doctest!(
1189    "../../../docs/source/library-user-guide/upgrading/47.0.0.md",
1190    library_user_guide_upgrading_47_0_0
1191);
1192
1193#[cfg(doctest)]
1194doc_comment::doctest!(
1195    "../../../docs/source/library-user-guide/upgrading/48.0.0.md",
1196    library_user_guide_upgrading_48_0_0
1197);
1198
1199#[cfg(doctest)]
1200doc_comment::doctest!(
1201    "../../../docs/source/library-user-guide/upgrading/48.0.1.md",
1202    library_user_guide_upgrading_48_0_1
1203);
1204
1205#[cfg(doctest)]
1206doc_comment::doctest!(
1207    "../../../docs/source/library-user-guide/upgrading/49.0.0.md",
1208    library_user_guide_upgrading_49_0_0
1209);
1210
1211#[cfg(doctest)]
1212doc_comment::doctest!(
1213    "../../../docs/source/library-user-guide/upgrading/50.0.0.md",
1214    library_user_guide_upgrading_50_0_0
1215);
1216
1217#[cfg(doctest)]
1218doc_comment::doctest!(
1219    "../../../docs/source/library-user-guide/upgrading/51.0.0.md",
1220    library_user_guide_upgrading_51_0_0
1221);
1222
1223#[cfg(doctest)]
1224doc_comment::doctest!(
1225    "../../../docs/source/library-user-guide/upgrading/52.0.0.md",
1226    library_user_guide_upgrading_52_0_0
1227);
1228
1229#[cfg(doctest)]
1230doc_comment::doctest!(
1231    "../../../docs/source/library-user-guide/upgrading/53.0.0.md",
1232    library_user_guide_upgrading_53_0_0
1233);
1234
1235#[cfg(doctest)]
1236doc_comment::doctest!(
1237    "../../../docs/source/contributor-guide/api-health.md",
1238    contributor_guide_api_health
1239);