/// A source of data, typically a list of files or memory.
///
/// This trait provides common behaviors for abstract sources of data.
/// Implementors must be `Send`, `Sync` and `Debug`; `Debug` is required
/// to assist debugging.
pub trait DataSource:
Send
+ Sync
+ Debug {
// Required methods
// NOTE(review): `open` has no description in this listing; presumably it
// produces the record batch stream for the given `partition` — confirm.
fn open(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream>;
fn as_any(&self) -> &dyn Any;
/// Format this source for display in explain plans.
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> Result;
fn output_partitioning(&self) -> Partitioning;
fn eq_properties(&self) -> EquivalenceProperties;
fn statistics(&self) -> Result<Statistics>;
/// Return a copy of this DataSource with a new fetch limit.
fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn DataSource>>;
// NOTE(review): presumably the counterpart getter for the limit set via
// `with_fetch` — confirm against the implementations.
fn fetch(&self) -> Option<usize>;
fn try_swapping_with_projection(
&self,
_projection: &ProjectionExec,
) -> Result<Option<Arc<dyn ExecutionPlan>>>;
// Provided methods
/// Return a copy of this DataSource with a new partitioning scheme.
///
/// Returns `Ok(None)` (the default) if the partitioning cannot be changed.
/// Refer to `ExecutionPlan::repartitioned` for details on when `None`
/// should be returned.
///
/// Repartitioning should not change the output ordering, if this ordering
/// exists.
fn repartitioned(
&self,
_target_partitions: usize,
_repartition_file_min_size: usize,
_output_ordering: Option<LexOrdering>,
) -> Result<Option<Arc<dyn DataSource>>> { ... }
fn metrics(&self) -> ExecutionPlanMetricsSet { ... }
/// Try to push down filters into this DataSource.
///
/// See `ExecutionPlan::handle_child_pushdown_result` for more details.
fn try_pushdown_filters(
&self,
filters: Vec<Arc<dyn PhysicalExpr>>,
_config: &ConfigOptions,
) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>> { ... }
}
Expand description
A source of data, typically a list of files or memory
This trait provides common behaviors for abstract sources of data. It has two common implementations:
- FileScanConfig: lists of files
- MemorySourceConfig: in-memory list of RecordBatch
File format specific behaviors are defined by FileSource
§See Also
- FileSource: for file format specific implementations (Parquet, Json, etc)
- DataSourceExec: the ExecutionPlan that reads from a DataSource
§Notes
Requires Debug to assist debugging.
The following diagram shows how DataSource, FileSource, and DataSourceExec are related
                   ┌─────────────────────┐                            -----► execute path
                   │                     │                            ┄┄┄┄┄► init path
                   │   DataSourceExec    │
                   │                     │
                   └───────▲─────────────┘
                           ┊  │
                           ┊  │
                   ┌──────────▼──────────┐                            ┌──────────-──────────┐
                   │                     │                            |                     |
                   │  DataSource(trait)  │                            | TableProvider(trait)|
                   │                     │                            |                     |
                   └───────▲─────────────┘                            └─────────────────────┘
                           ┊  │                                                  ┊
           ┌───────────────┿──┴────────────────┐                                 ┊
           |   ┌┄┄┄┄┄┄┄┄┄┄┄┘                   |                                 ┊
           |   ┊                               |                                 ┊
┌──────────▼──────────┐             ┌──────────▼──────────┐                      ┊
│                     │             │                     │           ┌──────────▼──────────┐
│   FileScanConfig    │             │ MemorySourceConfig  │           |                     |
│                     │             │                     │           |  FileFormat(trait)  |
└──────────────▲──────┘             └─────────────────────┘           |                     |
           │   ┊                                                      └─────────────────────┘
           │   ┊                                                                 ┊
           │   ┊                                                                 ┊
┌──────────▼──────────┐                                               ┌──────────▼──────────┐
│                     │                                               │     ArrowSource     │
│ FileSource(trait)   ◄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄│          ...        │
│                     │                                               │    ParquetSource    │
└─────────────────────┘                                               └─────────────────────┘
           │
           │
           │
           │
┌──────────▼──────────┐
│     ArrowSource     │
│          ...        │
│    ParquetSource    │
└─────────────────────┘
           |
FileOpener (called by FileStream)
           │
┌──────────▼──────────┐
│                     │
│     RecordBatch     │
│                     │
└─────────────────────┘
Required Methods§
fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>
fn as_any(&self) -> &dyn Any
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> Result
Format this source for display in explain plans
fn output_partitioning(&self) -> Partitioning
fn eq_properties(&self) -> EquivalenceProperties
fn statistics(&self) -> Result<Statistics>
fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn DataSource>>
Return a copy of this DataSource with a new fetch limit
fn fetch(&self) -> Option<usize>
fn try_swapping_with_projection( &self, _projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>>
Provided Methods§
fn repartitioned( &self, _target_partitions: usize, _repartition_file_min_size: usize, _output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>
Return a copy of this DataSource with a new partitioning scheme.
Returns Ok(None) (the default) if the partitioning cannot be changed. Refer to ExecutionPlan::repartitioned for details on when None should be returned.
Repartitioning should not change the output ordering, if this ordering exists. Refer to MemorySourceConfig::repartition_preserving_order and the FileSource’s FileGroupPartitioner::repartition_file_groups for examples.
fn metrics(&self) -> ExecutionPlanMetricsSet
fn try_pushdown_filters( &self, filters: Vec<Arc<dyn PhysicalExpr>>, _config: &ConfigOptions, ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>
Try to push down filters into this DataSource.
See ExecutionPlan::handle_child_pushdown_result for more details.