pub trait DataSource:
Any
+ Send
+ Sync
+ Debug {
Show 17 methods
// Required methods
fn open(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream>;
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> Result;
fn output_partitioning(&self) -> Partitioning;
fn eq_properties(&self) -> EquivalenceProperties;
fn partition_statistics(
&self,
partition: Option<usize>,
) -> Result<Arc<Statistics>>;
fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn DataSource>>;
fn fetch(&self) -> Option<usize>;
fn try_swapping_with_projection(
&self,
_projection: &ProjectionExprs,
) -> Result<Option<Arc<dyn DataSource>>>;
// Provided methods
fn repartitioned(
&self,
_target_partitions: usize,
_repartition_file_min_size: usize,
_output_ordering: Option<LexOrdering>,
) -> Result<Option<Arc<dyn DataSource>>> { ... }
fn scheduling_type(&self) -> SchedulingType { ... }
fn metrics(&self) -> ExecutionPlanMetricsSet { ... }
fn try_pushdown_filters(
&self,
filters: Vec<Arc<dyn PhysicalExpr>>,
_config: &ConfigOptions,
) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>> { ... }
fn try_pushdown_sort(
&self,
_order: &[PhysicalSortExpr],
) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>> { ... }
fn with_preserve_order(
&self,
_preserve_order: bool,
) -> Option<Arc<dyn DataSource>> { ... }
fn with_new_state(
&self,
_state: Arc<dyn Any + Send + Sync>,
) -> Option<Arc<dyn DataSource>> { ... }
fn create_sibling_state(&self) -> Option<Arc<dyn Any + Send + Sync>> { ... }
fn open_with_args(
&self,
args: OpenArgs,
) -> Result<SendableRecordBatchStream> { ... }
}
Expand description
A source of data, typically a list of files or memory.
This trait provides common behaviors for abstract sources of data. It has two common implementations:
- FileScanConfig: lists of files
- MemorySourceConfig: in-memory list of RecordBatch
File format specific behaviors are defined by FileSource.
§See Also
- FileSource for file format specific implementations (Parquet, Json, etc)
- DataSourceExec: The ExecutionPlan that reads from a DataSource
§Notes
Requires Debug to assist debugging.
The following diagram shows how DataSource, FileSource, and DataSourceExec are related:
┌─────────────────────┐ -----► execute path
│ │ ┄┄┄┄┄► init path
│ DataSourceExec │
│ │
└───────▲─────────────┘
┊ │
┊ │
┌──────────▼──────────┐ ┌──────────-──────────┐
│ │ | |
│ DataSource(trait) │ | TableProvider(trait)|
│ │ | |
└───────▲─────────────┘ └─────────────────────┘
┊ │ ┊
┌───────────────┿──┴────────────────┐ ┊
| ┌┄┄┄┄┄┄┄┄┄┄┄┘ | ┊
| ┊ | ┊
┌──────────▼──────────┐ ┌──────────▼──────────┐ ┊
│ │ │ │ ┌──────────▼──────────┐
│ FileScanConfig │ │ MemorySourceConfig │ | |
│ │ │ │ | FileFormat(trait) |
└──────────────▲──────┘ └─────────────────────┘ | |
│ ┊ └─────────────────────┘
│ ┊ ┊
│ ┊ ┊
┌──────────▼──────────┐ ┌──────────▼──────────┐
│ │ │ ArrowSource │
│ FileSource(trait) ◄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄│ ... │
│ │ │ ParquetSource │
└─────────────────────┘ └─────────────────────┘
│
│
│
│
┌──────────▼──────────┐
│ ArrowSource │
│ ... │
│ ParquetSource │
└─────────────────────┘
|
FileOpener (called by FileStream)
│
┌──────────▼──────────┐
│ │
│ RecordBatch │
│ │
└─────────────────────┘
Required Methods§
Source
fn open(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream>
fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>
Open the specified output partition and return its stream of
RecordBatches.
This should be used by data sources that do not need any sibling
coordination. Data sources that want to use per-execution shared state
(for example, to reorder work across partitions at runtime) should
implement Self::open_with_args instead.
Source
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> Result
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> Result
Format this source for display in explain plans
fn output_partitioning(&self) -> Partitioning
fn eq_properties(&self) -> EquivalenceProperties
Source
fn partition_statistics(
&self,
partition: Option<usize>,
) -> Result<Arc<Statistics>>
fn partition_statistics( &self, partition: Option<usize>, ) -> Result<Arc<Statistics>>
Returns statistics for a specific partition, or aggregate statistics
across all partitions if partition is None.
Source
fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn DataSource>>
fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn DataSource>>
Return a copy of this DataSource with a new fetch limit
fn fetch(&self) -> Option<usize>
fn try_swapping_with_projection( &self, _projection: &ProjectionExprs, ) -> Result<Option<Arc<dyn DataSource>>>
Provided Methods§
Source
fn repartitioned(
&self,
_target_partitions: usize,
_repartition_file_min_size: usize,
_output_ordering: Option<LexOrdering>,
) -> Result<Option<Arc<dyn DataSource>>>
fn repartitioned( &self, _target_partitions: usize, _repartition_file_min_size: usize, _output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>
Return a copy of this DataSource with a new partitioning scheme.
Returns Ok(None) (the default) if the partitioning cannot be changed.
Refer to ExecutionPlan::repartitioned for details on when None should be returned.
Repartitioning should not change the output ordering, if this ordering exists.
Refer to MemorySourceConfig::repartition_preserving_order
and the FileSource’s
FileGroupPartitioner::repartition_file_groups
for examples.
fn scheduling_type(&self) -> SchedulingType
fn metrics(&self) -> ExecutionPlanMetricsSet
Source
fn try_pushdown_filters(
&self,
filters: Vec<Arc<dyn PhysicalExpr>>,
_config: &ConfigOptions,
) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>
fn try_pushdown_filters( &self, filters: Vec<Arc<dyn PhysicalExpr>>, _config: &ConfigOptions, ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>
Try to push down filters into this DataSource.
These filters are in terms of the output schema of this DataSource (e.g.
Self::eq_properties and output of any projections pushed into the
source), not the original table schema.
See ExecutionPlan::handle_child_pushdown_result for more details.
Source
fn try_pushdown_sort(
&self,
_order: &[PhysicalSortExpr],
) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>>
fn try_pushdown_sort( &self, _order: &[PhysicalSortExpr], ) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>>
Try to create a new DataSource that produces data in the specified sort order.
§Arguments
- order - The desired output ordering
§Returns
- Ok(SortOrderPushdownResult::Exact { .. }) - Created a source that guarantees exact ordering
- Ok(SortOrderPushdownResult::Inexact { .. }) - Created a source optimized for the ordering
- Ok(SortOrderPushdownResult::Unsupported) - Cannot optimize for this ordering
- Err(e) - Error occurred
Default implementation returns Unsupported.
Source
fn with_preserve_order(
&self,
_preserve_order: bool,
) -> Option<Arc<dyn DataSource>>
fn with_preserve_order( &self, _preserve_order: bool, ) -> Option<Arc<dyn DataSource>>
Returns a variant of this DataSource that is aware of order-sensitivity.
Source
fn with_new_state(
&self,
_state: Arc<dyn Any + Send + Sync>,
) -> Option<Arc<dyn DataSource>>
fn with_new_state( &self, _state: Arc<dyn Any + Send + Sync>, ) -> Option<Arc<dyn DataSource>>
Injects arbitrary run-time state into this DataSource, returning a new instance that incorporates that state if it is relevant to the concrete DataSource implementation.
This is a generic entry point: the state can be any type wrapped in
Arc<dyn Any + Send + Sync>. A data source that cares about the state should
down-cast it to the concrete type it expects and, if successful, return a
modified copy of itself that captures the provided value. If the state is
not applicable, the default behaviour is to return None so that parent
nodes can continue propagating the attempt further down the plan tree.
Source
fn create_sibling_state(&self) -> Option<Arc<dyn Any + Send + Sync>>
fn create_sibling_state(&self) -> Option<Arc<dyn Any + Send + Sync>>
Create per-execution state to share across sibling instances of this data source during one execution.
Returns None (the default) if this data source has
no sibling-shared execution state.
Source
fn open_with_args(&self, args: OpenArgs) -> Result<SendableRecordBatchStream>
fn open_with_args(&self, args: OpenArgs) -> Result<SendableRecordBatchStream>
Open a partition using optional sibling-shared execution state.
The default implementation ignores the additional state and delegates to
Self::open.
Implementations§
Source
§impl dyn DataSource
impl dyn DataSource
pub fn is<T: DataSource>(&self) -> bool
pub fn downcast_ref<T: DataSource>(&self) -> Option<&T>
Dyn Compatibility§
This trait is dyn compatible.
In older versions of Rust, dyn compatibility was called "object safety".