pub struct FileScanConfig {
pub object_store_url: ObjectStoreUrl,
pub file_groups: Vec<FileGroup>,
pub constraints: Constraints,
pub limit: Option<usize>,
pub preserve_order: bool,
pub output_ordering: Vec<LexOrdering>,
pub file_compression_type: FileCompressionType,
pub file_source: Arc<dyn FileSource>,
pub batch_size: Option<usize>,
pub expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
pub partitioned_by_file_group: bool,
/* private fields */
}
FileScanConfig represents scanning data from a group of files
FileScanConfig is used to create a DataSourceExec, the physical plan
for scanning files with a particular file format.
The FileSource (e.g. ParquetSource, CsvSource, etc.) is responsible
for creating the actual execution plan to read the files based on a
FileScanConfig. Fields in a FileScanConfig such as Statistics represent
information about the files before any projection or filtering is
applied in the file source.
Use FileScanConfigBuilder to construct a FileScanConfig.
Use DataSourceExec::from_data_source to create a DataSourceExec from
a FileScanConfig.
§Example
// create FileScan config for reading parquet files from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let file_source = Arc::new(ParquetSource::new(file_schema.clone()));
let config = FileScanConfigBuilder::new(object_store_url, file_source)
.with_limit(Some(1000)) // read only the first 1000 records
.with_projection_indices(Some(vec![2, 3])) // project columns 2 and 3
.expect("Failed to push down projection")
// Read /tmp/file1.parquet with known size of 1234 bytes in a single group
.with_file(PartitionedFile::new("file1.parquet", 1234))
// Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes
// in a single file group
.with_file_group(FileGroup::new(vec![
PartitionedFile::new("file2.parquet", 56),
PartitionedFile::new("file3.parquet", 78),
])).build();
// create an execution plan from the config
let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
Fields§
§object_store_url: ObjectStoreUrl
Object store URL, used to get an ObjectStore instance from RuntimeEnv::object_store.
This ObjectStoreUrl should be the prefix of the absolute URL for files, such as file:// or s3://my_bucket. It should not include the path to the file itself. The relevant URL prefix must be registered via RuntimeEnv::register_object_store.
§file_groups: Vec<FileGroup>
List of files to be processed, grouped into partitions.
Each file must have a schema of file_schema or a subset. If a particular file has a subset, the missing columns are padded with NULLs.
DataFusion may attempt to read each partition of files concurrently; however, files within a partition will be read sequentially, one after the next.
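The NULL-padding behavior can be illustrated with a standalone sketch (not DataFusion code; the pad_row helper and the use of a plain map for a row are illustrative assumptions):

```rust
use std::collections::HashMap;

// Illustrative sketch only: a file whose schema is a subset of the table's
// file_schema has its missing columns padded with NULL (modeled as `None`).
fn pad_row(file_schema: &[&str], file_row: &HashMap<&str, i64>) -> Vec<Option<i64>> {
    file_schema
        .iter()
        .map(|col| file_row.get(col).copied())
        .collect()
}

fn main() {
    // the file contains only columns "a" and "c"; "b" is padded with NULL
    let row = HashMap::from([("a", 1i64), ("c", 3)]);
    let padded = pad_row(&["a", "b", "c"], &row);
    assert_eq!(padded, vec![Some(1), None, Some(3)]);
}
```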
§constraints: Constraints
Table constraints.
§limit: Option<usize>
The maximum number of records to read from this plan. If None, all records after filtering are returned.
§preserve_order: bool
Whether the scan’s limit is order sensitive.
When true, files must be read in the exact order specified to produce correct results (e.g., for ORDER BY ... LIMIT queries). When false, DataFusion may reorder file processing for optimization without affecting correctness.
§output_ordering: Vec<LexOrdering>
All equivalent lexicographical output orderings of this file scan, in terms of FileSource::table_schema. See FileScanConfigBuilder::with_output_ordering for more details.
Self::eq_properties uses this information along with projection and filtering information to compute the effective EquivalenceProperties.
§file_compression_type: FileCompressionType
File compression type.
§file_source: Arc<dyn FileSource>
File source such as ParquetSource, CsvSource, JsonSource, etc.
§batch_size: Option<usize>
Batch size while creating new batches. Defaults to the batch_size in datafusion_common::config::ExecutionOptions.
§expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>
Expression adapter used to adapt filters and projections that are pushed down into the scan from the logical schema to the physical schema of the file.
§partitioned_by_file_group: bool
When true, file_groups are organized by partition column values and output_partitioning will return Hash partitioning on partition columns. This allows the optimizer to skip hash repartitioning for aggregates and joins on partition columns.
If the number of file partitions > target_partitions, the file partitions will be grouped in a round-robin fashion such that the number of file partitions = target_partitions.
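The round-robin regrouping described for partitioned_by_file_group can be sketched as follows (an illustrative standalone sketch; round_robin_regroup is a hypothetical helper, not a DataFusion API):

```rust
// Hypothetical sketch: fold `partitions.len()` file partitions into at most
// `target` buckets, round-robin, as described for `partitioned_by_file_group`
// (assumes `target > 0`).
fn round_robin_regroup(partitions: Vec<Vec<&str>>, target: usize) -> Vec<Vec<&str>> {
    if partitions.len() <= target {
        // already at or below the target: keep partitions as-is
        return partitions;
    }
    let mut buckets: Vec<Vec<&str>> = vec![Vec::new(); target];
    for (i, files) in partitions.into_iter().enumerate() {
        // partition i lands in bucket i mod target
        buckets[i % target].extend(files);
    }
    buckets
}

fn main() {
    // five Hive-style partitions folded into two buckets
    let parts = vec![vec!["a"], vec!["b"], vec!["c"], vec!["d"], vec!["e"]];
    let grouped = round_robin_regroup(parts, 2);
    assert_eq!(grouped, vec![vec!["a", "c", "e"], vec!["b", "d"]]);
}
```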
Implementations§
impl FileScanConfig
pub fn file_schema(&self) -> &SchemaRef
Get the file schema (schema of the files without partition columns)
pub fn table_partition_cols(&self) -> &Vec<FieldRef>
Get the table partition columns
pub fn statistics(&self) -> Statistics
Returns the unprojected table statistics, marking them as inexact if filters are present.
When filters are pushed down (including pruning predicates and bloom filters), we can’t guarantee the statistics are exact because we don’t know how many rows will be filtered out.
pub fn projected_schema(&self) -> Result<Arc<Schema>>
pub fn newlines_in_values(&self) -> bool
👎Deprecated since 52.0.0: newlines_in_values has moved to CsvSource. Access it via CsvSource::csv_options().newlines_in_values instead. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first.
Returns whether newlines in values are supported.
This method always returns false. The actual newlines_in_values setting has been moved to CsvSource and should be accessed via CsvSource::csv_options() instead.
pub fn projected_constraints(&self) -> Constraints
This method is no longer used, use eq_properties instead. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first.
pub fn file_column_projection_indices(&self) -> Option<Vec<usize>>
This method is no longer used, use eq_properties instead. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first.
pub fn split_groups_by_statistics_with_target_partitions(
    table_schema: &SchemaRef,
    file_groups: &[FileGroup],
    sort_order: &LexOrdering,
    target_partitions: usize,
) -> Result<Vec<FileGroup>>
Splits file groups into new groups based on statistics to enable efficient parallel processing.
The method distributes files across a target number of partitions while ensuring files within each partition maintain sort order based on their min/max statistics.
The algorithm works by:
- Takes files sorted by minimum values
- For each file:
- Finds eligible groups (empty or where file’s min > group’s last max)
- Selects the smallest eligible group
- Creates a new group if needed
§Parameters
- table_schema: Schema containing information about the columns
- file_groups: The original file groups to split
- sort_order: The lexicographical ordering to maintain within each group
- target_partitions: The desired number of output partitions
§Returns
A new set of file groups, where files within each group are non-overlapping with respect to their min/max statistics and maintain the specified sort order.
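The algorithm above can be sketched with plain (min, max) integer ranges standing in for per-file column statistics. This is an illustrative sketch under stated assumptions, not the DataFusion implementation; split_by_ranges and its overlap fallback are hypothetical:

```rust
// Sketch of the grouping algorithm described above. `files` must already be
// sorted by `min`. A group is eligible for a file if it is empty or its last
// file's max is below the file's min; among eligible groups, the smallest is
// chosen. Overlapping files that fit nowhere fall back to the smallest group
// (an assumption of this sketch).
fn split_by_ranges(files: &[(i64, i64)], target_partitions: usize) -> Vec<Vec<(i64, i64)>> {
    // start with `target_partitions` empty groups; empty groups are always eligible
    let mut groups: Vec<Vec<(i64, i64)>> = vec![Vec::new(); target_partitions];
    for &(min, max) in files {
        let i = groups
            .iter()
            .enumerate()
            .filter(|(_, g)| g.last().map_or(true, |&(_, last_max)| min > last_max))
            .min_by_key(|(_, g)| g.len())
            .map(|(i, _)| i)
            // no eligible group: overlap is unavoidable, use the smallest group
            .unwrap_or_else(|| (0..groups.len()).min_by_key(|&i| groups[i].len()).unwrap());
        groups[i].push((min, max));
    }
    // drop any groups that received no files
    groups.retain(|g| !g.is_empty());
    groups
}

fn main() {
    // (0,10) overlaps (5,15) and (20,30) overlaps (25,35), so they are
    // separated; non-overlapping files share a group.
    let groups = split_by_ranges(&[(0, 10), (5, 15), (20, 30), (25, 35)], 2);
    assert_eq!(groups, vec![vec![(0, 10), (20, 30)], vec![(5, 15), (25, 35)]]);
}
```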
pub fn split_groups_by_statistics(
    table_schema: &SchemaRef,
    file_groups: &[FileGroup],
    sort_order: &LexOrdering,
) -> Result<Vec<FileGroup>>
Attempts to do a bin-packing on files into file groups, such that any two files in a file group are ordered and non-overlapping with respect to their statistics. It will produce the smallest number of file groups possible.
pub fn file_source(&self) -> &Arc<dyn FileSource>
Returns the file_source
Trait Implementations§
impl Clone for FileScanConfig
fn clone(&self) -> FileScanConfig
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.
impl DataSource for FileScanConfig
fn repartitioned(
    &self,
    target_partitions: usize,
    repartition_file_min_size: usize,
    output_ordering: Option<LexOrdering>,
) -> Result<Option<Arc<dyn DataSource>>>
If supported by the underlying FileSource, redistribute files across partitions according to their size.
fn output_partitioning(&self) -> Partitioning
Returns the output partitioning for this file scan.
When partitioned_by_file_group is true, this returns Partitioning::Hash on
the Hive partition columns, allowing the optimizer to skip hash repartitioning
for aggregates and joins on those columns.
Tradeoffs
- Benefit: Eliminates RepartitionExec and SortExec for queries with GROUP BY or ORDER BY on partition columns.
- Cost: Files are grouped by partition values rather than split by byte ranges, which may reduce I/O parallelism when partition sizes are uneven. For simple aggregations without ORDER BY, this cost may outweigh the benefit.
Follow-up Work
- Idea: Could allow byte-range splitting within partition-aware groups, preserving I/O parallelism while maintaining partition semantics.
fn eq_properties(&self) -> EquivalenceProperties
Computes the effective equivalence properties of this file scan, taking into account the file schema, any projections or filters applied by the file source, and the output ordering.
fn open(&self, partition: usize, context: Arc<TaskContext>) -> Result<SendableRecordBatchStream>
fn as_any(&self) -> &dyn Any
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult
fn scheduling_type(&self) -> SchedulingType
fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics>
Returns statistics for the given partition, or for the entire plan when partition is None.
fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>
fn fetch(&self) -> Option<usize>
fn metrics(&self) -> ExecutionPlanMetricsSet
fn try_swapping_with_projection(&self, projection: &ProjectionExprs) -> Result<Option<Arc<dyn DataSource>>>
fn try_pushdown_filters(
    &self,
    filters: Vec<Arc<dyn PhysicalExpr>>,
    config: &ConfigOptions,
) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>
fn try_pushdown_sort(
    &self,
    order: &[PhysicalSortExpr],
) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>>
fn with_preserve_order(
    &self,
    preserve_order: bool,
) -> Option<Arc<dyn DataSource>>
Returns a DataSource that is aware of order-sensitivity.
impl Debug for FileScanConfig
impl DisplayAs for FileScanConfig
impl From<FileScanConfig> for FileScanConfigBuilder
fn from(config: FileScanConfig) -> Self
Auto Trait Implementations§
impl Freeze for FileScanConfig
impl !RefUnwindSafe for FileScanConfig
impl Send for FileScanConfig
impl Sync for FileScanConfig
impl Unpin for FileScanConfig
impl UnsafeUnpin for FileScanConfig
impl !UnwindSafe for FileScanConfig
Blanket Implementations§
impl<T> BorrowMut<T> for T where T: ?Sized
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.
impl<T> CloneToUninit for T where T: Clone
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true; otherwise converts self into a Right variant.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true; otherwise converts self into a Right variant.