Struct FileScanConfig

Source
/// The base configuration for a `DataSourceExec`, a physical plan for
/// scanning any given file format.
pub struct FileScanConfig {
    /// Object store URL prefix (e.g. `file://` or `s3://my_bucket`) used to
    /// obtain an `ObjectStore` instance from `RuntimeEnv::object_store`.
    /// Should not include the path to the file itself.
    pub object_store_url: ObjectStoreUrl,
    /// Schema before projection is applied: all columns that may appear in
    /// the files, excluding table partition columns. The physical file
    /// schema is mapped onto this schema.
    pub file_schema: SchemaRef,
    /// Files to be processed, grouped into partitions. Partitions may be
    /// read concurrently; files within a partition are read sequentially.
    pub file_groups: Vec<FileGroup>,
    /// Table constraints.
    pub constraints: Constraints,
    /// Column indices on which to project the data; indices higher than the
    /// number of columns in `file_schema` refer to `table_partition_cols`.
    pub projection: Option<Vec<usize>>,
    /// Maximum number of records to read from this plan; `None` returns all
    /// records after filtering.
    pub limit: Option<usize>,
    /// The partitioning columns.
    pub table_partition_cols: Vec<Field>,
    /// All equivalent lexicographical orderings that describe the schema.
    pub output_ordering: Vec<LexOrdering>,
    /// File compression type.
    pub file_compression_type: FileCompressionType,
    /// Whether newlines in (quoted) values are supported (CSV options).
    pub new_lines_in_values: bool,
    /// File source, such as `ParquetSource`, `CsvSource`, `JsonSource`.
    pub file_source: Arc<dyn FileSource>,
    /// Batch size used while creating new batches; defaults to the
    /// execution options' `batch_size`.
    pub batch_size: Option<usize>,
}
Expand description

The base configuration for a DataSourceExec, a physical plan for any given file format.

Use Self::build to create a DataSourceExec from a `FileScanConfig`.

§Example

// create FileScanConfig for reading parquet files from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let file_source = Arc::new(ParquetSource::new());
let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source)
  .with_limit(Some(1000))            // read only the first 1000 records
  .with_projection(Some(vec![2, 3])) // project columns 2 and 3
   // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
  .with_file(PartitionedFile::new("file1.parquet", 1234))
  // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes
  // in a single file group
  .with_file_group(FileGroup::new(vec![
   PartitionedFile::new("file2.parquet", 56),
   PartitionedFile::new("file3.parquet", 78),
  ])).build();
// create an execution plan from the config
let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);

Fields§

§object_store_url: ObjectStoreUrl

Object store URL, used to get an ObjectStore instance from RuntimeEnv::object_store

This ObjectStoreUrl should be the prefix of the absolute url for files as file:// or s3://my_bucket. It should not include the path to the file itself. The relevant URL prefix must be registered via RuntimeEnv::register_object_store

§file_schema: SchemaRef

Schema before projection is applied. It contains all the columns that may appear in the files. It does not include table partition columns that may be added. Note that this is not the schema of the physical files. This is the schema that the physical file schema will be mapped onto, and the schema that the DataSourceExec will return.

§file_groups: Vec<FileGroup>

List of files to be processed, grouped into partitions

Each file must have a schema of file_schema or a subset. If a particular file has a subset, the missing columns are padded with NULLs.

DataFusion may attempt to read each partition of files concurrently, however files within a partition will be read sequentially, one after the next.

§constraints: Constraints

Table constraints

§projection: Option<Vec<usize>>

Columns on which to project the data. Indexes that are higher than the number of columns of file_schema refer to table_partition_cols.

§limit: Option<usize>

The maximum number of records to read from this plan. If None, all records after filtering are returned.

§table_partition_cols: Vec<Field>

The partitioning columns

§output_ordering: Vec<LexOrdering>

All equivalent lexicographical orderings that describe the schema.

§file_compression_type: FileCompressionType

File compression type

§new_lines_in_values: bool

Are new lines in values supported for CSVOptions

§file_source: Arc<dyn FileSource>

File source such as ParquetSource, CsvSource, JsonSource, etc.

§batch_size: Option<usize>

Batch size while creating new batches. Defaults to datafusion_common::config::ExecutionOptions batch_size.

Implementations§

Source§

impl FileScanConfig

Source

pub fn new( object_store_url: ObjectStoreUrl, file_schema: SchemaRef, file_source: Arc<dyn FileSource>, ) -> Self

Create a new FileScanConfig with default settings for scanning files.

See example on FileScanConfig

No file groups are added by default. See Self::with_file, Self::with_file_group and Self::with_file_groups.

§Parameters:
Source

pub fn with_source(self, file_source: Arc<dyn FileSource>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the file source

Source

pub fn with_constraints(self, constraints: Constraints) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the table constraints of the files

Source

pub fn with_statistics(self, statistics: Statistics) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the statistics of the files

Source

pub fn projected_stats(&self) -> Statistics

Source

pub fn projected_schema(&self) -> Arc<Schema>

Source

pub fn projected_constraints(&self) -> Constraints

Source

pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the projection of the files

Source

pub fn with_limit(self, limit: Option<usize>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the limit of the files

Source

pub fn with_file(self, file: PartitionedFile) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Add a file as a single group

See Self::file_groups for more information.

Source

pub fn with_file_groups(self, file_groups: Vec<FileGroup>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Add the file groups

See Self::file_groups for more information.

Source

pub fn with_file_group(self, file_group: FileGroup) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Add a new file group

See Self::file_groups for more information

Source

pub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the partitioning columns of the files

Source

pub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the output ordering of the files

Source

pub fn with_file_compression_type( self, file_compression_type: FileCompressionType, ) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the file compression type

Source

pub fn with_newlines_in_values(self, new_lines_in_values: bool) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the new_lines_in_values property

Source

pub fn with_batch_size(self, batch_size: Option<usize>) -> Self

👎Deprecated since 47.0.0: use FileScanConfigBuilder instead

Set the batch_size property

Source

pub fn newlines_in_values(&self) -> bool

Specifies whether newlines in (quoted) values are supported.

Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to true ensures that newlines in values are parsed successfully, which may reduce performance.

The default behaviour depends on the datafusion.catalog.newlines_in_values setting.

Source

pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)

Project the schema, constraints, and the statistics on the given column indices

Source

pub fn projected_file_column_names(&self) -> Option<Vec<String>>

Source

pub fn projected_file_schema(&self) -> SchemaRef

Projects only file schema, ignoring partition columns

Source

pub fn file_column_projection_indices(&self) -> Option<Vec<usize>>

Source

pub fn split_groups_by_statistics_with_target_partitions( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, target_partitions: usize, ) -> Result<Vec<FileGroup>>

Splits file groups into new groups based on statistics to enable efficient parallel processing.

The method distributes files across a target number of partitions while ensuring files within each partition maintain sort order based on their min/max statistics.

The algorithm works by:

  1. Takes files sorted by minimum values
  2. For each file:
  • Finds eligible groups (empty or where file’s min > group’s last max)
  • Selects the smallest eligible group
  • Creates a new group if needed
§Parameters
  • table_schema: Schema containing information about the columns
  • file_groups: The original file groups to split
  • sort_order: The lexicographical ordering to maintain within each group
  • target_partitions: The desired number of output partitions
§Returns

A new set of file groups, where files within each group are non-overlapping with respect to their min/max statistics and maintain the specified sort order.

Source

pub fn split_groups_by_statistics( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, ) -> Result<Vec<FileGroup>>

Attempts to do a bin-packing on files into file groups, such that any two files in a file group are ordered and non-overlapping with respect to their statistics. It will produce the smallest number of file groups possible.

Source

pub fn build(self) -> Arc<DataSourceExec>

👎Deprecated since 47.0.0: use DataSourceExec::new instead

Returns a new DataSourceExec to scan the files specified by this config

Source

pub fn file_source(&self) -> &Arc<dyn FileSource>

Returns the file_source

Trait Implementations§

Source§

impl Clone for FileScanConfig

Source§

fn clone(&self) -> FileScanConfig

Returns a duplicate of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl DataSource for FileScanConfig

Source§

fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>

If supported by the underlying FileSource, redistribute files across partitions according to their size.

Source§

fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>

Source§

fn as_any(&self) -> &dyn Any

Source§

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

Format this source for display in explain plans
Source§

fn output_partitioning(&self) -> Partitioning

Source§

fn eq_properties(&self) -> EquivalenceProperties

Source§

fn statistics(&self) -> Result<Statistics>

Source§

fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>

Return a copy of this DataSource with a new fetch limit
Source§

fn fetch(&self) -> Option<usize>

Source§

fn metrics(&self) -> ExecutionPlanMetricsSet

Source§

fn try_swapping_with_projection( &self, projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>>

Source§

fn try_pushdown_filters( &self, filters: Vec<Arc<dyn PhysicalExpr>>, config: &ConfigOptions, ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>

Try to push down filters into this DataSource. See ExecutionPlan::handle_child_pushdown_result for more details.
Source§

impl Debug for FileScanConfig

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult

Formats the value using the given formatter. Read more
Source§

impl DisplayAs for FileScanConfig

Source§

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

Format according to DisplayFormatType, used when verbose representation looks different from the default one Read more
Source§

impl From<FileScanConfig> for FileScanConfigBuilder

Source§

fn from(config: FileScanConfig) -> Self

Converts to this type from the input type.

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> ErasedDestructor for T
where T: 'static,