Struct FileScanConfig

Source
pub struct FileScanConfig {
    pub object_store_url: ObjectStoreUrl,
    pub file_schema: SchemaRef,
    pub file_groups: Vec<Vec<PartitionedFile>>,
    pub constraints: Constraints,
    pub statistics: Statistics,
    pub projection: Option<Vec<usize>>,
    pub limit: Option<usize>,
    pub table_partition_cols: Vec<Field>,
    pub output_ordering: Vec<LexOrdering>,
    pub file_compression_type: FileCompressionType,
    pub new_lines_in_values: bool,
    pub file_source: Arc<dyn FileSource>,
}
Expand description

The base configuration for a DataSourceExec, the physical plan for any given file format.

Use Self::build to create a DataSourceExec from a `FileScanConfig`.

§Example

// create FileScan config for reading parquet files from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let file_source = Arc::new(ParquetSource::new());
let config = FileScanConfig::new(object_store_url, file_schema, file_source)
  .with_limit(Some(1000))            // read only the first 1000 records
  .with_projection(Some(vec![2, 3])) // project columns 2 and 3
   // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
  .with_file(PartitionedFile::new("file1.parquet", 1234))
  // Read /tmp/file2.parquet (56 bytes) and /tmp/file3.parquet (78 bytes)
  // in a single file group
  .with_file_group(vec![
   PartitionedFile::new("file2.parquet", 56),
   PartitionedFile::new("file3.parquet", 78),
  ]);
// create an execution plan from the config
let plan: Arc<dyn ExecutionPlan> = config.build();

Fields§

§object_store_url: ObjectStoreUrl

Object store URL, used to get an ObjectStore instance from RuntimeEnv::object_store

This ObjectStoreUrl should be the prefix of the absolute URL for files, such as file:// or s3://my_bucket. It should not include the path to the file itself. The relevant URL prefix must be registered via RuntimeEnv::register_object_store.

§file_schema: SchemaRef

Schema before projection is applied. It contains all the columns that may appear in the files. It does not include table partition columns that may be added.

§file_groups: Vec<Vec<PartitionedFile>>

List of files to be processed, grouped into partitions

Each file must have a schema of file_schema or a subset. If a particular file has a subset, the missing columns are padded with NULLs.

DataFusion may attempt to read each partition of files concurrently, however files within a partition will be read sequentially, one after the next.

§constraints: Constraints

Table constraints

§statistics: Statistics

Estimated overall statistics of the files, taking filters into account. Defaults to Statistics::new_unknown.

§projection: Option<Vec<usize>>

Columns on which to project the data. Indexes that are greater than or equal to the number of columns of file_schema refer to table_partition_cols.

§limit: Option<usize>

The maximum number of records to read from this plan. If None, all records after filtering are returned.

§table_partition_cols: Vec<Field>

The partitioning columns

§output_ordering: Vec<LexOrdering>

All equivalent lexicographical orderings that describe the schema.

§file_compression_type: FileCompressionType

File compression type

§new_lines_in_values: bool

Whether newlines in values are supported for CSVOptions.

§file_source: Arc<dyn FileSource>

File source such as ParquetSource, CsvSource, JsonSource, etc.

Implementations§

Source§

impl FileScanConfig

Source

pub fn new( object_store_url: ObjectStoreUrl, file_schema: SchemaRef, file_source: Arc<dyn FileSource>, ) -> Self

Create a new FileScanConfig with default settings for scanning files.

See example on FileScanConfig

No file groups are added by default. See Self::with_file, Self::with_file_group and Self::with_file_groups.

§Parameters:
Source

pub fn with_source(self, file_source: Arc<dyn FileSource>) -> Self

Set the file source

Source

pub fn with_constraints(self, constraints: Constraints) -> Self

Set the table constraints of the files

Source

pub fn with_statistics(self, statistics: Statistics) -> Self

Set the statistics of the files

Source

pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self

Set the projection of the files

Source

pub fn with_limit(self, limit: Option<usize>) -> Self

Set the limit of the files

Source

pub fn with_file(self, file: PartitionedFile) -> Self

Add a file as a single group

See Self::file_groups for more information.

Source

pub fn with_file_groups(self, file_groups: Vec<Vec<PartitionedFile>>) -> Self

Add the file groups

See Self::file_groups for more information.

Source

pub fn with_file_group(self, file_group: Vec<PartitionedFile>) -> Self

Add a new file group

See Self::file_groups for more information.

Source

pub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self

Set the partitioning columns of the files

Source

pub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self

Set the output ordering of the files

Source

pub fn with_file_compression_type( self, file_compression_type: FileCompressionType, ) -> Self

Set the file compression type

Source

pub fn with_newlines_in_values(self, new_lines_in_values: bool) -> Self

Set the new_lines_in_values property

Source

pub fn newlines_in_values(&self) -> bool

Specifies whether newlines in (quoted) values are supported.

Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to true ensures that newlines in values are parsed successfully, which may reduce performance.

The default behaviour depends on the datafusion.catalog.newlines_in_values setting.

Source

pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)

Project the schema, constraints, and the statistics on the given column indices

Source

pub fn projected_file_column_names(&self) -> Option<Vec<String>>

Source

pub fn projected_file_schema(&self) -> SchemaRef

Projects only file schema, ignoring partition columns

Source

pub fn file_column_projection_indices(&self) -> Option<Vec<usize>>

Source

pub fn split_groups_by_statistics( table_schema: &SchemaRef, file_groups: &[Vec<PartitionedFile>], sort_order: &LexOrdering, ) -> Result<Vec<Vec<PartitionedFile>>>

Attempts to do a bin-packing on files into file groups, such that any two files in a file group are ordered and non-overlapping with respect to their statistics. It will produce the smallest number of file groups possible.

Source

pub fn build(self) -> Arc<DataSourceExec>

Returns a new DataSourceExec to scan the files specified by this config

Source

pub fn file_source(&self) -> &Arc<dyn FileSource>

Returns the file_source

Trait Implementations§

Source§

impl Clone for FileScanConfig

Source§

fn clone(&self) -> FileScanConfig

Returns a copy of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl DataSource for FileScanConfig

Source§

fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>

If supported by the underlying FileSource, redistribute files across partitions according to their size.

Source§

fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>

Source§

fn as_any(&self) -> &dyn Any

Source§

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

Source§

fn output_partitioning(&self) -> Partitioning

Source§

fn eq_properties(&self) -> EquivalenceProperties

Source§

fn statistics(&self) -> Result<Statistics>

Source§

fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>

Return a copy of this DataSource with a new fetch limit
Source§

fn fetch(&self) -> Option<usize>

Source§

fn metrics(&self) -> ExecutionPlanMetricsSet

Source§

fn try_swapping_with_projection( &self, projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>>

Source§

impl Debug for FileScanConfig

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult

Formats the value using the given formatter. Read more
Source§

impl DisplayAs for FileScanConfig

Source§

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

Format according to DisplayFormatType, used when verbose representation looks different from the default one Read more

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V

Source§

impl<T> ErasedDestructor for T
where T: 'static,

Source§

impl<T> MaybeSendSync for T