Struct datafusion::datasource::physical_plan::FileScanConfig

source ·
pub struct FileScanConfig {
    pub object_store_url: ObjectStoreUrl,
    pub file_schema: SchemaRef,
    pub file_groups: Vec<Vec<PartitionedFile>>,
    pub statistics: Statistics,
    pub projection: Option<Vec<usize>>,
    pub limit: Option<usize>,
    pub table_partition_cols: Vec<Field>,
    pub output_ordering: Vec<LexOrdering>,
}
Expand description

The base configurations to provide when creating a physical plan for any given file format.

§Example

use datafusion::datasource::listing::PartitionedFile;
// create FileScan config for reading data from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let config = FileScanConfig::new(object_store_url, file_schema)
  .with_limit(Some(1000))            // read only the first 1000 records
  .with_projection(Some(vec![2, 3])) // project columns 2 and 3
   // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
  .with_file(PartitionedFile::new("file1.parquet", 1234))
  // Read /tmp/file2.parquet (56 bytes) and /tmp/file3.parquet (78 bytes)
  // in a single file group
  .with_file_group(vec![
   PartitionedFile::new("file2.parquet", 56),
   PartitionedFile::new("file3.parquet", 78),
  ]);

Fields§

§object_store_url: ObjectStoreUrl

Object store URL, used to get an ObjectStore instance from RuntimeEnv::object_store.

This ObjectStoreUrl should be the prefix of the absolute URL for files, such as file:// or s3://my_bucket. It should not include the path to the file itself. The relevant URL prefix must be registered via RuntimeEnv::register_object_store.

§file_schema: SchemaRef

Schema before projection is applied. It contains all the columns that may appear in the files. It does not include table partition columns that may be added.

§file_groups: Vec<Vec<PartitionedFile>>

List of files to be processed, grouped into partitions

Each file must have a schema of file_schema or a subset. If a particular file has a subset, the missing columns are padded with NULLs.

DataFusion may attempt to read each partition of files concurrently; however, files within a partition will be read sequentially, one after the next.

§statistics: Statistics

Estimated overall statistics of the files, taking filters into account. Defaults to Statistics::new_unknown.

§projection: Option<Vec<usize>>

Columns on which to project the data. Indexes are zero-based: indexes greater than or equal to the number of columns of file_schema refer to table_partition_cols.

§limit: Option<usize>

The maximum number of records to read from this plan. If None, all records after filtering are returned.

§table_partition_cols: Vec<Field>

The partitioning columns

§output_ordering: Vec<LexOrdering>

All equivalent lexicographical orderings that describe the schema.

Implementations§

source§

impl FileScanConfig

source

pub fn new(object_store_url: ObjectStoreUrl, file_schema: SchemaRef) -> Self

Create a new FileScanConfig with default settings for scanning files.

See example on FileScanConfig

No file groups are added by default. See Self::with_file, Self::with_file_group and Self::with_file_groups.

§Parameters:

    object_store_url: See Self::object_store_url
    file_schema: See Self::file_schema
source

pub fn with_statistics(self, statistics: Statistics) -> Self

Set the statistics of the files

source

pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self

Set the projection of the files

source

pub fn with_limit(self, limit: Option<usize>) -> Self

Set the limit of the files

source

pub fn with_file(self, file: PartitionedFile) -> Self

Add a file as a single group

See Self::file_groups for more information.

source

pub fn with_file_groups(self, file_groups: Vec<Vec<PartitionedFile>>) -> Self

Add the file groups

See Self::file_groups for more information.

source

pub fn with_file_group(self, file_group: Vec<PartitionedFile>) -> Self

Add a new file group

See Self::file_groups for more information.

source

pub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self

Set the partitioning columns of the files

source

pub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self

Set the output ordering of the files

source

pub fn project(&self) -> (SchemaRef, Statistics, Vec<LexOrdering>)

Project the schema and the statistics on the given column indices

source

pub fn repartition_file_groups( file_groups: Vec<Vec<PartitionedFile>>, target_partitions: usize, repartition_file_min_size: usize, ) -> Option<Vec<Vec<PartitionedFile>>>

👎Deprecated since 33.0.0: Use SessionContext::new_with_config
source

pub fn split_groups_by_statistics( table_schema: &SchemaRef, file_groups: &[Vec<PartitionedFile>], sort_order: &[PhysicalSortExpr], ) -> Result<Vec<Vec<PartitionedFile>>>

Attempts to bin-pack files into file groups, such that any two files in a file group are ordered and non-overlapping with respect to their statistics. It will produce the smallest number of file groups possible.

Trait Implementations§

source§

impl Clone for FileScanConfig

source§

fn clone(&self) -> FileScanConfig

Returns a copy of the value. Read more
1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
source§

impl Debug for FileScanConfig

source§

fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult

Formats the value using the given formatter. Read more
source§

impl DisplayAs for FileScanConfig

source§

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

Format according to DisplayFormatType, used when verbose representation looks different from the default one. Read more

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T> IntoEither for T

source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
source§

impl<T> Same for T

§

type Output = T

Should always be Self
source§

impl<T> ToOwned for T
where T: Clone,

§

type Owned = T

The resulting type after obtaining ownership.
source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

source§

fn vzip(self) -> V