Struct FileScanConfig

Source

pub struct FileScanConfig {
    pub object_store_url: ObjectStoreUrl,
    pub file_schema: SchemaRef,
    pub file_groups: Vec<FileGroup>,
    pub constraints: Constraints,
    pub projection: Option<Vec<usize>>,
    pub limit: Option<usize>,
    pub table_partition_cols: Vec<FieldRef>,
    pub output_ordering: Vec<LexOrdering>,
    pub file_compression_type: FileCompressionType,
    pub new_lines_in_values: bool,
    pub file_source: Arc<dyn FileSource>,
    pub batch_size: Option<usize>,
    pub expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
}

Expand description

The base configurations for a DataSourceExec, a physical plan for any given file format.

Use DataSourceExec::from_data_source to create a DataSourceExec from a `FileScanConfig`.

§Example

#[derive(Clone)]
// create FileScan config for reading parquet files from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let file_source = Arc::new(ParquetSource::new());
let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source)
  .with_limit(Some(1000))            // read only the first 1000 records
  .with_projection(Some(vec![2, 3])) // project columns 2 and 3
   // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
  .with_file(PartitionedFile::new("file1.parquet", 1234))
  // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes
  // in a single file group
  .with_file_group(FileGroup::new(vec![
   PartitionedFile::new("file2.parquet", 56),
   PartitionedFile::new("file3.parquet", 78),
  ])).build();
// create an execution plan from the config
let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);

Fields§

§object_store_url: ObjectStoreUrl

Object store URL, used to get an ObjectStore instance from RuntimeEnv::object_store

This ObjectStoreUrl should be the prefix of the absolute URL for files, such as file:// or s3://my_bucket. It should not include the path to the file itself. The relevant URL prefix must be registered via RuntimeEnv::register_object_store.

§file_schema: SchemaRef

Schema before projection is applied. It contains all the columns that may appear in the files. It does not include table partition columns that may be added. Note that this is not the schema of the physical files. This is the schema that the physical file schema will be mapped onto, and the schema that the DataSourceExec will return.

§file_groups: Vec<FileGroup>

List of files to be processed, grouped into partitions

Each file must have a schema of file_schema or a subset. If a particular file has a subset, the missing columns are padded with NULLs.

DataFusion may attempt to read each partition of files concurrently; however, files within a partition will be read sequentially, one after the next.

§constraints: Constraints

Table constraints

§projection: Option<Vec<usize>>

Columns on which to project the data. Indexes that are higher than the number of columns of file_schema refer to table_partition_cols.

§limit: Option<usize>

The maximum number of records to read from this plan. If None, all records after filtering are returned.

§table_partition_cols: Vec<FieldRef>

The partitioning columns

§output_ordering: Vec<LexOrdering>

All equivalent lexicographical orderings that describe the schema.

§file_compression_type: FileCompressionType

File compression type

§new_lines_in_values: bool

Whether new lines in values are supported for CSVOptions

§file_source: Arc<dyn FileSource>

File source such as ParquetSource, CsvSource, JsonSource, etc.

§batch_size: Option<usize>

Batch size while creating new batches. Defaults to datafusion_common::config::ExecutionOptions batch_size.

§expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>

Expression adapter used to adapt filters and projections that are pushed down into the scan from the logical schema to the physical schema of the file.

Struct FileScanConfig Copy item path

§Example

Fields§

Implementations§

impl FileScanConfig

pub fn projected_stats(&self) -> Statistics

pub fn projected_schema(&self) -> Arc<Schema>

pub fn projected_constraints(&self) -> Constraints

pub fn newlines_in_values(&self) -> bool

pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)

pub fn projected_file_column_names(&self) -> Option<Vec<String>>

pub fn projected_file_schema(&self) -> SchemaRef

pub fn file_column_projection_indices(&self) -> Option<Vec<usize>>

pub fn split_groups_by_statistics_with_target_partitions( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, target_partitions: usize, ) -> Result<Vec<FileGroup>>

§Parameters

§Returns

pub fn split_groups_by_statistics( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, ) -> Result<Vec<FileGroup>>

pub fn file_source(&self) -> &Arc<dyn FileSource>

Trait Implementations§

impl Clone for FileScanConfig

fn clone(&self) -> FileScanConfig

fn clone_from(&mut self, source: &Self)

impl DataSource for FileScanConfig

fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>

fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>

fn as_any(&self) -> &dyn Any

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

fn output_partitioning(&self) -> Partitioning

fn eq_properties(&self) -> EquivalenceProperties

fn scheduling_type(&self) -> SchedulingType

fn statistics(&self) -> Result<Statistics>

fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>

fn fetch(&self) -> Option<usize>

fn metrics(&self) -> ExecutionPlanMetricsSet

fn try_swapping_with_projection( &self, projection: &[ProjectionExpr], ) -> Result<Option<Arc<dyn DataSource>>>

fn try_pushdown_filters( &self, filters: Vec<Arc<dyn PhysicalExpr>>, config: &ConfigOptions, ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>

impl Debug for FileScanConfig

fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult

impl DisplayAs for FileScanConfig

fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult

impl From<FileScanConfig> for FileScanConfigBuilder

fn from(config: FileScanConfig) -> Self

Auto Trait Implementations§

impl Freeze for FileScanConfig

impl !RefUnwindSafe for FileScanConfig

impl Send for FileScanConfig

impl Sync for FileScanConfig

impl Unpin for FileScanConfig

impl !UnwindSafe for FileScanConfig

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> ErasedDestructor for T where T: 'static,

Struct FileScanConfig

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

impl<T> ErasedDestructor for T
where T: 'static,