pub struct FileScanConfig {
pub object_store_url: ObjectStoreUrl,
pub file_schema: SchemaRef,
pub file_groups: Vec<FileGroup>,
pub constraints: Constraints,
pub projection: Option<Vec<usize>>,
pub limit: Option<usize>,
pub table_partition_cols: Vec<Field>,
pub output_ordering: Vec<LexOrdering>,
pub file_compression_type: FileCompressionType,
pub new_lines_in_values: bool,
pub file_source: Arc<dyn FileSource>,
pub batch_size: Option<usize>,
}Expand description
The base configurations for a DataSourceExec, the physical plan for
any given file format.
Use Self::build to create a DataSourceExec from a `FileScanConfig`.
§Example
// create FileScan config for reading parquet files from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let file_source = Arc::new(ParquetSource::new());
let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source)
.with_limit(Some(1000)) // read only the first 1000 records
.with_projection(Some(vec![2, 3])) // project columns 2 and 3
// Read /tmp/file1.parquet with known size of 1234 bytes in a single group
.with_file(PartitionedFile::new("file1.parquet", 1234))
// Read /tmp/file2.parquet (56 bytes) and /tmp/file3.parquet (78 bytes)
// in a single file group
.with_file_group(FileGroup::new(vec![
PartitionedFile::new("file2.parquet", 56),
PartitionedFile::new("file3.parquet", 78),
])).build();
// create an execution plan from the config
let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
Fields§
§object_store_url: ObjectStoreUrlObject store URL, used to get an ObjectStore instance from
RuntimeEnv::object_store
This ObjectStoreUrl should be the prefix of the absolute url for files,
such as file:// or s3://my_bucket. It should not include the path to the
file itself. The relevant URL prefix must be registered via
RuntimeEnv::register_object_store
file_schema: SchemaRef
Schema before projection is applied. It contains all the columns that may
appear in the files. It does not include table partition columns
that may be added.
Note that this is not the schema of the physical files.
This is the schema that the physical file schema will be
mapped onto, and the schema that the DataSourceExec will return.
file_groups: Vec<FileGroup>List of files to be processed, grouped into partitions
Each file must have a schema of file_schema or a subset. If
a particular file has a subset, the missing columns are
padded with NULLs.
DataFusion may attempt to read each partition of files concurrently, however files within a partition will be read sequentially, one after the next.
constraints: ConstraintsTable constraints
projection: Option<Vec<usize>>Columns on which to project the data. Indexes that are higher than the
number of columns of file_schema refer to table_partition_cols.
limit: Option<usize>The maximum number of records to read from this plan. If None,
all records after filtering are returned.
table_partition_cols: Vec<Field>The partitioning columns
output_ordering: Vec<LexOrdering>All equivalent lexicographical orderings that describe the schema.
file_compression_type: FileCompressionTypeFile compression type
new_lines_in_values: bool
Whether newlines in values are supported for CSVOptions
file_source: Arc<dyn FileSource>File source such as ParquetSource, CsvSource, JsonSource, etc.
batch_size: Option<usize>Batch size while creating new batches
Defaults to datafusion_common::config::ExecutionOptions batch_size.
Implementations§
Source§impl FileScanConfig
impl FileScanConfig
Sourcepub fn new(
object_store_url: ObjectStoreUrl,
file_schema: SchemaRef,
file_source: Arc<dyn FileSource>,
) -> Self
pub fn new( object_store_url: ObjectStoreUrl, file_schema: SchemaRef, file_source: Arc<dyn FileSource>, ) -> Self
Create a new FileScanConfig with default settings for scanning files.
See example on FileScanConfig
No file groups are added by default. See Self::with_file, Self::with_file_group and
Self::with_file_groups.
§Parameters:
- object_store_url: See Self::object_store_url
- file_schema: See Self::file_schema
Sourcepub fn with_source(self, file_source: Arc<dyn FileSource>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_source(self, file_source: Arc<dyn FileSource>) -> Self
Set the file source
Sourcepub fn with_constraints(self, constraints: Constraints) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_constraints(self, constraints: Constraints) -> Self
Set the table constraints of the files
Sourcepub fn with_statistics(self, statistics: Statistics) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_statistics(self, statistics: Statistics) -> Self
Set the statistics of the files
pub fn projected_stats(&self) -> Statistics
pub fn projected_schema(&self) -> Arc<Schema>
pub fn projected_constraints(&self) -> Constraints
Sourcepub fn with_projection(self, projection: Option<Vec<usize>>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self
Set the projection of the files
Sourcepub fn with_limit(self, limit: Option<usize>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_limit(self, limit: Option<usize>) -> Self
Set the limit of the files
Sourcepub fn with_file(self, file: PartitionedFile) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file(self, file: PartitionedFile) -> Self
Add a file as a single group
See Self::file_groups for more information.
Sourcepub fn with_file_groups(self, file_groups: Vec<FileGroup>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file_groups(self, file_groups: Vec<FileGroup>) -> Self
Add the file groups
See Self::file_groups for more information.
Sourcepub fn with_file_group(self, file_group: FileGroup) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file_group(self, file_group: FileGroup) -> Self
Add a new file group
See Self::file_groups for more information
Sourcepub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self
Set the partitioning columns of the files
Sourcepub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self
Set the output ordering of the files
Sourcepub fn with_file_compression_type(
self,
file_compression_type: FileCompressionType,
) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file_compression_type( self, file_compression_type: FileCompressionType, ) -> Self
Set the file compression type
Sourcepub fn with_newlines_in_values(self, new_lines_in_values: bool) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_newlines_in_values(self, new_lines_in_values: bool) -> Self
Set the new_lines_in_values property
Sourcepub fn with_batch_size(self, batch_size: Option<usize>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_batch_size(self, batch_size: Option<usize>) -> Self
Set the batch_size property
Sourcepub fn newlines_in_values(&self) -> bool
pub fn newlines_in_values(&self) -> bool
Specifies whether newlines in (quoted) values are supported.
Parsing newlines in quoted values may be affected by execution behaviour such as
parallel file scanning. Setting this to true ensures that newlines in values are
parsed successfully, which may reduce performance.
The default behaviour depends on the datafusion.catalog.newlines_in_values setting.
Sourcepub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)
pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)
Project the schema, constraints, and the statistics on the given column indices
pub fn projected_file_column_names(&self) -> Option<Vec<String>>
Sourcepub fn projected_file_schema(&self) -> SchemaRef
pub fn projected_file_schema(&self) -> SchemaRef
Projects only file schema, ignoring partition columns
pub fn file_column_projection_indices(&self) -> Option<Vec<usize>>
Sourcepub fn split_groups_by_statistics_with_target_partitions(
table_schema: &SchemaRef,
file_groups: &[FileGroup],
sort_order: &LexOrdering,
target_partitions: usize,
) -> Result<Vec<FileGroup>>
pub fn split_groups_by_statistics_with_target_partitions( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, target_partitions: usize, ) -> Result<Vec<FileGroup>>
Splits file groups into new groups based on statistics to enable efficient parallel processing.
The method distributes files across a target number of partitions while ensuring files within each partition maintain sort order based on their min/max statistics.
The algorithm works as follows:
- Take the files sorted by their minimum values
- For each file:
  - Find the eligible groups (empty, or where the file’s min > the group’s last max)
  - Select the smallest eligible group
  - Create a new group if needed
§Parameters
- table_schema: Schema containing information about the columns
- file_groups: The original file groups to split
- sort_order: The lexicographical ordering to maintain within each group
- target_partitions: The desired number of output partitions
§Returns
A new set of file groups, where files within each group are non-overlapping with respect to their min/max statistics and maintain the specified sort order.
Sourcepub fn split_groups_by_statistics(
table_schema: &SchemaRef,
file_groups: &[FileGroup],
sort_order: &LexOrdering,
) -> Result<Vec<FileGroup>>
pub fn split_groups_by_statistics( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, ) -> Result<Vec<FileGroup>>
Attempts to do a bin-packing on files into file groups, such that any two files in a file group are ordered and non-overlapping with respect to their statistics. It will produce the smallest number of file groups possible.
Sourcepub fn build(self) -> Arc<DataSourceExec>
👎Deprecated since 47.0.0: use DataSourceExec::new instead
pub fn build(self) -> Arc<DataSourceExec>
Returns a new DataSourceExec to scan the files specified by this config
Sourcepub fn file_source(&self) -> &Arc<dyn FileSource>
pub fn file_source(&self) -> &Arc<dyn FileSource>
Returns the file_source
Trait Implementations§
Source§impl Clone for FileScanConfig
impl Clone for FileScanConfig
Source§fn clone(&self) -> FileScanConfig
fn clone(&self) -> FileScanConfig
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl DataSource for FileScanConfig
impl DataSource for FileScanConfig
Source§fn repartitioned(
&self,
target_partitions: usize,
repartition_file_min_size: usize,
output_ordering: Option<LexOrdering>,
) -> Result<Option<Arc<dyn DataSource>>>
fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>
If supported by the underlying FileSource, redistribute files across partitions according to their size.
fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>
fn as_any(&self) -> &dyn Any
Source§fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult
fn output_partitioning(&self) -> Partitioning
fn eq_properties(&self) -> EquivalenceProperties
fn statistics(&self) -> Result<Statistics>
Source§fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>
fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>
fn fetch(&self) -> Option<usize>
fn metrics(&self) -> ExecutionPlanMetricsSet
fn try_swapping_with_projection( &self, projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>>
Source§impl Debug for FileScanConfig
impl Debug for FileScanConfig
Source§impl DisplayAs for FileScanConfig
impl DisplayAs for FileScanConfig
Source§impl From<FileScanConfig> for FileScanConfigBuilder
impl From<FileScanConfig> for FileScanConfigBuilder
Source§fn from(config: FileScanConfig) -> Self
fn from(config: FileScanConfig) -> Self
Auto Trait Implementations§
impl Freeze for FileScanConfig
impl !RefUnwindSafe for FileScanConfig
impl Send for FileScanConfig
impl Sync for FileScanConfig
impl Unpin for FileScanConfig
impl !UnwindSafe for FileScanConfig
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more