pub struct FileScanConfig {
pub object_store_url: ObjectStoreUrl,
pub file_schema: SchemaRef,
pub file_groups: Vec<FileGroup>,
pub constraints: Constraints,
pub projection: Option<Vec<usize>>,
pub limit: Option<usize>,
pub table_partition_cols: Vec<Field>,
pub output_ordering: Vec<LexOrdering>,
pub file_compression_type: FileCompressionType,
pub new_lines_in_values: bool,
pub file_source: Arc<dyn FileSource>,
pub batch_size: Option<usize>,
}
Expand description
The base configurations for a DataSourceExec
, the physical plan for
any given file format.
Use Self::build
to create a DataSourceExec
from a `FileScanConfig`.
§Example
#[derive(Clone)]
// create FileScan config for reading parquet files from file://
let object_store_url = ObjectStoreUrl::local_filesystem();
let file_source = Arc::new(ParquetSource::new());
let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source)
.with_limit(Some(1000)) // read only the first 1000 records
.with_projection(Some(vec![2, 3])) // project columns 2 and 3
// Read /tmp/file1.parquet with known size of 1234 bytes in a single group
.with_file(PartitionedFile::new("file1.parquet", 1234))
// Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes
// in a single file group
.with_file_group(FileGroup::new(vec![
PartitionedFile::new("file2.parquet", 56),
PartitionedFile::new("file3.parquet", 78),
])).build();
// create an execution plan from the config
let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
Fields§
§object_store_url: ObjectStoreUrl
Object store URL, used to get an ObjectStore
instance from
RuntimeEnv::object_store
This ObjectStoreUrl
should be the prefix of the absolute url for files,
such as file://
or s3://my_bucket
. It should not include the path to the
file itself. The relevant URL prefix must be registered via
RuntimeEnv::register_object_store
file_schema: SchemaRef
Schema before projection
is applied. It contains all the columns that may
appear in the files. It does not include table partition columns
that may be added.
Note that this is not the schema of the physical files.
This is the schema that the physical file schema will be
mapped onto, and the schema that the DataSourceExec
will return.
file_groups: Vec<FileGroup>
List of files to be processed, grouped into partitions
Each file must have a schema of file_schema
or a subset. If
a particular file has a subset, the missing columns are
padded with NULLs.
DataFusion may attempt to read each partition of files concurrently, however files within a partition will be read sequentially, one after the next.
constraints: Constraints
Table constraints
projection: Option<Vec<usize>>
Columns on which to project the data. Indexes that are higher than the
number of columns of file_schema
refer to table_partition_cols
.
limit: Option<usize>
The maximum number of records to read from this plan. If None
,
all records after filtering are returned.
table_partition_cols: Vec<Field>
The partitioning columns
output_ordering: Vec<LexOrdering>
All equivalent lexicographical orderings that describe the schema.
file_compression_type: FileCompressionType
File compression type
new_lines_in_values: bool
Are new lines in values supported for CSVOptions
file_source: Arc<dyn FileSource>
File source such as ParquetSource
, CsvSource
, JsonSource
, etc.
batch_size: Option<usize>
Batch size while creating new batches
Defaults to datafusion_common::config::ExecutionOptions
batch_size.
Implementations§
Source§impl FileScanConfig
impl FileScanConfig
Sourcepub fn new(
object_store_url: ObjectStoreUrl,
file_schema: SchemaRef,
file_source: Arc<dyn FileSource>,
) -> Self
pub fn new( object_store_url: ObjectStoreUrl, file_schema: SchemaRef, file_source: Arc<dyn FileSource>, ) -> Self
Create a new FileScanConfig
with default settings for scanning files.
See example on FileScanConfig
No file groups are added by default. See Self::with_file
, Self::with_file_group
and
Self::with_file_groups
.
§Parameters:
object_store_url
: See Self::object_store_url
file_schema
: See Self::file_schema
Sourcepub fn with_source(self, file_source: Arc<dyn FileSource>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_source(self, file_source: Arc<dyn FileSource>) -> Self
Set the file source
Sourcepub fn with_constraints(self, constraints: Constraints) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_constraints(self, constraints: Constraints) -> Self
Set the table constraints of the files
Sourcepub fn with_statistics(self, statistics: Statistics) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_statistics(self, statistics: Statistics) -> Self
Set the statistics of the files
pub fn projected_stats(&self) -> Statistics
pub fn projected_schema(&self) -> Arc<Schema>
pub fn projected_constraints(&self) -> Constraints
Sourcepub fn with_projection(self, projection: Option<Vec<usize>>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self
Set the projection of the files
Sourcepub fn with_limit(self, limit: Option<usize>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_limit(self, limit: Option<usize>) -> Self
Set the limit of the files
Sourcepub fn with_file(self, file: PartitionedFile) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file(self, file: PartitionedFile) -> Self
Add a file as a single group
See Self::file_groups for more information.
Sourcepub fn with_file_groups(self, file_groups: Vec<FileGroup>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file_groups(self, file_groups: Vec<FileGroup>) -> Self
Add the file groups
See Self::file_groups for more information.
Sourcepub fn with_file_group(self, file_group: FileGroup) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file_group(self, file_group: FileGroup) -> Self
Add a new file group
See Self::file_groups for more information
Sourcepub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_table_partition_cols(self, table_partition_cols: Vec<Field>) -> Self
Set the partitioning columns of the files
Sourcepub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_output_ordering(self, output_ordering: Vec<LexOrdering>) -> Self
Set the output ordering of the files
Sourcepub fn with_file_compression_type(
self,
file_compression_type: FileCompressionType,
) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_file_compression_type( self, file_compression_type: FileCompressionType, ) -> Self
Set the file compression type
Sourcepub fn with_newlines_in_values(self, new_lines_in_values: bool) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_newlines_in_values(self, new_lines_in_values: bool) -> Self
Set the new_lines_in_values property
Sourcepub fn with_batch_size(self, batch_size: Option<usize>) -> Self
👎Deprecated since 47.0.0: use FileScanConfigBuilder instead
pub fn with_batch_size(self, batch_size: Option<usize>) -> Self
Set the batch_size property
Sourcepub fn newlines_in_values(&self) -> bool
pub fn newlines_in_values(&self) -> bool
Specifies whether newlines in (quoted) values are supported.
Parsing newlines in quoted values may be affected by execution behaviour such as
parallel file scanning. Setting this to true
ensures that newlines in values are
parsed successfully, which may reduce performance.
The default behaviour depends on the datafusion.catalog.newlines_in_values
setting.
Sourcepub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)
pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>)
Project the schema, constraints, and the statistics on the given column indices
pub fn projected_file_column_names(&self) -> Option<Vec<String>>
Sourcepub fn projected_file_schema(&self) -> SchemaRef
pub fn projected_file_schema(&self) -> SchemaRef
Projects only file schema, ignoring partition columns
pub fn file_column_projection_indices(&self) -> Option<Vec<usize>>
Sourcepub fn split_groups_by_statistics_with_target_partitions(
table_schema: &SchemaRef,
file_groups: &[FileGroup],
sort_order: &LexOrdering,
target_partitions: usize,
) -> Result<Vec<FileGroup>>
pub fn split_groups_by_statistics_with_target_partitions( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, target_partitions: usize, ) -> Result<Vec<FileGroup>>
Splits file groups into new groups based on statistics to enable efficient parallel processing.
The method distributes files across a target number of partitions while ensuring files within each partition maintain sort order based on their min/max statistics.
The algorithm works by:
- Takes files sorted by minimum values
- For each file:
- Finds eligible groups (empty or where file’s min > group’s last max)
- Selects the smallest eligible group
- Creates a new group if needed
§Parameters
table_schema
: Schema containing information about the columns
file_groups
: The original file groups to split
sort_order
: The lexicographical ordering to maintain within each group
target_partitions
: The desired number of output partitions
§Returns
A new set of file groups, where files within each group are non-overlapping with respect to their min/max statistics and maintain the specified sort order.
Sourcepub fn split_groups_by_statistics(
table_schema: &SchemaRef,
file_groups: &[FileGroup],
sort_order: &LexOrdering,
) -> Result<Vec<FileGroup>>
pub fn split_groups_by_statistics( table_schema: &SchemaRef, file_groups: &[FileGroup], sort_order: &LexOrdering, ) -> Result<Vec<FileGroup>>
Attempts to do a bin-packing on files into file groups, such that any two files in a file group are ordered and non-overlapping with respect to their statistics. It will produce the smallest number of file groups possible.
Sourcepub fn build(self) -> Arc<DataSourceExec>
👎Deprecated since 47.0.0: use DataSourceExec::new instead
pub fn build(self) -> Arc<DataSourceExec>
Returns a new DataSourceExec
to scan the files specified by this config
Sourcepub fn file_source(&self) -> &Arc<dyn FileSource>
pub fn file_source(&self) -> &Arc<dyn FileSource>
Returns the file_source
Trait Implementations§
Source§impl Clone for FileScanConfig
impl Clone for FileScanConfig
Source§fn clone(&self) -> FileScanConfig
fn clone(&self) -> FileScanConfig
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
source
. Read moreSource§impl DataSource for FileScanConfig
impl DataSource for FileScanConfig
Source§fn repartitioned(
&self,
target_partitions: usize,
repartition_file_min_size: usize,
output_ordering: Option<LexOrdering>,
) -> Result<Option<Arc<dyn DataSource>>>
fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option<LexOrdering>, ) -> Result<Option<Arc<dyn DataSource>>>
If supported by the underlying FileSource
, redistribute files across partitions according to their size.
fn open( &self, partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>
fn as_any(&self) -> &dyn Any
Source§fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter<'_>) -> FmtResult
fn output_partitioning(&self) -> Partitioning
fn eq_properties(&self) -> EquivalenceProperties
fn statistics(&self) -> Result<Statistics>
Source§fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>
fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>>
fn fetch(&self) -> Option<usize>
fn metrics(&self) -> ExecutionPlanMetricsSet
fn try_swapping_with_projection( &self, projection: &ProjectionExec, ) -> Result<Option<Arc<dyn ExecutionPlan>>>
Source§fn try_pushdown_filters(
&self,
filters: Vec<Arc<dyn PhysicalExpr>>,
config: &ConfigOptions,
) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>
fn try_pushdown_filters( &self, filters: Vec<Arc<dyn PhysicalExpr>>, config: &ConfigOptions, ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>>
ExecutionPlan::handle_child_pushdown_result
for more details.Source§impl Debug for FileScanConfig
impl Debug for FileScanConfig
Source§impl DisplayAs for FileScanConfig
impl DisplayAs for FileScanConfig
Source§impl From<FileScanConfig> for FileScanConfigBuilder
impl From<FileScanConfig> for FileScanConfigBuilder
Source§fn from(config: FileScanConfig) -> Self
fn from(config: FileScanConfig) -> Self
Auto Trait Implementations§
impl Freeze for FileScanConfig
impl !RefUnwindSafe for FileScanConfig
impl Send for FileScanConfig
impl Sync for FileScanConfig
impl Unpin for FileScanConfig
impl !UnwindSafe for FileScanConfig
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
self
into a Left
variant of Either<Self, Self>
if into_left
is true
.
Converts self
into a Right
variant of Either<Self, Self>
otherwise. Read moreSource§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
self
into a Left
variant of Either<Self, Self>
if into_left(&self)
returns true
.
Converts self
into a Right
variant of Either<Self, Self>
otherwise. Read more