/// Options for creating a `ListingTable`.
pub struct ListingOptions {
    /// A suffix on which files should be filtered (leave empty to keep all
    /// files on the path).
    pub file_extension: String,
    /// The file format.
    pub format: Arc<dyn FileFormat>,
    /// The expected partition column names in the folder structure.
    /// See `Self::with_table_partition_cols` for details.
    pub table_partition_cols: Vec<(String, DataType)>,
    /// Set true to try to guess statistics from the files. This can add a lot
    /// of overhead as it will usually require files to be opened and at least
    /// partially parsed.
    pub collect_stat: bool,
    /// Group files so that the number of partitions does not exceed this limit.
    pub target_partitions: usize,
    /// Optional pre-known sort order. Must be `SortExpr`s.
    ///
    /// DataFusion may take advantage of this ordering to omit sorts or use
    /// more efficient algorithms.
    pub file_sort_order: Option<Vec<Expr>>,
    /// Infinite source means that the input is not guaranteed to end.
    /// In order to support infinite inputs, DataFusion may adjust query
    /// plans (e.g. joins) to run in full pipelining mode.
    pub infinite_source: bool,
}
Expand description

Options for creating a ListingTable

Fields§

§file_extension: String

A suffix on which files should be filtered (leave empty to keep all files on the path)

§format: Arc<dyn FileFormat>

The file format

§table_partition_cols: Vec<(String, DataType)>

The expected partition column names in the folder structure. See Self::with_table_partition_cols for details

§collect_stat: bool

Set true to try to guess statistics from the files. This can add a lot of overhead as it will usually require files to be opened and at least partially parsed.

§target_partitions: usize

Group files so that the number of partitions does not exceed this limit

§file_sort_order: Option<Vec<Expr>>

Optional pre-known sort order. Must be SortExprs.

DataFusion may take advantage of this ordering to omit sorts or use more efficient algorithms. Currently sortedness must be provided if it is known by some external mechanism, but may in the future be automatically determined, for example using parquet metadata.

See https://github.com/apache/arrow-datafusion/issues/4177

§infinite_source: bool

Infinite source means that the input is not guaranteed to end. Currently, CSV, JSON, and AVRO formats are supported. In order to support infinite inputs, DataFusion may adjust query plans (e.g. joins) to run the given query in full pipelining mode.

Implementations§

source§

impl ListingOptions

source

pub fn new(format: Arc<dyn FileFormat>) -> Self

Creates an options instance with the given format. Default values:

  • no file extension filter
  • no input partition to discover
  • one target partition
  • stat collection
source

pub fn with_infinite_source(self, infinite_source: bool) -> Self

Set unbounded assumption on ListingOptions and returns self.

use std::sync::Arc;
use datafusion::datasource::{listing::ListingOptions, file_format::csv::CsvFormat};
use datafusion::prelude::SessionContext;
let ctx = SessionContext::new();
let listing_options = ListingOptions::new(Arc::new(
    CsvFormat::default()
  )).with_infinite_source(true);

assert_eq!(listing_options.infinite_source, true);
source

pub fn with_file_extension(self, file_extension: impl Into<String>) -> Self

Set file extension on ListingOptions and returns self.


let listing_options = ListingOptions::new(Arc::new(
    ParquetFormat::default()
  ))
  .with_file_extension(".parquet");

assert_eq!(listing_options.file_extension, ".parquet");
source

pub fn with_table_partition_cols( self, table_partition_cols: Vec<(String, DataType)> ) -> Self

Set table partition columns on ListingOptions and returns self.

“partition columns,” used to support Hive Partitioning, are columns added to the data that is read, based on the folder structure where the data resides.

For example, given the following files in your filesystem:

/mnt/nyctaxi/year=2022/month=01/tripdata.parquet
/mnt/nyctaxi/year=2021/month=12/tripdata.parquet
/mnt/nyctaxi/year=2021/month=11/tripdata.parquet

A ListingTable created at /mnt/nyctaxi/ with partition columns “year” and “month” will include new year and month columns while reading the files. The year column would have value 2022 and the month column would have value 01 for the rows read from /mnt/nyctaxi/year=2022/month=01/tripdata.parquet

Notes
  • If only one level (e.g. year in the example above) is specified, the other levels are ignored but the files are still read.

  • Files that don’t follow this partitioning scheme will be ignored.

  • Since the columns have the same value for all rows read from each individual file (such as dates), they are typically dictionary encoded for efficiency. You may use wrap_partition_type_in_dict to request a dictionary-encoded type.

  • The partition columns are solely extracted from the file path. In particular, they are NOT part of the Parquet files themselves.

Example

// listing options for files with paths such as  `/mnt/data/col_a=x/col_b=y/data.parquet`
// `col_a` and `col_b` will be included in the data read from those files
let listing_options = ListingOptions::new(Arc::new(
    ParquetFormat::default()
  ))
  .with_table_partition_cols(vec![("col_a".to_string(), DataType::Utf8),
      ("col_b".to_string(), DataType::Utf8)]);

assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8),
    ("col_b".to_string(), DataType::Utf8)]);
source

pub fn with_collect_stat(self, collect_stat: bool) -> Self

Set stat collection on ListingOptions and returns self.


let listing_options = ListingOptions::new(Arc::new(
    ParquetFormat::default()
  ))
  .with_collect_stat(true);

assert_eq!(listing_options.collect_stat, true);
source

pub fn with_target_partitions(self, target_partitions: usize) -> Self

Set number of target partitions on ListingOptions and returns self.


let listing_options = ListingOptions::new(Arc::new(
    ParquetFormat::default()
  ))
  .with_target_partitions(8);

assert_eq!(listing_options.target_partitions, 8);
source

pub fn with_file_sort_order(self, file_sort_order: Option<Vec<Expr>>) -> Self

Set file sort order on ListingOptions and returns self.


 // Tell datafusion that the files are sorted by column "a"
 let file_sort_order = Some(vec![
   col("a").sort(true, true)
 ]);

let listing_options = ListingOptions::new(Arc::new(
    ParquetFormat::default()
  ))
  .with_file_sort_order(file_sort_order.clone());

assert_eq!(listing_options.file_sort_order, file_sort_order);
source

pub async fn infer_schema<'a>( &'a self, state: &SessionState, table_path: &'a ListingTableUrl ) -> Result<SchemaRef>

Infer the schema of the files at the given path on the provided object store. The inferred schema does not include the partitioning columns.

This method will not be called by the table itself but before creating it. This way, when creating the logical plan, we can decide to resolve the schema locally or ask a remote service to do it (e.g. a scheduler).

Trait Implementations§

source§

impl Clone for ListingOptions

source§

fn clone(&self) -> ListingOptions

Returns a copy of the value. Read more
1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
source§

impl Debug for ListingOptions

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for T where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T> Instrument for T

source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
source§

impl<T, U> Into<U> for T where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T> Same<T> for T

§

type Output = T

Should always be Self
source§

impl<T> ToOwned for T where T: Clone,

§

type Owned = T

The resulting type after obtaining ownership.
source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
source§

impl<T, U> TryFrom<U> for T where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
§

impl<V, T> VZip<V> for T where V: MultiLane<T>,

§

fn vzip(self) -> V

source§

impl<T> WithSubscriber for T

source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more