pub trait FileFormat:
Send
+ Sync
+ Debug {
// Required methods
fn as_any(&self) -> &dyn Any;
fn get_ext(&self) -> String;
fn get_ext_with_compression(
&self,
_file_compression_type: &FileCompressionType,
) -> Result<String>;
fn compression_type(&self) -> Option<FileCompressionType>;
fn infer_schema<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
state: &'life1 dyn Session,
store: &'life2 Arc<dyn ObjectStore>,
objects: &'life3 [ObjectMeta],
) -> Pin<Box<dyn Future<Output = Result<SchemaRef>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait;
fn infer_stats<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
state: &'life1 dyn Session,
store: &'life2 Arc<dyn ObjectStore>,
table_schema: SchemaRef,
object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<Statistics>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait;
fn create_physical_plan<'life0, 'life1, 'async_trait>(
&'life0 self,
state: &'life1 dyn Session,
conf: FileScanConfig,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait;
fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource>;
// Provided methods
fn infer_ordering<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
_state: &'life1 dyn Session,
_store: &'life2 Arc<dyn ObjectStore>,
_table_schema: SchemaRef,
_object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<Option<LexOrdering>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait { ... }
fn infer_stats_and_ordering<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
state: &'life1 dyn Session,
store: &'life2 Arc<dyn ObjectStore>,
table_schema: SchemaRef,
object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<FileMeta>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait { ... }
fn create_writer_physical_plan<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: Arc<dyn ExecutionPlan>,
_state: &'life1 dyn Session,
_conf: FileSinkConfig,
_order_requirements: Option<LexRequirement>,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
}

Expand description
This trait abstracts all the file format specific implementations
from the TableProvider. This helps code re-utilization across
providers that support the same file formats.
Required Methods§
Source
fn as_any(&self) -> &dyn Any
Returns the table provider as Any so that it can be
downcast to a specific implementation.
Source
fn get_ext_with_compression(
    &self,
    _file_compression_type: &FileCompressionType,
) -> Result<String>
Returns the extension for this FileFormat when compressed, e.g. “file.csv.gz” -> csv
Source
fn compression_type(&self) -> Option<FileCompressionType>
Returns whether this instance uses compression if applicable
Source
fn infer_schema<'life0, 'life1, 'life2, 'life3, 'async_trait>(
    &'life0 self,
    state: &'life1 dyn Session,
    store: &'life2 Arc<dyn ObjectStore>,
    objects: &'life3 [ObjectMeta],
) -> Pin<Box<dyn Future<Output = Result<SchemaRef>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
    'life2: 'async_trait,
    'life3: 'async_trait,
Infer the common schema of the provided objects. The objects will usually be analysed up to a given number of records or files (as specified in the format config) then give the estimated common schema. This might fail if the files have schemas that cannot be merged.
Source
fn infer_stats<'life0, 'life1, 'life2, 'life3, 'async_trait>(
    &'life0 self,
    state: &'life1 dyn Session,
    store: &'life2 Arc<dyn ObjectStore>,
    table_schema: SchemaRef,
    object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<Statistics>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
    'life2: 'async_trait,
    'life3: 'async_trait,
Infer the statistics for the provided object. The cost and accuracy of the estimated statistics might vary greatly between file formats.
table_schema is the (combined) schema of the overall table
and may be a superset of the schema contained in this file.
TODO: should the file source return statistics for only columns referred to in the table schema?
Source
fn create_physical_plan<'life0, 'life1, 'async_trait>(
    &'life0 self,
    state: &'life1 dyn Session,
    conf: FileScanConfig,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
Take a list of files and convert it to the appropriate executor according to this file format.
Source
fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource>
Return the related FileSource such as CsvSource, JsonSource, etc.
§Arguments
table_schema – The table schema to use for the FileSource (includes partition columns)
Provided Methods§
Source
fn infer_ordering<'life0, 'life1, 'life2, 'life3, 'async_trait>(
    &'life0 self,
    _state: &'life1 dyn Session,
    _store: &'life2 Arc<dyn ObjectStore>,
    _table_schema: SchemaRef,
    _object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<Option<LexOrdering>>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
    'life2: 'async_trait,
    'life3: 'async_trait,
Infer the ordering (sort order) for the provided object from file metadata.
Returns Ok(None) if the file format does not support ordering inference
or if the file does not have ordering information.
table_schema is the (combined) schema of the overall table
and may be a superset of the schema contained in this file.
The default implementation returns Ok(None).
Source
fn infer_stats_and_ordering<'life0, 'life1, 'life2, 'life3, 'async_trait>(
    &'life0 self,
    state: &'life1 dyn Session,
    store: &'life2 Arc<dyn ObjectStore>,
    table_schema: SchemaRef,
    object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<FileMeta>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
    'life2: 'async_trait,
    'life3: 'async_trait,
Infer both statistics and ordering from a single metadata read.
This is more efficient than calling Self::infer_stats and
Self::infer_ordering separately when both are needed, as it avoids
reading file metadata twice.
The default implementation calls both methods separately. File formats that can extract both from a single read should override this method.
Source
fn create_writer_physical_plan<'life0, 'life1, 'async_trait>(
    &'life0 self,
    _input: Arc<dyn ExecutionPlan>,
    _state: &'life1 dyn Session,
    _conf: FileSinkConfig,
    _order_requirements: Option<LexRequirement>,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>
where
    Self: 'async_trait,
    'life0: 'async_trait,
    'life1: 'async_trait,
Take a list of files and the configuration to convert it to the appropriate writer executor according to this file format.