pub type ParquetRecordBatchStreamBuilder<T> = ArrowReaderBuilder<AsyncReader<T>>;
Expand description

A builder used to construct a ParquetRecordBatchStream for async reading of a parquet file

In particular, this handles reading the parquet file metadata, allowing consumers to use this information to select what specific columns, row groups, etc. they wish to be read by the resulting stream

See ArrowReaderBuilder for additional member functions

Aliased Type§

struct ParquetRecordBatchStreamBuilder<T> { /* private fields */ }

Implementations§

source§

impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T>

source

pub async fn new(input: T) -> Result<Self>

Create a new ParquetRecordBatchStreamBuilder with the provided parquet file

§Example
// Open async file containing parquet data
let mut file = tokio::fs::File::from_std(file);
// construct the reader
let mut reader = ParquetRecordBatchStreamBuilder::new(file)
  .await.unwrap().build().unwrap();
// Read batches
let batch: RecordBatch = reader.next().await.unwrap().unwrap();
source

pub async fn new_with_options( input: T, options: ArrowReaderOptions, ) -> Result<Self>

Create a new ParquetRecordBatchStreamBuilder with the provided parquet file and ArrowReaderOptions

source

pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self

Create a ParquetRecordBatchStreamBuilder from the provided ArrowReaderMetadata

This allows loading metadata once and using it to create multiple builders with potentially different settings, that can be read in parallel.

§Example of reading from multiple streams in parallel
// open file with parquet data
let mut file = tokio::fs::File::from_std(file);
// load metadata once
let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await.unwrap();
// create two readers, a and b, from the same underlying file
// without reading the metadata again
let mut a = ParquetRecordBatchStreamBuilder::new_with_metadata(
    file.try_clone().await.unwrap(),
    meta.clone()
).build().unwrap();
let mut b = ParquetRecordBatchStreamBuilder::new_with_metadata(file, meta).build().unwrap();

// Can read batches from both readers in parallel
assert_eq!(
  a.next().await.unwrap().unwrap(),
  b.next().await.unwrap().unwrap(),
);
source

pub async fn get_row_group_column_bloom_filter( &mut self, row_group_idx: usize, column_idx: usize, ) -> Result<Option<Sbbf>>

Read bloom filter for a column in a row group. Returns None if the column does not have a bloom filter.

We should call this function after other forms of pruning, such as projection and predicate pushdown.

source

pub fn build(self) -> Result<ParquetRecordBatchStream<T>>