pub struct ArrowReaderBuilder<T> { /* private fields */ }
Expand description

A generic builder for constructing sync or async arrow parquet readers. This is not intended to be used directly; instead, use the specialization for the type of reader you wish to use (ParquetRecordBatchReaderBuilder for sync readers, ParquetRecordBatchStreamBuilder for async readers).

Implementations§

source§

impl<T> ArrowReaderBuilder<T>

source

pub fn metadata(&self) -> &Arc<ParquetMetaData>

Returns a reference to the ParquetMetaData for this parquet file

source

pub fn parquet_schema(&self) -> &SchemaDescriptor

Returns the parquet SchemaDescriptor for this parquet file

source

pub fn schema(&self) -> &SchemaRef

Returns the arrow SchemaRef for this parquet file

source

pub fn with_batch_size(self, batch_size: usize) -> Self

Set the size of RecordBatch to produce. Defaults to 1024. If the batch_size is more than the file row count, the file row count is used instead.

source

pub fn with_row_groups(self, row_groups: Vec<usize>) -> Self

Only read data from the provided row group indexes

source

pub fn with_projection(self, mask: ProjectionMask) -> Self

Only read data from the columns selected by the provided ProjectionMask

source

pub fn with_row_selection(self, selection: RowSelection) -> Self

Provide a RowSelection to filter out rows, and avoid fetching their data into memory.

Row group filtering is applied prior to this, and therefore rows from skipped row groups should not be included in the RowSelection

An example use case of this would be applying a selection determined by evaluating predicates against the Index

It is recommended to enable reading the page index if using this functionality, to allow more efficient skipping over data pages. See ArrowReaderOptions::with_page_index

source

pub fn with_row_filter(self, filter: RowFilter) -> Self

Provide a RowFilter to skip decoding rows

Row filters are applied after row group selection and row selection

It is recommended to enable reading the page index if using this functionality, to allow more efficient skipping over data pages. See ArrowReaderOptions::with_page_index.

source

pub fn with_limit(self, limit: usize) -> Self

Provide a limit to the number of rows to be read

The limit will be applied after any Self::with_row_selection and Self::with_row_filter, allowing it to limit the final set of rows decoded after any pushed-down predicates

It is recommended to enable reading the page index if using this functionality, to allow more efficient skipping over data pages. See ArrowReaderOptions::with_page_index

source

pub fn with_offset(self, offset: usize) -> Self

Provide an offset to skip over the given number of rows

The offset will be applied after any Self::with_row_selection and Self::with_row_filter, allowing it to skip rows after any pushed-down predicates

It is recommended to enable reading the page index if using this functionality, to allow more efficient skipping over data pages. See ArrowReaderOptions::with_page_index

source§

impl<T: ChunkReader + 'static> ArrowReaderBuilder<SyncReader<T>>

source

pub fn try_new(reader: T) -> Result<Self>

Create a new ParquetRecordBatchReaderBuilder

let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();

// Inspect metadata
assert_eq!(builder.metadata().num_row_groups(), 1);

// Construct reader
let mut reader: ParquetRecordBatchReader = builder.with_row_groups(vec![0]).build().unwrap();

// Read data
let _batch = reader.next().unwrap().unwrap();
source

pub fn try_new_with_options( reader: T, options: ArrowReaderOptions ) -> Result<Self>

source

pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self

Create a ParquetRecordBatchReaderBuilder from the provided ArrowReaderMetadata

This allows loading metadata once and using it to create multiple builders with potentially different settings

let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap();
let mut a = ParquetRecordBatchReaderBuilder::new_with_metadata(file.clone(), metadata.clone()).build().unwrap();
let mut b = ParquetRecordBatchReaderBuilder::new_with_metadata(file, metadata).build().unwrap();

// Should be able to read from both in parallel
assert_eq!(a.next().unwrap().unwrap(), b.next().unwrap().unwrap());
source

pub fn build(self) -> Result<ParquetRecordBatchReader>

Build a ParquetRecordBatchReader

Note: this will eagerly evaluate any RowFilter before returning

source§

impl<T: AsyncFileReader + Send + 'static> ArrowReaderBuilder<AsyncReader<T>>

source

pub async fn new(input: T) -> Result<Self>

Create a new ParquetRecordBatchStreamBuilder with the provided parquet file

source

pub async fn new_with_options( input: T, options: ArrowReaderOptions ) -> Result<Self>

Create a new ParquetRecordBatchStreamBuilder with the provided parquet file and ArrowReaderOptions

source

pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self

Create a ParquetRecordBatchStreamBuilder from the provided ArrowReaderMetadata

This allows loading metadata once and using it to create multiple builders with potentially different settings

let mut file = tempfile().unwrap();
let mut file = tokio::fs::File::from_std(file);
let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await.unwrap();
let mut a = ParquetRecordBatchStreamBuilder::new_with_metadata(
    file.try_clone().await.unwrap(),
    meta.clone()
).build().unwrap();
let mut b = ParquetRecordBatchStreamBuilder::new_with_metadata(file, meta).build().unwrap();

// Should be able to read from both in parallel
assert_eq!(a.next().await.unwrap().unwrap(), b.next().await.unwrap().unwrap());
source

pub async fn get_row_group_column_bloom_filter( &mut self, row_group_idx: usize, column_idx: usize ) -> Result<Option<Sbbf>>

Read the bloom filter for a column in a row group. Returns None if the column does not have a bloom filter.

This function should be called after other forms of pruning, such as projection and predicate pushdown.

source

pub fn build(self) -> Result<ParquetRecordBatchStream<T>>

Auto Trait Implementations§

§

impl<T> Freeze for ArrowReaderBuilder<T>
where T: Freeze,

§

impl<T> !RefUnwindSafe for ArrowReaderBuilder<T>

§

impl<T> Send for ArrowReaderBuilder<T>
where T: Send,

§

impl<T> !Sync for ArrowReaderBuilder<T>

§

impl<T> Unpin for ArrowReaderBuilder<T>
where T: Unpin,

§

impl<T> !UnwindSafe for ArrowReaderBuilder<T>

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.