datafusion_python::datafusion_common::arrow::compute::kernels::coalesce

Struct BatchCoalescer

pub struct BatchCoalescer { /* private fields */ }

Expand description

Concatenate multiple RecordBatches

Implements the common pattern of incrementally creating output RecordBatches of a specific size from an input stream of RecordBatches.

This is useful after operations such as filter and take, which produce smaller batches that we then want to coalesce into larger batches for further processing.

See: https://github.com/apache/arrow-rs/issues/6692

§Example

use arrow_array::record_batch;
use arrow_select::coalesce::{BatchCoalescer};
let batch1 = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
let batch2 = record_batch!(("a", Int32, [4, 5])).unwrap();

// Create a `BatchCoalescer` that will produce batches of exactly 4 rows (the final batch may have fewer)
let target_batch_size = 4;
let mut coalescer = BatchCoalescer::new(batch1.schema(), 4);

// push the batches
coalescer.push_batch(batch1).unwrap();
// only pushed 3 rows so far (fewer than the 4 needed to produce a batch)
assert!(coalescer.next_completed_batch().is_none());
coalescer.push_batch(batch2).unwrap();
// now we have 5 rows, so we can produce a batch
let finished = coalescer.next_completed_batch().unwrap();
// 4 rows came out (target batch size is 4)
let expected = record_batch!(("a", Int32, [1, 2, 3, 4])).unwrap();
assert_eq!(finished, expected);

// Have no more input, but still have an in-progress batch
assert!(coalescer.next_completed_batch().is_none());
// We can finish the batch, which will produce the remaining rows
coalescer.finish_buffered_batch().unwrap();
let expected = record_batch!(("a", Int32, [5])).unwrap();
assert_eq!(coalescer.next_completed_batch().unwrap(), expected);

// The coalescer is now empty
assert!(coalescer.next_completed_batch().is_none());

§Background

Generally speaking, larger RecordBatches are more efficient to process than smaller RecordBatches (until the CPU cache is exceeded) because there is fixed processing overhead per batch. This coalescer builds up these larger batches incrementally.

┌────────────────────┐
│    RecordBatch     │
│   num_rows = 100   │
└────────────────────┘                 ┌────────────────────┐
                                       │                    │
┌────────────────────┐     Coalesce    │                    │
│                    │      Batches    │                    │
│    RecordBatch     │                 │                    │
│   num_rows = 200   │  ─ ─ ─ ─ ─ ─ ▶  │                    │
│                    │                 │    RecordBatch     │
│                    │                 │   num_rows = 400   │
└────────────────────┘                 │                    │
                                       │                    │
┌────────────────────┐                 │                    │
│                    │                 │                    │
│    RecordBatch     │                 │                    │
│   num_rows = 100   │                 └────────────────────┘
│                    │
└────────────────────┘

§Notes:

Output rows are produced in the same order as the input rows
The output is a sequence of batches, with all but the last having exactly target_batch_size rows.

Struct BatchCoalescer

§Example

§Background

§Notes:

Implementations§

impl BatchCoalescer

pub fn new(schema: Arc<Schema>, batch_size: usize) -> BatchCoalescer

§Arguments

pub fn schema(&self) -> Arc<Schema>

pub fn push_batch_with_filter( &mut self, batch: RecordBatch, filter: &BooleanArray, ) -> Result<(), ArrowError>

§Example

pub fn push_batch(&mut self, batch: RecordBatch) -> Result<(), ArrowError>

§Example

pub fn finish_buffered_batch(&mut self) -> Result<(), ArrowError>

pub fn is_empty(&self) -> bool

pub fn has_completed_batch(&self) -> bool

pub fn next_completed_batch(&mut self) -> Option<RecordBatch>

Trait Implementations§

impl Debug for BatchCoalescer

fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error>

Auto Trait Implementations§

impl Freeze for BatchCoalescer

impl !RefUnwindSafe for BatchCoalescer

impl Send for BatchCoalescer

impl Sync for BatchCoalescer

impl Unpin for BatchCoalescer

impl !UnwindSafe for BatchCoalescer

Blanket Implementations§

impl<T> AlignerFor<1> for T

type Aligner = AlignTo1<T>

impl<T> AlignerFor<1024> for T

type Aligner = AlignTo1024<T>

impl<T> AlignerFor<128> for T

type Aligner = AlignTo128<T>

impl<T> AlignerFor<16> for T

type Aligner = AlignTo16<T>

impl<T> AlignerFor<16384> for T

type Aligner = AlignTo16384<T>

impl<T> AlignerFor<2> for T

type Aligner = AlignTo2<T>

impl<T> AlignerFor<2048> for T

type Aligner = AlignTo2048<T>

impl<T> AlignerFor<256> for T

type Aligner = AlignTo256<T>

impl<T> AlignerFor<32> for T

type Aligner = AlignTo32<T>

impl<T> AlignerFor<32768> for T

type Aligner = AlignTo32768<T>

impl<T> AlignerFor<4> for T

type Aligner = AlignTo4<T>

impl<T> AlignerFor<4096> for T

type Aligner = AlignTo4096<T>

impl<T> AlignerFor<512> for T

type Aligner = AlignTo512<T>

impl<T> AlignerFor<64> for T

type Aligner = AlignTo64<T>

impl<T> AlignerFor<8> for T

type Aligner = AlignTo8<T>

impl<T> AlignerFor<8192> for T

type Aligner = AlignTo8192<T>

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> PolicyExt for T where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P> where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P> where T: Policy<B, E>, P: Policy<B, E>,

impl<S> ROExtAcc for S

Struct BatchCoalescer

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn f_get_copy<F>(&self, offset: FieldOffset<S, F, Aligned>) -> F
where F: Copy,

fn f_get_copy<F>(&self, offset: FieldOffset<S, F, Unaligned>) -> F
where F: Copy,

impl<T> SelfOps for T
where T: ?Sized,

fn piped<F, U>(self, f: F) -> U
where F: FnOnce(Self) -> U, Self: Sized,

fn piped_ref<'a, F, U>(&'a self, f: F) -> U
where F: FnOnce(&'a Self) -> U,

fn piped_mut<'a, F, U>(&'a mut self, f: F) -> U
where F: FnOnce(&'a mut Self) -> U,

fn mutated<F>(self, f: F) -> Self
where F: FnOnce(&mut Self), Self: Sized,

fn observe<F>(self, f: F) -> Self
where F: FnOnce(&Self), Self: Sized,

fn into_<T>(self) -> T
where Self: Into<T>,

fn as_ref_<T>(&self) -> &T
where Self: AsRef<T>, T: ?Sized,

fn as_mut_<T>(&mut self) -> &mut T
where Self: AsMut<T>, T: ?Sized,

fn drop_(self)
where Self: Sized,

impl<This> TransmuteElement for This
where This: ?Sized,

unsafe fn transmute_element<T>(self) -> Self::TransmutedPtr
where Self: CanTransmuteElement<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<T> TypeIdentity for T
where T: ?Sized,

fn into_type(self) -> Self::Type
where Self: Sized, Self::Type: Sized,

fn from_type(this: Self::Type) -> Self
where Self: Sized, Self::Type: Sized,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> ErasedDestructor for T
where T: 'static,

impl<T> Ungil for T
where T: Send,