vortex_array/pipeline/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Vortex crate containing vectorized operator processing.
5//!
6//! This module contains experiments into pipelined data processing within Vortex.
7//!
8//! Arrays (and eventually Layouts) will be convertible into a [`Kernel`] that can then be
9//! exported into a [`ViewMut`] one chunk of [`N`] elements at a time. This allows us to keep
10//! compute largely within the L1 cache, as well as to write out canonical data into externally
11//! provided buffers.
12//!
13//! Each chunk is represented in a canonical physical form, as determined by the logical
14//! [`vortex_dtype::DType`] of the array. This provides a predicate base on which to perform
15//! compute. Unlike DuckDB and other vectorized systems, we force a single canonical representation
16//! instead of supporting multiple encodings because compute push-down is applied a priori to the
17//! logical representation.
18//!
19//! It is a work-in-progress and is not yet used in production.
20
21pub mod bits;
22pub(crate) mod operator;
23pub mod row_selection;
24mod types;
25pub mod vec;
26pub mod view;
27
28use std::cell::RefCell;
29
30pub use row_selection::*;
31pub use types::*;
32use vec::VectorRef;
33use vortex_error::VortexResult;
34
35use self::vec::Vector;
36use self::view::ViewMut;
37use crate::Canonical;
38use crate::operator::Operator;
39use crate::pipeline::bits::BitView;
40
41/// The number of elements in each step of a Vortex evaluation operator.
42pub const N: usize = 1024;
43
44// Number of usize words needed to store N bits
45pub const N_WORDS: usize = N / usize::BITS as usize;
46
47pub trait PipelinedOperator: Operator {
48    /// Defines the row selection of this pipeline operator.
49    fn row_selection(&self) -> RowSelection;
50
51    // Whether this operator works by mutating its first child in-place.
52    //
53    // If `true`, the operator is invoked with the first child's input data passed via the
54    // mutable output view. The node is expected to mutate this data in-place.
55    // TODO(ngates): enable this
56    // fn in_place(&self) -> bool {
57    //     false
58    // }
59
60    /// Bind the operator into a [`Kernel`] for pipelined execution.
61    fn bind(&self, ctx: &dyn BindContext) -> VortexResult<Box<dyn Kernel>>;
62
63    /// Returns the child indices of this operator that are passed to the kernel as input vectors.
64    fn vector_children(&self) -> Vec<usize>;
65
66    /// Returns the child indices of this operator that are passed to the kernel as batch inputs.
67    fn batch_children(&self) -> Vec<usize>;
68}
69
70/// The context used when binding an operator for execution.
71pub trait BindContext {
72    fn children(&self) -> &[VectorId];
73
74    fn batch_inputs(&self) -> &[BatchId];
75}
76
77/// The ID of the vector to use.
78pub type VectorId = usize;
79/// The ID of the batch input to use.
80pub type BatchId = usize;
81
82/// A operator provides a push-based way to emit a stream of canonical data.
83///
84/// By passing multiple vector computations through the same operator, we can amortize
85/// the setup costs (such as DType validation, stats short-circuiting, etc.), and to make better
86/// use of CPU caches by performing all operations while the data is hot.
87pub trait Kernel: Send {
88    /// Attempts to perform a single step of the operator, writing data to the output vector.
89    ///
90    /// The kernel step should be stateless and is passed the chunk index as well as the selection
91    /// mask for this chunk.
92    ///
93    /// Input and output vectors have a `Selection` enum indicating which elements of the vector
94    /// are valid for processing. This is one of:
95    /// * Full - all N elements are valid.
96    /// * Prefix - the first n elements are valid, where n is the true count of the selection mask.
97    /// * Mask - only the elements indicated by the selection mask are valid.
98    ///
99    /// Kernel should inspect the selection enum of the input and iterate the values accordingly.
100    /// They may choose to write the output vector in any selection mode, but should choose the most
101    /// efficient mode possible - not forgetting to update the output vector's selection enum.
102    fn step(
103        &self,
104        ctx: &KernelContext,
105        chunk_idx: usize,
106        selection: &BitView,
107        out: &mut ViewMut,
108    ) -> VortexResult<()>;
109}
110
111/// Context passed to kernels during execution, providing access to vectors.
112pub struct KernelContext {
113    /// The allocated vectors for intermediate results.
114    pub(crate) vectors: Vec<RefCell<Vector>>,
115    /// The computed batch inputs.
116    pub(crate) batch_inputs: Vec<Canonical>,
117}
118
119impl KernelContext {
120    /// Get a vector by its ID.
121    pub fn vector(&self, vector_id: VectorId) -> VectorRef<'_> {
122        VectorRef::new(self.vectors[vector_id].borrow())
123    }
124
125    /// Get a batch input by its ID.
126    pub fn batch_input(&self, batch_id: BatchId) -> &Canonical {
127        &self.batch_inputs[batch_id]
128    }
129}
vortex_array/pipeline/mod.rs

vortex_array/pipeline/
mod.rs