vortex_scan/
lib.rs

1//! The `vortex-scan` crate provides utilities for performing efficient scan operations.
2//!
3//! The [`Scanner`] object is responsible for storing state related to a scan operation, including
4//! expression selectivity metrics, in order to continually optimize the execution plan for each
5//! row-range of the scan.
6#![deny(missing_docs)]
7mod range_scan;
8mod row_mask;
9
10use std::sync::Arc;
11
12pub use range_scan::*;
13pub use row_mask::*;
14use vortex_dtype::DType;
15use vortex_error::VortexResult;
16use vortex_expr::forms::cnf::cnf;
17use vortex_expr::transform::simplify_typed::simplify_typed;
18use vortex_expr::{lit, or, ExprRef};
19
20/// Represents a scan operation to read data from a set of rows, with an optional filter expression,
21/// and a projection expression.
22///
23/// A scan operation can be broken into many [`RangeScanner`] operations, each of which leverages
24/// shared statistics from the parent [`Scanner`] to optimize the order in which filter and projection
25/// operations are applied.
26///
27/// For example, if a filter expression has a top-level `AND` clause, it may be the case that one
28/// clause is significantly more selective than the other. In this case, we may want to compute the
29/// most selective filter first, then prune rows using result of the filter, before evaluating
30/// the second filter over the reduced set of rows.
31#[derive(Debug, Clone)]
32pub struct Scanner {
33    projection: ExprRef,
34    rev_filter: Box<[ExprRef]>,
35    projection_dtype: DType,
36    // A sorted list of row indices to include in the scan. We store row indices since they may
37    // produce a very sparse RowMask.
38    // take_indices: Vec<u64>,
39    // statistics: RwLock<Statistics>
40}
41
42impl Scanner {
43    /// Create a new scan with the given projection and optional filter.
44    pub fn new(dtype: DType, projection: ExprRef, filter: Option<ExprRef>) -> VortexResult<Self> {
45        let projection = simplify_typed(projection, &dtype)?;
46        let filter = filter.map(|f| simplify_typed(f, &dtype)).transpose()?;
47
48        // TODO(ngates): compute and cache a FieldMask based on the referenced fields.
49        //  Where FieldMask ~= Vec<FieldPath>
50        let result_dtype = projection.return_dtype(&dtype)?;
51
52        let conjuncts: Box<[ExprRef]> = if let Some(filter) = filter {
53            let conjuncts = cnf(filter)?;
54            conjuncts
55                .into_iter()
56                .map(|disjunction| {
57                    disjunction
58                        .into_iter()
59                        .reduce(or)
60                        .unwrap_or_else(|| lit(false))
61                })
62                // Reverse the conjuncts so we can pop over the final value each time without a shuffle
63                .rev()
64                .collect()
65        } else {
66            Box::new([])
67        };
68
69        Ok(Self {
70            projection,
71            rev_filter: conjuncts,
72            projection_dtype: result_dtype,
73        })
74    }
75
76    /// Returns the projection expression.
77    pub fn projection(&self) -> &ExprRef {
78        &self.projection
79    }
80
81    /// Compute the result dtype of the scan given the input dtype.
82    pub fn result_dtype(&self) -> &DType {
83        &self.projection_dtype
84    }
85
86    /// Instantiate a new scan for a specific range. The range scan will share statistics with this
87    /// parent scan in order to optimize future range scans.
88    pub fn range_scanner(self: Arc<Self>, row_mask: RowMask) -> VortexResult<RangeScanner> {
89        Ok(RangeScanner::new(
90            self,
91            row_mask.begin(),
92            row_mask.filter_mask().clone(),
93        ))
94    }
95}