vortex_scan/lib.rs
1//! The `vortex-scan` crate provides utilities for performing efficient scan operations.
2//!
3//! The [`Scanner`] object is responsible for storing state related to a scan operation, including
4//! expression selectivity metrics, in order to continually optimize the execution plan for each
5//! row-range of the scan.
6#![deny(missing_docs)]
7mod range_scan;
8mod row_mask;
9
10use std::sync::Arc;
11
12pub use range_scan::*;
13pub use row_mask::*;
14use vortex_dtype::DType;
15use vortex_error::VortexResult;
16use vortex_expr::forms::cnf::cnf;
17use vortex_expr::transform::simplify_typed::simplify_typed;
18use vortex_expr::{lit, or, ExprRef};
19
20/// Represents a scan operation to read data from a set of rows, with an optional filter expression,
21/// and a projection expression.
22///
23/// A scan operation can be broken into many [`RangeScanner`] operations, each of which leverages
24/// shared statistics from the parent [`Scanner`] to optimize the order in which filter and projection
25/// operations are applied.
26///
27/// For example, if a filter expression has a top-level `AND` clause, it may be the case that one
28/// clause is significantly more selective than the other. In this case, we may want to compute the
29/// most selective filter first, then prune rows using result of the filter, before evaluating
30/// the second filter over the reduced set of rows.
31#[derive(Debug, Clone)]
32pub struct Scanner {
33 projection: ExprRef,
34 rev_filter: Box<[ExprRef]>,
35 projection_dtype: DType,
36 // A sorted list of row indices to include in the scan. We store row indices since they may
37 // produce a very sparse RowMask.
38 // take_indices: Vec<u64>,
39 // statistics: RwLock<Statistics>
40}
41
42impl Scanner {
43 /// Create a new scan with the given projection and optional filter.
44 pub fn new(dtype: DType, projection: ExprRef, filter: Option<ExprRef>) -> VortexResult<Self> {
45 let projection = simplify_typed(projection, &dtype)?;
46 let filter = filter.map(|f| simplify_typed(f, &dtype)).transpose()?;
47
48 // TODO(ngates): compute and cache a FieldMask based on the referenced fields.
49 // Where FieldMask ~= Vec<FieldPath>
50 let result_dtype = projection.return_dtype(&dtype)?;
51
52 let conjuncts: Box<[ExprRef]> = if let Some(filter) = filter {
53 let conjuncts = cnf(filter)?;
54 conjuncts
55 .into_iter()
56 .map(|disjunction| {
57 disjunction
58 .into_iter()
59 .reduce(or)
60 .unwrap_or_else(|| lit(false))
61 })
62 // Reverse the conjuncts so we can pop over the final value each time without a shuffle
63 .rev()
64 .collect()
65 } else {
66 Box::new([])
67 };
68
69 Ok(Self {
70 projection,
71 rev_filter: conjuncts,
72 projection_dtype: result_dtype,
73 })
74 }
75
76 /// Returns the projection expression.
77 pub fn projection(&self) -> &ExprRef {
78 &self.projection
79 }
80
81 /// Compute the result dtype of the scan given the input dtype.
82 pub fn result_dtype(&self) -> &DType {
83 &self.projection_dtype
84 }
85
86 /// Instantiate a new scan for a specific range. The range scan will share statistics with this
87 /// parent scan in order to optimize future range scans.
88 pub fn range_scanner(self: Arc<Self>, row_mask: RowMask) -> VortexResult<RangeScanner> {
89 Ok(RangeScanner::new(
90 self,
91 row_mask.begin(),
92 row_mask.filter_mask().clone(),
93 ))
94 }
95}