Skip to main content

arrow_view_state/
filter.rs

1//! `FilterIndex` — sparse filter backed by Roaring Bitmap.
2//!
3//! Stores which physical row IDs pass the active filter. Composes with
4//! `PermutationIndex` via [`FilterIndex::apply_to_permutation`] to produce
5//! a filtered, sorted selection vector.
6
7use arrow_array::{BooleanArray, UInt32Array};
8use roaring::RoaringBitmap;
9
10use crate::permutation::PermutationIndex;
11
12/// A filter over physical row IDs, backed by a Roaring Bitmap.
13///
14/// Construct from matching row IDs or from an Arrow `BooleanArray`,
15/// then apply to a `PermutationIndex` to get a filtered selection vector.
16#[derive(Debug, Clone)]
17pub struct FilterIndex {
18    mask: RoaringBitmap,
19}
20
21impl FilterIndex {
22    /// Construct from an iterator of matching physical row IDs.
23    pub fn from_ids(ids: impl IntoIterator<Item = u32>) -> Self {
24        Self {
25            mask: ids.into_iter().collect(),
26        }
27    }
28
29    /// Construct from an Arrow `BooleanArray` where `true` = passes filter.
30    ///
31    /// Null values are treated as `false` (do not pass filter).
32    pub fn from_boolean_array(arr: &BooleanArray) -> Self {
33        let mask: RoaringBitmap = arr
34            .iter()
35            .enumerate()
36            .filter_map(|(i, v)| {
37                #[allow(clippy::cast_possible_truncation)]
38                v.unwrap_or(false).then_some(i as u32)
39            })
40            .collect();
41        Self { mask }
42    }
43
44    /// Number of rows passing the filter.
45    pub fn len(&self) -> u64 {
46        self.mask.len()
47    }
48
49    /// Whether no rows pass the filter.
50    pub fn is_empty(&self) -> bool {
51        self.mask.is_empty()
52    }
53
54    /// Check whether a specific physical row ID passes the filter.
55    pub fn contains(&self, id: u32) -> bool {
56        self.mask.contains(id)
57    }
58
59    /// Apply this filter to a sorted permutation, returning a new `PermutationIndex`
60    /// containing only the permuted rows that pass the filter, in sort order.
61    ///
62    /// This is the core composition operation:
63    /// `sorted_and_filtered = filter.apply_to_permutation(&sorted)`
64    ///
65    /// Processes in chunks to avoid reading entire mmap indices at once.
66    /// Time: O(n) where n = permutation length, with O(1) membership checks
67    /// via the Roaring Bitmap.
68    #[allow(clippy::cast_possible_truncation)] // len ≤ u32::MAX by construction
69    pub fn apply_to_permutation(&self, permutation: &PermutationIndex) -> PermutationIndex {
70        const CHUNK: usize = 65_536;
71        let len = permutation.len() as usize;
72        let mut values = Vec::new();
73
74        let mut offset = 0;
75        while offset < len {
76            let end = (offset + CHUNK).min(len);
77            let chunk = permutation.read_range(offset, end);
78            for &id in &chunk {
79                if self.mask.contains(id) {
80                    values.push(id);
81                }
82            }
83            offset = end;
84        }
85
86        PermutationIndex::from_array(UInt32Array::from(values))
87    }
88
89    // ── Set algebra ────────────────────────────────────────────────────
90
91    /// Intersect two filters (AND semantics).
92    #[must_use]
93    pub fn intersection(&self, other: &FilterIndex) -> FilterIndex {
94        FilterIndex {
95            mask: &self.mask & &other.mask,
96        }
97    }
98
99    /// Union two filters (OR semantics).
100    #[must_use]
101    pub fn union(&self, other: &FilterIndex) -> FilterIndex {
102        FilterIndex {
103            mask: &self.mask | &other.mask,
104        }
105    }
106
107    /// Negate filter within a universe of `total_rows` physical rows.
108    ///
109    /// Returns a filter containing all rows in `[0, total_rows)` that are
110    /// NOT in the current filter. Uses `insert_range` for O(containers)
111    /// universe construction instead of O(elements) iteration.
112    #[must_use]
113    pub fn negate(&self, total_rows: u32) -> FilterIndex {
114        let mut universe = RoaringBitmap::new();
115        universe.insert_range(0..total_rows);
116        FilterIndex {
117            mask: universe - &self.mask,
118        }
119    }
120
121    /// Set difference: rows in `self` but not in `other`.
122    ///
123    /// Equivalent to `self ∩ ¬other`, but without allocating the negation.
124    #[must_use]
125    pub fn difference(&self, other: &FilterIndex) -> FilterIndex {
126        FilterIndex {
127            mask: &self.mask - &other.mask,
128        }
129    }
130
131    /// Convert to a `BooleanArray` of length `total_rows`.
132    ///
133    /// `true` at position `i` means physical row `i` passes the filter.
134    pub fn into_boolean_array(&self, total_rows: u32) -> BooleanArray {
135        let values: Vec<bool> = (0..total_rows).map(|i| self.mask.contains(i)).collect();
136        BooleanArray::from(values)
137    }
138
139    /// Iterate over matching physical row IDs in ascending order.
140    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
141        self.mask.iter()
142    }
143
144    /// Access the underlying bitmap (for persistence and index types).
145    #[allow(dead_code)] // used by feature-gated modules
146    pub(crate) fn bitmap(&self) -> &RoaringBitmap {
147        &self.mask
148    }
149
150    /// Construct from a raw bitmap (for persistence and index types).
151    #[allow(dead_code)] // used by feature-gated modules
152    pub(crate) fn from_bitmap(mask: RoaringBitmap) -> Self {
153        Self { mask }
154    }
155
156    /// Construct from a borrowed bitmap (clone).
157    #[allow(dead_code)] // used by feature-gated modules
158    pub(crate) fn from_bitmap_ref(bitmap: &RoaringBitmap) -> Self {
159        Self {
160            mask: bitmap.clone(),
161        }
162    }
163}