arrow_view_state/filter.rs
1//! `FilterIndex` — sparse filter backed by Roaring Bitmap.
2//!
3//! Stores which physical row IDs pass the active filter. Composes with
4//! `PermutationIndex` via [`FilterIndex::apply_to_permutation`] to produce
5//! a filtered, sorted selection vector.
6
7use arrow_array::{BooleanArray, UInt32Array};
8use roaring::RoaringBitmap;
9
10use crate::permutation::PermutationIndex;
11
12/// A filter over physical row IDs, backed by a Roaring Bitmap.
13///
14/// Construct from matching row IDs or from an Arrow `BooleanArray`,
15/// then apply to a `PermutationIndex` to get a filtered selection vector.
16#[derive(Debug, Clone)]
17pub struct FilterIndex {
18 mask: RoaringBitmap,
19}
20
21impl FilterIndex {
22 /// Construct from an iterator of matching physical row IDs.
23 pub fn from_ids(ids: impl IntoIterator<Item = u32>) -> Self {
24 Self {
25 mask: ids.into_iter().collect(),
26 }
27 }
28
29 /// Construct from an Arrow `BooleanArray` where `true` = passes filter.
30 ///
31 /// Null values are treated as `false` (do not pass filter).
32 pub fn from_boolean_array(arr: &BooleanArray) -> Self {
33 let mask: RoaringBitmap = arr
34 .iter()
35 .enumerate()
36 .filter_map(|(i, v)| {
37 #[allow(clippy::cast_possible_truncation)]
38 v.unwrap_or(false).then_some(i as u32)
39 })
40 .collect();
41 Self { mask }
42 }
43
44 /// Number of rows passing the filter.
45 pub fn len(&self) -> u64 {
46 self.mask.len()
47 }
48
49 /// Whether no rows pass the filter.
50 pub fn is_empty(&self) -> bool {
51 self.mask.is_empty()
52 }
53
54 /// Check whether a specific physical row ID passes the filter.
55 pub fn contains(&self, id: u32) -> bool {
56 self.mask.contains(id)
57 }
58
59 /// Apply this filter to a sorted permutation, returning a new `PermutationIndex`
60 /// containing only the permuted rows that pass the filter, in sort order.
61 ///
62 /// This is the core composition operation:
63 /// `sorted_and_filtered = filter.apply_to_permutation(&sorted)`
64 ///
65 /// Processes in chunks to avoid reading entire mmap indices at once.
66 /// Time: O(n) where n = permutation length, with O(1) membership checks
67 /// via the Roaring Bitmap.
68 #[allow(clippy::cast_possible_truncation)] // len ≤ u32::MAX by construction
69 pub fn apply_to_permutation(&self, permutation: &PermutationIndex) -> PermutationIndex {
70 const CHUNK: usize = 65_536;
71 let len = permutation.len() as usize;
72 let mut values = Vec::new();
73
74 let mut offset = 0;
75 while offset < len {
76 let end = (offset + CHUNK).min(len);
77 let chunk = permutation.read_range(offset, end);
78 for &id in &chunk {
79 if self.mask.contains(id) {
80 values.push(id);
81 }
82 }
83 offset = end;
84 }
85
86 PermutationIndex::from_array(UInt32Array::from(values))
87 }
88
89 // ── Set algebra ────────────────────────────────────────────────────
90
91 /// Intersect two filters (AND semantics).
92 #[must_use]
93 pub fn intersection(&self, other: &FilterIndex) -> FilterIndex {
94 FilterIndex {
95 mask: &self.mask & &other.mask,
96 }
97 }
98
99 /// Union two filters (OR semantics).
100 #[must_use]
101 pub fn union(&self, other: &FilterIndex) -> FilterIndex {
102 FilterIndex {
103 mask: &self.mask | &other.mask,
104 }
105 }
106
107 /// Negate filter within a universe of `total_rows` physical rows.
108 ///
109 /// Returns a filter containing all rows in `[0, total_rows)` that are
110 /// NOT in the current filter. Uses `insert_range` for O(containers)
111 /// universe construction instead of O(elements) iteration.
112 #[must_use]
113 pub fn negate(&self, total_rows: u32) -> FilterIndex {
114 let mut universe = RoaringBitmap::new();
115 universe.insert_range(0..total_rows);
116 FilterIndex {
117 mask: universe - &self.mask,
118 }
119 }
120
121 /// Set difference: rows in `self` but not in `other`.
122 ///
123 /// Equivalent to `self ∩ ¬other`, but without allocating the negation.
124 #[must_use]
125 pub fn difference(&self, other: &FilterIndex) -> FilterIndex {
126 FilterIndex {
127 mask: &self.mask - &other.mask,
128 }
129 }
130
131 /// Convert to a `BooleanArray` of length `total_rows`.
132 ///
133 /// `true` at position `i` means physical row `i` passes the filter.
134 pub fn into_boolean_array(&self, total_rows: u32) -> BooleanArray {
135 let values: Vec<bool> = (0..total_rows).map(|i| self.mask.contains(i)).collect();
136 BooleanArray::from(values)
137 }
138
139 /// Iterate over matching physical row IDs in ascending order.
140 pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
141 self.mask.iter()
142 }
143
144 /// Access the underlying bitmap (for persistence and index types).
145 #[allow(dead_code)] // used by feature-gated modules
146 pub(crate) fn bitmap(&self) -> &RoaringBitmap {
147 &self.mask
148 }
149
150 /// Construct from a raw bitmap (for persistence and index types).
151 #[allow(dead_code)] // used by feature-gated modules
152 pub(crate) fn from_bitmap(mask: RoaringBitmap) -> Self {
153 Self { mask }
154 }
155
156 /// Construct from a borrowed bitmap (clone).
157 #[allow(dead_code)] // used by feature-gated modules
158 pub(crate) fn from_bitmap_ref(bitmap: &RoaringBitmap) -> Self {
159 Self {
160 mask: bitmap.clone(),
161 }
162 }
163}