reddb_server/storage/index/
tid_bitmap.rs

1//! TID bitmap — Fase 5 P6 building block.
2//!
3//! Provides a `TidBitmap` data structure that represents a set of
4//! row IDs (TIDs) compactly using a `roaring::RoaringBitmap`
5//! backing store. Multiple bitmaps from different indexes can be
6//! AND/OR/NOT combined to express multi-predicate scans, then
7//! drained in sorted order so the heap fetcher can read pages
8//! sequentially instead of jumping around randomly.
9//!
10//! Mirrors PG's `tidbitmap.c` modulo features we don't have:
11//!
12//! - **Lossy mode**: PG flips entries to "page-only" when the
13//!   bitmap grows past `work_mem`. We use a hard cap and report
14//!   `BitmapTooLarge` when crossed; falling back to sequential
15//!   scan is the caller's call.
16//! - **Hash-of-hash sub-bitmaps**: PG uses a 2-level structure
17//!   keyed by block number → row-within-block. RoaringBitmap
18//!   already does block-of-bits compression internally so we
19//!   skip the explicit block layer.
20//! - **Cross-process sharing**: PG tidbitmaps live in shared
21//!   memory for parallel bitmap heap scans. Single-process for
22//!   now.
23//!
24//! The module is **not yet wired** into a query plan node. Wiring
25//! into the canonical plan (Fase 5 W2+) requires a `BitmapIndex`
26//! plan operator that produces a TidBitmap and a `BitmapHeap`
27//! operator that consumes one — both pending.
28//!
29//! ## Why bitmaps win
30//!
31//! For a query like `WHERE a = 1 AND b = 2` with separate hash
32//! indexes on `a` and `b`:
33//!
34//! - **Old path**: pick one index (say `a`), look up matching
35//!   row IDs, fetch each row from the heap, evaluate `b = 2`
36//!   per row. Random heap I/O for each match.
37//! - **Bitmap path**: lookup `a`'s index → `bitmap_a`, lookup
38//!   `b`'s index → `bitmap_b`, AND them in CPU → `intersection`,
39//!   walk `intersection` in sorted order and fetch rows. The
40//!   sequential-by-page walk turns random reads into prefetch-
41//!   friendly streaming I/O.
42//!
43//! On `WHERE a IN (1,2,3) OR b > 5`, the OR equivalent is:
44//!
45//! - `bitmap_a_eq_1 OR bitmap_a_eq_2 OR bitmap_a_eq_3 OR
46//!    bitmap_b_gt_5` — four index lookups, three OR operations,
47//!   one final heap walk.
48
49use roaring::RoaringBitmap;
50
51/// A sparse, sorted set of row IDs backed by a Roaring bitmap.
52/// Row IDs are `u32` because Roaring's API is u32-native; tables
53/// with more than 4 billion rows need to partition by another
54/// dimension (typically segment id) and use multiple bitmaps.
55#[derive(Debug, Clone, Default)]
56pub struct TidBitmap {
57    inner: RoaringBitmap,
58    /// Soft size cap measured in 32-bit words. Crossing this
59    /// triggers `BitmapError::TooLarge`. Defaults to 32 MB worth
60    /// of bits which holds ~256 million sparse row IDs.
61    cap_bytes: usize,
62}
63
64/// Errors raised by bitmap operations.
65#[derive(Debug)]
66pub enum BitmapError {
67    /// The bitmap exceeded its configured size cap. Caller
68    /// should fall back to sequential scan or split the
69    /// predicate further.
70    TooLarge { current: usize, cap: usize },
71}
72
73impl std::fmt::Display for BitmapError {
74    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75        match self {
76            Self::TooLarge { current, cap } => {
77                write!(f, "tid bitmap {current} bytes exceeds cap {cap}")
78            }
79        }
80    }
81}
82
83impl std::error::Error for BitmapError {}
84
85impl TidBitmap {
86    /// Create an empty bitmap with the default 32 MB cap.
87    pub fn new() -> Self {
88        Self {
89            inner: RoaringBitmap::new(),
90            cap_bytes: 32 * 1024 * 1024,
91        }
92    }
93
94    /// Create an empty bitmap with a custom cap (bytes). Use 0
95    /// to disable the cap entirely — useful for tests and
96    /// in-memory benchmarks.
97    pub fn with_cap_bytes(cap_bytes: usize) -> Self {
98        Self {
99            inner: RoaringBitmap::new(),
100            cap_bytes,
101        }
102    }
103
104    /// Insert a row ID. Returns `BitmapError::TooLarge` if the
105    /// resulting size exceeds the configured cap; the row is
106    /// NOT inserted in that case so the caller can recover by
107    /// switching strategies.
108    pub fn insert(&mut self, tid: u32) -> Result<bool, BitmapError> {
109        let added = self.inner.insert(tid);
110        self.check_cap()?;
111        Ok(added)
112    }
113
114    /// Bulk insert from any iterator of row IDs. Stops on the
115    /// first cap violation and returns the number of IDs that
116    /// were successfully inserted before the cap was hit.
117    pub fn extend_from_iter(
118        &mut self,
119        iter: impl IntoIterator<Item = u32>,
120    ) -> Result<usize, BitmapError> {
121        let mut count = 0usize;
122        for tid in iter {
123            self.inner.insert(tid);
124            count += 1;
125            // Amortise the cap check: only verify every 4096
126            // insertions to avoid O(n) on the size estimate.
127            if count.is_multiple_of(4096) {
128                self.check_cap()?;
129            }
130        }
131        self.check_cap()?;
132        Ok(count)
133    }
134
135    /// Verify the in-memory size hasn't exceeded the cap. The
136    /// underlying RoaringBitmap exposes `serialized_size()` which
137    /// is a close proxy for actual heap usage.
138    fn check_cap(&self) -> Result<(), BitmapError> {
139        if self.cap_bytes == 0 {
140            return Ok(());
141        }
142        let current = self.inner.serialized_size();
143        if current > self.cap_bytes {
144            return Err(BitmapError::TooLarge {
145                current,
146                cap: self.cap_bytes,
147            });
148        }
149        Ok(())
150    }
151
152    /// Returns true when the bitmap contains the given row ID.
153    /// O(log n) lookup — Roaring does block search internally.
154    pub fn contains(&self, tid: u32) -> bool {
155        self.inner.contains(tid)
156    }
157
158    /// Number of row IDs in the bitmap.
159    pub fn len(&self) -> u64 {
160        self.inner.len()
161    }
162
163    /// True when the bitmap holds no row IDs.
164    pub fn is_empty(&self) -> bool {
165        self.inner.is_empty()
166    }
167
168    /// In-place AND with another bitmap. Used by the planner's
169    /// `WHERE a = 1 AND b = 2` rewrite: lookup each side, AND
170    /// the results.
171    pub fn intersect_with(&mut self, other: &TidBitmap) {
172        self.inner &= &other.inner;
173    }
174
175    /// In-place OR with another bitmap. Used by `WHERE a OR b`
176    /// patterns and IN list expansion.
177    pub fn union_with(&mut self, other: &TidBitmap) {
178        self.inner |= &other.inner;
179    }
180
181    /// In-place ANDNOT — remove every row ID also present in
182    /// `other`. Used by `WHERE a AND NOT b` patterns and EXCEPT
183    /// queries.
184    pub fn difference_with(&mut self, other: &TidBitmap) {
185        self.inner -= &other.inner;
186    }
187
188    /// Iterate row IDs in sorted ascending order. The heap
189    /// fetcher uses this to read pages sequentially.
190    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
191        self.inner.iter()
192    }
193
194    /// Drain all row IDs in sorted ascending order, consuming
195    /// the bitmap. Equivalent to `iter().collect()` but releases
196    /// the inner storage as soon as iteration completes.
197    pub fn into_sorted_vec(self) -> Vec<u32> {
198        self.inner.into_iter().collect()
199    }
200
201    /// Group row IDs by their containing page number. Returns a
202    /// vector of `(page_id, Vec<row_within_page>)` pairs sorted
203    /// by page_id ascending. The heap fetcher reads each page
204    /// once and extracts every matching row in a single I/O.
205    ///
206    /// `rows_per_page` is the table-specific constant — for
207    /// reddb's default 8 KB pages with ~64-byte rows it's
208    /// roughly 128, but call sites pass the exact value from
209    /// their schema metadata.
210    pub fn group_by_page(&self, rows_per_page: u32) -> Vec<(u32, Vec<u32>)> {
211        if rows_per_page == 0 {
212            return Vec::new();
213        }
214        let mut groups: Vec<(u32, Vec<u32>)> = Vec::new();
215        let mut current_page: Option<u32> = None;
216        let mut current_rows: Vec<u32> = Vec::new();
217        for tid in self.inner.iter() {
218            let page = tid / rows_per_page;
219            let row = tid % rows_per_page;
220            match current_page {
221                Some(p) if p == page => current_rows.push(row),
222                _ => {
223                    if let Some(p) = current_page {
224                        groups.push((p, std::mem::take(&mut current_rows)));
225                    }
226                    current_page = Some(page);
227                    current_rows.push(row);
228                }
229            }
230        }
231        if let Some(p) = current_page {
232            groups.push((p, current_rows));
233        }
234        groups
235    }
236
237    /// Cardinality of the union with another bitmap, computed
238    /// without materialising the union itself. Used by the
239    /// planner's cost estimator to compare AND vs OR rewrites
240    /// without paying the merge cost.
241    pub fn union_cardinality(&self, other: &TidBitmap) -> u64 {
242        self.inner.union_len(&other.inner)
243    }
244
245    /// Cardinality of the intersection with another bitmap, also
246    /// without materialising the result.
247    pub fn intersection_cardinality(&self, other: &TidBitmap) -> u64 {
248        self.inner.intersection_len(&other.inner)
249    }
250}
251
252/// Convenience constructor: build a TidBitmap from any iterable
253/// of row IDs. Skips the cap check if the iterator's
254/// `size_hint().1` exceeds the cap to fail fast.
255pub fn from_iter(iter: impl IntoIterator<Item = u32>) -> Result<TidBitmap, BitmapError> {
256    let mut bitmap = TidBitmap::new();
257    bitmap.extend_from_iter(iter)?;
258    Ok(bitmap)
259}
260
261/// Build the AND of any number of bitmaps. Empty input yields
262/// an empty bitmap. Single-element input returns the bitmap
263/// unchanged. Order doesn't matter (AND is commutative) but
264/// for cost reasons callers should pass the most-selective
265/// bitmap first to minimise intermediate state.
266pub fn intersect_all(mut bitmaps: Vec<TidBitmap>) -> TidBitmap {
267    if bitmaps.is_empty() {
268        return TidBitmap::new();
269    }
270    let mut acc = bitmaps.remove(0);
271    for b in bitmaps {
272        acc.intersect_with(&b);
273        if acc.is_empty() {
274            // Short-circuit: empty intersection stays empty.
275            return acc;
276        }
277    }
278    acc
279}
280
281/// Build the OR of any number of bitmaps. Empty input yields
282/// an empty bitmap. Single-element input returns it unchanged.
283pub fn union_all(bitmaps: Vec<TidBitmap>) -> TidBitmap {
284    let mut iter = bitmaps.into_iter();
285    let Some(mut acc) = iter.next() else {
286        return TidBitmap::new();
287    };
288    for b in iter {
289        acc.union_with(&b);
290    }
291    acc
292}
reddb_server/storage/index/tid_bitmap.rs

reddb_server/storage/index/
tid_bitmap.rs