// reddb_server/storage/index/tid_bitmap.rs
1//! TID bitmap — Fase 5 P6 building block.
2//!
3//! Provides a `TidBitmap` data structure that represents a set of
4//! row IDs (TIDs) compactly using a `roaring::RoaringBitmap`
5//! backing store. Multiple bitmaps from different indexes can be
6//! AND/OR/NOT combined to express multi-predicate scans, then
7//! drained in sorted order so the heap fetcher can read pages
8//! sequentially instead of jumping around randomly.
9//!
10//! Mirrors PG's `tidbitmap.c` modulo features we don't have:
11//!
12//! - **Lossy mode**: PG flips entries to "page-only" when the
13//! bitmap grows past `work_mem`. We use a hard cap and report
14//! `BitmapTooLarge` when crossed; falling back to sequential
15//! scan is the caller's call.
16//! - **Hash-of-hash sub-bitmaps**: PG uses a 2-level structure
17//! keyed by block number → row-within-block. RoaringBitmap
18//! already does block-of-bits compression internally so we
19//! skip the explicit block layer.
20//! - **Cross-process sharing**: PG tidbitmaps live in shared
21//! memory for parallel bitmap heap scans. Single-process for
22//! now.
23//!
24//! The module is **not yet wired** into a query plan node. Wiring
25//! into the canonical plan (Fase 5 W2+) requires a `BitmapIndex`
26//! plan operator that produces a TidBitmap and a `BitmapHeap`
27//! operator that consumes one — both pending.
28//!
29//! ## Why bitmaps win
30//!
31//! For a query like `WHERE a = 1 AND b = 2` with separate hash
32//! indexes on `a` and `b`:
33//!
34//! - **Old path**: pick one index (say `a`), look up matching
35//! row IDs, fetch each row from the heap, evaluate `b = 2`
36//! per row. Random heap I/O for each match.
37//! - **Bitmap path**: lookup `a`'s index → `bitmap_a`, lookup
38//! `b`'s index → `bitmap_b`, AND them in CPU → `intersection`,
39//! walk `intersection` in sorted order and fetch rows. The
40//! sequential-by-page walk turns random reads into prefetch-
41//! friendly streaming I/O.
42//!
43//! On `WHERE a IN (1,2,3) OR b > 5`, the OR equivalent is:
44//!
45//! - `bitmap_a_eq_1 OR bitmap_a_eq_2 OR bitmap_a_eq_3 OR
46//! bitmap_b_gt_5` — four index lookups, three OR operations,
47//! one final heap walk.
48
49use roaring::RoaringBitmap;
50
/// A sparse, sorted set of row IDs backed by a Roaring bitmap.
/// Row IDs are `u32` because Roaring's API is u32-native; tables
/// with more than 4 billion rows need to partition by another
/// dimension (typically segment id) and use multiple bitmaps.
///
/// NOTE: the derived `Default` zero-initialises `cap_bytes`,
/// which `check_cap` treats as "cap disabled" — use `new()` to
/// get the 32 MB default cap.
#[derive(Debug, Clone, Default)]
pub struct TidBitmap {
    /// Compressed set of row IDs.
    inner: RoaringBitmap,
    /// Soft size cap in **bytes**, compared against the bitmap's
    /// `serialized_size()`. Crossing this makes mutating
    /// operations return `BitmapError::TooLarge`; 0 disables the
    /// cap. `new()` defaults to 32 MB, which is ~256 million
    /// densely packed row IDs at one bit each before compression.
    cap_bytes: usize,
}
63
/// Errors raised by bitmap operations.
#[derive(Debug)]
pub enum BitmapError {
    /// The bitmap exceeded its configured size cap. Caller
    /// should fall back to sequential scan or split the
    /// predicate further. `current` is the offending serialized
    /// size in bytes; `cap` is the configured limit in bytes.
    TooLarge { current: usize, cap: usize },
}
72
73impl std::fmt::Display for BitmapError {
74 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75 match self {
76 Self::TooLarge { current, cap } => {
77 write!(f, "tid bitmap {current} bytes exceeds cap {cap}")
78 }
79 }
80 }
81}
82
// Marker impl so BitmapError can travel as `Box<dyn Error>` and
// through `?` conversions; Debug/Display supply the required methods.
impl std::error::Error for BitmapError {}
84
85impl TidBitmap {
86 /// Create an empty bitmap with the default 32 MB cap.
87 pub fn new() -> Self {
88 Self {
89 inner: RoaringBitmap::new(),
90 cap_bytes: 32 * 1024 * 1024,
91 }
92 }
93
94 /// Create an empty bitmap with a custom cap (bytes). Use 0
95 /// to disable the cap entirely — useful for tests and
96 /// in-memory benchmarks.
97 pub fn with_cap_bytes(cap_bytes: usize) -> Self {
98 Self {
99 inner: RoaringBitmap::new(),
100 cap_bytes,
101 }
102 }
103
104 /// Insert a row ID. Returns `BitmapError::TooLarge` if the
105 /// resulting size exceeds the configured cap; the row is
106 /// NOT inserted in that case so the caller can recover by
107 /// switching strategies.
108 pub fn insert(&mut self, tid: u32) -> Result<bool, BitmapError> {
109 let added = self.inner.insert(tid);
110 self.check_cap()?;
111 Ok(added)
112 }
113
114 /// Bulk insert from any iterator of row IDs. Stops on the
115 /// first cap violation and returns the number of IDs that
116 /// were successfully inserted before the cap was hit.
117 pub fn extend_from_iter(
118 &mut self,
119 iter: impl IntoIterator<Item = u32>,
120 ) -> Result<usize, BitmapError> {
121 let mut count = 0usize;
122 for tid in iter {
123 self.inner.insert(tid);
124 count += 1;
125 // Amortise the cap check: only verify every 4096
126 // insertions to avoid O(n) on the size estimate.
127 if count.is_multiple_of(4096) {
128 self.check_cap()?;
129 }
130 }
131 self.check_cap()?;
132 Ok(count)
133 }
134
135 /// Verify the in-memory size hasn't exceeded the cap. The
136 /// underlying RoaringBitmap exposes `serialized_size()` which
137 /// is a close proxy for actual heap usage.
138 fn check_cap(&self) -> Result<(), BitmapError> {
139 if self.cap_bytes == 0 {
140 return Ok(());
141 }
142 let current = self.inner.serialized_size();
143 if current > self.cap_bytes {
144 return Err(BitmapError::TooLarge {
145 current,
146 cap: self.cap_bytes,
147 });
148 }
149 Ok(())
150 }
151
152 /// Returns true when the bitmap contains the given row ID.
153 /// O(log n) lookup — Roaring does block search internally.
154 pub fn contains(&self, tid: u32) -> bool {
155 self.inner.contains(tid)
156 }
157
158 /// Number of row IDs in the bitmap.
159 pub fn len(&self) -> u64 {
160 self.inner.len()
161 }
162
163 /// True when the bitmap holds no row IDs.
164 pub fn is_empty(&self) -> bool {
165 self.inner.is_empty()
166 }
167
168 /// In-place AND with another bitmap. Used by the planner's
169 /// `WHERE a = 1 AND b = 2` rewrite: lookup each side, AND
170 /// the results.
171 pub fn intersect_with(&mut self, other: &TidBitmap) {
172 self.inner &= &other.inner;
173 }
174
175 /// In-place OR with another bitmap. Used by `WHERE a OR b`
176 /// patterns and IN list expansion.
177 pub fn union_with(&mut self, other: &TidBitmap) {
178 self.inner |= &other.inner;
179 }
180
181 /// In-place ANDNOT — remove every row ID also present in
182 /// `other`. Used by `WHERE a AND NOT b` patterns and EXCEPT
183 /// queries.
184 pub fn difference_with(&mut self, other: &TidBitmap) {
185 self.inner -= &other.inner;
186 }
187
188 /// Iterate row IDs in sorted ascending order. The heap
189 /// fetcher uses this to read pages sequentially.
190 pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
191 self.inner.iter()
192 }
193
194 /// Drain all row IDs in sorted ascending order, consuming
195 /// the bitmap. Equivalent to `iter().collect()` but releases
196 /// the inner storage as soon as iteration completes.
197 pub fn into_sorted_vec(self) -> Vec<u32> {
198 self.inner.into_iter().collect()
199 }
200
201 /// Group row IDs by their containing page number. Returns a
202 /// vector of `(page_id, Vec<row_within_page>)` pairs sorted
203 /// by page_id ascending. The heap fetcher reads each page
204 /// once and extracts every matching row in a single I/O.
205 ///
206 /// `rows_per_page` is the table-specific constant — for
207 /// reddb's default 8 KB pages with ~64-byte rows it's
208 /// roughly 128, but call sites pass the exact value from
209 /// their schema metadata.
210 pub fn group_by_page(&self, rows_per_page: u32) -> Vec<(u32, Vec<u32>)> {
211 if rows_per_page == 0 {
212 return Vec::new();
213 }
214 let mut groups: Vec<(u32, Vec<u32>)> = Vec::new();
215 let mut current_page: Option<u32> = None;
216 let mut current_rows: Vec<u32> = Vec::new();
217 for tid in self.inner.iter() {
218 let page = tid / rows_per_page;
219 let row = tid % rows_per_page;
220 match current_page {
221 Some(p) if p == page => current_rows.push(row),
222 _ => {
223 if let Some(p) = current_page {
224 groups.push((p, std::mem::take(&mut current_rows)));
225 }
226 current_page = Some(page);
227 current_rows.push(row);
228 }
229 }
230 }
231 if let Some(p) = current_page {
232 groups.push((p, current_rows));
233 }
234 groups
235 }
236
237 /// Cardinality of the union with another bitmap, computed
238 /// without materialising the union itself. Used by the
239 /// planner's cost estimator to compare AND vs OR rewrites
240 /// without paying the merge cost.
241 pub fn union_cardinality(&self, other: &TidBitmap) -> u64 {
242 self.inner.union_len(&other.inner)
243 }
244
245 /// Cardinality of the intersection with another bitmap, also
246 /// without materialising the result.
247 pub fn intersection_cardinality(&self, other: &TidBitmap) -> u64 {
248 self.inner.intersection_len(&other.inner)
249 }
250}
251
/// Convenience constructor: build a TidBitmap from any iterable
/// of row IDs using the default 32 MB cap. Delegates to
/// `extend_from_iter`, so a cap violation surfaces as
/// `BitmapError::TooLarge` and the partially built bitmap is
/// dropped. (No `size_hint()`-based fast-fail is performed: the
/// hint counts elements while the cap is measured in bytes, so
/// the two are not directly comparable.)
pub fn from_iter(iter: impl IntoIterator<Item = u32>) -> Result<TidBitmap, BitmapError> {
    let mut bitmap = TidBitmap::new();
    bitmap.extend_from_iter(iter)?;
    Ok(bitmap)
}
260
261/// Build the AND of any number of bitmaps. Empty input yields
262/// an empty bitmap. Single-element input returns the bitmap
263/// unchanged. Order doesn't matter (AND is commutative) but
264/// for cost reasons callers should pass the most-selective
265/// bitmap first to minimise intermediate state.
266pub fn intersect_all(mut bitmaps: Vec<TidBitmap>) -> TidBitmap {
267 if bitmaps.is_empty() {
268 return TidBitmap::new();
269 }
270 let mut acc = bitmaps.remove(0);
271 for b in bitmaps {
272 acc.intersect_with(&b);
273 if acc.is_empty() {
274 // Short-circuit: empty intersection stays empty.
275 return acc;
276 }
277 }
278 acc
279}
280
281/// Build the OR of any number of bitmaps. Empty input yields
282/// an empty bitmap. Single-element input returns it unchanged.
283pub fn union_all(bitmaps: Vec<TidBitmap>) -> TidBitmap {
284 let mut iter = bitmaps.into_iter();
285 let Some(mut acc) = iter.next() else {
286 return TidBitmap::new();
287 };
288 for b in iter {
289 acc.union_with(&b);
290 }
291 acc
292}