reddb_server/storage/btree/visibility_map.rs
//! Visibility map — Phase 5 P2 building block.
//!
//! Provides a `VisibilityMap` data structure that tracks per-page
//! "all-visible" status: a single bit per heap page recording
//! whether every row on that page is visible to every concurrent
//! transaction. When the bit is set, the planner can answer
//! certain queries without ever fetching the heap row — index-
//! only scans return the indexed columns directly.
//!
//! Mirrors PG's `visibilitymap.c` modulo features we don't have:
//!
//! - **Crash recovery**: PG's vmap is durable via WAL. Ours is
//!   memory-only for now; on restart every page resets to
//!   "not all-visible" and gets re-marked lazily as queries
//!   verify rows.
//! - **All-frozen bit**: PG tracks a second bit per page
//!   ("all-frozen") used by the freeze map for vacuum
//!   optimisation. We omit it — reddb doesn't have freeze
//!   semantics yet.
//! - **Concurrent updates**: PG uses LWLocks per buffer slot
//!   for fine-grained concurrency. Ours uses a single RwLock
//!   over the bitmap. Acceptable for Week 5; later weeks can
//!   shard by page range when contention shows up.
//!
//! ## Why it matters
//!
//! Index-only scans are the killer perf win for "narrow"
//! queries that select only indexed columns:
//!
//! ```text
//! SELECT user_id FROM users WHERE email = ?
//! ```
//!
//! With an index on `email` covering `user_id`, the planner can
//! skip the heap fetch entirely. But that's only safe when the
//! tuple's MVCC visibility hasn't changed since the last vacuum
//! — i.e. the page is "all-visible".
//!
//! Without a vmap, every index-only scan candidate must double-
//! check by fetching the heap row anyway, defeating the point.
//! With a vmap, the per-page bit is a single memory access.
//!
//! ## Marking strategy
//!
//! - **Set** the all-visible bit when:
//!   - VACUUM determines every row on the page is visible to
//!     every active xmin (no in-flight inserts / updates).
//!   - A bulk-load operation atomically writes a new page where
//!     every tuple is committed.
//! - **Clear** the all-visible bit when:
//!   - Any tuple on the page is updated, deleted, or inserted.
//!   - Even on a no-op WAL replay: clearing is conservative and
//!     always safe.
//!
//! reddb's MVCC version chains live in the btree itself
//! (`storage/btree/node.rs:300`), not in row headers, so the
//! "page" concept here is an *abstract* page — call sites use
//! the entity ID divided by `entries_per_page` (~256 for 8 KB
//! pages) as the page index.
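//!
//! ## Example
//!
//! A sketch of the planner-side check. `fetch_heap_row` and
//! `index_tuple` are illustrative stand-ins, not real reddb
//! APIs:
//!
//! ```rust,ignore
//! let page = page_of(entity_id, 256);
//! if vmap.is_all_visible(page) {
//!     // Safe to trust the index tuple without a heap fetch.
//!     return Ok(index_tuple);
//! }
//! // Fall back to the heap row for a full MVCC visibility check.
//! let row = fetch_heap_row(entity_id)?;
//! ```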

use std::sync::RwLock;

/// Per-page visibility tracking bitmap. Pages are indexed by a
/// dense u32; indexes beyond the current bitmap read as "not
/// all-visible" until `mark_all_visible` lazily extends it.
pub struct VisibilityMap {
    /// Bit-packed visibility bits, indexed by page number.
    /// `bits[i / 64] & (1 << (i % 64))` set means page `i` is
    /// all-visible.
    bits: RwLock<Vec<u64>>,
}

impl VisibilityMap {
    /// Create an empty visibility map. Initial capacity is
    /// minimal; pages get added as `mark_all_visible` extends
    /// the bitmap.
    pub fn new() -> Self {
        Self {
            bits: RwLock::new(Vec::new()),
        }
    }

    /// Pre-allocate room for `pages` pages. Useful when the
    /// caller knows the table size up-front (e.g. ANALYZE of
    /// freshly imported data).
    pub fn with_capacity(pages: u32) -> Self {
        let words = (pages as usize).div_ceil(64);
        Self {
            bits: RwLock::new(vec![0u64; words]),
        }
    }

    /// Returns true when `page` is marked all-visible. Page
    /// indexes beyond the current bitmap return false (treated
    /// as "unknown / not-visible").
    pub fn is_all_visible(&self, page: u32) -> bool {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        let word_idx = page as usize / 64;
        if word_idx >= bits.len() {
            return false;
        }
        let bit_idx = page as usize % 64;
        (bits[word_idx] >> bit_idx) & 1 == 1
    }

    /// Mark `page` as all-visible. Extends the bitmap on demand.
    /// Idempotent — calling twice has the same effect as calling
    /// once.
    pub fn mark_all_visible(&self, page: u32) {
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        let word_idx = page as usize / 64;
        if word_idx >= bits.len() {
            bits.resize(word_idx + 1, 0);
        }
        let bit_idx = page as usize % 64;
        bits[word_idx] |= 1u64 << bit_idx;
    }

    /// Clear the all-visible bit for `page`. Called by every
    /// write path that touches the page (insert / update /
    /// delete). Cheap no-op when the page wasn't marked or
    /// doesn't exist in the bitmap yet.
    pub fn clear_all_visible(&self, page: u32) {
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        let word_idx = page as usize / 64;
        if word_idx >= bits.len() {
            // Page doesn't exist yet — implicit clear, nothing
            // to do.
            return;
        }
        let bit_idx = page as usize % 64;
        bits[word_idx] &= !(1u64 << bit_idx);
    }

    /// Number of all-visible pages currently tracked.
    pub fn all_visible_count(&self) -> u64 {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        bits.iter().map(|w| w.count_ones() as u64).sum()
    }

    /// Total number of pages the bitmap can address (capacity,
    /// not "set count"). Mostly useful for diagnostics and
    /// memory accounting.
    pub fn capacity_pages(&self) -> u64 {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        (bits.len() as u64) * 64
    }

    /// Reset the entire bitmap to "not all-visible". Used by
    /// crash recovery and DROP TABLE.
    pub fn clear(&self) {
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        for w in bits.iter_mut() {
            *w = 0;
        }
    }

    /// Bulk-mark a contiguous, half-open range of pages
    /// `[start, end)` as all-visible. Used by VACUUM after a
    /// successful sweep of a page range.
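    ///
    /// Sketch of the half-open semantics (not compiled as a
    /// doctest):
    ///
    /// ```rust,ignore
    /// let vmap = VisibilityMap::new();
    /// vmap.mark_range_visible(0, 3);
    /// assert!(vmap.is_all_visible(2));  // last page in range
    /// assert!(!vmap.is_all_visible(3)); // `end` is exclusive
    /// ```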
    pub fn mark_range_visible(&self, start: u32, end: u32) {
        if start >= end {
            return;
        }
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        let last_word = (end as usize - 1) / 64;
        if last_word >= bits.len() {
            bits.resize(last_word + 1, 0);
        }
        // Per-page loop for now; whole-word masking is a later
        // optimisation if VACUUM ranges get large.
        for page in start..end {
            let word_idx = page as usize / 64;
            let bit_idx = page as usize % 64;
            bits[word_idx] |= 1u64 << bit_idx;
        }
    }

    /// Collect `(page, all_visible)` pairs for the first
    /// `limit_pages` pages. Diagnostic / debugging helper.
    pub fn snapshot(&self, limit_pages: u32) -> Vec<(u32, bool)> {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        let mut out = Vec::with_capacity(limit_pages as usize);
        for page in 0..limit_pages {
            let word_idx = page as usize / 64;
            let visible = if word_idx < bits.len() {
                let bit_idx = page as usize % 64;
                (bits[word_idx] >> bit_idx) & 1 == 1
            } else {
                false
            };
            out.push((page, visible));
        }
        out
    }
}

impl Default for VisibilityMap {
    fn default() -> Self {
        Self::new()
    }
}

/// Helper: convert an entity ID to a page index using the given
/// `rows_per_page` constant. The btree's MVCC version chain
/// doesn't actually map onto fixed-size pages, so this is an
/// abstraction layer that lets the planner reason about
/// "page-shaped" visibility regions without committing to a
/// specific physical layout.
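///
/// A quick sketch of the math (not compiled as a doctest):
///
/// ```rust,ignore
/// assert_eq!(page_of(0, 256), 0);    // first page
/// assert_eq!(page_of(255, 256), 0);  // still page 0
/// assert_eq!(page_of(256, 256), 1);  // first entity on page 1
/// ```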
pub fn page_of(entity_id: u64, rows_per_page: u32) -> u32 {
    if rows_per_page == 0 {
        // Degenerate configuration: collapse everything onto
        // page 0 rather than dividing by zero.
        return 0;
    }
    (entity_id / rows_per_page as u64) as u32
}

/// Phase 3.5 wiring callback. The btree write path calls this
/// after every insert / update / delete to clear the all-visible
/// bit for the affected page. Centralised so a single function
/// can be hooked into multiple write call sites without each
/// rewriting the page-of math.
pub fn mark_dirty_after_write(vmap: &VisibilityMap, entity_id: u64, rows_per_page: u32) {
    let page = page_of(entity_id, rows_per_page);
    vmap.clear_all_visible(page);
}

/// Phase 3.5 wiring callback for VACUUM / GC. After confirming
/// every row in a page range is visible to all active txns, the
/// GC calls this with the half-open `[start_page, end_page)`
/// bounds.
pub fn mark_clean_after_gc(vmap: &VisibilityMap, start_page: u32, end_page: u32) {
    vmap.mark_range_visible(start_page, end_page);
}
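
// Unit tests: a sketch of the intended usage, exercising only the
// API defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn mark_check_clear_roundtrip() {
        let vmap = VisibilityMap::new();
        // Unknown pages read as not all-visible.
        assert!(!vmap.is_all_visible(9));

        // Marking lazily grows the bitmap and the bit sticks.
        vmap.mark_all_visible(9);
        assert!(vmap.is_all_visible(9));
        assert_eq!(vmap.all_visible_count(), 1);

        // Clearing is the conservative write-path action.
        vmap.clear_all_visible(9);
        assert!(!vmap.is_all_visible(9));
    }

    #[test]
    fn range_marking_is_half_open() {
        let vmap = VisibilityMap::with_capacity(64);
        vmap.mark_range_visible(10, 20);
        assert!(vmap.is_all_visible(10));
        assert!(vmap.is_all_visible(19));
        assert!(!vmap.is_all_visible(20)); // `end` is exclusive
        assert_eq!(vmap.all_visible_count(), 10);

        // `clear` resets every page without shrinking capacity.
        vmap.clear();
        assert_eq!(vmap.all_visible_count(), 0);
        assert!(vmap.capacity_pages() >= 64);
    }

    #[test]
    fn wiring_callbacks_follow_page_math() {
        let vmap = VisibilityMap::new();
        // GC marks a swept page range clean...
        mark_clean_after_gc(&vmap, 0, 4);
        assert!(vmap.is_all_visible(page_of(700, 256))); // page 2

        // ...and any write to an entity dirties its page again.
        mark_dirty_after_write(&vmap, 700, 256);
        assert!(!vmap.is_all_visible(2));
        assert!(vmap.is_all_visible(1)); // neighbours unaffected
    }
}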