// reddb_server/storage/btree/visibility_map.rs

//! Visibility map — Phase 5 P2 building block.
//!
//! Provides a `VisibilityMap` data structure that tracks per-page
//! "all-visible" status: a single bit per heap page recording
//! whether every row on that page is visible to every concurrent
//! transaction. When the bit is set, the planner can answer
//! certain queries without ever fetching the heap row — index-
//! only scans return the indexed columns directly.
//!
//! Mirrors PG's `visibilitymap.c` modulo features we don't have:
//!
//! - **Crash recovery**: PG's vmap is durable via WAL. Ours is
//!   memory-only for now; on restart every page resets to
//!   "not all-visible" and gets re-marked lazily as queries
//!   verify rows.
//! - **All-frozen bit**: PG tracks a second bit per page
//!   ("all-frozen") used by the freeze map for vacuum
//!   optimisation. We omit it — reddb doesn't have freeze
//!   semantics yet.
//! - **Concurrent updates**: PG uses LWLocks per buffer slot
//!   for fine-grained concurrency. Ours uses a single RwLock
//!   over the bitmap. Acceptable for Week 5; later weeks can
//!   shard by page range when contention shows up.
//!
//! ## Why it matters
//!
//! Index-only scans are the killer perf win for "narrow"
//! queries that select only indexed columns:
//!
//! ```text
//! SELECT user_id FROM users WHERE email = ?
//! ```
//!
//! With an index on `email` covering `user_id`, the planner can
//! skip the heap fetch entirely. But that's only safe when the
//! tuple's MVCC visibility hasn't changed since the last vacuum
//! — i.e. the page is "all-visible".
//!
//! Without a vmap, every index-only scan candidate must double-
//! check by fetching the heap row anyway, defeating the point.
//! With a vmap, the per-page bit is a single memory access.
//!
//! ## Marking strategy
//!
//! - **Set** the all-visible bit when:
//!   - VACUUM determines every row on the page is visible to
//!     every active xmin (no in-flight inserts / updates).
//!   - A bulk-load operation atomically writes a new page where
//!     every tuple is committed.
//! - **Clear** the all-visible bit when:
//!   - Any tuple on the page is updated, deleted, or inserted.
//!   - Even on a no-op WAL replay: clearing is conservative.
//!
//! reddb's MVCC version chains live in the btree itself
//! (`storage/btree/node.rs:300`), not in row headers, so the
//! "page" concept here is an *abstract* page — call sites use
//! the entity ID divided by `rows_per_page` (~256 for 8 KB
//! pages) as the page index.
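//!
//! ## Usage sketch
//!
//! A minimal sketch of how the read path fits together. The
//! `index_lookup` and `heap_fetch` calls are hypothetical stand-ins
//! for the real executor plumbing, not APIs from this module:
//!
//! ```ignore
//! let vmap = VisibilityMap::new();
//! let rows_per_page = 256;
//!
//! // VACUUM verified pages 0..4, so index-only scans may trust them.
//! vmap.mark_range_visible(0, 4);
//!
//! let (entity_id, user_id) = index_lookup(email)?; // hypothetical
//! if vmap.is_all_visible(page_of(entity_id, rows_per_page)) {
//!     return Ok(user_id); // indexed columns only, no heap fetch
//! }
//! // Page not known all-visible: fall back to the heap row to
//! // re-check MVCC visibility.
//! let row = heap_fetch(entity_id)?; // hypothetical
//! ```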

use std::sync::RwLock;

/// Per-page visibility tracking bitmap. Pages are indexed by a
/// dense u32 — sparse page indexes can overshoot the bitmap, in
/// which case the write paths (`mark_all_visible`,
/// `mark_range_visible`) lazily resize it.
pub struct VisibilityMap {
    /// Bit-packed visibility bits, indexed by page number.
    /// `bits[i / 64] & (1 << (i % 64))` set means page `i` is
    /// all-visible; e.g. page 70 lives in `bits[1]`, bit 6.
    bits: RwLock<Vec<u64>>,
}

impl VisibilityMap {
    /// Create an empty visibility map. Initial capacity is
    /// minimal; pages get added as `mark_all_visible` extends
    /// the bitmap.
    pub fn new() -> Self {
        Self {
            bits: RwLock::new(Vec::new()),
        }
    }

    /// Pre-allocate room for `pages` pages. Useful when the
    /// caller knows the table size up-front (e.g. ANALYZE on
    /// freshly imported data).
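    ///
    /// Capacity rounds up to whole 64-bit words. A sketch of the
    /// rounding (import path assumed; adjust to the crate layout):
    ///
    /// ```ignore
    /// use reddb_server::storage::btree::visibility_map::VisibilityMap;
    ///
    /// let vmap = VisibilityMap::with_capacity(100);
    /// // 100 pages need ceil(100 / 64) = 2 words = 128 addressable bits.
    /// assert_eq!(vmap.capacity_pages(), 128);
    /// ```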
    pub fn with_capacity(pages: u32) -> Self {
        let words = (pages as usize).div_ceil(64);
        Self {
            bits: RwLock::new(vec![0u64; words]),
        }
    }

    /// Returns true when `page` is marked all-visible. Page
    /// indexes beyond the current bitmap return false (treated
    /// as "unknown / not-visible").
    pub fn is_all_visible(&self, page: u32) -> bool {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        let word_idx = page as usize / 64;
        if word_idx >= bits.len() {
            return false;
        }
        let bit_idx = page as usize % 64;
        (bits[word_idx] >> bit_idx) & 1 == 1
    }

    /// Mark `page` as all-visible. Extends the bitmap on demand.
    /// Idempotent — calling twice has the same effect as calling
    /// once.
    pub fn mark_all_visible(&self, page: u32) {
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        let word_idx = page as usize / 64;
        if word_idx >= bits.len() {
            bits.resize(word_idx + 1, 0);
        }
        let bit_idx = page as usize % 64;
        bits[word_idx] |= 1u64 << bit_idx;
    }

    /// Clear the all-visible bit for `page`. Called by every
    /// write path that touches the page (insert / update /
    /// delete). Cheap no-op when the page wasn't marked or
    /// doesn't exist in the bitmap yet.
    pub fn clear_all_visible(&self, page: u32) {
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        let word_idx = page as usize / 64;
        if word_idx >= bits.len() {
            // Page doesn't exist yet — implicit clear, nothing
            // to do.
            return;
        }
        let bit_idx = page as usize % 64;
        bits[word_idx] &= !(1u64 << bit_idx);
    }

    /// Number of all-visible pages currently tracked.
    pub fn all_visible_count(&self) -> u64 {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        bits.iter().map(|w| w.count_ones() as u64).sum()
    }

    /// Total number of pages the bitmap can address (capacity,
    /// not "set count"). Mostly useful for diagnostics and
    /// memory accounting.
    pub fn capacity_pages(&self) -> u64 {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        (bits.len() as u64) * 64
    }

    /// Reset the entire bitmap to "not all-visible". Used by
    /// crash recovery and DROP TABLE.
    pub fn clear(&self) {
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        for w in bits.iter_mut() {
            *w = 0;
        }
    }

    /// Bulk-mark the contiguous half-open page range `start..end`
    /// as all-visible. Used by VACUUM after a successful sweep
    /// of a page range.
    pub fn mark_range_visible(&self, start: u32, end: u32) {
        if start >= end {
            return;
        }
        let mut bits = self.bits.write().expect("vmap rwlock poisoned");
        let last_word = (end as usize - 1) / 64;
        if last_word >= bits.len() {
            bits.resize(last_word + 1, 0);
        }
        for page in start..end {
            let word_idx = page as usize / 64;
            let bit_idx = page as usize % 64;
            bits[word_idx] |= 1u64 << bit_idx;
        }
    }

    /// Collect `(page, all_visible)` pairs for the first
    /// `limit_pages` pages. Diagnostic / debugging helper.
    pub fn snapshot(&self, limit_pages: u32) -> Vec<(u32, bool)> {
        let bits = self.bits.read().expect("vmap rwlock poisoned");
        let mut out = Vec::with_capacity(limit_pages as usize);
        for page in 0..limit_pages {
            let word_idx = page as usize / 64;
            let visible = if word_idx < bits.len() {
                let bit_idx = page as usize % 64;
                (bits[word_idx] >> bit_idx) & 1 == 1
            } else {
                false
            };
            out.push((page, visible));
        }
        out
    }
}

impl Default for VisibilityMap {
    fn default() -> Self {
        Self::new()
    }
}

/// Helper: convert an entity ID to a page index using the given
/// `rows_per_page` constant. The btree's MVCC version chain
/// doesn't actually map onto fixed-size pages, so this is an
/// abstraction layer that lets the planner reason about
/// "page-shaped" visibility regions without committing to a
/// specific physical layout. A `rows_per_page` of 0 maps every
/// entity to page 0 (guards against division by zero).
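///
/// A small worked example with the ~256 rows-per-page figure from
/// the module docs (import path assumed; adjust to the crate layout):
///
/// ```ignore
/// use reddb_server::storage::btree::visibility_map::page_of;
///
/// assert_eq!(page_of(0, 256), 0);
/// assert_eq!(page_of(255, 256), 0); // last row on page 0
/// assert_eq!(page_of(256, 256), 1); // first row on page 1
/// ```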
pub fn page_of(entity_id: u64, rows_per_page: u32) -> u32 {
    if rows_per_page == 0 {
        return 0;
    }
    (entity_id / rows_per_page as u64) as u32
}

/// Phase 3.5 wiring callback. The btree write path calls this
/// after every insert / update / delete to clear the all-visible
/// bit for the affected page. Centralised so a single function
/// can be hooked into multiple write call sites without each
/// rewriting the page-of math.
pub fn mark_dirty_after_write(vmap: &VisibilityMap, entity_id: u64, rows_per_page: u32) {
    let page = page_of(entity_id, rows_per_page);
    vmap.clear_all_visible(page);
}

/// Phase 3.5 wiring callback for VACUUM / GC. After confirming
/// every row in a page range is visible to all active txns, the
/// GC calls this with the `(start, end)` page bounds (end
/// exclusive).
pub fn mark_clean_after_gc(vmap: &VisibilityMap, start_page: u32, end_page: u32) {
    vmap.mark_range_visible(start_page, end_page);
}
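
// A minimal test sketch exercising the mark / clear lifecycle through
// the wiring callbacks above; it uses only the APIs defined in this
// module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn write_clears_and_gc_restores_visibility() {
        let vmap = VisibilityMap::new();
        let rows_per_page = 256;

        // GC sweep marks pages 0..3 all-visible (end exclusive).
        mark_clean_after_gc(&vmap, 0, 3);
        assert!(vmap.is_all_visible(0));
        assert!(vmap.is_all_visible(2));
        assert!(!vmap.is_all_visible(3));
        assert_eq!(vmap.all_visible_count(), 3);

        // A write to entity 300 lands on page 300 / 256 = 1 and
        // clears only that page's bit.
        mark_dirty_after_write(&vmap, 300, rows_per_page);
        assert!(!vmap.is_all_visible(1));
        assert!(vmap.is_all_visible(0));
        assert_eq!(vmap.all_visible_count(), 2);

        // A later GC sweep over pages 1..2 restores the bit.
        mark_clean_after_gc(&vmap, 1, 2);
        assert!(vmap.is_all_visible(1));
    }
}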