sqlrite/sql/pager/freelist.rs
1//! Persisted free-page list (SQLR-6).
2//!
3//! After `DROP TABLE` / `DROP INDEX` / `ALTER TABLE DROP COLUMN`, the next
4//! `save_database` no longer references the dropped object's pages. Without
5//! a freelist those pages would either be re-numbered (every later table
6//! shifts down → high write amplification) or stranded as orphans on disk.
7//!
8//! The freelist solves both: pages no longer referenced from `sqlrite_master`
9//! go on a persisted stack rooted at `header.freelist_head`, so subsequent
10//! saves can pull from there before extending the file. The unrelated tables
11//! that didn't change keep their page numbers and their pages re-stage byte-
12//! identical → the diff pager skips writing them.
13//!
14//! ## On-disk layout
15//!
16//! Each freelist trunk page carries the standard 7-byte page header with
17//! `page_type = PAGE_TYPE_FREELIST_TRUNK (5)` and `next_page` pointing at the
18//! next trunk in the chain (`0` = end). The 4089-byte payload area holds:
19//!
20//! ```text
21//! 0..2 count: u16 LE — number of free leaf-page IDs that follow
22//! 2..2+4*N page_ids[N]: u32 LE — free leaf-page numbers, ascending
23//! ```
24//!
25//! A trunk holds up to `(PAYLOAD_PER_PAGE - 2) / 4 = 1021` free page IDs.
26//! Larger freelists chain across multiple trunks. The trunk pages themselves
27//! are part of the live page set (they hold metadata) and are *not* on the
28//! freelist they encode.
29
30use std::collections::VecDeque;
31
32use crate::error::{Result, SQLRiteError};
33use crate::sql::pager::page::{PAGE_HEADER_SIZE, PAGE_SIZE, PAYLOAD_PER_PAGE};
34use crate::sql::pager::pager::Pager;
35
36/// Page-type tag for a freelist trunk page. Distinct from existing page tags
37/// (`2 = TableLeaf`, `3 = Overflow`, `4 = InteriorNode`); `1` was retired.
38pub const PAGE_TYPE_FREELIST_TRUNK: u8 = 5;
39
40/// Maximum number of free page IDs a single trunk page can hold.
41/// `PAYLOAD_PER_PAGE` is 4089; we reserve 2 bytes for the count and 4 bytes
42/// per ID, giving `(4089 - 2) / 4 = 1021`.
43pub const FREELIST_IDS_PER_TRUNK: usize = (PAYLOAD_PER_PAGE - 2) / 4;
44
45/// Encodes a single freelist trunk page into the given buffer.
46///
47/// `next_trunk` is the page number of the next trunk in the chain, or `0`
48/// to mark the end. `page_ids` must have at most `FREELIST_IDS_PER_TRUNK`
49/// entries.
50pub fn encode_trunk(buf: &mut [u8; PAGE_SIZE], next_trunk: u32, page_ids: &[u32]) -> Result<()> {
51 if page_ids.len() > FREELIST_IDS_PER_TRUNK {
52 return Err(SQLRiteError::Internal(format!(
53 "freelist trunk overflow: {} ids exceeds capacity {}",
54 page_ids.len(),
55 FREELIST_IDS_PER_TRUNK
56 )));
57 }
58 // Zero out the buffer; trailing payload bytes after the encoded IDs are
59 // unused and must be deterministic so the diff pager can skip an
60 // unchanged trunk.
61 buf.fill(0);
62 buf[0] = PAGE_TYPE_FREELIST_TRUNK;
63 buf[1..5].copy_from_slice(&next_trunk.to_le_bytes());
64 // Per-page `payload_length` field (bytes 5..7) is unused for trunks —
65 // the count field inside the payload self-describes the entries.
66 buf[5..7].copy_from_slice(&0u16.to_le_bytes());
67 let count = page_ids.len() as u16;
68 buf[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + 2].copy_from_slice(&count.to_le_bytes());
69 let mut off = PAGE_HEADER_SIZE + 2;
70 for id in page_ids {
71 buf[off..off + 4].copy_from_slice(&id.to_le_bytes());
72 off += 4;
73 }
74 Ok(())
75}
76
77/// Decodes one freelist trunk page. Returns `(next_trunk, page_ids)`.
78fn decode_trunk(buf: &[u8; PAGE_SIZE]) -> Result<(u32, Vec<u32>)> {
79 if buf[0] != PAGE_TYPE_FREELIST_TRUNK {
80 return Err(SQLRiteError::General(format!(
81 "expected freelist trunk page (tag {PAGE_TYPE_FREELIST_TRUNK}), got tag {}",
82 buf[0]
83 )));
84 }
85 let next_trunk = u32::from_le_bytes(buf[1..5].try_into().unwrap());
86 let count = u16::from_le_bytes(
87 buf[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + 2]
88 .try_into()
89 .unwrap(),
90 ) as usize;
91 if count > FREELIST_IDS_PER_TRUNK {
92 return Err(SQLRiteError::General(format!(
93 "freelist trunk count {count} exceeds capacity {FREELIST_IDS_PER_TRUNK}"
94 )));
95 }
96 let mut ids = Vec::with_capacity(count);
97 let mut off = PAGE_HEADER_SIZE + 2;
98 for _ in 0..count {
99 let id = u32::from_le_bytes(buf[off..off + 4].try_into().unwrap());
100 ids.push(id);
101 off += 4;
102 }
103 Ok((next_trunk, ids))
104}
105
106/// Walks the freelist chain rooted at `head` and returns every free leaf-page
107/// ID. The trunk pages themselves are *not* included — they're live metadata.
108/// Returns the trunk page numbers separately so the caller can release them
109/// (the next save re-encodes the freelist from scratch and frees the old
110/// trunks for reuse).
111///
112/// `head == 0` → empty freelist; returns `(vec![], vec![])`.
113pub fn read_freelist(pager: &Pager, head: u32) -> Result<(Vec<u32>, Vec<u32>)> {
114 let mut leaves: Vec<u32> = Vec::new();
115 let mut trunks: Vec<u32> = Vec::new();
116 let mut cursor = head;
117 let mut visited: std::collections::HashSet<u32> = std::collections::HashSet::new();
118 while cursor != 0 {
119 if !visited.insert(cursor) {
120 return Err(SQLRiteError::General(format!(
121 "freelist cycle detected at trunk page {cursor}"
122 )));
123 }
124 let buf = pager.read_page(cursor).ok_or_else(|| {
125 SQLRiteError::General(format!(
126 "freelist trunk page {cursor} is past page_count or unreadable"
127 ))
128 })?;
129 let (next, ids) = decode_trunk(buf)?;
130 leaves.extend(ids);
131 trunks.push(cursor);
132 cursor = next;
133 }
134 Ok((leaves, trunks))
135}
136
137/// Stages the freelist chain into the pager. `free_pages` is the full set
138/// of pages that need persisted-on-the-freelist treatment — both the trunk
139/// pages (metadata) and the leaf entries (the actually-free leaves).
140///
141/// The chain consumes some of `free_pages` as its own trunks: a trunk
142/// page IS a free page that's been temporarily borrowed for metadata.
143/// Returns the new `freelist_head` (or `0` if `free_pages` is empty).
144///
145/// Encoding: `T = ceil(N / (IDS_PER_TRUNK + 1))` trunks; each trunk
146/// (except possibly the last) carries `IDS_PER_TRUNK` leaf IDs. Leaves
147/// are stored ascending within each trunk for deterministic on-disk
148/// bytes (so an unchanged freelist re-stages byte-identical → diff skip).
149pub fn stage_freelist(pager: &mut Pager, free_pages: Vec<u32>) -> Result<u32> {
150 if free_pages.is_empty() {
151 return Ok(0);
152 }
153 // Sort + dedup so the on-disk ordering is deterministic. A duplicate
154 // in the freelist would be a serious bug elsewhere (double-free), but
155 // dedupping here is a cheap defensive guardrail.
156 let mut ids = free_pages;
157 ids.sort_unstable();
158 ids.dedup();
159
160 // Solve N = T + L where L = number of leaf slots used, T = number of
161 // trunks, and L ≤ T * IDS_PER_TRUNK. Smallest T satisfying that is
162 // ceil(N / (IDS_PER_TRUNK + 1)) — the +1 accounts for the trunk
163 // page itself absorbing one of the N pages.
164 let n = ids.len();
165 let t = n.div_ceil(FREELIST_IDS_PER_TRUNK + 1);
166
167 // Take the highest-numbered T pages as trunks. This keeps the leaf
168 // IDs ascending within each trunk and matches the "drain low first"
169 // policy of the allocator on subsequent saves.
170 let leaves_count = n - t;
171 let trunk_pages: Vec<u32> = ids.split_off(leaves_count);
172 let leaves = ids;
173
174 // Lay out leaves across trunks, IDS_PER_TRUNK per trunk.
175 let mut chunks: Vec<&[u32]> = leaves.chunks(FREELIST_IDS_PER_TRUNK).collect();
176 // If there are more trunks than chunks (e.g. N=1 → T=1, L=0), pad
177 // with empty chunks so every trunk gets staged.
178 while chunks.len() < trunk_pages.len() {
179 chunks.push(&[]);
180 }
181
182 for (i, chunk) in chunks.iter().enumerate() {
183 let next = if i + 1 < trunk_pages.len() {
184 trunk_pages[i + 1]
185 } else {
186 0
187 };
188 let mut buf = [0u8; PAGE_SIZE];
189 encode_trunk(&mut buf, next, chunk)?;
190 pager.stage_page(trunk_pages[i], buf);
191 }
192
193 Ok(trunk_pages[0])
194}
195
196/// Helper: collect a freelist into a `VecDeque<u32>` sorted ascending —
197/// the ordering the `PageAllocator` uses to draw next free pages from.
198pub fn freelist_to_deque(leaves: Vec<u32>) -> VecDeque<u32> {
199 let mut sorted = leaves;
200 sorted.sort_unstable();
201 sorted.dedup();
202 VecDeque::from(sorted)
203}
204
205/// Auto-VACUUM (SQLR-10) does not fire on databases below this many
206/// pages. The 4 KiB page size makes 16 pages = 64 KiB — small enough
207/// that the cost of a full-file rewrite is negligible if the user
208/// genuinely wants it (manual `VACUUM;` still works), but large enough
209/// that single-table churn doesn't blow past the threshold and trigger
210/// a noisy compact every few statements.
211pub const MIN_PAGES_FOR_AUTO_VACUUM: u32 = 16;
212
213/// Returns `true` if the on-disk freelist (counting both leaf and
214/// trunk pages — they're all reclaimable bytes) exceeds `threshold`
215/// of `header.page_count`, i.e. the file is bloated enough to be
216/// worth compacting. Returns `false` for tiny databases (under
217/// [`MIN_PAGES_FOR_AUTO_VACUUM`]) and for empty freelists, both as
218/// fast paths to keep the auto-VACUUM hook cheap on the common case.
219///
220/// This is a read-only inspection of the pager's current header and
221/// freelist chain — it does not mutate any state. The caller is
222/// responsible for actually invoking
223/// [`crate::sql::pager::vacuum_database`] when this returns `true`.
224pub fn should_auto_vacuum(pager: &Pager, threshold: f32) -> Result<bool> {
225 let header = pager.header();
226 if header.page_count < MIN_PAGES_FOR_AUTO_VACUUM {
227 return Ok(false);
228 }
229 if header.freelist_head == 0 {
230 return Ok(false);
231 }
232 let (leaves, trunks) = read_freelist(pager, header.freelist_head)?;
233 let free_pages = leaves.len() + trunks.len();
234 let ratio = free_pages as f32 / header.page_count as f32;
235 Ok(ratio > threshold)
236}
237
238#[cfg(test)]
239mod tests {
240 use super::*;
241
242 #[test]
243 fn empty_freelist_round_trip() {
244 let mut buf = [0u8; PAGE_SIZE];
245 encode_trunk(&mut buf, 0, &[]).unwrap();
246 let (next, ids) = decode_trunk(&buf).unwrap();
247 assert_eq!(next, 0);
248 assert!(ids.is_empty());
249 }
250
251 #[test]
252 fn single_chunk_round_trip() {
253 let mut buf = [0u8; PAGE_SIZE];
254 let pages = [3u32, 7, 12, 99];
255 encode_trunk(&mut buf, 42, &pages).unwrap();
256 let (next, ids) = decode_trunk(&buf).unwrap();
257 assert_eq!(next, 42);
258 assert_eq!(ids, pages);
259 }
260
261 #[test]
262 fn full_chunk_fits_capacity() {
263 let mut buf = [0u8; PAGE_SIZE];
264 let pages: Vec<u32> = (1..=FREELIST_IDS_PER_TRUNK as u32).collect();
265 encode_trunk(&mut buf, 0, &pages).unwrap();
266 let (next, ids) = decode_trunk(&buf).unwrap();
267 assert_eq!(next, 0);
268 assert_eq!(ids.len(), FREELIST_IDS_PER_TRUNK);
269 assert_eq!(ids[0], 1);
270 assert_eq!(
271 ids[FREELIST_IDS_PER_TRUNK - 1],
272 FREELIST_IDS_PER_TRUNK as u32
273 );
274 }
275
276 #[test]
277 fn over_capacity_errors() {
278 let mut buf = [0u8; PAGE_SIZE];
279 let pages: Vec<u32> = (1..=(FREELIST_IDS_PER_TRUNK as u32 + 1)).collect();
280 let err = encode_trunk(&mut buf, 0, &pages).unwrap_err();
281 assert!(format!("{err}").contains("freelist trunk overflow"));
282 }
283
284 #[test]
285 fn wrong_tag_errors_on_decode() {
286 let mut buf = [0u8; PAGE_SIZE];
287 buf[0] = 2; // TableLeaf, not freelist
288 let err = decode_trunk(&buf).unwrap_err();
289 assert!(format!("{err}").contains("expected freelist trunk page"));
290 }
291}