sqlrite/sql/pager/freelist.rs
1//! Persisted free-page list (SQLR-6).
2//!
3//! After `DROP TABLE` / `DROP INDEX` / `ALTER TABLE DROP COLUMN`, the next
4//! `save_database` no longer references the dropped object's pages. Without
5//! a freelist those pages would either be re-numbered (every later table
6//! shifts down → high write amplification) or stranded as orphans on disk.
7//!
8//! The freelist solves both: pages no longer referenced from `sqlrite_master`
9//! go on a persisted stack rooted at `header.freelist_head`, so subsequent
10//! saves can pull from there before extending the file. The unrelated tables
11//! that didn't change keep their page numbers and their pages re-stage byte-
12//! identical → the diff pager skips writing them.
13//!
14//! ## On-disk layout
15//!
16//! Each freelist trunk page carries the standard 7-byte page header with
17//! `page_type = PAGE_TYPE_FREELIST_TRUNK (5)` and `next_page` pointing at the
18//! next trunk in the chain (`0` = end). The 4089-byte payload area holds:
19//!
20//! ```text
21//! 0..2 count: u16 LE — number of free leaf-page IDs that follow
22//! 2..2+4*N page_ids[N]: u32 LE — free leaf-page numbers, ascending
23//! ```
24//!
25//! A trunk holds up to `(PAYLOAD_PER_PAGE - 2) / 4 = 1021` free page IDs.
26//! Larger freelists chain across multiple trunks. The trunk pages themselves
27//! are part of the live page set (they hold metadata) and are *not* on the
28//! freelist they encode.
29
30use std::collections::VecDeque;
31
32use crate::error::{Result, SQLRiteError};
33use crate::sql::pager::page::{PAGE_HEADER_SIZE, PAGE_SIZE, PAYLOAD_PER_PAGE};
34use crate::sql::pager::pager::Pager;
35
36/// Page-type tag for a freelist trunk page. Distinct from existing page tags
37/// (`2 = TableLeaf`, `3 = Overflow`, `4 = InteriorNode`); `1` was retired.
38pub const PAGE_TYPE_FREELIST_TRUNK: u8 = 5;
39
40/// Maximum number of free page IDs a single trunk page can hold.
41/// `PAYLOAD_PER_PAGE` is 4089; we reserve 2 bytes for the count and 4 bytes
42/// per ID, giving `(4089 - 2) / 4 = 1021`.
43pub const FREELIST_IDS_PER_TRUNK: usize = (PAYLOAD_PER_PAGE - 2) / 4;
44
45/// Encodes a single freelist trunk page into the given buffer.
46///
47/// `next_trunk` is the page number of the next trunk in the chain, or `0`
48/// to mark the end. `page_ids` must have at most `FREELIST_IDS_PER_TRUNK`
49/// entries.
50pub fn encode_trunk(buf: &mut [u8; PAGE_SIZE], next_trunk: u32, page_ids: &[u32]) -> Result<()> {
51 if page_ids.len() > FREELIST_IDS_PER_TRUNK {
52 return Err(SQLRiteError::Internal(format!(
53 "freelist trunk overflow: {} ids exceeds capacity {}",
54 page_ids.len(),
55 FREELIST_IDS_PER_TRUNK
56 )));
57 }
58 // Zero out the buffer; trailing payload bytes after the encoded IDs are
59 // unused and must be deterministic so the diff pager can skip an
60 // unchanged trunk.
61 buf.fill(0);
62 buf[0] = PAGE_TYPE_FREELIST_TRUNK;
63 buf[1..5].copy_from_slice(&next_trunk.to_le_bytes());
64 // Per-page `payload_length` field (bytes 5..7) is unused for trunks —
65 // the count field inside the payload self-describes the entries.
66 buf[5..7].copy_from_slice(&0u16.to_le_bytes());
67 let count = page_ids.len() as u16;
68 buf[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + 2].copy_from_slice(&count.to_le_bytes());
69 let mut off = PAGE_HEADER_SIZE + 2;
70 for id in page_ids {
71 buf[off..off + 4].copy_from_slice(&id.to_le_bytes());
72 off += 4;
73 }
74 Ok(())
75}
76
77/// Decodes one freelist trunk page. Returns `(next_trunk, page_ids)`.
78fn decode_trunk(buf: &[u8; PAGE_SIZE]) -> Result<(u32, Vec<u32>)> {
79 if buf[0] != PAGE_TYPE_FREELIST_TRUNK {
80 return Err(SQLRiteError::General(format!(
81 "expected freelist trunk page (tag {PAGE_TYPE_FREELIST_TRUNK}), got tag {}",
82 buf[0]
83 )));
84 }
85 let next_trunk = u32::from_le_bytes(buf[1..5].try_into().unwrap());
86 let count = u16::from_le_bytes(
87 buf[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + 2]
88 .try_into()
89 .unwrap(),
90 ) as usize;
91 if count > FREELIST_IDS_PER_TRUNK {
92 return Err(SQLRiteError::General(format!(
93 "freelist trunk count {count} exceeds capacity {FREELIST_IDS_PER_TRUNK}"
94 )));
95 }
96 let mut ids = Vec::with_capacity(count);
97 let mut off = PAGE_HEADER_SIZE + 2;
98 for _ in 0..count {
99 let id = u32::from_le_bytes(buf[off..off + 4].try_into().unwrap());
100 ids.push(id);
101 off += 4;
102 }
103 Ok((next_trunk, ids))
104}
105
106/// Walks the freelist chain rooted at `head` and returns every free leaf-page
107/// ID. The trunk pages themselves are *not* included — they're live metadata.
108/// Returns the trunk page numbers separately so the caller can release them
109/// (the next save re-encodes the freelist from scratch and frees the old
110/// trunks for reuse).
111///
112/// `head == 0` → empty freelist; returns `(vec![], vec![])`.
113pub fn read_freelist(pager: &Pager, head: u32) -> Result<(Vec<u32>, Vec<u32>)> {
114 let mut leaves: Vec<u32> = Vec::new();
115 let mut trunks: Vec<u32> = Vec::new();
116 let mut cursor = head;
117 let mut visited: std::collections::HashSet<u32> = std::collections::HashSet::new();
118 while cursor != 0 {
119 if !visited.insert(cursor) {
120 return Err(SQLRiteError::General(format!(
121 "freelist cycle detected at trunk page {cursor}"
122 )));
123 }
124 let buf = pager.read_page(cursor).ok_or_else(|| {
125 SQLRiteError::General(format!(
126 "freelist trunk page {cursor} is past page_count or unreadable"
127 ))
128 })?;
129 let (next, ids) = decode_trunk(buf)?;
130 leaves.extend(ids);
131 trunks.push(cursor);
132 cursor = next;
133 }
134 Ok((leaves, trunks))
135}
136
137/// Stages the freelist chain into the pager. `free_pages` is the full set
138/// of pages that need persisted-on-the-freelist treatment — both the trunk
139/// pages (metadata) and the leaf entries (the actually-free leaves).
140///
141/// The chain consumes some of `free_pages` as its own trunks: a trunk
142/// page IS a free page that's been temporarily borrowed for metadata.
143/// Returns the new `freelist_head` (or `0` if `free_pages` is empty).
144///
145/// Encoding: `T = ceil(N / (IDS_PER_TRUNK + 1))` trunks; each trunk
146/// (except possibly the last) carries `IDS_PER_TRUNK` leaf IDs. Leaves
147/// are stored ascending within each trunk for deterministic on-disk
148/// bytes (so an unchanged freelist re-stages byte-identical → diff skip).
149pub fn stage_freelist(pager: &mut Pager, free_pages: Vec<u32>) -> Result<u32> {
150 if free_pages.is_empty() {
151 return Ok(0);
152 }
153 // Sort + dedup so the on-disk ordering is deterministic. A duplicate
154 // in the freelist would be a serious bug elsewhere (double-free), but
155 // dedupping here is a cheap defensive guardrail.
156 let mut ids = free_pages;
157 ids.sort_unstable();
158 ids.dedup();
159
160 // Solve N = T + L where L = number of leaf slots used, T = number of
161 // trunks, and L ≤ T * IDS_PER_TRUNK. Smallest T satisfying that is
162 // ceil(N / (IDS_PER_TRUNK + 1)) — the +1 accounts for the trunk
163 // page itself absorbing one of the N pages.
164 let n = ids.len();
165 let t = n.div_ceil(FREELIST_IDS_PER_TRUNK + 1);
166
167 // Take the highest-numbered T pages as trunks. This keeps the leaf
168 // IDs ascending within each trunk and matches the "drain low first"
169 // policy of the allocator on subsequent saves.
170 let leaves_count = n - t;
171 let trunk_pages: Vec<u32> = ids.split_off(leaves_count);
172 let leaves = ids;
173
174 // Lay out leaves across trunks, IDS_PER_TRUNK per trunk.
175 let mut chunks: Vec<&[u32]> = leaves.chunks(FREELIST_IDS_PER_TRUNK).collect();
176 // If there are more trunks than chunks (e.g. N=1 → T=1, L=0), pad
177 // with empty chunks so every trunk gets staged.
178 while chunks.len() < trunk_pages.len() {
179 chunks.push(&[]);
180 }
181
182 for (i, chunk) in chunks.iter().enumerate() {
183 let next = if i + 1 < trunk_pages.len() {
184 trunk_pages[i + 1]
185 } else {
186 0
187 };
188 let mut buf = [0u8; PAGE_SIZE];
189 encode_trunk(&mut buf, next, chunk)?;
190 pager.stage_page(trunk_pages[i], buf);
191 }
192
193 Ok(trunk_pages[0])
194}
195
196/// Helper: collect a freelist into a `VecDeque<u32>` sorted ascending —
197/// the ordering the `PageAllocator` uses to draw next free pages from.
198pub fn freelist_to_deque(leaves: Vec<u32>) -> VecDeque<u32> {
199 let mut sorted = leaves;
200 sorted.sort_unstable();
201 sorted.dedup();
202 VecDeque::from(sorted)
203}
204
205#[cfg(test)]
206mod tests {
207 use super::*;
208
209 #[test]
210 fn empty_freelist_round_trip() {
211 let mut buf = [0u8; PAGE_SIZE];
212 encode_trunk(&mut buf, 0, &[]).unwrap();
213 let (next, ids) = decode_trunk(&buf).unwrap();
214 assert_eq!(next, 0);
215 assert!(ids.is_empty());
216 }
217
218 #[test]
219 fn single_chunk_round_trip() {
220 let mut buf = [0u8; PAGE_SIZE];
221 let pages = [3u32, 7, 12, 99];
222 encode_trunk(&mut buf, 42, &pages).unwrap();
223 let (next, ids) = decode_trunk(&buf).unwrap();
224 assert_eq!(next, 42);
225 assert_eq!(ids, pages);
226 }
227
228 #[test]
229 fn full_chunk_fits_capacity() {
230 let mut buf = [0u8; PAGE_SIZE];
231 let pages: Vec<u32> = (1..=FREELIST_IDS_PER_TRUNK as u32).collect();
232 encode_trunk(&mut buf, 0, &pages).unwrap();
233 let (next, ids) = decode_trunk(&buf).unwrap();
234 assert_eq!(next, 0);
235 assert_eq!(ids.len(), FREELIST_IDS_PER_TRUNK);
236 assert_eq!(ids[0], 1);
237 assert_eq!(
238 ids[FREELIST_IDS_PER_TRUNK - 1],
239 FREELIST_IDS_PER_TRUNK as u32
240 );
241 }
242
243 #[test]
244 fn over_capacity_errors() {
245 let mut buf = [0u8; PAGE_SIZE];
246 let pages: Vec<u32> = (1..=(FREELIST_IDS_PER_TRUNK as u32 + 1)).collect();
247 let err = encode_trunk(&mut buf, 0, &pages).unwrap_err();
248 assert!(format!("{err}").contains("freelist trunk overflow"));
249 }
250
251 #[test]
252 fn wrong_tag_errors_on_decode() {
253 let mut buf = [0u8; PAGE_SIZE];
254 buf[0] = 2; // TableLeaf, not freelist
255 let err = decode_trunk(&buf).unwrap_err();
256 assert!(format!("{err}").contains("expected freelist trunk page"));
257 }
258}