sherlock_nsf_parser/bdt.rs
1//! Bucket Descriptor Table (BDT) - the `bucket_index -> file_offset` map.
2//!
3//! # Where the map actually lives (correction to the original Phase B plan)
4//!
5//! The NSF_HANDOFF.md Phase B plan (Section 13.4) said the
6//! `bucket_index -> file_offset` map is built by walking the **Bucket
7//! Descriptor Block (BDB)** referenced from
8//! [`crate::info2::Information2`]. Reading the authoritative libnsfdb
9//! source (`libnsfdb_io_handle.c`) shows that is **wrong**:
10//!
11//! - The BDB (`libnsfdb_io_handle_read_bucket_descriptor_block`) holds
12//! *RRV* bucket descriptors (8 bytes each: file_offset + initial RRV
13//! identifier) plus the Unique Name Key table. Those locate the RRV
14//! buckets themselves and name note items - not the summary/non-summary
15//! data buckets.
16//! - The actual `bucket_index -> file_offset` map is the array of
17//! `nsfdb_summary_bucket_descriptor` / `nsfdb_non_summary_bucket_descriptor`
18//! entries that lives **inside the superblock body**
19//! (`libnsfdb_io_handle_read_superblock`, the
20//! `number_of_summary_bucket_descriptor_pages` walk).
21//!
22//! # The compression prerequisite (the real Phase B blocker)
23//!
24//! The superblock *body* is stored **compressed**. libnsfdb requires
25//! `compression_type == 1` and always runs the body through
26//! `libnsfdb_compression_cx_decompress` before the descriptor array can be
27//! read (`libnsfdb_io_handle.c` ~line 3022 + 3070). Domino "CX"
28//! decompression is not yet implemented in this crate, and its reference
29//! source was not part of the libnsfdb files pulled into the spike. Until
30//! that decompressor lands, [`crate::Database::resolve_bucket_slot`]
31//! returns [`NsfError::CompressionUnsupported`] rather than guess at the
32//! algorithm - a wrong decompressor would silently corrupt evidence.
33//!
34//! This module parses the descriptor arrays from an **already-decompressed**
35//! superblock body. It is the certain, mechanical half of the resolution
36//! path: correct against the libnsfdb layout and unit-tested with synthetic
37//! bodies, and ready to be fed real bytes the moment CX decompression
38//! exists.
39//!
40//! # Decompressed body layout (per `libnsfdb_io_handle_read_superblock`)
41//!
42//! When `number_of_summary_bucket_descriptor_pages > 0`, the body begins
43//! with a single summary page (libnsfdb rejects page counts > 1):
44//!
45//! ```text
46//! offset width region
47//! 0 4 unknown1
48//! 4 10 summary_bucket_page_descriptor
49//! 14 10 summary_bucket_group_descriptor
50//! 24 200 unknown2
51//! 224 14 * number_of_summary_buckets summary_bucket_descriptor[]
52//! ... (pad to 7982 - 14*N) unknown3
53//! total page = 8206 bytes
54//! ```
55//!
56//! Each `summary_bucket_descriptor` is `file_position[4] +
57//! modification_time[8] + largest_free[1] + second_largest_free[1]` = 14
58//! bytes. `file_position` is in 256-byte units (`<<= 8` for the byte
59//! offset).
60//!
61//! The non-summary page (if present) follows immediately:
62//!
63//! ```text
64//! offset width region
65//! 0 4 unknown1
66//! 4 2 non_summary_bucket_page_descriptor
67//! 6 2 non_summary_bucket_group_descriptor
68//! 8 62 unknown2
69//! 70 6 * number_of_non_summary_buckets non_summary_bucket_descriptor[]
70//! ... (pad to 8128 - 6*N) unknown3
71//! total page = 8198 bytes
72//! ```
73//!
74//! Each `non_summary_bucket_descriptor` is `file_position[4] +
75//! largest_free[1] + second_largest_free[1]` = 6 bytes.
76//!
77//! # bucket_index base
78//!
79//! RRV bucket-slot entries skip `bucket_index == 0` as an empty sentinel
80//! (see [`crate::rrv`]), which means the on-disk `bucket_index` is
81//! **1-based**: descriptor-array element `i` is addressed as
82//! `bucket_index == i + 1`. This mirrors the 1-based slot indexing
83//! confirmed in `libnsfdb_bucket_get_slot`. The 1-based mapping is an
84//! inference from the sentinel + the slot-index precedent; it is the one
85//! part of this module that cannot be validated against the corpus until
86//! CX decompression lets a real bucket-slot entry resolve end to end.
87//! Flagged here so it is re-confirmed at that point, not silently trusted.
88
89use crate::error::NsfError;
90use crate::superblock::Superblock;
91
/// On-disk size of one `nsfdb_summary_bucket_descriptor`:
/// `file_position[4] + modification_time[8] + largest_free[1] +
/// second_largest_free[1]` = 14 bytes.
pub const SUMMARY_DESCRIPTOR_BYTES: usize = 4 + 8 + 1 + 1;
/// On-disk size of one `nsfdb_non_summary_bucket_descriptor`:
/// `file_position[4] + largest_free[1] + second_largest_free[1]` = 6 bytes.
pub const NON_SUMMARY_DESCRIPTOR_BYTES: usize = 4 + 1 + 1;
96
/// Bytes that precede the summary descriptor array within its page:
/// `unknown1[4] + page_descriptor[10] + group_descriptor[10] +
/// unknown2[200]` = 224.
const SUMMARY_PAGE_PREFIX: usize = 4 + 10 + 10 + 200;
/// Total bytes a single summary descriptor page occupies: the prefix plus
/// the 7982-byte descriptor/padding region = 8206.
const SUMMARY_PAGE_BYTES: usize = SUMMARY_PAGE_PREFIX + 7982;
/// Bytes that precede the non-summary descriptor array within its page:
/// `unknown1[4] + page_descriptor[2] + group_descriptor[2] + unknown2[62]`
/// = 70. The non-summary page is terminal (no structure follows it in the
/// body we parse), so its total size - 8198 bytes, documented in the module
/// header - is not needed to advance a cursor.
const NON_SUMMARY_PAGE_PREFIX: usize = 4 + 2 + 2 + 62;
106
/// Parsed bucket-descriptor table: two `bucket_index -> file byte offset`
/// maps, one for summary buckets and one for non-summary buckets.
///
/// Both maps are plain 0-based vectors. The on-disk `bucket_index` is
/// 1-based, so callers should go through the accessor methods, which
/// perform the conversion, rather than indexing the vectors directly with
/// an on-disk index.
///
/// The `Default` value is the empty table (both maps empty), which is the
/// same value parsing yields for a superblock with no descriptor pages.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct BucketDescriptorTable {
    /// Byte offsets of summary buckets, 0-based (element `i` corresponds to
    /// on-disk `bucket_index == i + 1`).
    pub summary: Vec<u64>,
    /// Byte offsets of non-summary buckets, 0-based (element `i`
    /// corresponds to on-disk `bucket_index == i + 1`).
    pub non_summary: Vec<u64>,
}
118
119impl BucketDescriptorTable {
120 /// Parse the descriptor arrays from a decompressed superblock body.
121 ///
122 /// `body` must be the superblock body *after* CX decompression (the
123 /// bytes that follow the 100-byte header, as libnsfdb addresses them).
124 /// The counts come from the already-parsed [`Superblock`] header.
125 pub fn parse(body: &[u8], sb: &Superblock) -> Result<Self, NsfError> {
126 let u32_at = |buf: &[u8], o: usize| -> Option<u32> {
127 buf.get(o..o + 4)
128 .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
129 };
130
131 let mut cursor = 0usize;
132 let mut summary = Vec::new();
133 if sb.number_of_summary_bucket_descriptor_pages > 0 {
134 let array_start = cursor + SUMMARY_PAGE_PREFIX;
135 let count = sb.number_of_summary_buckets as usize;
136 summary.reserve(count);
137 for i in 0..count {
138 let off = array_start + i * SUMMARY_DESCRIPTOR_BYTES;
139 let fp = u32_at(body, off).ok_or(NsfError::TooShort {
140 actual: body.len(),
141 required: off + 4,
142 })?;
143 summary.push(u64::from(fp) << 8);
144 }
145 cursor += SUMMARY_PAGE_BYTES;
146 }
147
148 let mut non_summary = Vec::new();
149 if sb.number_of_non_summary_bucket_descriptor_pages > 0 {
150 let array_start = cursor + NON_SUMMARY_PAGE_PREFIX;
151 let count = sb.number_of_non_summary_buckets as usize;
152 non_summary.reserve(count);
153 for i in 0..count {
154 let off = array_start + i * NON_SUMMARY_DESCRIPTOR_BYTES;
155 let fp = u32_at(body, off).ok_or(NsfError::TooShort {
156 actual: body.len(),
157 required: off + 4,
158 })?;
159 non_summary.push(u64::from(fp) << 8);
160 }
161 }
162
163 Ok(Self {
164 summary,
165 non_summary,
166 })
167 }
168
169 /// Byte offset of a summary bucket given its on-disk 1-based
170 /// `bucket_index`. Summary buckets hold note summary-item data, which
171 /// is where note enumeration via the RRV lands.
172 pub fn summary_bucket_offset(&self, bucket_index: u32) -> Result<u64, NsfError> {
173 Self::lookup(&self.summary, bucket_index)
174 }
175
176 /// Byte offset of a non-summary bucket given its on-disk 1-based
177 /// `bucket_index`.
178 pub fn non_summary_bucket_offset(&self, bucket_index: u32) -> Result<u64, NsfError> {
179 Self::lookup(&self.non_summary, bucket_index)
180 }
181
182 fn lookup(map: &[u64], bucket_index: u32) -> Result<u64, NsfError> {
183 if bucket_index == 0 {
184 return Err(NsfError::BucketIndexOutOfRange {
185 requested: 0,
186 available: map.len(),
187 });
188 }
189 let ordinal = (bucket_index - 1) as usize;
190 map.get(ordinal)
191 .copied()
192 .ok_or(NsfError::BucketIndexOutOfRange {
193 requested: bucket_index,
194 available: map.len(),
195 })
196 }
197}
198
#[cfg(test)]
mod tests {
    use super::*;
    use crate::superblock::{Superblock, SUPERBLOCK_HEADER_BYTES, SUPERBLOCK_SIGNATURE};

    /// Store `value` little-endian in the four bytes of `buf` at `at`.
    fn put_u32(buf: &mut [u8], at: usize, value: u32) {
        buf[at..at + 4].copy_from_slice(&value.to_le_bytes());
    }

    /// Stamp `count` descriptors into `page`, `stride` bytes apart starting
    /// at `prefix`, with file positions `first`, `first + 1`, ...
    fn stamp_positions(page: &mut [u8], prefix: usize, stride: usize, count: usize, first: u32) {
        for i in 0..count {
            put_u32(page, prefix + i * stride, first + i as u32);
        }
    }

    /// Build a superblock header carrying the given page counts and bucket
    /// counts so [`BucketDescriptorTable::parse`] can be exercised.
    fn superblock_with_counts(
        summary_pages: u32,
        summary_buckets: u32,
        non_summary_pages: u32,
        non_summary_buckets: u32,
    ) -> Superblock {
        let mut header = vec![0u8; SUPERBLOCK_HEADER_BYTES];
        header[0..2].copy_from_slice(&SUPERBLOCK_SIGNATURE);
        put_u32(&mut header, 14, summary_buckets);
        put_u32(&mut header, 18, non_summary_buckets);
        put_u32(&mut header, 70, summary_pages);
        put_u32(&mut header, 74, non_summary_pages);
        Superblock::parse(&header).unwrap()
    }

    /// Build a synthetic decompressed body. Summary descriptors get file
    /// positions `0x100 + i` and non-summary descriptors `0x900 + i`, so a
    /// decoded byte offset reveals which descriptor it came from.
    fn synthetic_body(summary_buckets: u32, non_summary_buckets: u32) -> Vec<u8> {
        let mut body = Vec::new();

        // Summary page: always emitted at its full padded size so a
        // following non-summary page lands at the right offset.
        if summary_buckets > 0 {
            let mut page = vec![0u8; SUMMARY_PAGE_BYTES];
            stamp_positions(
                &mut page,
                SUMMARY_PAGE_PREFIX,
                SUMMARY_DESCRIPTOR_BYTES,
                summary_buckets as usize,
                0x100,
            );
            body.extend_from_slice(&page);
        }

        // Non-summary page: terminal, so prefix + descriptors is enough;
        // the trailing padding is never read.
        if non_summary_buckets > 0 {
            let count = non_summary_buckets as usize;
            let mut page = vec![0u8; NON_SUMMARY_PAGE_PREFIX + count * NON_SUMMARY_DESCRIPTOR_BYTES];
            stamp_positions(
                &mut page,
                NON_SUMMARY_PAGE_PREFIX,
                NON_SUMMARY_DESCRIPTOR_BYTES,
                count,
                0x900,
            );
            body.extend_from_slice(&page);
        }

        body
    }

    /// Parse a synthetic body built for the same bucket counts the header
    /// advertises.
    fn parsed(
        summary_pages: u32,
        summary_buckets: u32,
        non_summary_pages: u32,
        non_summary_buckets: u32,
    ) -> BucketDescriptorTable {
        let sb = superblock_with_counts(
            summary_pages,
            summary_buckets,
            non_summary_pages,
            non_summary_buckets,
        );
        let body = synthetic_body(summary_buckets, non_summary_buckets);
        BucketDescriptorTable::parse(&body, &sb).unwrap()
    }

    #[test]
    fn parses_summary_descriptor_array() {
        let bdt = parsed(1, 3, 0, 0);
        assert_eq!(
            bdt.summary,
            vec![0x100u64 << 8, 0x101u64 << 8, 0x102u64 << 8]
        );
        assert!(bdt.non_summary.is_empty());
    }

    #[test]
    fn parses_both_pages_with_correct_offsets() {
        let bdt = parsed(1, 2, 1, 2);
        assert_eq!(bdt.summary.len(), 2);
        // The non-summary page follows the summary page; decoding the
        // 0x900-based positions proves the cursor moved by exactly one
        // summary page.
        assert_eq!(bdt.non_summary, vec![0x900u64 << 8, 0x901u64 << 8]);
    }

    #[test]
    fn summary_offset_is_one_based() {
        let bdt = parsed(1, 3, 0, 0);
        // bucket_index 1 maps to element 0, bucket_index 3 to element 2.
        assert_eq!(bdt.summary_bucket_offset(1).unwrap(), 0x100u64 << 8);
        assert_eq!(bdt.summary_bucket_offset(3).unwrap(), 0x102u64 << 8);
    }

    #[test]
    fn bucket_index_zero_is_rejected() {
        let bdt = parsed(1, 1, 0, 0);
        let outcome = bdt.summary_bucket_offset(0);
        assert!(matches!(
            outcome,
            Err(NsfError::BucketIndexOutOfRange { requested: 0, .. })
        ));
    }

    #[test]
    fn bucket_index_past_end_is_rejected() {
        let bdt = parsed(1, 2, 0, 0);
        let outcome = bdt.summary_bucket_offset(3);
        assert!(matches!(
            outcome,
            Err(NsfError::BucketIndexOutOfRange {
                requested: 3,
                available: 2
            })
        ));
    }

    #[test]
    fn no_descriptor_pages_yields_empty_maps() {
        // Zero descriptor pages (e.g. a fresh shell database) must give
        // empty maps rather than a panic or an error.
        let sb = superblock_with_counts(0, 0, 0, 0);
        let bdt = BucketDescriptorTable::parse(&[], &sb).unwrap();
        assert!(bdt.summary.is_empty());
        assert!(bdt.non_summary.is_empty());
    }

    #[test]
    fn non_summary_only_starts_at_body_offset_zero() {
        // Without a summary page the non-summary page is the first thing
        // in the body: the cursor must NOT skip a summary page that is not
        // there. Mirrors libnsfdb, which only advances past the summary
        // block when number_of_summary_bucket_descriptor_pages > 0.
        let bdt = parsed(0, 0, 1, 2);
        assert!(bdt.summary.is_empty());
        assert_eq!(bdt.non_summary, vec![0x900u64 << 8, 0x901u64 << 8]);
    }

    #[test]
    fn truncated_body_errors_not_panics() {
        let sb = superblock_with_counts(1, 3, 0, 0);
        // Room for the prefix and a single descriptor only, while the
        // header advertises three.
        let body = vec![0u8; SUMMARY_PAGE_PREFIX + SUMMARY_DESCRIPTOR_BYTES];
        let outcome = BucketDescriptorTable::parse(&body, &sb);
        assert!(matches!(outcome, Err(NsfError::TooShort { .. })));
    }
}