Skip to main content

sherlock_nsf_parser/
bdt.rs

1//! Bucket Descriptor Table (BDT) - the `bucket_index -> file_offset` map.
2//!
3//! # Where the map actually lives (correction to the original Phase B plan)
4//!
5//! The NSF_HANDOFF.md Phase B plan (Section 13.4) said the
6//! `bucket_index -> file_offset` map is built by walking the **Bucket
7//! Descriptor Block (BDB)** referenced from
8//! [`crate::info2::Information2`]. Reading the authoritative libnsfdb
9//! source (`libnsfdb_io_handle.c`) shows that is **wrong**:
10//!
11//! - The BDB (`libnsfdb_io_handle_read_bucket_descriptor_block`) holds
12//!   *RRV* bucket descriptors (8 bytes each: file_offset + initial RRV
13//!   identifier) plus the Unique Name Key table. Those locate the RRV
14//!   buckets themselves and name note items - not the summary/non-summary
15//!   data buckets.
16//! - The actual `bucket_index -> file_offset` map is the array of
17//!   `nsfdb_summary_bucket_descriptor` / `nsfdb_non_summary_bucket_descriptor`
18//!   entries that lives **inside the superblock body**
19//!   (`libnsfdb_io_handle_read_superblock`, the
20//!   `number_of_summary_bucket_descriptor_pages` walk).
21//!
22//! # The compression prerequisite (the real Phase B blocker)
23//!
24//! The superblock *body* is stored **compressed**. libnsfdb requires
25//! `compression_type == 1` and always runs the body through
26//! `libnsfdb_compression_cx_decompress` before the descriptor array can be
27//! read (`libnsfdb_io_handle.c` ~line 3022 + 3070). Domino "CX"
28//! decompression is not yet implemented in this crate, and its reference
29//! source was not part of the libnsfdb files pulled into the spike. Until
30//! that decompressor lands, [`crate::Database::resolve_bucket_slot`]
31//! returns [`NsfError::CompressionUnsupported`] rather than guess at the
32//! algorithm - a wrong decompressor would silently corrupt evidence.
33//!
34//! This module parses the descriptor arrays from an **already-decompressed**
35//! superblock body. It is the certain, mechanical half of the resolution
36//! path: correct against the libnsfdb layout and unit-tested with synthetic
37//! bodies, and ready to be fed real bytes the moment CX decompression
38//! exists.
39//!
40//! # Decompressed body layout (per `libnsfdb_io_handle_read_superblock`)
41//!
42//! When `number_of_summary_bucket_descriptor_pages > 0`, the body begins
43//! with a single summary page (libnsfdb rejects page counts > 1):
44//!
45//! ```text
46//! offset  width                       region
47//!     0      4                         unknown1
48//!     4     10                         summary_bucket_page_descriptor
49//!    14     10                         summary_bucket_group_descriptor
50//!    24    200                         unknown2
51//!   224     14 * number_of_summary_buckets   summary_bucket_descriptor[]
//!   ...   7982 - 14*N                 unknown3 (padding; N = number_of_summary_buckets)
53//!         total page = 8206 bytes
54//! ```
55//!
56//! Each `summary_bucket_descriptor` is `file_position[4] +
57//! modification_time[8] + largest_free[1] + second_largest_free[1]` = 14
58//! bytes. `file_position` is in 256-byte units (`<<= 8` for the byte
59//! offset).
60//!
61//! The non-summary page (if present) follows immediately:
62//!
63//! ```text
64//! offset  width                       region
65//!     0      4                         unknown1
66//!     4      2                         non_summary_bucket_page_descriptor
67//!     6      2                         non_summary_bucket_group_descriptor
68//!     8     62                         unknown2
69//!    70      6 * number_of_non_summary_buckets   non_summary_bucket_descriptor[]
//!   ...   8128 - 6*N                  unknown3 (padding; N = number_of_non_summary_buckets)
71//!         total page = 8198 bytes
72//! ```
73//!
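//! Each `non_summary_bucket_descriptor` is `file_position[4] +
//! largest_free[1] + second_largest_free[1]` = 6 bytes. As with the
//! summary descriptors, this module treats `file_position` as a count of
//! 256-byte units (`<< 8` for the byte offset).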
76//!
77//! # bucket_index base
78//!
79//! RRV bucket-slot entries skip `bucket_index == 0` as an empty sentinel
80//! (see [`crate::rrv`]), which means the on-disk `bucket_index` is
81//! **1-based**: descriptor-array element `i` is addressed as
82//! `bucket_index == i + 1`. This mirrors the 1-based slot indexing
83//! confirmed in `libnsfdb_bucket_get_slot`. The 1-based mapping is an
84//! inference from the sentinel + the slot-index precedent; it is the one
85//! part of this module that cannot be validated against the corpus until
86//! CX decompression lets a real bucket-slot entry resolve end to end.
87//! Flagged here so it is re-confirmed at that point, not silently trusted.
88
89use crate::error::NsfError;
90use crate::superblock::Superblock;
91
/// Size in bytes of one on-disk `nsfdb_summary_bucket_descriptor`:
/// `file_position[4] + modification_time[8] + largest_free[1] +
/// second_largest_free[1]`.
pub const SUMMARY_DESCRIPTOR_BYTES: usize = 4 + 8 + 1 + 1;
/// Size in bytes of one on-disk `nsfdb_non_summary_bucket_descriptor`:
/// `file_position[4] + largest_free[1] + second_largest_free[1]`.
pub const NON_SUMMARY_DESCRIPTOR_BYTES: usize = 4 + 1 + 1;

/// Bytes that precede the summary descriptor array inside its page:
/// `unknown1[4] + page_descriptor[10] + group_descriptor[10] +
/// unknown2[200]`.
const SUMMARY_PAGE_PREFIX: usize = 4 + 10 + 10 + 200;
/// Full size of the single summary descriptor page (prefix, descriptor
/// array and trailing padding). Used to step over the page to reach the
/// non-summary page that follows it.
const SUMMARY_PAGE_BYTES: usize = 8206;
/// Bytes that precede the non-summary descriptor array inside its page:
/// `unknown1[4] + page_descriptor[2] + group_descriptor[2] +
/// unknown2[62]`. The non-summary page is the last structure this module
/// reads from the body, so its total size (8198 bytes, see the module
/// header) is never needed to advance a cursor and has no constant.
const NON_SUMMARY_PAGE_PREFIX: usize = 4 + 2 + 2 + 62;
106
/// The decoded bucket-descriptor table.
///
/// Holds two independent `bucket_index -> file byte offset` maps: one for
/// summary buckets and one for non-summary buckets. The vectors are
/// indexed from 0; callers holding an on-disk (1-based) `bucket_index`
/// should go through the accessor methods, which perform the conversion.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct BucketDescriptorTable {
    /// File byte offset of each summary bucket; element 0 is the first
    /// descriptor in the array.
    pub summary: Vec<u64>,
    /// File byte offset of each non-summary bucket; element 0 is the
    /// first descriptor in the array.
    pub non_summary: Vec<u64>,
}
118
119impl BucketDescriptorTable {
120    /// Parse the descriptor arrays from a decompressed superblock body.
121    ///
122    /// `body` must be the superblock body *after* CX decompression (the
123    /// bytes that follow the 100-byte header, as libnsfdb addresses them).
124    /// The counts come from the already-parsed [`Superblock`] header.
125    pub fn parse(body: &[u8], sb: &Superblock) -> Result<Self, NsfError> {
126        let u32_at = |buf: &[u8], o: usize| -> Option<u32> {
127            buf.get(o..o + 4)
128                .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
129        };
130
131        let mut cursor = 0usize;
132        let mut summary = Vec::new();
133        if sb.number_of_summary_bucket_descriptor_pages > 0 {
134            let array_start = cursor + SUMMARY_PAGE_PREFIX;
135            let count = sb.number_of_summary_buckets as usize;
136            summary.reserve(count);
137            for i in 0..count {
138                let off = array_start + i * SUMMARY_DESCRIPTOR_BYTES;
139                let fp = u32_at(body, off).ok_or(NsfError::TooShort {
140                    actual: body.len(),
141                    required: off + 4,
142                })?;
143                summary.push(u64::from(fp) << 8);
144            }
145            cursor += SUMMARY_PAGE_BYTES;
146        }
147
148        let mut non_summary = Vec::new();
149        if sb.number_of_non_summary_bucket_descriptor_pages > 0 {
150            let array_start = cursor + NON_SUMMARY_PAGE_PREFIX;
151            let count = sb.number_of_non_summary_buckets as usize;
152            non_summary.reserve(count);
153            for i in 0..count {
154                let off = array_start + i * NON_SUMMARY_DESCRIPTOR_BYTES;
155                let fp = u32_at(body, off).ok_or(NsfError::TooShort {
156                    actual: body.len(),
157                    required: off + 4,
158                })?;
159                non_summary.push(u64::from(fp) << 8);
160            }
161        }
162
163        Ok(Self {
164            summary,
165            non_summary,
166        })
167    }
168
169    /// Byte offset of a summary bucket given its on-disk 1-based
170    /// `bucket_index`. Summary buckets hold note summary-item data, which
171    /// is where note enumeration via the RRV lands.
172    pub fn summary_bucket_offset(&self, bucket_index: u32) -> Result<u64, NsfError> {
173        Self::lookup(&self.summary, bucket_index)
174    }
175
176    /// Byte offset of a non-summary bucket given its on-disk 1-based
177    /// `bucket_index`.
178    pub fn non_summary_bucket_offset(&self, bucket_index: u32) -> Result<u64, NsfError> {
179        Self::lookup(&self.non_summary, bucket_index)
180    }
181
182    fn lookup(map: &[u64], bucket_index: u32) -> Result<u64, NsfError> {
183        if bucket_index == 0 {
184            return Err(NsfError::BucketIndexOutOfRange {
185                requested: 0,
186                available: map.len(),
187            });
188        }
189        let ordinal = (bucket_index - 1) as usize;
190        map.get(ordinal)
191            .copied()
192            .ok_or(NsfError::BucketIndexOutOfRange {
193                requested: bucket_index,
194                available: map.len(),
195            })
196    }
197}
198
#[cfg(test)]
mod tests {
    use super::*;
    use crate::superblock::{Superblock, SUPERBLOCK_HEADER_BYTES, SUPERBLOCK_SIGNATURE};

    /// Build a superblock header with the given page counts + bucket
    /// counts so [`BucketDescriptorTable::parse`] can be exercised.
    fn superblock_with_counts(
        summary_pages: u32,
        summary_buckets: u32,
        non_summary_pages: u32,
        non_summary_buckets: u32,
    ) -> Superblock {
        let mut buf = vec![0u8; SUPERBLOCK_HEADER_BYTES];
        buf[0..2].copy_from_slice(&SUPERBLOCK_SIGNATURE);
        buf[14..18].copy_from_slice(&summary_buckets.to_le_bytes());
        buf[18..22].copy_from_slice(&non_summary_buckets.to_le_bytes());
        buf[70..74].copy_from_slice(&summary_pages.to_le_bytes());
        buf[74..78].copy_from_slice(&non_summary_pages.to_le_bytes());
        Superblock::parse(&buf).unwrap()
    }

    /// Build a synthetic decompressed body with summary (and optionally
    /// non-summary) descriptor pages whose file positions encode the
    /// descriptor index for easy assertions.
    ///
    /// A page is emitted only when its bucket count is non-zero, so pair
    /// this with a superblock whose page count is zero whenever the
    /// matching bucket count is zero.
    fn synthetic_body(summary_buckets: u32, non_summary_buckets: u32) -> Vec<u8> {
        let mut body = Vec::new();
        // Summary page: always full-size so a following page lands at the
        // offset `parse` expects.
        if summary_buckets > 0 {
            let mut page = vec![0u8; SUMMARY_PAGE_BYTES];
            for i in 0..summary_buckets as usize {
                let off = SUMMARY_PAGE_PREFIX + i * SUMMARY_DESCRIPTOR_BYTES;
                // file_position = 0x100 + i so byte offset = (0x100+i) << 8.
                let fp = 0x100u32 + i as u32;
                page[off..off + 4].copy_from_slice(&fp.to_le_bytes());
            }
            body.extend_from_slice(&page);
        }
        // Non-summary page. Sized to cover its prefix + descriptors; it is
        // the terminal page so its full padded size is not required here.
        if non_summary_buckets > 0 {
            let mut page = vec![
                0u8;
                NON_SUMMARY_PAGE_PREFIX
                    + non_summary_buckets as usize * NON_SUMMARY_DESCRIPTOR_BYTES
            ];
            for i in 0..non_summary_buckets as usize {
                let off = NON_SUMMARY_PAGE_PREFIX + i * NON_SUMMARY_DESCRIPTOR_BYTES;
                let fp = 0x900u32 + i as u32;
                page[off..off + 4].copy_from_slice(&fp.to_le_bytes());
            }
            body.extend_from_slice(&page);
        }
        body
    }

    #[test]
    fn parses_summary_descriptor_array() {
        let sb = superblock_with_counts(1, 3, 0, 0);
        let body = synthetic_body(3, 0);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        assert_eq!(bdt.summary.len(), 3);
        assert_eq!(bdt.summary[0], 0x100u64 << 8);
        assert_eq!(bdt.summary[1], 0x101u64 << 8);
        assert_eq!(bdt.summary[2], 0x102u64 << 8);
        assert!(bdt.non_summary.is_empty());
    }

    #[test]
    fn parses_both_pages_with_correct_offsets() {
        let sb = superblock_with_counts(1, 2, 1, 2);
        let body = synthetic_body(2, 2);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        assert_eq!(bdt.summary.len(), 2);
        assert_eq!(bdt.non_summary.len(), 2);
        // The summary array must be unaffected by the presence of a
        // second page.
        assert_eq!(bdt.summary[0], 0x100u64 << 8);
        assert_eq!(bdt.summary[1], 0x101u64 << 8);
        // Non-summary page sits after the summary page; its descriptors
        // must still decode to the 0x900-based positions, proving the
        // cursor advanced by exactly one summary page.
        assert_eq!(bdt.non_summary[0], 0x900u64 << 8);
        assert_eq!(bdt.non_summary[1], 0x901u64 << 8);
    }

    #[test]
    fn summary_offset_is_one_based() {
        let sb = superblock_with_counts(1, 3, 0, 0);
        let body = synthetic_body(3, 0);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        // bucket_index 1 -> element 0.
        assert_eq!(bdt.summary_bucket_offset(1).unwrap(), 0x100u64 << 8);
        assert_eq!(bdt.summary_bucket_offset(3).unwrap(), 0x102u64 << 8);
    }

    #[test]
    fn non_summary_offset_is_one_based() {
        let sb = superblock_with_counts(0, 0, 1, 2);
        let body = synthetic_body(0, 2);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        // bucket_index 1 -> element 0, same convention as summary buckets.
        assert_eq!(bdt.non_summary_bucket_offset(1).unwrap(), 0x900u64 << 8);
        assert_eq!(bdt.non_summary_bucket_offset(2).unwrap(), 0x901u64 << 8);
        assert!(matches!(
            bdt.non_summary_bucket_offset(0),
            Err(NsfError::BucketIndexOutOfRange { requested: 0, .. })
        ));
        assert!(matches!(
            bdt.non_summary_bucket_offset(3),
            Err(NsfError::BucketIndexOutOfRange {
                requested: 3,
                available: 2
            })
        ));
    }

    #[test]
    fn bucket_index_zero_is_rejected() {
        let sb = superblock_with_counts(1, 1, 0, 0);
        let body = synthetic_body(1, 0);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        assert!(matches!(
            bdt.summary_bucket_offset(0),
            Err(NsfError::BucketIndexOutOfRange { requested: 0, .. })
        ));
    }

    #[test]
    fn bucket_index_past_end_is_rejected() {
        let sb = superblock_with_counts(1, 2, 0, 0);
        let body = synthetic_body(2, 0);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        assert!(matches!(
            bdt.summary_bucket_offset(3),
            Err(NsfError::BucketIndexOutOfRange {
                requested: 3,
                available: 2
            })
        ));
    }

    #[test]
    fn no_descriptor_pages_yields_empty_maps() {
        // A database with zero descriptor pages (e.g. a fresh shell)
        // must produce empty maps, not a panic or an error.
        let sb = superblock_with_counts(0, 0, 0, 0);
        let bdt = BucketDescriptorTable::parse(&[], &sb).unwrap();
        assert!(bdt.summary.is_empty());
        assert!(bdt.non_summary.is_empty());
    }

    #[test]
    fn non_summary_only_starts_at_body_offset_zero() {
        // When there is no summary page, the non-summary page is the
        // first thing in the body (cursor must NOT skip a summary page
        // that isn't there). Mirrors libnsfdb: the summary block is only
        // advanced past when number_of_summary_bucket_descriptor_pages > 0.
        let sb = superblock_with_counts(0, 0, 1, 2);
        let body = synthetic_body(0, 2);
        let bdt = BucketDescriptorTable::parse(&body, &sb).unwrap();
        assert!(bdt.summary.is_empty());
        assert_eq!(bdt.non_summary.len(), 2);
        assert_eq!(bdt.non_summary[0], 0x900u64 << 8);
        assert_eq!(bdt.non_summary[1], 0x901u64 << 8);
    }

    #[test]
    fn truncated_body_errors_not_panics() {
        let sb = superblock_with_counts(1, 3, 0, 0);
        // Body too short to hold all three descriptors: it ends right
        // after the first one, so the second descriptor's file_position
        // is the first read that fails.
        let body = vec![0u8; SUMMARY_PAGE_PREFIX + SUMMARY_DESCRIPTOR_BYTES];
        assert!(matches!(
            BucketDescriptorTable::parse(&body, &sb),
            Err(NsfError::TooShort { actual, required })
                if actual == body.len()
                    && required == SUMMARY_PAGE_PREFIX + SUMMARY_DESCRIPTOR_BYTES + 4
        ));
    }

    #[test]
    fn truncated_non_summary_page_errors_not_panics() {
        // The header announces a non-summary page, but the body stops at
        // the end of the summary page.
        let sb = superblock_with_counts(1, 1, 1, 2);
        let body = synthetic_body(1, 0);
        assert_eq!(body.len(), SUMMARY_PAGE_BYTES);
        assert!(matches!(
            BucketDescriptorTable::parse(&body, &sb),
            Err(NsfError::TooShort { actual, required })
                if actual == SUMMARY_PAGE_BYTES
                    && required == SUMMARY_PAGE_BYTES + NON_SUMMARY_PAGE_PREFIX + 4
        ));
    }
}
351}