Skip to main content

sherlock_nsf_parser/
bdb.rs

//! Bucket Descriptor Block (BDB) - the master index of every RRV bucket.
//!
//! A single RRV bucket maps only a small contiguous slice of NoteIDs. To
//! enumerate every note in a database you must walk *all* RRV buckets, and
//! the list of those buckets lives in the BDB. `Information2` carries two
//! BDB (position, size) slots (a primary copy plus write-ahead-log
//! redundancy); the freshest by `write_count` is authoritative.
//!
//! On-disk layout per `nsfdb_bucket_descriptor_block.h` +
//! `libnsfdb_io_handle_read_bucket_descriptor_block`:
//!
//! ```text
//! header (66 bytes)
//!   0   2   signature (0x01 0x00)
//!   2   2   version   (0x02 0x00)
//!   4   2   compression_type (must be 1 = CX)
//!   6   4   uncompressed_size
//!  10   4   write_count
//!  14   4   size (total BDB size incl. header + body + footer)
//!  18   8   modification_time
//!  26   4   number_of_unique_name_keys
//!  30   4   unknown1
//!  34   4   unique_name_key_text_size
//!  38   4   number_of_rrv_bucket_descriptors
//!  42   4   number_of_unk_hash_table_entries
//!  46   8   unknown2
//!  54   4   checksum
//!  58   8   unknown3
//! body (CX-compressed; first 4 bytes of the compressed region are a
//!       prefix the decompressor skips, exactly like the superblock body)
//!   decompressed: number_of_rrv_bucket_descriptors * 8 bytes, then the
//!   Unique Name Key table (number_of_unique_name_keys * 10 bytes) and the
//!   name text it indexes; both are parsed into the `unk_*` fields.
//! footer (12 bytes): modification_time[8] + checksum[4]
//! ```
//!
//! Each RRV bucket descriptor is 8 bytes: `file_offset[4]` (in 256-byte
//! units after clearing the type flag) + `initial_rrv_identifier[4]`. The
//! low bit of `file_offset` is the bucket-type flag: set => non-data,
//! clear => data. The flag is cleared and the value shifted left 8 to get
//! the byte offset.
41
42use crate::cx;
43use crate::error::NsfError;
44
/// Number of bytes occupied by the fixed BDB header.
const BDB_HEADER_BYTES: usize = 66;
/// Number of bytes occupied by the BDB footer that trails the body.
const BDB_FOOTER_BYTES: usize = 12;
/// Width of a single RRV bucket descriptor inside the decompressed body.
const RRV_DESCRIPTOR_BYTES: usize = 8;
/// Width of a single Unique Name Key table entry inside the decompressed
/// body. The parser reads each entry as
/// `[text_offset: u32][name_length: u16][item_type: u8][item_class: u8][unknown: 2 bytes]`.
const UNK_ENTRY_BYTES: usize = 10;
/// Bytes skipped at the start of the UNK text region before the name
/// strings themselves begin.
const UNK_TEXT_PREAMBLE: usize = 4;
56
/// Which family of NoteIDs an RRV bucket indexes.
///
/// Data buckets cover document/data NoteIDs, while non-data buckets cover
/// design and special-note NoteIDs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RrvBucketKind {
    /// A data RRV bucket (`type 'd'` in the reference).
    Data,
    /// A non-data RRV bucket (`type 'n'`).
    NonData,
}
66
67/// One entry in the BDB: where an RRV bucket lives plus the RRV-identifier
68/// counter it starts from.
69#[derive(Debug, Clone, Copy, PartialEq, Eq)]
70pub struct RrvBucketDescriptor {
71    /// Whether this RRV bucket holds data or non-data NoteIDs.
72    pub kind: RrvBucketKind,
73    /// Byte offset of the RRV bucket within the file.
74    pub file_offset: u64,
75    /// The RRV identifier the bucket's first entry corresponds to. (The
76    /// RRV bucket header carries its own `initial_rrv_identifier` too; this
77    /// is the BDB's record of it.)
78    pub initial_rrv_identifier: u32,
79}
80
81/// Parsed Bucket Descriptor Block: the list of every RRV bucket plus the
82/// Unique Name Key table (field-name strings).
83#[derive(Debug, Clone, PartialEq, Eq)]
84pub struct BucketDescriptorBlock {
85    /// Write-count from the header. Higher = fresher (used to pick between
86    /// the primary and WAL-redundant copies).
87    pub write_count: u32,
88    /// Every RRV bucket descriptor, in file order.
89    pub rrv_buckets: Vec<RrvBucketDescriptor>,
90    /// Unique Name Key strings, indexed by `name_id` (a note item's
91    /// `name_id` indexes this vector to recover the field name, e.g.
92    /// `FirstName`, `$UpdatedBy`). Empty when the UNK text region was not
93    /// present / decodable.
94    pub unk_names: Vec<String>,
95    /// Item type byte per `name_id` (UNK entry offset 6). Parallel to
96    /// `unk_names`.
97    pub unk_types: Vec<u8>,
98    /// Item class byte per `name_id` (UNK entry offset 7): 0x03 NUMBER,
99    /// 0x04 TIME, 0x05 TEXT, 0x06 FORMULA, 0x00 NOCOMPUTE. Parallel to
100    /// `unk_names`.
101    pub unk_classes: Vec<u8>,
102}
103
104impl BucketDescriptorBlock {
105    /// Resolve a note item's `name_id` to its field-name string.
106    pub fn name(&self, name_id: u16) -> Option<&str> {
107        self.unk_names.get(name_id as usize).map(|s| s.as_str())
108    }
109
110    /// Authoritative data kind of the field with this `name_id`, from the
111    /// UNK table's class/type bytes. Returns [`FieldKind::Unknown`] when the
112    /// id is out of range.
113    pub fn field_kind(&self, name_id: u16) -> crate::item::FieldKind {
114        let i = name_id as usize;
115        let class = self.unk_classes.get(i).copied().unwrap_or(0xFF);
116        let ty = self.unk_types.get(i).copied().unwrap_or(0xFF);
117        if class == 0xFF {
118            crate::item::FieldKind::Unknown
119        } else {
120            crate::item::field_kind(class, ty)
121        }
122    }
123}
124
125impl BucketDescriptorBlock {
126    /// Parse the BDB located at `offset` (byte offset into the full file
127    /// buffer). `available_size` is the slot's declared size from
128    /// `Information2`; the header's own `size` field must not exceed it.
129    pub fn parse(file: &[u8], offset: u64, available_size: u32) -> Result<Self, NsfError> {
130        let start = offset as usize;
131        let header = file
132            .get(start..start + BDB_HEADER_BYTES)
133            .ok_or(NsfError::TooShort {
134                actual: file.len(),
135                required: start + BDB_HEADER_BYTES,
136            })?;
137
138        if header[0] != 0x01 || header[1] != 0x00 {
139            return Err(NsfError::BadSubrecordSignature {
140                kind: "bucket descriptor block",
141                expected: [0x01, 0x00],
142                observed: [header[0], header[1]],
143            });
144        }
145
146        let u16_at = |o: usize| u16::from_le_bytes([header[o], header[o + 1]]);
147        let u32_at = |o: usize| {
148            u32::from_le_bytes([header[o], header[o + 1], header[o + 2], header[o + 3]])
149        };
150
151        let compression_type = u16_at(4);
152        let uncompressed_size = u32_at(6) as usize;
153        let write_count = u32_at(10);
154        let stored_size = u32_at(14) as usize;
155        let number_of_unique_name_keys = u32_at(26) as usize;
156        let unique_name_key_text_size = u32_at(34) as usize;
157        let number_of_rrv_bucket_descriptors = u32_at(38) as usize;
158
159        if stored_size > available_size as usize {
160            return Err(NsfError::TooShort {
161                actual: available_size as usize,
162                required: stored_size,
163            });
164        }
165        if compression_type != 1 {
166            return Err(NsfError::CompressionUnsupported {
167                structure: "bucket descriptor block",
168                compression_type,
169            });
170        }
171        if stored_size < BDB_HEADER_BYTES + BDB_FOOTER_BYTES + 4 {
172            return Err(NsfError::DecompressionFailed {
173                detail: "bucket descriptor block size too small to hold a compressed body",
174            });
175        }
176
177        let body_len = stored_size - BDB_HEADER_BYTES - BDB_FOOTER_BYTES;
178        let comp_start = start + BDB_HEADER_BYTES;
179        let comp = file
180            .get(comp_start..comp_start + body_len)
181            .ok_or(NsfError::TooShort {
182                actual: file.len(),
183                required: comp_start + body_len,
184            })?;
185        // The body is a chain of length-prefixed CX segments: RRV
186        // descriptors + UNK table (segment 0), the UNK name text
187        // (segment 1), then the UNK hash table (segment 2).
188        let body = cx::decompress_chained(comp, uncompressed_size)?;
189
190        let need = number_of_rrv_bucket_descriptors * RRV_DESCRIPTOR_BYTES;
191        if body.len() < need {
192            return Err(NsfError::TooShort {
193                actual: body.len(),
194                required: need,
195            });
196        }
197
198        let mut rrv_buckets = Vec::with_capacity(number_of_rrv_bucket_descriptors);
199        for i in 0..number_of_rrv_bucket_descriptors {
200            let base = i * RRV_DESCRIPTOR_BYTES;
201            let raw = u32::from_le_bytes([
202                body[base],
203                body[base + 1],
204                body[base + 2],
205                body[base + 3],
206            ]);
207            let initial_rrv_identifier = u32::from_le_bytes([
208                body[base + 4],
209                body[base + 5],
210                body[base + 6],
211                body[base + 7],
212            ]);
213            let kind = if raw & 1 != 0 {
214                RrvBucketKind::NonData
215            } else {
216                RrvBucketKind::Data
217            };
218            let file_offset = u64::from(raw & 0xFFFF_FFFE) << 8;
219            rrv_buckets.push(RrvBucketDescriptor {
220                kind,
221                file_offset,
222                initial_rrv_identifier,
223            });
224        }
225
226        // Unique Name Key table: `name_id` -> field-name string. It follows
227        // the RRV descriptors in the decompressed body; each 10-byte entry
228        // indexes into the name-text payload that follows the table (past a
229        // 4-byte preamble). Out-of-bounds entries degrade to empty strings
230        // rather than failing the whole parse.
231        let unk_table_start = number_of_rrv_bucket_descriptors * RRV_DESCRIPTOR_BYTES;
232        let text_start = unk_table_start + number_of_unique_name_keys * UNK_ENTRY_BYTES;
233        let text_payload_start = text_start + UNK_TEXT_PREAMBLE;
234        let text_end = (text_start + unique_name_key_text_size).min(body.len());
235        let mut unk_names = Vec::with_capacity(number_of_unique_name_keys);
236        let mut unk_types = Vec::with_capacity(number_of_unique_name_keys);
237        let mut unk_classes = Vec::with_capacity(number_of_unique_name_keys);
238        let text = body.get(text_payload_start..text_end).unwrap_or(&[]);
239        for i in 0..number_of_unique_name_keys {
240            let e = unk_table_start + i * UNK_ENTRY_BYTES;
241            // Entry: [text_offset:u32][name_len:u16][item_type:1][item_class:1][unknown:2]
242            let (name, ty, class) = body
243                .get(e..e + UNK_ENTRY_BYTES)
244                .map(|d| {
245                    let off = u32::from_le_bytes([d[0], d[1], d[2], d[3]]) as usize;
246                    let len = u16::from_le_bytes([d[4], d[5]]) as usize;
247                    let name = text
248                        .get(off..off + len)
249                        .map(|s| String::from_utf8_lossy(s).into_owned())
250                        .unwrap_or_default();
251                    (name, d[6], d[7])
252                })
253                .unwrap_or_default();
254            unk_names.push(name);
255            unk_types.push(ty);
256            unk_classes.push(class);
257        }
258
259        Ok(Self {
260            write_count,
261            rrv_buckets,
262            unk_names,
263            unk_types,
264            unk_classes,
265        })
266    }
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Size of the scratch slot every test parses from.
    const SLOT_BYTES: u32 = 128;

    /// Zero-filled slot carrying a valid signature plus the given
    /// `compression_type` low byte and `stored_size` header fields.
    fn slot_with_header(compression_type: u8, stored_size: u32) -> Vec<u8> {
        let mut slot = vec![0u8; SLOT_BYTES as usize];
        slot[..2].copy_from_slice(&[0x01, 0x00]);
        slot[4] = compression_type;
        slot[14..18].copy_from_slice(&stored_size.to_le_bytes());
        slot
    }

    #[test]
    fn rejects_bad_signature() {
        let mut slot = vec![0u8; SLOT_BYTES as usize];
        slot[0] = 0xFF;
        let outcome = BucketDescriptorBlock::parse(&slot, 0, SLOT_BYTES);
        assert!(matches!(
            outcome.unwrap_err(),
            NsfError::BadSubrecordSignature {
                kind: "bucket descriptor block",
                ..
            }
        ));
    }

    #[test]
    fn rejects_unsupported_compression() {
        // compression_type = 0 (uncompressed) is unsupported; the stored
        // size is kept within the slot so that check is not what fails.
        let slot = slot_with_header(0x00, 100);
        let outcome = BucketDescriptorBlock::parse(&slot, 0, SLOT_BYTES);
        assert!(matches!(
            outcome.unwrap_err(),
            NsfError::CompressionUnsupported {
                structure: "bucket descriptor block",
                ..
            }
        ));
    }

    #[test]
    fn rejects_stored_size_over_available() {
        // compression_type = CX, but stored_size is larger than the slot.
        let slot = slot_with_header(0x01, 4096);
        let outcome = BucketDescriptorBlock::parse(&slot, 0, SLOT_BYTES);
        assert!(matches!(outcome.unwrap_err(), NsfError::TooShort { .. }));
    }
}