sherlock_nsf_parser/bdb.rs
1//! Bucket Descriptor Block (BDB) - the master index of every RRV bucket.
2//!
3//! A single RRV bucket maps only a small contiguous slice of NoteIDs. To
4//! enumerate every note in a database you must walk *all* RRV buckets, and
5//! the list of those buckets lives in the BDB. `Information2` carries two
6//! BDB (position, size) slots (a primary copy plus write-ahead-log
7//! redundancy); the freshest by `write_count` is authoritative.
8//!
9//! On-disk layout per `nsfdb_bucket_descriptor_block.h` +
10//! `libnsfdb_io_handle_read_bucket_descriptor_block`:
11//!
12//! ```text
13//! header (66 bytes)
14//! 0 2 signature (0x01 0x00)
15//! 2 2 version (0x02 0x00)
16//! 4 2 compression_type (must be 1 = CX)
17//! 6 4 uncompressed_size
18//! 10 4 write_count
19//! 14 4 size (total BDB size incl. header + body + footer)
20//! 18 8 modification_time
21//! 26 4 number_of_unique_name_keys
22//! 30 4 unknown1
23//! 34 4 unique_name_key_text_size
24//! 38 4 number_of_rrv_bucket_descriptors
25//! 42 4 number_of_unk_hash_table_entries
26//! 46 8 unknown2
27//! 54 4 checksum
28//! 58 8 unknown3
29//! body (CX-compressed; first 4 bytes of the compressed region are a
30//! prefix the decompressor skips, exactly like the superblock body)
//!   decompressed: number_of_rrv_bucket_descriptors * 8 bytes, then the
//!   Unique Name Key table (number_of_unique_name_keys * 10 bytes) and its
//!   name text; both are decoded by BucketDescriptorBlock::parse.
33//! footer (12 bytes): modification_time[8] + checksum[4]
34//! ```
35//!
36//! Each RRV bucket descriptor is 8 bytes: `file_offset[4]` (in 256-byte
37//! units after clearing the type flag) + `initial_rrv_identifier[4]`. The
38//! low bit of `file_offset` is the bucket-type flag: set => non-data,
39//! clear => data. The flag is cleared and the value shifted left 8 to get
40//! the byte offset.
41
42use crate::cx;
43use crate::error::NsfError;
44
/// Size in bytes of the fixed BDB header that precedes the compressed body.
const BDB_HEADER_BYTES: usize = 66;
/// Size in bytes of the BDB footer: `modification_time[8]` + `checksum[4]`.
const BDB_FOOTER_BYTES: usize = 8 + 4;
/// Size in bytes of one RRV bucket descriptor in the decompressed body:
/// `file_offset[4]` + `initial_rrv_identifier[4]`.
const RRV_DESCRIPTOR_BYTES: usize = 4 + 4;
/// Size in bytes of one Unique Name Key table entry in the decompressed
/// body:
/// `[text_offset: u32][name_length: u16][item_type: u8][item_class: u8][unknown: 2 bytes]`.
const UNK_ENTRY_BYTES: usize = 4 + 2 + 1 + 1 + 2;
/// Bytes of preamble between the end of the UNK table and the start of the
/// name-text payload that UNK `text_offset` values index into.
const UNK_TEXT_PREAMBLE: usize = 4;
56
/// Kind of an RRV bucket, taken from the low bit of the descriptor's raw
/// `file_offset` word. Data buckets hold document/data NoteIDs; non-data
/// buckets hold design and special-note NoteIDs.
///
/// `Hash` is derived alongside `Eq` so the kind can be used directly as a
/// `HashMap`/`HashSet` key (e.g. to group buckets by kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RrvBucketKind {
    /// Data RRV bucket (`type 'd'` in the reference); flag bit clear.
    Data,
    /// Non-data RRV bucket (`type 'n'`); flag bit set.
    NonData,
}
66
67/// One entry in the BDB: where an RRV bucket lives plus the RRV-identifier
68/// counter it starts from.
69#[derive(Debug, Clone, Copy, PartialEq, Eq)]
70pub struct RrvBucketDescriptor {
71 /// Whether this RRV bucket holds data or non-data NoteIDs.
72 pub kind: RrvBucketKind,
73 /// Byte offset of the RRV bucket within the file.
74 pub file_offset: u64,
75 /// The RRV identifier the bucket's first entry corresponds to. (The
76 /// RRV bucket header carries its own `initial_rrv_identifier` too; this
77 /// is the BDB's record of it.)
78 pub initial_rrv_identifier: u32,
79}
80
81/// Parsed Bucket Descriptor Block: the list of every RRV bucket plus the
82/// Unique Name Key table (field-name strings).
83#[derive(Debug, Clone, PartialEq, Eq)]
84pub struct BucketDescriptorBlock {
85 /// Write-count from the header. Higher = fresher (used to pick between
86 /// the primary and WAL-redundant copies).
87 pub write_count: u32,
88 /// Every RRV bucket descriptor, in file order.
89 pub rrv_buckets: Vec<RrvBucketDescriptor>,
90 /// Unique Name Key strings, indexed by `name_id` (a note item's
91 /// `name_id` indexes this vector to recover the field name, e.g.
92 /// `FirstName`, `$UpdatedBy`). Empty when the UNK text region was not
93 /// present / decodable.
94 pub unk_names: Vec<String>,
95 /// Item type byte per `name_id` (UNK entry offset 6). Parallel to
96 /// `unk_names`.
97 pub unk_types: Vec<u8>,
98 /// Item class byte per `name_id` (UNK entry offset 7): 0x03 NUMBER,
99 /// 0x04 TIME, 0x05 TEXT, 0x06 FORMULA, 0x00 NOCOMPUTE. Parallel to
100 /// `unk_names`.
101 pub unk_classes: Vec<u8>,
102}
103
104impl BucketDescriptorBlock {
105 /// Resolve a note item's `name_id` to its field-name string.
106 pub fn name(&self, name_id: u16) -> Option<&str> {
107 self.unk_names.get(name_id as usize).map(|s| s.as_str())
108 }
109
110 /// Authoritative data kind of the field with this `name_id`, from the
111 /// UNK table's class/type bytes. Returns [`FieldKind::Unknown`] when the
112 /// id is out of range.
113 pub fn field_kind(&self, name_id: u16) -> crate::item::FieldKind {
114 let i = name_id as usize;
115 let class = self.unk_classes.get(i).copied().unwrap_or(0xFF);
116 let ty = self.unk_types.get(i).copied().unwrap_or(0xFF);
117 if class == 0xFF {
118 crate::item::FieldKind::Unknown
119 } else {
120 crate::item::field_kind(class, ty)
121 }
122 }
123}
124
125impl BucketDescriptorBlock {
126 /// Parse the BDB located at `offset` (byte offset into the full file
127 /// buffer). `available_size` is the slot's declared size from
128 /// `Information2`; the header's own `size` field must not exceed it.
129 pub fn parse(file: &[u8], offset: u64, available_size: u32) -> Result<Self, NsfError> {
130 let start = offset as usize;
131 let header = file
132 .get(start..start + BDB_HEADER_BYTES)
133 .ok_or(NsfError::TooShort {
134 actual: file.len(),
135 required: start + BDB_HEADER_BYTES,
136 })?;
137
138 if header[0] != 0x01 || header[1] != 0x00 {
139 return Err(NsfError::BadSubrecordSignature {
140 kind: "bucket descriptor block",
141 expected: [0x01, 0x00],
142 observed: [header[0], header[1]],
143 });
144 }
145
146 let u16_at = |o: usize| u16::from_le_bytes([header[o], header[o + 1]]);
147 let u32_at = |o: usize| {
148 u32::from_le_bytes([header[o], header[o + 1], header[o + 2], header[o + 3]])
149 };
150
151 let compression_type = u16_at(4);
152 let uncompressed_size = u32_at(6) as usize;
153 let write_count = u32_at(10);
154 let stored_size = u32_at(14) as usize;
155 let number_of_unique_name_keys = u32_at(26) as usize;
156 let unique_name_key_text_size = u32_at(34) as usize;
157 let number_of_rrv_bucket_descriptors = u32_at(38) as usize;
158
159 if stored_size > available_size as usize {
160 return Err(NsfError::TooShort {
161 actual: available_size as usize,
162 required: stored_size,
163 });
164 }
165 if compression_type != 1 {
166 return Err(NsfError::CompressionUnsupported {
167 structure: "bucket descriptor block",
168 compression_type,
169 });
170 }
171 if stored_size < BDB_HEADER_BYTES + BDB_FOOTER_BYTES + 4 {
172 return Err(NsfError::DecompressionFailed {
173 detail: "bucket descriptor block size too small to hold a compressed body",
174 });
175 }
176
177 let body_len = stored_size - BDB_HEADER_BYTES - BDB_FOOTER_BYTES;
178 let comp_start = start + BDB_HEADER_BYTES;
179 let comp = file
180 .get(comp_start..comp_start + body_len)
181 .ok_or(NsfError::TooShort {
182 actual: file.len(),
183 required: comp_start + body_len,
184 })?;
185 // The body is a chain of length-prefixed CX segments: RRV
186 // descriptors + UNK table (segment 0), the UNK name text
187 // (segment 1), then the UNK hash table (segment 2).
188 let body = cx::decompress_chained(comp, uncompressed_size)?;
189
190 let need = number_of_rrv_bucket_descriptors * RRV_DESCRIPTOR_BYTES;
191 if body.len() < need {
192 return Err(NsfError::TooShort {
193 actual: body.len(),
194 required: need,
195 });
196 }
197
198 let mut rrv_buckets = Vec::with_capacity(number_of_rrv_bucket_descriptors);
199 for i in 0..number_of_rrv_bucket_descriptors {
200 let base = i * RRV_DESCRIPTOR_BYTES;
201 let raw = u32::from_le_bytes([
202 body[base],
203 body[base + 1],
204 body[base + 2],
205 body[base + 3],
206 ]);
207 let initial_rrv_identifier = u32::from_le_bytes([
208 body[base + 4],
209 body[base + 5],
210 body[base + 6],
211 body[base + 7],
212 ]);
213 let kind = if raw & 1 != 0 {
214 RrvBucketKind::NonData
215 } else {
216 RrvBucketKind::Data
217 };
218 let file_offset = u64::from(raw & 0xFFFF_FFFE) << 8;
219 rrv_buckets.push(RrvBucketDescriptor {
220 kind,
221 file_offset,
222 initial_rrv_identifier,
223 });
224 }
225
226 // Unique Name Key table: `name_id` -> field-name string. It follows
227 // the RRV descriptors in the decompressed body; each 10-byte entry
228 // indexes into the name-text payload that follows the table (past a
229 // 4-byte preamble). Out-of-bounds entries degrade to empty strings
230 // rather than failing the whole parse.
231 let unk_table_start = number_of_rrv_bucket_descriptors * RRV_DESCRIPTOR_BYTES;
232 let text_start = unk_table_start + number_of_unique_name_keys * UNK_ENTRY_BYTES;
233 let text_payload_start = text_start + UNK_TEXT_PREAMBLE;
234 let text_end = (text_start + unique_name_key_text_size).min(body.len());
235 let mut unk_names = Vec::with_capacity(number_of_unique_name_keys);
236 let mut unk_types = Vec::with_capacity(number_of_unique_name_keys);
237 let mut unk_classes = Vec::with_capacity(number_of_unique_name_keys);
238 let text = body.get(text_payload_start..text_end).unwrap_or(&[]);
239 for i in 0..number_of_unique_name_keys {
240 let e = unk_table_start + i * UNK_ENTRY_BYTES;
241 // Entry: [text_offset:u32][name_len:u16][item_type:1][item_class:1][unknown:2]
242 let (name, ty, class) = body
243 .get(e..e + UNK_ENTRY_BYTES)
244 .map(|d| {
245 let off = u32::from_le_bytes([d[0], d[1], d[2], d[3]]) as usize;
246 let len = u16::from_le_bytes([d[4], d[5]]) as usize;
247 let name = text
248 .get(off..off + len)
249 .map(|s| String::from_utf8_lossy(s).into_owned())
250 .unwrap_or_default();
251 (name, d[6], d[7])
252 })
253 .unwrap_or_default();
254 unk_names.push(name);
255 unk_types.push(ty);
256 unk_classes.push(class);
257 }
258
259 Ok(Self {
260 write_count,
261 rrv_buckets,
262 unk_names,
263 unk_types,
264 unk_classes,
265 })
266 }
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Length of the scratch buffer; also passed to `parse` as the slot's
    /// `available_size`.
    const SLOT_LEN: u32 = 128;

    /// Zero-filled slot of `SLOT_LEN` bytes.
    fn blank_slot() -> Vec<u8> {
        vec![0u8; SLOT_LEN as usize]
    }

    /// Zero-filled slot carrying a valid signature, the given low byte of
    /// `compression_type`, and the given header `size` field.
    fn slot_with(compression_type: u8, stored_size: u32) -> Vec<u8> {
        let mut slot = blank_slot();
        slot[0] = 0x01;
        slot[1] = 0x00;
        slot[4] = compression_type;
        slot[14..18].copy_from_slice(&stored_size.to_le_bytes());
        slot
    }

    /// Parse `slot` at offset 0 and return the error it must produce.
    fn parse_err(slot: &[u8]) -> NsfError {
        BucketDescriptorBlock::parse(slot, 0, SLOT_LEN).unwrap_err()
    }

    #[test]
    fn rejects_bad_signature() {
        let mut slot = blank_slot();
        slot[0] = 0xFF;
        assert!(matches!(
            parse_err(&slot),
            NsfError::BadSubrecordSignature {
                kind: "bucket descriptor block",
                ..
            }
        ));
    }

    #[test]
    fn rejects_unsupported_compression() {
        // compression_type = 0 (uncompressed) is unsupported; the stored
        // size fits the slot so the size check does not fire first.
        let slot = slot_with(0x00, 100);
        assert!(matches!(
            parse_err(&slot),
            NsfError::CompressionUnsupported {
                structure: "bucket descriptor block",
                ..
            }
        ));
    }

    #[test]
    fn rejects_stored_size_over_available() {
        // compression_type = CX, but the header claims more bytes than the
        // slot provides.
        let slot = slot_with(0x01, 4096);
        assert!(matches!(parse_err(&slot), NsfError::TooShort { .. }));
    }
}
315}