sherlock_nsf_parser/rrv.rs
1//! Record Relocation Vector (RRV) bucket parsing.
2//!
3//! An RRV bucket is a special bucket type that maps NoteIDs to physical
4//! locations - either a file offset (for legacy / overflow records) or
5//! a (bucket_index, slot_index, nonsum) triple (for records stored
6//! inside another bucket). The DBINFO carries `data_rrv_bucket_position`
7//! (file offset in 256-byte units) as the entry point for note
8//! enumeration.
9//!
10//! RRV bucket layout per `libnsfdb/nsfdb_rrv_bucket.h`:
11//!
12//! ```text
13//! offset width field
14//! 0 1 signature (0x06)
15//! 1 1 header_size (0x20 = 32)
16//! 2 4 unknown1
17//! 6 4 initial_rrv_identifier
18//! 10 6 unknown2
19//! 16 2 unknown_size
20//! 18 4 checksum
21//! 22 10 unknown3
22//! ```
23//!
24//! Header is exactly 32 bytes. After the header, RRV entries follow as
25//! a sequence of 8-byte records. Each entry has the layout:
26//!
27//! ```text
28//! 0 4 rrv_entry (u32 LE; high bit selects variant)
29//! 4 4 rrv_entry_bsid (u32 LE; only used in bucket-slot variant)
30//! ```
31//!
32//! Modern ODS bit layout (reverse-engineered from
33//! `libnsfdb_rrv_bucket.c::libnsfdb_rrv_bucket_read`):
34//!
35//! - If `rrv_entry & 0x80000000 == 0` -> file-position variant.
36//! `rrv_entry` is the file position in 256-byte units (multiply by
37//! 256 to get byte offset). Sentinels 0 and 0x7FFFFFFF mark empty
38//! slots.
39//! - If `rrv_entry & 0x80000000 != 0` -> bucket-slot variant.
40//! - `bucket_index` = `rrv_entry & 0x00FFFFFF`
41//! - `nonsum_high` = `(rrv_entry >> 7) & 0x00E00000`
42//! - `slot_index` = `rrv_entry_bsid & 0x000007FF`
43//! - `nonsum_low` = `rrv_entry_bsid >> 11`
44//! - `nonsum` = `nonsum_high | nonsum_low`
45//! - Sentinels: `bucket_index == 0` or `bucket_index == 0x00FFFFFF`
46//! mark empty slots.
47//!
48//! The rrv_identifier counter increments by 4 per entry, starting at
49//! `initial_rrv_identifier`. The identifier IS the NoteID (with the
50//! low 2 bits acting as flag bits per the spec; we expose the raw
51//! identifier and let callers mask).
52
53use crate::error::NsfError;
54
/// Signature byte found at offset 0 of every RRV bucket.
pub const RRV_BUCKET_SIGNATURE: u8 = 0x06;
/// On-disk size of the RRV bucket header in bytes. Written in hex so it
/// reads the same as the `header_size` byte value (`0x20`) it must match,
/// and as the libnsfdb compile-time assertion.
pub const RRV_BUCKET_HEADER_BYTES: usize = 0x20;
/// Size of one RRV entry in bytes: two consecutive `u32` words.
pub const RRV_ENTRY_BYTES: usize = 2 * std::mem::size_of::<u32>();

/// Alternate empty-slot marker for the file-position variant: every bit
/// below the variant-select bit (bit 31) is set, i.e. `0x7FFF_FFFF`.
/// A value of zero is the other empty marker.
const FILE_POSITION_EMPTY_ALT: u32 = u32::MAX >> 1;
/// Alternate empty-slot marker for the bucket-slot variant: all 24
/// bucket-index bits are set, i.e. `0x00FF_FFFF`. A bucket index of zero
/// is the other empty marker.
const BUCKET_INDEX_EMPTY_ALT: u32 = (1 << 24) - 1;
67
68/// Parsed RRV bucket header.
69#[derive(Debug, Clone, Copy, PartialEq, Eq)]
70pub struct RrvBucketHeader {
71 /// Header size byte (always 0x20 for modern format).
72 pub header_size: u8,
73 /// Starting RRV identifier. Each entry's identifier is computed as
74 /// `initial_rrv_identifier + (entry_index * 4)`.
75 pub initial_rrv_identifier: u32,
76 /// Stored XOR-32 checksum.
77 pub checksum: u32,
78}
79
80impl RrvBucketHeader {
81 /// Parse from the first 32 bytes of an RRV bucket. Errors on bad
82 /// signature, bad header size, or short input.
83 pub fn parse(bytes: &[u8]) -> Result<Self, NsfError> {
84 if bytes.len() < RRV_BUCKET_HEADER_BYTES {
85 return Err(NsfError::TooShort {
86 actual: bytes.len(),
87 required: RRV_BUCKET_HEADER_BYTES,
88 });
89 }
90 if bytes[0] != RRV_BUCKET_SIGNATURE {
91 return Err(NsfError::BadFileSignature {
92 observed: [bytes[0], 0],
93 });
94 }
95 if bytes[1] != 0x20 {
96 return Err(NsfError::BadHeaderSize {
97 size: bytes[1] as u32,
98 });
99 }
100 let initial_rrv_identifier =
101 u32::from_le_bytes([bytes[6], bytes[7], bytes[8], bytes[9]]);
102 let checksum = u32::from_le_bytes([bytes[18], bytes[19], bytes[20], bytes[21]]);
103 Ok(Self {
104 header_size: bytes[1],
105 initial_rrv_identifier,
106 checksum,
107 })
108 }
109}
110
/// Physical location that an RRV entry resolves to.
///
/// Empty slots have no representation here: they are dropped during
/// iteration, so consumers only ever see populated locations.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RrvLocation {
    /// The record is stored inline in another bucket, at a given slot.
    BucketSlot {
        /// Index of the bucket that holds the record (resolved via the
        /// Bucket Descriptor Table).
        bucket_index: u32,
        /// Slot within that bucket.
        slot_index: u16,
        /// Non-summary identifier; references additional data outside
        /// the bucket for records that exceed the bucket-slot capacity.
        nonsum: u32,
    },
    /// The record is stored at a direct file position, expressed in
    /// 256-byte units.
    FilePosition {
        /// Position in 256-byte units; see [`RrvLocation::file_byte_offset`]
        /// for the byte offset.
        file_position_pages: u32,
    },
}

impl RrvLocation {
    /// Byte offset into the file for a `FilePosition` location.
    ///
    /// Returns `None` for `BucketSlot`, which has no direct file offset.
    /// The result is widened to `u64`, so the multiplication cannot
    /// overflow for any `u32` position.
    pub fn file_byte_offset(&self) -> Option<u64> {
        // Size in bytes of one file-position unit.
        const UNIT_BYTES: u64 = 256;

        match *self {
            Self::FilePosition {
                file_position_pages,
            } => Some(UNIT_BYTES * u64::from(file_position_pages)),
            Self::BucketSlot { .. } => None,
        }
    }
}
146
/// One populated RRV entry: its RRV identifier plus the location the
/// record resolves to.
///
/// The identifier is exposed raw. Per the module-level notes it serves
/// as the NoteID, with the low 2 bits acting as flag bits; callers that
/// need the bare NoteID must mask those bits themselves.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RrvEntry {
    /// Raw RRV identifier, equal to
    /// `initial_rrv_identifier + (entry_index * 4)` where `entry_index`
    /// counts every slot in the bucket, including skipped empty ones.
    pub rrv_identifier: u32,
    /// Where the record lives.
    pub location: RrvLocation,
}
158
159/// Iterate non-empty RRV entries from a buffer that holds the bucket
160/// header + the full sequence of entries (i.e. the entire RRV bucket).
161///
162/// Empty slots are skipped silently. Trailing bytes shorter than 8 are
163/// ignored. The iterator owns the (parsed) header and a slice cursor
164/// over the entry region.
165pub struct RrvIter<'a> {
166 /// Current rrv_identifier value. Advances by 4 every entry.
167 next_rrv_identifier: u32,
168 /// Remaining entry bytes.
169 remaining: &'a [u8],
170}
171
172impl<'a> RrvIter<'a> {
173 /// Build an iterator from a full RRV-bucket buffer (header + entries).
174 pub fn new(bucket: &'a [u8]) -> Result<(RrvBucketHeader, Self), NsfError> {
175 let header = RrvBucketHeader::parse(bucket)?;
176 let entry_data = &bucket[RRV_BUCKET_HEADER_BYTES..];
177 Ok((
178 header,
179 Self {
180 next_rrv_identifier: header.initial_rrv_identifier,
181 remaining: entry_data,
182 },
183 ))
184 }
185}
186
187impl<'a> Iterator for RrvIter<'a> {
188 type Item = RrvEntry;
189
190 fn next(&mut self) -> Option<Self::Item> {
191 while self.remaining.len() >= RRV_ENTRY_BYTES {
192 let rrv_entry = u32::from_le_bytes([
193 self.remaining[0],
194 self.remaining[1],
195 self.remaining[2],
196 self.remaining[3],
197 ]);
198 let rrv_entry_bsid = u32::from_le_bytes([
199 self.remaining[4],
200 self.remaining[5],
201 self.remaining[6],
202 self.remaining[7],
203 ]);
204 self.remaining = &self.remaining[RRV_ENTRY_BYTES..];
205 let identifier = self.next_rrv_identifier;
206 self.next_rrv_identifier = self.next_rrv_identifier.wrapping_add(4);
207
208 // Bit 31 selects variant.
209 if (rrv_entry & 0x8000_0000) == 0 {
210 // File-position variant. Skip empty markers.
211 if rrv_entry == 0 || rrv_entry == FILE_POSITION_EMPTY_ALT {
212 continue;
213 }
214 return Some(RrvEntry {
215 rrv_identifier: identifier,
216 location: RrvLocation::FilePosition {
217 file_position_pages: rrv_entry,
218 },
219 });
220 } else {
221 // Bucket-slot variant.
222 let bucket_index = rrv_entry & 0x00FF_FFFF;
223 if bucket_index == 0 || bucket_index == BUCKET_INDEX_EMPTY_ALT {
224 continue;
225 }
226 let nonsum_high = (rrv_entry >> 7) & 0x00E0_0000;
227 let slot_index = (rrv_entry_bsid & 0x0000_07FF) as u16;
228 let nonsum_low = rrv_entry_bsid >> 11;
229 let nonsum = nonsum_high | nonsum_low;
230 return Some(RrvEntry {
231 rrv_identifier: identifier,
232 location: RrvLocation::BucketSlot {
233 bucket_index,
234 slot_index,
235 nonsum,
236 },
237 });
238 }
239 }
240 None
241 }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal valid RRV bucket: a 32-byte header with the
    /// expected signature and header size, `initial_rrv_identifier = 100`,
    /// followed by one 8-byte entry per `(rrv_entry, rrv_entry_bsid)` pair.
    fn synthetic_bucket_with_entries(entries: &[(u32, u32)]) -> Vec<u8> {
        let mut buf = vec![0u8; RRV_BUCKET_HEADER_BYTES + entries.len() * RRV_ENTRY_BYTES];
        buf[0] = RRV_BUCKET_SIGNATURE;
        buf[1] = 0x20;
        // initial_rrv_identifier = 100 (arbitrary)
        buf[6..10].copy_from_slice(&100u32.to_le_bytes());
        for (i, (a, b)) in entries.iter().enumerate() {
            let off = RRV_BUCKET_HEADER_BYTES + i * RRV_ENTRY_BYTES;
            buf[off..off + 4].copy_from_slice(&a.to_le_bytes());
            buf[off + 4..off + 8].copy_from_slice(&b.to_le_bytes());
        }
        buf
    }

    #[test]
    fn parses_header_signature() {
        let buf = synthetic_bucket_with_entries(&[]);
        let h = RrvBucketHeader::parse(&buf).unwrap();
        assert_eq!(h.header_size, 0x20);
        assert_eq!(h.initial_rrv_identifier, 100);
    }

    #[test]
    fn skips_zero_and_alternate_empty_markers_in_file_position_variant() {
        let buf = synthetic_bucket_with_entries(&[
            (0, 0),
            (FILE_POSITION_EMPTY_ALT, 0),
            (0x0000_2AF0, 0),
        ]);
        let (_, iter) = RrvIter::new(&buf).unwrap();
        let entries: Vec<_> = iter.collect();
        // Only the third entry should survive; identifier = 100 + 8 = 108.
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].rrv_identifier, 108);
        assert!(matches!(
            entries[0].location,
            RrvLocation::FilePosition {
                file_position_pages: 0x0000_2AF0
            }
        ));
        assert_eq!(
            entries[0].location.file_byte_offset(),
            Some(0x0000_2AF0 * 256)
        );
    }

    #[test]
    fn parses_bucket_slot_variant_bit_layout() {
        // Bucket-slot entry with bucket_index = 0x123456 and
        // slot_index = 0x1AB (valid range 0..=2047). The nonsum bits in
        // `rrv_entry` are left at zero here, so the decoded nonsum must
        // equal nonsum_low; the high bits are covered by
        // `recombines_nonsum_high_bits_from_rrv_entry`.
        let bucket_index: u32 = 0x0012_3456;
        let slot_index: u32 = 0x01AB;
        let rrv_entry: u32 = 0x8000_0000 | bucket_index;
        // rrv_entry_bsid: slot_index in the low 11 bits, nonsum_low in
        // the remaining bits.
        let nonsum_low: u32 = 0x0001_2345; // arbitrary
        let rrv_entry_bsid: u32 = (nonsum_low << 11) | slot_index;
        let buf = synthetic_bucket_with_entries(&[(rrv_entry, rrv_entry_bsid)]);
        let (_, iter) = RrvIter::new(&buf).unwrap();
        let entries: Vec<_> = iter.collect();
        assert_eq!(entries.len(), 1);
        match entries[0].location {
            RrvLocation::BucketSlot {
                bucket_index: b,
                slot_index: s,
                nonsum,
            } => {
                assert_eq!(b, bucket_index);
                assert_eq!(s, slot_index as u16);
                // nonsum_high is 0 in this test, so nonsum == nonsum_low.
                assert_eq!(nonsum, nonsum_low);
            }
            other => panic!("expected BucketSlot, got {other:?}"),
        }
    }

    #[test]
    fn recombines_nonsum_high_bits_from_rrv_entry() {
        // Bits 28..=30 of rrv_entry carry the top three nonsum bits and
        // must land in bits 21..=23 of the decoded nonsum.
        let bucket_index: u32 = 0x0012_3456;
        let nonsum_high_bits: u32 = 0b101;
        let rrv_entry: u32 = 0x8000_0000 | (nonsum_high_bits << 28) | bucket_index;
        let slot_index: u32 = 0x07FF; // largest 11-bit slot
        let nonsum_low: u32 = 0x42;
        let rrv_entry_bsid: u32 = (nonsum_low << 11) | slot_index;
        let buf = synthetic_bucket_with_entries(&[(rrv_entry, rrv_entry_bsid)]);
        let (_, iter) = RrvIter::new(&buf).unwrap();
        let entries: Vec<_> = iter.collect();
        assert_eq!(
            entries,
            vec![RrvEntry {
                rrv_identifier: 100,
                location: RrvLocation::BucketSlot {
                    bucket_index,
                    slot_index: 0x07FF,
                    nonsum: 0x00A0_0042,
                },
            }]
        );
    }

    #[test]
    fn skips_empty_markers_in_bucket_slot_variant() {
        let buf = synthetic_bucket_with_entries(&[
            (0x8000_0000, 0),
            (0x8000_0000 | BUCKET_INDEX_EMPTY_ALT, 0),
            (0x8000_0001, 0x0000_0002),
        ]);
        let (_, iter) = RrvIter::new(&buf).unwrap();
        let entries: Vec<_> = iter.collect();
        // Skipped slots still advance the identifier: 100 + 2 * 4 = 108.
        assert_eq!(
            entries,
            vec![RrvEntry {
                rrv_identifier: 108,
                location: RrvLocation::BucketSlot {
                    bucket_index: 1,
                    slot_index: 2,
                    nonsum: 0,
                },
            }]
        );
    }

    #[test]
    fn ignores_trailing_partial_entry() {
        let mut buf = synthetic_bucket_with_entries(&[(0x0000_0001, 0)]);
        // Seven stray bytes: one short of a full entry.
        buf.extend_from_slice(&[0xFF; RRV_ENTRY_BYTES - 1]);
        let (_, iter) = RrvIter::new(&buf).unwrap();
        assert_eq!(iter.count(), 1);
    }

    #[test]
    fn rejects_short_header() {
        let buf = synthetic_bucket_with_entries(&[]);
        let truncated = &buf[..RRV_BUCKET_HEADER_BYTES - 1];
        assert!(RrvBucketHeader::parse(truncated).is_err());
        assert!(RrvIter::new(truncated).is_err());
    }

    #[test]
    fn rejects_bad_rrv_signature() {
        let mut buf = synthetic_bucket_with_entries(&[]);
        buf[0] = 0x55;
        assert!(RrvBucketHeader::parse(&buf).is_err());
    }

    #[test]
    fn rejects_bad_rrv_header_size() {
        let mut buf = synthetic_bucket_with_entries(&[]);
        buf[1] = 0x30;
        assert!(RrvBucketHeader::parse(&buf).is_err());
    }
}