Skip to main content

sherlock_nsf_parser/
rrv.rs

1//! Record Relocation Vector (RRV) bucket parsing.
2//!
3//! An RRV bucket is a special bucket type that maps NoteIDs to physical
4//! locations - either a file offset (for legacy / overflow records) or
5//! a (bucket_index, slot_index, nonsum) triple (for records stored
6//! inside another bucket). The DBINFO carries `data_rrv_bucket_position`
7//! (file offset in 256-byte units) as the entry point for note
8//! enumeration.
9//!
10//! RRV bucket layout per `libnsfdb/nsfdb_rrv_bucket.h`:
11//!
12//! ```text
13//! offset  width  field
14//!     0      1   signature (0x06)
15//!     1      1   header_size (0x20 = 32)
16//!     2      4   unknown1
17//!     6      4   initial_rrv_identifier
18//!    10      6   unknown2
19//!    16      2   unknown_size
20//!    18      4   checksum
21//!    22     10   unknown3
22//! ```
23//!
24//! Header is exactly 32 bytes. After the header, RRV entries follow as
25//! a sequence of 8-byte records. Each entry has the layout:
26//!
27//! ```text
28//!     0      4   rrv_entry        (u32 LE; high bit selects variant)
29//!     4      4   rrv_entry_bsid   (u32 LE; only used in bucket-slot variant)
30//! ```
31//!
32//! Modern ODS bit layout (reverse-engineered from
33//! `libnsfdb_rrv_bucket.c::libnsfdb_rrv_bucket_read`):
34//!
35//! - If `rrv_entry & 0x80000000 == 0` -> file-position variant.
36//!   `rrv_entry` is the file position in 256-byte units (multiply by
37//!   256 to get byte offset). Sentinels 0 and 0x7FFFFFFF mark empty
38//!   slots.
39//! - If `rrv_entry & 0x80000000 != 0` -> bucket-slot variant.
40//!   - `bucket_index` = `rrv_entry & 0x00FFFFFF`
41//!   - `nonsum_high`  = `(rrv_entry >> 7) & 0x00E00000`
42//!   - `slot_index`   = `rrv_entry_bsid & 0x000007FF`
43//!   - `nonsum_low`   = `rrv_entry_bsid >> 11`
44//!   - `nonsum`       = `nonsum_high | nonsum_low`
45//!   - Sentinels: `bucket_index == 0` or `bucket_index == 0x00FFFFFF`
46//!     mark empty slots.
47//!
48//! The rrv_identifier counter increments by 4 per entry, starting at
49//! `initial_rrv_identifier`. The identifier IS the NoteID (with the
50//! low 2 bits acting as flag bits per the spec; we expose the raw
51//! identifier and let callers mask).
52
53use crate::error::NsfError;
54
/// Signature byte stored at offset 0 of every RRV bucket.
pub const RRV_BUCKET_SIGNATURE: u8 = 6;
/// Size in bytes of the on-disk RRV bucket header. Equal to the
/// `header_size` byte value `0x20` (and to the libnsfdb compile-time
/// assertion on the header struct).
pub const RRV_BUCKET_HEADER_BYTES: usize = 0x20;
/// Size in bytes of one RRV entry: `rrv_entry` followed by
/// `rrv_entry_bsid`, four bytes each.
pub const RRV_ENTRY_BYTES: usize = 4 + 4;

/// Non-zero empty-slot marker of the file-position variant
/// (`0x7FFF_FFFF`, i.e. every bit below the variant bit set).
const FILE_POSITION_EMPTY_ALT: u32 = u32::MAX >> 1;
/// Non-zero empty-slot marker of the bucket-slot variant
/// (`0x00FF_FFFF`, i.e. every bit of the 24-bit bucket index set).
const BUCKET_INDEX_EMPTY_ALT: u32 = (1 << 24) - 1;
67
/// The fields this module keeps from the fixed-size RRV bucket header.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RrvBucketHeader {
    /// Value of the header-size byte (`0x20` for the modern format).
    pub header_size: u8,
    /// RRV identifier of the first entry in the bucket. The entry at
    /// index `n` has identifier `initial_rrv_identifier + n * 4`.
    pub initial_rrv_identifier: u32,
    /// XOR-32 checksum exactly as stored in the header.
    pub checksum: u32,
}
79
80impl RrvBucketHeader {
81    /// Parse from the first 32 bytes of an RRV bucket. Errors on bad
82    /// signature, bad header size, or short input.
83    pub fn parse(bytes: &[u8]) -> Result<Self, NsfError> {
84        if bytes.len() < RRV_BUCKET_HEADER_BYTES {
85            return Err(NsfError::TooShort {
86                actual: bytes.len(),
87                required: RRV_BUCKET_HEADER_BYTES,
88            });
89        }
90        if bytes[0] != RRV_BUCKET_SIGNATURE {
91            return Err(NsfError::BadFileSignature {
92                observed: [bytes[0], 0],
93            });
94        }
95        if bytes[1] != 0x20 {
96            return Err(NsfError::BadHeaderSize {
97                size: bytes[1] as u32,
98            });
99        }
100        let initial_rrv_identifier =
101            u32::from_le_bytes([bytes[6], bytes[7], bytes[8], bytes[9]]);
102        let checksum = u32::from_le_bytes([bytes[18], bytes[19], bytes[20], bytes[21]]);
103        Ok(Self {
104            header_size: bytes[1],
105            initial_rrv_identifier,
106            checksum,
107        })
108    }
109}
110
/// Where the record behind an RRV entry is stored.
///
/// Empty slots have no representation here: they are skipped during
/// iteration, so consumers only ever receive populated locations.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RrvLocation {
    /// The record is stored inline, in a slot of another bucket.
    BucketSlot {
        /// Index of the bucket that holds the record (resolved via the
        /// Bucket Descriptor Table).
        bucket_index: u32,
        /// Slot number inside that bucket.
        slot_index: u16,
        /// Non-summary identifier; references additional data kept
        /// outside the bucket for records that exceed the bucket-slot
        /// capacity.
        nonsum: u32,
    },
    /// The record is stored at a direct file position, expressed in
    /// 256-byte units.
    FilePosition {
        /// File position in 256-byte units; multiply by 256 to obtain
        /// the byte offset.
        file_position_pages: u32,
    },
}
133
134impl RrvLocation {
135    /// Byte offset into the file for the file-position variant; None
136    /// for bucket-slot.
137    pub fn file_byte_offset(&self) -> Option<u64> {
138        match self {
139            Self::FilePosition {
140                file_position_pages,
141            } => Some(u64::from(*file_position_pages) * 256),
142            _ => None,
143        }
144    }
145}
146
147/// One parsed RRV entry: the RRV identifier (effectively the NoteID
148/// without flag bits) plus the resolved location.
149#[derive(Debug, Clone, Copy, PartialEq, Eq)]
150pub struct RrvEntry {
151    /// RRV identifier. Derived from
152    /// `initial_rrv_identifier + (entry_index * 4)` during iteration;
153    /// the low 2 bits act as flag bits per the spec.
154    pub rrv_identifier: u32,
155    /// Where the record lives.
156    pub location: RrvLocation,
157}
158
/// Iterator over the populated entries of one RRV bucket.
///
/// It is built from a buffer holding the whole bucket (header followed
/// by the entry sequence). Empty slots are skipped silently and a
/// trailing fragment shorter than one 8-byte entry is ignored. The
/// iterator itself stores only the running identifier and a cursor
/// over the not-yet-consumed entry bytes; the parsed header is handed
/// back separately by [`RrvIter::new`].
pub struct RrvIter<'a> {
    /// Identifier that belongs to the next slot to be read. It moves
    /// forward by 4 for every slot, whether the slot is empty or not.
    next_rrv_identifier: u32,
    /// Entry bytes that have not been consumed yet.
    remaining: &'a [u8],
}
171
172impl<'a> RrvIter<'a> {
173    /// Build an iterator from a full RRV-bucket buffer (header + entries).
174    pub fn new(bucket: &'a [u8]) -> Result<(RrvBucketHeader, Self), NsfError> {
175        let header = RrvBucketHeader::parse(bucket)?;
176        let entry_data = &bucket[RRV_BUCKET_HEADER_BYTES..];
177        Ok((
178            header,
179            Self {
180                next_rrv_identifier: header.initial_rrv_identifier,
181                remaining: entry_data,
182            },
183        ))
184    }
185}
186
187impl<'a> Iterator for RrvIter<'a> {
188    type Item = RrvEntry;
189
190    fn next(&mut self) -> Option<Self::Item> {
191        while self.remaining.len() >= RRV_ENTRY_BYTES {
192            let rrv_entry = u32::from_le_bytes([
193                self.remaining[0],
194                self.remaining[1],
195                self.remaining[2],
196                self.remaining[3],
197            ]);
198            let rrv_entry_bsid = u32::from_le_bytes([
199                self.remaining[4],
200                self.remaining[5],
201                self.remaining[6],
202                self.remaining[7],
203            ]);
204            self.remaining = &self.remaining[RRV_ENTRY_BYTES..];
205            let identifier = self.next_rrv_identifier;
206            self.next_rrv_identifier = self.next_rrv_identifier.wrapping_add(4);
207
208            // Bit 31 selects variant.
209            if (rrv_entry & 0x8000_0000) == 0 {
210                // File-position variant. Skip empty markers.
211                if rrv_entry == 0 || rrv_entry == FILE_POSITION_EMPTY_ALT {
212                    continue;
213                }
214                return Some(RrvEntry {
215                    rrv_identifier: identifier,
216                    location: RrvLocation::FilePosition {
217                        file_position_pages: rrv_entry,
218                    },
219                });
220            } else {
221                // Bucket-slot variant.
222                let bucket_index = rrv_entry & 0x00FF_FFFF;
223                if bucket_index == 0 || bucket_index == BUCKET_INDEX_EMPTY_ALT {
224                    continue;
225                }
226                let nonsum_high = (rrv_entry >> 7) & 0x00E0_0000;
227                let slot_index = (rrv_entry_bsid & 0x0000_07FF) as u16;
228                let nonsum_low = rrv_entry_bsid >> 11;
229                let nonsum = nonsum_high | nonsum_low;
230                return Some(RrvEntry {
231                    rrv_identifier: identifier,
232                    location: RrvLocation::BucketSlot {
233                        bucket_index,
234                        slot_index,
235                        nonsum,
236                    },
237                });
238            }
239        }
240        None
241    }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a bucket buffer with a valid header (signature, header
    /// size, `initial_rrv_identifier` = 100, every other header byte
    /// zero) followed by the given raw `(rrv_entry, rrv_entry_bsid)`
    /// pairs.
    fn synthetic_bucket_with_entries(entries: &[(u32, u32)]) -> Vec<u8> {
        let mut buf = vec![0u8; RRV_BUCKET_HEADER_BYTES + entries.len() * RRV_ENTRY_BYTES];
        buf[0] = RRV_BUCKET_SIGNATURE;
        buf[1] = 0x20;
        // initial_rrv_identifier = 100 (arbitrary)
        buf[6..10].copy_from_slice(&100u32.to_le_bytes());
        for (i, (a, b)) in entries.iter().enumerate() {
            let off = RRV_BUCKET_HEADER_BYTES + i * RRV_ENTRY_BYTES;
            buf[off..off + 4].copy_from_slice(&a.to_le_bytes());
            buf[off + 4..off + 8].copy_from_slice(&b.to_le_bytes());
        }
        buf
    }

    /// Run the iterator over a full bucket buffer and collect its output.
    fn collect_entries(buf: &[u8]) -> Vec<RrvEntry> {
        let (_, iter) = RrvIter::new(buf).unwrap();
        iter.collect()
    }

    #[test]
    fn parses_header_signature() {
        let buf = synthetic_bucket_with_entries(&[]);
        let h = RrvBucketHeader::parse(&buf).unwrap();
        assert_eq!(h.header_size, 0x20);
        assert_eq!(h.initial_rrv_identifier, 100);
        // The synthetic header leaves the checksum bytes zeroed.
        assert_eq!(h.checksum, 0);
    }

    #[test]
    fn skips_zero_and_alternate_empty_markers_in_file_position_variant() {
        let buf = synthetic_bucket_with_entries(&[
            (0, 0),
            (FILE_POSITION_EMPTY_ALT, 0),
            (0x0000_2AF0, 0),
        ]);
        let entries = collect_entries(&buf);
        // Only the third entry should survive; identifier = 100 + 8 = 108.
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].rrv_identifier, 108);
        assert!(matches!(
            entries[0].location,
            RrvLocation::FilePosition {
                file_position_pages: 0x0000_2AF0
            }
        ));
        assert_eq!(
            entries[0].location.file_byte_offset(),
            Some(0x0000_2AF0 * 256)
        );
    }

    #[test]
    fn parses_bucket_slot_variant_bit_layout() {
        // Bucket-slot entry with bucket_index = 0x123456 and
        // slot_index = 0x1AB. The nonsum_high bits of rrv_entry are left
        // at zero here so that nonsum == nonsum_low; the high bits are
        // covered by `recombines_nonsum_high_bits_from_rrv_entry`.
        let bucket_index: u32 = 0x0012_3456;
        let slot_index: u32 = 0x01AB;
        let rrv_entry: u32 = 0x8000_0000 | bucket_index;
        // rrv_entry_bsid: slot_index in the low 11 bits, nonsum_low in
        // the remaining bits.
        let nonsum_low: u32 = 0x0001_2345; // arbitrary
        let rrv_entry_bsid: u32 = (nonsum_low << 11) | slot_index;
        let buf = synthetic_bucket_with_entries(&[(rrv_entry, rrv_entry_bsid)]);
        let entries = collect_entries(&buf);
        assert_eq!(entries.len(), 1);
        match entries[0].location {
            RrvLocation::BucketSlot {
                bucket_index: b,
                slot_index: s,
                nonsum,
            } => {
                assert_eq!(b, bucket_index);
                assert_eq!(s, slot_index as u16);
                // nonsum_high is 0 in this test, so nonsum == nonsum_low.
                assert_eq!(nonsum, nonsum_low);
            }
            other => panic!("expected BucketSlot, got {other:?}"),
        }
        assert_eq!(entries[0].location.file_byte_offset(), None);
    }

    #[test]
    fn recombines_nonsum_high_bits_from_rrv_entry() {
        // Bits 28..=30 of rrv_entry carry the three high nonsum bits,
        // which land in bits 21..=23 of the recombined value.
        let bucket_index: u32 = 0x0012_3456;
        let nonsum_high_bits: u32 = 0b101;
        let nonsum_low: u32 = 0x42;
        let slot_index: u32 = 0x01AB;
        let rrv_entry: u32 = 0x8000_0000 | (nonsum_high_bits << 28) | bucket_index;
        let rrv_entry_bsid: u32 = (nonsum_low << 11) | slot_index;
        let buf = synthetic_bucket_with_entries(&[(rrv_entry, rrv_entry_bsid)]);
        assert_eq!(
            collect_entries(&buf),
            vec![RrvEntry {
                rrv_identifier: 100,
                location: RrvLocation::BucketSlot {
                    bucket_index,
                    slot_index: 0x01AB,
                    nonsum: (nonsum_high_bits << 21) | nonsum_low,
                },
            }]
        );
    }

    #[test]
    fn skips_empty_markers_in_bucket_slot_variant() {
        let buf = synthetic_bucket_with_entries(&[
            (0x8000_0000, 0),
            (0x8000_0000 | BUCKET_INDEX_EMPTY_ALT, 0),
            (0x8000_0001, 0),
        ]);
        // Only the third entry should survive; identifier = 100 + 8 = 108.
        assert_eq!(
            collect_entries(&buf),
            vec![RrvEntry {
                rrv_identifier: 108,
                location: RrvLocation::BucketSlot {
                    bucket_index: 1,
                    slot_index: 0,
                    nonsum: 0,
                },
            }]
        );
    }

    #[test]
    fn ignores_trailing_partial_entry() {
        let mut buf = synthetic_bucket_with_entries(&[(0x10, 0)]);
        buf.extend_from_slice(&[0xFF; RRV_ENTRY_BYTES - 1]);
        assert_eq!(collect_entries(&buf).len(), 1);
    }

    #[test]
    fn rejects_truncated_header() {
        let buf = synthetic_bucket_with_entries(&[]);
        assert!(RrvBucketHeader::parse(&buf[..RRV_BUCKET_HEADER_BYTES - 1]).is_err());
        assert!(RrvIter::new(&[]).is_err());
    }

    #[test]
    fn rejects_bad_rrv_signature() {
        let mut buf = synthetic_bucket_with_entries(&[]);
        buf[0] = 0x55;
        assert!(RrvBucketHeader::parse(&buf).is_err());
    }

    #[test]
    fn rejects_bad_rrv_header_size() {
        let mut buf = synthetic_bucket_with_entries(&[]);
        buf[1] = 0x30;
        assert!(RrvBucketHeader::parse(&buf).is_err());
    }
}