sherlock_nsf_parser/
database.rs

1//! High-level `Database::open` API.
2//!
3//! Pulls the file header + DBINFO together, then exposes the entry
4//! points for note enumeration. The actual RRV walk requires having
5//! the file mmapped or fully buffered; this layer keeps the byte
6//! window borrowed so consumers control I/O strategy.
7
8use crate::bdb::BucketDescriptorBlock;
9use crate::bucket::Bucket;
10use crate::cx;
11use crate::error::NsfError;
12use crate::header::DbHeader;
13use crate::info2::{Information2, INFO2_BYTES, INFO2_FILE_OFFSET};
14use crate::note::NoteHeader;
15use crate::rrv::{RrvBucketHeader, RrvEntry, RrvIter, RrvLocation};
16use crate::superblock::{select_freshest, Superblock, SUPERBLOCK_HEADER_BYTES};
17
/// Body offset where the resident summary-descriptor page begins inside a
/// single-page database (the libnsfdb-documented prefix `4 + 10 + 10 +
/// 200 = 224`). For a multi-page database the resident page sits after the
/// page index: `SUMMARY_RESIDENT_PREFIX + (pages - 1) * SUMMARY_DESCRIPTOR_BYTES`.
const SUMMARY_RESIDENT_PREFIX: usize = 224;
/// On-disk size of one summary bucket descriptor (`file_position[4] +
/// modification_time[8] + 2 free-byte fields`). The same value is used as
/// the stride of the page-index records at the start of the decompressed
/// superblock body.
const SUMMARY_DESCRIPTOR_BYTES: usize = 14;
/// Header size that precedes the descriptor array inside an *out-of-body*
/// summary descriptor page (the pages pointed to by the body page index).
/// Empirically derived (validated to 99.3% against the fakenames identity
/// oracle); see the `nsf_b2_addressing_cracked` engineering note. Distinct
/// from the in-body resident page, which uses [`SUMMARY_RESIDENT_PREFIX`].
const OUT_OF_BODY_PAGE_HEADER: usize = 250;
/// Number of bucket descriptors per out-of-body summary page. Empirically
/// derived (the resident page base lands at `(pages-1)*PER_OUT_OF_BODY_PAGE
/// + 1`, exactly matching the observed bucket_index range). The resident
/// page's count comes from `Superblock::number_of_summary_buckets` instead.
const PER_OUT_OF_BODY_PAGE: usize = 567;
37
/// Read a little-endian `u32` from `buf` starting at byte `offset`.
///
/// Returns `None` when the four bytes are not entirely inside `buf`. This
/// includes offsets so large that `offset + 4` would overflow `usize`:
/// offsets handed to this helper are computed from untrusted file contents,
/// so the end of the window is formed with checked arithmetic instead of
/// panicking (debug builds) or wrapping (release builds).
fn read_u32_le(buf: &[u8], offset: usize) -> Option<u32> {
    let end = offset.checked_add(4)?;
    let window = buf.get(offset..end)?;
    Some(u32::from_le_bytes([window[0], window[1], window[2], window[3]]))
}
42
/// Top-level handle to a buffered NSF file.
///
/// Holds a borrowed slice of the full file bytes. Cheap to construct -
/// no copies are made. The parser walks the file lazily; consumers pay
/// for what they enumerate.
#[derive(Debug)]
pub struct Database<'a> {
    /// The full file image. Every byte offset computed by the methods on
    /// this type is used as an index into this slice.
    bytes: &'a [u8],
    /// Header parsed once in [`Database::open`]; supplies the RRV bucket
    /// positions and `rrv_bucket_size` read by the RRV accessors.
    header: DbHeader,
}
53
54impl<'a> Database<'a> {
55    /// Open an NSF from a full-file byte buffer. Validates the file
56    /// header and DBINFO; lazy on everything else.
57    pub fn open(bytes: &'a [u8]) -> Result<Self, NsfError> {
58        let header = DbHeader::parse(bytes)?;
59        Ok(Self { bytes, header })
60    }
61
62    /// Parsed database header.
63    pub fn header(&self) -> &DbHeader {
64        &self.header
65    }
66
67    /// True when the database carries a populated data RRV bucket. A
68    /// fresh / never-instantiated template will return false here -
69    /// it has design notes via the non-data RRV but no data notes.
70    pub fn has_data_rrv(&self) -> bool {
71        self.header.data_rrv_bucket_position != 0
72    }
73
74    /// Parse + iterate the data RRV bucket if present. Returns the
75    /// bucket header for diagnostics plus an iterator over the
76    /// non-empty RRV entries.
77    ///
78    /// The data RRV bucket's file position is reported in 256-byte
79    /// units in DBINFO; this method converts to a byte offset and
80    /// reads `rrv_bucket_size` bytes from that point.
81    pub fn data_rrv_iter(&self) -> Result<Option<(RrvBucketHeader, RrvIter<'a>)>, NsfError> {
82        if !self.has_data_rrv() {
83            return Ok(None);
84        }
85        let byte_offset = u64::from(self.header.data_rrv_bucket_position) * 256;
86        let bucket_size = self.header.rrv_bucket_size as u64;
87        let end = byte_offset.saturating_add(bucket_size);
88        if end > self.bytes.len() as u64 {
89            return Err(NsfError::TooShort {
90                actual: self.bytes.len(),
91                required: end as usize,
92            });
93        }
94        let bucket = &self.bytes[byte_offset as usize..end as usize];
95        let (header, iter) = RrvIter::new(bucket)?;
96        Ok(Some((header, iter)))
97    }
98
99    /// Convenience: count non-empty entries in the data RRV. Walks the
100    /// bucket but does not retain the per-entry state.
101    pub fn data_note_count(&self) -> Result<u64, NsfError> {
102        let Some((_, iter)) = self.data_rrv_iter()? else {
103            return Ok(0);
104        };
105        Ok(iter.count() as u64)
106    }
107
108    /// True when the database carries a populated non-data RRV bucket.
109    /// Design notes (forms, views) and, in databases like `fakenames.nsf`,
110    /// the bulk of document notes are reached through the non-data RRV
111    /// rather than the data RRV.
112    pub fn has_non_data_rrv(&self) -> bool {
113        self.header.non_data_rrv_bucket_position != 0
114    }
115
116    /// Parse + iterate the non-data RRV bucket if present. Mirrors
117    /// [`Self::data_rrv_iter`] but reads from
118    /// `non_data_rrv_bucket_position`. Most bucket-slot RRV entries (the
119    /// ones [`Self::resolve_bucket_slot`] resolves) live here.
120    pub fn non_data_rrv_iter(&self) -> Result<Option<(RrvBucketHeader, RrvIter<'a>)>, NsfError> {
121        if !self.has_non_data_rrv() {
122            return Ok(None);
123        }
124        let byte_offset = u64::from(self.header.non_data_rrv_bucket_position) * 256;
125        let bucket_size = self.header.rrv_bucket_size as u64;
126        let end = byte_offset.saturating_add(bucket_size);
127        if end > self.bytes.len() as u64 {
128            return Err(NsfError::TooShort {
129                actual: self.bytes.len(),
130                required: end as usize,
131            });
132        }
133        let bucket = &self.bytes[byte_offset as usize..end as usize];
134        let (header, iter) = RrvIter::new(bucket)?;
135        Ok(Some((header, iter)))
136    }
137
138    /// Collect at most `limit` RRV entries from the data RRV for
139    /// preview / list rendering. Useful for "show the first 200 notes
140    /// in the viewer" without walking 40,000 entries up front.
141    pub fn data_rrv_take(&self, limit: usize) -> Result<Vec<RrvEntry>, NsfError> {
142        let Some((_, iter)) = self.data_rrv_iter()? else {
143            return Ok(Vec::new());
144        };
145        Ok(iter.take(limit).collect())
146    }
147
148    /// Parse the database information extension block 2 (file offset 520,
149    /// 124 bytes). Carries the 4 superblock positions + 2 BDB positions
150    /// plus bucket-size knobs.
151    pub fn information2(&self) -> Result<Information2, NsfError> {
152        let end = INFO2_FILE_OFFSET + INFO2_BYTES;
153        if self.bytes.len() < end {
154            return Err(NsfError::TooShort {
155                actual: self.bytes.len(),
156                required: end,
157            });
158        }
159        Information2::parse(&self.bytes[INFO2_FILE_OFFSET..end])
160    }
161
162    /// Parse every populated superblock copy (skipping uninitialized
163    /// slots). Each entry is `(slot_index, Superblock)` so callers can
164    /// report which copy was loaded. Domino allocates 4 slots and rotates
165    /// commits across them; instantiated databases typically have 3
166    /// populated and 1 empty, with the freshest by `modification_time`
167    /// authoritative (use [`Self::freshest_superblock`]).
168    ///
169    /// Forensic-tool-grade resilience: slots are skipped silently when
170    /// any of these conditions hold, rather than crashing the load:
171    ///
172    /// - Slot is empty (position or size zero).
173    /// - Slot's declared byte offset extends past the file end.
174    /// - Slot's body does not start with the superblock signature
175    ///   `0E 00`. This catches fresh-template uninitialized regions
176    ///   that Domino allocates with `allocation_granularity` but never
177    ///   commits to (empirically these are filled with `AA AA AA AA`,
178    ///   e.g. SB3 of `comparedbs.ntf`).
179    ///
180    /// Other parse failures (e.g. unexpected short read mid-header) are
181    /// not expected in practice with a fully-buffered NSF and would
182    /// surface as errors. The 3-redundant-copy WAL guarantees that
183    /// silently dropping an unreadable slot leaves at least one valid
184    /// copy.
185    pub fn superblocks(&self) -> Result<Vec<(usize, Superblock)>, NsfError> {
186        let info = self.information2()?;
187        let mut out = Vec::with_capacity(4);
188        for (i, slot) in info.superblocks.iter().enumerate() {
189            let Some(byte_offset) = slot.byte_offset() else {
190                continue;
191            };
192            let start = byte_offset as usize;
193            let end = start.saturating_add(SUPERBLOCK_HEADER_BYTES);
194            if end > self.bytes.len() {
195                continue;
196            }
197            match Superblock::parse(&self.bytes[start..end]) {
198                Ok(sb) => out.push((i, sb)),
199                Err(NsfError::BadSubrecordSignature { .. }) => {
200                    // Uninitialized / 0xAA-filled region. Skip silently.
201                }
202                Err(other) => return Err(other),
203            }
204        }
205        Ok(out)
206    }
207
208    /// Convenience: parse all populated superblocks and return the
209    /// freshest one by `modification_time`. The other three copies are
210    /// write-ahead-log redundancy and should be ignored once this one
211    /// is loaded. Returns `None` if no superblock slots are populated
212    /// (extremely rare; would indicate a partially-initialized NSF).
213    pub fn freshest_superblock(&self) -> Result<Option<(usize, Superblock)>, NsfError> {
214        let all = self.superblocks()?;
215        Ok(select_freshest(&all))
216    }
217
218    /// Decompress the freshest superblock's body (the CX-compressed region
219    /// that carries the bucket-descriptor array). Returns `None` when the
220    /// database has no superblock.
221    ///
222    /// Body layout from the superblock byte offset, per the reference:
223    /// `[0,100)` header, then the compressed region of length
224    /// `size - 112` (100-byte header + 12-byte footer removed), of which
225    /// the first 4 bytes are a prefix the decompressor skips. The
226    /// decompressed length is the header's `uncompressed_size` field.
227    pub fn decompressed_superblock_body(&self) -> Result<Option<Vec<u8>>, NsfError> {
228        let Some((slot, sb)) = self.freshest_superblock()? else {
229            return Ok(None);
230        };
231        let info = self.information2()?;
232        let Some(sb_offset) = info.superblocks.get(slot).and_then(|s| s.byte_offset()) else {
233            return Ok(None);
234        };
235        let size = sb.size as usize;
236        // Need at least header (100) + footer (12) + the 4-byte prefix.
237        if size < SUPERBLOCK_HEADER_BYTES + 12 + 4 {
238            return Err(NsfError::DecompressionFailed {
239                detail: "superblock size too small to hold a compressed body",
240            });
241        }
242        let region_start = sb_offset as usize + SUPERBLOCK_HEADER_BYTES;
243        let region_len = size - SUPERBLOCK_HEADER_BYTES - 12;
244        // The body is a chain of length-prefixed CX segments (the leading 4
245        // bytes are the first segment's compressed length). Single-segment
246        // bodies - the common superblock case - decode identically.
247        let region_end = region_start + region_len;
248        let region = self.bytes.get(region_start..region_end).ok_or(NsfError::TooShort {
249            actual: self.bytes.len(),
250            required: region_end,
251        })?;
252        let body = cx::decompress_chained(region, sb.uncompressed_size as usize)?;
253        Ok(Some(body))
254    }
255
256    /// Build the global summary-bucket descriptor map: a 0-based vector of
257    /// file byte offsets where `offsets[bucket_index - 1]` is the byte
258    /// offset of the summary bucket an RRV bucket-slot entry's
259    /// `bucket_index` refers to (`bucket_index` is 1-based on disk).
260    ///
261    /// # Multi-page geometry
262    ///
263    /// On modern ODS the summary bucket descriptors are spread across
264    /// `number_of_summary_bucket_descriptor_pages` pages. The decompressed
265    /// superblock body begins with a page index of `(pages - 1)` stride-14
266    /// records (the page's `file_position` is the first 4 bytes of each
267    /// record); those point to the out-of-body pages. The final (resident)
268    /// page's descriptor array is inline in the body at
269    /// `SUMMARY_RESIDENT_PREFIX + (pages - 1) * SUMMARY_DESCRIPTOR_BYTES`.
270    /// Single-page databases (`pages <= 1`) have only the resident page at
271    /// the libnsfdb-documented offset 224.
272    ///
273    /// libnsfdb itself only handles a single descriptor page (it errors on
274    /// `> 1`), so the multi-page geometry here was reverse-engineered and
275    /// validated against the `rrv_identifier` identity oracle (see
276    /// [`Self::enumerate_notes`]). The out-of-body page header size
277    /// ([`OUT_OF_BODY_PAGE_HEADER`]) and per-page descriptor count
278    /// ([`PER_OUT_OF_BODY_PAGE`]) are empirical constants; mis-fits surface
279    /// as identity-gate failures in [`Self::enumerate_notes`] rather than as
280    /// silently wrong records.
281    pub fn summary_bucket_offsets(&self) -> Result<Vec<u64>, NsfError> {
282        Ok(self
283            .summary_bucket_raw_fps()?
284            .into_iter()
285            .map(|fp| u64::from(fp) << 8)
286            .collect())
287    }
288
289    /// The raw 4-byte `file_position` value of each summary bucket
290    /// descriptor, 0-based by `bucket_index`. The byte offset is
291    /// `fp << 8` (see [`Self::summary_bucket_offsets`]); the raw form is
292    /// retained because the rare group-marker slots carry flag bits inside
293    /// the `file_position` field that [`Self::enumerate_notes`] corrects.
294    fn summary_bucket_raw_fps(&self) -> Result<Vec<u32>, NsfError> {
295        let Some((_, sb)) = self.freshest_superblock()? else {
296            return Ok(Vec::new());
297        };
298        let Some(body) = self.decompressed_superblock_body()? else {
299            return Ok(Vec::new());
300        };
301        let pages = sb.number_of_summary_bucket_descriptor_pages as usize;
302        let n_page_ptrs = pages.saturating_sub(1);
303        let resident_count = sb.number_of_summary_buckets as usize;
304
305        let mut fps = Vec::new();
306
307        // Out-of-body pages, in page-index order.
308        for j in 0..n_page_ptrs {
309            let page_fp = read_u32_le(&body, j * SUMMARY_DESCRIPTOR_BYTES).unwrap_or(0);
310            let page_off = u64::from(page_fp) << 8;
311            for k in 0..PER_OUT_OF_BODY_PAGE {
312                let o = page_off as usize
313                    + OUT_OF_BODY_PAGE_HEADER
314                    + k * SUMMARY_DESCRIPTOR_BYTES;
315                fps.push(read_u32_le(self.bytes, o).unwrap_or(0));
316            }
317        }
318
319        // Resident page, inline in the decompressed body.
320        let resident_prefix = SUMMARY_RESIDENT_PREFIX + n_page_ptrs * SUMMARY_DESCRIPTOR_BYTES;
321        for k in 0..resident_count {
322            let o = resident_prefix + k * SUMMARY_DESCRIPTOR_BYTES;
323            fps.push(read_u32_le(&body, o).unwrap_or(0));
324        }
325
326        Ok(fps)
327    }
328
329    /// Resolve a single RRV bucket-slot pair to the raw bytes of the slot's
330    /// record, using the summary-bucket descriptor map.
331    ///
332    /// This is the physical resolution step: it does not identity-check the
333    /// result. For verified note enumeration (where each resolved record is
334    /// confirmed to carry the requested `rrv_identifier`), use
335    /// [`Self::enumerate_notes`]. Rebuilds the descriptor map on each call;
336    /// callers resolving many entries should prefer `enumerate_notes`, which
337    /// builds the map once.
338    pub fn resolve_bucket_slot(
339        &self,
340        bucket_index: u32,
341        slot_index: u16,
342    ) -> Result<&'a [u8], NsfError> {
343        let offsets = self.summary_bucket_offsets()?;
344        Self::resolve_in(self.bytes, &offsets, bucket_index, slot_index)
345    }
346
347    /// Resolve `bucket_index`/`slot_index` against a prebuilt descriptor map.
348    fn resolve_in(
349        bytes: &'a [u8],
350        offsets: &[u64],
351        bucket_index: u32,
352        slot_index: u16,
353    ) -> Result<&'a [u8], NsfError> {
354        let ordinal = (bucket_index as usize)
355            .checked_sub(1)
356            .ok_or(NsfError::BucketIndexOutOfRange {
357                requested: bucket_index,
358                available: offsets.len(),
359            })?;
360        let off = *offsets
361            .get(ordinal)
362            .ok_or(NsfError::BucketIndexOutOfRange {
363                requested: bucket_index,
364                available: offsets.len(),
365            })?;
366        let start = off as usize;
367        let bucket_bytes = bytes.get(start..).ok_or(NsfError::TooShort {
368            actual: bytes.len(),
369            required: start,
370        })?;
371        let bucket = Bucket::parse(bucket_bytes)?;
372        bucket.slot(slot_index)
373    }
374
375    /// Parse the freshest Bucket Descriptor Block (BDB) - the master index
376    /// of every RRV bucket in the database. Returns `None` when no BDB slot
377    /// is populated (a fresh / never-instantiated shell). Of the two BDB
378    /// copies in [`Information2`] (primary + write-ahead-log redundancy) the
379    /// one with the higher `write_count` is authoritative.
380    pub fn bucket_descriptor_block(&self) -> Result<Option<BucketDescriptorBlock>, NsfError> {
381        let info = self.information2()?;
382        let mut best: Option<BucketDescriptorBlock> = None;
383        for slot in &info.bdbs {
384            let Some(off) = slot.byte_offset() else {
385                continue;
386            };
387            match BucketDescriptorBlock::parse(self.bytes, off, slot.size_bytes) {
388                Ok(bdb) => {
389                    if best.as_ref().map_or(true, |b| bdb.write_count > b.write_count) {
390                        best = Some(bdb);
391                    }
392                }
393                // A malformed / superseded BDB copy is skipped; the other
394                // copy is the WAL redundancy that covers it.
395                Err(_) => continue,
396            }
397        }
398        Ok(best)
399    }
400
401    /// Enumerate every note in the database by walking the BDB -> all RRV
402    /// buckets -> each RRV entry, resolving each to a note record.
403    ///
404    /// Every resolution is **identity-gated**: a note is only accepted if
405    /// the resolved record's `rrv_identifier` (note header offset 6) equals
406    /// the RRV entry's identifier. This is the chain-of-custody guarantee -
407    /// a record is never returned unless it provably is the note the RRV
408    /// entry points to. Entries that no candidate resolves under the gate
409    /// are counted in `unresolved` rather than returned as possibly-wrong
410    /// evidence.
411    ///
412    /// # Group-marker recovery
413    ///
414    /// A small set of summary-descriptor slots (the page's group-boundary
415    /// slots) carry group-marker flag bits inside the `file_position` field:
416    /// the low nibble, or bits 16-19 (in which case the true high nibble
417    /// matches the locally-sequential neighbours). For each bucket-slot
418    /// entry the resolver tries the raw descriptor first, then these
419    /// marker-corrected candidates, accepting the first that passes the
420    /// identity gate. Because acceptance requires an exact 32-bit
421    /// `rrv_identifier` match, a wrong candidate cannot be accepted - the
422    /// recovery is heuristic in *what it tries* but never in *what it
423    /// returns*.
424    pub fn enumerate_notes(&self) -> Result<NoteEnumeration, NsfError> {
425        let mut out = NoteEnumeration::default();
426
427        let Some((_, sb)) = self.freshest_superblock()? else {
428            return Ok(out);
429        };
430        let rrv_bucket_size = sb.rrv_bucket_size as usize;
431        if rrv_bucket_size == 0 {
432            return Ok(out);
433        }
434        let raw_fps = self.summary_bucket_raw_fps()?;
435
436        // Collect every RRV bucket to walk: those listed in the BDB plus
437        // the data and non-data RRV buckets named directly in DBINFO.
438        // Deduped by byte offset - on modern ODS the DBINFO buckets are
439        // usually also in the BDB; on older / simpler databases they may
440        // not be, so both sources are needed for complete enumeration.
441        let mut rrv_offsets: std::collections::BTreeSet<u64> = std::collections::BTreeSet::new();
442        if let Some(bdb) = self.bucket_descriptor_block()? {
443            rrv_offsets.extend(bdb.rrv_buckets.iter().map(|d| d.file_offset));
444        }
445        if self.header.data_rrv_bucket_position != 0 {
446            rrv_offsets.insert(u64::from(self.header.data_rrv_bucket_position) * 256);
447        }
448        if self.header.non_data_rrv_bucket_position != 0 {
449            rrv_offsets.insert(u64::from(self.header.non_data_rrv_bucket_position) * 256);
450        }
451
452        for &bucket_off in &rrv_offsets {
453            let start = bucket_off as usize;
454            let Some(slice) = self.bytes.get(start..start.saturating_add(rrv_bucket_size))
455            else {
456                continue;
457            };
458            let Ok((_, iter)) = RrvIter::new(slice) else {
459                continue;
460            };
461            for entry in iter {
462                let resolved = match entry.location {
463                    RrvLocation::FilePosition {
464                        file_position_pages,
465                    } => {
466                        out.file_position_total += 1;
467                        let off = u64::from(file_position_pages) << 8;
468                        self.bytes
469                            .get(off as usize..)
470                            .and_then(|buf| self.note_if_matches(entry.rrv_identifier, off, buf))
471                    }
472                    RrvLocation::BucketSlot {
473                        bucket_index,
474                        slot_index,
475                        ..
476                    } => {
477                        out.bucket_slot_total += 1;
478                        self.resolve_validated(&raw_fps, bucket_index, slot_index, entry.rrv_identifier)
479                    }
480                };
481                match resolved {
482                    Some(note) => out.notes.push(note),
483                    None => out.unresolved += 1,
484                }
485            }
486        }
487        Ok(out)
488    }
489
490    /// Parse `buf` as a note header and return a [`ResolvedNote`] only if it
491    /// carries `expected_identifier` (the identity gate).
492    fn note_if_matches(
493        &self,
494        expected_identifier: u32,
495        file_offset: u64,
496        buf: &[u8],
497    ) -> Option<ResolvedNote> {
498        match NoteHeader::parse(buf) {
499            Ok(header) if header.rrv_identifier == expected_identifier => Some(ResolvedNote {
500                rrv_identifier: expected_identifier,
501                file_offset,
502                header,
503            }),
504            _ => None,
505        }
506    }
507
508    /// Resolve a bucket-slot entry to an identity-verified note, trying the
509    /// raw descriptor first then group-marker-corrected candidates. Returns
510    /// `None` only if no candidate yields a note carrying `expected_id`.
511    fn resolve_validated(
512        &self,
513        raw_fps: &[u32],
514        bucket_index: u32,
515        slot_index: u16,
516        expected_id: u32,
517    ) -> Option<ResolvedNote> {
518        let ord = (bucket_index as usize).checked_sub(1)?;
519        let primary = *raw_fps.get(ord)?;
520        // High nibble (bits 16-19) of neighbouring descriptors, used to
521        // repair a bits-16-19 group marker (buckets are locally sequential).
522        let prev_hi = ord
523            .checked_sub(1)
524            .and_then(|i| raw_fps.get(i))
525            .map(|f| f & 0x000F_0000)
526            .unwrap_or(0);
527        let next_hi = raw_fps.get(ord + 1).map(|f| f & 0x000F_0000).unwrap_or(0);
528
529        let candidates = [
530            primary,
531            primary & 0xFFFF_FFF0,                    // low-nibble group marker
532            (primary & 0xFFF0_FFFF) | prev_hi,        // bits-16-19 marker, prev high nibble
533            (primary & 0xFFF0_FFFF) | next_hi,        // bits-16-19 marker, next high nibble
534        ];
535
536        for &fp in &candidates {
537            let bucket_off = u64::from(fp) << 8;
538            let Some(buf) = self.bytes.get(bucket_off as usize..) else {
539                continue;
540            };
541            let Ok(bucket) = Bucket::parse(buf) else {
542                continue;
543            };
544            let Ok(slot) = bucket.slot(slot_index) else {
545                continue;
546            };
547            let slot_off = bucket_off + (slot.as_ptr() as usize - buf.as_ptr() as usize) as u64;
548            if let Some(note) = self.note_if_matches(expected_id, slot_off, slot) {
549                return Some(note);
550            }
551        }
552        None
553    }
554
555    /// Return a note's non-summary data object - the separately-stored
556    /// large payload that holds rich-text ($Body / mail bodies), file
557    /// attachments (OBJECT items), and other items too big for the inline
558    /// summary. `None` when the note has no non-summary data.
559    ///
560    /// Location: `non_summary_data_identifier << 8` is the byte offset of
561    /// the object, which opens with a header - signature `0x0010`, then a
562    /// `u32` size and the owning note's `u32` rrv_identifier (both validated
563    /// here) - followed by the payload (a CD-record stream for rich text, or
564    /// object segments for attachments). The returned slice is the whole
565    /// object including that header; record-level decoding (CD records,
566    /// attachment extraction) is a later slice.
567    pub fn non_summary_data(&self, note: &ResolvedNote) -> Option<&'a [u8]> {
568        let id = note.header.non_summary_data_identifier;
569        let size = note.header.non_summary_data_size as usize;
570        if id == 0 || size < 10 {
571            return None;
572        }
573        let off = (u64::from(id) << 8) as usize;
574        let obj = self.bytes.get(off..off.checked_add(size)?)?;
575        // Validate the object header against the note's own metadata so a
576        // wrong / stale identifier never returns unrelated bytes.
577        let hdr_size = u32::from_le_bytes([obj[2], obj[3], obj[4], obj[5]]) as usize;
578        let hdr_rrv = u32::from_le_bytes([obj[6], obj[7], obj[8], obj[9]]);
579        if obj[0] != 0x10 || obj[1] != 0x00 || hdr_size != size || hdr_rrv != note.rrv_identifier {
580            return None;
581        }
582        Some(obj)
583    }
584
585    /// Decode a note's rich-text body and attachments from its non-summary
586    /// data (CD-record stream). Returns `None` when the note has no
587    /// non-summary data or it decodes to nothing. See [`crate::cd`].
588    pub fn note_content(&self, note: &ResolvedNote) -> Option<crate::cd::NoteContent> {
589        let obj = self.non_summary_data(note)?;
590        let content = crate::cd::parse(obj);
591        if content.is_empty() {
592            None
593        } else {
594            Some(content)
595        }
596    }
597
598    /// Parse the items (fields) of a resolved note: each item's name id,
599    /// type/flags, and raw value bytes. See [`crate::item`] for the layout
600    /// and what is / isn't decoded (field-name resolution is a later slice).
601    ///
602    /// The record window is bounded to the note's declared `size` so item
603    /// values cannot read into a neighbouring record.
604    pub fn note_items(&self, note: &ResolvedNote) -> Vec<crate::item::NoteItem<'a>> {
605        let start = note.file_offset as usize;
606        let end = start
607            .saturating_add(note.header.size as usize)
608            .min(self.bytes.len());
609        let Some(record) = self.bytes.get(start..end) else {
610            return Vec::new();
611        };
612        crate::item::parse_items(record, note.header.number_of_note_items)
613    }
614}
615
/// One note resolved (and identity-verified) by [`Database::enumerate_notes`].
///
/// Instances are only built after the parsed header's `rrv_identifier` has
/// been compared against the identifier of the RRV entry that led to it.
#[derive(Debug, Clone)]
pub struct ResolvedNote {
    /// The RRV identifier the note was reached through (== the note
    /// header's `rrv_identifier`; the identity gate guarantees equality).
    pub rrv_identifier: u32,
    /// Byte offset of the note record within the file.
    pub file_offset: u64,
    /// The parsed note header.
    pub header: NoteHeader,
}
627
/// Result of a full-database note enumeration via [`Database::enumerate_notes`].
///
/// Each RRV entry walked increments exactly one of `bucket_slot_total` /
/// `file_position_total`, and then either contributes one element to
/// `notes` or increments `unresolved`.
#[derive(Debug, Clone, Default)]
pub struct NoteEnumeration {
    /// Every identity-verified note, in RRV-walk order.
    pub notes: Vec<ResolvedNote>,
    /// RRV entries that could not be resolved to a note carrying the
    /// expected identifier (failed the identity gate). Reported rather than
    /// returned as possibly-wrong records.
    pub unresolved: u64,
    /// Total bucket-slot RRV entries seen.
    pub bucket_slot_total: u64,
    /// Total file-position RRV entries seen.
    pub file_position_total: u64,
}
sherlock_nsf_parser/database.rs

sherlock_nsf_parser/
database.rs