Skip to main content

sherlock_nsf_parser/
item.rs

1//! Note item parsing - the fields inside a note record.
2//!
3//! A note record is: the 100-byte note header, then `number_of_note_items`
4//! fixed 8-byte item descriptors, then the item values packed back to back
5//! in descriptor order. Reverse-engineered from the fakenames Person docs
6//! (validated against known field values - street addresses, e-mail
7//! addresses, names).
8//!
9//! Item descriptor (8 bytes):
10//!
11//! ```text
12//! offset  width  field
13//!     0      2   name_id     (Unique Name Key id - the field name lives in
14//!                             the BDB UNK table, deduplicated across notes)
15//!     2      2   type_flags  (item data-type + summary/flag bits)
16//!     4      2   value_size  (byte length of this item's value)
17//!     6      2   reserved
18//! ```
19//!
20//! Each item's value is `value_size` bytes, taken sequentially from the
21//! value region that begins right after the descriptor table at
22//! `NOTE_HEADER_BYTES + number_of_note_items * ITEM_DESCRIPTOR_BYTES`.
23//!
24//! # What is and isn't decoded here
25//!
26//! This exposes each item's `name_id`, `type_flags`, and **raw value
27//! bytes**, plus a best-effort text rendering. Field *names* require the
28//! BDB Unique Name Key text table (not yet decoded - it is stored in a
29//! region of the BDB body that resists the documented single-stream CX
30//! decode). Typed decoding of numbers / times / rich-text (CD records) is
31//! left to later slices; the raw bytes are preserved so nothing is lost.
32
33use crate::note::NOTE_HEADER_BYTES;
34use crate::time::Timedate;
35
/// On-disk size, in bytes, of one item descriptor: four 2-byte fields
/// (`name_id`, `type_flags`, `value_size`, reserved). The descriptor table
/// of a note occupies `number_of_note_items * ITEM_DESCRIPTOR_BYTES` bytes
/// directly after the note header.
pub const ITEM_DESCRIPTOR_BYTES: usize = 8;
38
/// The data kind of a note item.
///
/// The kind is decided by the `(item_class, item_type)` byte pair stored
/// for the field in the BDB Unique Name Key table; the on-disk note item
/// itself has no inline type word, which makes that table the
/// authoritative source. Look a field up with
/// [`crate::BucketDescriptorBlock::field_kind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Single-value text (CLASS_TEXT / TYPE_TEXT).
    Text,
    /// Multi-value text (CLASS_TEXT / TYPE_TEXT_LIST).
    TextList,
    /// Internet header text (CLASS_TEXT / TYPE_RFC822_TEXT).
    Rfc822Text,
    /// IEEE-754 double (CLASS_NUMBER / TYPE_NUMBER).
    Number,
    /// Range of numbers (CLASS_NUMBER / TYPE_NUMBER_RANGE).
    NumberRange,
    /// TIMEDATE value (CLASS_TIME / TYPE_TIME).
    Time,
    /// Range of TIMEDATE values (CLASS_TIME / TYPE_TIME_RANGE).
    TimeRange,
    /// Any CLASS_FORMULA item.
    Formula,
    /// CD-record rich text such as `$Body` (NOCOMPUTE / TYPE_COMPOSITE).
    RichText,
    /// File attachment or other object (NOCOMPUTE / TYPE_OBJECT).
    Object,
    /// HTML content (NOCOMPUTE / TYPE_HTML).
    Html,
    /// MIME part (NOCOMPUTE / TYPE_MIME_PART).
    MimePart,
    /// A class/type pairing this parser does not recognize.
    Unknown,
}
72
73impl FieldKind {
74    /// Short human label.
75    pub fn label(self) -> &'static str {
76        match self {
77            FieldKind::Text => "Text",
78            FieldKind::TextList => "Text list",
79            FieldKind::Rfc822Text => "RFC822 text",
80            FieldKind::Number => "Number",
81            FieldKind::NumberRange => "Number range",
82            FieldKind::Time => "Time",
83            FieldKind::TimeRange => "Time range",
84            FieldKind::Formula => "Formula",
85            FieldKind::RichText => "Rich text",
86            FieldKind::Object => "Attachment / object",
87            FieldKind::Html => "HTML",
88            FieldKind::MimePart => "MIME part",
89            FieldKind::Unknown => "Unknown",
90        }
91    }
92}
93
94/// Map a `(item_class, item_type)` pair to a [`FieldKind`]. Class/type are
95/// the bytes at UNK-entry offsets 7 and 6 respectively.
96pub fn field_kind(item_class: u8, item_type: u8) -> FieldKind {
97    match item_class {
98        0x05 => match item_type {
99            0x01 => FieldKind::TextList,
100            0x02 => FieldKind::Rfc822Text,
101            _ => FieldKind::Text,
102        },
103        0x03 => match item_type {
104            0x01 => FieldKind::NumberRange,
105            _ => FieldKind::Number,
106        },
107        0x04 => match item_type {
108            0x01 => FieldKind::TimeRange,
109            _ => FieldKind::Time,
110        },
111        0x06 => FieldKind::Formula,
112        0x00 => match item_type {
113            0x01 => FieldKind::RichText,
114            0x03 => FieldKind::Object,
115            0x15 => FieldKind::Html,
116            0x18 => FieldKind::MimePart,
117            _ => FieldKind::Unknown,
118        },
119        _ => FieldKind::Unknown,
120    }
121}
122
/// A single item read out of a note record.
///
/// The struct only borrows from the record buffer it was parsed from, so
/// it is cheap to copy and cannot outlive that buffer.
#[derive(Debug, Clone, Copy)]
pub struct NoteItem<'a> {
    /// Unique Name Key id identifying the field name. The name text is
    /// kept in the BDB UNK table rather than in the note; because the id
    /// is stable within one database it can still be used to group or
    /// correlate fields.
    pub name_id: u16,
    /// The descriptor's type/flags word: the low byte selects the value
    /// type family, the high bits hold summary / sign flags.
    pub type_flags: u16,
    /// The item's value exactly as stored, `value_size` bytes long.
    pub value: &'a [u8],
}
136
137impl<'a> NoteItem<'a> {
138    /// Best-effort text rendering of the value: runs of printable ASCII are
139    /// kept, other bytes become `.`. Lotus text items (the common case for
140    /// names, addresses, e-mail) render cleanly; binary values (numbers,
141    /// timedates, rich text) render as dotted placeholders. Lossless access
142    /// to the original bytes is via [`Self::value`].
143    pub fn as_text(&self) -> String {
144        self.value
145            .iter()
146            .map(|&b| if (0x20..0x7f).contains(&b) { b as char } else { '.' })
147            .collect()
148    }
149
150    /// True if the value is entirely printable ASCII (a clean text field).
151    pub fn is_printable_text(&self) -> bool {
152        !self.value.is_empty()
153            && self
154                .value
155                .iter()
156                .all(|&b| (0x20..0x7f).contains(&b) || b == b'\t')
157    }
158
159    /// Best-effort human rendering of the value by shape (the on-disk note
160    /// summary does not carry a per-item type tag, so this infers it):
161    ///
162    /// - printable bytes -> text;
163    /// - 8 bytes that validate as a TIMEDATE (sane Julian-day range) -> ISO
164    ///   date; otherwise an IEEE-754 double (the Notes NUMBER type) when it
165    ///   is a sane magnitude;
166    /// - 1/2/4 bytes -> unsigned integer;
167    /// - anything else -> a hex byte summary.
168    ///
169    /// This is a display aid, not an authoritative type decode (proper
170    /// per-field typing from the form design is a later slice). The raw
171    /// bytes remain available via [`Self::value`].
172    pub fn display_value(&self) -> String {
173        if self.value.is_empty() {
174            return String::new();
175        }
176        if self.is_printable_text() {
177            return self.as_text();
178        }
179        match self.value.len() {
180            8 => {
181                if let Ok(td) = Timedate::from_bytes(self.value) {
182                    if let Some(clock) = td.as_clock() {
183                        return clock.to_iso_8601();
184                    }
185                }
186                let bytes: [u8; 8] = self.value.try_into().expect("len checked");
187                let f = f64::from_le_bytes(bytes);
188                if f == 0.0 || (f.is_finite() && f.abs() >= 1e-4 && f.abs() < 1e15) {
189                    if f.fract() == 0.0 {
190                        return format!("{}", f as i64);
191                    }
192                    return format!("{f}");
193                }
194                hex_summary(self.value)
195            }
196            4 => format!(
197                "{}",
198                u32::from_le_bytes(self.value.try_into().expect("len checked"))
199            ),
200            2 => {
201                let v = u16::from_le_bytes([self.value[0], self.value[1]]);
202                // An empty field stores only its 2-byte Notes data-type word
203                // (TYPE_TEXT 0x0500, TYPE_NUMBER 0x0300, TYPE_TIME 0x0400,
204                // ...). Treat those as empty rather than a bogus integer.
205                if is_type_word(v) {
206                    String::new()
207                } else {
208                    format!("{v}")
209                }
210            }
211            1 => format!("{}", self.value[0]),
212            _ => hex_summary(self.value),
213        }
214    }
215}
216
/// Reports whether `v` is one of the Notes item data-type constants, i.e.
/// the word an empty field stores in place of data.
///
/// Recognized words: NUMBER 0x0300, NUMBER_RANGE 0x0301, TIME 0x0400,
/// TIME_RANGE 0x0401, TEXT 0x0500, TEXT_LIST 0x0501, FORMULA
/// 0x0600/0x0601, USERID 0x0700.
fn is_type_word(v: u16) -> bool {
    const TYPE_WORDS: [u16; 9] = [
        0x0300, 0x0301, // NUMBER, NUMBER_RANGE
        0x0400, 0x0401, // TIME, TIME_RANGE
        0x0500, 0x0501, // TEXT, TEXT_LIST
        0x0600, 0x0601, // FORMULA
        0x0700, // USERID
    ];
    TYPE_WORDS.contains(&v)
}
227
228impl NoteItem<'_> {
229    /// Render the value using the authoritative [`FieldKind`] (from the BDB
230    /// UNK table) rather than guessing by shape. Rich-text and attachment
231    /// values live in the note's non-summary data; here they render as a
232    /// kind marker (use `Database::non_summary_data` for the content).
233    pub fn render(&self, kind: FieldKind) -> String {
234        if self.value.is_empty() {
235            return String::new();
236        }
237        // An empty field stores only its 2-byte type word.
238        if self.value.len() == 2 && is_type_word(u16::from_le_bytes([self.value[0], self.value[1]])) {
239            return String::new();
240        }
241        match kind {
242            FieldKind::Text
243            | FieldKind::TextList
244            | FieldKind::Rfc822Text
245            | FieldKind::Formula
246            | FieldKind::Html
247            | FieldKind::MimePart => {
248                if self.is_printable_text() {
249                    self.as_text()
250                } else {
251                    hex_summary(self.value)
252                }
253            }
254            FieldKind::Number | FieldKind::NumberRange => {
255                if self.value.len() >= 8 {
256                    let b: [u8; 8] = self.value[..8].try_into().expect("len checked");
257                    let f = f64::from_le_bytes(b);
258                    if f.is_finite() && f.fract() == 0.0 && f.abs() < 1e15 {
259                        format!("{}", f as i64)
260                    } else if f.is_finite() {
261                        format!("{f}")
262                    } else {
263                        hex_summary(self.value)
264                    }
265                } else {
266                    self.display_value()
267                }
268            }
269            FieldKind::Time | FieldKind::TimeRange => {
270                if self.value.len() >= 8 {
271                    if let Ok(td) = Timedate::from_bytes(&self.value[..8]) {
272                        if let Some(c) = td.as_clock() {
273                            return c.to_iso_8601();
274                        }
275                    }
276                    hex_summary(self.value)
277                } else {
278                    self.display_value()
279                }
280            }
281            FieldKind::RichText => "(rich text)".to_string(),
282            FieldKind::Object => "(attachment / object)".to_string(),
283            FieldKind::Unknown => self.display_value(),
284        }
285    }
286}
287
/// Formats at most the first 16 bytes of `b` as space-separated,
/// two-digit lowercase hex. When `b` is longer than that, `" ..."` is
/// appended to show the output was cut short. Empty input gives an empty
/// string.
fn hex_summary(b: &[u8]) -> String {
    const SHOWN: usize = 16;
    let mut rendered = b
        .iter()
        .take(SHOWN)
        .map(|byte| format!("{byte:02x}"))
        .collect::<Vec<_>>()
        .join(" ");
    if b.len() > SHOWN {
        rendered.push_str(" ...");
    }
    rendered
}
302
303/// Parse the items of a note from its full record bytes (starting at the
304/// note header). `number_of_note_items` comes from the note header. Items
305/// whose value would run past the record are dropped (truncated record);
306/// the walk stops there rather than emitting out-of-bounds slices.
307pub fn parse_items(record: &[u8], number_of_note_items: u16) -> Vec<NoteItem<'_>> {
308    let count = number_of_note_items as usize;
309    let table_end = NOTE_HEADER_BYTES + count * ITEM_DESCRIPTOR_BYTES;
310    if record.len() < table_end {
311        return Vec::new();
312    }
313    let mut items = Vec::with_capacity(count);
314    let mut cursor = table_end;
315    for i in 0..count {
316        let d = NOTE_HEADER_BYTES + i * ITEM_DESCRIPTOR_BYTES;
317        let name_id = u16::from_le_bytes([record[d], record[d + 1]]);
318        let type_flags = u16::from_le_bytes([record[d + 2], record[d + 3]]);
319        let value_size = u16::from_le_bytes([record[d + 4], record[d + 5]]) as usize;
320        let Some(value) = record.get(cursor..cursor + value_size) else {
321            break;
322        };
323        cursor += value_size;
324        items.push(NoteItem {
325            name_id,
326            type_flags,
327            value,
328        });
329    }
330    items
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Assemble a synthetic note record from `(name_id, type_flags, value)`
    /// triples: a zeroed note header, one 8-byte descriptor per item, then
    /// all values packed back to back in the same order.
    fn synthetic(items: &[(u16, u16, &[u8])]) -> Vec<u8> {
        let mut record = vec![0u8; NOTE_HEADER_BYTES];
        // Descriptor table: name id, type/flags, value size, reserved.
        for &(name_id, type_flags, value) in items {
            let descriptor = [
                name_id.to_le_bytes(),
                type_flags.to_le_bytes(),
                (value.len() as u16).to_le_bytes(),
                0u16.to_le_bytes(),
            ];
            record.extend_from_slice(&descriptor.concat());
        }
        // Value region.
        for &(_, _, value) in items {
            record.extend_from_slice(value);
        }
        record
    }

    #[test]
    fn parses_packed_text_values() {
        let record = synthetic(&[
            (0x09A1, 0x000C, b"613 Goolagong Pde."),
            (0x07E5, 0x020C, b"a@b.org"),
            (0x0036, 0x0004, b""), // empty value
        ]);
        let parsed = parse_items(&record, 3);
        assert_eq!(parsed.len(), 3);

        let street = &parsed[0];
        assert_eq!(street.name_id, 0x09A1);
        assert_eq!(street.as_text(), "613 Goolagong Pde.");
        assert!(street.is_printable_text());

        assert_eq!(parsed[1].as_text(), "a@b.org");
        assert!(parsed[2].value.is_empty());
    }

    #[test]
    fn truncated_record_stops_cleanly() {
        let mut record = synthetic(&[(0x0001, 0x000C, b"hello world")]);
        // Cut four bytes off the end so the only value is incomplete.
        let shortened = record.len() - 4;
        record.truncate(shortened);
        // The value would overrun the record: it is dropped without a panic.
        assert!(parse_items(&record, 1).is_empty());
    }

    #[test]
    fn zero_items_yields_empty() {
        let header_only = vec![0u8; NOTE_HEADER_BYTES];
        assert!(parse_items(&header_only, 0).is_empty());
    }

    #[test]
    fn display_value_renders_by_shape() {
        let record = synthetic(&[
            (1, 0x0C, b"hello"),                 // text
            (2, 0x04, &0x0500u16.to_le_bytes()), // bare TEXT type word -> empty
            (3, 0x04, &42u16.to_le_bytes()),     // real 2-byte integer
            (4, 0x04, &[0x99; 6]),               // 6 bytes -> hex summary
        ]);
        let parsed = parse_items(&record, 4);
        let shown: Vec<String> = parsed.iter().map(|item| item.display_value()).collect();
        assert_eq!(shown[0], "hello");
        assert_eq!(shown[1], ""); // type-word placeholder
        assert_eq!(shown[2], "42");
        assert_eq!(shown[3], "99 99 99 99 99 99");
    }
}