sherlock_nsf_parser/item.rs
1//! Note item parsing - the fields inside a note record.
2//!
3//! A note record is: the 100-byte note header, then `number_of_note_items`
4//! fixed 8-byte item descriptors, then the item values packed back to back
5//! in descriptor order. Reverse-engineered from the fakenames Person docs
6//! (validated against known field values - street addresses, e-mail
7//! addresses, names).
8//!
9//! Item descriptor (8 bytes):
10//!
11//! ```text
12//! offset width field
13//! 0 2 name_id (Unique Name Key id - the field name lives in
14//! the BDB UNK table, deduplicated across notes)
15//! 2 2 type_flags (item data-type + summary/flag bits)
16//! 4 2 value_size (byte length of this item's value)
17//! 6 2 reserved
18//! ```
19//!
20//! Each item's value is `value_size` bytes, taken sequentially from the
21//! value region that begins right after the descriptor table at
22//! `NOTE_HEADER_BYTES + number_of_note_items * ITEM_DESCRIPTOR_BYTES`.
23//!
24//! # What is and isn't decoded here
25//!
26//! This exposes each item's `name_id`, `type_flags`, and **raw value
27//! bytes**, plus a best-effort text rendering. Field *names* require the
28//! BDB Unique Name Key text table (not yet decoded - it is stored in a
29//! region of the BDB body that resists the documented single-stream CX
30//! decode). Typed decoding of numbers / times / rich-text (CD records) is
31//! left to later slices; the raw bytes are preserved so nothing is lost.
32
33use crate::note::NOTE_HEADER_BYTES;
34use crate::time::Timedate;
35
/// Byte length of a single on-disk item descriptor: four 16-bit fields
/// (`name_id`, `type_flags`, `value_size`, reserved).
pub const ITEM_DESCRIPTOR_BYTES: usize = 4 * std::mem::size_of::<u16>();
38
/// The data kind of a note item's field.
///
/// The on-disk note item carries no inline type word; the kind is instead
/// derived from the `(item_class, item_type)` bytes recorded for the field
/// in the BDB Unique Name Key table, which makes it the authoritative
/// source of an item's type. Resolve it via
/// [`crate::BucketDescriptorBlock::field_kind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldKind {
    /// Plain text (CLASS_TEXT / TYPE_TEXT).
    Text,
    /// Multi-value text (CLASS_TEXT / TYPE_TEXT_LIST).
    TextList,
    /// Internet header text (CLASS_TEXT / TYPE_RFC822_TEXT).
    Rfc822Text,
    /// IEEE-754 double (CLASS_NUMBER / TYPE_NUMBER).
    Number,
    /// Number range (CLASS_NUMBER / TYPE_NUMBER_RANGE).
    NumberRange,
    /// TIMEDATE value (CLASS_TIME / TYPE_TIME).
    Time,
    /// Time range (CLASS_TIME / TYPE_TIME_RANGE).
    TimeRange,
    /// Formula (CLASS_FORMULA).
    Formula,
    /// CD-record rich text such as `$Body` (NOCOMPUTE / TYPE_COMPOSITE).
    RichText,
    /// File attachment or object (NOCOMPUTE / TYPE_OBJECT).
    Object,
    /// HTML (NOCOMPUTE / TYPE_HTML).
    Html,
    /// MIME part (NOCOMPUTE / TYPE_MIME_PART).
    MimePart,
    /// Any class/type pairing that is not recognized.
    Unknown,
}
72
73impl FieldKind {
74 /// Short human label.
75 pub fn label(self) -> &'static str {
76 match self {
77 FieldKind::Text => "Text",
78 FieldKind::TextList => "Text list",
79 FieldKind::Rfc822Text => "RFC822 text",
80 FieldKind::Number => "Number",
81 FieldKind::NumberRange => "Number range",
82 FieldKind::Time => "Time",
83 FieldKind::TimeRange => "Time range",
84 FieldKind::Formula => "Formula",
85 FieldKind::RichText => "Rich text",
86 FieldKind::Object => "Attachment / object",
87 FieldKind::Html => "HTML",
88 FieldKind::MimePart => "MIME part",
89 FieldKind::Unknown => "Unknown",
90 }
91 }
92}
93
94/// Map a `(item_class, item_type)` pair to a [`FieldKind`]. Class/type are
95/// the bytes at UNK-entry offsets 7 and 6 respectively.
96pub fn field_kind(item_class: u8, item_type: u8) -> FieldKind {
97 match item_class {
98 0x05 => match item_type {
99 0x01 => FieldKind::TextList,
100 0x02 => FieldKind::Rfc822Text,
101 _ => FieldKind::Text,
102 },
103 0x03 => match item_type {
104 0x01 => FieldKind::NumberRange,
105 _ => FieldKind::Number,
106 },
107 0x04 => match item_type {
108 0x01 => FieldKind::TimeRange,
109 _ => FieldKind::Time,
110 },
111 0x06 => FieldKind::Formula,
112 0x00 => match item_type {
113 0x01 => FieldKind::RichText,
114 0x03 => FieldKind::Object,
115 0x15 => FieldKind::Html,
116 0x18 => FieldKind::MimePart,
117 _ => FieldKind::Unknown,
118 },
119 _ => FieldKind::Unknown,
120 }
121}
122
/// A single item parsed out of a note record.
///
/// The struct borrows its value from the record buffer, so it is cheap to
/// copy and never owns item data.
#[derive(Debug, Clone, Copy)]
pub struct NoteItem<'a> {
    /// Unique Name Key id identifying the field name. The name text is
    /// stored in the BDB UNK table (resolving it is a later slice), but
    /// the id is stable within a database, so it can be used to group or
    /// correlate fields.
    pub name_id: u16,
    /// Item type and flag bits: the low byte selects the value type
    /// family, the high bits hold summary / sign flags.
    pub type_flags: u16,
    /// The item's raw value, exactly `value_size` bytes long.
    pub value: &'a [u8],
}
136
137impl<'a> NoteItem<'a> {
138 /// Best-effort text rendering of the value: runs of printable ASCII are
139 /// kept, other bytes become `.`. Lotus text items (the common case for
140 /// names, addresses, e-mail) render cleanly; binary values (numbers,
141 /// timedates, rich text) render as dotted placeholders. Lossless access
142 /// to the original bytes is via [`Self::value`].
143 pub fn as_text(&self) -> String {
144 self.value
145 .iter()
146 .map(|&b| if (0x20..0x7f).contains(&b) { b as char } else { '.' })
147 .collect()
148 }
149
150 /// True if the value is entirely printable ASCII (a clean text field).
151 pub fn is_printable_text(&self) -> bool {
152 !self.value.is_empty()
153 && self
154 .value
155 .iter()
156 .all(|&b| (0x20..0x7f).contains(&b) || b == b'\t')
157 }
158
159 /// Best-effort human rendering of the value by shape (the on-disk note
160 /// summary does not carry a per-item type tag, so this infers it):
161 ///
162 /// - printable bytes -> text;
163 /// - 8 bytes that validate as a TIMEDATE (sane Julian-day range) -> ISO
164 /// date; otherwise an IEEE-754 double (the Notes NUMBER type) when it
165 /// is a sane magnitude;
166 /// - 1/2/4 bytes -> unsigned integer;
167 /// - anything else -> a hex byte summary.
168 ///
169 /// This is a display aid, not an authoritative type decode (proper
170 /// per-field typing from the form design is a later slice). The raw
171 /// bytes remain available via [`Self::value`].
172 pub fn display_value(&self) -> String {
173 if self.value.is_empty() {
174 return String::new();
175 }
176 if self.is_printable_text() {
177 return self.as_text();
178 }
179 match self.value.len() {
180 8 => {
181 if let Ok(td) = Timedate::from_bytes(self.value) {
182 if let Some(clock) = td.as_clock() {
183 return clock.to_iso_8601();
184 }
185 }
186 let bytes: [u8; 8] = self.value.try_into().expect("len checked");
187 let f = f64::from_le_bytes(bytes);
188 if f == 0.0 || (f.is_finite() && f.abs() >= 1e-4 && f.abs() < 1e15) {
189 if f.fract() == 0.0 {
190 return format!("{}", f as i64);
191 }
192 return format!("{f}");
193 }
194 hex_summary(self.value)
195 }
196 4 => format!(
197 "{}",
198 u32::from_le_bytes(self.value.try_into().expect("len checked"))
199 ),
200 2 => {
201 let v = u16::from_le_bytes([self.value[0], self.value[1]]);
202 // An empty field stores only its 2-byte Notes data-type word
203 // (TYPE_TEXT 0x0500, TYPE_NUMBER 0x0300, TYPE_TIME 0x0400,
204 // ...). Treat those as empty rather than a bogus integer.
205 if is_type_word(v) {
206 String::new()
207 } else {
208 format!("{v}")
209 }
210 }
211 1 => format!("{}", self.value[0]),
212 _ => hex_summary(self.value),
213 }
214 }
215}
216
/// Reports whether `v` is one of the Notes item data-type constants that
/// an empty field stores instead of data.
///
/// The accepted words are NUMBER 0x0300, NUMBER_RANGE 0x0301, TIME 0x0400,
/// TIME_RANGE 0x0401, TEXT 0x0500, TEXT_LIST 0x0501, FORMULA 0x0600/0x0601
/// and USERID 0x0700.
fn is_type_word(v: u16) -> bool {
    // The high byte is the class, the low byte the type within the class.
    let [type_byte, class_byte] = v.to_le_bytes();
    match class_byte {
        // Number, time, text and formula classes: type 0 or 1.
        0x03..=0x06 => type_byte <= 0x01,
        // USERID: type 0 only.
        0x07 => type_byte == 0x00,
        _ => false,
    }
}
227
228impl NoteItem<'_> {
229 /// Render the value using the authoritative [`FieldKind`] (from the BDB
230 /// UNK table) rather than guessing by shape. Rich-text and attachment
231 /// values live in the note's non-summary data; here they render as a
232 /// kind marker (use `Database::non_summary_data` for the content).
233 pub fn render(&self, kind: FieldKind) -> String {
234 if self.value.is_empty() {
235 return String::new();
236 }
237 // An empty field stores only its 2-byte type word.
238 if self.value.len() == 2 && is_type_word(u16::from_le_bytes([self.value[0], self.value[1]])) {
239 return String::new();
240 }
241 match kind {
242 FieldKind::Text
243 | FieldKind::TextList
244 | FieldKind::Rfc822Text
245 | FieldKind::Formula
246 | FieldKind::Html
247 | FieldKind::MimePart => {
248 if self.is_printable_text() {
249 self.as_text()
250 } else {
251 hex_summary(self.value)
252 }
253 }
254 FieldKind::Number | FieldKind::NumberRange => {
255 if self.value.len() >= 8 {
256 let b: [u8; 8] = self.value[..8].try_into().expect("len checked");
257 let f = f64::from_le_bytes(b);
258 if f.is_finite() && f.fract() == 0.0 && f.abs() < 1e15 {
259 format!("{}", f as i64)
260 } else if f.is_finite() {
261 format!("{f}")
262 } else {
263 hex_summary(self.value)
264 }
265 } else {
266 self.display_value()
267 }
268 }
269 FieldKind::Time | FieldKind::TimeRange => {
270 if self.value.len() >= 8 {
271 if let Ok(td) = Timedate::from_bytes(&self.value[..8]) {
272 if let Some(c) = td.as_clock() {
273 return c.to_iso_8601();
274 }
275 }
276 hex_summary(self.value)
277 } else {
278 self.display_value()
279 }
280 }
281 FieldKind::RichText => "(rich text)".to_string(),
282 FieldKind::Object => "(attachment / object)".to_string(),
283 FieldKind::Unknown => self.display_value(),
284 }
285 }
286}
287
/// Formats at most the first 16 bytes of `b` as space-separated,
/// two-digit lowercase hex, appending ` ...` when bytes were left out.
fn hex_summary(b: &[u8]) -> String {
    const SHOWN: usize = 16;

    let mut rendered = b
        .iter()
        .take(SHOWN)
        .map(|byte| format!("{byte:02x}"))
        .collect::<Vec<_>>()
        .join(" ");
    if b.len() > SHOWN {
        rendered.push_str(" ...");
    }
    rendered
}
302
303/// Parse the items of a note from its full record bytes (starting at the
304/// note header). `number_of_note_items` comes from the note header. Items
305/// whose value would run past the record are dropped (truncated record);
306/// the walk stops there rather than emitting out-of-bounds slices.
307pub fn parse_items(record: &[u8], number_of_note_items: u16) -> Vec<NoteItem<'_>> {
308 let count = number_of_note_items as usize;
309 let table_end = NOTE_HEADER_BYTES + count * ITEM_DESCRIPTOR_BYTES;
310 if record.len() < table_end {
311 return Vec::new();
312 }
313 let mut items = Vec::with_capacity(count);
314 let mut cursor = table_end;
315 for i in 0..count {
316 let d = NOTE_HEADER_BYTES + i * ITEM_DESCRIPTOR_BYTES;
317 let name_id = u16::from_le_bytes([record[d], record[d + 1]]);
318 let type_flags = u16::from_le_bytes([record[d + 2], record[d + 3]]);
319 let value_size = u16::from_le_bytes([record[d + 4], record[d + 5]]) as usize;
320 let Some(value) = record.get(cursor..cursor + value_size) else {
321 break;
322 };
323 cursor += value_size;
324 items.push(NoteItem {
325 name_id,
326 type_flags,
327 value,
328 });
329 }
330 items
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Assembles a synthetic note record: a zeroed note header, one 8-byte
    /// descriptor per `(name_id, type_flags, value)` entry, then all the
    /// values packed back to back in the same order.
    fn synthetic(items: &[(u16, u16, &[u8])]) -> Vec<u8> {
        let mut record = vec![0u8; NOTE_HEADER_BYTES];
        // Descriptor table: name_id, type_flags, value_size, reserved.
        for &(name_id, type_flags, value) in items {
            let value_size = value.len() as u16;
            for field in [name_id, type_flags, value_size, 0u16] {
                record.extend_from_slice(&field.to_le_bytes());
            }
        }
        // Value region.
        record.extend(items.iter().flat_map(|&(_, _, value)| value.iter().copied()));
        record
    }

    #[test]
    fn parses_packed_text_values() {
        let record = synthetic(&[
            (0x09A1, 0x000C, b"613 Goolagong Pde."),
            (0x07E5, 0x020C, b"a@b.org"),
            (0x0036, 0x0004, b""), // zero-length value
        ]);
        let parsed = parse_items(&record, 3);
        assert_eq!(parsed.len(), 3);

        let (street, email, blank) = (&parsed[0], &parsed[1], &parsed[2]);
        assert_eq!(street.name_id, 0x09A1);
        assert_eq!(street.as_text(), "613 Goolagong Pde.");
        assert!(street.is_printable_text());
        assert_eq!(email.as_text(), "a@b.org");
        assert!(blank.value.is_empty());
    }

    #[test]
    fn truncated_record_stops_cleanly() {
        let mut record = synthetic(&[(0x0001, 0x000C, b"hello world")]);
        // Cut into the value so it can no longer be read in full.
        let shortened = record.len() - 4;
        record.truncate(shortened);
        // The overrunning value is dropped without panicking.
        assert!(parse_items(&record, 1).is_empty());
    }

    #[test]
    fn zero_items_yields_empty() {
        let header_only = vec![0u8; NOTE_HEADER_BYTES];
        assert!(parse_items(&header_only, 0).is_empty());
    }

    #[test]
    fn display_value_renders_by_shape() {
        let record = synthetic(&[
            (1, 0x0C, b"hello"),                 // printable text
            (2, 0x04, &0x0500u16.to_le_bytes()), // bare TEXT type word -> empty
            (3, 0x04, &42u16.to_le_bytes()),     // genuine 2-byte integer
            (4, 0x04, &[0x99; 6]),               // 6 bytes -> hex summary
        ]);
        let rendered: Vec<String> = parse_items(&record, 4)
            .iter()
            .map(|item| item.display_value())
            .collect();
        assert_eq!(rendered, ["hello", "", "42", "99 99 99 99 99 99"]);
    }
}
400}