Skip to main content

sherlock_nsf_parser/
cd.rs

1//! Composite Data (CD) record parsing - Lotus Notes rich text + attachments.
2//!
3//! A note's non-summary data object (see [`crate::Database::non_summary_data`])
4//! is a CD-record stream that begins after the object's fixed 68-byte header.
5//! CD records carry the rich-text `$Body` (CDTEXT records) and embedded
6//! file/image attachments (CDFILEHEADER/CDFILESEGMENT, CDIMAGEHEADER/
7//! CDIMAGESEGMENT).
8//!
9//! CD records are NOT part of libnsfdb (which is container-level only); this
10//! was reverse-engineered against fakenames.nsf and cross-checked with the HCL
11//! Notes C API "Composite Data" reference. Validated end to end: a rich-text
12//! body decodes to its prose, and a 1.4 MB JPEG reconstructs from 137 image
13//! segments to a byte-valid `FF D8 ... FF D9` file.
14//!
15//! ## Record framing
16//!
17//! The byte immediately after the 1-byte signature selects the length class:
18//!
19//! ```text
20//! 0xFF -> WSIG: [sig:u8][0xFF][len:u16]   (4-byte header)
21//! 0x00 -> LSIG: [sig:u8][0x00][len:u32]   (6-byte header)
22//! else -> BSIG: [sig:u8][len:u8]          (2-byte header)
23//! ```
24//!
25//! `len` is the total record size including the header. Records are padded to
26//! an even (WORD) boundary: advance by `len + (len & 1)`.
27
/// Byte offset at which the CD-record stream begins inside a non-summary
/// object: immediately past the object's fixed 68-byte (`0x44`) header.
///
/// [`walk`] starts reading records at this offset; objects shorter than this
/// contain no records.
pub const CD_STREAM_START: usize = 0x44;
31
// Signature low-byte constants: the first byte of a CD record, which names the
// record type. The byte that follows it selects the length class (see the
// module-level "Record framing" notes).
/// CDTEXT: a run of rich-text body text.
const SIG_TEXT: u8 = 0x85;
/// CDIMAGEHEADER: opens an embedded image; its segments follow.
const SIG_IMAGEHEADER: u8 = 0x7D;
/// CDIMAGESEGMENT: one chunk of the current image's bytes.
const SIG_IMAGESEGMENT: u8 = 0x7C;
/// CDFILEHEADER: opens a file attachment; its segments follow.
const SIG_FILEHEADER: u8 = 0xA9;
/// CDFILESEGMENT: one chunk of the current file attachment.
const SIG_FILESEGMENT: u8 = 0xAA;
38
/// One CD record: its signature byte and the bytes after the framing header.
///
/// The record borrows its payload from the object buffer it was walked out
/// of (lifetime `'a`), so it holds no copy of the data.
#[derive(Debug, Clone, Copy)]
pub struct CdRecord<'a> {
    /// Signature low byte (the `SIG_CD_*` type).
    pub sig: u8,
    /// Record payload (between the framing header and the record end). The
    /// even-boundary pad byte that may follow a record is not included.
    pub body: &'a [u8],
}
47
48/// Walk the CD-record stream of a non-summary object (records start at
49/// [`CD_STREAM_START`]). Stops cleanly at a malformed / trailing-filler region.
50pub fn walk(obj: &[u8]) -> Vec<CdRecord<'_>> {
51    let mut i = CD_STREAM_START;
52    let mut out = Vec::new();
53    while i + 2 <= obj.len() {
54        let sig = obj[i];
55        let (hdr, total) = match obj[i + 1] {
56            0xFF => {
57                if i + 4 > obj.len() {
58                    break;
59                }
60                (4usize, u16::from_le_bytes([obj[i + 2], obj[i + 3]]) as usize)
61            }
62            0x00 => {
63                if i + 6 > obj.len() {
64                    break;
65                }
66                (
67                    6usize,
68                    u32::from_le_bytes([obj[i + 2], obj[i + 3], obj[i + 4], obj[i + 5]]) as usize,
69                )
70            }
71            b1 => (2usize, b1 as usize),
72        };
73        if total < hdr || i + total > obj.len() {
74            break;
75        }
76        out.push(CdRecord {
77            sig,
78            body: &obj[i + hdr..i + total],
79        });
80        i += total + (total & 1); // even-boundary padding
81    }
82    out
83}
84
/// What kind of object an [`Attachment`] reconstructs to.
///
/// The kind is decided by which header record opened the attachment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AttachmentKind {
    /// Embedded image (CDIMAGEHEADER/CDIMAGESEGMENT).
    Image,
    /// File attachment (CDFILEHEADER/CDFILESEGMENT).
    File,
}
93
/// A reconstructed attachment: suggested name + raw bytes.
#[derive(Debug, Clone)]
pub struct Attachment {
    /// File name (from CDFILEHEADER) or a synthesized `image_N.ext`. A file
    /// whose header yields no usable name is called `attachment.bin`.
    pub name: String,
    /// Reassembled bytes. May be empty for file variants whose segment
    /// encoding is not yet decoded (the name is still recovered).
    ///
    /// For images this is the concatenated segment payloads. For files it is
    /// the concatenation of the raw CDFILESEGMENT record bodies, with no
    /// per-segment fields stripped.
    pub data: Vec<u8>,
    /// Image vs file.
    pub kind: AttachmentKind,
}
105
/// Decoded rich-text + attachments of a note's non-summary object.
///
/// The `Default` value has no text and no attachments.
#[derive(Debug, Clone, Default)]
pub struct NoteContent {
    /// Plain-text rendering of the CDTEXT runs (the rich-text body). Only
    /// tab, LF, CR and printable ASCII bytes are kept; runs are joined with
    /// `'\n'` and trailing newlines are removed.
    pub body_text: String,
    /// Embedded images and file attachments.
    pub attachments: Vec<Attachment>,
}
114
115impl NoteContent {
116    /// True when there is neither body text nor any attachment.
117    pub fn is_empty(&self) -> bool {
118        self.body_text.trim().is_empty() && self.attachments.is_empty()
119    }
120}
121
/// Maps a CDIMAGEHEADER image-type code to a file extension.
///
/// Codes 1, 2 and 3 map to `gif`, `jpg` and `bmp`; any other code (including
/// 0) falls back to the generic `img`.
fn image_ext(image_type: u16) -> &'static str {
    // Indexed by `image_type - 1`.
    static KNOWN: [&str; 3] = ["gif", "jpg", "bmp"];

    image_type
        .checked_sub(1)
        .and_then(|idx| KNOWN.get(usize::from(idx)))
        .copied()
        .unwrap_or("img")
}
130
/// Extracts the file name from a CDFILEHEADER body.
///
/// The name is taken to be the first maximal run of at least three printable
/// bytes (ASCII graphic characters or spaces). Returns `None` when the body
/// contains no such run.
fn file_name(body: &[u8]) -> Option<String> {
    let printable = |b: &u8| b.is_ascii_graphic() || *b == b' ';

    // Splitting on every non-printable byte yields exactly the maximal
    // printable runs, in order (empty runs included, which the length filter
    // discards).
    body.split(|b| !printable(b))
        .find(|run| run.len() >= 3)
        .map(|run| String::from_utf8_lossy(run).into_owned())
}
149
150/// Parse a non-summary object into its rich-text body + attachments.
151pub fn parse(obj: &[u8]) -> NoteContent {
152    let recs = walk(obj);
153    let mut content = NoteContent::default();
154
155    // Body text: concatenate CDTEXT runs (4-byte font/style prefix, then LMBCS;
156    // we emit printable ASCII and treat NUL as a run separator).
157    for r in recs.iter().filter(|r| r.sig == SIG_TEXT) {
158        let text = r.body.get(4..).unwrap_or(&[]);
159        for &b in text {
160            match b {
161                0x09 | 0x0A | 0x0D | 0x20..=0x7E => content.body_text.push(b as char),
162                _ => {}
163            }
164        }
165        content.body_text.push('\n');
166    }
167    while content.body_text.ends_with('\n') {
168        content.body_text.pop();
169    }
170
171    // Attachments: a single pass that groups segments under the most recent
172    // image/file header.
173    let mut cur_image: Option<(u16, Vec<u8>)> = None;
174    let mut cur_file: Option<(String, Vec<u8>)> = None;
175    let mut img_n = 0usize;
176    let finish_image = |content: &mut NoteContent, img: Option<(u16, Vec<u8>)>, n: &mut usize| {
177        if let Some((ty, data)) = img {
178            if !data.is_empty() {
179                *n += 1;
180                content.attachments.push(Attachment {
181                    name: format!("image_{n}.{}", image_ext(ty)),
182                    data,
183                    kind: AttachmentKind::Image,
184                });
185            }
186        }
187    };
188    let finish_file = |content: &mut NoteContent, file: Option<(String, Vec<u8>)>| {
189        if let Some((name, data)) = file {
190            content.attachments.push(Attachment {
191                name,
192                data,
193                kind: AttachmentKind::File,
194            });
195        }
196    };
197
198    for r in &recs {
199        match r.sig {
200            SIG_IMAGEHEADER => {
201                finish_image(&mut content, cur_image.take(), &mut img_n);
202                finish_file(&mut content, cur_file.take());
203                let ty = if r.body.len() >= 2 {
204                    u16::from_le_bytes([r.body[0], r.body[1]])
205                } else {
206                    0
207                };
208                cur_image = Some((ty, Vec::new()));
209            }
210            SIG_IMAGESEGMENT => {
211                if let Some((_, data)) = cur_image.as_mut() {
212                    if r.body.len() >= 4 {
213                        let data_size = u16::from_le_bytes([r.body[0], r.body[1]]) as usize;
214                        let seg = r.body.get(4..4 + data_size).unwrap_or(&r.body[4..]);
215                        data.extend_from_slice(seg);
216                    }
217                }
218            }
219            SIG_FILEHEADER => {
220                finish_image(&mut content, cur_image.take(), &mut img_n);
221                finish_file(&mut content, cur_file.take());
222                let name = file_name(r.body).unwrap_or_else(|| "attachment.bin".to_string());
223                cur_file = Some((name, Vec::new()));
224            }
225            SIG_FILESEGMENT => {
226                if let Some((_, data)) = cur_file.as_mut() {
227                    data.extend_from_slice(r.body);
228                }
229            }
230            _ => {}
231        }
232    }
233    finish_image(&mut content, cur_image.take(), &mut img_n);
234    finish_file(&mut content, cur_file.take());
235
236    content
237}
238
#[cfg(test)]
mod tests {
    use super::{parse, walk};

    /// Objects too short to reach the CD stream must yield no records and
    /// must not panic.
    #[test]
    fn walk_empty_object_is_safe() {
        let too_short: [&[u8]; 2] = [&[], &[0u8; 10]];
        for obj in too_short {
            assert!(walk(obj).is_empty());
        }
    }

    /// An object consisting of nothing but its header has no content.
    #[test]
    fn parse_empty_is_empty() {
        let header_only = [0u8; 0x44];
        assert!(parse(&header_only).is_empty());
    }
}
253}