Skip to main content

imessage_database/util/
streamtyped.rs

1/*!
2 Legacy text parser for `streamtyped` body blobs.
3
4 This parser extracts plain text from `attributedBody` when the full
5 typedstream deserializer cannot reconstruct the attributed string.
6*/
7
8use crate::error::streamtyped::StreamTypedError;
9
/// Byte pair that `parse` searches for to locate where the text payload begins.
///
/// Written as raw bytes; named here by their Unicode code points for readability:
/// `[<Start of Heading> (SOH), +]`
/// - SOH: <https://www.compart.com/en/unicode/U+0001>
/// - `+`: <https://www.compart.com/en/unicode/U+002b>
const START_PATTERN: [u8; 2] = [0x01, b'+'];
14
/// Byte pair that `parse` searches for to locate where the text payload ends.
///
/// Written as raw bytes; named here by their Unicode code points for readability:
/// `[<Start of Selected Area> (SSA), <Index> (IND)]`
/// - SSA: <https://www.compart.com/en/unicode/U+0086>
/// - IND: <https://www.compart.com/en/unicode/U+0084>
const END_PATTERN: [u8; 2] = [0x86, 0x84];
19
20/// Parse plain body [text](crate::tables::messages::message::Message::text) from a `streamtyped` `attributedBody` blob.
21///
22/// `attributedBody` `typedstream` data looks like:
23///
24/// ```txt
25/// streamtyped���@���NSAttributedString�NSObject����NSString��+Example message  ��iI���� NSDictionary��i����__kIMMessagePartAttributeName����NSNumber��NSValue��*������
26/// ```
27///
28/// In that example, the returned body text would be `"Example message"`.
29///
30/// ## Parser scope
31///
32/// This string parser only supports unstyled text.
33///
34/// If the message has attachments, there will be one [`U+FFFC`](https://www.compart.com/en/unicode/U+FFFC) character
35/// for each attachment and one [`U+FFFD`](https://www.compart.com/en/unicode/U+FFFD) for app messages that we need
36/// to format.
37///
38/// ## Sample
39///
40/// An iMessage that contains body text like:
41///
42/// ```
43/// let message_text = "\u{FFFC}Check out this photo!";
44/// ```
45///
46/// produces fallback body components like:
47///
48/// ```
49/// use imessage_database::message_types::text_effects::text_effect::TextEffect;
50/// use imessage_database::tables::messages::{models::{AttributedRange, BubbleComponent, AttachmentMeta}};
51///
52/// let result = vec![
53///     BubbleComponent::Run(vec![AttributedRange::attachment(0, 3, AttachmentMeta::default())]),
54///     BubbleComponent::Run(vec![AttributedRange::text(3, 24, vec![TextEffect::Default])]),
55/// ];
56/// ```
57pub fn parse(mut stream: Vec<u8>) -> Result<String, StreamTypedError> {
58    // Find the start index and drain
59    for idx in 0..stream.len() {
60        if idx + 2 > stream.len() {
61            return Err(StreamTypedError::NoStartPattern);
62        }
63        let part = &stream[idx..idx + 2];
64
65        if part == START_PATTERN {
66            // Remove the start pattern from the string
67            stream.drain(..idx + 2);
68            break;
69        }
70    }
71
72    // Find the end index and truncate
73    for idx in 1..stream.len() {
74        if idx >= stream.len() - 2 {
75            return Err(StreamTypedError::NoEndPattern);
76        }
77        let part = &stream[idx..idx + 2];
78
79        if part == END_PATTERN {
80            // Remove the end pattern from the string
81            stream.truncate(idx);
82            break;
83        }
84    }
85
86    // `from_utf8` doesn't allocate, but `from_utf8_lossy` does, so we try the allocation-free
87    // version first and only allocate if it fails
88    match String::from_utf8(stream)
89        .map_err(|non_utf8| String::from_utf8_lossy(non_utf8.as_bytes()).into_owned())
90    {
91        // Valid UTF-8 bodies carry one non-text prefix character.
92        // ['\u{6}', 'T', ...] where `T` is the first real char
93        Ok(string) => drop_chars(1, string),
94        // Lossy-decoded bodies carry three replacement/prefix characters.
95        // ['�', '�', '\0', 'T', ...] where `T` is the first real char
96        Err(string) => drop_chars(3, string),
97    }
98}
99
100/// Drop `offset` chars from the front of a string.
101fn drop_chars(offset: usize, mut string: String) -> Result<String, StreamTypedError> {
102    // Find the index of the specified character offset
103    let (position, _) = string
104        .char_indices()
105        .nth(offset)
106        .ok_or(StreamTypedError::InvalidPrefix)?;
107
108    // Remove the prefix and give the String back
109    string.drain(..position);
110    Ok(string)
111}
112
#[cfg(test)]
mod tests {
    use std::env::current_dir;
    use std::fs;

    use crate::util::streamtyped::{drop_chars, parse};

    /// Read every byte of a fixture whose path is given relative to the current directory.
    ///
    /// Panics if the current directory or the file cannot be read, which fails the test.
    fn read_fixture(relative_path: &str) -> Vec<u8> {
        let fixture_path = current_dir().unwrap().join(relative_path);
        fs::read(fixture_path).unwrap()
    }

    #[test]
    fn test_parse_text_clean() {
        let body = parse(read_fixture("test_data/typedstream/AttributedBodyTextOnly")).unwrap();

        assert_eq!(body, "Noter test".to_string());
    }

    #[test]
    fn test_parse_text_space() {
        let body = parse(read_fixture("test_data/typedstream/AttributedBodyTextOnly2")).unwrap();

        assert_eq!(body, "Test 3".to_string());
    }

    #[test]
    fn test_parse_text_weird_font() {
        let body = parse(read_fixture("test_data/typedstream/WeirdText")).unwrap();

        assert_eq!(body, "𝖍𝖊𝖑𝖑𝖔 𝖜𝖔𝖗𝖑𝖉".to_string());
    }

    #[test]
    fn test_parse_text_url() {
        let body = parse(read_fixture("test_data/typedstream/URL")).unwrap();

        assert_eq!(body, "https://github.com/ReagentX/Logria".to_string());
    }

    #[test]
    fn test_parse_text_multi_part() {
        let body = parse(read_fixture("test_data/typedstream/MultiPart")).unwrap();

        assert_eq!(
            body,
            "\u{FFFC}test 1\u{FFFC}test 2 \u{FFFC}test 3".to_string()
        );
    }

    #[test]
    fn test_parse_text_app() {
        // The fixture had a block of text removed, so pointers become misaligned during parsing
        let body = parse(read_fixture("test_data/typedstream/ExtraData")).unwrap();

        // Only the leading portion of the output is checked
        let wanted_prefix = "This is parsing";
        assert_eq!(&body[..wanted_prefix.len()], wanted_prefix);
    }

    #[test]
    fn test_parse_text_long() {
        let body = parse(read_fixture("test_data/typedstream/LongMessage")).unwrap();

        // Check both how the text begins and its total byte length
        let wanted_prefix = "Sed nibh velit,";
        assert_eq!(&body[..wanted_prefix.len()], wanted_prefix);
        assert_eq!(body.len(), 2359);
    }

    #[test]
    fn test_parse_text_blank() {
        let outcome = parse(read_fixture("test_data/typedstream/Blank"));

        assert!(&outcome.is_err());
    }

    #[test]
    fn test_parse_text_multi_part_deleted() {
        let body = parse(read_fixture("test_data/typedstream/MultiPartWithDeleted")).unwrap();
        println!("{body:?}");

        assert_eq!(
            body,
            "From arbitrary byte stream:\r\u{FFFC}To native Rust data structures:\r"
        );
    }

    #[test]
    fn test_parse_text_attachment() {
        let body = parse(read_fixture("test_data/typedstream/Attachment")).unwrap();
        println!("{body:?}");

        assert_eq!(
            body,
            "\u{FFFC}This is how the notes look to me fyi, in case it helps make sense of anything"
        );
    }

    #[test]
    fn test_parse_text_array() {
        let body = parse(read_fixture("test_data/typedstream/Array")).unwrap();
        println!("{body:?}");

        assert_eq!(body, "A single ChatGPT instance takes 5MW of power to run");
    }

    #[test]
    fn test_can_drop_chars() {
        let remaining = drop_chars(1, String::from("Hello world")).unwrap();

        assert_eq!(remaining, String::from("ello world"));
    }

    #[test]
    fn test_can_drop_chars_none() {
        let remaining = drop_chars(0, String::from("Hello world")).unwrap();

        assert_eq!(remaining, String::from("Hello world"));
    }

    #[test]
    fn test_cant_drop_all() {
        let outcome = drop_chars(1000, String::from("Hello world"));

        assert!(outcome.is_err());
    }
}