imessage_database/util/
streamtyped.rs

/*!
 The legacy/fallback simple `typedstream` parser.

 Contains the logic used to extract plain text from the `typedstream` data stored in `attributedBody`.

 The module is named `streamtyped` because that is the header string found at the start of the data.
*/
8
9use crate::error::streamtyped::StreamTypedError;
10
/// Two-byte marker that [`parse`] searches for; everything up to and including it is discarded.
///
/// Bytes: `[0x01, 0x2b]`, i.e. `<Start of Heading> (SOH)` followed by `+`.
/// - <https://www.compart.com/en/unicode/U+0001>
/// - <https://www.compart.com/en/unicode/U+002b>
const START_PATTERN: [u8; 2] = [0x01, 0x2b];
15
/// Two-byte marker that [`parse`] searches for; it and everything after it is discarded.
///
/// Bytes: `[0x86, 0x84]`, i.e. `<Start of Selected Area> (SSA)` followed by `<Index> (IND)`.
/// - <https://www.compart.com/en/unicode/U+0086>
/// - <https://www.compart.com/en/unicode/U+0084>
const END_PATTERN: [u8; 2] = [0x86, 0x84];
20
21/// Parse the body [text](crate::tables::messages::message::Message::text) from a known type of `typedstream` `attributedBody` file.
22///
23/// `attributedBody` `typedstream` data looks like:
24///
25/// ```txt
26/// streamtyped���@���NSAttributedString�NSObject����NSString��+Example message  ��iI���� NSDictionary��i����__kIMMessagePartAttributeName����NSNumber��NSValue��*������
27/// ```
28///
29/// In that example, the returned body text would be `"Example message"`.
30///
31/// ## Legacy parsing
32///
33/// If the `typedstream` data cannot be deserialized, we fall back to this legacy string parsing algorithm that
34/// only supports unstyled text.
35///
36/// If the message has attachments, there will be one [`U+FFFC`](https://www.compart.com/en/unicode/U+FFFC) character
37/// for each attachment and one [`U+FFFD`](https://www.compart.com/en/unicode/U+FFFD) for app messages that we need
38/// to format.
39///
40/// ## Sample
41///
42/// An iMessage that contains body text like:
43///
44/// ```
45/// let message_text = "\u{FFFC}Check out this photo!";
46/// ```
47///
48/// Will have a `body()` of:
49///
50/// ```
51/// use imessage_database::message_types::text_effects::TextEffect;
52/// use imessage_database::tables::messages::{models::{TextAttributes, BubbleComponent, AttachmentMeta}};
53///  
54/// let result = vec![
55///     BubbleComponent::Attachment(AttachmentMeta::default()),
56///     BubbleComponent::Text(vec![TextAttributes::new(3, 24, vec![TextEffect::Default])]),
57/// ];
58/// ```
59pub fn parse(mut stream: Vec<u8>) -> Result<String, StreamTypedError> {
60    // Find the start index and drain
61    for idx in 0..stream.len() {
62        if idx + 2 > stream.len() {
63            return Err(StreamTypedError::NoStartPattern);
64        }
65        let part = &stream[idx..idx + 2];
66
67        if part == START_PATTERN {
68            // Remove the start pattern from the string
69            stream.drain(..idx + 2);
70            break;
71        }
72    }
73
74    // Find the end index and truncate
75    for idx in 1..stream.len() {
76        if idx >= stream.len() - 2 {
77            return Err(StreamTypedError::NoEndPattern);
78        }
79        let part = &stream[idx..idx + 2];
80
81        if part == END_PATTERN {
82            // Remove the end pattern from the string
83            stream.truncate(idx);
84            break;
85        }
86    }
87
88    // `from_utf8` doesn't allocate, but `from_utf8_lossy` does, so we try the allocation-free
89    // version first and only allocate if it fails
90    match String::from_utf8(stream)
91        .map_err(|non_utf8| String::from_utf8_lossy(non_utf8.as_bytes()).into_owned())
92    {
93        // If the bytes are valid unicode, only one char prefixes the actual message
94        // ['\u{6}', 'T', ...] where `T` is the first real char
95        // The prefix char is not always the same
96        Ok(string) => drop_chars(1, string),
97        // If the bytes are not valid unicode, 3 chars prefix the actual message
98        // ['�', '�', '\0', 'T', ...] where `T` is the first real char
99        // The prefix chars are not always the same
100        Err(string) => drop_chars(3, string),
101    }
102}
103
104/// Drop `offset` chars from the front of a String
105fn drop_chars(offset: usize, mut string: String) -> Result<String, StreamTypedError> {
106    // Find the index of the specified character offset
107    let (position, _) = string
108        .char_indices()
109        .nth(offset)
110        .ok_or(StreamTypedError::InvalidPrefix)?;
111
112    // Remove the prefix and give the String back
113    string.drain(..position);
114    Ok(string)
115}
116
#[cfg(test)]
mod tests {
    use std::env::current_dir;
    use std::fs;

    use crate::util::streamtyped::{drop_chars, parse};

    /// Read the raw bytes of a fixture stored under `test_data/typedstream/`,
    /// resolved relative to the current working directory.
    ///
    /// Panics with the offending path if the fixture cannot be read.
    fn fixture(name: &str) -> Vec<u8> {
        let path = current_dir()
            .unwrap()
            .join("test_data/typedstream")
            .join(name);
        fs::read(&path)
            .unwrap_or_else(|why| panic!("unable to read fixture {}: {why}", path.display()))
    }

    #[test]
    fn test_parse_text_clean() {
        let parsed = parse(fixture("AttributedBodyTextOnly")).unwrap();

        assert_eq!(parsed, "Noter test");
    }

    #[test]
    fn test_parse_text_space() {
        let parsed = parse(fixture("AttributedBodyTextOnly2")).unwrap();

        assert_eq!(parsed, "Test 3");
    }

    #[test]
    fn test_parse_text_weird_font() {
        let parsed = parse(fixture("WeirdText")).unwrap();

        assert_eq!(parsed, "𝖍𝖊𝖑𝖑𝖔 𝖜𝖔𝖗𝖑𝖉");
    }

    #[test]
    fn test_parse_text_url() {
        let parsed = parse(fixture("URL")).unwrap();

        assert_eq!(parsed, "https://github.com/ReagentX/Logria");
    }

    #[test]
    fn test_parse_text_multi_part() {
        let parsed = parse(fixture("MultiPart")).unwrap();

        assert_eq!(parsed, "\u{FFFC}test 1\u{FFFC}test 2 \u{FFFC}test 3");
    }

    #[test]
    fn test_parse_text_app() {
        // This test removed a block of text so the pointers become misaligned during parsing
        let parsed = parse(fixture("ExtraData")).unwrap();

        // Only the leading text is checked; whatever follows it is not asserted on
        assert!(parsed.starts_with("This is parsing"));
    }

    #[test]
    fn test_parse_text_long() {
        let parsed = parse(fixture("LongMessage")).unwrap();

        assert!(parsed.starts_with("Sed nibh velit,"));
        assert_eq!(parsed.len(), 2359);
    }

    #[test]
    fn test_parse_text_blank() {
        assert!(parse(fixture("Blank")).is_err());
    }

    #[test]
    fn test_parse_text_multi_part_deleted() {
        let parsed = parse(fixture("MultiPartWithDeleted")).unwrap();

        let expected = "From arbitrary byte stream:\r\u{FFFC}To native Rust data structures:\r";

        assert_eq!(parsed, expected);
    }

    #[test]
    fn test_parse_text_attachment() {
        let parsed = parse(fixture("Attachment")).unwrap();

        let expected =
            "\u{FFFC}This is how the notes look to me fyi, in case it helps make sense of anything";

        assert_eq!(parsed, expected);
    }

    #[test]
    fn test_parse_text_array() {
        let parsed = parse(fixture("Array")).unwrap();

        assert_eq!(
            parsed,
            "A single ChatGPT instance takes 5MW of power to run"
        );
    }

    #[test]
    fn test_can_drop_chars() {
        assert_eq!(
            drop_chars(1, String::from("Hello world")).unwrap(),
            String::from("ello world")
        );
    }

    #[test]
    fn test_can_drop_chars_none() {
        assert_eq!(
            drop_chars(0, String::from("Hello world")).unwrap(),
            String::from("Hello world")
        );
    }

    #[test]
    fn test_cant_drop_all() {
        assert!(drop_chars(1000, String::from("Hello world")).is_err());
    }
}