imessage_database/util/typedstream/
parser.rs

/*!
 Logic used to deserialize data from a `typedstream`, focusing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).

 Logic reverse engineered from `typedstream` source located at:
   - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
   - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
   - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)

 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
*/
11use std::collections::HashSet;
12
13use crate::{
14    error::typedstream::TypedStreamError,
15    util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Tag byte: the next two bytes hold a little-endian 16-bit integer
const I_16: u8 = 0x81;
/// Tag byte: the next four bytes hold a little-endian 32-bit integer
const I_32: u8 = 0x82;
/// Tag byte: a floating point value follows; whether four bytes ([`f32`]) or eight bytes ([`f64`])
/// are consumed depends on the [`Type`] being read
const DECIMAL: u8 = 0x83;
/// Tag byte: new, not previously seen data (such as a type encoding or a class definition) follows
const START: u8 = 0x84;
/// Tag byte: nothing further to read here, for example at the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Tag byte: marks the last byte of an object
const END: u8 = 0x86;
/// Values at or above this tag are not literal data; subtracting the tag yields an index into a
/// table of already-seen items.
///
/// Stored as a [`u64`] so it can be compared directly against decoded unsigned integers; it is
/// narrowed with `as` wherever a single byte or a [`u32`] is needed.
const REFERENCE_TAG: u64 = 0x92;
32
33/// Contains logic and data used to deserialize data from a `typedstream`.
34///
35/// `typedstream` is a binary serialization format developed by NeXT and later adopted by Apple.
36/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
37///
38/// A `typedstream` begins with a header that includes format version and architecture information,
39/// followed by a stream of typed data elements. Each element is prefixed with type information,
40/// allowing the [`TypedStreamReader`] to understand the original data structures.
41#[derive(Debug)]
42pub struct TypedStreamReader<'a> {
43    /// The `typedstream` we want to parse
44    stream: &'a [u8],
45    /// The current index we are at in the stream
46    idx: usize,
47    /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
48    ///
49    /// The first time a [`Type`] is seen, it is present in the stream literally,
50    /// but afterwards are only referenced by index in order of appearance.
51    types_table: Vec<Vec<Type>>,
52    /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
53    object_table: Vec<Archivable>,
54    /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
55    seen_embedded_types: HashSet<u32>,
56    /// Stores the position of the current [`Archivable::Placeholder`]
57    placeholder: Option<usize>,
58}
59
60impl<'a> TypedStreamReader<'a> {
61    /// Given a stream, construct a reader instance to parse it.
62    ///
63    /// # Example:
64    ///
65    /// ```
66    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67    ///
68    /// let bytes: Vec<u8> = vec![]; // Example stream
69    /// let mut reader = TypedStreamReader::from(&bytes);
70    /// ```
71    pub fn from(stream: &'a [u8]) -> Self {
72        Self {
73            stream,
74            idx: 0,
75            types_table: vec![],
76            object_table: vec![],
77            seen_embedded_types: HashSet::new(),
78            placeholder: None,
79        }
80    }
81
82    /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
83    /// we store it in the largest possible value.
84    fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
85        match self.get_current_byte()? {
86            I_16 => {
87                let size = 2;
88                self.idx += 1;
89                let value = i16::from_le_bytes(
90                    self.read_exact_bytes(size)?
91                        .try_into()
92                        .map_err(TypedStreamError::SliceError)?,
93                );
94                Ok(value as i64)
95            }
96            I_32 => {
97                let size = 4;
98                self.idx += 1;
99                let value = i32::from_le_bytes(
100                    self.read_exact_bytes(size)?
101                        .try_into()
102                        .map_err(TypedStreamError::SliceError)?,
103                );
104                Ok(value as i64)
105            }
106            _ => {
107                if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
108                    self.idx += 1;
109                    return self.read_signed_int();
110                }
111                let value = i8::from_le_bytes([self.get_current_byte()?]);
112                self.idx += 1;
113                Ok(value as i64)
114            }
115        }
116    }
117
118    /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
119    /// we store it in the largest possible value.
120    fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
121        match self.get_current_byte()? {
122            I_16 => {
123                let size = 2;
124                self.idx += 1;
125                let value = u16::from_le_bytes(
126                    self.read_exact_bytes(size)?
127                        .try_into()
128                        .map_err(TypedStreamError::SliceError)?,
129                );
130                Ok(value as u64)
131            }
132            I_32 => {
133                let size = 4;
134                self.idx += 1;
135                let value = u32::from_le_bytes(
136                    self.read_exact_bytes(size)?
137                        .try_into()
138                        .map_err(TypedStreamError::SliceError)?,
139                );
140                Ok(value as u64)
141            }
142            _ => {
143                let value = u8::from_le_bytes([self.get_current_byte()?]);
144                self.idx += 1;
145                Ok(value as u64)
146            }
147        }
148    }
149
150    /// Read a single-precision float from the byte stream
151    fn read_float(&mut self) -> Result<f32, TypedStreamError> {
152        match self.get_current_byte()? {
153            DECIMAL => {
154                let size = 4;
155                self.idx += 1;
156                let value = f32::from_le_bytes(
157                    self.read_exact_bytes(size)?
158                        .try_into()
159                        .map_err(TypedStreamError::SliceError)?,
160                );
161                Ok(value)
162            }
163            I_16 | I_32 => Ok(self.read_signed_int()? as f32),
164            _ => {
165                self.idx += 1;
166                Ok(self.read_signed_int()? as f32)
167            }
168        }
169    }
170
171    /// Read a double-precision float from the byte stream
172    fn read_double(&mut self) -> Result<f64, TypedStreamError> {
173        match self.get_current_byte()? {
174            DECIMAL => {
175                let size = 8;
176                self.idx += 1;
177                let value = f64::from_le_bytes(
178                    self.read_exact_bytes(size)?
179                        .try_into()
180                        .map_err(TypedStreamError::SliceError)?,
181                );
182                Ok(value)
183            }
184            I_16 | I_32 => Ok(self.read_signed_int()? as f64),
185            _ => {
186                self.idx += 1;
187                Ok(self.read_signed_int()? as f64)
188            }
189        }
190    }
191
192    /// Read exactly `n` bytes from the stream
193    fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
194        let range =
195            self.stream
196                .get(self.idx..self.idx + n)
197                .ok_or(TypedStreamError::OutOfBounds(
198                    self.idx + n,
199                    self.stream.len(),
200                ))?;
201        self.idx += n;
202        Ok(range)
203    }
204
205    /// Read `n` bytes as a String
206    fn read_exact_as_string(
207        &mut self,
208        n: usize,
209        string: &mut String,
210    ) -> Result<(), TypedStreamError> {
211        let str = std::str::from_utf8(self.read_exact_bytes(n)?)
212            .map_err(TypedStreamError::StringParseError)?;
213        string.push_str(str);
214        Ok(())
215    }
216
217    /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
218    fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
219        if byte_idx < self.stream.len() {
220            return Ok(self.stream[byte_idx]);
221        }
222        Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
223    }
224
225    /// Read the current byte
226    fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
227        self.get_byte(self.idx)
228    }
229
230    /// Read the next byte
231    fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
232        self.get_byte(self.idx + 1)
233    }
234
235    /// Read some bytes as an array
236    fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
237        Ok(self.read_exact_bytes(size)?.to_vec())
238    }
239
240    /// Determine the current types
241    fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
242        let length = self.read_unsigned_int()?;
243
244        let types = self.read_exact_bytes(length as usize)?;
245
246        // Handle array size
247        if types.first() == Some(&0x5b) {
248            return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
249        }
250
251        Ok(types.iter().map(Type::from_byte).collect())
252    }
253
254    /// Read a reference pointer for a Type
255    fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
256        let pointer = self.get_current_byte()?;
257        let result = (pointer as u32)
258            .checked_sub(REFERENCE_TAG as u32)
259            .ok_or(TypedStreamError::InvalidPointer(pointer));
260        self.idx += 1;
261        result
262    }
263
264    /// Read a class
265    fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
266        let mut out_v: Vec<Archivable> = vec![];
267        match self.get_current_byte()? {
268            START => {
269                // Skip some header bytes
270                while self.get_current_byte()? == START {
271                    self.idx += 1;
272                }
273                let length = self.read_unsigned_int()?;
274
275                if length >= REFERENCE_TAG {
276                    let index = length - REFERENCE_TAG;
277                    return Ok(ClassResult::Index(index as usize));
278                }
279
280                let mut class_name = String::with_capacity(length as usize);
281                self.read_exact_as_string(length as usize, &mut class_name)?;
282
283                let version = self.read_unsigned_int()?;
284
285                self.types_table
286                    .push(vec![Type::new_string(class_name.clone())]);
287
288                out_v.push(Archivable::Class(Class::new(class_name, version)));
289
290                if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
291                    out_v.extend(parent);
292                }
293            }
294            EMPTY => {
295                self.idx += 1;
296            }
297            _ => {
298                let index = self.read_pointer()?;
299                return Ok(ClassResult::Index(index as usize));
300            }
301        }
302        Ok(ClassResult::ClassHierarchy(out_v))
303    }
304
305    /// Read an object into the cache and emit, or emit an already-cached object
306    fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
307        match self.get_current_byte()? {
308            START => {
309                match self.read_class()? {
310                    ClassResult::Index(idx) => {
311                        return Ok(self.object_table.get(idx));
312                    }
313                    ClassResult::ClassHierarchy(classes) => {
314                        for class in classes.into_iter() {
315                            self.object_table.push(class)
316                        }
317                    }
318                }
319                Ok(None)
320            }
321            EMPTY => {
322                self.idx += 1;
323                Ok(None)
324            }
325            _ => {
326                let index = self.read_pointer()?;
327                Ok(self.object_table.get(index as usize))
328            }
329        }
330    }
331
332    /// Read String data
333    fn read_string(&mut self) -> Result<String, TypedStreamError> {
334        let length = self.read_unsigned_int()?;
335        let mut string = String::with_capacity(length as usize);
336        self.read_exact_as_string(length as usize, &mut string)?;
337
338        Ok(string)
339    }
340
341    /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
342    fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
343        // Skip the 0x84
344        self.idx += 1;
345        match self.get_type(true)? {
346            Some(types) => self.read_types(types),
347            None => Ok(None),
348        }
349    }
350
351    /// Gets the current type from the stream, either by reading it from the stream or reading it from
352    /// the specified index of [`TypedStreamReader::types_table`]. Because methods that use this type can also mutate self,
353    /// returning a reference here means other methods could make that reference to the table invalid,
354    /// which is disallowed in Rust. Thus, we return a clone of the cached data.
355    fn get_type(&mut self, embedded: bool) -> Result<Option<Vec<Type>>, TypedStreamError> {
356        match self.get_current_byte()? {
357            START => {
358                // Ignore repeated types, for example in a dict
359                self.idx += 1;
360
361                let object_types = self.read_type()?;
362
363                // Embedded data is stored as a C String in the objects table
364                if embedded {
365                    self.object_table
366                        .push(Archivable::Type(object_types.clone()));
367                }
368                self.types_table.push(object_types);
369                Ok(self.types_table.last().cloned())
370            }
371            END => {
372                // This indicates the end of the current object
373                Ok(None)
374            }
375            _ => {
376                // Ignore repeated types, for example in a dict
377                while self.get_current_byte()? == self.get_next_byte()? {
378                    self.idx += 1;
379                }
380
381                let ref_tag = self.read_pointer()?;
382                let result = self.types_table.get(ref_tag as usize);
383
384                if embedded {
385                    if let Some(res) = result {
386                        // We only want to include the first embedded reference tag, not subsequent references to the same embed
387                        if !self.seen_embedded_types.contains(&ref_tag) {
388                            self.object_table.push(Archivable::Type(res.clone()));
389                            self.seen_embedded_types.insert(ref_tag);
390                        }
391                    }
392                }
393
394                Ok(result.cloned())
395            }
396        }
397    }
398
399    /// Given some [`Type`]s, look at the stream and parse the data according to the specified [`Type`]
400    fn read_types(
401        &mut self,
402        found_types: Vec<Type>,
403    ) -> Result<Option<Archivable>, TypedStreamError> {
404        let mut out_v = vec![];
405        let mut is_obj: bool = false;
406
407        for found_type in found_types {
408            match found_type {
409                Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
410                Type::EmbeddedData => {
411                    return self.read_embedded_data();
412                }
413                Type::Object => {
414                    is_obj = true;
415                    let length = self.object_table.len();
416                    self.placeholder = Some(length);
417                    self.object_table.push(Archivable::Placeholder);
418                    if let Some(object) = self.read_object()? {
419                        match object.clone() {
420                            Archivable::Object(_, data) => {
421                                // If this is a new object, i.e. one without any data, we add the data into it later
422                                // If the object already has data in it, we just want to return that object
423                                if !data.is_empty() {
424                                    let result = Ok(Some(object.clone()));
425                                    self.placeholder = None;
426                                    self.object_table.pop();
427                                    return result;
428                                }
429                                out_v.extend(data)
430                            }
431                            Archivable::Class(cls) => out_v.push(OutputData::Class(cls)),
432                            Archivable::Data(data) => out_v.extend(data),
433                            // These cases are used internally in the objects table but should not be present in any output
434                            Archivable::Placeholder | Archivable::Type(_) => {}
435                        }
436                    }
437                }
438                Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
439                Type::UnsignedInt => {
440                    out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?))
441                }
442                Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
443                Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
444                Type::Unknown(byte) => out_v.push(OutputData::Byte(byte)),
445                Type::String(s) => out_v.push(OutputData::String(s)),
446                Type::Array(size) => out_v.push(OutputData::Array(self.read_array(size)?)),
447            };
448        }
449
450        // If we had reserved a place for an object, fill that spot
451        if let Some(spot) = self.placeholder {
452            if !out_v.is_empty() {
453                // We got a class, but do not have its respective data yet
454                if let Some(OutputData::Class(class)) = out_v.last() {
455                    self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
456                // The spot after the current placeholder contains the class at the top of the class heirarchy, i.e.
457                // if we get a placeholder and then find a new class heirarchy, the object table holds the class chain
458                // in descending order of inheritance
459                } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
460                    self.object_table[spot] = Archivable::Object(class.clone(), out_v.clone());
461                    self.placeholder = None;
462                    return Ok(self.object_table.get(spot).cloned());
463                // We got some data for a class that was already seen
464                } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
465                    data.extend(out_v.clone());
466                    self.placeholder = None;
467                    return Ok(self.object_table.get(spot).cloned());
468                // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
469                } else {
470                    self.object_table[spot] = Archivable::Data(out_v.clone());
471                    self.placeholder = None;
472                    return Ok(self.object_table.get(spot).cloned());
473                }
474            }
475        }
476
477        if !out_v.is_empty() && !is_obj {
478            return Ok(Some(Archivable::Data(out_v.clone())));
479        }
480        Ok(None)
481    }
482
483    /// In the original source there are several variants of the header, but we
484    /// only need to validate that this is the header used by macOS/iOS, as iMessage
485    /// is probably not available on any NeXT platform
486    pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
487        // Encoding type
488        let typedstream_version = self.read_unsigned_int()?;
489        // Encoding signature
490        let signature = self.read_string()?;
491        // System version
492        let system_version = self.read_signed_int()?;
493
494        if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
495            return Err(TypedStreamError::InvalidHeader);
496        }
497
498        Ok(())
499    }
500
501    /// Attempt to get the data from the `typedstream`.
502    ///
503    /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
504    /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
505    ///
506    /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
507    /// Callers are responsible for assembling the deserialized stream into a useful data structure.
508    ///
509    /// # Example:
510    ///
511    /// ```
512    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
513    ///
514    /// let bytes: Vec<u8> = vec![]; // Example stream
515    /// let mut reader = TypedStreamReader::from(&bytes);
516    /// let result = reader.parse();
517    /// ```
518    ///
519    /// # Sample output:
520    /// ```txt
521    /// [
522    ///     Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
523    ///     Data([Integer(1), Integer(7)])  // The next object describes properties for the range of chars 1 through 7
524    ///     Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)])  // The first property is a `NSDictionary` with 1 item
525    ///     Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")])  // The first key in the `NSDictionary`
526    ///     Object(Class { name: "NSNumber", version: 0 }, [Integer(0)])  // The first value in the `NSDictionary`
527    /// ]
528    /// ```
529    pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
530        let mut out_v = vec![];
531
532        self.validate_header()?;
533
534        while self.idx < self.stream.len() {
535            if self.get_current_byte()? == END {
536                self.idx += 1;
537                continue;
538            }
539
540            // First, get the current type
541            if let Some(found_types) = self.get_type(false)? {
542                let result = self.read_types(found_types);
543                if let Ok(Some(res)) = result {
544                    out_v.push(res);
545                }
546            }
547        }
548
549        Ok(out_v)
550    }
551}