imessage_database/util/typedstream/
parser.rs

1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5   - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6   - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7   - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14    error::typedstream::TypedStreamError,
15    util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Tag byte announcing that the next two bytes hold a 16-bit integer (see [`i16`])
const I_16: u8 = 0x81;
/// Tag byte announcing that the next four bytes hold a 32-bit integer (see [`i32`])
const I_32: u8 = 0x82;
/// Tag byte announcing a raw [`f32`] or [`f64`]; the [`Type`] being read decides how many bytes follow
const DECIMAL: u8 = 0x83;
/// Tag byte that opens a new object
const START: u8 = 0x84;
/// Tag byte meaning there is nothing further to parse, e.g. the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Tag byte that closes an object
const END: u8 = 0x86;
/// Base value for references: a pointer byte minus this tag is an index into a table of already-seen items
const REFERENCE_TAG: u64 = 0x92;
32
33/// Contains logic and data used to deserialize data from a `typedstream`.
34///
35/// `typedstream` is a binary serialization format developed by NeXT and later adopted by Apple.
36/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
37///
38/// A `typedstream` begins with a header that includes format version and architecture information,
39/// followed by a stream of typed data elements. Each element is prefixed with type information,
40/// allowing the [`TypedStreamReader`] to understand the original data structures.
41#[derive(Debug)]
42pub struct TypedStreamReader<'a> {
43    /// The `typedstream` we want to parse
44    stream: &'a [u8],
45    /// The current index we are at in the stream
46    idx: usize,
47    /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
48    ///
49    /// The first time a [`Type`] is seen, it is present in the stream literally,
50    /// but afterwards are only referenced by index in order of appearance.
51    types_table: Vec<Vec<Type>>,
52    /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
53    object_table: Vec<Archivable>,
54    /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
55    seen_embedded_types: HashSet<u32>,
56    /// Stores the position of the current [`Archivable::Placeholder`]
57    placeholder: Option<usize>,
58}
59
60impl<'a> TypedStreamReader<'a> {
61    /// Given a stream, construct a reader instance to parse it.
62    ///
63    /// # Example:
64    ///
65    /// ```
66    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67    ///
68    /// let bytes: Vec<u8> = vec![]; // Example stream
69    /// let mut reader = TypedStreamReader::from(&bytes);
70    /// ```
71    pub fn from(stream: &'a [u8]) -> Self {
72        Self {
73            stream,
74            idx: 0,
75            types_table: vec![],
76            object_table: vec![],
77            seen_embedded_types: HashSet::new(),
78            placeholder: None,
79        }
80    }
81
82    /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
83    /// we store it in the largest possible value.
84    fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
85        match self.get_current_byte()? {
86            I_16 => {
87                let size = 2;
88                self.idx += 1;
89                let value = i16::from_le_bytes(
90                    <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
91                        .map_err(TypedStreamError::SliceError)?,
92                );
93                Ok(value as i64)
94            }
95            I_32 => {
96                let size = 4;
97                self.idx += 1;
98                let value = i32::from_le_bytes(
99                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
100                        .map_err(TypedStreamError::SliceError)?,
101                );
102                Ok(value as i64)
103            }
104            _ => {
105                if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
106                    self.idx += 1;
107                    return self.read_signed_int();
108                }
109                let value = i8::from_le_bytes([self.get_current_byte()?]);
110                self.idx += 1;
111                Ok(value as i64)
112            }
113        }
114    }
115
116    /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
117    /// we store it in the largest possible value.
118    fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
119        match self.get_current_byte()? {
120            I_16 => {
121                let size = 2;
122                self.idx += 1;
123                let value = u16::from_le_bytes(
124                    <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
125                        .map_err(TypedStreamError::SliceError)?,
126                );
127                Ok(value as u64)
128            }
129            I_32 => {
130                let size = 4;
131                self.idx += 1;
132                let value = u32::from_le_bytes(
133                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
134                        .map_err(TypedStreamError::SliceError)?,
135                );
136                Ok(value as u64)
137            }
138            _ => {
139                let value = u8::from_le_bytes([self.get_current_byte()?]);
140                self.idx += 1;
141                Ok(value as u64)
142            }
143        }
144    }
145
146    /// Read a single-precision float from the byte stream
147    fn read_float(&mut self) -> Result<f32, TypedStreamError> {
148        match self.get_current_byte()? {
149            DECIMAL => {
150                let size = 4;
151                self.idx += 1;
152                let value = f32::from_le_bytes(
153                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
154                        .map_err(TypedStreamError::SliceError)?,
155                );
156                Ok(value)
157            }
158            I_16 | I_32 => Ok(self.read_signed_int()? as f32),
159            _ => {
160                self.idx += 1;
161                Ok(self.read_signed_int()? as f32)
162            }
163        }
164    }
165
166    /// Read a double-precision float from the byte stream
167    fn read_double(&mut self) -> Result<f64, TypedStreamError> {
168        match self.get_current_byte()? {
169            DECIMAL => {
170                let size = 8;
171                self.idx += 1;
172                let value = f64::from_le_bytes(
173                    <[u8; 8]>::try_from(self.read_exact_bytes(size)?)
174                        .map_err(TypedStreamError::SliceError)?,
175                );
176                Ok(value)
177            }
178            I_16 | I_32 => Ok(self.read_signed_int()? as f64),
179            _ => {
180                self.idx += 1;
181                Ok(self.read_signed_int()? as f64)
182            }
183        }
184    }
185
186    /// Read exactly `n` bytes from the stream
187    fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
188        let range =
189            self.stream
190                .get(self.idx..self.idx + n)
191                .ok_or(TypedStreamError::OutOfBounds(
192                    self.idx + n,
193                    self.stream.len(),
194                ))?;
195        self.idx += n;
196        Ok(range)
197    }
198
199    /// Read `n` bytes as a String
200    fn read_exact_as_string(
201        &mut self,
202        n: usize,
203        string: &mut String,
204    ) -> Result<(), TypedStreamError> {
205        let str = std::str::from_utf8(self.read_exact_bytes(n)?)
206            .map_err(TypedStreamError::StringParseError)?;
207        string.push_str(str);
208        Ok(())
209    }
210
211    /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
212    fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
213        if byte_idx < self.stream.len() {
214            return Ok(self.stream[byte_idx]);
215        }
216        Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
217    }
218
219    /// Read the current byte
220    fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
221        self.get_byte(self.idx)
222    }
223
224    /// Read the next byte
225    fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
226        self.get_byte(self.idx + 1)
227    }
228
229    /// Read some bytes as an array
230    fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
231        Ok(self.read_exact_bytes(size)?.to_vec())
232    }
233
234    /// Determine the current types
235    fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
236        let length = self.read_unsigned_int()?;
237
238        let types = self.read_exact_bytes(length as usize)?;
239
240        // Handle array size
241        if types.first() == Some(&0x5b) {
242            return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
243        }
244
245        Ok(types.iter().map(Type::from_byte).collect())
246    }
247
248    /// Read a reference pointer for a Type
249    fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
250        let pointer = self.get_current_byte()?;
251        let result = (pointer as u32)
252            .checked_sub(REFERENCE_TAG as u32)
253            .ok_or(TypedStreamError::InvalidPointer(pointer));
254        self.idx += 1;
255        result
256    }
257
258    /// Read a class
259    fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
260        let mut out_v: Vec<Archivable> = vec![];
261        match self.get_current_byte()? {
262            START => {
263                // Skip some header bytes
264                while self.get_current_byte()? == START {
265                    self.idx += 1;
266                }
267                let length = self.read_unsigned_int()?;
268
269                if length >= REFERENCE_TAG {
270                    let index = length - REFERENCE_TAG;
271                    return Ok(ClassResult::Index(index as usize));
272                }
273
274                let mut class_name = String::with_capacity(length as usize);
275                self.read_exact_as_string(length as usize, &mut class_name)?;
276
277                let version = self.read_unsigned_int()?;
278
279                self.types_table
280                    .push(vec![Type::new_string(class_name.clone())]);
281
282                out_v.push(Archivable::Class(Class::new(class_name, version)));
283
284                if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
285                    out_v.extend(parent);
286                }
287            }
288            EMPTY => {
289                self.idx += 1;
290            }
291            _ => {
292                let index = self.read_pointer()?;
293                return Ok(ClassResult::Index(index as usize));
294            }
295        }
296        Ok(ClassResult::ClassHierarchy(out_v))
297    }
298
299    /// Read an object into the cache and emit, or emit an already-cached object
300    fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
301        match self.get_current_byte()? {
302            START => {
303                match self.read_class()? {
304                    ClassResult::Index(idx) => {
305                        return Ok(self.object_table.get(idx));
306                    }
307                    ClassResult::ClassHierarchy(classes) => {
308                        for class in classes.into_iter() {
309                            self.object_table.push(class)
310                        }
311                    }
312                }
313                Ok(None)
314            }
315            EMPTY => {
316                self.idx += 1;
317                Ok(None)
318            }
319            _ => {
320                let index = self.read_pointer()?;
321                Ok(self.object_table.get(index as usize))
322            }
323        }
324    }
325
326    /// Read String data
327    fn read_string(&mut self) -> Result<String, TypedStreamError> {
328        let length = self.read_unsigned_int()?;
329        let mut string = String::with_capacity(length as usize);
330        self.read_exact_as_string(length as usize, &mut string)?;
331
332        Ok(string)
333    }
334
335    /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
336    fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
337        // Skip the 0x84
338        self.idx += 1;
339        match self.get_type(true)? {
340            Some(types) => self.read_types(types),
341            None => Ok(None),
342        }
343    }
344
345    /// Gets the current type from the stream, either by reading it from the stream or reading it from
346    /// the specified index of [`TypedStreamReader::types_table`]. Because methods that use this type can also mutate self,
347    /// returning a reference here means other methods could make that reference to the table invalid,
348    /// which is disallowed in Rust. Thus, we return a clone of the cached data.
349    fn get_type(&mut self, embedded: bool) -> Result<Option<Vec<Type>>, TypedStreamError> {
350        match self.get_current_byte()? {
351            START => {
352                // Ignore repeated types, for example in a dict
353                self.idx += 1;
354
355                let object_types = self.read_type()?;
356
357                // Embedded data is stored as a C String in the objects table
358                if embedded {
359                    self.object_table
360                        .push(Archivable::Type(object_types.clone()));
361                    // We only want to include the first embedded reference tag, not subsequent references to the same embed
362                    self.seen_embedded_types
363                        .insert(self.object_table.len().saturating_sub(1) as u32);
364                }
365                self.types_table.push(object_types);
366                Ok(self.types_table.last().cloned())
367            }
368            END => {
369                // This indicates the end of the current object
370                Ok(None)
371            }
372            _ => {
373                // Ignore repeated types, for example in a dict
374                while self.get_current_byte()? == self.get_next_byte()? {
375                    self.idx += 1;
376                }
377
378                let ref_tag = self.read_pointer()?;
379                let result = self.types_table.get(ref_tag as usize);
380
381                if embedded {
382                    if let Some(res) = result {
383                        // We only want to include the first embedded reference tag, not subsequent references to the same embed
384                        if !self.seen_embedded_types.contains(&ref_tag) {
385                            self.object_table.push(Archivable::Type(res.clone()));
386                            self.seen_embedded_types.insert(ref_tag);
387                        }
388                    }
389                }
390
391                Ok(result.cloned())
392            }
393        }
394    }
395
396    /// Given some [`Type`]s, look at the stream and parse the data according to the specified [`Type`]
397    fn read_types(
398        &mut self,
399        found_types: Vec<Type>,
400    ) -> Result<Option<Archivable>, TypedStreamError> {
401        let mut out_v = vec![];
402        let mut is_obj: bool = false;
403
404        for found_type in found_types {
405            match found_type {
406                Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
407                Type::EmbeddedData => {
408                    return self.read_embedded_data();
409                }
410                Type::Object => {
411                    is_obj = true;
412                    let length = self.object_table.len();
413                    self.placeholder = Some(length);
414                    self.object_table.push(Archivable::Placeholder);
415                    if let Some(object) = self.read_object()? {
416                        match object.clone() {
417                            Archivable::Object(_, data) => {
418                                // If this is a new object, i.e. one without any data, we add the data into it later
419                                // If the object already has data in it, we just want to return that object
420                                if !data.is_empty() {
421                                    let result = Ok(Some(object.clone()));
422                                    self.placeholder = None;
423                                    self.object_table.pop();
424                                    return result;
425                                }
426                                out_v.extend(data)
427                            }
428                            Archivable::Class(cls) => out_v.push(OutputData::Class(cls)),
429                            Archivable::Data(data) => out_v.extend(data),
430                            // These cases are used internally in the objects table but should not be present in any output
431                            Archivable::Placeholder | Archivable::Type(_) => {}
432                        }
433                    }
434                }
435                Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
436                Type::UnsignedInt => {
437                    out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?))
438                }
439                Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
440                Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
441                Type::Unknown(byte) => out_v.push(OutputData::Byte(byte)),
442                Type::String(s) => out_v.push(OutputData::String(s)),
443                Type::Array(size) => out_v.push(OutputData::Array(self.read_array(size)?)),
444            };
445        }
446
447        // If we had reserved a place for an object, fill that spot
448        if let Some(spot) = self.placeholder {
449            if !out_v.is_empty() {
450                // We got a class, but do not have its respective data yet
451                if let Some(OutputData::Class(class)) = out_v.last() {
452                    self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
453                // The spot after the current placeholder contains the class at the top of the class heirarchy, i.e.
454                // if we get a placeholder and then find a new class heirarchy, the object table holds the class chain
455                // in descending order of inheritance
456                } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
457                    self.object_table[spot] = Archivable::Object(class.clone(), out_v.clone());
458                    self.placeholder = None;
459                    return Ok(self.object_table.get(spot).cloned());
460                // We got some data for a class that was already seen
461                } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
462                    data.extend(out_v.clone());
463                    self.placeholder = None;
464                    return Ok(self.object_table.get(spot).cloned());
465                // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
466                } else {
467                    self.object_table[spot] = Archivable::Data(out_v.clone());
468                    self.placeholder = None;
469                    return Ok(self.object_table.get(spot).cloned());
470                }
471            }
472        }
473
474        if !out_v.is_empty() && !is_obj {
475            return Ok(Some(Archivable::Data(out_v.clone())));
476        }
477        Ok(None)
478    }
479
480    /// In the original source there are several variants of the header, but we
481    /// only need to validate that this is the header used by macOS/iOS, as iMessage
482    /// is probably not available on any NeXT platform
483    pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
484        // Encoding type
485        let typedstream_version = self.read_unsigned_int()?;
486        // Encoding signature
487        let signature = self.read_string()?;
488        // System version
489        let system_version = self.read_signed_int()?;
490
491        if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
492            return Err(TypedStreamError::InvalidHeader);
493        }
494
495        Ok(())
496    }
497
498    /// Attempt to get the data from the `typedstream`.
499    ///
500    /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
501    /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
502    ///
503    /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
504    /// Callers are responsible for assembling the deserialized stream into a useful data structure.
505    ///
506    /// # Example:
507    ///
508    /// ```
509    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
510    ///
511    /// let bytes: Vec<u8> = vec![]; // Example stream
512    /// let mut reader = TypedStreamReader::from(&bytes);
513    /// let result = reader.parse();
514    /// ```
515    ///
516    /// # Sample output:
517    /// ```txt
518    /// [
519    ///     Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
520    ///     Data([Integer(1), Integer(7)])  // The next object describes properties for the range of chars 1 through 7
521    ///     Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)])  // The first property is a `NSDictionary` with 1 item
522    ///     Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")])  // The first key in the `NSDictionary`
523    ///     Object(Class { name: "NSNumber", version: 0 }, [Integer(0)])  // The first value in the `NSDictionary`
524    /// ]
525    /// ```
526    pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
527        let mut out_v = vec![];
528
529        self.validate_header()?;
530
531        while self.idx < self.stream.len() {
532            if self.get_current_byte()? == END {
533                self.idx += 1;
534                continue;
535            }
536
537            // First, get the current type
538            if let Some(found_types) = self.get_type(false)? {
539                let result = self.read_types(found_types);
540                if let Ok(Some(res)) = result {
541                    out_v.push(res);
542                }
543            }
544        }
545
546        Ok(out_v)
547    }
548}