imessage_database/util/typedstream/
parser.rs

1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5   - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6   - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7   - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14    error::typedstream::TypedStreamError,
15    util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
18/// Tag byte announcing that the next two bytes hold a little-endian 16-bit integer
19const I_16: u8 = 0x81;
20/// Tag byte announcing that the next four bytes hold a little-endian 32-bit integer
21const I_32: u8 = 0x82;
22/// Tag byte announcing an [`f32`] or [`f64`] in the byte stream; the [`Type`] being read determines the size
23const DECIMAL: u8 = 0x83;
24/// Indicates the start of a new object
25const START: u8 = 0x84;
26/// Indicates that there is no more data to parse, for example the end of a class inheritance chain
27const EMPTY: u8 = 0x85;
28/// Indicates the last byte of an object
29const END: u8 = 0x86;
30/// Bytes equal or greater in value than the reference tag indicate an index in the table of already-seen types
///
/// Stored as a [`u64`] so it can be compared directly against decoded unsigned integers;
/// it is cast down at the places where a single byte or a [`u32`] is needed.
31const REFERENCE_TAG: u64 = 0x92;
32
33/// Contains logic and data used to deserialize data from a `typedstream`.
34///
35/// `typedstream` is a binary serialization format developed by `NeXT` and later adopted by Apple.
36/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
37///
38/// A `typedstream` begins with a header that includes format version and architecture information,
39/// followed by a stream of typed data elements. Each element is prefixed with type information,
40/// allowing the [`TypedStreamReader`] to understand the original data structures.
///
/// The reader borrows the stream for `'a`, never copies it as a whole, and keeps a cursor plus
/// two lookup tables that are filled in while parsing.
41#[derive(Debug)]
42pub struct TypedStreamReader<'a> {
43    /// The `typedstream` we want to parse
44    stream: &'a [u8],
45    /// Offset into `stream` of the byte that will be read next
46    idx: usize,
47    /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
48    ///
49    /// The first time a [`Type`] is seen, it is present in the stream literally,
50    /// but afterwards it is only referenced by its index, in order of appearance.
51    types_table: Vec<Vec<Type>>,
52    /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
53    object_table: Vec<Archivable>,
54    /// Indexes of embedded types that were already copied into `object_table`, so that later references to the same embed are not copied again
55    seen_embedded_types: HashSet<u32>,
56    /// Position in `object_table` of the [`Archivable::Placeholder`] that is still waiting to be filled, if any
57    placeholder: Option<usize>,
58}
59
60impl<'a> TypedStreamReader<'a> {
61    /// Given a stream, construct a reader instance to parse it.
62    ///
63    /// # Example:
64    ///
65    /// ```
66    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67    ///
68    /// let bytes: Vec<u8> = vec![]; // Example stream
69    /// let mut reader = TypedStreamReader::from(&bytes);
70    /// ```
71    #[must_use]
72    pub fn from(stream: &'a [u8]) -> Self {
73        Self {
74            stream,
75            idx: 0,
76            types_table: Vec::with_capacity(16),
77            object_table: Vec::with_capacity(32),
78            seen_embedded_types: HashSet::with_capacity(8),
79            placeholder: None,
80        }
81    }
82
83    /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
84    /// we store it in the largest possible value.
85    fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
86        match self.get_current_byte()? {
87            I_16 => {
88                let size = 2;
89                self.idx += 1;
90                let value = i16::from_le_bytes(
91                    <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
92                        .map_err(TypedStreamError::SliceError)?,
93                );
94                Ok(i64::from(value))
95            }
96            I_32 => {
97                let size = 4;
98                self.idx += 1;
99                let value = i32::from_le_bytes(
100                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
101                        .map_err(TypedStreamError::SliceError)?,
102                );
103                Ok(i64::from(value))
104            }
105            _ => {
106                if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
107                    self.idx += 1;
108                    return self.read_signed_int();
109                }
110                let value = i8::from_le_bytes([self.get_current_byte()?]);
111                self.idx += 1;
112                Ok(i64::from(value))
113            }
114        }
115    }
116
117    /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
118    /// we store it in the largest possible value.
119    fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
120        match self.get_current_byte()? {
121            I_16 => {
122                let size = 2;
123                self.idx += 1;
124                let value = u16::from_le_bytes(
125                    <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
126                        .map_err(TypedStreamError::SliceError)?,
127                );
128                Ok(u64::from(value))
129            }
130            I_32 => {
131                let size = 4;
132                self.idx += 1;
133                let value = u32::from_le_bytes(
134                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
135                        .map_err(TypedStreamError::SliceError)?,
136                );
137                Ok(u64::from(value))
138            }
139            _ => {
140                let value = u8::from_le_bytes([self.get_current_byte()?]);
141                self.idx += 1;
142                Ok(u64::from(value))
143            }
144        }
145    }
146
147    /// Read a single-precision float from the byte stream
148    fn read_float(&mut self) -> Result<f32, TypedStreamError> {
149        match self.get_current_byte()? {
150            DECIMAL => {
151                let size = 4;
152                self.idx += 1;
153                let value = f32::from_le_bytes(
154                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
155                        .map_err(TypedStreamError::SliceError)?,
156                );
157                Ok(value)
158            }
159            I_16 | I_32 => Ok(self.read_signed_int()? as f32),
160            _ => {
161                self.idx += 1;
162                Ok(self.read_signed_int()? as f32)
163            }
164        }
165    }
166
167    /// Read a double-precision float from the byte stream
168    fn read_double(&mut self) -> Result<f64, TypedStreamError> {
169        match self.get_current_byte()? {
170            DECIMAL => {
171                let size = 8;
172                self.idx += 1;
173                let value = f64::from_le_bytes(
174                    <[u8; 8]>::try_from(self.read_exact_bytes(size)?)
175                        .map_err(TypedStreamError::SliceError)?,
176                );
177                Ok(value)
178            }
179            I_16 | I_32 => Ok(self.read_signed_int()? as f64),
180            _ => {
181                self.idx += 1;
182                Ok(self.read_signed_int()? as f64)
183            }
184        }
185    }
186
187    /// Read exactly `n` bytes from the stream
188    fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
189        let range =
190            self.stream
191                .get(self.idx..self.idx + n)
192                .ok_or(TypedStreamError::OutOfBounds(
193                    self.idx + n,
194                    self.stream.len(),
195                ))?;
196        self.idx += n;
197        Ok(range)
198    }
199
200    /// Read `n` bytes as a String
201    fn read_exact_as_string(
202        &mut self,
203        n: usize,
204        string: &mut String,
205    ) -> Result<(), TypedStreamError> {
206        let str = std::str::from_utf8(self.read_exact_bytes(n)?)
207            .map_err(TypedStreamError::StringParseError)?;
208        string.push_str(str);
209        Ok(())
210    }
211
212    /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
213    fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
214        if byte_idx < self.stream.len() {
215            return Ok(self.stream[byte_idx]);
216        }
217        Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
218    }
219
220    /// Read the current byte
221    fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
222        self.get_byte(self.idx)
223    }
224
225    /// Read the next byte
226    fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
227        self.get_byte(self.idx + 1)
228    }
229
230    /// Read some bytes as an array
231    fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
232        Ok(self.read_exact_bytes(size)?.to_vec())
233    }
234
235    /// Determine the current types
236    fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
237        let length = self.read_unsigned_int()?;
238
239        let types = self.read_exact_bytes(length as usize)?;
240
241        // Handle array size
242        if types.first() == Some(&0x5b) {
243            return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
244        }
245
246        Ok(types.iter().map(Type::from_byte).collect())
247    }
248
249    /// Read a reference pointer for a Type
250    fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
251        let pointer = self.get_current_byte()?;
252        let result = u32::from(pointer)
253            .checked_sub(REFERENCE_TAG as u32)
254            .ok_or(TypedStreamError::InvalidPointer(pointer as usize));
255        self.idx += 1;
256        result
257    }
258
259    /// Read a class
260    fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
261        let mut out_v: Vec<Archivable> = Vec::with_capacity(4);
262        match self.get_current_byte()? {
263            START => {
264                // Skip some header bytes
265                while self.get_current_byte()? == START {
266                    self.idx += 1;
267                }
268                let length = self.read_unsigned_int()?;
269
270                if length >= REFERENCE_TAG {
271                    let index = length - REFERENCE_TAG;
272                    return Ok(ClassResult::Index(index as usize));
273                }
274
275                let mut class_name = String::with_capacity(length as usize);
276                self.read_exact_as_string(length as usize, &mut class_name)?;
277
278                let version = self.read_unsigned_int()?;
279
280                self.types_table
281                    .push(vec![Type::new_string(class_name.clone())]);
282
283                out_v.push(Archivable::Class(Class::new(class_name, version)));
284
285                if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
286                    out_v.extend(parent);
287                }
288            }
289            EMPTY => {
290                self.idx += 1;
291            }
292            _ => {
293                let index = self.read_pointer()?;
294                return Ok(ClassResult::Index(index as usize));
295            }
296        }
297        Ok(ClassResult::ClassHierarchy(out_v))
298    }
299
300    /// Read an object into the cache and emit, or emit an already-cached object
301    fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
302        match self.get_current_byte()? {
303            START => {
304                match self.read_class()? {
305                    ClassResult::Index(idx) => {
306                        return Ok(self.object_table.get(idx));
307                    }
308                    ClassResult::ClassHierarchy(classes) => {
309                        for class in classes {
310                            self.object_table.push(class);
311                        }
312                    }
313                }
314                Ok(None)
315            }
316            EMPTY => {
317                self.idx += 1;
318                Ok(None)
319            }
320            _ => {
321                let index = self.read_pointer()?;
322                Ok(self.object_table.get(index as usize))
323            }
324        }
325    }
326
327    /// Read String data
328    fn read_string(&mut self) -> Result<String, TypedStreamError> {
329        let length = self.read_unsigned_int()?;
330        let mut string = String::with_capacity(length as usize);
331        self.read_exact_as_string(length as usize, &mut string)?;
332
333        Ok(string)
334    }
335
336    /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
337    fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
338        // Skip the 0x84
339        self.idx += 1;
340        match self.get_type(true)? {
341            Some(type_index) => self.read_types(type_index),
342            None => Ok(None),
343        }
344    }
345
346    /// Gets the current type from the stream, either by reading it from the stream or reading it from
347    /// the specified index of [`TypedStreamReader::types_table`]. Returns an index into the types table
348    /// to avoid cloning large type vectors.
349    fn get_type(&mut self, embedded: bool) -> Result<Option<usize>, TypedStreamError> {
350        match self.get_current_byte()? {
351            START => {
352                // Skip the start byte
353                self.idx += 1;
354
355                let object_types = self.read_type()?;
356                let type_index = self.types_table.len();
357
358                // Embedded data is stored as a C String in the objects table
359                if embedded {
360                    self.object_table
361                        .push(Archivable::Type(object_types.clone()));
362                    // We only want to include the first embedded reference tag, not subsequent references to the same embed
363                    self.seen_embedded_types
364                        .insert(self.object_table.len().saturating_sub(1) as u32);
365                }
366
367                self.types_table.push(object_types);
368                Ok(Some(type_index))
369            }
370            END => {
371                // This indicates the end of the current object
372                Ok(None)
373            }
374            _ => {
375                let ref_tag = self.read_pointer()?;
376
377                if ref_tag as usize >= self.types_table.len() {
378                    return Ok(None);
379                }
380
381                if embedded {
382                    // We only want to include the first embedded reference tag, not subsequent references to the same embed
383                    if !self.seen_embedded_types.contains(&ref_tag) {
384                        if let Some(types) = self.types_table.get(ref_tag as usize) {
385                            self.object_table.push(Archivable::Type(types.clone()));
386                            self.seen_embedded_types.insert(ref_tag);
387                        }
388                    }
389                }
390
391                Ok(Some(ref_tag as usize))
392            }
393        }
394    }
395
396    /// Given some [`Type`]s referenced by index, look at the stream and parse the data according to the specified [`Type`]
    ///
    /// `type_index` selects one entry of [`TypedStreamReader::types_table`]. The [`Type`]s in that
    /// entry are handled in order and the decoded values are collected as [`OutputData`].
    ///
    /// Returns, in order of precedence:
    /// - `Err(TypedStreamError::InvalidPointer)` when `type_index` is outside the types table
    /// - the result of [`TypedStreamReader::read_embedded_data`] as soon as a [`Type::EmbeddedData`] is met
    /// - a clone of an [`Archivable::Object`] that already holds data, when the stream refers to one
    /// - a clone of the object-table entry written over the pending placeholder, when data was attached to it
    /// - `Some(Archivable::Data(..))` when values were read and no [`Type::Object`] was part of the entry
    /// - `None` otherwise
    ///
    /// Errors from the individual readers are propagated unchanged.
397    fn read_types(&mut self, type_index: usize) -> Result<Option<Archivable>, TypedStreamError> {
398        // Validate the index first, the table is indexed directly below
399        if type_index >= self.types_table.len() {
400            return Err(TypedStreamError::InvalidPointer(type_index));
401        }
402
403        let mut out_v = Vec::with_capacity(8);
        // Set once a `Type::Object` was handled, so loose values are not reported as plain data
404        let mut is_obj: bool = false;
405
406        // Process types one by one to avoid borrowing conflicts
407        let types_len = self.types_table[type_index].len();
408        for i in 0..types_len {
409            match &self.types_table[type_index][i] {
410                Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
411                Type::EmbeddedData => {
412                    return self.read_embedded_data();
413                }
414                Type::Object => {
415                    is_obj = true;
                    // Reserve a slot in the object table before reading the object itself
416                    let length = self.object_table.len();
417                    self.placeholder = Some(length);
418                    self.object_table.push(Archivable::Placeholder);
419                    if let Some(object) = self.read_object()? {
420                        match object {
421                            Archivable::Object(_, data) => {
422                                // If this is a new object, i.e. one without any data, we add the data into it later
423                                // If the object already has data in it, we just want to return that object
424                                if !data.is_empty() {
425                                    let result = Ok(Some(object.clone()));
                                    // The slot reserved above is not needed, so it is released again
426                                    self.placeholder = None;
427                                    self.object_table.pop();
428                                    return result;
429                                }
430                                out_v.extend_from_slice(data);
431                            }
432                            Archivable::Class(cls) => out_v.push(OutputData::Class(cls.clone())),
433                            Archivable::Data(data) => out_v.extend_from_slice(data),
434                            // These cases are used internally in the objects table but should not be present in any output
435                            Archivable::Placeholder | Archivable::Type(_) => {}
436                        }
437                    }
438                }
439                Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
440                Type::UnsignedInt => {
441                    out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?));
442                }
443                Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
444                Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
                // These two variants consume nothing from the stream; the value comes from the type itself
445                Type::Unknown(byte) => out_v.push(OutputData::Byte(*byte)),
446                Type::String(s) => out_v.push(OutputData::String(s.to_string())),
447                Type::Array(size) => out_v.push(OutputData::Array(self.read_array(*size)?)),
448            }
449        }
450
451        // If we had reserved a place for an object, fill that spot
452        if let Some(spot) = self.placeholder {
453            if !out_v.is_empty() {
454                // We got a class, but do not have its respective data yet; the placeholder stays pending
455                if let Some(OutputData::Class(class)) = out_v.last() {
456                    self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
457                // The spot after the current placeholder contains the class at the top of the class hierarchy, i.e.
458                // if we get a placeholder and then find a new class hierarchy, the object table holds the class chain
459                // in descending order of inheritance
460                } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
461                    self.object_table[spot] = Archivable::Object(class.clone(), out_v);
462                    self.placeholder = None;
463                    return Ok(self.object_table.get(spot).cloned());
464                // We got some data for a class that was already seen
465                } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
466                    data.extend(out_v);
467                    self.placeholder = None;
468                    return Ok(self.object_table.get(spot).cloned());
469                // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
470                } else {
471                    self.object_table[spot] = Archivable::Data(out_v);
472                    self.placeholder = None;
473                    return Ok(self.object_table.get(spot).cloned());
474                }
475            }
476        }
477
478        // If we have no object, but have data, return it as a Data type
479        if !out_v.is_empty() && !is_obj {
480            return Ok(Some(Archivable::Data(out_v)));
481        }
482        Ok(None)
483    }
484
485    /// In the original source there are several variants of the header, but we
486    /// only need to validate that this is the header used by macOS/iOS, as iMessage
487    /// is probably not available on any `NeXT` platform
488    pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
489        // Encoding type
490        let typedstream_version = self.read_unsigned_int()?;
491        // Encoding signature
492        let signature = self.read_string()?;
493        // System version
494        let system_version = self.read_signed_int()?;
495
496        if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
497            return Err(TypedStreamError::InvalidHeader);
498        }
499
500        Ok(())
501    }
502
503    /// Attempt to get the data from the `typedstream`.
504    ///
505    /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
506    /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
507    ///
508    /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
509    /// Callers are responsible for assembling the deserialized stream into a useful data structure.
510    ///
511    /// # Example:
512    ///
513    /// ```
514    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
515    ///
516    /// let bytes: Vec<u8> = vec![]; // Example stream
517    /// let mut reader = TypedStreamReader::from(&bytes);
518    /// let result = reader.parse();
519    /// ```
520    ///
521    /// # Sample output:
522    /// ```txt
523    /// [
524    ///     Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
525    ///     Data([Integer(1), Integer(7)])  // The next object describes properties for the range of chars 1 through 7
526    ///     Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)])  // The first property is a `NSDictionary` with 1 item
527    ///     Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")])  // The first key in the `NSDictionary`
528    ///     Object(Class { name: "NSNumber", version: 0 }, [Integer(0)])  // The first value in the `NSDictionary`
529    /// ]
530    /// ```
531    pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
532        let mut out_v = Vec::with_capacity(16); // Pre-allocate for better performance
533
534        self.validate_header()?;
535
536        while self.idx < self.stream.len() {
537            if self.get_current_byte()? == END {
538                self.idx += 1;
539                continue;
540            }
541
542            // First, get the current type
543            if let Some(type_index) = self.get_type(false)? {
544                let result = self.read_types(type_index);
545                if let Ok(Some(res)) = result {
546                    out_v.push(res);
547                }
548            }
549        }
550
551        Ok(out_v)
552    }
553}