imessage_database/util/typedstream/
parser.rs

1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5   - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6   - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7   - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14    error::typedstream::TypedStreamError,
15    util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Tag byte indicating that an [`i16`] follows in the byte stream
/// (the two bytes after the tag are read as a little-endian value)
const I_16: u8 = 0x81;
/// Tag byte indicating that an [`i32`] follows in the byte stream
/// (the four bytes after the tag are read as a little-endian value)
const I_32: u8 = 0x82;
/// Tag byte indicating an [`f32`] or [`f64`] in the byte stream; the [`Type`] determines the size
const DECIMAL: u8 = 0x83;
/// Indicates the start of a new object
const START: u8 = 0x84;
/// Indicates that there is no more data to parse, for example the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Indicates the last byte of an object
const END: u8 = 0x86;
/// Bytes equal or greater in value than the reference tag indicate an index in the table of already-seen types
///
/// The index is recovered by subtracting this tag from the byte's value.
const REFERENCE_TAG: u64 = 0x92;
32
/// Contains logic and data used to deserialize data from a `typedstream`.
///
/// `typedstream` is a binary serialization format developed by `NeXT` and later adopted by Apple.
/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
///
/// A `typedstream` begins with a header that includes format version and architecture information,
/// followed by a stream of typed data elements. Each element is prefixed with type information,
/// allowing the [`TypedStreamReader`] to understand the original data structures.
#[derive(Debug)]
pub struct TypedStreamReader<'a> {
    /// The `typedstream` we want to parse; borrowed for the lifetime of the reader
    stream: &'a [u8],
    /// The current index we are at in the stream, i.e. the position of the next byte to read
    idx: usize,
    /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
    ///
    /// The first time a [`Type`] is seen, it is present in the stream literally,
    /// but afterwards are only referenced by index in order of appearance.
    types_table: Vec<Vec<Type>>,
    /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
    object_table: Vec<Archivable>,
    /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
    ///
    /// Holds the identifiers of embedded types that were already copied into the object table,
    /// so that later references to the same embed are not copied again.
    seen_embedded_types: HashSet<u32>,
    /// Stores the position of the current [`Archivable::Placeholder`] in the object table,
    /// or `None` when no slot is currently reserved
    placeholder: Option<usize>,
}
59
60impl<'a> TypedStreamReader<'a> {
61    /// Given a stream, construct a reader instance to parse it.
62    ///
63    /// # Example:
64    ///
65    /// ```
66    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67    ///
68    /// let bytes: Vec<u8> = vec![]; // Example stream
69    /// let mut reader = TypedStreamReader::from(&bytes);
70    /// ```
71    #[must_use]
72    pub fn from(stream: &'a [u8]) -> Self {
73        Self {
74            stream,
75            idx: 0,
76            types_table: vec![],
77            object_table: vec![],
78            seen_embedded_types: HashSet::new(),
79            placeholder: None,
80        }
81    }
82
83    /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
84    /// we store it in the largest possible value.
85    fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
86        match self.get_current_byte()? {
87            I_16 => {
88                let size = 2;
89                self.idx += 1;
90                let value = i16::from_le_bytes(
91                    <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
92                        .map_err(TypedStreamError::SliceError)?,
93                );
94                Ok(i64::from(value))
95            }
96            I_32 => {
97                let size = 4;
98                self.idx += 1;
99                let value = i32::from_le_bytes(
100                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
101                        .map_err(TypedStreamError::SliceError)?,
102                );
103                Ok(i64::from(value))
104            }
105            _ => {
106                if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
107                    self.idx += 1;
108                    return self.read_signed_int();
109                }
110                let value = i8::from_le_bytes([self.get_current_byte()?]);
111                self.idx += 1;
112                Ok(i64::from(value))
113            }
114        }
115    }
116
117    /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
118    /// we store it in the largest possible value.
119    fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
120        match self.get_current_byte()? {
121            I_16 => {
122                let size = 2;
123                self.idx += 1;
124                let value = u16::from_le_bytes(
125                    <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
126                        .map_err(TypedStreamError::SliceError)?,
127                );
128                Ok(u64::from(value))
129            }
130            I_32 => {
131                let size = 4;
132                self.idx += 1;
133                let value = u32::from_le_bytes(
134                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
135                        .map_err(TypedStreamError::SliceError)?,
136                );
137                Ok(u64::from(value))
138            }
139            _ => {
140                let value = u8::from_le_bytes([self.get_current_byte()?]);
141                self.idx += 1;
142                Ok(u64::from(value))
143            }
144        }
145    }
146
147    /// Read a single-precision float from the byte stream
148    fn read_float(&mut self) -> Result<f32, TypedStreamError> {
149        match self.get_current_byte()? {
150            DECIMAL => {
151                let size = 4;
152                self.idx += 1;
153                let value = f32::from_le_bytes(
154                    <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
155                        .map_err(TypedStreamError::SliceError)?,
156                );
157                Ok(value)
158            }
159            I_16 | I_32 => Ok(self.read_signed_int()? as f32),
160            _ => {
161                self.idx += 1;
162                Ok(self.read_signed_int()? as f32)
163            }
164        }
165    }
166
167    /// Read a double-precision float from the byte stream
168    fn read_double(&mut self) -> Result<f64, TypedStreamError> {
169        match self.get_current_byte()? {
170            DECIMAL => {
171                let size = 8;
172                self.idx += 1;
173                let value = f64::from_le_bytes(
174                    <[u8; 8]>::try_from(self.read_exact_bytes(size)?)
175                        .map_err(TypedStreamError::SliceError)?,
176                );
177                Ok(value)
178            }
179            I_16 | I_32 => Ok(self.read_signed_int()? as f64),
180            _ => {
181                self.idx += 1;
182                Ok(self.read_signed_int()? as f64)
183            }
184        }
185    }
186
187    /// Read exactly `n` bytes from the stream
188    fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
189        let range =
190            self.stream
191                .get(self.idx..self.idx + n)
192                .ok_or(TypedStreamError::OutOfBounds(
193                    self.idx + n,
194                    self.stream.len(),
195                ))?;
196        self.idx += n;
197        Ok(range)
198    }
199
200    /// Read `n` bytes as a String
201    fn read_exact_as_string(
202        &mut self,
203        n: usize,
204        string: &mut String,
205    ) -> Result<(), TypedStreamError> {
206        let str = std::str::from_utf8(self.read_exact_bytes(n)?)
207            .map_err(TypedStreamError::StringParseError)?;
208        string.push_str(str);
209        Ok(())
210    }
211
212    /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
213    fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
214        if byte_idx < self.stream.len() {
215            return Ok(self.stream[byte_idx]);
216        }
217        Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
218    }
219
220    /// Read the current byte
221    fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
222        self.get_byte(self.idx)
223    }
224
225    /// Read the next byte
226    fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
227        self.get_byte(self.idx + 1)
228    }
229
230    /// Read some bytes as an array
231    fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
232        Ok(self.read_exact_bytes(size)?.to_vec())
233    }
234
235    /// Determine the current types
236    fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
237        let length = self.read_unsigned_int()?;
238
239        let types = self.read_exact_bytes(length as usize)?;
240
241        // Handle array size
242        if types.first() == Some(&0x5b) {
243            return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
244        }
245
246        Ok(types.iter().map(Type::from_byte).collect())
247    }
248
249    /// Read a reference pointer for a Type
250    fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
251        let pointer = self.get_current_byte()?;
252        let result = u32::from(pointer)
253            .checked_sub(REFERENCE_TAG as u32)
254            .ok_or(TypedStreamError::InvalidPointer(pointer));
255        self.idx += 1;
256        result
257    }
258
259    /// Read a class
260    fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
261        let mut out_v: Vec<Archivable> = vec![];
262        match self.get_current_byte()? {
263            START => {
264                // Skip some header bytes
265                while self.get_current_byte()? == START {
266                    self.idx += 1;
267                }
268                let length = self.read_unsigned_int()?;
269
270                if length >= REFERENCE_TAG {
271                    let index = length - REFERENCE_TAG;
272                    return Ok(ClassResult::Index(index as usize));
273                }
274
275                let mut class_name = String::with_capacity(length as usize);
276                self.read_exact_as_string(length as usize, &mut class_name)?;
277
278                let version = self.read_unsigned_int()?;
279
280                self.types_table
281                    .push(vec![Type::new_string(class_name.clone())]);
282
283                out_v.push(Archivable::Class(Class::new(class_name, version)));
284
285                if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
286                    out_v.extend(parent);
287                }
288            }
289            EMPTY => {
290                self.idx += 1;
291            }
292            _ => {
293                let index = self.read_pointer()?;
294                return Ok(ClassResult::Index(index as usize));
295            }
296        }
297        Ok(ClassResult::ClassHierarchy(out_v))
298    }
299
300    /// Read an object into the cache and emit, or emit an already-cached object
301    fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
302        match self.get_current_byte()? {
303            START => {
304                match self.read_class()? {
305                    ClassResult::Index(idx) => {
306                        return Ok(self.object_table.get(idx));
307                    }
308                    ClassResult::ClassHierarchy(classes) => {
309                        for class in classes {
310                            self.object_table.push(class);
311                        }
312                    }
313                }
314                Ok(None)
315            }
316            EMPTY => {
317                self.idx += 1;
318                Ok(None)
319            }
320            _ => {
321                let index = self.read_pointer()?;
322                Ok(self.object_table.get(index as usize))
323            }
324        }
325    }
326
327    /// Read String data
328    fn read_string(&mut self) -> Result<String, TypedStreamError> {
329        let length = self.read_unsigned_int()?;
330        let mut string = String::with_capacity(length as usize);
331        self.read_exact_as_string(length as usize, &mut string)?;
332
333        Ok(string)
334    }
335
336    /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
337    fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
338        // Skip the 0x84
339        self.idx += 1;
340        match self.get_type(true)? {
341            Some(types) => self.read_types(types),
342            None => Ok(None),
343        }
344    }
345
346    /// Gets the current type from the stream, either by reading it from the stream or reading it from
347    /// the specified index of [`TypedStreamReader::types_table`]. Because methods that use this type can also mutate self,
348    /// returning a reference here means other methods could make that reference to the table invalid,
349    /// which is disallowed in Rust. Thus, we return a clone of the cached data.
350    fn get_type(&mut self, embedded: bool) -> Result<Option<Vec<Type>>, TypedStreamError> {
351        match self.get_current_byte()? {
352            START => {
353                // Ignore repeated types, for example in a dict
354                self.idx += 1;
355
356                let object_types = self.read_type()?;
357
358                // Embedded data is stored as a C String in the objects table
359                if embedded {
360                    self.object_table
361                        .push(Archivable::Type(object_types.clone()));
362                    // We only want to include the first embedded reference tag, not subsequent references to the same embed
363                    self.seen_embedded_types
364                        .insert(self.object_table.len().saturating_sub(1) as u32);
365                }
366                self.types_table.push(object_types);
367                Ok(self.types_table.last().cloned())
368            }
369            END => {
370                // This indicates the end of the current object
371                Ok(None)
372            }
373            _ => {
374                // Ignore repeated types, for example in a dict
375                while self.get_current_byte()? == self.get_next_byte()? {
376                    self.idx += 1;
377                }
378
379                let ref_tag = self.read_pointer()?;
380                let result = self.types_table.get(ref_tag as usize);
381
382                if embedded {
383                    if let Some(res) = result {
384                        // We only want to include the first embedded reference tag, not subsequent references to the same embed
385                        if !self.seen_embedded_types.contains(&ref_tag) {
386                            self.object_table.push(Archivable::Type(res.clone()));
387                            self.seen_embedded_types.insert(ref_tag);
388                        }
389                    }
390                }
391
392                Ok(result.cloned())
393            }
394        }
395    }
396
397    /// Given some [`Type`]s, look at the stream and parse the data according to the specified [`Type`]
398    fn read_types(
399        &mut self,
400        found_types: Vec<Type>,
401    ) -> Result<Option<Archivable>, TypedStreamError> {
402        let mut out_v = vec![];
403        let mut is_obj: bool = false;
404
405        for found_type in found_types {
406            match found_type {
407                Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
408                Type::EmbeddedData => {
409                    return self.read_embedded_data();
410                }
411                Type::Object => {
412                    is_obj = true;
413                    let length = self.object_table.len();
414                    self.placeholder = Some(length);
415                    self.object_table.push(Archivable::Placeholder);
416                    if let Some(object) = self.read_object()? {
417                        match object.clone() {
418                            Archivable::Object(_, data) => {
419                                // If this is a new object, i.e. one without any data, we add the data into it later
420                                // If the object already has data in it, we just want to return that object
421                                if !data.is_empty() {
422                                    let result = Ok(Some(object.clone()));
423                                    self.placeholder = None;
424                                    self.object_table.pop();
425                                    return result;
426                                }
427                                out_v.extend(data);
428                            }
429                            Archivable::Class(cls) => out_v.push(OutputData::Class(cls)),
430                            Archivable::Data(data) => out_v.extend(data),
431                            // These cases are used internally in the objects table but should not be present in any output
432                            Archivable::Placeholder | Archivable::Type(_) => {}
433                        }
434                    }
435                }
436                Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
437                Type::UnsignedInt => {
438                    out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?));
439                }
440                Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
441                Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
442                Type::Unknown(byte) => out_v.push(OutputData::Byte(byte)),
443                Type::String(s) => out_v.push(OutputData::String(s)),
444                Type::Array(size) => out_v.push(OutputData::Array(self.read_array(size)?)),
445            }
446        }
447
448        // If we had reserved a place for an object, fill that spot
449        if let Some(spot) = self.placeholder {
450            if !out_v.is_empty() {
451                // We got a class, but do not have its respective data yet
452                if let Some(OutputData::Class(class)) = out_v.last() {
453                    self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
454                // The spot after the current placeholder contains the class at the top of the class heirarchy, i.e.
455                // if we get a placeholder and then find a new class heirarchy, the object table holds the class chain
456                // in descending order of inheritance
457                } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
458                    self.object_table[spot] = Archivable::Object(class.clone(), out_v.clone());
459                    self.placeholder = None;
460                    return Ok(self.object_table.get(spot).cloned());
461                // We got some data for a class that was already seen
462                } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
463                    data.extend(out_v.clone());
464                    self.placeholder = None;
465                    return Ok(self.object_table.get(spot).cloned());
466                // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
467                } else {
468                    self.object_table[spot] = Archivable::Data(out_v.clone());
469                    self.placeholder = None;
470                    return Ok(self.object_table.get(spot).cloned());
471                }
472            }
473        }
474
475        if !out_v.is_empty() && !is_obj {
476            return Ok(Some(Archivable::Data(out_v.clone())));
477        }
478        Ok(None)
479    }
480
481    /// In the original source there are several variants of the header, but we
482    /// only need to validate that this is the header used by macOS/iOS, as iMessage
483    /// is probably not available on any `NeXT` platform
484    pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
485        // Encoding type
486        let typedstream_version = self.read_unsigned_int()?;
487        // Encoding signature
488        let signature = self.read_string()?;
489        // System version
490        let system_version = self.read_signed_int()?;
491
492        if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
493            return Err(TypedStreamError::InvalidHeader);
494        }
495
496        Ok(())
497    }
498
499    /// Attempt to get the data from the `typedstream`.
500    ///
501    /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
502    /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
503    ///
504    /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
505    /// Callers are responsible for assembling the deserialized stream into a useful data structure.
506    ///
507    /// # Example:
508    ///
509    /// ```
510    /// use imessage_database::util::typedstream::parser::TypedStreamReader;
511    ///
512    /// let bytes: Vec<u8> = vec![]; // Example stream
513    /// let mut reader = TypedStreamReader::from(&bytes);
514    /// let result = reader.parse();
515    /// ```
516    ///
517    /// # Sample output:
518    /// ```txt
519    /// [
520    ///     Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
521    ///     Data([Integer(1), Integer(7)])  // The next object describes properties for the range of chars 1 through 7
522    ///     Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)])  // The first property is a `NSDictionary` with 1 item
523    ///     Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")])  // The first key in the `NSDictionary`
524    ///     Object(Class { name: "NSNumber", version: 0 }, [Integer(0)])  // The first value in the `NSDictionary`
525    /// ]
526    /// ```
527    pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
528        let mut out_v = vec![];
529
530        self.validate_header()?;
531
532        while self.idx < self.stream.len() {
533            if self.get_current_byte()? == END {
534                self.idx += 1;
535                continue;
536            }
537
538            // First, get the current type
539            if let Some(found_types) = self.get_type(false)? {
540                let result = self.read_types(found_types);
541                if let Ok(Some(res)) = result {
542                    out_v.push(res);
543                }
544            }
545        }
546
547        Ok(out_v)
548    }
549}