imessage_database/util/typedstream/parser.rs
1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5 - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6 - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7 - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14 error::typedstream::TypedStreamError,
15 util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Tag byte: a 16-bit integer follows, stored as two little-endian bytes
const I_16: u8 = 0x81;
/// Tag byte: a 32-bit integer follows, stored as four little-endian bytes
const I_32: u8 = 0x82;
/// Tag byte: a floating point value follows; the reader that is invoked decides
/// whether four bytes ([`f32`]) or eight bytes ([`f64`]) are consumed
const DECIMAL: u8 = 0x83;
/// Tag byte that opens a new object
const START: u8 = 0x84;
/// Tag byte meaning there is nothing further to read, for example at the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Tag byte that closes an object
const END: u8 = 0x86;
/// Threshold for references: a value at or above this tag, minus the tag itself,
/// is an index into a table of entries that were already seen
const REFERENCE_TAG: u64 = 0x92;
32
33/// Contains logic and data used to deserialize data from a `typedstream`.
34///
35/// `typedstream` is a binary serialization format developed by NeXT and later adopted by Apple.
36/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
37///
38/// A `typedstream` begins with a header that includes format version and architecture information,
39/// followed by a stream of typed data elements. Each element is prefixed with type information,
40/// allowing the [`TypedStreamReader`] to understand the original data structures.
41#[derive(Debug)]
42pub struct TypedStreamReader<'a> {
43 /// The `typedstream` we want to parse
44 stream: &'a [u8],
45 /// The current index we are at in the stream
46 idx: usize,
47 /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
48 ///
49 /// The first time a [`Type`] is seen, it is present in the stream literally,
50 /// but afterwards are only referenced by index in order of appearance.
51 types_table: Vec<Vec<Type>>,
52 /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
53 object_table: Vec<Archivable>,
54 /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
55 seen_embedded_types: HashSet<u32>,
56 /// Stores the position of the current [`Archivable::Placeholder`]
57 placeholder: Option<usize>,
58}
59
60impl<'a> TypedStreamReader<'a> {
61 /// Given a stream, construct a reader instance to parse it.
62 ///
63 /// # Example:
64 ///
65 /// ```
66 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67 ///
68 /// let bytes: Vec<u8> = vec![]; // Example stream
69 /// let mut reader = TypedStreamReader::from(&bytes);
70 /// ```
71 pub fn from(stream: &'a [u8]) -> Self {
72 Self {
73 stream,
74 idx: 0,
75 types_table: vec![],
76 object_table: vec![],
77 seen_embedded_types: HashSet::new(),
78 placeholder: None,
79 }
80 }
81
82 /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
83 /// we store it in the largest possible value.
84 fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
85 match self.get_current_byte()? {
86 I_16 => {
87 let size = 2;
88 self.idx += 1;
89 let value = i16::from_le_bytes(
90 <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
91 .map_err(TypedStreamError::SliceError)?,
92 );
93 Ok(value as i64)
94 }
95 I_32 => {
96 let size = 4;
97 self.idx += 1;
98 let value = i32::from_le_bytes(
99 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
100 .map_err(TypedStreamError::SliceError)?,
101 );
102 Ok(value as i64)
103 }
104 _ => {
105 if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
106 self.idx += 1;
107 return self.read_signed_int();
108 }
109 let value = i8::from_le_bytes([self.get_current_byte()?]);
110 self.idx += 1;
111 Ok(value as i64)
112 }
113 }
114 }
115
116 /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
117 /// we store it in the largest possible value.
118 fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
119 match self.get_current_byte()? {
120 I_16 => {
121 let size = 2;
122 self.idx += 1;
123 let value = u16::from_le_bytes(
124 <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
125 .map_err(TypedStreamError::SliceError)?,
126 );
127 Ok(value as u64)
128 }
129 I_32 => {
130 let size = 4;
131 self.idx += 1;
132 let value = u32::from_le_bytes(
133 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
134 .map_err(TypedStreamError::SliceError)?,
135 );
136 Ok(value as u64)
137 }
138 _ => {
139 let value = u8::from_le_bytes([self.get_current_byte()?]);
140 self.idx += 1;
141 Ok(value as u64)
142 }
143 }
144 }
145
146 /// Read a single-precision float from the byte stream
147 fn read_float(&mut self) -> Result<f32, TypedStreamError> {
148 match self.get_current_byte()? {
149 DECIMAL => {
150 let size = 4;
151 self.idx += 1;
152 let value = f32::from_le_bytes(
153 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
154 .map_err(TypedStreamError::SliceError)?,
155 );
156 Ok(value)
157 }
158 I_16 | I_32 => Ok(self.read_signed_int()? as f32),
159 _ => {
160 self.idx += 1;
161 Ok(self.read_signed_int()? as f32)
162 }
163 }
164 }
165
166 /// Read a double-precision float from the byte stream
167 fn read_double(&mut self) -> Result<f64, TypedStreamError> {
168 match self.get_current_byte()? {
169 DECIMAL => {
170 let size = 8;
171 self.idx += 1;
172 let value = f64::from_le_bytes(
173 <[u8; 8]>::try_from(self.read_exact_bytes(size)?)
174 .map_err(TypedStreamError::SliceError)?,
175 );
176 Ok(value)
177 }
178 I_16 | I_32 => Ok(self.read_signed_int()? as f64),
179 _ => {
180 self.idx += 1;
181 Ok(self.read_signed_int()? as f64)
182 }
183 }
184 }
185
186 /// Read exactly `n` bytes from the stream
187 fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
188 let range =
189 self.stream
190 .get(self.idx..self.idx + n)
191 .ok_or(TypedStreamError::OutOfBounds(
192 self.idx + n,
193 self.stream.len(),
194 ))?;
195 self.idx += n;
196 Ok(range)
197 }
198
199 /// Read `n` bytes as a String
200 fn read_exact_as_string(
201 &mut self,
202 n: usize,
203 string: &mut String,
204 ) -> Result<(), TypedStreamError> {
205 let str = std::str::from_utf8(self.read_exact_bytes(n)?)
206 .map_err(TypedStreamError::StringParseError)?;
207 string.push_str(str);
208 Ok(())
209 }
210
211 /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
212 fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
213 if byte_idx < self.stream.len() {
214 return Ok(self.stream[byte_idx]);
215 }
216 Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
217 }
218
219 /// Read the current byte
220 fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
221 self.get_byte(self.idx)
222 }
223
224 /// Read the next byte
225 fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
226 self.get_byte(self.idx + 1)
227 }
228
229 /// Read some bytes as an array
230 fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
231 Ok(self.read_exact_bytes(size)?.to_vec())
232 }
233
234 /// Determine the current types
235 fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
236 let length = self.read_unsigned_int()?;
237
238 let types = self.read_exact_bytes(length as usize)?;
239
240 // Handle array size
241 if types.first() == Some(&0x5b) {
242 return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
243 }
244
245 Ok(types.iter().map(Type::from_byte).collect())
246 }
247
248 /// Read a reference pointer for a Type
249 fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
250 let pointer = self.get_current_byte()?;
251 let result = (pointer as u32)
252 .checked_sub(REFERENCE_TAG as u32)
253 .ok_or(TypedStreamError::InvalidPointer(pointer));
254 self.idx += 1;
255 result
256 }
257
258 /// Read a class
259 fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
260 let mut out_v: Vec<Archivable> = vec![];
261 match self.get_current_byte()? {
262 START => {
263 // Skip some header bytes
264 while self.get_current_byte()? == START {
265 self.idx += 1;
266 }
267 let length = self.read_unsigned_int()?;
268
269 if length >= REFERENCE_TAG {
270 let index = length - REFERENCE_TAG;
271 return Ok(ClassResult::Index(index as usize));
272 }
273
274 let mut class_name = String::with_capacity(length as usize);
275 self.read_exact_as_string(length as usize, &mut class_name)?;
276
277 let version = self.read_unsigned_int()?;
278
279 self.types_table
280 .push(vec![Type::new_string(class_name.clone())]);
281
282 out_v.push(Archivable::Class(Class::new(class_name, version)));
283
284 if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
285 out_v.extend(parent);
286 }
287 }
288 EMPTY => {
289 self.idx += 1;
290 }
291 _ => {
292 let index = self.read_pointer()?;
293 return Ok(ClassResult::Index(index as usize));
294 }
295 }
296 Ok(ClassResult::ClassHierarchy(out_v))
297 }
298
299 /// Read an object into the cache and emit, or emit an already-cached object
300 fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
301 match self.get_current_byte()? {
302 START => {
303 match self.read_class()? {
304 ClassResult::Index(idx) => {
305 return Ok(self.object_table.get(idx));
306 }
307 ClassResult::ClassHierarchy(classes) => {
308 for class in classes.into_iter() {
309 self.object_table.push(class)
310 }
311 }
312 }
313 Ok(None)
314 }
315 EMPTY => {
316 self.idx += 1;
317 Ok(None)
318 }
319 _ => {
320 let index = self.read_pointer()?;
321 Ok(self.object_table.get(index as usize))
322 }
323 }
324 }
325
326 /// Read String data
327 fn read_string(&mut self) -> Result<String, TypedStreamError> {
328 let length = self.read_unsigned_int()?;
329 let mut string = String::with_capacity(length as usize);
330 self.read_exact_as_string(length as usize, &mut string)?;
331
332 Ok(string)
333 }
334
335 /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
336 fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
337 // Skip the 0x84
338 self.idx += 1;
339 match self.get_type(true)? {
340 Some(types) => self.read_types(types),
341 None => Ok(None),
342 }
343 }
344
345 /// Gets the current type from the stream, either by reading it from the stream or reading it from
346 /// the specified index of [`TypedStreamReader::types_table`]. Because methods that use this type can also mutate self,
347 /// returning a reference here means other methods could make that reference to the table invalid,
348 /// which is disallowed in Rust. Thus, we return a clone of the cached data.
349 fn get_type(&mut self, embedded: bool) -> Result<Option<Vec<Type>>, TypedStreamError> {
350 match self.get_current_byte()? {
351 START => {
352 // Ignore repeated types, for example in a dict
353 self.idx += 1;
354
355 let object_types = self.read_type()?;
356
357 // Embedded data is stored as a C String in the objects table
358 if embedded {
359 self.object_table
360 .push(Archivable::Type(object_types.clone()));
361 // We only want to include the first embedded reference tag, not subsequent references to the same embed
362 self.seen_embedded_types
363 .insert(self.object_table.len().saturating_sub(1) as u32);
364 }
365 self.types_table.push(object_types);
366 Ok(self.types_table.last().cloned())
367 }
368 END => {
369 // This indicates the end of the current object
370 Ok(None)
371 }
372 _ => {
373 // Ignore repeated types, for example in a dict
374 while self.get_current_byte()? == self.get_next_byte()? {
375 self.idx += 1;
376 }
377
378 let ref_tag = self.read_pointer()?;
379 let result = self.types_table.get(ref_tag as usize);
380
381 if embedded {
382 if let Some(res) = result {
383 // We only want to include the first embedded reference tag, not subsequent references to the same embed
384 if !self.seen_embedded_types.contains(&ref_tag) {
385 self.object_table.push(Archivable::Type(res.clone()));
386 self.seen_embedded_types.insert(ref_tag);
387 }
388 }
389 }
390
391 Ok(result.cloned())
392 }
393 }
394 }
395
396 /// Given some [`Type`]s, look at the stream and parse the data according to the specified [`Type`]
397 fn read_types(
398 &mut self,
399 found_types: Vec<Type>,
400 ) -> Result<Option<Archivable>, TypedStreamError> {
401 let mut out_v = vec![];
402 let mut is_obj: bool = false;
403
404 for found_type in found_types {
405 match found_type {
406 Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
407 Type::EmbeddedData => {
408 return self.read_embedded_data();
409 }
410 Type::Object => {
411 is_obj = true;
412 let length = self.object_table.len();
413 self.placeholder = Some(length);
414 self.object_table.push(Archivable::Placeholder);
415 if let Some(object) = self.read_object()? {
416 match object.clone() {
417 Archivable::Object(_, data) => {
418 // If this is a new object, i.e. one without any data, we add the data into it later
419 // If the object already has data in it, we just want to return that object
420 if !data.is_empty() {
421 let result = Ok(Some(object.clone()));
422 self.placeholder = None;
423 self.object_table.pop();
424 return result;
425 }
426 out_v.extend(data)
427 }
428 Archivable::Class(cls) => out_v.push(OutputData::Class(cls)),
429 Archivable::Data(data) => out_v.extend(data),
430 // These cases are used internally in the objects table but should not be present in any output
431 Archivable::Placeholder | Archivable::Type(_) => {}
432 }
433 }
434 }
435 Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
436 Type::UnsignedInt => {
437 out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?))
438 }
439 Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
440 Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
441 Type::Unknown(byte) => out_v.push(OutputData::Byte(byte)),
442 Type::String(s) => out_v.push(OutputData::String(s)),
443 Type::Array(size) => out_v.push(OutputData::Array(self.read_array(size)?)),
444 };
445 }
446
447 // If we had reserved a place for an object, fill that spot
448 if let Some(spot) = self.placeholder {
449 if !out_v.is_empty() {
450 // We got a class, but do not have its respective data yet
451 if let Some(OutputData::Class(class)) = out_v.last() {
452 self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
453 // The spot after the current placeholder contains the class at the top of the class heirarchy, i.e.
454 // if we get a placeholder and then find a new class heirarchy, the object table holds the class chain
455 // in descending order of inheritance
456 } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
457 self.object_table[spot] = Archivable::Object(class.clone(), out_v.clone());
458 self.placeholder = None;
459 return Ok(self.object_table.get(spot).cloned());
460 // We got some data for a class that was already seen
461 } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
462 data.extend(out_v.clone());
463 self.placeholder = None;
464 return Ok(self.object_table.get(spot).cloned());
465 // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
466 } else {
467 self.object_table[spot] = Archivable::Data(out_v.clone());
468 self.placeholder = None;
469 return Ok(self.object_table.get(spot).cloned());
470 }
471 }
472 }
473
474 if !out_v.is_empty() && !is_obj {
475 return Ok(Some(Archivable::Data(out_v.clone())));
476 }
477 Ok(None)
478 }
479
480 /// In the original source there are several variants of the header, but we
481 /// only need to validate that this is the header used by macOS/iOS, as iMessage
482 /// is probably not available on any NeXT platform
483 pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
484 // Encoding type
485 let typedstream_version = self.read_unsigned_int()?;
486 // Encoding signature
487 let signature = self.read_string()?;
488 // System version
489 let system_version = self.read_signed_int()?;
490
491 if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
492 return Err(TypedStreamError::InvalidHeader);
493 }
494
495 Ok(())
496 }
497
498 /// Attempt to get the data from the `typedstream`.
499 ///
500 /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
501 /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
502 ///
503 /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
504 /// Callers are responsible for assembling the deserialized stream into a useful data structure.
505 ///
506 /// # Example:
507 ///
508 /// ```
509 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
510 ///
511 /// let bytes: Vec<u8> = vec![]; // Example stream
512 /// let mut reader = TypedStreamReader::from(&bytes);
513 /// let result = reader.parse();
514 /// ```
515 ///
516 /// # Sample output:
517 /// ```txt
518 /// [
519 /// Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
520 /// Data([Integer(1), Integer(7)]) // The next object describes properties for the range of chars 1 through 7
521 /// Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)]) // The first property is a `NSDictionary` with 1 item
522 /// Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")]) // The first key in the `NSDictionary`
523 /// Object(Class { name: "NSNumber", version: 0 }, [Integer(0)]) // The first value in the `NSDictionary`
524 /// ]
525 /// ```
526 pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
527 let mut out_v = vec![];
528
529 self.validate_header()?;
530
531 while self.idx < self.stream.len() {
532 if self.get_current_byte()? == END {
533 self.idx += 1;
534 continue;
535 }
536
537 // First, get the current type
538 if let Some(found_types) = self.get_type(false)? {
539 let result = self.read_types(found_types);
540 if let Ok(Some(res)) = result {
541 out_v.push(res);
542 }
543 }
544 }
545
546 Ok(out_v)
547 }
548}