imessage_database/util/typedstream/parser.rs
1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5 - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6 - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7 - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14 error::typedstream::TypedStreamError,
15 util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Tag byte announcing that the following two bytes hold a little-endian 16-bit integer ([`i16`])
const I_16: u8 = 0x81;
/// Tag byte announcing that the following four bytes hold a little-endian 32-bit integer ([`i32`])
const I_32: u8 = 0x82;
/// Tag byte announcing a floating point value; the expected [`Type`] decides whether
/// four bytes ([`f32`]) or eight bytes ([`f64`]) follow
const DECIMAL: u8 = 0x83;
/// Tag byte that opens a new object
const START: u8 = 0x84;
/// Tag byte meaning there is nothing further to parse here, e.g. the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Tag byte that closes an object
const END: u8 = 0x86;
/// Bytes at or above this value are not literal data: they encode an index into the table of
/// already-seen types, computed by subtracting this tag
const REFERENCE_TAG: u64 = 0x92;
32
33/// Contains logic and data used to deserialize data from a `typedstream`.
34///
35/// `typedstream` is a binary serialization format developed by `NeXT` and later adopted by Apple.
36/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
37///
38/// A `typedstream` begins with a header that includes format version and architecture information,
39/// followed by a stream of typed data elements. Each element is prefixed with type information,
40/// allowing the [`TypedStreamReader`] to understand the original data structures.
41#[derive(Debug)]
42pub struct TypedStreamReader<'a> {
43 /// The `typedstream` we want to parse
44 stream: &'a [u8],
45 /// The current index we are at in the stream
46 idx: usize,
47 /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
48 ///
49 /// The first time a [`Type`] is seen, it is present in the stream literally,
50 /// but afterwards are only referenced by index in order of appearance.
51 types_table: Vec<Vec<Type>>,
52 /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
53 object_table: Vec<Archivable>,
54 /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
55 seen_embedded_types: HashSet<u32>,
56 /// Stores the position of the current [`Archivable::Placeholder`]
57 placeholder: Option<usize>,
58}
59
60impl<'a> TypedStreamReader<'a> {
61 /// Given a stream, construct a reader instance to parse it.
62 ///
63 /// # Example:
64 ///
65 /// ```
66 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67 ///
68 /// let bytes: Vec<u8> = vec![]; // Example stream
69 /// let mut reader = TypedStreamReader::from(&bytes);
70 /// ```
71 #[must_use]
72 pub fn from(stream: &'a [u8]) -> Self {
73 Self {
74 stream,
75 idx: 0,
76 types_table: Vec::with_capacity(16),
77 object_table: Vec::with_capacity(32),
78 seen_embedded_types: HashSet::with_capacity(8),
79 placeholder: None,
80 }
81 }
82
83 /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
84 /// we store it in the largest possible value.
85 fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
86 match self.get_current_byte()? {
87 I_16 => {
88 let size = 2;
89 self.idx += 1;
90 let value = i16::from_le_bytes(
91 <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
92 .map_err(TypedStreamError::SliceError)?,
93 );
94 Ok(i64::from(value))
95 }
96 I_32 => {
97 let size = 4;
98 self.idx += 1;
99 let value = i32::from_le_bytes(
100 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
101 .map_err(TypedStreamError::SliceError)?,
102 );
103 Ok(i64::from(value))
104 }
105 _ => {
106 if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
107 self.idx += 1;
108 return self.read_signed_int();
109 }
110 let value = i8::from_le_bytes([self.get_current_byte()?]);
111 self.idx += 1;
112 Ok(i64::from(value))
113 }
114 }
115 }
116
117 /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
118 /// we store it in the largest possible value.
119 fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
120 match self.get_current_byte()? {
121 I_16 => {
122 let size = 2;
123 self.idx += 1;
124 let value = u16::from_le_bytes(
125 <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
126 .map_err(TypedStreamError::SliceError)?,
127 );
128 Ok(u64::from(value))
129 }
130 I_32 => {
131 let size = 4;
132 self.idx += 1;
133 let value = u32::from_le_bytes(
134 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
135 .map_err(TypedStreamError::SliceError)?,
136 );
137 Ok(u64::from(value))
138 }
139 _ => {
140 let value = u8::from_le_bytes([self.get_current_byte()?]);
141 self.idx += 1;
142 Ok(u64::from(value))
143 }
144 }
145 }
146
147 /// Read a single-precision float from the byte stream
148 fn read_float(&mut self) -> Result<f32, TypedStreamError> {
149 match self.get_current_byte()? {
150 DECIMAL => {
151 let size = 4;
152 self.idx += 1;
153 let value = f32::from_le_bytes(
154 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
155 .map_err(TypedStreamError::SliceError)?,
156 );
157 Ok(value)
158 }
159 I_16 | I_32 => Ok(self.read_signed_int()? as f32),
160 _ => {
161 self.idx += 1;
162 Ok(self.read_signed_int()? as f32)
163 }
164 }
165 }
166
167 /// Read a double-precision float from the byte stream
168 fn read_double(&mut self) -> Result<f64, TypedStreamError> {
169 match self.get_current_byte()? {
170 DECIMAL => {
171 let size = 8;
172 self.idx += 1;
173 let value = f64::from_le_bytes(
174 <[u8; 8]>::try_from(self.read_exact_bytes(size)?)
175 .map_err(TypedStreamError::SliceError)?,
176 );
177 Ok(value)
178 }
179 I_16 | I_32 => Ok(self.read_signed_int()? as f64),
180 _ => {
181 self.idx += 1;
182 Ok(self.read_signed_int()? as f64)
183 }
184 }
185 }
186
187 /// Read exactly `n` bytes from the stream
188 fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
189 let range =
190 self.stream
191 .get(self.idx..self.idx + n)
192 .ok_or(TypedStreamError::OutOfBounds(
193 self.idx + n,
194 self.stream.len(),
195 ))?;
196 self.idx += n;
197 Ok(range)
198 }
199
200 /// Read `n` bytes as a String
201 fn read_exact_as_string(
202 &mut self,
203 n: usize,
204 string: &mut String,
205 ) -> Result<(), TypedStreamError> {
206 let str = std::str::from_utf8(self.read_exact_bytes(n)?)
207 .map_err(TypedStreamError::StringParseError)?;
208 string.push_str(str);
209 Ok(())
210 }
211
212 /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
213 fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
214 if byte_idx < self.stream.len() {
215 return Ok(self.stream[byte_idx]);
216 }
217 Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
218 }
219
220 /// Read the current byte
221 fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
222 self.get_byte(self.idx)
223 }
224
225 /// Read the next byte
226 fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
227 self.get_byte(self.idx + 1)
228 }
229
230 /// Read some bytes as an array
231 fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
232 Ok(self.read_exact_bytes(size)?.to_vec())
233 }
234
235 /// Determine the current types
236 fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
237 let length = self.read_unsigned_int()?;
238
239 let types = self.read_exact_bytes(length as usize)?;
240
241 // Handle array size
242 if types.first() == Some(&0x5b) {
243 return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
244 }
245
246 Ok(types.iter().map(Type::from_byte).collect())
247 }
248
249 /// Read a reference pointer for a Type
250 fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
251 let pointer = self.get_current_byte()?;
252 let result = u32::from(pointer)
253 .checked_sub(REFERENCE_TAG as u32)
254 .ok_or(TypedStreamError::InvalidPointer(pointer as usize));
255 self.idx += 1;
256 result
257 }
258
259 /// Read a class
260 fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
261 let mut out_v: Vec<Archivable> = Vec::with_capacity(4);
262 match self.get_current_byte()? {
263 START => {
264 // Skip some header bytes
265 while self.get_current_byte()? == START {
266 self.idx += 1;
267 }
268 let length = self.read_unsigned_int()?;
269
270 if length >= REFERENCE_TAG {
271 let index = length - REFERENCE_TAG;
272 return Ok(ClassResult::Index(index as usize));
273 }
274
275 let mut class_name = String::with_capacity(length as usize);
276 self.read_exact_as_string(length as usize, &mut class_name)?;
277
278 let version = self.read_unsigned_int()?;
279
280 self.types_table
281 .push(vec![Type::new_string(class_name.clone())]);
282
283 out_v.push(Archivable::Class(Class::new(class_name, version)));
284
285 if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
286 out_v.extend(parent);
287 }
288 }
289 EMPTY => {
290 self.idx += 1;
291 }
292 _ => {
293 let index = self.read_pointer()?;
294 return Ok(ClassResult::Index(index as usize));
295 }
296 }
297 Ok(ClassResult::ClassHierarchy(out_v))
298 }
299
300 /// Read an object into the cache and emit, or emit an already-cached object
301 fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
302 match self.get_current_byte()? {
303 START => {
304 match self.read_class()? {
305 ClassResult::Index(idx) => {
306 return Ok(self.object_table.get(idx));
307 }
308 ClassResult::ClassHierarchy(classes) => {
309 for class in classes {
310 self.object_table.push(class);
311 }
312 }
313 }
314 Ok(None)
315 }
316 EMPTY => {
317 self.idx += 1;
318 Ok(None)
319 }
320 _ => {
321 let index = self.read_pointer()?;
322 Ok(self.object_table.get(index as usize))
323 }
324 }
325 }
326
327 /// Read String data
328 fn read_string(&mut self) -> Result<String, TypedStreamError> {
329 let length = self.read_unsigned_int()?;
330 let mut string = String::with_capacity(length as usize);
331 self.read_exact_as_string(length as usize, &mut string)?;
332
333 Ok(string)
334 }
335
336 /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
337 fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
338 // Skip the 0x84
339 self.idx += 1;
340 match self.get_type(true)? {
341 Some(type_index) => self.read_types(type_index),
342 None => Ok(None),
343 }
344 }
345
346 /// Gets the current type from the stream, either by reading it from the stream or reading it from
347 /// the specified index of [`TypedStreamReader::types_table`]. Returns an index into the types table
348 /// to avoid cloning large type vectors.
349 fn get_type(&mut self, embedded: bool) -> Result<Option<usize>, TypedStreamError> {
350 match self.get_current_byte()? {
351 START => {
352 // Skip the start byte
353 self.idx += 1;
354
355 let object_types = self.read_type()?;
356 let type_index = self.types_table.len();
357
358 // Embedded data is stored as a C String in the objects table
359 if embedded {
360 self.object_table
361 .push(Archivable::Type(object_types.clone()));
362 // We only want to include the first embedded reference tag, not subsequent references to the same embed
363 self.seen_embedded_types
364 .insert(self.object_table.len().saturating_sub(1) as u32);
365 }
366
367 self.types_table.push(object_types);
368 Ok(Some(type_index))
369 }
370 END => {
371 // This indicates the end of the current object
372 Ok(None)
373 }
374 _ => {
375 let ref_tag = self.read_pointer()?;
376
377 if ref_tag as usize >= self.types_table.len() {
378 return Ok(None);
379 }
380
381 if embedded {
382 // We only want to include the first embedded reference tag, not subsequent references to the same embed
383 if !self.seen_embedded_types.contains(&ref_tag) {
384 if let Some(types) = self.types_table.get(ref_tag as usize) {
385 self.object_table.push(Archivable::Type(types.clone()));
386 self.seen_embedded_types.insert(ref_tag);
387 }
388 }
389 }
390
391 Ok(Some(ref_tag as usize))
392 }
393 }
394 }
395
396 /// Given some [`Type`]s referenced by index, look at the stream and parse the data according to the specified [`Type`]
397 fn read_types(&mut self, type_index: usize) -> Result<Option<Archivable>, TypedStreamError> {
398 // Validate the index first
399 if type_index >= self.types_table.len() {
400 return Err(TypedStreamError::InvalidPointer(type_index));
401 }
402
403 let mut out_v = Vec::with_capacity(8);
404 let mut is_obj: bool = false;
405
406 // Process types one by one to avoid borrowing conflicts
407 let types_len = self.types_table[type_index].len();
408 for i in 0..types_len {
409 match &self.types_table[type_index][i] {
410 Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
411 Type::EmbeddedData => {
412 return self.read_embedded_data();
413 }
414 Type::Object => {
415 is_obj = true;
416 let length = self.object_table.len();
417 self.placeholder = Some(length);
418 self.object_table.push(Archivable::Placeholder);
419 if let Some(object) = self.read_object()? {
420 match object {
421 Archivable::Object(_, data) => {
422 // If this is a new object, i.e. one without any data, we add the data into it later
423 // If the object already has data in it, we just want to return that object
424 if !data.is_empty() {
425 let result = Ok(Some(object.clone()));
426 self.placeholder = None;
427 self.object_table.pop();
428 return result;
429 }
430 out_v.extend_from_slice(data);
431 }
432 Archivable::Class(cls) => out_v.push(OutputData::Class(cls.clone())),
433 Archivable::Data(data) => out_v.extend_from_slice(data),
434 // These cases are used internally in the objects table but should not be present in any output
435 Archivable::Placeholder | Archivable::Type(_) => {}
436 }
437 }
438 }
439 Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
440 Type::UnsignedInt => {
441 out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?));
442 }
443 Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
444 Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
445 Type::Unknown(byte) => out_v.push(OutputData::Byte(*byte)),
446 Type::String(s) => out_v.push(OutputData::String(s.to_string())),
447 Type::Array(size) => out_v.push(OutputData::Array(self.read_array(*size)?)),
448 }
449 }
450
451 // If we had reserved a place for an object, fill that spot
452 if let Some(spot) = self.placeholder {
453 if !out_v.is_empty() {
454 // We got a class, but do not have its respective data yet
455 if let Some(OutputData::Class(class)) = out_v.last() {
456 self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
457 // The spot after the current placeholder contains the class at the top of the class heirarchy, i.e.
458 // if we get a placeholder and then find a new class heirarchy, the object table holds the class chain
459 // in descending order of inheritance
460 } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
461 self.object_table[spot] = Archivable::Object(class.clone(), out_v);
462 self.placeholder = None;
463 return Ok(self.object_table.get(spot).cloned());
464 // We got some data for a class that was already seen
465 } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
466 data.extend(out_v);
467 self.placeholder = None;
468 return Ok(self.object_table.get(spot).cloned());
469 // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
470 } else {
471 self.object_table[spot] = Archivable::Data(out_v);
472 self.placeholder = None;
473 return Ok(self.object_table.get(spot).cloned());
474 }
475 }
476 }
477
478 // If we have no object, but have data, return it as a Data type
479 if !out_v.is_empty() && !is_obj {
480 return Ok(Some(Archivable::Data(out_v)));
481 }
482 Ok(None)
483 }
484
485 /// In the original source there are several variants of the header, but we
486 /// only need to validate that this is the header used by macOS/iOS, as iMessage
487 /// is probably not available on any `NeXT` platform
488 pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
489 // Encoding type
490 let typedstream_version = self.read_unsigned_int()?;
491 // Encoding signature
492 let signature = self.read_string()?;
493 // System version
494 let system_version = self.read_signed_int()?;
495
496 if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
497 return Err(TypedStreamError::InvalidHeader);
498 }
499
500 Ok(())
501 }
502
503 /// Attempt to get the data from the `typedstream`.
504 ///
505 /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
506 /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
507 ///
508 /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
509 /// Callers are responsible for assembling the deserialized stream into a useful data structure.
510 ///
511 /// # Example:
512 ///
513 /// ```
514 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
515 ///
516 /// let bytes: Vec<u8> = vec![]; // Example stream
517 /// let mut reader = TypedStreamReader::from(&bytes);
518 /// let result = reader.parse();
519 /// ```
520 ///
521 /// # Sample output:
522 /// ```txt
523 /// [
524 /// Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
525 /// Data([Integer(1), Integer(7)]) // The next object describes properties for the range of chars 1 through 7
526 /// Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)]) // The first property is a `NSDictionary` with 1 item
527 /// Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")]) // The first key in the `NSDictionary`
528 /// Object(Class { name: "NSNumber", version: 0 }, [Integer(0)]) // The first value in the `NSDictionary`
529 /// ]
530 /// ```
531 pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
532 let mut out_v = Vec::with_capacity(16); // Pre-allocate for better performance
533
534 self.validate_header()?;
535
536 while self.idx < self.stream.len() {
537 if self.get_current_byte()? == END {
538 self.idx += 1;
539 continue;
540 }
541
542 // First, get the current type
543 if let Some(type_index) = self.get_type(false)? {
544 let result = self.read_types(type_index);
545 if let Ok(Some(res)) = result {
546 out_v.push(res);
547 }
548 }
549 }
550
551 Ok(out_v)
552 }
553}