imessage_database/util/typedstream/parser.rs
1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5 - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6 - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7 - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14 error::typedstream::TypedStreamError,
15 util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Marker byte: the following two bytes hold a little-endian 16-bit integer
/// (decoded as [`i16`] or [`u16`] depending on which reader consumes it)
const I_16: u8 = 0x81;
/// Marker byte: the following four bytes hold a little-endian 32-bit integer
/// (decoded as [`i32`] or [`u32`] depending on which reader consumes it)
const I_32: u8 = 0x82;
/// Indicates an [`f32`] or [`f64`] in the byte stream; the [`Type`] determines the size
const DECIMAL: u8 = 0x83;
/// Indicates the start of a new object
const START: u8 = 0x84;
/// Indicates that there is no more data to parse, for example the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Indicates the last byte of an object
const END: u8 = 0x86;
/// Bytes equal or greater in value than the reference tag indicate an index in the table of already-seen types
///
/// The index is recovered by subtracting this tag from the byte. It is stored as a [`u64`] so it can be
/// compared directly against decoded lengths; narrower uses cast it at the call site.
const REFERENCE_TAG: u64 = 0x92;
32
/// Contains logic and data used to deserialize data from a `typedstream`.
///
/// `typedstream` is a binary serialization format developed by `NeXT` and later adopted by Apple.
/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
///
/// A `typedstream` begins with a header that includes format version and architecture information,
/// followed by a stream of typed data elements. Each element is prefixed with type information,
/// allowing the [`TypedStreamReader`] to understand the original data structures.
///
/// The reader borrows the stream for `'a` and keeps a cursor into it; the tables below are filled in
/// as parsing progresses and are consulted whenever the stream refers back to something already seen.
#[derive(Debug)]
pub struct TypedStreamReader<'a> {
    /// The `typedstream` we want to parse
    stream: &'a [u8],
    /// The current index we are at in the stream
    idx: usize,
    /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
    ///
    /// The first time a [`Type`] is seen, it is present in the stream literally,
    /// but afterwards it is only referenced by index in order of appearance.
    types_table: Vec<Vec<Type>>,
    /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
    object_table: Vec<Archivable>,
    /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
    ///
    /// Holds the keys for which embedded type data has already been copied into `object_table`.
    seen_embedded_types: HashSet<u32>,
    /// Stores the position of the current [`Archivable::Placeholder`] in `object_table`, if one is pending
    placeholder: Option<usize>,
}
59
60impl<'a> TypedStreamReader<'a> {
61 /// Given a stream, construct a reader instance to parse it.
62 ///
63 /// # Example:
64 ///
65 /// ```
66 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67 ///
68 /// let bytes: Vec<u8> = vec![]; // Example stream
69 /// let mut reader = TypedStreamReader::from(&bytes);
70 /// ```
71 #[must_use]
72 pub fn from(stream: &'a [u8]) -> Self {
73 Self {
74 stream,
75 idx: 0,
76 types_table: vec![],
77 object_table: vec![],
78 seen_embedded_types: HashSet::new(),
79 placeholder: None,
80 }
81 }
82
83 /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
84 /// we store it in the largest possible value.
85 fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
86 match self.get_current_byte()? {
87 I_16 => {
88 let size = 2;
89 self.idx += 1;
90 let value = i16::from_le_bytes(
91 <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
92 .map_err(TypedStreamError::SliceError)?,
93 );
94 Ok(i64::from(value))
95 }
96 I_32 => {
97 let size = 4;
98 self.idx += 1;
99 let value = i32::from_le_bytes(
100 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
101 .map_err(TypedStreamError::SliceError)?,
102 );
103 Ok(i64::from(value))
104 }
105 _ => {
106 if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
107 self.idx += 1;
108 return self.read_signed_int();
109 }
110 let value = i8::from_le_bytes([self.get_current_byte()?]);
111 self.idx += 1;
112 Ok(i64::from(value))
113 }
114 }
115 }
116
117 /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
118 /// we store it in the largest possible value.
119 fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
120 match self.get_current_byte()? {
121 I_16 => {
122 let size = 2;
123 self.idx += 1;
124 let value = u16::from_le_bytes(
125 <[u8; 2]>::try_from(self.read_exact_bytes(size)?)
126 .map_err(TypedStreamError::SliceError)?,
127 );
128 Ok(u64::from(value))
129 }
130 I_32 => {
131 let size = 4;
132 self.idx += 1;
133 let value = u32::from_le_bytes(
134 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
135 .map_err(TypedStreamError::SliceError)?,
136 );
137 Ok(u64::from(value))
138 }
139 _ => {
140 let value = u8::from_le_bytes([self.get_current_byte()?]);
141 self.idx += 1;
142 Ok(u64::from(value))
143 }
144 }
145 }
146
147 /// Read a single-precision float from the byte stream
148 fn read_float(&mut self) -> Result<f32, TypedStreamError> {
149 match self.get_current_byte()? {
150 DECIMAL => {
151 let size = 4;
152 self.idx += 1;
153 let value = f32::from_le_bytes(
154 <[u8; 4]>::try_from(self.read_exact_bytes(size)?)
155 .map_err(TypedStreamError::SliceError)?,
156 );
157 Ok(value)
158 }
159 I_16 | I_32 => Ok(self.read_signed_int()? as f32),
160 _ => {
161 self.idx += 1;
162 Ok(self.read_signed_int()? as f32)
163 }
164 }
165 }
166
167 /// Read a double-precision float from the byte stream
168 fn read_double(&mut self) -> Result<f64, TypedStreamError> {
169 match self.get_current_byte()? {
170 DECIMAL => {
171 let size = 8;
172 self.idx += 1;
173 let value = f64::from_le_bytes(
174 <[u8; 8]>::try_from(self.read_exact_bytes(size)?)
175 .map_err(TypedStreamError::SliceError)?,
176 );
177 Ok(value)
178 }
179 I_16 | I_32 => Ok(self.read_signed_int()? as f64),
180 _ => {
181 self.idx += 1;
182 Ok(self.read_signed_int()? as f64)
183 }
184 }
185 }
186
187 /// Read exactly `n` bytes from the stream
188 fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
189 let range =
190 self.stream
191 .get(self.idx..self.idx + n)
192 .ok_or(TypedStreamError::OutOfBounds(
193 self.idx + n,
194 self.stream.len(),
195 ))?;
196 self.idx += n;
197 Ok(range)
198 }
199
200 /// Read `n` bytes as a String
201 fn read_exact_as_string(
202 &mut self,
203 n: usize,
204 string: &mut String,
205 ) -> Result<(), TypedStreamError> {
206 let str = std::str::from_utf8(self.read_exact_bytes(n)?)
207 .map_err(TypedStreamError::StringParseError)?;
208 string.push_str(str);
209 Ok(())
210 }
211
212 /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
213 fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
214 if byte_idx < self.stream.len() {
215 return Ok(self.stream[byte_idx]);
216 }
217 Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
218 }
219
220 /// Read the current byte
221 fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
222 self.get_byte(self.idx)
223 }
224
225 /// Read the next byte
226 fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
227 self.get_byte(self.idx + 1)
228 }
229
230 /// Read some bytes as an array
231 fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
232 Ok(self.read_exact_bytes(size)?.to_vec())
233 }
234
235 /// Determine the current types
236 fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
237 let length = self.read_unsigned_int()?;
238
239 let types = self.read_exact_bytes(length as usize)?;
240
241 // Handle array size
242 if types.first() == Some(&0x5b) {
243 return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
244 }
245
246 Ok(types.iter().map(Type::from_byte).collect())
247 }
248
249 /// Read a reference pointer for a Type
250 fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
251 let pointer = self.get_current_byte()?;
252 let result = u32::from(pointer)
253 .checked_sub(REFERENCE_TAG as u32)
254 .ok_or(TypedStreamError::InvalidPointer(pointer));
255 self.idx += 1;
256 result
257 }
258
259 /// Read a class
260 fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
261 let mut out_v: Vec<Archivable> = vec![];
262 match self.get_current_byte()? {
263 START => {
264 // Skip some header bytes
265 while self.get_current_byte()? == START {
266 self.idx += 1;
267 }
268 let length = self.read_unsigned_int()?;
269
270 if length >= REFERENCE_TAG {
271 let index = length - REFERENCE_TAG;
272 return Ok(ClassResult::Index(index as usize));
273 }
274
275 let mut class_name = String::with_capacity(length as usize);
276 self.read_exact_as_string(length as usize, &mut class_name)?;
277
278 let version = self.read_unsigned_int()?;
279
280 self.types_table
281 .push(vec![Type::new_string(class_name.clone())]);
282
283 out_v.push(Archivable::Class(Class::new(class_name, version)));
284
285 if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
286 out_v.extend(parent);
287 }
288 }
289 EMPTY => {
290 self.idx += 1;
291 }
292 _ => {
293 let index = self.read_pointer()?;
294 return Ok(ClassResult::Index(index as usize));
295 }
296 }
297 Ok(ClassResult::ClassHierarchy(out_v))
298 }
299
300 /// Read an object into the cache and emit, or emit an already-cached object
301 fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
302 match self.get_current_byte()? {
303 START => {
304 match self.read_class()? {
305 ClassResult::Index(idx) => {
306 return Ok(self.object_table.get(idx));
307 }
308 ClassResult::ClassHierarchy(classes) => {
309 for class in classes {
310 self.object_table.push(class);
311 }
312 }
313 }
314 Ok(None)
315 }
316 EMPTY => {
317 self.idx += 1;
318 Ok(None)
319 }
320 _ => {
321 let index = self.read_pointer()?;
322 Ok(self.object_table.get(index as usize))
323 }
324 }
325 }
326
327 /// Read String data
328 fn read_string(&mut self) -> Result<String, TypedStreamError> {
329 let length = self.read_unsigned_int()?;
330 let mut string = String::with_capacity(length as usize);
331 self.read_exact_as_string(length as usize, &mut string)?;
332
333 Ok(string)
334 }
335
336 /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
337 fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
338 // Skip the 0x84
339 self.idx += 1;
340 match self.get_type(true)? {
341 Some(types) => self.read_types(types),
342 None => Ok(None),
343 }
344 }
345
346 /// Gets the current type from the stream, either by reading it from the stream or reading it from
347 /// the specified index of [`TypedStreamReader::types_table`]. Because methods that use this type can also mutate self,
348 /// returning a reference here means other methods could make that reference to the table invalid,
349 /// which is disallowed in Rust. Thus, we return a clone of the cached data.
350 fn get_type(&mut self, embedded: bool) -> Result<Option<Vec<Type>>, TypedStreamError> {
351 match self.get_current_byte()? {
352 START => {
353 // Ignore repeated types, for example in a dict
354 self.idx += 1;
355
356 let object_types = self.read_type()?;
357
358 // Embedded data is stored as a C String in the objects table
359 if embedded {
360 self.object_table
361 .push(Archivable::Type(object_types.clone()));
362 // We only want to include the first embedded reference tag, not subsequent references to the same embed
363 self.seen_embedded_types
364 .insert(self.object_table.len().saturating_sub(1) as u32);
365 }
366 self.types_table.push(object_types);
367 Ok(self.types_table.last().cloned())
368 }
369 END => {
370 // This indicates the end of the current object
371 Ok(None)
372 }
373 _ => {
374 // Ignore repeated types, for example in a dict
375 while self.get_current_byte()? == self.get_next_byte()? {
376 self.idx += 1;
377 }
378
379 let ref_tag = self.read_pointer()?;
380 let result = self.types_table.get(ref_tag as usize);
381
382 if embedded {
383 if let Some(res) = result {
384 // We only want to include the first embedded reference tag, not subsequent references to the same embed
385 if !self.seen_embedded_types.contains(&ref_tag) {
386 self.object_table.push(Archivable::Type(res.clone()));
387 self.seen_embedded_types.insert(ref_tag);
388 }
389 }
390 }
391
392 Ok(result.cloned())
393 }
394 }
395 }
396
397 /// Given some [`Type`]s, look at the stream and parse the data according to the specified [`Type`]
398 fn read_types(
399 &mut self,
400 found_types: Vec<Type>,
401 ) -> Result<Option<Archivable>, TypedStreamError> {
402 let mut out_v = vec![];
403 let mut is_obj: bool = false;
404
405 for found_type in found_types {
406 match found_type {
407 Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
408 Type::EmbeddedData => {
409 return self.read_embedded_data();
410 }
411 Type::Object => {
412 is_obj = true;
413 let length = self.object_table.len();
414 self.placeholder = Some(length);
415 self.object_table.push(Archivable::Placeholder);
416 if let Some(object) = self.read_object()? {
417 match object.clone() {
418 Archivable::Object(_, data) => {
419 // If this is a new object, i.e. one without any data, we add the data into it later
420 // If the object already has data in it, we just want to return that object
421 if !data.is_empty() {
422 let result = Ok(Some(object.clone()));
423 self.placeholder = None;
424 self.object_table.pop();
425 return result;
426 }
427 out_v.extend(data);
428 }
429 Archivable::Class(cls) => out_v.push(OutputData::Class(cls)),
430 Archivable::Data(data) => out_v.extend(data),
431 // These cases are used internally in the objects table but should not be present in any output
432 Archivable::Placeholder | Archivable::Type(_) => {}
433 }
434 }
435 }
436 Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
437 Type::UnsignedInt => {
438 out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?));
439 }
440 Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
441 Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
442 Type::Unknown(byte) => out_v.push(OutputData::Byte(byte)),
443 Type::String(s) => out_v.push(OutputData::String(s)),
444 Type::Array(size) => out_v.push(OutputData::Array(self.read_array(size)?)),
445 }
446 }
447
448 // If we had reserved a place for an object, fill that spot
449 if let Some(spot) = self.placeholder {
450 if !out_v.is_empty() {
451 // We got a class, but do not have its respective data yet
452 if let Some(OutputData::Class(class)) = out_v.last() {
453 self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
454 // The spot after the current placeholder contains the class at the top of the class heirarchy, i.e.
455 // if we get a placeholder and then find a new class heirarchy, the object table holds the class chain
456 // in descending order of inheritance
457 } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
458 self.object_table[spot] = Archivable::Object(class.clone(), out_v.clone());
459 self.placeholder = None;
460 return Ok(self.object_table.get(spot).cloned());
461 // We got some data for a class that was already seen
462 } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
463 data.extend(out_v.clone());
464 self.placeholder = None;
465 return Ok(self.object_table.get(spot).cloned());
466 // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
467 } else {
468 self.object_table[spot] = Archivable::Data(out_v.clone());
469 self.placeholder = None;
470 return Ok(self.object_table.get(spot).cloned());
471 }
472 }
473 }
474
475 if !out_v.is_empty() && !is_obj {
476 return Ok(Some(Archivable::Data(out_v.clone())));
477 }
478 Ok(None)
479 }
480
481 /// In the original source there are several variants of the header, but we
482 /// only need to validate that this is the header used by macOS/iOS, as iMessage
483 /// is probably not available on any `NeXT` platform
484 pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
485 // Encoding type
486 let typedstream_version = self.read_unsigned_int()?;
487 // Encoding signature
488 let signature = self.read_string()?;
489 // System version
490 let system_version = self.read_signed_int()?;
491
492 if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
493 return Err(TypedStreamError::InvalidHeader);
494 }
495
496 Ok(())
497 }
498
499 /// Attempt to get the data from the `typedstream`.
500 ///
501 /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
502 /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
503 ///
504 /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
505 /// Callers are responsible for assembling the deserialized stream into a useful data structure.
506 ///
507 /// # Example:
508 ///
509 /// ```
510 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
511 ///
512 /// let bytes: Vec<u8> = vec![]; // Example stream
513 /// let mut reader = TypedStreamReader::from(&bytes);
514 /// let result = reader.parse();
515 /// ```
516 ///
517 /// # Sample output:
518 /// ```txt
519 /// [
520 /// Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
521 /// Data([Integer(1), Integer(7)]) // The next object describes properties for the range of chars 1 through 7
522 /// Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)]) // The first property is a `NSDictionary` with 1 item
523 /// Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")]) // The first key in the `NSDictionary`
524 /// Object(Class { name: "NSNumber", version: 0 }, [Integer(0)]) // The first value in the `NSDictionary`
525 /// ]
526 /// ```
527 pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
528 let mut out_v = vec![];
529
530 self.validate_header()?;
531
532 while self.idx < self.stream.len() {
533 if self.get_current_byte()? == END {
534 self.idx += 1;
535 continue;
536 }
537
538 // First, get the current type
539 if let Some(found_types) = self.get_type(false)? {
540 let result = self.read_types(found_types);
541 if let Ok(Some(res)) = result {
542 out_v.push(res);
543 }
544 }
545 }
546
547 Ok(out_v)
548 }
549}