imessage_database/util/typedstream/parser.rs
1/*!
2 Logic used to deserialize data from a `typedstream`, focussing specifically on [`NSAttributedString`](https://developer.apple.com/documentation/foundation/nsattributedstring).
3
4 Logic reverse engineered from `typedstream` source located at:
5 - [`typedstream.h`](https://github.com/gnustep/libobjc/blob/master/objc/typedstream.h)
6 - [`archive.c`](https://github.com/gnustep/libobjc/blob/master/archive.c)
7 - [`objc/typedstream.m`](https://securitronlinux.com/news/html/d4/d6c/typedstream_8m.html)
8
9 A writeup about the reverse engineering of `typedstream` can be found [here](https://chrissardegna.com/blog/reverse-engineering-apples-typedstream-format/).
10*/
11use std::collections::HashSet;
12
13use crate::{
14 error::typedstream::TypedStreamError,
15 util::typedstream::models::{Archivable, Class, ClassResult, OutputData, Type},
16};
17
/// Tag byte indicating that a 2-byte little-endian integer follows
/// (decoded as [`i16`] or [`u16`] depending on the reader used)
const I_16: u8 = 0x81;
/// Tag byte indicating that a 4-byte little-endian integer follows
/// (decoded as [`i32`] or [`u32`] depending on the reader used)
const I_32: u8 = 0x82;
/// Tag byte indicating an [`f32`] or [`f64`] in the byte stream; the [`Type`] determines the size
/// (4 bytes for a float, 8 bytes for a double, both little-endian)
const DECIMAL: u8 = 0x83;
/// Indicates the start of a new object
const START: u8 = 0x84;
/// Indicates that there is no more data to parse, for example the end of a class inheritance chain
const EMPTY: u8 = 0x85;
/// Indicates the last byte of an object
const END: u8 = 0x86;
/// Bytes equal or greater in value than the reference tag indicate an index in the table of already-seen types
///
/// Stored as a [`u64`] so it can be compared directly against decoded unsigned integers;
/// it is cast down where it is compared against a single byte.
const REFERENCE_TAG: u64 = 0x92;
32
/// Contains logic and data used to deserialize data from a `typedstream`.
///
/// `typedstream` is a binary serialization format developed by NeXT and later adopted by Apple.
/// It's designed to serialize and deserialize complex object graphs and data structures in C and Objective-C.
///
/// A `typedstream` begins with a header that includes format version and architecture information,
/// followed by a stream of typed data elements. Each element is prefixed with type information,
/// allowing the [`TypedStreamReader`] to understand the original data structures.
///
/// The reader borrows the stream for `'a` and keeps a cursor plus lookup tables that are
/// filled in as parsing progresses.
#[derive(Debug)]
pub struct TypedStreamReader<'a> {
    /// The `typedstream` we want to parse
    stream: &'a [u8],
    /// The current index we are at in the stream
    idx: usize,
    /// As we parse the `typedstream`, build a table of seen [`Type`]s to reference in the future
    ///
    /// The first time a [`Type`] is seen, it is present in the stream literally,
    /// but afterwards it is only referenced by index in order of appearance.
    types_table: Vec<Vec<Type>>,
    /// As we parse the `typedstream`, build a table of seen archivable data to reference in the future
    object_table: Vec<Archivable>,
    /// We want to copy embedded types the first time they are seen, even if the types were resolved through references
    ///
    /// Holds the reference indexes (into `types_table`) that were already copied into `object_table`.
    seen_embedded_types: HashSet<u32>,
    /// Stores the position of the current [`Archivable::Placeholder`] in `object_table`,
    /// or `None` when no slot is currently reserved
    placeholder: Option<usize>,
}
59
60impl<'a> TypedStreamReader<'a> {
61 /// Given a stream, construct a reader instance to parse it.
62 ///
63 /// # Example:
64 ///
65 /// ```
66 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
67 ///
68 /// let bytes: Vec<u8> = vec![]; // Example stream
69 /// let mut reader = TypedStreamReader::from(&bytes);
70 /// ```
71 pub fn from(stream: &'a [u8]) -> Self {
72 Self {
73 stream,
74 idx: 0,
75 types_table: vec![],
76 object_table: vec![],
77 seen_embedded_types: HashSet::new(),
78 placeholder: None,
79 }
80 }
81
82 /// Read a signed integer from the stream. Because we don't know the size of the integer ahead of time,
83 /// we store it in the largest possible value.
84 fn read_signed_int(&mut self) -> Result<i64, TypedStreamError> {
85 match self.get_current_byte()? {
86 I_16 => {
87 let size = 2;
88 self.idx += 1;
89 let value = i16::from_le_bytes(
90 self.read_exact_bytes(size)?
91 .try_into()
92 .map_err(TypedStreamError::SliceError)?,
93 );
94 Ok(value as i64)
95 }
96 I_32 => {
97 let size = 4;
98 self.idx += 1;
99 let value = i32::from_le_bytes(
100 self.read_exact_bytes(size)?
101 .try_into()
102 .map_err(TypedStreamError::SliceError)?,
103 );
104 Ok(value as i64)
105 }
106 _ => {
107 if self.get_current_byte()? > REFERENCE_TAG as u8 && self.get_next_byte()? != END {
108 self.idx += 1;
109 return self.read_signed_int();
110 }
111 let value = i8::from_le_bytes([self.get_current_byte()?]);
112 self.idx += 1;
113 Ok(value as i64)
114 }
115 }
116 }
117
118 /// Read an unsigned integer from the stream. Because we don't know the size of the integer ahead of time,
119 /// we store it in the largest possible value.
120 fn read_unsigned_int(&mut self) -> Result<u64, TypedStreamError> {
121 match self.get_current_byte()? {
122 I_16 => {
123 let size = 2;
124 self.idx += 1;
125 let value = u16::from_le_bytes(
126 self.read_exact_bytes(size)?
127 .try_into()
128 .map_err(TypedStreamError::SliceError)?,
129 );
130 Ok(value as u64)
131 }
132 I_32 => {
133 let size = 4;
134 self.idx += 1;
135 let value = u32::from_le_bytes(
136 self.read_exact_bytes(size)?
137 .try_into()
138 .map_err(TypedStreamError::SliceError)?,
139 );
140 Ok(value as u64)
141 }
142 _ => {
143 let value = u8::from_le_bytes([self.get_current_byte()?]);
144 self.idx += 1;
145 Ok(value as u64)
146 }
147 }
148 }
149
150 /// Read a single-precision float from the byte stream
151 fn read_float(&mut self) -> Result<f32, TypedStreamError> {
152 match self.get_current_byte()? {
153 DECIMAL => {
154 let size = 4;
155 self.idx += 1;
156 let value = f32::from_le_bytes(
157 self.read_exact_bytes(size)?
158 .try_into()
159 .map_err(TypedStreamError::SliceError)?,
160 );
161 Ok(value)
162 }
163 I_16 | I_32 => Ok(self.read_signed_int()? as f32),
164 _ => {
165 self.idx += 1;
166 Ok(self.read_signed_int()? as f32)
167 }
168 }
169 }
170
171 /// Read a double-precision float from the byte stream
172 fn read_double(&mut self) -> Result<f64, TypedStreamError> {
173 match self.get_current_byte()? {
174 DECIMAL => {
175 let size = 8;
176 self.idx += 1;
177 let value = f64::from_le_bytes(
178 self.read_exact_bytes(size)?
179 .try_into()
180 .map_err(TypedStreamError::SliceError)?,
181 );
182 Ok(value)
183 }
184 I_16 | I_32 => Ok(self.read_signed_int()? as f64),
185 _ => {
186 self.idx += 1;
187 Ok(self.read_signed_int()? as f64)
188 }
189 }
190 }
191
192 /// Read exactly `n` bytes from the stream
193 fn read_exact_bytes(&mut self, n: usize) -> Result<&[u8], TypedStreamError> {
194 let range =
195 self.stream
196 .get(self.idx..self.idx + n)
197 .ok_or(TypedStreamError::OutOfBounds(
198 self.idx + n,
199 self.stream.len(),
200 ))?;
201 self.idx += n;
202 Ok(range)
203 }
204
205 /// Read `n` bytes as a String
206 fn read_exact_as_string(
207 &mut self,
208 n: usize,
209 string: &mut String,
210 ) -> Result<(), TypedStreamError> {
211 let str = std::str::from_utf8(self.read_exact_bytes(n)?)
212 .map_err(TypedStreamError::StringParseError)?;
213 string.push_str(str);
214 Ok(())
215 }
216
217 /// Get the byte at a given index, if the index is within the bounds of the `typedstream`
218 fn get_byte(&self, byte_idx: usize) -> Result<u8, TypedStreamError> {
219 if byte_idx < self.stream.len() {
220 return Ok(self.stream[byte_idx]);
221 }
222 Err(TypedStreamError::OutOfBounds(byte_idx, self.stream.len()))
223 }
224
225 /// Read the current byte
226 fn get_current_byte(&self) -> Result<u8, TypedStreamError> {
227 self.get_byte(self.idx)
228 }
229
230 /// Read the next byte
231 fn get_next_byte(&self) -> Result<u8, TypedStreamError> {
232 self.get_byte(self.idx + 1)
233 }
234
235 /// Read some bytes as an array
236 fn read_array(&mut self, size: usize) -> Result<Vec<u8>, TypedStreamError> {
237 Ok(self.read_exact_bytes(size)?.to_vec())
238 }
239
240 /// Determine the current types
241 fn read_type(&mut self) -> Result<Vec<Type>, TypedStreamError> {
242 let length = self.read_unsigned_int()?;
243
244 let types = self.read_exact_bytes(length as usize)?;
245
246 // Handle array size
247 if types.first() == Some(&0x5b) {
248 return Type::get_array_length(types).ok_or(TypedStreamError::InvalidArray);
249 }
250
251 Ok(types.iter().map(Type::from_byte).collect())
252 }
253
254 /// Read a reference pointer for a Type
255 fn read_pointer(&mut self) -> Result<u32, TypedStreamError> {
256 let pointer = self.get_current_byte()?;
257 let result = (pointer as u32)
258 .checked_sub(REFERENCE_TAG as u32)
259 .ok_or(TypedStreamError::InvalidPointer(pointer));
260 self.idx += 1;
261 result
262 }
263
264 /// Read a class
265 fn read_class(&mut self) -> Result<ClassResult, TypedStreamError> {
266 let mut out_v: Vec<Archivable> = vec![];
267 match self.get_current_byte()? {
268 START => {
269 // Skip some header bytes
270 while self.get_current_byte()? == START {
271 self.idx += 1;
272 }
273 let length = self.read_unsigned_int()?;
274
275 if length >= REFERENCE_TAG {
276 let index = length - REFERENCE_TAG;
277 return Ok(ClassResult::Index(index as usize));
278 }
279
280 let mut class_name = String::with_capacity(length as usize);
281 self.read_exact_as_string(length as usize, &mut class_name)?;
282
283 let version = self.read_unsigned_int()?;
284
285 self.types_table
286 .push(vec![Type::new_string(class_name.clone())]);
287
288 out_v.push(Archivable::Class(Class::new(class_name, version)));
289
290 if let ClassResult::ClassHierarchy(parent) = self.read_class()? {
291 out_v.extend(parent);
292 }
293 }
294 EMPTY => {
295 self.idx += 1;
296 }
297 _ => {
298 let index = self.read_pointer()?;
299 return Ok(ClassResult::Index(index as usize));
300 }
301 }
302 Ok(ClassResult::ClassHierarchy(out_v))
303 }
304
305 /// Read an object into the cache and emit, or emit an already-cached object
306 fn read_object(&mut self) -> Result<Option<&Archivable>, TypedStreamError> {
307 match self.get_current_byte()? {
308 START => {
309 match self.read_class()? {
310 ClassResult::Index(idx) => {
311 return Ok(self.object_table.get(idx));
312 }
313 ClassResult::ClassHierarchy(classes) => {
314 for class in classes.into_iter() {
315 self.object_table.push(class)
316 }
317 }
318 }
319 Ok(None)
320 }
321 EMPTY => {
322 self.idx += 1;
323 Ok(None)
324 }
325 _ => {
326 let index = self.read_pointer()?;
327 Ok(self.object_table.get(index as usize))
328 }
329 }
330 }
331
332 /// Read String data
333 fn read_string(&mut self) -> Result<String, TypedStreamError> {
334 let length = self.read_unsigned_int()?;
335 let mut string = String::with_capacity(length as usize);
336 self.read_exact_as_string(length as usize, &mut string)?;
337
338 Ok(string)
339 }
340
341 /// [`Archivable`] data can be embedded on a class or in a C String marked as [`Type::EmbeddedData`]
342 fn read_embedded_data(&mut self) -> Result<Option<Archivable>, TypedStreamError> {
343 // Skip the 0x84
344 self.idx += 1;
345 match self.get_type(true)? {
346 Some(types) => self.read_types(types),
347 None => Ok(None),
348 }
349 }
350
351 /// Gets the current type from the stream, either by reading it from the stream or reading it from
352 /// the specified index of [`TypedStreamReader::types_table`]. Because methods that use this type can also mutate self,
353 /// returning a reference here means other methods could make that reference to the table invalid,
354 /// which is disallowed in Rust. Thus, we return a clone of the cached data.
355 fn get_type(&mut self, embedded: bool) -> Result<Option<Vec<Type>>, TypedStreamError> {
356 match self.get_current_byte()? {
357 START => {
358 // Ignore repeated types, for example in a dict
359 self.idx += 1;
360
361 let object_types = self.read_type()?;
362
363 // Embedded data is stored as a C String in the objects table
364 if embedded {
365 self.object_table
366 .push(Archivable::Type(object_types.clone()));
367 }
368 self.types_table.push(object_types);
369 Ok(self.types_table.last().cloned())
370 }
371 END => {
372 // This indicates the end of the current object
373 Ok(None)
374 }
375 _ => {
376 // Ignore repeated types, for example in a dict
377 while self.get_current_byte()? == self.get_next_byte()? {
378 self.idx += 1;
379 }
380
381 let ref_tag = self.read_pointer()?;
382 let result = self.types_table.get(ref_tag as usize);
383
384 if embedded {
385 if let Some(res) = result {
386 // We only want to include the first embedded reference tag, not subsequent references to the same embed
387 if !self.seen_embedded_types.contains(&ref_tag) {
388 self.object_table.push(Archivable::Type(res.clone()));
389 self.seen_embedded_types.insert(ref_tag);
390 }
391 }
392 }
393
394 Ok(result.cloned())
395 }
396 }
397 }
398
/// Given some [`Type`]s, look at the stream and parse the data according to the specified [`Type`]
///
/// Each type is handled in order and its decoded value is collected. Two cases end the walk early:
/// - [`Type::EmbeddedData`] hands over to [`TypedStreamReader::read_embedded_data`] and returns its result
/// - [`Type::Object`] that resolves to an already-populated object returns a copy of that object
///
/// After the walk, if a placeholder slot is reserved in the object table and something was
/// collected, the slot is filled:
/// - with an empty object when the last collected item is a class (its data arrives in a later
///   call; the placeholder stays reserved and `Ok(None)` is returned)
/// - otherwise with the collected data, which is then returned
///
/// Collected data that did not involve an object is returned as [`Archivable::Data`];
/// in all remaining cases the result is `Ok(None)`.
///
/// # Errors
///
/// Propagates any failure from the readers used to decode the individual types.
fn read_types(
    &mut self,
    found_types: Vec<Type>,
) -> Result<Option<Archivable>, TypedStreamError> {
    let mut out_v = vec![];
    let mut is_obj: bool = false;

    for found_type in found_types {
        match found_type {
            Type::Utf8String => out_v.push(OutputData::String(self.read_string()?)),
            Type::EmbeddedData => {
                return self.read_embedded_data();
            }
            Type::Object => {
                is_obj = true;
                // Reserve a slot for this object before reading it, and remember where it is
                let length = self.object_table.len();
                self.placeholder = Some(length);
                self.object_table.push(Archivable::Placeholder);
                if let Some(object) = self.read_object()? {
                    match object.clone() {
                        Archivable::Object(_, data) => {
                            // If this is a new object, i.e. one without any data, we add the data into it later
                            // If the object already has data in it, we just want to return that object
                            if !data.is_empty() {
                                let result = Ok(Some(object.clone()));
                                // The reserved slot is not needed for an already-known object
                                self.placeholder = None;
                                self.object_table.pop();
                                return result;
                            }
                            out_v.extend(data)
                        }
                        Archivable::Class(cls) => out_v.push(OutputData::Class(cls)),
                        Archivable::Data(data) => out_v.extend(data),
                        // These cases are used internally in the objects table but should not be present in any output
                        Archivable::Placeholder | Archivable::Type(_) => {}
                    }
                }
            }
            Type::SignedInt => out_v.push(OutputData::SignedInteger(self.read_signed_int()?)),
            Type::UnsignedInt => {
                out_v.push(OutputData::UnsignedInteger(self.read_unsigned_int()?))
            }
            Type::Float => out_v.push(OutputData::Float(self.read_float()?)),
            Type::Double => out_v.push(OutputData::Double(self.read_double()?)),
            // Unknown bytes and literal strings carry their own value; nothing is read from the stream
            Type::Unknown(byte) => out_v.push(OutputData::Byte(byte)),
            Type::String(s) => out_v.push(OutputData::String(s)),
            Type::Array(size) => out_v.push(OutputData::Array(self.read_array(size)?)),
        };
    }

    // If we had reserved a place for an object, fill that spot
    if let Some(spot) = self.placeholder {
        if !out_v.is_empty() {
            // We got a class, but do not have its respective data yet
            if let Some(OutputData::Class(class)) = out_v.last() {
                self.object_table[spot] = Archivable::Object(class.clone(), vec![]);
            // The spot after the current placeholder contains the class at the top of the class hierarchy, i.e.
            // if we get a placeholder and then find a new class hierarchy, the object table holds the class chain
            // in descending order of inheritance
            } else if let Some(Archivable::Class(class)) = self.object_table.get(spot + 1) {
                self.object_table[spot] = Archivable::Object(class.clone(), out_v.clone());
                self.placeholder = None;
                return Ok(self.object_table.get(spot).cloned());
            // We got some data for a class that was already seen
            } else if let Some(Archivable::Object(_, data)) = self.object_table.get_mut(spot) {
                data.extend(out_v.clone());
                self.placeholder = None;
                return Ok(self.object_table.get(spot).cloned());
            // We got some data that is not part of a class, i.e. a field in the parent object for which we don't know the name
            } else {
                self.object_table[spot] = Archivable::Data(out_v.clone());
                self.placeholder = None;
                return Ok(self.object_table.get(spot).cloned());
            }
        }
    }

    // Plain data that was never associated with an object is emitted as-is
    if !out_v.is_empty() && !is_obj {
        return Ok(Some(Archivable::Data(out_v.clone())));
    }
    Ok(None)
}
482
483 /// In the original source there are several variants of the header, but we
484 /// only need to validate that this is the header used by macOS/iOS, as iMessage
485 /// is probably not available on any NeXT platform
486 pub(crate) fn validate_header(&mut self) -> Result<(), TypedStreamError> {
487 // Encoding type
488 let typedstream_version = self.read_unsigned_int()?;
489 // Encoding signature
490 let signature = self.read_string()?;
491 // System version
492 let system_version = self.read_signed_int()?;
493
494 if typedstream_version != 4 || signature != "streamtyped" || system_version != 1000 {
495 return Err(TypedStreamError::InvalidHeader);
496 }
497
498 Ok(())
499 }
500
501 /// Attempt to get the data from the `typedstream`.
502 ///
503 /// Given a stream, construct a reader object to parse it. `typedstream` data doesn't include property
504 /// names, so data is stored on [`Object`](crate::util::typedstream::models::Archivable::Object)s in order of appearance.
505 ///
506 /// Yields a new [`Archivable`] as they occur in the stream, but does not retain the object's inheritance heirarchy.
507 /// Callers are responsible for assembling the deserialized stream into a useful data structure.
508 ///
509 /// # Example:
510 ///
511 /// ```
512 /// use imessage_database::util::typedstream::parser::TypedStreamReader;
513 ///
514 /// let bytes: Vec<u8> = vec![]; // Example stream
515 /// let mut reader = TypedStreamReader::from(&bytes);
516 /// let result = reader.parse();
517 /// ```
518 ///
519 /// # Sample output:
520 /// ```txt
521 /// [
522 /// Object(Class { name: "NSMutableString", version: 1 }, [String("Example")]) // The message text
523 /// Data([Integer(1), Integer(7)]) // The next object describes properties for the range of chars 1 through 7
524 /// Object(Class { name: "NSDictionary", version: 0 }, [Integer(1)]) // The first property is a `NSDictionary` with 1 item
525 /// Object(Class { name: "NSString", version: 1 }, [String("__kIMMessagePartAttributeName")]) // The first key in the `NSDictionary`
526 /// Object(Class { name: "NSNumber", version: 0 }, [Integer(0)]) // The first value in the `NSDictionary`
527 /// ]
528 /// ```
529 pub fn parse(&mut self) -> Result<Vec<Archivable>, TypedStreamError> {
530 let mut out_v = vec![];
531
532 self.validate_header()?;
533
534 while self.idx < self.stream.len() {
535 if self.get_current_byte()? == END {
536 self.idx += 1;
537 continue;
538 }
539
540 // First, get the current type
541 if let Some(found_types) = self.get_type(false)? {
542 let result = self.read_types(found_types);
543 if let Ok(Some(res)) = result {
544 out_v.push(res);
545 }
546 }
547 }
548
549 Ok(out_v)
550 }
551}