imessage_database/util/data_detected.rs
1/*!
2 Navigation helpers for Apple's `DDScannerResult` archives.
3
4 These payloads are [`NSKeyedArchiver`](crate::util::plist) archives produced by
5 the private `DataDetectorsCore` framework and stored inline in a message's
6 attributed body (e.g. under `__kIMDataDetectedAttributeName`,
7 `__kIMMoneyAttributeName`, or `__kIMAddressAttributeName`). Each archive
8 describes a tree of *scanner results*; every node has a type
9 ([`kind`](ScannerResult::kind)), an optional value ([`value`](ScannerResult::value)),
10 the substring it matched ([`matched`](ScannerResult::matched)), and zero or more
11 nested results ([`children`](ScannerResult::children)).
12
13 [`ScannerResult`] is a lazy, borrowing cursor over one node of that tree. The
14 semantic detector types parse themselves from a node via [`FromScannerResult`].
15*/
16
17use std::io::Cursor;
18
19use crabstep::deserializer::iter::Property;
20use plist::{Dictionary, Value};
21
/// Maximum scanner-result depth before traversal stops.
///
/// `NSKeyedArchiver` graphs are deduplicated by `UID` and may contain reference
/// cycles, so recursion is bounded for malformed payloads. The root node is at
/// depth `0`; a node whose depth has reached this value reports no children,
/// which is what makes a walk over a cyclic archive terminate.
const MAX_DEPTH: usize = 8;
27
28/// Borrowing, lazily-resolved cursor over one `DDScannerResult` tree node.
29///
30/// Fields are stored as `UID` indices into the archive's `$objects` table and
31/// resolved on access, so constructing or walking a `ScannerResult` allocates
32/// nothing beyond the child-index list produced by [`children`](Self::children).
33#[derive(Clone, Copy)]
34pub struct ScannerResult<'a> {
35 /// The archive's `$objects` table; every field is a `UID` index into this.
36 objects: &'a [Value],
37 /// The index of this node within `objects`.
38 index: usize,
39 /// How deep this node sits in the tree, used to bound recursion.
40 depth: usize,
41}
42
43impl<'a> ScannerResult<'a> {
44 /// Resolve the root scanner result from a parsed detector archive.
45 ///
46 /// The root index is stored under `$top.dd-result` (falling back to
47 /// `$top.root`) and points into the archive's `$objects` table.
48 #[must_use]
49 pub fn root(plist: &'a Value) -> Option<Self> {
50 let body = plist.as_dictionary()?;
51 let objects = body.get("$objects")?.as_array()?;
52 let top = body.get("$top")?.as_dictionary()?;
53 let index = top
54 .get("dd-result")
55 .or_else(|| top.get("root"))
56 .and_then(uid_index)?;
57 Some(Self {
58 objects,
59 index,
60 depth: 0,
61 })
62 }
63
64 /// The result type from the `T` field (e.g. `"Money"`, `"Unit"`, `"TrackingNumber"`).
65 #[must_use]
66 pub fn kind(&self) -> Option<&'a str> {
67 self.field_string("T")
68 }
69
70 /// The result value from the `V` field, if present.
71 #[must_use]
72 pub fn value(&self) -> Option<&'a str> {
73 self.field_string("V")
74 }
75
76 /// The substring of the message text this result matched from the `MS` field.
77 #[must_use]
78 pub fn matched(&self) -> Option<&'a str> {
79 self.field_string("MS")
80 }
81
82 /// Child results from the `SR` array, depth-bounded so cyclic archives
83 /// terminate.
84 pub fn children(&self) -> impl Iterator<Item = ScannerResult<'a>> + '_ {
85 self.child_indices()
86 .unwrap_or_default()
87 .into_iter()
88 .map(|index| ScannerResult {
89 objects: self.objects,
90 index,
91 depth: self.depth + 1,
92 })
93 }
94
95 /// The dictionary backing this node.
96 fn dict(&self) -> Option<&'a Dictionary> {
97 self.objects.get(self.index)?.as_dictionary()
98 }
99
100 /// Resolve a `UID`-referenced string field by key.
101 fn field_string(&self, key: &str) -> Option<&'a str> {
102 let reference = self.dict()?.get(key)?;
103 self.objects.get(uid_index(reference)?)?.as_string()
104 }
105
106 /// Resolve the `SR` array to the object indices of its child results, or
107 /// `None` once the depth bound is reached.
108 fn child_indices(&self) -> Option<Vec<usize>> {
109 if self.depth >= MAX_DEPTH {
110 return None;
111 }
112 let sub_results = self.dict()?.get("SR")?;
113 let array = self
114 .objects
115 .get(uid_index(sub_results)?)?
116 .as_dictionary()?
117 .get("NS.objects")?
118 .as_array()?;
119 Some(array.iter().filter_map(uid_index).collect())
120 }
121}
122
123/// Type that can recognize itself from a [`ScannerResult`] node.
124///
125/// Returning `None` means "this node is not of the implementing type," which is
126/// an expected outcome rather than an error.
127pub trait FromScannerResult: Sized {
128 /// Byte markers used to reject impossible payloads before plist parsing.
129 ///
130 /// When non-empty, [`from_attribute`](Self::from_attribute) parses the
131 /// payload only if it contains at least one of these byte sequences. This
132 /// skips deserializing results from the shared `__kIMDataDetectedAttributeName`
133 /// attribute that cannot be `Self`, since that attribute carries every
134 /// data-detector type. Types parsed from a dedicated attribute leave this
135 /// empty (the default).
136 const MARKERS: &[&[u8]] = &[];
137
138 /// Parse `Self` from a scanner-result node, or return `None` on mismatch.
139 fn from_scanner_result(result: &ScannerResult<'_>) -> Option<Self>;
140
141 /// Parse `Self` from a typedstream attribute carrying a `DDScannerResult`
142 /// archive (`NSData` or `NSMutableData`).
143 ///
144 /// Returns `None` when the value is not data, fails the
145 /// [`MARKERS`](Self::MARKERS) pre-filter, is not a valid archive, or does
146 /// not represent a `Self`.
147 fn from_attribute<'p>(value: &Property<'p, 'p>) -> Option<Self> {
148 let data = value.as_data()?;
149 if !Self::MARKERS.is_empty()
150 && !Self::MARKERS
151 .iter()
152 .any(|marker| data.windows(marker.len()).any(|window| window == *marker))
153 {
154 return None;
155 }
156 let plist = Value::from_reader(Cursor::new(data)).ok()?;
157 Self::from_scanner_result(&ScannerResult::root(&plist)?)
158 }
159}
160
161/// Interpret a plist `UID` as an index into the `$objects` table.
162fn uid_index(value: &Value) -> Option<usize> {
163 usize::try_from(value.as_uid()?.get()).ok()
164}