imessage_database/tables/
chat_handle.rs

1/*!
2 This module represents the chat to handle join table.
3*/
4
5use std::collections::{BTreeSet, HashMap, HashSet};
6
7use crate::{
8    error::table::TableError,
9    tables::table::{CHAT_HANDLE_JOIN, CHAT_MESSAGE_JOIN, Cacheable, Diagnostic, Table},
10    util::output::{done_processing, processing},
11};
12use rusqlite::{CachedStatement, Connection, Error, Result, Row};
13
14// MARK: Struct
15/// Represents a single row in the `chat_handle_join` table.
16pub struct ChatToHandle {
17    chat_id: i32,
18    handle_id: i32,
19}
20
21impl Table for ChatToHandle {
22    fn from_row(row: &Row) -> Result<ChatToHandle> {
23        Ok(ChatToHandle {
24            chat_id: row.get("chat_id")?,
25            handle_id: row.get("handle_id")?,
26        })
27    }
28
29    fn get(db: &'_ Connection) -> Result<CachedStatement<'_>, TableError> {
30        Ok(db.prepare_cached(&format!("SELECT * FROM {CHAT_HANDLE_JOIN}"))?)
31    }
32
33    fn extract(chat_to_handle: Result<Result<Self, Error>, Error>) -> Result<Self, TableError> {
34        match chat_to_handle {
35            Ok(Ok(chat_to_handle)) => Ok(chat_to_handle),
36            Err(why) | Ok(Err(why)) => Err(TableError::QueryError(why)),
37        }
38    }
39}
40
41// MARK: Cache
42impl Cacheable for ChatToHandle {
43    type K = i32;
44    type V = BTreeSet<i32>;
45    /// Generate a hashmap containing each chatroom's ID pointing to a `HashSet` of participant handle IDs
46    ///
47    /// # Example:
48    ///
49    /// ```
50    /// use imessage_database::util::dirs::default_db_path;
51    /// use imessage_database::tables::table::{Cacheable, get_connection};
52    /// use imessage_database::tables::chat_handle::ChatToHandle;
53    ///
54    /// let db_path = default_db_path();
55    /// let conn = get_connection(&db_path).unwrap();
56    /// let chatrooms = ChatToHandle::cache(&conn);
57    /// ```
58    fn cache(db: &Connection) -> Result<HashMap<Self::K, Self::V>, TableError> {
59        let mut cache: HashMap<i32, BTreeSet<i32>> = HashMap::new();
60
61        let mut rows = ChatToHandle::get(db)?;
62        let mappings = rows.query_map([], |row| Ok(ChatToHandle::from_row(row)))?;
63
64        for mapping in mappings {
65            let joiner = ChatToHandle::extract(mapping)?;
66            if let Some(handles) = cache.get_mut(&joiner.chat_id) {
67                handles.insert(joiner.handle_id);
68            } else {
69                let mut data_to_cache = BTreeSet::new();
70                data_to_cache.insert(joiner.handle_id);
71                cache.insert(joiner.chat_id, data_to_cache);
72            }
73        }
74
75        Ok(cache)
76    }
77}
78
79// MARK: Diagnostic
80impl Diagnostic for ChatToHandle {
81    /// Emit diagnostic data for the Chat to Handle join table
82    ///
83    /// Get the number of chats referenced in the messages table
84    /// that do not exist in this join table:
85    ///
86    /// # Example:
87    ///
88    /// ```
89    /// use imessage_database::util::dirs::default_db_path;
90    /// use imessage_database::tables::table::{Diagnostic, get_connection};
91    /// use imessage_database::tables::chat_handle::ChatToHandle;
92    ///
93    /// let db_path = default_db_path();
94    /// let conn = get_connection(&db_path).unwrap();
95    /// ChatToHandle::run_diagnostic(&conn);
96    /// ```
97    fn run_diagnostic(db: &Connection) -> Result<(), TableError> {
98        processing();
99
100        // Get the Chat IDs that are associated with messages
101        let mut statement_message_chats =
102            db.prepare(&format!("SELECT DISTINCT chat_id from {CHAT_MESSAGE_JOIN}"))?;
103        let statement_message_chat_rows =
104            statement_message_chats.query_map([], |row: &Row| -> Result<i32> { row.get(0) })?;
105        let mut unique_chats_from_messages: HashSet<i32> = HashSet::new();
106        statement_message_chat_rows.into_iter().for_each(|row| {
107            if let Ok(row) = row {
108                unique_chats_from_messages.insert(row);
109            }
110        });
111
112        // Get the Chat IDs that are associated with handles
113        let mut statement_handle_chats =
114            db.prepare(&format!("SELECT DISTINCT chat_id from {CHAT_HANDLE_JOIN}"))?;
115        let statement_handle_chat_rows =
116            statement_handle_chats.query_map([], |row: &Row| -> Result<i32> { row.get(0) })?;
117        let mut unique_chats_from_handles: HashSet<i32> = HashSet::new();
118        statement_handle_chat_rows.into_iter().for_each(|row| {
119            if let Ok(row) = row {
120                unique_chats_from_handles.insert(row);
121            }
122        });
123
124        // Cache all chats
125        let all_chats = Self::cache(db)?;
126
127        // Cache chatroom participants
128        let chatroom_participants = ChatToHandle::cache(db)?;
129        let chat_handle_lookup = ChatToHandle::get_chat_lookup_map(db)?;
130
131        // Deduplicate chatroom participants
132        let real_chatrooms = ChatToHandle::dedupe(&chatroom_participants, &chat_handle_lookup)?;
133
134        // Calculate total duplicated chats
135        let total_dupes =
136            all_chats.len() - HashSet::<&i32>::from_iter(real_chatrooms.values()).len();
137
138        done_processing();
139
140        // Find the set difference and emit
141        let chats_with_no_handles = unique_chats_from_messages
142            .difference(&unique_chats_from_handles)
143            .count();
144
145        println!("Thread diagnostic data:");
146        println!("    Total chats: {}", all_chats.len());
147
148        if total_dupes > 0 {
149            println!("    Total duplicated chats: {total_dupes}");
150        }
151
152        if chats_with_no_handles > 0 {
153            println!("    Chats with no handles: {chats_with_no_handles:?}");
154        }
155        Ok(())
156    }
157}
158
159impl ChatToHandle {
160    /// Get the chat lookup map from the database, if it exists
161    ///
162    /// This is used to map chat IDs that are split across services to a canonical chat ID
163    /// for deduplication purposes.
164    ///
165    /// # Example:
166    ///
167    /// ```
168    /// use imessage_database::util::dirs::default_db_path;
169    /// use imessage_database::tables::table::{Diagnostic, get_connection};
170    /// use imessage_database::tables::chat_handle::ChatToHandle;
171    ///
172    /// let db_path = default_db_path();
173    /// let conn = get_connection(&db_path).unwrap();
174    /// ChatToHandle::get_chat_lookup_map(&conn);
175    /// ```
176    pub fn get_chat_lookup_map(conn: &Connection) -> Result<HashMap<i32, i32>, TableError> {
177        // Query `chat_lookup`, if it exists, to merge chat IDs split across services
178        let mut stmt = conn.prepare(
179            "
180WITH RECURSIVE
181  adj AS (
182    SELECT DISTINCT a.chat AS u, b.chat AS v
183    FROM chat_lookup a
184    JOIN chat_lookup b
185      ON a.identifier = b.identifier
186  ),
187  reach(root, chat) AS (
188    SELECT u AS root, v AS chat FROM adj
189    UNION
190    SELECT r.root, a.v
191    FROM reach r
192    JOIN adj a ON a.u = r.chat
193  ),
194  canon AS (
195    SELECT chat, MAX(root) AS canonical_chat
196    FROM reach
197    GROUP BY chat
198  )
199SELECT chat, canonical_chat
200FROM canon
201ORDER BY chat;
202        ",
203        );
204        let mut chat_lookup_map: HashMap<i32, i32> = HashMap::new();
205
206        if let Ok(statement) = stmt.as_mut() {
207            // Build chat lookup map
208            let chat_lookup_rows = statement.query_map([], |row| {
209                let chat: i32 = row.get(0)?;
210                let canonical: i32 = row.get(1)?;
211                Ok((chat, canonical))
212            });
213
214            // Populate chat lookup map
215            if let Ok(chat_lookup_rows) = chat_lookup_rows {
216                for row in chat_lookup_rows {
217                    let (chat_id, canonical_chat) = row?;
218                    chat_lookup_map.insert(chat_id, canonical_chat);
219                }
220            }
221        }
222        Ok(chat_lookup_map)
223    }
224
225    /// Given the initial set of duplicated chats, deduplicate them based on the participants
226    ///
227    /// This returns a new hashmap that maps the real chat ID to a new deduplicated unique chat ID
228    /// that represents a single chat for all of the same participants, even if they have multiple handles.
229    ///
230    /// Assuming no new chat-handle relationships have been written to the database, deduplicated data is deterministic across runs.
231    ///
232    /// # Example:
233    ///
234    /// ```
235    /// use std::collections::HashMap;
236    ///
237    /// use imessage_database::util::dirs::default_db_path;
238    /// use imessage_database::tables::table::{Cacheable, Deduplicate, get_connection};
239    /// use imessage_database::tables::chat_handle::ChatToHandle;
240    ///
241    /// let db_path = default_db_path();
242    /// let conn = get_connection(&db_path).unwrap();
243    /// let chatrooms = ChatToHandle::cache(&conn).unwrap();
244    /// let deduped_chatrooms = ChatToHandle::dedupe(&chatrooms, &HashMap::new());
245    /// ```
246    pub fn dedupe(
247        duplicated_data: &HashMap<i32, BTreeSet<i32>>,
248        chat_lookup_map: &HashMap<i32, i32>,
249    ) -> Result<HashMap<i32, i32>, TableError> {
250        let mut deduplicated_chats: HashMap<i32, i32> = HashMap::new();
251        let mut participants_to_unique_chat_id: HashMap<BTreeSet<i32>, i32> = HashMap::new();
252
253        // Build cache of each unique set of participants to a new identifier
254        let mut unique_chat_identifier = 0;
255
256        // Iterate over the values in a deterministic order
257        let mut sorted_dupes: Vec<(&i32, &BTreeSet<i32>)> = duplicated_data.iter().collect();
258        sorted_dupes.sort_by(|(a, _), (b, _)| a.cmp(b));
259
260        // Map each chat ID to the deduplicated unique chat ID
261        for (chat_id, participants) in sorted_dupes {
262            // If this set of participants has already been seen, map to the existing unique chat ID
263            if let Some(id) = participants_to_unique_chat_id.get(participants) {
264                deduplicated_chats.insert(chat_id.to_owned(), id.to_owned());
265            } else {
266                // If `chat_lookup` exists, map to canonical chat ID
267                let mapped_id = if let Some(canonical_chat) = chat_lookup_map.get(chat_id) {
268                    canonical_chat
269                } else {
270                    chat_id
271                };
272
273                // Check if the mapped ID has already been seen
274                if let Some(id) = deduplicated_chats.get(mapped_id) {
275                    // Map to the existing unique chat ID
276                    deduplicated_chats.insert(*chat_id, id.to_owned());
277                } else {
278                    // New set of participants, assign a new unique chat ID
279                    participants_to_unique_chat_id
280                        .insert(participants.to_owned(), unique_chat_identifier);
281
282                    // Map chat ID to unique chat ID
283                    deduplicated_chats.insert(chat_id.to_owned(), unique_chat_identifier);
284                    unique_chat_identifier += 1;
285                }
286            }
287        }
288        Ok(deduplicated_chats)
289    }
290}
291
292// MARK: Tests
293#[cfg(test)]
294mod tests {
295    use crate::tables::chat_handle::ChatToHandle;
296    use std::collections::{BTreeSet, HashMap, HashSet};
297
298    #[test]
299    fn can_dedupe() {
300        let mut input: HashMap<i32, BTreeSet<i32>> = HashMap::new();
301        input.insert(1, BTreeSet::from([1])); // 0
302        input.insert(2, BTreeSet::from([1])); // 0
303        input.insert(3, BTreeSet::from([1])); // 0
304        input.insert(4, BTreeSet::from([2])); // 1
305        input.insert(5, BTreeSet::from([2])); // 1
306        input.insert(6, BTreeSet::from([3])); // 2
307
308        let output = ChatToHandle::dedupe(&input, &HashMap::new());
309        let expected_deduped_ids: HashSet<i32> = output.unwrap().values().copied().collect();
310        assert_eq!(expected_deduped_ids.len(), 3);
311    }
312
313    #[test]
314    fn can_dedupe_multi() {
315        let mut input: HashMap<i32, BTreeSet<i32>> = HashMap::new();
316        input.insert(1, BTreeSet::from([1, 2])); // 0
317        input.insert(2, BTreeSet::from([1])); // 1
318        input.insert(3, BTreeSet::from([1])); // 1
319        input.insert(4, BTreeSet::from([2, 1])); // 0
320        input.insert(5, BTreeSet::from([2, 3])); // 2
321        input.insert(6, BTreeSet::from([3])); // 3
322
323        let output = ChatToHandle::dedupe(&input, &HashMap::new());
324        let expected_deduped_ids: HashSet<i32> = output.unwrap().values().copied().collect();
325        assert_eq!(expected_deduped_ids.len(), 4);
326    }
327
328    #[test]
329    // Simulate 3 runs of the program and ensure that the order of the deduplicated contacts is stable
330    fn test_same_values() {
331        let mut input_1: HashMap<i32, BTreeSet<i32>> = HashMap::new();
332        input_1.insert(1, BTreeSet::from([1]));
333        input_1.insert(2, BTreeSet::from([1]));
334        input_1.insert(3, BTreeSet::from([1]));
335        input_1.insert(4, BTreeSet::from([2]));
336        input_1.insert(5, BTreeSet::from([2]));
337        input_1.insert(6, BTreeSet::from([3]));
338
339        let mut input_2: HashMap<i32, BTreeSet<i32>> = HashMap::new();
340        input_2.insert(1, BTreeSet::from([1]));
341        input_2.insert(2, BTreeSet::from([1]));
342        input_2.insert(3, BTreeSet::from([1]));
343        input_2.insert(4, BTreeSet::from([2]));
344        input_2.insert(5, BTreeSet::from([2]));
345        input_2.insert(6, BTreeSet::from([3]));
346
347        let mut input_3: HashMap<i32, BTreeSet<i32>> = HashMap::new();
348        input_3.insert(1, BTreeSet::from([1]));
349        input_3.insert(2, BTreeSet::from([1]));
350        input_3.insert(3, BTreeSet::from([1]));
351        input_3.insert(4, BTreeSet::from([2]));
352        input_3.insert(5, BTreeSet::from([2]));
353        input_3.insert(6, BTreeSet::from([3]));
354
355        let mut output_1 = ChatToHandle::dedupe(&input_1, &HashMap::new())
356            .unwrap()
357            .into_iter()
358            .collect::<Vec<(i32, i32)>>();
359        let mut output_2 = ChatToHandle::dedupe(&input_2, &HashMap::new())
360            .unwrap()
361            .into_iter()
362            .collect::<Vec<(i32, i32)>>();
363        let mut output_3 = ChatToHandle::dedupe(&input_3, &HashMap::new())
364            .unwrap()
365            .into_iter()
366            .collect::<Vec<(i32, i32)>>();
367
368        output_1.sort_unstable();
369        output_2.sort_unstable();
370        output_3.sort_unstable();
371
372        assert_eq!(output_1, output_2);
373        assert_eq!(output_1, output_3);
374        assert_eq!(output_2, output_3);
375    }
376
377    #[test]
378    fn can_dedupe_with_chat_lookup_map() {
379        let mut input: HashMap<i32, BTreeSet<i32>> = HashMap::new();
380        input.insert(0, BTreeSet::from([1])); // Canonical 0
381        input.insert(1, BTreeSet::from([1])); // Maps to 0
382        input.insert(2, BTreeSet::from([3])); // Maps to 5
383        input.insert(4, BTreeSet::from([2])); // Maps to 0
384        input.insert(5, BTreeSet::from([1])); // Canonical 5
385
386        let mut chat_lookup_map: HashMap<i32, i32> = HashMap::new();
387        chat_lookup_map.insert(2, 5);
388        chat_lookup_map.insert(4, 0);
389
390        let output = ChatToHandle::dedupe(&input, &chat_lookup_map).unwrap();
391
392        // Chats 0,1,4 map to 0, so same deduplicated ID
393        assert_eq!(output.get(&0), output.get(&1));
394        assert_eq!(output.get(&0), output.get(&4));
395        // Chat 2 maps to 5, different
396        assert_ne!(output.get(&2), output.get(&1));
397    }
398
399    #[test]
400    fn can_dedupe_with_lookup_map_overriding_participants() {
401        let mut input: HashMap<i32, BTreeSet<i32>> = HashMap::new();
402        input.insert(0, BTreeSet::from([1, 2])); // Canonical 0
403        input.insert(1, BTreeSet::from([1, 2])); // Maps to 0
404        input.insert(2, BTreeSet::from([3, 4])); // Maps to 0
405        input.insert(3, BTreeSet::from([1, 2])); // No mapping
406
407        let mut chat_lookup_map: HashMap<i32, i32> = HashMap::new();
408        chat_lookup_map.insert(1, 0);
409        chat_lookup_map.insert(2, 0);
410
411        let output = ChatToHandle::dedupe(&input, &chat_lookup_map).unwrap();
412
413        // Chats 0,1,2 all map to 0, so same deduplicated ID
414        assert_eq!(output.get(&0), output.get(&1));
415        assert_eq!(output.get(&0), output.get(&2));
416        // Chat 3 no mapping, same participants as 0 and 1, so same
417        assert_eq!(output.get(&3), output.get(&0));
418    }
419
420    #[test]
421    fn can_dedupe_mixed_lookup_and_participants() {
422        let mut input: HashMap<i32, BTreeSet<i32>> = HashMap::new();
423        input.insert(0, BTreeSet::from([1])); // Canonical 0
424        input.insert(1, BTreeSet::from([1])); // Maps to 0
425        input.insert(2, BTreeSet::from([3])); // No mapping
426        input.insert(3, BTreeSet::from([2])); // Maps to 0
427        input.insert(4, BTreeSet::from([3])); // No mapping
428
429        let mut chat_lookup_map: HashMap<i32, i32> = HashMap::new();
430        chat_lookup_map.insert(1, 0);
431        chat_lookup_map.insert(3, 0);
432
433        let output = ChatToHandle::dedupe(&input, &chat_lookup_map).unwrap();
434
435        // Chats 0,1,3 map to 0, same ID
436        assert_eq!(output.get(&0), output.get(&1));
437        assert_eq!(output.get(&0), output.get(&3));
438        // Chat 2 no mapping, different from 1
439        assert_ne!(output.get(&2), output.get(&1));
440        // Chat 4 same participants as 2, same as 2
441        assert_eq!(output.get(&4), output.get(&2));
442        // 3 and 4 different
443        assert_ne!(output.get(&3), output.get(&4));
444    }
445}