Skip to main content

mxr_sync/
threading.rs

1//! JWZ threading algorithm — reconstruct threads from In-Reply-To + References headers.
2//! See https://www.jwz.org/doc/threading.html
3
4use chrono::{DateTime, Utc};
5use std::collections::HashMap;
6
7#[derive(Debug, Clone)]
8pub struct MessageForThreading {
9    pub message_id: String,
10    pub in_reply_to: Option<String>,
11    pub references: Vec<String>,
12    pub date: DateTime<Utc>,
13    pub subject: String,
14}
15
16#[derive(Debug, Clone)]
17struct Container {
18    message: Option<MessageForThreading>,
19    parent: Option<String>,
20    children: Vec<String>,
21}
22
23impl Container {
24    fn empty() -> Self {
25        Self {
26            message: None,
27            parent: None,
28            children: Vec::new(),
29        }
30    }
31}
32
33#[derive(Debug, Clone)]
34pub struct ThreadTree {
35    pub root_message_id: String,
36    pub messages: Vec<String>,
37}
38
39/// JWZ threading: reconstruct threads from In-Reply-To + References headers.
40pub fn thread_messages(messages: &[MessageForThreading]) -> Vec<ThreadTree> {
41    if messages.is_empty() {
42        return Vec::new();
43    }
44
45    let message_map: HashMap<&str, &MessageForThreading> = messages
46        .iter()
47        .map(|message| (message.message_id.as_str(), message))
48        .collect();
49    let mut id_table: HashMap<String, Container> = HashMap::new();
50
51    // Step 1: Build ID table
52    for msg in messages {
53        let container = id_table
54            .entry(msg.message_id.clone())
55            .or_insert_with(Container::empty);
56        container.message = Some(msg.clone());
57
58        // Walk References header, link each pair as parent→child
59        let mut prev_id: Option<&str> = None;
60        for ref_id in &msg.references {
61            id_table
62                .entry(ref_id.clone())
63                .or_insert_with(Container::empty);
64            if let Some(parent_id) = prev_id {
65                // Set parent_id as parent of ref_id (if not already parented and no cycle)
66                let ref_container = id_table.get(ref_id).unwrap();
67                if ref_container.parent.is_none()
68                    && !would_create_cycle(&id_table, parent_id, ref_id)
69                {
70                    // Clone to avoid borrow issues
71                    let parent_id_owned = parent_id.to_string();
72                    let ref_id_owned = ref_id.clone();
73                    if let Some(c) = id_table.get_mut(&ref_id_owned) {
74                        c.parent = Some(parent_id_owned.clone());
75                    }
76                    if let Some(p) = id_table.get_mut(&parent_id_owned) {
77                        if !p.children.contains(&ref_id_owned) {
78                            p.children.push(ref_id_owned);
79                        }
80                    }
81                }
82            }
83            prev_id = Some(ref_id);
84        }
85
86        // Set last reference (or In-Reply-To) as parent of this message
87        let parent = msg.in_reply_to.as_deref().or(prev_id);
88        if let Some(parent_id) = parent {
89            if parent_id != msg.message_id
90                && !would_create_cycle(&id_table, parent_id, &msg.message_id)
91            {
92                let parent_id_owned = parent_id.to_string();
93                let msg_id = msg.message_id.clone();
94                if let Some(c) = id_table.get_mut(&msg_id) {
95                    c.parent = Some(parent_id_owned.clone());
96                }
97                if let Some(p) = id_table.get_mut(&parent_id_owned) {
98                    if !p.children.contains(&msg_id) {
99                        p.children.push(msg_id);
100                    }
101                }
102            }
103        }
104    }
105
106    // Step 2: Find root set
107    // A root is either:
108    // - A container with no parent that has a message
109    // - A container whose parent is a phantom (no message) with no parent itself
110    let mut roots: Vec<String> = Vec::new();
111    for (id, container) in &id_table {
112        if container.message.is_none() {
113            continue;
114        }
115        match &container.parent {
116            None => roots.push(id.clone()),
117            Some(parent_id) => {
118                // If parent is phantom (no message) and has no parent, promote this as root
119                if let Some(parent) = id_table.get(parent_id) {
120                    if parent.message.is_none() && parent.parent.is_none() {
121                        roots.push(id.clone());
122                    }
123                }
124            }
125        }
126    }
127
128    // Sort roots by date (earliest first)
129    roots.sort_by(|a, b| {
130        let date_a = id_table
131            .get(a)
132            .and_then(|c| c.message.as_ref())
133            .map(|m| m.date)
134            .unwrap_or_default();
135        let date_b = id_table
136            .get(b)
137            .and_then(|c| c.message.as_ref())
138            .map(|m| m.date)
139            .unwrap_or_default();
140        date_a.cmp(&date_b)
141    });
142
143    // Step 3: Build thread trees
144    let mut threads = Vec::new();
145    for root_id in &roots {
146        let mut thread_messages = Vec::new();
147        collect_thread_messages(&id_table, root_id, &mut thread_messages);
148        if !thread_messages.is_empty() {
149            threads.push(ThreadTree {
150                root_message_id: root_id.clone(),
151                messages: thread_messages,
152            });
153        }
154    }
155
156    merge_subject_fallback_threads(threads, &message_map)
157}
158
159fn collect_thread_messages(table: &HashMap<String, Container>, id: &str, out: &mut Vec<String>) {
160    if let Some(container) = table.get(id) {
161        if container.message.is_some() {
162            out.push(id.to_string());
163        }
164        for child_id in &container.children {
165            collect_thread_messages(table, child_id, out);
166        }
167    }
168}
169
170fn would_create_cycle(table: &HashMap<String, Container>, parent_id: &str, child_id: &str) -> bool {
171    // Walk up from parent_id; if we reach child_id, it's a cycle
172    let mut current = Some(parent_id);
173    while let Some(id) = current {
174        if id == child_id {
175            return true;
176        }
177        current = table.get(id).and_then(|c| c.parent.as_deref());
178    }
179    false
180}
181
182fn merge_subject_fallback_threads(
183    threads: Vec<ThreadTree>,
184    message_map: &HashMap<&str, &MessageForThreading>,
185) -> Vec<ThreadTree> {
186    let mut merged: Vec<ThreadTree> = Vec::new();
187    let mut subject_roots: HashMap<String, usize> = HashMap::new();
188
189    for thread in threads {
190        let Some(first_id) = thread.messages.first() else {
191            continue;
192        };
193        let key = message_map
194            .get(first_id.as_str())
195            .map(|message| normalize_subject(&message.subject))
196            .unwrap_or_default();
197        let has_headers = thread.messages.iter().any(|id| {
198            message_map.get(id.as_str()).is_some_and(|message| {
199                message.in_reply_to.is_some() || !message.references.is_empty()
200            })
201        });
202
203        if !has_headers && !key.is_empty() {
204            if let Some(index) = subject_roots.get(&key).copied() {
205                let target = &mut merged[index];
206                for message_id in &thread.messages {
207                    if !target.messages.contains(message_id) {
208                        target.messages.push(message_id.clone());
209                    }
210                }
211                sort_thread_messages(target, message_map);
212                continue;
213            }
214        }
215
216        let mut thread = thread;
217        sort_thread_messages(&mut thread, message_map);
218        if !key.is_empty() {
219            subject_roots.entry(key).or_insert_with(|| merged.len());
220        }
221        merged.push(thread);
222    }
223
224    merged
225}
226
227fn sort_thread_messages(
228    thread: &mut ThreadTree,
229    message_map: &HashMap<&str, &MessageForThreading>,
230) {
231    thread.messages.sort_by(|left, right| {
232        let left_date = message_map
233            .get(left.as_str())
234            .map(|message| message.date)
235            .unwrap_or_default();
236        let right_date = message_map
237            .get(right.as_str())
238            .map(|message| message.date)
239            .unwrap_or_default();
240        left_date.cmp(&right_date)
241    });
242    if let Some(first) = thread.messages.first() {
243        thread.root_message_id = first.clone();
244    }
245}
246
247fn normalize_subject(subject: &str) -> String {
248    let mut normalized = subject.trim();
249
250    loop {
251        let Some(prefix_end) = normalized.find(':') else {
252            break;
253        };
254        let prefix = normalized[..prefix_end].trim();
255        let lower = prefix.to_ascii_lowercase();
256        let base = lower
257            .trim_end_matches(|ch: char| {
258                ch.is_ascii_digit() || matches!(ch, '[' | ']' | '(' | ')' | ' ')
259            })
260            .trim();
261
262        if matches!(
263            base,
264            "re" | "fw" | "fwd" | "aw" | "sv" | "antw" | "rv" | "odp" | "tr" | "wg"
265        ) {
266            normalized = normalized[prefix_end + 1..].trim();
267            continue;
268        }
269
270        break;
271    }
272
273    normalized.to_ascii_lowercase()
274}
275
276#[cfg(test)]
277mod tests {
278    use super::*;
279
280    fn msg(id: &str, reply_to: Option<&str>, refs: &[&str], subject: &str) -> MessageForThreading {
281        MessageForThreading {
282            message_id: id.to_string(),
283            in_reply_to: reply_to.map(|s| s.to_string()),
284            references: refs.iter().map(|s| s.to_string()).collect(),
285            date: Utc::now(),
286            subject: subject.to_string(),
287        }
288    }
289
290    #[test]
291    fn jwz_threading_basic() {
292        let messages = vec![
293            msg("msg1@ex", None, &[], "Hello"),
294            msg("msg2@ex", Some("msg1@ex"), &["msg1@ex"], "Re: Hello"),
295            msg(
296                "msg3@ex",
297                Some("msg2@ex"),
298                &["msg1@ex", "msg2@ex"],
299                "Re: Re: Hello",
300            ),
301        ];
302
303        let threads = thread_messages(&messages);
304        assert_eq!(threads.len(), 1, "Should form one thread");
305        assert_eq!(
306            threads[0].messages.len(),
307            3,
308            "Thread should have 3 messages"
309        );
310        assert_eq!(threads[0].root_message_id, "msg1@ex");
311    }
312
313    #[test]
314    fn jwz_threading_two_independent_threads() {
315        let messages = vec![
316            msg("a1@ex", None, &[], "Topic A"),
317            msg("a2@ex", Some("a1@ex"), &["a1@ex"], "Re: Topic A"),
318            msg("b1@ex", None, &[], "Topic B"),
319            msg("b2@ex", Some("b1@ex"), &["b1@ex"], "Re: Topic B"),
320        ];
321
322        let threads = thread_messages(&messages);
323        assert_eq!(threads.len(), 2, "Should form two threads");
324    }
325
326    #[test]
327    fn jwz_threading_missing_references() {
328        // msg2 replies to msg1, but msg1 is not in our set
329        let messages = vec![
330            msg("msg2@ex", Some("msg1@ex"), &["msg1@ex"], "Re: Hello"),
331            msg(
332                "msg3@ex",
333                Some("msg2@ex"),
334                &["msg1@ex", "msg2@ex"],
335                "Re: Re: Hello",
336            ),
337        ];
338
339        let threads = thread_messages(&messages);
340        // msg2 should be the root (since msg1 is a phantom container without a message)
341        assert_eq!(threads.len(), 1);
342        assert_eq!(threads[0].messages.len(), 2);
343    }
344
345    #[test]
346    fn jwz_threading_no_replies() {
347        let messages = vec![
348            msg("msg1@ex", None, &[], "Hello"),
349            msg("msg2@ex", None, &[], "World"),
350        ];
351
352        let threads = thread_messages(&messages);
353        assert_eq!(threads.len(), 2, "Each message is its own thread");
354    }
355
356    #[test]
357    fn jwz_threading_empty_input() {
358        let threads = thread_messages(&[]);
359        assert!(threads.is_empty());
360    }
361
362    #[test]
363    fn subject_fallback_groups_headerless_replies() {
364        let messages = vec![
365            msg("msg1@ex", None, &[], "Hello"),
366            msg("msg2@ex", None, &[], "Re: Hello"),
367            msg("msg3@ex", None, &[], "AW: Hello"),
368        ];
369
370        let threads = thread_messages(&messages);
371        assert_eq!(threads.len(), 1);
372        assert_eq!(threads[0].messages.len(), 3);
373    }
374
375    #[test]
376    fn subject_fallback_attaches_headerless_reply_to_header_thread() {
377        let messages = vec![
378            msg("msg1@ex", None, &[], "Topic"),
379            msg("msg2@ex", Some("msg1@ex"), &["msg1@ex"], "Re: Topic"),
380            msg("msg3@ex", None, &[], "SV: Topic"),
381        ];
382
383        let threads = thread_messages(&messages);
384        assert_eq!(threads.len(), 1);
385        assert_eq!(threads[0].messages.len(), 3);
386    }
387
388    #[test]
389    fn subject_fallback_does_not_merge_independent_header_threads() {
390        let messages = vec![
391            msg("root-a@ex", None, &[], "Topic"),
392            msg("reply-a@ex", Some("root-a@ex"), &["root-a@ex"], "Re: Topic"),
393            msg("root-b@ex", None, &[], "Topic"),
394            msg("reply-b@ex", Some("root-b@ex"), &["root-b@ex"], "Re: Topic"),
395        ];
396
397        let threads = thread_messages(&messages);
398        assert_eq!(threads.len(), 2);
399    }
400}