// mail_parser/parsers/fields/thread.rs

/*
 * SPDX-FileCopyrightText: 2020 Stalwart Labs LLC <hello@stalw.art>
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 */
6
7fn is_re_prefix(prefix: &str) -> bool {
8    hashify::tiny_set! {prefix.as_bytes(),
9        "re",
10        "res",
11        "sv",
12        "antw",
13        "ref",
14        "aw",
15        "απ",
16        "השב",
17        "vá",
18        "r",
19        "rif",
20        "bls",
21        "odp",
22        "ynt",
23        "atb",
24        "رد",
25        "回复",
26        "转发",
27    }
28}
29
30fn is_fwd_prefix(prefix: &str) -> bool {
31    hashify::tiny_set! {prefix.as_bytes(),
32    "fwd",
33    "fw",
34    "rv",
35    "enc",
36    "vs",
37    "doorst",
38    "vl",
39    "tr",
40    "wg",
41    "πρθ",
42    "הועבר",
43    "továbbítás",
44    "i",
45    "fs",
46    "trs",
47    "vb",
48    "pd",
49    "i̇lt",
50    "yml",
51    "إعادة توجيه",
52    "回覆",
53    "轉寄",
54    }
55}
56
57pub fn thread_name(text: &str) -> &str {
58    let mut token_start = 0;
59    let mut token_end = 0;
60
61    let mut thread_name_start = 0;
62    let mut fwd_start = 0;
63    let mut fwd_end = 0;
64    let mut last_blob_end = 0;
65
66    let mut in_blob = false;
67    let mut in_blob_ignore = false;
68    let mut seen_header = false;
69    let mut seen_blob_header = false;
70    let mut token_found = false;
71
72    for (pos, ch) in text.char_indices() {
73        match ch {
74            '[' => {
75                if !in_blob {
76                    if token_found {
77                        if token_end == 0 {
78                            token_end = pos;
79                        }
80                        let prefix = text[token_start..token_end].to_lowercase();
81                        if is_re_prefix(prefix.as_ref()) || is_fwd_prefix(prefix.as_ref()) {
82                            seen_header = true;
83                        } else {
84                            break;
85                        }
86                    }
87                    token_found = false;
88                    in_blob = true;
89                } else {
90                    break;
91                }
92            }
93            ']' if in_blob => {
94                if seen_blob_header && token_found {
95                    fwd_start = token_start;
96                    fwd_end = pos;
97                }
98                if !seen_header {
99                    last_blob_end = pos + 1;
100                }
101                in_blob = false;
102                token_found = false;
103                seen_blob_header = false;
104                in_blob_ignore = false;
105            }
106            ':' if !in_blob => {
107                if (seen_header && token_found) || (!seen_header && !token_found) {
108                    break;
109                } else if !seen_header {
110                    if token_end == 0 {
111                        token_end = pos;
112                    }
113                    let prefix = text[token_start..token_end].to_lowercase();
114                    if !is_re_prefix(prefix.as_ref()) && !is_fwd_prefix(prefix.as_ref()) {
115                        break;
116                    }
117                } else {
118                    seen_header = false;
119                }
120                thread_name_start = pos + 1;
121                token_found = false;
122            }
123            ':' if in_blob && !in_blob_ignore => {
124                if token_end == 0 {
125                    token_end = pos;
126                }
127
128                let prefix = text[token_start..token_end].to_lowercase();
129                if is_fwd_prefix(prefix.as_ref()) {
130                    token_found = false;
131                    seen_blob_header = true;
132                } else if seen_blob_header && is_re_prefix(prefix.as_ref()) {
133                    token_found = false;
134                } else {
135                    in_blob_ignore = true;
136                }
137            }
138            _ if ch.is_whitespace() => {
139                if token_end == 0 {
140                    token_end = pos;
141                }
142            }
143            _ => {
144                if !token_found {
145                    token_start = pos;
146                    token_end = 0;
147                    token_found = true;
148                } else if !in_blob && pos - token_start > 21 {
149                    break;
150                }
151            }
152        }
153    }
154
155    if last_blob_end > thread_name_start
156        || (fwd_start > 0 && last_blob_end > fwd_start && fwd_start > thread_name_start)
157    {
158        let result = trim_trailing_fwd(&text[last_blob_end..]);
159        if !result.is_empty() {
160            return result;
161        }
162    }
163
164    if fwd_start > 0 && thread_name_start < fwd_start {
165        let result = trim_trailing_fwd(&text[fwd_start..fwd_end]);
166        if !result.is_empty() {
167            return result;
168        }
169    }
170
171    trim_trailing_fwd(&text[thread_name_start..])
172}
173
174pub fn trim_trailing_fwd(text: &str) -> &str {
175    let mut in_parentheses = false;
176    let mut trim_end = true;
177    let mut end_found = false;
178
179    let mut text_start = 0;
180    let mut text_end = text.len();
181    let mut fwd_end = 0;
182
183    for (pos, ch) in text.char_indices().rev() {
184        match ch {
185            '(' if !end_found => {
186                if in_parentheses {
187                    in_parentheses = false;
188                    if fwd_end - pos > 2
189                        && is_fwd_prefix(text[pos + 1..fwd_end].to_lowercase().as_ref())
190                    {
191                        text_end = pos;
192                        trim_end = true;
193                        continue;
194                    }
195                }
196                end_found = true;
197            }
198            ')' if !end_found => {
199                if !in_parentheses {
200                    in_parentheses = true;
201                    fwd_end = pos;
202                } else {
203                    end_found = true;
204                }
205            }
206            _ if ch.is_whitespace() => {
207                if trim_end {
208                    text_end = pos;
209                }
210                continue;
211            }
212            _ => {
213                if !in_parentheses && !end_found {
214                    end_found = true;
215                }
216            }
217        }
218
219        if trim_end {
220            trim_end = false;
221        }
222        text_start = pos;
223    }
224
225    if text_end >= text_start {
226        &text[text_start..text_end]
227    } else {
228        ""
229    }
230}
231
#[cfg(test)]
mod tests {
    use crate::parsers::fields::thread::{thread_name, trim_trailing_fwd};

    /// Subjects paired with the thread name expected from `thread_name`.
    #[test]
    fn parse_thread_name() {
        let cases: &[(&str, &str)] = &[
            // Plain and stacked reply/forward prefixes.
            ("re: hello", "hello"),
            ("re:re: hello", "hello"),
            ("re:fwd: hello", "hello"),
            ("fwd[5]:re[5]: hello", "hello"),
            ("fwd[99]:  re[40]: hello", "hello"),
            // Tokens that are not prefixes are left alone.
            (": hello", ": hello"),
            ("z: hello", "z: hello"),
            ("re:: hello", ": hello"),
            ("[10] hello", "hello"),
            ("fwd[a]: hello", "hello"),
            // Degenerate inputs.
            ("re:", ""),
            ("re::", ":"),
            ("", ""),
            (" ", ""),
            // Localised prefixes.
            ("回复: 轉寄: 轉寄", "轉寄"),
            ("aw[50]: wg: aw[1]: hallo", "hallo"),
            ("res: rv: enc: továbbítás: ", ""),
            // Bracketed forwards and leading blobs.
            ("[fwd: hello world]", "hello world"),
            ("re: enc: re[5]: [fwd: hello world]", "hello world"),
            ("[fwd: re: fw: hello world]", "hello world"),
            ("[fwd: hello world]: another text", ": another text"),
            ("[fwd: re: fwd:] another text", "another text"),
            ("[hello world]", "[hello world]"),
            ("re: fwd[9]: [hello world]", "[hello world]"),
            ("[mailing-list] hello world", "hello world"),
            ("[mailing-list] re: hello world", "hello world"),
            ("[mailing-list] wg[8]:re:  hello world", "hello world"),
            ("hello [world]", "hello [world]"),
            (" [hello] [world] ", "[hello] [world]"),
            ("[mailing-list] hello [world]", "hello [world]"),
            ("[hello [world]", "[hello [world]"),
            ("[]hello [world]", "hello [world]"),
            ("[fwd: re: re:] fwd[6]:re:  fw:", ""),
            ("[fwd hello] world hello", "world hello"),
            ("[fwd: مرحبا بالعالم]", "مرحبا بالعالم"),
            ("[fwd: hello world] مرحبا بالعالم", "مرحبا بالعالم"),
            ("  hello world  ", "hello world"),
            // Trailing "(fwd)" markers combined with prefixes.
            (
                "[mailing-list] wg[8]:re:  hello world (fwd)(fwd)",
                "hello world",
            ),
            ("[fwd: re: fw: hello world (fwd)]", "hello world"),
            (
                "res: rv: enc: továbbítás: hello world (doorst)",
                "hello world",
            ),
            ("[fwd: re: re: (fwd)] fwd[6]:re:  fw: (fwd)", ""),
        ];

        for &(input, expected) in cases {
            assert_eq!(thread_name(input), expected, "{input:?}");
        }
    }

    /// Inputs paired with the result expected from `trim_trailing_fwd`.
    #[test]
    fn parse_trail_fwd() {
        let cases: &[(&str, &str)] = &[
            ("hello (fwd)", "hello"),
            (" hello (fwd)(fwd)", "hello"),
            ("hello (wg) (fwd) (fwd)", "hello"),
            ("(fwd)(fwd)", ""),
            ("(fwd)hello(fwd)", "(fwd)hello"),
            ("  hello  ", "hello"),
            ("  hello world   ", "hello world"),
            ("", ""),
            ("    ", ""),
            ("hello ()(fwd)", "hello ()"),
            ("(hello)", "(hello)"),
            ("hello () (fwd) ()(fwd)", "hello () (fwd) ()"),
            (")(", ")("),
            (" 你好世界(fwd) ", "你好世界"),
            ("你好世界 (回覆)", "你好世界"),
            ("hello(fwd", "hello(fwd"),
            ("hello(fwd))", "hello(fwd))"),
        ];

        for &(input, expected) in cases {
            assert_eq!(trim_trailing_fwd(input), expected, "{input:?}");
        }
    }
}