uuencoding_multi/
subject.rs

1use std::sync::OnceLock;
2
3use regex::Regex;
4
5use crate::SubjectParts;
6
7// ---------------------------------------------------------------------------
8// Compiled-once regex patterns
9// ---------------------------------------------------------------------------
10//
11// Pattern priority (most specific first):
12//   1. Parenthesised fraction:  (03/17)
13//   2. Bracketed fraction:      [2/4]   — only when both sides are digits
14//   3. English "part N/M":      Part 3/17
15//   4. English "part N of M":   Part 03 of 17 / part3of17
16//   5. Dash-separated fraction: - 03/17
17//
18// All patterns are compiled once into a static array via `OnceLock`.
19
// One compiled part-marker pattern.
//
// `parse_subject` reads capture group 1 of `re` as the part index and
// capture group 2 as the part total, so every pattern stored here must
// define both groups in that order.
struct Pattern {
    // Compiled regular expression for this marker format.
    re: Regex,
}
23
24fn patterns() -> &'static [Pattern; 5] {
25    static PATTERNS: OnceLock<[Pattern; 5]> = OnceLock::new();
26    PATTERNS.get_or_init(|| {
27        [
28            // 1. Parenthesised fraction: (03/17) or ( 3 / 17 )
29            Pattern {
30                re: Regex::new(r"\(\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\)").unwrap(),
31            },
32            // 2. Bracketed fraction: [2/4] — require digit on both sides so
33            //    [BINARY] is not matched.
34            Pattern {
35                re: Regex::new(r"\[\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\]").unwrap(),
36            },
37            // 3. English "Part N/M" (case-insensitive via (?i))
38            Pattern {
39                re: Regex::new(r"(?i)\bpart\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
40            },
41            // 4. English "Part N of M" / "Part3of17" (case-insensitive)
42            Pattern {
43                re: Regex::new(r"(?i)\bpart\s*(\d{1,6})\s*of\s*(\d{1,6})\b").unwrap(),
44            },
45            // 5. Dash-separated fraction: " - 03/17"
46            Pattern {
47                re: Regex::new(r"\s+-\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
48            },
49        ]
50    })
51}
52
53// Matches a yEnc marker at a word boundary (case-insensitive).
54fn yenc_re() -> &'static Regex {
55    static RE: OnceLock<Regex> = OnceLock::new();
56    RE.get_or_init(|| Regex::new(r"(?i)\byenc\b").unwrap())
57}
58
59// Strips common reply/forward prefixes (case-insensitive) repeatedly.
60fn strip_prefixes(s: &str) -> &str {
61    static RE: OnceLock<Regex> = OnceLock::new();
62    let re = RE.get_or_init(|| Regex::new(r"(?i)^(re|fwd?)\s*:\s*").unwrap());
63
64    let mut cur = s;
65    loop {
66        let stripped = re.find(cur).map(|m| &cur[m.end()..]).unwrap_or(cur);
67        if stripped.len() == cur.len() {
68            break;
69        }
70        cur = stripped;
71    }
72    cur
73}
74
75// ---------------------------------------------------------------------------
76// Public API
77// ---------------------------------------------------------------------------
78
79/// Parse a multi-part Usenet/email subject line.
80///
81/// Recognises five marker formats (in priority order):
82/// 1. Parenthesised fraction: `(03/17)` or `( 3 / 17 )`
83/// 2. Bracketed fraction: `[2/4]` (only when both sides are digits)
84/// 3. English "Part N/M" (case-insensitive)
85/// 4. English "Part N of M" / `part3of17` (case-insensitive)
86/// 5. Dash-separated fraction: ` - 03/17`
87///
88/// Leading `Re:`, `Fwd:`, and `Fw:` prefixes are stripped before matching
89/// (repeatedly, to handle nested re-forwards). The extracted part marker is
90/// removed from the subject to produce `base_subject`.
91///
92/// # Return value
93///
94/// Returns `None` only when:
95/// - `subject` is empty, or
96/// - `subject` contains a `yEnc` marker (those posts use a distinct encoding
97///   that is explicitly out of scope for this crate).
98///
99/// Otherwise returns `Some(SubjectParts)`. When no part-marker pattern
100/// matches, `part_index` and `part_total` are both `None` and `base_subject`
101/// is the prefix-stripped, trimmed input.
102///
103/// # Never panics
104///
105/// This function never panics on any input, including strings containing
106/// arbitrary Unicode code points.
107///
108/// # Examples
109///
110/// ```
111/// use uuencoding_multi::parse_subject;
112///
113/// // Parenthesised fraction — the most common Usenet format.
114/// let sp = parse_subject("bigfile.rar (2/5)").unwrap();
115/// assert_eq!(sp.part_index, Some(2));
116/// assert_eq!(sp.part_total, Some(5));
117/// assert_eq!(sp.base_subject, "bigfile.rar");
118/// ```
119///
120/// ```
121/// use uuencoding_multi::parse_subject;
122///
123/// // Re: prefix is stripped before matching.
124/// let sp = parse_subject("Re: archive.tar.gz (03/17)").unwrap();
125/// assert_eq!(sp.part_index, Some(3));
126/// assert_eq!(sp.part_total, Some(17));
127/// ```
128///
129/// ```
130/// use uuencoding_multi::parse_subject;
131///
132/// // yEnc subject → None (out of scope for this crate).
133/// assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
134/// ```
135///
136/// ```
137/// use uuencoding_multi::parse_subject;
138///
139/// // Empty input → None.
140/// assert!(parse_subject("").is_none());
141/// ```
142///
143/// ```
144/// use uuencoding_multi::parse_subject;
145///
146/// // No marker → Some with None fields and subject preserved.
147/// let sp = parse_subject("just a plain subject").unwrap();
148/// assert_eq!(sp.part_index, None);
149/// assert_eq!(sp.part_total, None);
150/// assert_eq!(sp.base_subject, "just a plain subject");
151/// ```
152pub fn parse_subject(subject: &str) -> Option<SubjectParts> {
153    if subject.is_empty() {
154        return None;
155    }
156
157    // yEnc posts are out of scope.
158    if yenc_re().is_match(subject) {
159        return None;
160    }
161
162    let stripped = strip_prefixes(subject).trim();
163
164    for pat in patterns() {
165        if let Some(caps) = pat.re.captures(stripped) {
166            // Capture group 1 is always the part index.
167            let part_index: u32 = caps[1].parse().ok()?;
168            // Capture group 2 is always the total (all five patterns have it).
169            let part_total: u32 = caps[2].parse().ok()?;
170
171            // Build base_subject: remove the matched span from `stripped`.
172            let m = caps.get(0).unwrap();
173            let before = stripped[..m.start()].trim_end();
174            let after = stripped[m.end()..].trim_start();
175
176            let raw = if before.is_empty() {
177                after.to_string()
178            } else if after.is_empty() {
179                before.to_string()
180            } else {
181                format!("{} {}", before, after)
182            };
183
184            // Strip trailing " -" artifact (e.g. "filename.tar.gz -").
185            let base_subject = raw
186                .trim_end_matches(|c: char| c == '-' || c.is_whitespace())
187                .trim()
188                .to_string();
189
190            return Some(SubjectParts {
191                base_subject,
192                part_index: Some(part_index),
193                part_total: Some(part_total),
194            });
195        }
196    }
197
198    // No marker found.
199    Some(SubjectParts {
200        base_subject: stripped.to_string(),
201        part_index: None,
202        part_total: None,
203    })
204}
205
206// ---------------------------------------------------------------------------
207// Tests
208// ---------------------------------------------------------------------------
209
#[cfg(test)]
mod tests {
    use super::*;

    // Parses `subject`, panicking if the parser returns `None`.
    fn parts(subject: &str) -> SubjectParts {
        parse_subject(subject).unwrap()
    }

    // Asserts that `subject` yields the given part index and total, and
    // hands back the parsed value so a test can also check `base_subject`.
    fn assert_marker(subject: &str, index: u32, total: u32) -> SubjectParts {
        let p = parts(subject);
        assert_eq!(p.part_index, Some(index), "part_index for {subject:?}");
        assert_eq!(p.part_total, Some(total), "part_total for {subject:?}");
        p
    }

    // ------------------------------------------------------------------
    // Pattern 1 — parenthesised fraction
    // ------------------------------------------------------------------

    #[test]
    fn paren_fraction_basic() {
        let p = assert_marker("bigfile.rar (1/5)", 1, 5);
        assert_eq!(p.base_subject, "bigfile.rar");
    }

    #[test]
    fn paren_fraction_leading_zero() {
        let p = assert_marker("filename.tar.gz (03/17)", 3, 17);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn paren_fraction_spaces_inside() {
        assert_marker("file.zip ( 2 / 7 )", 2, 7);
    }

    // ------------------------------------------------------------------
    // Pattern 2 — bracketed fraction
    // ------------------------------------------------------------------

    #[test]
    fn bracket_fraction_basic() {
        let p = assert_marker("image.jpg [2/4]", 2, 4);
        assert_eq!(p.base_subject, "image.jpg");
    }

    #[test]
    fn bracket_fraction_not_binary_tag() {
        // [BINARY] must NOT be parsed as a fraction; the later
        // "Part N of M" marker supplies the numbers instead.
        assert_marker("[BINARY] filename - Part 3 of 12", 3, 12);
    }

    // ------------------------------------------------------------------
    // Pattern 3 — "Part N/M"
    // ------------------------------------------------------------------

    #[test]
    fn part_slash_basic() {
        assert_marker("file.zip Part 3/17", 3, 17);
    }

    #[test]
    fn part_slash_lowercase() {
        assert_marker("file.tar.gz part 2/5", 2, 5);
    }

    // ------------------------------------------------------------------
    // Pattern 4 — "Part N of M"
    // ------------------------------------------------------------------

    #[test]
    fn part_of_with_spaces() {
        assert_marker("file.zip Part 03 of 17", 3, 17);
    }

    #[test]
    fn part_of_no_spaces() {
        assert_marker("archive.tar.gz part3of17", 3, 17);
    }

    #[test]
    fn binary_tag_part_of() {
        // The bracketed tag is not a marker, so it stays in the base
        // subject; only the "Part N of M" span and the dangling dash go.
        let p = assert_marker("[BINARY] filename - Part 3 of 12", 3, 12);
        assert_eq!(p.base_subject, "[BINARY] filename");
    }

    // ------------------------------------------------------------------
    // Pattern 5 — dash-separated fraction
    // ------------------------------------------------------------------

    #[test]
    fn dash_fraction() {
        let p = assert_marker("filename.tar.gz - 03/17", 3, 17);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part 0 (TOC)
    // ------------------------------------------------------------------

    #[test]
    fn part_zero_toc() {
        assert_marker("filename.tar.gz (00/17)", 0, 17);
    }

    // ------------------------------------------------------------------
    // yEnc → None
    // ------------------------------------------------------------------

    #[test]
    fn yenc_returns_none() {
        assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
    }

    #[test]
    fn yenc_uppercase_returns_none() {
        assert!(parse_subject("\"file.nfo\" YENC (1/3)").is_none());
    }

    // ------------------------------------------------------------------
    // No marker
    // ------------------------------------------------------------------

    #[test]
    fn no_marker_returns_some_none_fields() {
        let p = parts("plain subject");
        assert_eq!(p.base_subject, "plain subject");
        assert_eq!(p.part_index, None);
        assert_eq!(p.part_total, None);
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_returns_none() {
        assert!(parse_subject("").is_none());
    }

    // ------------------------------------------------------------------
    // Re: / Fwd: prefix stripping
    // ------------------------------------------------------------------

    #[test]
    fn re_prefix_stripped() {
        let p = assert_marker("Re: filename.tar.gz (03/17)", 3, 17);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fwd_prefix_stripped() {
        let p = assert_marker("Fwd: filename.tar.gz (03/17)", 3, 17);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fw_prefix_stripped() {
        let p = assert_marker("Fw: filename.tar.gz (03/17)", 3, 17);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn nested_re_prefix_stripped() {
        let p = assert_marker("Re: Re: filename.tar.gz (03/17)", 3, 17);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Unicode stem — must not panic
    // ------------------------------------------------------------------

    #[test]
    fn unicode_stem_no_panic() {
        let p = assert_marker("日本語ファイル (1/3)", 1, 3);
        assert!(p.base_subject.contains('日'));
    }

    // ------------------------------------------------------------------
    // base_subject trimming
    // ------------------------------------------------------------------

    #[test]
    fn base_subject_trailing_dash_stripped() {
        // Removing "Part 3 of 12" leaves "filename.tar.gz -"; the dangling
        // dash and the space before it must both be trimmed away.
        let p = assert_marker("filename.tar.gz - Part 3 of 12", 3, 12);
        assert_eq!(p.base_subject, "filename.tar.gz");
    }
}
uuencoding_multi/subject.rs

uuencoding_multi/
subject.rs