Skip to main content

uuencoding_multi/
subject.rs

1use std::sync::OnceLock;
2
3use regex::Regex;
4
/// Fields extracted from a parsed Usenet/email subject line.
///
/// Returned by [`parse_subject`]. The `base_subject` field can be used as a
/// stable grouping key across parts of the same series.
///
/// The type is `Clone` and comparable with `==`, so parsed results can be
/// stored, de-duplicated, and asserted on as whole values.
///
/// # Field invariants
///
/// - `part_total` is always `Some` when `part_index` is `Some`, because every
///   supported marker format includes the total count.
/// - `base_subject` is never empty.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SubjectParts {
    /// The subject with all `Re:`/`Fwd:` prefixes and the part-number marker stripped.
    /// Never empty; [`parse_subject`] returns `None` rather than returning an empty
    /// `base_subject`.
    pub base_subject: String,
    /// 1-based part number extracted from the marker. `Some(0)` indicates a
    /// TOC post (e.g. `(00/17)`). `None` when no recognised marker was found.
    pub part_index: Option<u32>,
    /// Total number of parts as declared in the subject marker.
    /// Always `Some` when `part_index` is `Some`; `None` otherwise.
    pub part_total: Option<u32>,
}
27
28// ---------------------------------------------------------------------------
29// Compiled-once regex patterns
30// ---------------------------------------------------------------------------
31//
32// Pattern priority (most specific first):
33//   1. Parenthesised fraction:  (03/17)
34//   2. Bracketed fraction:      [2/4]   — only when both sides are digits
35//   3. English "part N/M":      Part 3/17
36//   4. English "part N of M":   Part 03 of 17 / part3of17
37//   5. Dash-separated fraction: - 03/17
38//
39// All patterns are compiled once into a static array via `OnceLock`.
40
41struct Pattern {
42    re: Regex,
43}
44
45fn patterns() -> &'static [Pattern; 5] {
46    static PATTERNS: OnceLock<[Pattern; 5]> = OnceLock::new();
47    PATTERNS.get_or_init(|| {
48        [
49            // 1. Parenthesised fraction: (03/17) or ( 3 / 17 )
50            Pattern {
51                re: Regex::new(r"\([ \t]*([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})[ \t]*\)").unwrap(),
52            },
53            // 2. Bracketed fraction: [2/4] — require digit on both sides so
54            //    [BINARY] is not matched.
55            Pattern {
56                re: Regex::new(r"\[[ \t]*([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})[ \t]*\]").unwrap(),
57            },
58            // 3. English "Part N/M" (case-insensitive)
59            Pattern {
60                re: Regex::new(r"(?i)\bpart[ \t]+([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})\b").unwrap(),
61            },
62            // 4. English "Part N of M" / "Part3of17" (case-insensitive)
63            Pattern {
64                re: Regex::new(r"(?i)\bpart[ \t]*([0-9]{1,6})[ \t]*of[ \t]*([0-9]{1,6})\b")
65                    .unwrap(),
66            },
67            // 5. Dash-separated fraction: " - 03/17"
68            Pattern {
69                re: Regex::new(r"[ \t]+-[ \t]+([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})\b").unwrap(),
70            },
71        ]
72    })
73}
74
75// Matches a yEnc marker at a word boundary (case-insensitive).
76fn yenc_re() -> &'static Regex {
77    static RE: OnceLock<Regex> = OnceLock::new();
78    RE.get_or_init(|| Regex::new(r"(?i)\byenc\b").unwrap())
79}
80
81// Strips common reply/forward prefixes (case-insensitive) repeatedly.
82fn strip_prefixes(s: &str) -> &str {
83    static RE: OnceLock<Regex> = OnceLock::new();
84    let re = RE.get_or_init(|| Regex::new(r"(?i)^(re|fwd?)[ \t]*:[ \t]*").unwrap());
85
86    let mut cur = s;
87    loop {
88        let stripped = re.find(cur).map(|m| &cur[m.end()..]).unwrap_or(cur);
89        if stripped.len() == cur.len() {
90            break;
91        }
92        cur = stripped;
93    }
94    cur
95}
96
97// ---------------------------------------------------------------------------
98// Public API
99// ---------------------------------------------------------------------------
100
101/// Parse a multi-part Usenet/email subject line.
102///
103/// Recognises five marker formats (in priority order):
104/// 1. Parenthesised fraction: `(03/17)` or `( 3 / 17 )`
105/// 2. Bracketed fraction: `[2/4]` (only when both sides are digits)
106/// 3. English "Part N/M" (case-insensitive)
107/// 4. English "Part N of M" / `part3of17` (case-insensitive)
108/// 5. Dash-separated fraction: ` - 03/17`
109///
110/// Leading `Re:`, `Fwd:`, and `Fw:` prefixes are stripped before matching
111/// (repeatedly, to handle nested re-forwards). The extracted part marker is
112/// removed from the subject to produce `base_subject`.
113///
114/// # First-match-wins
115///
116/// The five patterns are tried in the priority order listed above. The first
117/// pattern that matches wins; no attempt is made to find a "better" match
118/// further along. If a subject line contains multiple markers
119/// (e.g. `"file (1/3) [2/4]"`), only the first one matched — the
120/// parenthesised fraction in that example — is used and the rest are left in
121/// `base_subject`.
122///
123/// # Return value
124///
125/// Returns `None` only when:
126/// - `subject` is empty, or
127/// - `subject` contains a `yEnc` marker (those posts use a distinct encoding
128///   that is explicitly out of scope for this crate), or
129/// - the entire input is consumed by the part-marker pattern, leaving no
130///   base subject (e.g. `"(1/3)"` with nothing else). The invariant that
131///   `base_subject` is never empty must hold whenever `Some` is returned.
132///
133/// Otherwise returns `Some(SubjectParts)`. When no part-marker pattern
134/// matches, `part_index` and `part_total` are both `None` and `base_subject`
135/// is the prefix-stripped, trimmed input.
136///
137/// # Zero totals
138///
139/// A subject like `"file.bin (3/0)"` produces `part_total = Some(0)`. This is
140/// nonsensical but is passed through verbatim since the crate cannot know
141/// whether the source is malformed or intentional. Callers that pass
142/// `part_total` directly to [`PartCollection::with_total`][crate::PartCollection::with_total]
143/// should validate that the total is non-zero before doing so.
144///
145/// # Never panics
146///
147/// This function never panics on any input, including strings containing
148/// arbitrary Unicode code points.
149///
150/// # Examples
151///
152/// ```
153/// use uuencoding_multi::parse_subject;
154///
155/// // Parenthesised fraction — the most common Usenet format.
156/// let sp = parse_subject("bigfile.rar (2/5)").unwrap();
157/// assert_eq!(sp.part_index, Some(2));
158/// assert_eq!(sp.part_total, Some(5));
159/// assert_eq!(sp.base_subject, "bigfile.rar");
160/// ```
161///
162/// ```
163/// use uuencoding_multi::parse_subject;
164///
165/// // Re: prefix is stripped before matching.
166/// let sp = parse_subject("Re: archive.tar.gz (03/17)").unwrap();
167/// assert_eq!(sp.part_index, Some(3));
168/// assert_eq!(sp.part_total, Some(17));
169/// ```
170///
171/// ```
172/// use uuencoding_multi::parse_subject;
173///
174/// // yEnc subject → None (out of scope for this crate).
175/// assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
176/// ```
177///
178/// ```
179/// use uuencoding_multi::parse_subject;
180///
181/// // Empty input → None.
182/// assert!(parse_subject("").is_none());
183/// ```
184///
185/// ```
186/// use uuencoding_multi::parse_subject;
187///
188/// // No marker → Some with None fields and subject preserved.
189/// let sp = parse_subject("just a plain subject").unwrap();
190/// assert_eq!(sp.part_index, None);
191/// assert_eq!(sp.part_total, None);
192/// assert_eq!(sp.base_subject, "just a plain subject");
193/// ```
194pub fn parse_subject(subject: &str) -> Option<SubjectParts> {
195    if subject.is_empty() {
196        return None;
197    }
198
199    // yEnc posts are out of scope.
200    if yenc_re().is_match(subject) {
201        return None;
202    }
203
204    let stripped = strip_prefixes(subject).trim();
205
206    if stripped.is_empty() {
207        return None;
208    }
209
210    for pat in patterns() {
211        if let Some(caps) = pat.re.captures(stripped) {
212            // Capture group 1 is always the part index.
213            // Use `continue` (not `?`) so that a failed parse on one pattern
214            // does not exit the function — we try the remaining patterns instead.
215            // In practice the regex limits captures to 6 digits (max 999999),
216            // which is always within u32 range, so parse failure is impossible
217            // with current regexes.
218            let part_index: u32 = match caps[1].parse() {
219                Ok(n) => n,
220                Err(_) => continue,
221            };
222            // Capture group 2 is always the total (all five patterns have it).
223            let part_total: u32 = match caps[2].parse() {
224                Ok(n) => n,
225                Err(_) => continue,
226            };
227
228            // Build base_subject: remove the matched span from `stripped`.
229            let m = caps.get(0).unwrap();
230            let before = stripped[..m.start()].trim_end();
231            let after = stripped[m.end()..].trim_start();
232
233            let raw = if before.is_empty() {
234                after.to_string()
235            } else if after.is_empty() {
236                before.to_string()
237            } else {
238                format!("{} {}", before, after)
239            };
240
241            // Strip trailing " -" artifact (e.g. "filename.tar.gz -").
242            let base_subject = raw
243                .trim_end_matches(|c: char| c == '-' || c.is_whitespace())
244                .trim()
245                .to_string();
246
247            // Invariant: base_subject must not be empty. A subject that
248            // consists solely of a part marker (e.g. "(1/3)") has no
249            // meaningful base; treat it as non-parseable.
250            if base_subject.is_empty() {
251                return None;
252            }
253
254            return Some(SubjectParts {
255                base_subject,
256                part_index: Some(part_index),
257                part_total: Some(part_total),
258            });
259        }
260    }
261
262    // No marker found.
263    Some(SubjectParts {
264        base_subject: stripped.to_string(),
265        part_index: None,
266        part_total: None,
267    })
268}
269
270// ---------------------------------------------------------------------------
271// Tests
272// ---------------------------------------------------------------------------
273
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `subject`, panicking if `parse_subject` returns `None`.
    fn parts(subject: &str) -> SubjectParts {
        parse_subject(subject).unwrap()
    }

    /// Asserts that `subject` parses to the given part index, total and
    /// base subject. Failures are reported at the calling test's line.
    #[track_caller]
    fn assert_parsed(subject: &str, index: u32, total: u32, base: &str) {
        let p = parts(subject);
        assert_eq!(p.part_index, Some(index), "part_index of {subject:?}");
        assert_eq!(p.part_total, Some(total), "part_total of {subject:?}");
        assert_eq!(p.base_subject, base, "base_subject of {subject:?}");
    }

    // ------------------------------------------------------------------
    // Pattern 1 — parenthesised fraction
    // ------------------------------------------------------------------

    #[test]
    fn paren_fraction_basic() {
        assert_parsed("bigfile.rar (1/5)", 1, 5, "bigfile.rar");
    }

    #[test]
    fn paren_fraction_leading_zero() {
        assert_parsed("filename.tar.gz (03/17)", 3, 17, "filename.tar.gz");
    }

    #[test]
    fn paren_fraction_spaces_inside() {
        assert_parsed("file.zip ( 2 / 7 )", 2, 7, "file.zip");
    }

    // ------------------------------------------------------------------
    // Pattern 2 — bracketed fraction
    // ------------------------------------------------------------------

    #[test]
    fn bracket_fraction_basic() {
        assert_parsed("image.jpg [2/4]", 2, 4, "image.jpg");
    }

    #[test]
    fn bracket_fraction_not_binary_tag() {
        // [BINARY] must NOT be parsed as a fraction; the later
        // "Part N of M" marker supplies the numbers instead.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
    }

    // ------------------------------------------------------------------
    // Pattern 3 — "Part N/M"
    // ------------------------------------------------------------------

    #[test]
    fn part_slash_basic() {
        assert_parsed("file.zip Part 3/17", 3, 17, "file.zip");
    }

    #[test]
    fn part_slash_lowercase() {
        assert_parsed("file.tar.gz part 2/5", 2, 5, "file.tar.gz");
    }

    // ------------------------------------------------------------------
    // Pattern 4 — "Part N of M"
    // ------------------------------------------------------------------

    #[test]
    fn part_of_with_spaces() {
        assert_parsed("file.zip Part 03 of 17", 3, 17, "file.zip");
    }

    #[test]
    fn part_of_no_spaces() {
        assert_parsed("archive.tar.gz part3of17", 3, 17, "archive.tar.gz");
    }

    #[test]
    fn binary_tag_part_of() {
        // The non-numeric tag stays in the base subject, while the " -"
        // separator left in front of the removed marker is dropped.
        assert_parsed(
            "[BINARY] filename - Part 3 of 12",
            3,
            12,
            "[BINARY] filename",
        );
    }

    // ------------------------------------------------------------------
    // Pattern 5 — dash-separated fraction
    // ------------------------------------------------------------------

    #[test]
    fn dash_fraction() {
        assert_parsed("filename.tar.gz - 03/17", 3, 17, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part 0 (TOC)
    // ------------------------------------------------------------------

    #[test]
    fn part_zero_toc() {
        assert_parsed("filename.tar.gz (00/17)", 0, 17, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Zero total is passed through verbatim
    // ------------------------------------------------------------------

    #[test]
    fn zero_total_passed_through() {
        assert_parsed("file.bin (3/0)", 3, 0, "file.bin");
    }

    // ------------------------------------------------------------------
    // First match wins; later markers stay in base_subject
    // ------------------------------------------------------------------

    #[test]
    fn first_match_wins_leaves_other_markers() {
        assert_parsed("file (1/3) [2/4]", 1, 3, "file [2/4]");
    }

    // ------------------------------------------------------------------
    // yEnc → None
    // ------------------------------------------------------------------

    #[test]
    fn yenc_returns_none() {
        assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
    }

    #[test]
    fn yenc_uppercase_returns_none() {
        assert!(parse_subject("\"file.nfo\" YENC (1/3)").is_none());
    }

    // ------------------------------------------------------------------
    // No marker
    // ------------------------------------------------------------------

    #[test]
    fn no_marker_returns_some_none_fields() {
        let p = parts("plain subject");
        assert_eq!(p.base_subject, "plain subject");
        assert_eq!(p.part_index, None);
        assert_eq!(p.part_total, None);
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_returns_none() {
        assert!(parse_subject("").is_none());
    }

    // ------------------------------------------------------------------
    // Bare marker (entire input is the marker) → None
    // Invariant: base_subject must never be empty.
    // ------------------------------------------------------------------

    #[test]
    fn bare_paren_marker_returns_none() {
        assert!(parse_subject("(1/3)").is_none());
    }

    #[test]
    fn bare_paren_marker_with_spaces_returns_none() {
        assert!(parse_subject("  (1/3)  ").is_none());
    }

    #[test]
    fn bare_bracket_marker_returns_none() {
        assert!(parse_subject("[2/4]").is_none());
    }

    /// A subject that is entirely a "Part N of M" or "Part N/M" marker with no
    /// surrounding text strips to empty after removing the marker.
    /// Invariant: base_subject must never be empty → returns None.
    #[test]
    fn bare_part_marker_only_returns_none() {
        assert!(parse_subject("Part 1 of 3").is_none());
        assert!(parse_subject("Part 1/3").is_none());
    }

    // ------------------------------------------------------------------
    // Re: / Fwd: prefix stripping
    // ------------------------------------------------------------------

    #[test]
    fn re_prefix_stripped() {
        assert_parsed("Re: filename.tar.gz (03/17)", 3, 17, "filename.tar.gz");
    }

    #[test]
    fn fwd_prefix_stripped() {
        assert_parsed("Fwd: filename.tar.gz (03/17)", 3, 17, "filename.tar.gz");
    }

    #[test]
    fn fw_prefix_stripped() {
        assert_parsed("Fw: filename.tar.gz (03/17)", 3, 17, "filename.tar.gz");
    }

    #[test]
    fn nested_re_prefix_stripped() {
        assert_parsed("Re: Re: filename.tar.gz (03/17)", 3, 17, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Unicode stem — must not panic
    // ------------------------------------------------------------------

    #[test]
    fn unicode_stem_no_panic() {
        assert_parsed("日本語ファイル (1/3)", 1, 3, "日本語ファイル");
    }

    // ------------------------------------------------------------------
    // base_subject trimming
    // ------------------------------------------------------------------

    #[test]
    fn base_subject_trailing_dash_stripped() {
        // No artifact to strip: the base is returned untouched.
        let p = parts("myfile.bin (2/5)");
        assert!(!p.base_subject.ends_with('-'));
        assert!(!p.base_subject.ends_with(' '));

        // Removing the marker leaves "myfile.bin -"; the dangling separator
        // and the whitespace before it must be dropped.
        assert_parsed("myfile.bin - (2/5)", 2, 5, "myfile.bin");
    }

    // ------------------------------------------------------------------
    // All-prefix input stripped to empty → None
    // Invariant: base_subject must never be empty.
    // ------------------------------------------------------------------

    #[test]
    fn parse_subject_returns_none_for_all_prefix_input() {
        assert!(parse_subject("Re: ").is_none());
        assert!(parse_subject("Fwd: Re: ").is_none());
        assert!(parse_subject("   ").is_none());
    }
}
523}