Skip to main content

uuencoding_multi/
subject.rs

1use std::sync::OnceLock;
2
3use regex::Regex;
4
5use crate::SubjectParts;
6
7// ---------------------------------------------------------------------------
8// Compiled-once regex patterns
9// ---------------------------------------------------------------------------
10//
11// Pattern priority (most specific first):
12//   1. Parenthesised fraction:  (03/17)
13//   2. Bracketed fraction:      [2/4]   — only when both sides are digits
14//   3. English "part N/M":      Part 3/17
15//   4. English "part N of M":   Part 03 of 17 / part3of17
16//   5. Dash-separated fraction: - 03/17
17//
18// All patterns are compiled once into a static array via `OnceLock`.
19
/// One compiled part-marker pattern.
///
/// In every pattern built by `patterns()`, capture group 1 is the part index
/// and capture group 2 is the part total.
struct Pattern {
    // Compiled regex for this marker format.
    re: Regex,
}
23
24fn patterns() -> &'static [Pattern; 5] {
25    static PATTERNS: OnceLock<[Pattern; 5]> = OnceLock::new();
26    PATTERNS.get_or_init(|| {
27        [
28            // 1. Parenthesised fraction: (03/17) or ( 3 / 17 )
29            Pattern {
30                re: Regex::new(r"\(\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\)").unwrap(),
31            },
32            // 2. Bracketed fraction: [2/4] — require digit on both sides so
33            //    [BINARY] is not matched.
34            Pattern {
35                re: Regex::new(r"\[\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\]").unwrap(),
36            },
37            // 3. English "Part N/M" (case-insensitive via (?i))
38            Pattern {
39                re: Regex::new(r"(?i)\bpart\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
40            },
41            // 4. English "Part N of M" / "Part3of17" (case-insensitive)
42            Pattern {
43                re: Regex::new(r"(?i)\bpart\s*(\d{1,6})\s*of\s*(\d{1,6})\b").unwrap(),
44            },
45            // 5. Dash-separated fraction: " - 03/17"
46            Pattern {
47                re: Regex::new(r"\s+-\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
48            },
49        ]
50    })
51}
52
53// Matches a yEnc marker at a word boundary (case-insensitive).
54fn yenc_re() -> &'static Regex {
55    static RE: OnceLock<Regex> = OnceLock::new();
56    RE.get_or_init(|| Regex::new(r"(?i)\byenc\b").unwrap())
57}
58
59// Strips common reply/forward prefixes (case-insensitive) repeatedly.
60fn strip_prefixes(s: &str) -> &str {
61    static RE: OnceLock<Regex> = OnceLock::new();
62    let re = RE.get_or_init(|| Regex::new(r"(?i)^(re|fwd?)\s*:\s*").unwrap());
63
64    let mut cur = s;
65    loop {
66        let stripped = re.find(cur).map(|m| &cur[m.end()..]).unwrap_or(cur);
67        if stripped.len() == cur.len() {
68            break;
69        }
70        cur = stripped;
71    }
72    cur
73}
74
75// ---------------------------------------------------------------------------
76// Public API
77// ---------------------------------------------------------------------------
78
79/// Parse a multi-part Usenet/email subject line.
80///
81/// Recognises five marker formats (in priority order):
82/// 1. Parenthesised fraction: `(03/17)` or `( 3 / 17 )`
83/// 2. Bracketed fraction: `[2/4]` (only when both sides are digits)
84/// 3. English "Part N/M" (case-insensitive)
85/// 4. English "Part N of M" / `part3of17` (case-insensitive)
86/// 5. Dash-separated fraction: ` - 03/17`
87///
88/// Leading `Re:`, `Fwd:`, and `Fw:` prefixes are stripped before matching
89/// (repeatedly, to handle nested re-forwards). The extracted part marker is
90/// removed from the subject to produce `base_subject`.
91///
92/// # Return value
93///
94/// Returns `None` only when:
95/// - `subject` is empty, or
96/// - `subject` contains a `yEnc` marker (those posts use a distinct encoding
97///   that is explicitly out of scope for this crate).
98///
99/// Otherwise returns `Some(SubjectParts)`. When no part-marker pattern
100/// matches, `part_index` and `part_total` are both `None` and `base_subject`
101/// is the prefix-stripped, trimmed input.
102///
103/// # Zero totals
104///
105/// A subject like `"file.bin (3/0)"` produces `part_total = Some(0)`. This is
106/// nonsensical but is passed through verbatim since the crate cannot know
107/// whether the source is malformed or intentional. Callers that pass
108/// `part_total` directly to [`PartCollection::with_total`][crate::PartCollection::with_total]
109/// should validate that the total is non-zero before doing so.
110///
111/// # Never panics
112///
113/// This function never panics on any input, including strings containing
114/// arbitrary Unicode code points.
115///
116/// # Examples
117///
118/// ```
119/// use uuencoding_multi::parse_subject;
120///
121/// // Parenthesised fraction — the most common Usenet format.
122/// let sp = parse_subject("bigfile.rar (2/5)").unwrap();
123/// assert_eq!(sp.part_index, Some(2));
124/// assert_eq!(sp.part_total, Some(5));
125/// assert_eq!(sp.base_subject, "bigfile.rar");
126/// ```
127///
128/// ```
129/// use uuencoding_multi::parse_subject;
130///
131/// // Re: prefix is stripped before matching.
132/// let sp = parse_subject("Re: archive.tar.gz (03/17)").unwrap();
133/// assert_eq!(sp.part_index, Some(3));
134/// assert_eq!(sp.part_total, Some(17));
135/// ```
136///
137/// ```
138/// use uuencoding_multi::parse_subject;
139///
140/// // yEnc subject → None (out of scope for this crate).
141/// assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
142/// ```
143///
144/// ```
145/// use uuencoding_multi::parse_subject;
146///
147/// // Empty input → None.
148/// assert!(parse_subject("").is_none());
149/// ```
150///
151/// ```
152/// use uuencoding_multi::parse_subject;
153///
154/// // No marker → Some with None fields and subject preserved.
155/// let sp = parse_subject("just a plain subject").unwrap();
156/// assert_eq!(sp.part_index, None);
157/// assert_eq!(sp.part_total, None);
158/// assert_eq!(sp.base_subject, "just a plain subject");
159/// ```
160pub fn parse_subject(subject: &str) -> Option<SubjectParts> {
161    if subject.is_empty() {
162        return None;
163    }
164
165    // yEnc posts are out of scope.
166    if yenc_re().is_match(subject) {
167        return None;
168    }
169
170    let stripped = strip_prefixes(subject).trim();
171
172    for pat in patterns() {
173        if let Some(caps) = pat.re.captures(stripped) {
174            // Capture group 1 is always the part index.
175            // Use `continue` (not `?`) so that a failed parse on one pattern
176            // does not exit the function — we try the remaining patterns instead.
177            // In practice the regex limits captures to 6 digits (max 999999),
178            // which is always within u32 range, so parse failure is impossible
179            // with current regexes.
180            let part_index: u32 = match caps[1].parse() {
181                Ok(n) => n,
182                Err(_) => continue,
183            };
184            // Capture group 2 is always the total (all five patterns have it).
185            let part_total: u32 = match caps[2].parse() {
186                Ok(n) => n,
187                Err(_) => continue,
188            };
189
190            // Build base_subject: remove the matched span from `stripped`.
191            let m = caps.get(0).unwrap();
192            let before = stripped[..m.start()].trim_end();
193            let after = stripped[m.end()..].trim_start();
194
195            let raw = if before.is_empty() {
196                after.to_string()
197            } else if after.is_empty() {
198                before.to_string()
199            } else {
200                format!("{} {}", before, after)
201            };
202
203            // Strip trailing " -" artifact (e.g. "filename.tar.gz -").
204            let base_subject = raw
205                .trim_end_matches(|c: char| c == '-' || c.is_whitespace())
206                .trim()
207                .to_string();
208
209            return Some(SubjectParts {
210                base_subject,
211                part_index: Some(part_index),
212                part_total: Some(part_total),
213            });
214        }
215    }
216
217    // No marker found.
218    Some(SubjectParts {
219        base_subject: stripped.to_string(),
220        part_index: None,
221        part_total: None,
222    })
223}
224
225// ---------------------------------------------------------------------------
226// Tests
227// ---------------------------------------------------------------------------
228
#[cfg(test)]
mod tests {
    use super::*;

    // Parses `subject` and unwraps; only for inputs expected to yield `Some`.
    fn parts(subject: &str) -> SubjectParts {
        parse_subject(subject).unwrap()
    }

    // ------------------------------------------------------------------
    // Pattern 1 — parenthesised fraction
    // ------------------------------------------------------------------

    #[test]
    fn paren_fraction_basic() {
        let p = parts("bigfile.rar (1/5)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "bigfile.rar");
    }

    #[test]
    fn paren_fraction_leading_zero() {
        let p = parts("filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn paren_fraction_spaces_inside() {
        let p = parts("file.zip ( 2 / 7 )");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(7));
        assert_eq!(p.base_subject, "file.zip");
    }

    // ------------------------------------------------------------------
    // Pattern 2 — bracketed fraction
    // ------------------------------------------------------------------

    #[test]
    fn bracket_fraction_basic() {
        let p = parts("image.jpg [2/4]");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(4));
        assert_eq!(p.base_subject, "image.jpg");
    }

    #[test]
    fn bracket_fraction_not_binary_tag() {
        // A non-numeric bracket tag on its own is not a part marker.
        let tag_only = parts("[BINARY] filename");
        assert_eq!(tag_only.part_index, None);
        assert_eq!(tag_only.part_total, None);
        assert_eq!(tag_only.base_subject, "[BINARY] filename");

        // With a real marker present, the tag stays in the base subject and
        // the marker is taken from the "Part N of M" text.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
    }

    // ------------------------------------------------------------------
    // Pattern 3 — "Part N/M"
    // ------------------------------------------------------------------

    #[test]
    fn part_slash_basic() {
        let p = parts("file.zip Part 3/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn part_slash_lowercase() {
        let p = parts("file.tar.gz part 2/5");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "file.tar.gz");
    }

    // ------------------------------------------------------------------
    // Pattern 4 — "Part N of M"
    // ------------------------------------------------------------------

    #[test]
    fn part_of_with_spaces() {
        let p = parts("file.zip Part 03 of 17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn part_of_no_spaces() {
        let p = parts("archive.tar.gz part3of17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "archive.tar.gz");
    }

    #[test]
    fn binary_tag_part_of() {
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
        // The " -" separator in front of the marker must not survive.
        assert_eq!(p.base_subject, "[BINARY] filename");
    }

    // ------------------------------------------------------------------
    // Pattern 5 — dash-separated fraction
    // ------------------------------------------------------------------

    #[test]
    fn dash_fraction() {
        let p = parts("filename.tar.gz - 03/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part 0 (TOC)
    // ------------------------------------------------------------------

    #[test]
    fn part_zero_toc() {
        let p = parts("filename.tar.gz (00/17)");
        assert_eq!(p.part_index, Some(0));
        assert_eq!(p.part_total, Some(17));
    }

    // ------------------------------------------------------------------
    // yEnc → None
    // ------------------------------------------------------------------

    #[test]
    fn yenc_returns_none() {
        assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
    }

    #[test]
    fn yenc_uppercase_returns_none() {
        assert!(parse_subject("\"file.nfo\" YENC (1/3)").is_none());
    }

    // ------------------------------------------------------------------
    // No marker
    // ------------------------------------------------------------------

    #[test]
    fn no_marker_returns_some_none_fields() {
        let p = parts("plain subject");
        assert_eq!(p.base_subject, "plain subject");
        assert_eq!(p.part_index, None);
        assert_eq!(p.part_total, None);
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_returns_none() {
        assert!(parse_subject("").is_none());
    }

    // ------------------------------------------------------------------
    // Re: / Fwd: prefix stripping
    // ------------------------------------------------------------------

    #[test]
    fn re_prefix_stripped() {
        let p = parts("Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fwd_prefix_stripped() {
        let p = parts("Fwd: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fw_prefix_stripped() {
        let p = parts("Fw: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn nested_re_prefix_stripped() {
        let p = parts("Re: Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");

        // Mixed prefixes in mixed case are stripped as well.
        let mixed = parts("Re: Fwd: RE: filename.tar.gz (03/17)");
        assert_eq!(mixed.part_index, Some(3));
        assert_eq!(mixed.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Unicode stem — must not panic
    // ------------------------------------------------------------------

    #[test]
    fn unicode_stem_no_panic() {
        let p = parts("日本語ファイル (1/3)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(3));
        assert!(p.base_subject.contains('日'));
    }

    // ------------------------------------------------------------------
    // base_subject trimming
    // ------------------------------------------------------------------

    #[test]
    fn base_subject_trailing_dash_stripped() {
        // No separator in the input: nothing to strip, nothing left over.
        let plain = parts("myfile.bin (2/5)");
        assert!(!plain.base_subject.ends_with('-'));
        assert!(!plain.base_subject.ends_with(' '));

        // A " - " separator in front of a parenthesised marker leaves a
        // dangling " -" once the marker is removed; it must be trimmed.
        let dashed = parts("myfile.bin - (2/5)");
        assert_eq!(dashed.part_index, Some(2));
        assert_eq!(dashed.part_total, Some(5));
        assert_eq!(dashed.base_subject, "myfile.bin");
    }
}
437}