uuencoding_multi/subject.rs
1use std::sync::OnceLock;
2
3use regex::Regex;
4
/// Fields extracted from a parsed Usenet/email subject line.
///
/// Returned by [`parse_subject`]. The `base_subject` field can be used as a
/// stable grouping key across parts of the same series.
///
/// The type is cheap to clone and compares field-by-field, so whole values can
/// be checked with `assert_eq!` or stored in hashed collections.
///
/// # Field invariants
///
/// - `part_total` is always `Some` when `part_index` is `Some`, because every
///   supported marker format includes the total count.
/// - `base_subject` is never empty when produced by [`parse_subject`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SubjectParts {
    /// The subject with all `Re:`/`Fwd:` prefixes and the part-number marker
    /// stripped. Never empty; [`parse_subject`] returns `None` rather than
    /// returning an empty `base_subject`.
    pub base_subject: String,
    /// 1-based part number extracted from the marker. `Some(0)` indicates a
    /// TOC post (e.g. `(00/17)`). `None` when no recognised marker was found.
    pub part_index: Option<u32>,
    /// Total number of parts as declared in the subject marker.
    /// Always `Some` when `part_index` is `Some`; `None` otherwise.
    pub part_total: Option<u32>,
}
27
28// ---------------------------------------------------------------------------
29// Compiled-once regex patterns
30// ---------------------------------------------------------------------------
31//
32// Pattern priority (most specific first):
33// 1. Parenthesised fraction: (03/17)
34// 2. Bracketed fraction: [2/4] — only when both sides are digits
35// 3. English "part N/M": Part 3/17
36// 4. English "part N of M": Part 03 of 17 / part3of17
37// 5. Dash-separated fraction: - 03/17
38//
39// All patterns are compiled once into a static array via `OnceLock`.
40
41struct Pattern {
42 re: Regex,
43}
44
45fn patterns() -> &'static [Pattern; 5] {
46 static PATTERNS: OnceLock<[Pattern; 5]> = OnceLock::new();
47 PATTERNS.get_or_init(|| {
48 [
49 // 1. Parenthesised fraction: (03/17) or ( 3 / 17 )
50 Pattern {
51 re: Regex::new(r"\([ \t]*([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})[ \t]*\)").unwrap(),
52 },
53 // 2. Bracketed fraction: [2/4] — require digit on both sides so
54 // [BINARY] is not matched.
55 Pattern {
56 re: Regex::new(r"\[[ \t]*([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})[ \t]*\]").unwrap(),
57 },
58 // 3. English "Part N/M" (case-insensitive)
59 Pattern {
60 re: Regex::new(r"(?i)\bpart[ \t]+([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})\b").unwrap(),
61 },
62 // 4. English "Part N of M" / "Part3of17" (case-insensitive)
63 Pattern {
64 re: Regex::new(r"(?i)\bpart[ \t]*([0-9]{1,6})[ \t]*of[ \t]*([0-9]{1,6})\b")
65 .unwrap(),
66 },
67 // 5. Dash-separated fraction: " - 03/17"
68 Pattern {
69 re: Regex::new(r"[ \t]+-[ \t]+([0-9]{1,6})[ \t]*/[ \t]*([0-9]{1,6})\b").unwrap(),
70 },
71 ]
72 })
73}
74
75// Matches a yEnc marker at a word boundary (case-insensitive).
76fn yenc_re() -> &'static Regex {
77 static RE: OnceLock<Regex> = OnceLock::new();
78 RE.get_or_init(|| Regex::new(r"(?i)\byenc\b").unwrap())
79}
80
81// Strips common reply/forward prefixes (case-insensitive) repeatedly.
82fn strip_prefixes(s: &str) -> &str {
83 static RE: OnceLock<Regex> = OnceLock::new();
84 let re = RE.get_or_init(|| Regex::new(r"(?i)^(re|fwd?)[ \t]*:[ \t]*").unwrap());
85
86 let mut cur = s;
87 loop {
88 let stripped = re.find(cur).map(|m| &cur[m.end()..]).unwrap_or(cur);
89 if stripped.len() == cur.len() {
90 break;
91 }
92 cur = stripped;
93 }
94 cur
95}
96
97// ---------------------------------------------------------------------------
98// Public API
99// ---------------------------------------------------------------------------
100
101/// Parse a multi-part Usenet/email subject line.
102///
103/// Recognises five marker formats (in priority order):
104/// 1. Parenthesised fraction: `(03/17)` or `( 3 / 17 )`
105/// 2. Bracketed fraction: `[2/4]` (only when both sides are digits)
106/// 3. English "Part N/M" (case-insensitive)
107/// 4. English "Part N of M" / `part3of17` (case-insensitive)
108/// 5. Dash-separated fraction: ` - 03/17`
109///
110/// Leading `Re:`, `Fwd:`, and `Fw:` prefixes are stripped before matching
111/// (repeatedly, to handle nested re-forwards). The extracted part marker is
112/// removed from the subject to produce `base_subject`.
113///
114/// # First-match-wins
115///
116/// The five patterns are tried in the priority order listed above. The first
117/// pattern that matches wins; no attempt is made to find a "better" match
118/// further along. If a subject line contains multiple markers
119/// (e.g. `"file (1/3) [2/4]"`), only the first one matched — the
120/// parenthesised fraction in that example — is used and the rest are left in
121/// `base_subject`.
122///
123/// # Return value
124///
125/// Returns `None` only when:
126/// - `subject` is empty, or
127/// - `subject` contains a `yEnc` marker (those posts use a distinct encoding
128/// that is explicitly out of scope for this crate), or
129/// - the entire input is consumed by the part-marker pattern, leaving no
130/// base subject (e.g. `"(1/3)"` with nothing else). The invariant that
131/// `base_subject` is never empty must hold whenever `Some` is returned.
132///
133/// Otherwise returns `Some(SubjectParts)`. When no part-marker pattern
134/// matches, `part_index` and `part_total` are both `None` and `base_subject`
135/// is the prefix-stripped, trimmed input.
136///
137/// # Zero totals
138///
139/// A subject like `"file.bin (3/0)"` produces `part_total = Some(0)`. This is
140/// nonsensical but is passed through verbatim since the crate cannot know
141/// whether the source is malformed or intentional. Callers that pass
142/// `part_total` directly to [`PartCollection::with_total`][crate::PartCollection::with_total]
143/// should validate that the total is non-zero before doing so.
144///
145/// # Never panics
146///
147/// This function never panics on any input, including strings containing
148/// arbitrary Unicode code points.
149///
150/// # Examples
151///
152/// ```
153/// use uuencoding_multi::parse_subject;
154///
155/// // Parenthesised fraction — the most common Usenet format.
156/// let sp = parse_subject("bigfile.rar (2/5)").unwrap();
157/// assert_eq!(sp.part_index, Some(2));
158/// assert_eq!(sp.part_total, Some(5));
159/// assert_eq!(sp.base_subject, "bigfile.rar");
160/// ```
161///
162/// ```
163/// use uuencoding_multi::parse_subject;
164///
165/// // Re: prefix is stripped before matching.
166/// let sp = parse_subject("Re: archive.tar.gz (03/17)").unwrap();
167/// assert_eq!(sp.part_index, Some(3));
168/// assert_eq!(sp.part_total, Some(17));
169/// ```
170///
171/// ```
172/// use uuencoding_multi::parse_subject;
173///
174/// // yEnc subject → None (out of scope for this crate).
175/// assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
176/// ```
177///
178/// ```
179/// use uuencoding_multi::parse_subject;
180///
181/// // Empty input → None.
182/// assert!(parse_subject("").is_none());
183/// ```
184///
185/// ```
186/// use uuencoding_multi::parse_subject;
187///
188/// // No marker → Some with None fields and subject preserved.
189/// let sp = parse_subject("just a plain subject").unwrap();
190/// assert_eq!(sp.part_index, None);
191/// assert_eq!(sp.part_total, None);
192/// assert_eq!(sp.base_subject, "just a plain subject");
193/// ```
194pub fn parse_subject(subject: &str) -> Option<SubjectParts> {
195 if subject.is_empty() {
196 return None;
197 }
198
199 // yEnc posts are out of scope.
200 if yenc_re().is_match(subject) {
201 return None;
202 }
203
204 let stripped = strip_prefixes(subject).trim();
205
206 if stripped.is_empty() {
207 return None;
208 }
209
210 for pat in patterns() {
211 if let Some(caps) = pat.re.captures(stripped) {
212 // Capture group 1 is always the part index.
213 // Use `continue` (not `?`) so that a failed parse on one pattern
214 // does not exit the function — we try the remaining patterns instead.
215 // In practice the regex limits captures to 6 digits (max 999999),
216 // which is always within u32 range, so parse failure is impossible
217 // with current regexes.
218 let part_index: u32 = match caps[1].parse() {
219 Ok(n) => n,
220 Err(_) => continue,
221 };
222 // Capture group 2 is always the total (all five patterns have it).
223 let part_total: u32 = match caps[2].parse() {
224 Ok(n) => n,
225 Err(_) => continue,
226 };
227
228 // Build base_subject: remove the matched span from `stripped`.
229 let m = caps.get(0).unwrap();
230 let before = stripped[..m.start()].trim_end();
231 let after = stripped[m.end()..].trim_start();
232
233 let raw = if before.is_empty() {
234 after.to_string()
235 } else if after.is_empty() {
236 before.to_string()
237 } else {
238 format!("{} {}", before, after)
239 };
240
241 // Strip trailing " -" artifact (e.g. "filename.tar.gz -").
242 let base_subject = raw
243 .trim_end_matches(|c: char| c == '-' || c.is_whitespace())
244 .trim()
245 .to_string();
246
247 // Invariant: base_subject must not be empty. A subject that
248 // consists solely of a part marker (e.g. "(1/3)") has no
249 // meaningful base; treat it as non-parseable.
250 if base_subject.is_empty() {
251 return None;
252 }
253
254 return Some(SubjectParts {
255 base_subject,
256 part_index: Some(part_index),
257 part_total: Some(part_total),
258 });
259 }
260 }
261
262 // No marker found.
263 Some(SubjectParts {
264 base_subject: stripped.to_string(),
265 part_index: None,
266 part_total: None,
267 })
268}
269
270// ---------------------------------------------------------------------------
271// Tests
272// ---------------------------------------------------------------------------
273
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `subject` and panics if `parse_subject` rejects it.
    fn parts(subject: &str) -> SubjectParts {
        parse_subject(subject).expect("subject should be parseable")
    }

    // ------------------------------------------------------------------
    // Pattern 1 — parenthesised fraction
    // ------------------------------------------------------------------

    #[test]
    fn paren_fraction_basic() {
        let p = parts("bigfile.rar (1/5)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "bigfile.rar");
    }

    #[test]
    fn paren_fraction_leading_zero() {
        let p = parts("filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn paren_fraction_spaces_inside() {
        let p = parts("file.zip ( 2 / 7 )");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(7));
        assert_eq!(p.base_subject, "file.zip");
    }

    // ------------------------------------------------------------------
    // Pattern 2 — bracketed fraction
    // ------------------------------------------------------------------

    #[test]
    fn bracket_fraction_basic() {
        let p = parts("image.jpg [2/4]");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(4));
        assert_eq!(p.base_subject, "image.jpg");
    }

    #[test]
    fn bracket_fraction_not_binary_tag() {
        // [BINARY] must NOT be parsed as a fraction; it stays in the base
        // subject while the "Part N of M" marker supplies the numbers.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
        assert!(p.base_subject.starts_with("[BINARY]"));
    }

    // ------------------------------------------------------------------
    // Pattern 3 — "Part N/M"
    // ------------------------------------------------------------------

    #[test]
    fn part_slash_basic() {
        let p = parts("file.zip Part 3/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn part_slash_lowercase() {
        let p = parts("file.tar.gz part 2/5");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "file.tar.gz");
    }

    // ------------------------------------------------------------------
    // Pattern 4 — "Part N of M"
    // ------------------------------------------------------------------

    #[test]
    fn part_of_with_spaces() {
        let p = parts("file.zip Part 03 of 17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn part_of_no_spaces() {
        let p = parts("archive.tar.gz part3of17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "archive.tar.gz");
    }

    #[test]
    fn binary_tag_part_of() {
        // Removing the marker leaves "[BINARY] filename -"; the dangling
        // dash separator must be stripped from the base subject.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
        assert_eq!(p.base_subject, "[BINARY] filename");
    }

    // ------------------------------------------------------------------
    // Pattern 5 — dash-separated fraction
    // ------------------------------------------------------------------

    #[test]
    fn dash_fraction() {
        let p = parts("filename.tar.gz - 03/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part 0 (TOC)
    // ------------------------------------------------------------------

    #[test]
    fn part_zero_toc() {
        let p = parts("filename.tar.gz (00/17)");
        assert_eq!(p.part_index, Some(0));
        assert_eq!(p.part_total, Some(17));
    }

    // ------------------------------------------------------------------
    // Zero total is passed through verbatim
    // ------------------------------------------------------------------

    #[test]
    fn zero_total_passed_through() {
        let p = parts("file.bin (3/0)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(0));
        assert_eq!(p.base_subject, "file.bin");
    }

    // ------------------------------------------------------------------
    // First match wins; later markers stay in base_subject
    // ------------------------------------------------------------------

    #[test]
    fn first_match_wins_leaves_later_marker() {
        let p = parts("file (1/3) [2/4]");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(3));
        assert_eq!(p.base_subject, "file [2/4]");
    }

    // ------------------------------------------------------------------
    // yEnc → None
    // ------------------------------------------------------------------

    #[test]
    fn yenc_returns_none() {
        assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
    }

    #[test]
    fn yenc_uppercase_returns_none() {
        assert!(parse_subject("\"file.nfo\" YENC (1/3)").is_none());
    }

    // ------------------------------------------------------------------
    // No marker
    // ------------------------------------------------------------------

    #[test]
    fn no_marker_returns_some_none_fields() {
        let p = parts("plain subject");
        assert_eq!(p.base_subject, "plain subject");
        assert_eq!(p.part_index, None);
        assert_eq!(p.part_total, None);
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_returns_none() {
        assert!(parse_subject("").is_none());
    }

    // ------------------------------------------------------------------
    // Bare marker (entire input is the marker) → None
    // Invariant: base_subject must never be empty.
    // ------------------------------------------------------------------

    #[test]
    fn bare_paren_marker_returns_none() {
        assert!(parse_subject("(1/3)").is_none());
    }

    #[test]
    fn bare_paren_marker_with_spaces_returns_none() {
        assert!(parse_subject(" (1/3) ").is_none());
    }

    #[test]
    fn bare_bracket_marker_returns_none() {
        assert!(parse_subject("[2/4]").is_none());
    }

    /// A subject that is entirely a "Part N of M" or "Part N/M" marker with no
    /// surrounding text strips to empty after removing the marker.
    /// Invariant: base_subject must never be empty → returns None.
    #[test]
    fn bare_part_marker_only_returns_none() {
        assert!(parse_subject("Part 1 of 3").is_none());
        assert!(parse_subject("Part 1/3").is_none());
    }

    // ------------------------------------------------------------------
    // Re: / Fwd: prefix stripping
    // ------------------------------------------------------------------

    #[test]
    fn re_prefix_stripped() {
        let p = parts("Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fwd_prefix_stripped() {
        let p = parts("Fwd: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fw_prefix_stripped() {
        let p = parts("Fw: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn nested_re_prefix_stripped() {
        let p = parts("Re: Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Unicode stem — must not panic
    // ------------------------------------------------------------------

    #[test]
    fn unicode_stem_no_panic() {
        let p = parts("日本語ファイル (1/3)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(3));
        assert!(p.base_subject.contains('日'));
    }

    // ------------------------------------------------------------------
    // base_subject trimming
    // ------------------------------------------------------------------

    #[test]
    fn base_subject_trailing_dash_stripped() {
        // The parenthesised marker is removed first, leaving "myfile.bin -";
        // the dangling dash and space must then be trimmed away.
        let p = parts("myfile.bin - (2/5)");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "myfile.bin");
    }

    // ------------------------------------------------------------------
    // All-prefix input stripped to empty → None
    // Invariant: base_subject must never be empty.
    // ------------------------------------------------------------------

    #[test]
    fn parse_subject_returns_none_for_all_prefix_input() {
        assert!(parse_subject("Re: ").is_none());
        assert!(parse_subject("Fwd: Re: ").is_none());
        assert!(parse_subject(" ").is_none());
    }
}
523}