uuencoding_multi/subject.rs
1use std::sync::OnceLock;
2
3use regex::Regex;
4
5use crate::SubjectParts;
6
7// ---------------------------------------------------------------------------
8// Compiled-once regex patterns
9// ---------------------------------------------------------------------------
10//
11// Pattern priority (most specific first):
12// 1. Parenthesised fraction: (03/17)
13// 2. Bracketed fraction: [2/4] — only when both sides are digits
14// 3. English "part N/M": Part 3/17
15// 4. English "part N of M": Part 03 of 17 / part3of17
16// 5. Dash-separated fraction: - 03/17
17//
18// All patterns are compiled once into a static array via `OnceLock`.
19
/// One compiled part-marker pattern.
///
/// Every instance is built by `patterns()`, where each regex is written with
/// two capture groups; `parse_subject` reads group 1 as the part index and
/// group 2 as the part total.
struct Pattern {
    /// The compiled regular expression for this marker format.
    re: Regex,
}
23
24fn patterns() -> &'static [Pattern; 5] {
25 static PATTERNS: OnceLock<[Pattern; 5]> = OnceLock::new();
26 PATTERNS.get_or_init(|| {
27 [
28 // 1. Parenthesised fraction: (03/17) or ( 3 / 17 )
29 Pattern {
30 re: Regex::new(r"\(\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\)").unwrap(),
31 },
32 // 2. Bracketed fraction: [2/4] — require digit on both sides so
33 // [BINARY] is not matched.
34 Pattern {
35 re: Regex::new(r"\[\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\]").unwrap(),
36 },
37 // 3. English "Part N/M" (case-insensitive via (?i))
38 Pattern {
39 re: Regex::new(r"(?i)\bpart\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
40 },
41 // 4. English "Part N of M" / "Part3of17" (case-insensitive)
42 Pattern {
43 re: Regex::new(r"(?i)\bpart\s*(\d{1,6})\s*of\s*(\d{1,6})\b").unwrap(),
44 },
45 // 5. Dash-separated fraction: " - 03/17"
46 Pattern {
47 re: Regex::new(r"\s+-\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
48 },
49 ]
50 })
51}
52
53// Matches a yEnc marker at a word boundary (case-insensitive).
54fn yenc_re() -> &'static Regex {
55 static RE: OnceLock<Regex> = OnceLock::new();
56 RE.get_or_init(|| Regex::new(r"(?i)\byenc\b").unwrap())
57}
58
59// Strips common reply/forward prefixes (case-insensitive) repeatedly.
60fn strip_prefixes(s: &str) -> &str {
61 static RE: OnceLock<Regex> = OnceLock::new();
62 let re = RE.get_or_init(|| Regex::new(r"(?i)^(re|fwd?)\s*:\s*").unwrap());
63
64 let mut cur = s;
65 loop {
66 let stripped = re.find(cur).map(|m| &cur[m.end()..]).unwrap_or(cur);
67 if stripped.len() == cur.len() {
68 break;
69 }
70 cur = stripped;
71 }
72 cur
73}
74
75// ---------------------------------------------------------------------------
76// Public API
77// ---------------------------------------------------------------------------
78
79/// Parse a multi-part Usenet/email subject line.
80///
81/// Recognises five marker formats (in priority order):
82/// 1. Parenthesised fraction: `(03/17)` or `( 3 / 17 )`
83/// 2. Bracketed fraction: `[2/4]` (only when both sides are digits)
84/// 3. English "Part N/M" (case-insensitive)
85/// 4. English "Part N of M" / `part3of17` (case-insensitive)
86/// 5. Dash-separated fraction: ` - 03/17`
87///
88/// Leading `Re:`, `Fwd:`, and `Fw:` prefixes are stripped before matching
89/// (repeatedly, to handle nested re-forwards). The extracted part marker is
90/// removed from the subject to produce `base_subject`.
91///
92/// # Return value
93///
94/// Returns `None` only when:
95/// - `subject` is empty, or
96/// - `subject` contains a `yEnc` marker (those posts use a distinct encoding
97/// that is explicitly out of scope for this crate).
98///
99/// Otherwise returns `Some(SubjectParts)`. When no part-marker pattern
100/// matches, `part_index` and `part_total` are both `None` and `base_subject`
101/// is the prefix-stripped, trimmed input.
102///
103/// # Zero totals
104///
105/// A subject like `"file.bin (3/0)"` produces `part_total = Some(0)`. This is
106/// nonsensical but is passed through verbatim since the crate cannot know
107/// whether the source is malformed or intentional. Callers that pass
108/// `part_total` directly to [`PartCollection::with_total`][crate::PartCollection::with_total]
109/// should validate that the total is non-zero before doing so.
110///
111/// # Never panics
112///
113/// This function never panics on any input, including strings containing
114/// arbitrary Unicode code points.
115///
116/// # Examples
117///
118/// ```
119/// use uuencoding_multi::parse_subject;
120///
121/// // Parenthesised fraction — the most common Usenet format.
122/// let sp = parse_subject("bigfile.rar (2/5)").unwrap();
123/// assert_eq!(sp.part_index, Some(2));
124/// assert_eq!(sp.part_total, Some(5));
125/// assert_eq!(sp.base_subject, "bigfile.rar");
126/// ```
127///
128/// ```
129/// use uuencoding_multi::parse_subject;
130///
131/// // Re: prefix is stripped before matching.
132/// let sp = parse_subject("Re: archive.tar.gz (03/17)").unwrap();
133/// assert_eq!(sp.part_index, Some(3));
134/// assert_eq!(sp.part_total, Some(17));
135/// ```
136///
137/// ```
138/// use uuencoding_multi::parse_subject;
139///
140/// // yEnc subject → None (out of scope for this crate).
141/// assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
142/// ```
143///
144/// ```
145/// use uuencoding_multi::parse_subject;
146///
147/// // Empty input → None.
148/// assert!(parse_subject("").is_none());
149/// ```
150///
151/// ```
152/// use uuencoding_multi::parse_subject;
153///
154/// // No marker → Some with None fields and subject preserved.
155/// let sp = parse_subject("just a plain subject").unwrap();
156/// assert_eq!(sp.part_index, None);
157/// assert_eq!(sp.part_total, None);
158/// assert_eq!(sp.base_subject, "just a plain subject");
159/// ```
160pub fn parse_subject(subject: &str) -> Option<SubjectParts> {
161 if subject.is_empty() {
162 return None;
163 }
164
165 // yEnc posts are out of scope.
166 if yenc_re().is_match(subject) {
167 return None;
168 }
169
170 let stripped = strip_prefixes(subject).trim();
171
172 for pat in patterns() {
173 if let Some(caps) = pat.re.captures(stripped) {
174 // Capture group 1 is always the part index.
175 // Use `continue` (not `?`) so that a failed parse on one pattern
176 // does not exit the function — we try the remaining patterns instead.
177 // In practice the regex limits captures to 6 digits (max 999999),
178 // which is always within u32 range, so parse failure is impossible
179 // with current regexes.
180 let part_index: u32 = match caps[1].parse() {
181 Ok(n) => n,
182 Err(_) => continue,
183 };
184 // Capture group 2 is always the total (all five patterns have it).
185 let part_total: u32 = match caps[2].parse() {
186 Ok(n) => n,
187 Err(_) => continue,
188 };
189
190 // Build base_subject: remove the matched span from `stripped`.
191 let m = caps.get(0).unwrap();
192 let before = stripped[..m.start()].trim_end();
193 let after = stripped[m.end()..].trim_start();
194
195 let raw = if before.is_empty() {
196 after.to_string()
197 } else if after.is_empty() {
198 before.to_string()
199 } else {
200 format!("{} {}", before, after)
201 };
202
203 // Strip trailing " -" artifact (e.g. "filename.tar.gz -").
204 let base_subject = raw
205 .trim_end_matches(|c: char| c == '-' || c.is_whitespace())
206 .trim()
207 .to_string();
208
209 return Some(SubjectParts {
210 base_subject,
211 part_index: Some(part_index),
212 part_total: Some(part_total),
213 });
214 }
215 }
216
217 // No marker found.
218 Some(SubjectParts {
219 base_subject: stripped.to_string(),
220 part_index: None,
221 part_total: None,
222 })
223}
224
225// ---------------------------------------------------------------------------
226// Tests
227// ---------------------------------------------------------------------------
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `subject`, panicking if `parse_subject` returns `None`.
    fn parts(subject: &str) -> SubjectParts {
        parse_subject(subject).unwrap()
    }

    // ------------------------------------------------------------------
    // Pattern 1 — parenthesised fraction
    // ------------------------------------------------------------------

    #[test]
    fn paren_fraction_basic() {
        let p = parts("bigfile.rar (1/5)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "bigfile.rar");
    }

    #[test]
    fn paren_fraction_leading_zero() {
        let p = parts("filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn paren_fraction_spaces_inside() {
        let p = parts("file.zip ( 2 / 7 )");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(7));
    }

    // ------------------------------------------------------------------
    // Pattern 2 — bracketed fraction
    // ------------------------------------------------------------------

    #[test]
    fn bracket_fraction_basic() {
        let p = parts("image.jpg [2/4]");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(4));
        assert_eq!(p.base_subject, "image.jpg");
    }

    #[test]
    fn bracket_fraction_not_binary_tag() {
        // [BINARY] must NOT be parsed as a fraction.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
    }

    // ------------------------------------------------------------------
    // Pattern 3 — "Part N/M"
    // ------------------------------------------------------------------

    #[test]
    fn part_slash_basic() {
        let p = parts("file.zip Part 3/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
    }

    #[test]
    fn part_slash_lowercase() {
        let p = parts("file.tar.gz part 2/5");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
    }

    // ------------------------------------------------------------------
    // Pattern 4 — "Part N of M"
    // ------------------------------------------------------------------

    #[test]
    fn part_of_with_spaces() {
        let p = parts("file.zip Part 03 of 17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
    }

    #[test]
    fn part_of_no_spaces() {
        let p = parts("archive.tar.gz part3of17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
    }

    #[test]
    fn binary_tag_part_of() {
        // Same input as `bracket_fraction_not_binary_tag`, but this test pins
        // the resulting base subject: the tag is kept and the " -" separator
        // left in front of the marker is trimmed away.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
        assert_eq!(p.base_subject, "[BINARY] filename");
    }

    // ------------------------------------------------------------------
    // Pattern 5 — dash-separated fraction
    // ------------------------------------------------------------------

    #[test]
    fn dash_fraction() {
        let p = parts("filename.tar.gz - 03/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part 0 (TOC)
    // ------------------------------------------------------------------

    #[test]
    fn part_zero_toc() {
        let p = parts("filename.tar.gz (00/17)");
        assert_eq!(p.part_index, Some(0));
        assert_eq!(p.part_total, Some(17));
    }

    // ------------------------------------------------------------------
    // Zero total — passed through verbatim
    // ------------------------------------------------------------------

    #[test]
    fn zero_total_passed_through() {
        let p = parts("file.bin (3/0)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(0));
        assert_eq!(p.base_subject, "file.bin");
    }

    // ------------------------------------------------------------------
    // yEnc → None
    // ------------------------------------------------------------------

    #[test]
    fn yenc_returns_none() {
        assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
    }

    #[test]
    fn yenc_uppercase_returns_none() {
        assert!(parse_subject("\"file.nfo\" YENC (1/3)").is_none());
    }

    // ------------------------------------------------------------------
    // No marker
    // ------------------------------------------------------------------

    #[test]
    fn no_marker_returns_some_none_fields() {
        let p = parts("plain subject");
        assert_eq!(p.base_subject, "plain subject");
        assert_eq!(p.part_index, None);
        assert_eq!(p.part_total, None);
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_returns_none() {
        assert!(parse_subject("").is_none());
    }

    // ------------------------------------------------------------------
    // Re: / Fwd: prefix stripping
    // ------------------------------------------------------------------

    #[test]
    fn re_prefix_stripped() {
        let p = parts("Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
    }

    #[test]
    fn fwd_prefix_stripped() {
        let p = parts("Fwd: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
    }

    #[test]
    fn fw_prefix_stripped() {
        let p = parts("Fw: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
    }

    #[test]
    fn nested_re_prefix_stripped() {
        let p = parts("Re: Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
    }

    // ------------------------------------------------------------------
    // Unicode stem — must not panic
    // ------------------------------------------------------------------

    #[test]
    fn unicode_stem_no_panic() {
        let p = parts("日本語ファイル (1/3)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(3));
        assert!(p.base_subject.contains('日'));
    }

    // ------------------------------------------------------------------
    // base_subject trimming
    // ------------------------------------------------------------------

    #[test]
    fn base_subject_trailing_dash_stripped() {
        // Removing the parenthesised marker leaves "myfile.bin -" behind, so
        // this input genuinely exercises the trailing " -" clean-up.
        let p = parts("myfile.bin - (2/5)");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
        assert!(!p.base_subject.ends_with('-'));
        assert!(!p.base_subject.ends_with(' '));
        assert_eq!(p.base_subject, "myfile.bin");
    }

    #[test]
    fn marker_in_middle_rejoined_with_single_space() {
        // Text on both sides of the marker is kept and joined by one space.
        let p = parts("file (1/3) extra");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(3));
        assert_eq!(p.base_subject, "file extra");
    }
}
437}