uuencoding_multi/subject.rs
1use std::sync::OnceLock;
2
3use regex::Regex;
4
5use crate::SubjectParts;
6
7// ---------------------------------------------------------------------------
8// Compiled-once regex patterns
9// ---------------------------------------------------------------------------
10//
11// Pattern priority (most specific first):
12// 1. Parenthesised fraction: (03/17)
13// 2. Bracketed fraction: [2/4] — only when both sides are digits
14// 3. English "part N/M": Part 3/17
15// 4. English "part N of M": Part 03 of 17 / part3of17
16// 5. Dash-separated fraction: - 03/17
17//
18// All patterns are compiled once into a static array via `OnceLock`.
19
20struct Pattern {
21 re: Regex,
22}
23
24fn patterns() -> &'static [Pattern; 5] {
25 static PATTERNS: OnceLock<[Pattern; 5]> = OnceLock::new();
26 PATTERNS.get_or_init(|| {
27 [
28 // 1. Parenthesised fraction: (03/17) or ( 3 / 17 )
29 Pattern {
30 re: Regex::new(r"\(\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\)").unwrap(),
31 },
32 // 2. Bracketed fraction: [2/4] — require digit on both sides so
33 // [BINARY] is not matched.
34 Pattern {
35 re: Regex::new(r"\[\s*(\d{1,6})\s*/\s*(\d{1,6})\s*\]").unwrap(),
36 },
37 // 3. English "Part N/M" (case-insensitive via (?i))
38 Pattern {
39 re: Regex::new(r"(?i)\bpart\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
40 },
41 // 4. English "Part N of M" / "Part3of17" (case-insensitive)
42 Pattern {
43 re: Regex::new(r"(?i)\bpart\s*(\d{1,6})\s*of\s*(\d{1,6})\b").unwrap(),
44 },
45 // 5. Dash-separated fraction: " - 03/17"
46 Pattern {
47 re: Regex::new(r"\s+-\s+(\d{1,6})\s*/\s*(\d{1,6})\b").unwrap(),
48 },
49 ]
50 })
51}
52
53// Matches a yEnc marker at a word boundary (case-insensitive).
54fn yenc_re() -> &'static Regex {
55 static RE: OnceLock<Regex> = OnceLock::new();
56 RE.get_or_init(|| Regex::new(r"(?i)\byenc\b").unwrap())
57}
58
59// Strips common reply/forward prefixes (case-insensitive) repeatedly.
60fn strip_prefixes(s: &str) -> &str {
61 static RE: OnceLock<Regex> = OnceLock::new();
62 let re = RE.get_or_init(|| Regex::new(r"(?i)^(re|fwd?)\s*:\s*").unwrap());
63
64 let mut cur = s;
65 loop {
66 let stripped = re.find(cur).map(|m| &cur[m.end()..]).unwrap_or(cur);
67 if stripped.len() == cur.len() {
68 break;
69 }
70 cur = stripped;
71 }
72 cur
73}
74
75// ---------------------------------------------------------------------------
76// Public API
77// ---------------------------------------------------------------------------
78
79/// Parse a multi-part Usenet/email subject line.
80///
81/// Recognises five marker formats (in priority order):
82/// 1. Parenthesised fraction: `(03/17)` or `( 3 / 17 )`
83/// 2. Bracketed fraction: `[2/4]` (only when both sides are digits)
84/// 3. English "Part N/M" (case-insensitive)
85/// 4. English "Part N of M" / `part3of17` (case-insensitive)
86/// 5. Dash-separated fraction: ` - 03/17`
87///
88/// Leading `Re:`, `Fwd:`, and `Fw:` prefixes are stripped before matching
89/// (repeatedly, to handle nested re-forwards). The extracted part marker is
90/// removed from the subject to produce `base_subject`.
91///
92/// # Return value
93///
94/// Returns `None` only when:
95/// - `subject` is empty, or
96/// - `subject` contains a `yEnc` marker (those posts use a distinct encoding
97/// that is explicitly out of scope for this crate).
98///
99/// Otherwise returns `Some(SubjectParts)`. When no part-marker pattern
100/// matches, `part_index` and `part_total` are both `None` and `base_subject`
101/// is the prefix-stripped, trimmed input.
102///
103/// # Never panics
104///
105/// This function never panics on any input, including strings containing
106/// arbitrary Unicode code points.
107///
108/// # Examples
109///
110/// ```
111/// use uuencoding_multi::parse_subject;
112///
113/// // Parenthesised fraction — the most common Usenet format.
114/// let sp = parse_subject("bigfile.rar (2/5)").unwrap();
115/// assert_eq!(sp.part_index, Some(2));
116/// assert_eq!(sp.part_total, Some(5));
117/// assert_eq!(sp.base_subject, "bigfile.rar");
118/// ```
119///
120/// ```
121/// use uuencoding_multi::parse_subject;
122///
123/// // Re: prefix is stripped before matching.
124/// let sp = parse_subject("Re: archive.tar.gz (03/17)").unwrap();
125/// assert_eq!(sp.part_index, Some(3));
126/// assert_eq!(sp.part_total, Some(17));
127/// ```
128///
129/// ```
130/// use uuencoding_multi::parse_subject;
131///
132/// // yEnc subject → None (out of scope for this crate).
133/// assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
134/// ```
135///
136/// ```
137/// use uuencoding_multi::parse_subject;
138///
139/// // Empty input → None.
140/// assert!(parse_subject("").is_none());
141/// ```
142///
143/// ```
144/// use uuencoding_multi::parse_subject;
145///
146/// // No marker → Some with None fields and subject preserved.
147/// let sp = parse_subject("just a plain subject").unwrap();
148/// assert_eq!(sp.part_index, None);
149/// assert_eq!(sp.part_total, None);
150/// assert_eq!(sp.base_subject, "just a plain subject");
151/// ```
152pub fn parse_subject(subject: &str) -> Option<SubjectParts> {
153 if subject.is_empty() {
154 return None;
155 }
156
157 // yEnc posts are out of scope.
158 if yenc_re().is_match(subject) {
159 return None;
160 }
161
162 let stripped = strip_prefixes(subject).trim();
163
164 for pat in patterns() {
165 if let Some(caps) = pat.re.captures(stripped) {
166 // Capture group 1 is always the part index.
167 let part_index: u32 = caps[1].parse().ok()?;
168 // Capture group 2 is always the total (all five patterns have it).
169 let part_total: u32 = caps[2].parse().ok()?;
170
171 // Build base_subject: remove the matched span from `stripped`.
172 let m = caps.get(0).unwrap();
173 let before = stripped[..m.start()].trim_end();
174 let after = stripped[m.end()..].trim_start();
175
176 let raw = if before.is_empty() {
177 after.to_string()
178 } else if after.is_empty() {
179 before.to_string()
180 } else {
181 format!("{} {}", before, after)
182 };
183
184 // Strip trailing " -" artifact (e.g. "filename.tar.gz -").
185 let base_subject = raw
186 .trim_end_matches(|c: char| c == '-' || c.is_whitespace())
187 .trim()
188 .to_string();
189
190 return Some(SubjectParts {
191 base_subject,
192 part_index: Some(part_index),
193 part_total: Some(part_total),
194 });
195 }
196 }
197
198 // No marker found.
199 Some(SubjectParts {
200 base_subject: stripped.to_string(),
201 part_index: None,
202 part_total: None,
203 })
204}
205
206// ---------------------------------------------------------------------------
207// Tests
208// ---------------------------------------------------------------------------
209
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `subject` and unwraps the result; for inputs expected to be `Some`.
    fn parts(subject: &str) -> SubjectParts {
        parse_subject(subject).unwrap()
    }

    // ------------------------------------------------------------------
    // Pattern 1 — parenthesised fraction
    // ------------------------------------------------------------------

    #[test]
    fn paren_fraction_basic() {
        let p = parts("bigfile.rar (1/5)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "bigfile.rar");
    }

    #[test]
    fn paren_fraction_leading_zero() {
        let p = parts("filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn paren_fraction_spaces_inside() {
        let p = parts("file.zip ( 2 / 7 )");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(7));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn paren_fraction_in_middle_joins_both_sides() {
        // Text on both sides of the marker is kept, joined by one space.
        let p = parts("file (1/5) extra");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "file extra");
    }

    // ------------------------------------------------------------------
    // Pattern 2 — bracketed fraction
    // ------------------------------------------------------------------

    #[test]
    fn bracket_fraction_basic() {
        let p = parts("image.jpg [2/4]");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(4));
        assert_eq!(p.base_subject, "image.jpg");
    }

    #[test]
    fn bracket_fraction_not_binary_tag() {
        // [BINARY] must NOT be parsed as a fraction: it has to survive in
        // base_subject while the "Part N of M" marker is the one removed.
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
        assert_eq!(p.base_subject, "[BINARY] filename");
    }

    // ------------------------------------------------------------------
    // Pattern 3 — "Part N/M"
    // ------------------------------------------------------------------

    #[test]
    fn part_slash_basic() {
        let p = parts("file.zip Part 3/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn part_slash_lowercase() {
        let p = parts("file.tar.gz part 2/5");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "file.tar.gz");
    }

    // ------------------------------------------------------------------
    // Pattern 4 — "Part N of M"
    // ------------------------------------------------------------------

    #[test]
    fn part_of_with_spaces() {
        let p = parts("file.zip Part 03 of 17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "file.zip");
    }

    #[test]
    fn part_of_no_spaces() {
        let p = parts("archive.tar.gz part3of17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "archive.tar.gz");
    }

    #[test]
    fn binary_tag_part_of() {
        let p = parts("[BINARY] filename - Part 3 of 12");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(12));
    }

    // ------------------------------------------------------------------
    // Pattern 5 — dash-separated fraction
    // ------------------------------------------------------------------

    #[test]
    fn dash_fraction() {
        let p = parts("filename.tar.gz - 03/17");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part 0 (TOC)
    // ------------------------------------------------------------------

    #[test]
    fn part_zero_toc() {
        let p = parts("filename.tar.gz (00/17)");
        assert_eq!(p.part_index, Some(0));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // yEnc → None
    // ------------------------------------------------------------------

    #[test]
    fn yenc_returns_none() {
        assert!(parse_subject("\"file.nfo\" yEnc (1/3)").is_none());
    }

    #[test]
    fn yenc_uppercase_returns_none() {
        assert!(parse_subject("\"file.nfo\" YENC (1/3)").is_none());
    }

    // ------------------------------------------------------------------
    // No marker
    // ------------------------------------------------------------------

    #[test]
    fn no_marker_returns_some_none_fields() {
        let p = parts("plain subject");
        assert_eq!(p.base_subject, "plain subject");
        assert_eq!(p.part_index, None);
        assert_eq!(p.part_total, None);
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_returns_none() {
        assert!(parse_subject("").is_none());
    }

    // ------------------------------------------------------------------
    // Re: / Fwd: prefix stripping
    // ------------------------------------------------------------------

    #[test]
    fn re_prefix_stripped() {
        let p = parts("Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.part_total, Some(17));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fwd_prefix_stripped() {
        let p = parts("Fwd: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn fw_prefix_stripped() {
        let p = parts("Fw: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    #[test]
    fn nested_re_prefix_stripped() {
        let p = parts("Re: Re: filename.tar.gz (03/17)");
        assert_eq!(p.part_index, Some(3));
        assert_eq!(p.base_subject, "filename.tar.gz");
    }

    // ------------------------------------------------------------------
    // Unicode stem — must not panic
    // ------------------------------------------------------------------

    #[test]
    fn unicode_stem_no_panic() {
        let p = parts("日本語ファイル (1/3)");
        assert_eq!(p.part_index, Some(1));
        assert_eq!(p.part_total, Some(3));
        assert!(p.base_subject.contains('日'));
        assert_eq!(p.base_subject, "日本語ファイル");
    }

    // ------------------------------------------------------------------
    // base_subject trimming
    // ------------------------------------------------------------------

    #[test]
    fn base_subject_trailing_dash_stripped() {
        // Removing the parenthesised marker leaves "myfile.bin -" behind;
        // the dangling separator must be trimmed from base_subject.
        let p = parts("myfile.bin - (2/5)");
        assert_eq!(p.part_index, Some(2));
        assert_eq!(p.part_total, Some(5));
        assert_eq!(p.base_subject, "myfile.bin");
    }
}
418}