uuencoding_multi/toc.rs
1//! Parser for multi-part UUencode table-of-contents (TOC) bodies.
2//!
3//! TOC posts (typically numbered part 0) list the files contained in a
4//! multi-part series, along with optional size information and part ranges.
5//! This module provides a best-effort parser that tolerates varied formatting
6//! and non-UTF-8 byte sequences.
7
8use std::ops::RangeInclusive;
9use std::sync::OnceLock;
10
11use regex::Regex;
12
13// ---------------------------------------------------------------------------
14// Public types
15// ---------------------------------------------------------------------------
16
/// One entry in a table-of-contents post.
///
/// A TOC entry describes a single file within a multi-part series. Not all
/// fields are present in every TOC format; `size_bytes` and `parts` are
/// `None` when the corresponding information was absent from the line.
///
/// The type is a plain value: it can be cloned and compared for equality,
/// which makes it convenient to store, deduplicate and assert on in tests.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"archive.tar.gz 1234567 bytes parts 1-8\n";
/// let toc = parse_toc(body).unwrap();
/// let entry = &toc.entries[0];
/// assert_eq!(entry.filename, "archive.tar.gz");
/// assert_eq!(entry.size_bytes, Some(1_234_567));
/// assert_eq!(entry.parts, Some(1..=8));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TocEntry {
    /// Filename as it appears in the TOC line.
    pub filename: String,
    /// Declared file size in bytes, if present. KB and MB values are
    /// converted to bytes (1 KB = 1 024 bytes, 1 MB = 1 048 576 bytes).
    pub size_bytes: Option<u64>,
    /// Which parts carry this file, if a range was specified. The range is
    /// inclusive on both ends (`lo..=hi`).
    ///
    /// An inline `parts N-M` token with `N > M` is discarded and leaves this
    /// field `None`. A line that *begins* with an inverted range prefix
    /// (e.g. `08-01 file.bin`) is not recognised as an entry at all.
    pub parts: Option<RangeInclusive<u32>>,
}
47
48/// Result of parsing a TOC body.
49///
50/// `entries` may be a strict subset of the lines in the body: lines that do
51/// not look like TOC entries (plain text, comments, blank lines) are silently
52/// skipped. Inspect `raw_text` for the original body when debugging partial
53/// results.
54///
55/// # Example
56///
57/// ```
58/// use uuencoding_multi::parse_toc;
59///
60/// let body = b"# TOC\nfile.bin (512 bytes)\n";
61/// let toc = parse_toc(body).unwrap();
62/// assert_eq!(toc.entries.len(), 1);
63/// assert!(toc.raw_text.contains("# TOC"));
64/// ```
65#[derive(Debug)]
66pub struct ParsedToc {
67 /// Successfully parsed entries; may be a strict subset of all lines.
68 pub entries: Vec<TocEntry>,
69 /// Full body text kept verbatim for diagnostic use when parsing is partial.
70 /// Non-UTF-8 bytes are replaced with the Unicode replacement character
71 /// (`U+FFFD`) via lossy conversion.
72 pub raw_text: String,
73}
74
75// ---------------------------------------------------------------------------
76// Compiled-once regex patterns
77// ---------------------------------------------------------------------------
78
79/// Format 1: `filename.tar.gz 1234567 bytes parts 1-8`
80/// Filename is the first whitespace-delimited token; size and "parts N-M" can
81/// appear in either order after it.
82fn re_format1() -> &'static Regex {
83 static RE: OnceLock<Regex> = OnceLock::new();
84 RE.get_or_init(|| {
85 // Captures: 1=filename, rest parsed manually for size/parts.
86 Regex::new(r"(?i)^([^ \t\r\n]+)[ \t]+(.+)$").unwrap()
87 })
88}
89
90/// Format 3 prefix: `01-08 filename.tar.gz 1234 KB`
91/// The line starts with a zero-padded (or plain) part range.
92fn re_format3_prefix() -> &'static Regex {
93 static RE: OnceLock<Regex> = OnceLock::new();
94 RE.get_or_init(|| {
95 Regex::new(r"^([0-9]{1,6})-([0-9]{1,6})[ \t]+([^ \t\r\n]+)[ \t]*(.*)$").unwrap()
96 })
97}
98
99/// Format 2: `filename.tar.gz (1234567 bytes)` — parenthesised size.
100fn re_format2() -> &'static Regex {
101 static RE: OnceLock<Regex> = OnceLock::new();
102 RE.get_or_init(|| {
103 Regex::new(r"(?i)^([^ \t\r\n]+)[ \t]*\([ \t]*([0-9]+)[ \t]*(bytes?|b|kb|mb)[ \t]*\)[ \t]*$")
104 .unwrap()
105 })
106}
107
108/// Size token: one or more digits followed by a unit.
109fn re_size_token() -> &'static Regex {
110 static RE: OnceLock<Regex> = OnceLock::new();
111 RE.get_or_init(|| Regex::new(r"(?i)\b([0-9]+)[ \t]*(bytes?|b|kb|mb)\b").unwrap())
112}
113
114/// "parts N-M" token.
115fn re_parts_token() -> &'static Regex {
116 static RE: OnceLock<Regex> = OnceLock::new();
117 RE.get_or_init(|| Regex::new(r"(?i)\bparts?[ \t]+([0-9]{1,6})-([0-9]{1,6})\b").unwrap())
118}
119
120// ---------------------------------------------------------------------------
121// Helpers
122// ---------------------------------------------------------------------------
123
/// Convert a size value plus its unit suffix into a byte count.
///
/// Recognised units, compared case-insensitively:
///
/// | unit                  | multiplier |
/// |-----------------------|------------|
/// | `b`, `byte`, `bytes`  | 1          |
/// | `kb`                  | 1 024      |
/// | `mb`                  | 1 048 576  |
///
/// Returns `None` when the unit is not one of the above (including
/// near-misses such as `"kbs"` or `"bytess"`), or when the multiplication
/// would overflow `u64`.
fn parse_size(digits: u64, unit: &str) -> Option<u64> {
    // Lower-casing is Unicode-aware on purpose: the unit normally comes from
    // a case-insensitive regex capture, which may admit non-ASCII case
    // variants of these letters.
    let multiplier: u64 = match unit.to_lowercase().as_str() {
        "b" | "byte" | "bytes" => 1,
        "kb" => 1024,
        "mb" => 1024 * 1024,
        _ => return None,
    };
    digits.checked_mul(multiplier)
}
134
135/// Try to extract a size value from an arbitrary text fragment.
136fn extract_size(text: &str) -> Option<u64> {
137 let caps = re_size_token().captures(text)?;
138 let digits: u64 = caps[1].parse().ok()?;
139 parse_size(digits, &caps[2])
140}
141
142/// Try to extract a part range from an arbitrary text fragment.
143fn extract_parts(text: &str) -> Option<RangeInclusive<u32>> {
144 let caps = re_parts_token().captures(text)?;
145 let lo: u32 = caps[1].parse().ok()?;
146 let hi: u32 = caps[2].parse().ok()?;
147 if lo <= hi {
148 Some(lo..=hi)
149 } else {
150 None
151 }
152}
153
154/// Attempt to parse a single TOC line into a [`TocEntry`].
155///
156/// Returns `None` if the line doesn't look like a TOC entry at all.
157fn parse_line(line: &str) -> Option<TocEntry> {
158 let line = line.trim();
159
160 // Skip blank lines and comments.
161 if line.is_empty() || line.starts_with('#') {
162 return None;
163 }
164
165 // Format 3: line starts with a part range, e.g. "01-08 filename 1234 KB"
166 if let Some(caps) = re_format3_prefix().captures(line) {
167 let lo: u32 = caps[1].parse().ok()?;
168 let hi: u32 = caps[2].parse().ok()?;
169 let filename = caps[3].to_string();
170 let remainder = &caps[4];
171 let size_bytes = extract_size(remainder);
172 // Validate the range is sensible and the filename looks real.
173 if lo > hi || !looks_like_filename(&filename) {
174 return None;
175 }
176 return Some(TocEntry {
177 filename,
178 size_bytes,
179 parts: Some(lo..=hi),
180 });
181 }
182
183 // Format 2: "filename (1234567 bytes)"
184 if let Some(caps) = re_format2().captures(line) {
185 let filename = caps[1].to_string();
186 if !looks_like_filename(&filename) {
187 return None;
188 }
189 let digits: u64 = caps[2].parse().ok()?;
190 let size_bytes = parse_size(digits, &caps[3]);
191 return Some(TocEntry {
192 filename,
193 size_bytes,
194 parts: None,
195 });
196 }
197
198 // Format 1 (and fallback): "filename size_with_unit [parts N-M]"
199 // The filename is the first non-whitespace token; the rest is parsed for
200 // size and parts tokens in any order.
201 if let Some(caps) = re_format1().captures(line) {
202 let filename = caps[1].to_string();
203 let remainder = &caps[2];
204
205 if !looks_like_filename(&filename) {
206 return None;
207 }
208
209 let size_bytes = extract_size(remainder);
210 let parts = extract_parts(remainder);
211
212 // A line only qualifies if it yields at least a size or a parts range
213 // — otherwise almost any two-token line would be accepted.
214 if size_bytes.is_none() && parts.is_none() {
215 return None;
216 }
217
218 return Some(TocEntry {
219 filename,
220 size_bytes,
221 parts,
222 });
223 }
224
225 None
226}
227
/// Heuristic filter that keeps ordinary words (`"garbage"`, `"just"`) and
/// bare numbers from being mistaken for filenames.
///
/// A token passes when either of the following holds:
///
/// - it contains a path separator (`/` or `\`), regardless of anything else;
/// - it contains a `.` and the text after the *last* `.` includes at least
///   one alphabetic character. This accepts `"file.bin"`, `"1.txt"`,
///   `"archive.tar.gz"` and `"日本語.tar.gz"`, while rejecting decimal numbers
///   such as `"1.5"` or `"3.14"` and tokens that merely end in a dot.
///
/// **Known limitation**: tokens with a digits-only extension, such as
/// `"foo.123"`, are rejected. This is a deliberate trade-off: such names are
/// treated as more likely to be numeric tokens than real archive filenames.
fn looks_like_filename(s: &str) -> bool {
    let has_separator = s.chars().any(|c| matches!(c, '/' | '\\'));
    if has_separator {
        return true;
    }

    match s.rsplit_once('.') {
        Some((_, extension)) => extension.chars().any(char::is_alphabetic),
        None => false,
    }
}
251
252// ---------------------------------------------------------------------------
253// Public API
254// ---------------------------------------------------------------------------
255
256/// Best-effort parse of a UUencode multi-part TOC body.
257///
258/// The input is treated as a sequence of lines. Each line is independently
259/// attempted against three recognised TOC formats (in priority order):
260///
261/// 1. **Part-range prefix**: `01-08 filename.tar.gz 1234 KB`
262/// 2. **Parenthesised size**: `filename.tar.gz (1234567 bytes)`
263/// 3. **Inline size/parts**: `filename.tar.gz 1234567 bytes parts 1-8`
264///
265/// Lines that match none of these formats (comments starting with `#`, blank
266/// lines, plain prose) are silently ignored. This makes the parser tolerant
267/// of the varied free-form headers that real TOC posts contain.
268///
269/// # Return value
270///
271/// Returns `None` if no lines at all parse as TOC entries.
272/// Returns `Some(ParsedToc)` with partial `entries` if at least one line
273/// parses; unparseable lines are omitted without error.
274///
275/// # Format notes
276///
277/// - Size units `bytes`/`b`, `KB`, and `MB` are recognised case-insensitively.
278/// KB and MB are converted to bytes using powers of 1 024.
279/// - Part ranges use an inclusive dash notation (`N-M`). Inverted ranges
280/// where `N > M` are rejected and produce `None` for that field.
281/// - Non-UTF-8 bytes in `body_bytes` are replaced via lossy conversion and
282/// preserved verbatim in [`ParsedToc::raw_text`].
283///
284/// # Never panics
285///
286/// This function never panics on any input, including empty slices and
287/// byte sequences that are not valid UTF-8.
288///
289/// # Examples
290///
291/// ```
292/// use uuencoding_multi::parse_toc;
293///
294/// let body = b"archive.tar.gz 1234567 bytes parts 1-8\n";
295/// let toc = parse_toc(body).expect("should parse");
296/// assert_eq!(toc.entries.len(), 1);
297/// assert_eq!(toc.entries[0].filename, "archive.tar.gz");
298/// assert_eq!(toc.entries[0].size_bytes, Some(1_234_567));
299/// assert_eq!(toc.entries[0].parts, Some(1..=8));
300/// ```
301///
302/// ```
303/// use uuencoding_multi::parse_toc;
304///
305/// // Body with no recognisable TOC lines → None.
306/// assert!(parse_toc(b"just plain text\n").is_none());
307/// ```
308///
309/// ```
310/// use uuencoding_multi::parse_toc;
311///
312/// // Empty input → None.
313/// assert!(parse_toc(b"").is_none());
314/// ```
315pub fn parse_toc(body_bytes: &[u8]) -> Option<ParsedToc> {
316 let raw_text = String::from_utf8_lossy(body_bytes).into_owned();
317
318 let entries: Vec<TocEntry> = raw_text.lines().filter_map(parse_line).collect();
319
320 if entries.is_empty() {
321 None
322 } else {
323 Some(ParsedToc { entries, raw_text })
324 }
325}
326
327// ---------------------------------------------------------------------------
328// Tests
329// ---------------------------------------------------------------------------
330
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Full TOC with all three formats present
    // ------------------------------------------------------------------

    #[test]
    fn full_toc_three_formats() {
        let body = b"# TOC\nfilename.tar.gz 1234567 bytes parts 1-8\nother.zip 512 KB\nsome.bin (99 bytes)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 3);

        let e0 = &toc.entries[0];
        assert_eq!(e0.filename, "filename.tar.gz");
        assert_eq!(e0.size_bytes, Some(1234567));
        assert_eq!(e0.parts, Some(1..=8));

        let e1 = &toc.entries[1];
        assert_eq!(e1.filename, "other.zip");
        assert_eq!(e1.size_bytes, Some(512 * 1024));
        assert_eq!(e1.parts, None);

        let e2 = &toc.entries[2];
        assert_eq!(e2.filename, "some.bin");
        assert_eq!(e2.size_bytes, Some(99));
        assert_eq!(e2.parts, None);
    }

    // ------------------------------------------------------------------
    // Unparseable lines mixed in — no panic, still get 1 entry
    // ------------------------------------------------------------------

    #[test]
    fn garbage_lines_mixed_in() {
        let body = b"garbage\nfile.txt 100 bytes\ngibberish here\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Not a TOC → None
    // ------------------------------------------------------------------

    #[test]
    fn not_a_toc_returns_none() {
        let body = b"just plain text body\nno entries at all\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // UTF-8 filename
    // ------------------------------------------------------------------

    #[test]
    fn utf8_filename_no_panic() {
        let body = "日本語.tar.gz 100 bytes\n".as_bytes();
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "日本語.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part range formats
    // ------------------------------------------------------------------

    #[test]
    fn parts_token_format1() {
        let body = b"file.tar.gz 100 bytes parts 2-5\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
    }

    #[test]
    fn parts_prefix_format3() {
        let body = b"02-05 file.tar.gz 100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Size unit parsing
    // ------------------------------------------------------------------

    #[test]
    fn size_kb() {
        let body = b"archive.zip 1 KB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024));
    }

    #[test]
    fn size_mb() {
        let body = b"archive.zip 2 MB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
    }

    #[test]
    fn size_bare_b_unit() {
        let body = b"file.bin 512 B\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(512));
    }

    // ------------------------------------------------------------------
    // Non-UTF-8 input — must not panic, and valid lines still parse
    // ------------------------------------------------------------------

    #[test]
    fn non_utf8_no_panic() {
        // Embed an invalid UTF-8 sequence followed by a valid TOC line.
        let mut body = vec![0xFF, 0xFE, b'\n'];
        body.extend_from_slice(b"file.tar.gz 100 bytes\n");

        // Lossy decoding turns the invalid bytes into a line of replacement
        // characters, which is skipped; the well-formed line after it must
        // still be recognised.
        let toc = parse_toc(&body).expect("valid line after invalid bytes should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert!(toc.raw_text.contains('\u{FFFD}'));
    }

    // ------------------------------------------------------------------
    // Comment-only body → None
    // ------------------------------------------------------------------

    #[test]
    fn comment_only_returns_none() {
        let body = b"# just a comment\n# another comment\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_input_returns_none() {
        assert!(parse_toc(b"").is_none());
    }

    // ------------------------------------------------------------------
    // raw_text equals the input when the input is valid UTF-8
    // ------------------------------------------------------------------

    #[test]
    fn raw_text_preserved() {
        let body = b"# TOC\nfile.tar.gz 100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert!(toc.raw_text.contains("# TOC"));
        assert!(toc.raw_text.contains("file.tar.gz"));
        // Skipped lines and line terminators are retained too.
        assert_eq!(toc.raw_text, "# TOC\nfile.tar.gz 100 bytes\n");
    }

    // ------------------------------------------------------------------
    // Format 2 parenthesised size — various units
    // ------------------------------------------------------------------

    #[test]
    fn format2_kb() {
        let body = b"file.tar.gz (1024 KB)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024 * 1024));
    }

    // ------------------------------------------------------------------
    // Part range where lo > hi is ignored (invalid range)
    // ------------------------------------------------------------------

    #[test]
    fn inverted_parts_range_format1_ignored() {
        // "parts 8-1" is nonsensical — should still parse the entry but
        // produce no parts range.
        let body = b"file.tar.gz 100 bytes parts 8-1\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, None);
    }

    // ------------------------------------------------------------------
    // Plural "parts" keyword
    // ------------------------------------------------------------------

    #[test]
    fn plural_parts_keyword() {
        let body = b"file.tar.gz 100 bytes parts 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    // ------------------------------------------------------------------
    // Singular "part" keyword
    // ------------------------------------------------------------------

    #[test]
    fn singular_part_keyword() {
        let body = b"file.tar.gz 100 bytes part 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    // ------------------------------------------------------------------
    // looks_like_filename — path separator branch
    // ------------------------------------------------------------------

    /// Strings containing '/' or '\' are unconditionally accepted as filenames
    /// regardless of whether they have an extension, because a path separator
    /// is strong evidence of an actual path.
    #[test]
    fn looks_like_filename_path_separator() {
        assert!(looks_like_filename("some/path/file.rar"));
        assert!(looks_like_filename("/absolute/path"));
        // Backslash path (Windows-style) is also accepted.
        assert!(looks_like_filename("some\\path\\file.rar"));
    }

    // ------------------------------------------------------------------
    // looks_like_filename — decimal numbers are not filenames (592.9)
    // ------------------------------------------------------------------

    /// Bare decimal numbers like "1.5" or "3.14" must not be accepted as
    /// filenames; their extension is digits-only.
    #[test]
    fn decimal_number_is_not_a_filename() {
        assert!(
            !looks_like_filename("1.5"),
            "\"1.5\" must not be treated as a filename"
        );
        assert!(
            !looks_like_filename("3.14"),
            "\"3.14\" must not be treated as a filename"
        );
        // Sanity-check: real filenames still pass.
        assert!(looks_like_filename("file.bin"));
        assert!(looks_like_filename("archive.tar.gz"));
        assert!(looks_like_filename("1.txt"));
    }
}
568}