Skip to main content

uuencoding_multi/
toc.rs

1//! Parser for multi-part UUencode table-of-contents (TOC) bodies.
2//!
3//! TOC posts (typically numbered part 0) list the files contained in a
4//! multi-part series, along with optional size information and part ranges.
5//! This module provides a best-effort parser that tolerates varied formatting
6//! and non-UTF-8 byte sequences.
7
8use std::ops::RangeInclusive;
9use std::sync::OnceLock;
10
11use regex::Regex;
12
13// ---------------------------------------------------------------------------
14// Public types
15// ---------------------------------------------------------------------------
16
/// One entry in a table-of-contents post.
///
/// A TOC entry describes a single file within a multi-part series. Not all
/// fields are present in every TOC format; `size_bytes` and `parts` are
/// `None` when the corresponding information was absent from the line.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"archive.tar.gz   1234567 bytes   parts 1-8\n";
/// let toc = parse_toc(body).unwrap();
/// let entry = &toc.entries[0];
/// assert_eq!(entry.filename, "archive.tar.gz");
/// assert_eq!(entry.size_bytes, Some(1_234_567));
/// assert_eq!(entry.parts, Some(1..=8));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TocEntry {
    /// Filename as it appears in the TOC line (first whitespace-delimited
    /// token of the entry; filenames containing spaces are not supported).
    pub filename: String,
    /// Declared file size in bytes, if present. KB and MB values are
    /// converted to bytes (1 KB = 1 024 bytes, 1 MB = 1 048 576 bytes).
    /// `None` when no size was given or the converted value overflows `u64`.
    pub size_bytes: Option<u64>,
    /// Which parts carry this file, if a range was specified. The range is
    /// inclusive on both ends (`lo..=hi`). An inverted inline `parts N-M`
    /// range (`lo > hi`) is discarded and yields `None`; a line that *starts*
    /// with an inverted range prefix is skipped altogether and produces no
    /// entry.
    pub parts: Option<RangeInclusive<u32>>,
}

/// Result of parsing a TOC body.
///
/// `entries` may be a strict subset of the lines in the body: lines that do
/// not look like TOC entries (plain text, comments, blank lines) are silently
/// skipped. Inspect `raw_text` for the decoded body when debugging partial
/// results.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"# TOC\nfile.bin (512 bytes)\n";
/// let toc = parse_toc(body).unwrap();
/// assert_eq!(toc.entries.len(), 1);
/// assert!(toc.raw_text.contains("# TOC"));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedToc {
    /// Successfully parsed entries; may be a strict subset of all lines.
    pub entries: Vec<TocEntry>,
    /// Full body text after lossy UTF-8 decoding, kept for diagnostic use
    /// when parsing is partial. Valid UTF-8 input is preserved unchanged;
    /// invalid byte sequences are replaced with the Unicode replacement
    /// character (`U+FFFD`).
    pub raw_text: String,
}
74
75// ---------------------------------------------------------------------------
76// Compiled-once regex patterns
77// ---------------------------------------------------------------------------
78
79/// Format 1: `filename.tar.gz   1234567 bytes   parts 1-8`
80/// Filename is the first whitespace-delimited token; size and "parts N-M" can
81/// appear in either order after it.
82fn re_format1() -> &'static Regex {
83    static RE: OnceLock<Regex> = OnceLock::new();
84    RE.get_or_init(|| {
85        // Captures: 1=filename, rest parsed manually for size/parts.
86        Regex::new(r"(?i)^([^ \t\r\n]+)[ \t]+(.+)$").unwrap()
87    })
88}
89
90/// Format 3 prefix: `01-08  filename.tar.gz  1234 KB`
91/// The line starts with a zero-padded (or plain) part range.
92fn re_format3_prefix() -> &'static Regex {
93    static RE: OnceLock<Regex> = OnceLock::new();
94    RE.get_or_init(|| {
95        Regex::new(r"^([0-9]{1,6})-([0-9]{1,6})[ \t]+([^ \t\r\n]+)[ \t]*(.*)$").unwrap()
96    })
97}
98
99/// Format 2: `filename.tar.gz (1234567 bytes)` — parenthesised size.
100fn re_format2() -> &'static Regex {
101    static RE: OnceLock<Regex> = OnceLock::new();
102    RE.get_or_init(|| {
103        Regex::new(r"(?i)^([^ \t\r\n]+)[ \t]*\([ \t]*([0-9]+)[ \t]*(bytes?|b|kb|mb)[ \t]*\)[ \t]*$")
104            .unwrap()
105    })
106}
107
108/// Size token: one or more digits followed by a unit.
109fn re_size_token() -> &'static Regex {
110    static RE: OnceLock<Regex> = OnceLock::new();
111    RE.get_or_init(|| Regex::new(r"(?i)\b([0-9]+)[ \t]*(bytes?|b|kb|mb)\b").unwrap())
112}
113
114/// "parts N-M" token.
115fn re_parts_token() -> &'static Regex {
116    static RE: OnceLock<Regex> = OnceLock::new();
117    RE.get_or_init(|| Regex::new(r"(?i)\bparts?[ \t]+([0-9]{1,6})-([0-9]{1,6})\b").unwrap())
118}
119
120// ---------------------------------------------------------------------------
121// Helpers
122// ---------------------------------------------------------------------------
123
/// Scale a numeric size by its unit and return the result in bytes.
///
/// The unit is lower-cased and any trailing `s` characters are stripped
/// before comparison, so `"Bytes"`, `"byte"`, `"B"`, `"KB"` and `"mb"` are
/// all recognised. KB and MB use binary multiples (1 024 and 1 048 576).
///
/// Returns `None` when the unit is not recognised or when the scaled value
/// does not fit in a `u64`.
fn parse_size(digits: u64, unit: &str) -> Option<u64> {
    let normalised = unit.to_lowercase();
    let bytes_per_unit: u64 = match normalised.trim_end_matches('s') {
        "byte" | "b" => 1,
        "kb" => 1024,
        "mb" => 1024 * 1024,
        _ => return None,
    };
    digits.checked_mul(bytes_per_unit)
}
134
135/// Try to extract a size value from an arbitrary text fragment.
136fn extract_size(text: &str) -> Option<u64> {
137    let caps = re_size_token().captures(text)?;
138    let digits: u64 = caps[1].parse().ok()?;
139    parse_size(digits, &caps[2])
140}
141
142/// Try to extract a part range from an arbitrary text fragment.
143fn extract_parts(text: &str) -> Option<RangeInclusive<u32>> {
144    let caps = re_parts_token().captures(text)?;
145    let lo: u32 = caps[1].parse().ok()?;
146    let hi: u32 = caps[2].parse().ok()?;
147    if lo <= hi {
148        Some(lo..=hi)
149    } else {
150        None
151    }
152}
153
154/// Attempt to parse a single TOC line into a [`TocEntry`].
155///
156/// Returns `None` if the line doesn't look like a TOC entry at all.
157fn parse_line(line: &str) -> Option<TocEntry> {
158    let line = line.trim();
159
160    // Skip blank lines and comments.
161    if line.is_empty() || line.starts_with('#') {
162        return None;
163    }
164
165    // Format 3: line starts with a part range, e.g. "01-08  filename  1234 KB"
166    if let Some(caps) = re_format3_prefix().captures(line) {
167        let lo: u32 = caps[1].parse().ok()?;
168        let hi: u32 = caps[2].parse().ok()?;
169        let filename = caps[3].to_string();
170        let remainder = &caps[4];
171        let size_bytes = extract_size(remainder);
172        // Validate the range is sensible and the filename looks real.
173        if lo > hi || !looks_like_filename(&filename) {
174            return None;
175        }
176        return Some(TocEntry {
177            filename,
178            size_bytes,
179            parts: Some(lo..=hi),
180        });
181    }
182
183    // Format 2: "filename (1234567 bytes)"
184    if let Some(caps) = re_format2().captures(line) {
185        let filename = caps[1].to_string();
186        if !looks_like_filename(&filename) {
187            return None;
188        }
189        let digits: u64 = caps[2].parse().ok()?;
190        let size_bytes = parse_size(digits, &caps[3]);
191        return Some(TocEntry {
192            filename,
193            size_bytes,
194            parts: None,
195        });
196    }
197
198    // Format 1 (and fallback): "filename  size_with_unit  [parts N-M]"
199    // The filename is the first non-whitespace token; the rest is parsed for
200    // size and parts tokens in any order.
201    if let Some(caps) = re_format1().captures(line) {
202        let filename = caps[1].to_string();
203        let remainder = &caps[2];
204
205        if !looks_like_filename(&filename) {
206            return None;
207        }
208
209        let size_bytes = extract_size(remainder);
210        let parts = extract_parts(remainder);
211
212        // A line only qualifies if it yields at least a size or a parts range
213        // — otherwise almost any two-token line would be accepted.
214        if size_bytes.is_none() && parts.is_none() {
215            return None;
216        }
217
218        return Some(TocEntry {
219            filename,
220            size_bytes,
221            parts,
222        });
223    }
224
225    None
226}
227
/// Heuristic filter that keeps bare words such as `"garbage"` or `"just"`
/// from being mistaken for filenames.
///
/// A token passes when either:
/// - it contains a path separator (`/` or `\`), or
/// - it contains a `.` and the text after the *last* `.` holds at least one
///   alphabetic character. This accepts `"file.bin"`, `"1.txt"`,
///   `"archive.tar.gz"` and `"日本語.tar.gz"` while rejecting plain decimals
///   like `"1.5"` or `"3.14"` and tokens ending in a dot.
///
/// **Known limitation**: tokens with a digits-only extension (`"foo.123"`)
/// are rejected on purpose; they are more likely numeric tokens than
/// filenames in UU archive listings.
fn looks_like_filename(s: &str) -> bool {
    let has_separator = s.contains(['/', '\\']);
    has_separator
        || s.rsplit_once('.')
            .is_some_and(|(_, extension)| extension.chars().any(char::is_alphabetic))
}
251
252// ---------------------------------------------------------------------------
253// Public API
254// ---------------------------------------------------------------------------
255
256/// Best-effort parse of a UUencode multi-part TOC body.
257///
258/// The input is treated as a sequence of lines. Each line is independently
259/// attempted against three recognised TOC formats (in priority order):
260///
261/// 1. **Part-range prefix**: `01-08  filename.tar.gz  1234 KB`
262/// 2. **Parenthesised size**: `filename.tar.gz (1234567 bytes)`
263/// 3. **Inline size/parts**: `filename.tar.gz   1234567 bytes   parts 1-8`
264///
265/// Lines that match none of these formats (comments starting with `#`, blank
266/// lines, plain prose) are silently ignored. This makes the parser tolerant
267/// of the varied free-form headers that real TOC posts contain.
268///
269/// # Return value
270///
271/// Returns `None` if no lines at all parse as TOC entries.
272/// Returns `Some(ParsedToc)` with partial `entries` if at least one line
273/// parses; unparseable lines are omitted without error.
274///
275/// # Format notes
276///
277/// - Size units `bytes`/`b`, `KB`, and `MB` are recognised case-insensitively.
278///   KB and MB are converted to bytes using powers of 1 024.
279/// - Part ranges use an inclusive dash notation (`N-M`). Inverted ranges
280///   where `N > M` are rejected and produce `None` for that field.
281/// - Non-UTF-8 bytes in `body_bytes` are replaced via lossy conversion and
282///   preserved verbatim in [`ParsedToc::raw_text`].
283///
284/// # Never panics
285///
286/// This function never panics on any input, including empty slices and
287/// byte sequences that are not valid UTF-8.
288///
289/// # Examples
290///
291/// ```
292/// use uuencoding_multi::parse_toc;
293///
294/// let body = b"archive.tar.gz   1234567 bytes   parts 1-8\n";
295/// let toc = parse_toc(body).expect("should parse");
296/// assert_eq!(toc.entries.len(), 1);
297/// assert_eq!(toc.entries[0].filename, "archive.tar.gz");
298/// assert_eq!(toc.entries[0].size_bytes, Some(1_234_567));
299/// assert_eq!(toc.entries[0].parts, Some(1..=8));
300/// ```
301///
302/// ```
303/// use uuencoding_multi::parse_toc;
304///
305/// // Body with no recognisable TOC lines → None.
306/// assert!(parse_toc(b"just plain text\n").is_none());
307/// ```
308///
309/// ```
310/// use uuencoding_multi::parse_toc;
311///
312/// // Empty input → None.
313/// assert!(parse_toc(b"").is_none());
314/// ```
315pub fn parse_toc(body_bytes: &[u8]) -> Option<ParsedToc> {
316    let raw_text = String::from_utf8_lossy(body_bytes).into_owned();
317
318    let entries: Vec<TocEntry> = raw_text.lines().filter_map(parse_line).collect();
319
320    if entries.is_empty() {
321        None
322    } else {
323        Some(ParsedToc { entries, raw_text })
324    }
325}
326
327// ---------------------------------------------------------------------------
328// Tests
329// ---------------------------------------------------------------------------
330
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Full TOC mixing inline entries (with and without a parts range) and
    // a parenthesised-size entry
    // ------------------------------------------------------------------

    #[test]
    fn full_toc_three_formats() {
        let body = b"# TOC\nfilename.tar.gz   1234567 bytes   parts 1-8\nother.zip   512 KB\nsome.bin (99 bytes)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 3);

        let e0 = &toc.entries[0];
        assert_eq!(e0.filename, "filename.tar.gz");
        assert_eq!(e0.size_bytes, Some(1234567));
        assert_eq!(e0.parts, Some(1..=8));

        let e1 = &toc.entries[1];
        assert_eq!(e1.filename, "other.zip");
        assert_eq!(e1.size_bytes, Some(512 * 1024));
        assert_eq!(e1.parts, None);

        let e2 = &toc.entries[2];
        assert_eq!(e2.filename, "some.bin");
        assert_eq!(e2.size_bytes, Some(99));
        assert_eq!(e2.parts, None);
    }

    // ------------------------------------------------------------------
    // Unparseable lines mixed in — no panic, still get 1 entry
    // ------------------------------------------------------------------

    #[test]
    fn garbage_lines_mixed_in() {
        let body = b"garbage\nfile.txt   100 bytes\ngibberish here\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Not a TOC → None
    // ------------------------------------------------------------------

    #[test]
    fn not_a_toc_returns_none() {
        let body = b"just plain text body\nno entries at all\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // UTF-8 filename
    // ------------------------------------------------------------------

    #[test]
    fn utf8_filename_no_panic() {
        let body = "日本語.tar.gz   100 bytes\n".as_bytes();
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "日本語.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part range formats
    // ------------------------------------------------------------------

    #[test]
    fn parts_token_format1() {
        let body = b"file.tar.gz   100 bytes   parts 2-5\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
    }

    #[test]
    fn parts_prefix_format3() {
        let body = b"02-05  file.tar.gz  100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Size unit parsing
    // ------------------------------------------------------------------

    #[test]
    fn size_kb() {
        let body = b"archive.zip   1 KB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024));
    }

    #[test]
    fn size_mb() {
        let body = b"archive.zip   2 MB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
    }

    #[test]
    fn size_bare_b_unit() {
        let body = b"file.bin   512 B\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(512));
    }

    // ------------------------------------------------------------------
    // Non-UTF-8 input — must not panic, and valid lines still parse
    // ------------------------------------------------------------------

    #[test]
    fn non_utf8_no_panic() {
        // Embed an invalid UTF-8 sequence followed by a valid TOC line.
        let mut body = vec![0xFF, 0xFE, b'\n'];
        body.extend_from_slice(b"file.tar.gz   100 bytes\n");
        // Lossy decoding turns the invalid bytes into U+FFFD on their own
        // line, which is skipped; the valid line must still be parsed.
        let toc = parse_toc(&body).expect("valid line should still parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert!(toc.raw_text.contains('\u{FFFD}'));
    }

    // ------------------------------------------------------------------
    // Comment-only body → None
    // ------------------------------------------------------------------

    #[test]
    fn comment_only_returns_none() {
        let body = b"# just a comment\n# another comment\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_input_returns_none() {
        assert!(parse_toc(b"").is_none());
    }

    // ------------------------------------------------------------------
    // raw_text keeps the whole body, including skipped lines
    // ------------------------------------------------------------------

    #[test]
    fn raw_text_preserved() {
        let body = b"# TOC\nfile.tar.gz   100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert!(toc.raw_text.contains("# TOC"));
        assert!(toc.raw_text.contains("file.tar.gz"));
    }

    // ------------------------------------------------------------------
    // Format 2 parenthesised size — various units
    // ------------------------------------------------------------------

    #[test]
    fn format2_kb() {
        let body = b"file.tar.gz (1024 KB)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024 * 1024));
    }

    // ------------------------------------------------------------------
    // Part range where lo > hi is ignored (invalid range)
    // ------------------------------------------------------------------

    #[test]
    fn inverted_parts_range_format1_ignored() {
        // "parts 8-1" is nonsensical — should still parse the entry but
        // produce no parts range.
        let body = b"file.tar.gz   100 bytes   parts 8-1\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, None);
    }

    #[test]
    fn inverted_prefix_range_format3_skips_line() {
        // A line that starts with an inverted range prefix is rejected as a
        // whole, so a body consisting only of that line is not a TOC.
        let body = b"08-01  file.tar.gz  100 bytes\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // Plural "parts" keyword
    // ------------------------------------------------------------------

    #[test]
    fn plural_parts_keyword() {
        let body = b"file.tar.gz   100 bytes   parts 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    // ------------------------------------------------------------------
    // Singular "part" keyword
    // ------------------------------------------------------------------

    #[test]
    fn singular_part_keyword() {
        let body = b"file.tar.gz   100 bytes   part 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    // ------------------------------------------------------------------
    // looks_like_filename — path separator branch
    // ------------------------------------------------------------------

    /// Strings containing '/' or '\' are unconditionally accepted as filenames
    /// regardless of whether they have an extension, because a path separator
    /// is strong evidence of an actual path.
    #[test]
    fn looks_like_filename_path_separator() {
        assert!(looks_like_filename("some/path/file.rar"));
        assert!(looks_like_filename("/absolute/path"));
        // Backslash path (Windows-style) is also accepted.
        assert!(looks_like_filename("some\\path\\file.rar"));
    }

    // ------------------------------------------------------------------
    // looks_like_filename — decimal numbers are not filenames (592.9)
    // ------------------------------------------------------------------

    /// Bare decimal numbers like "1.5" or "3.14" must not be accepted as
    /// filenames; their extension is digits-only.
    #[test]
    fn decimal_number_is_not_a_filename() {
        assert!(
            !looks_like_filename("1.5"),
            "\"1.5\" must not be treated as a filename"
        );
        assert!(
            !looks_like_filename("3.14"),
            "\"3.14\" must not be treated as a filename"
        );
        // Sanity-check: real filenames still pass.
        assert!(looks_like_filename("file.bin"));
        assert!(looks_like_filename("archive.tar.gz"));
        assert!(looks_like_filename("1.txt"));
    }
}