uuencoding_multi/
toc.rs

1//! Parser for multi-part UUencode table-of-contents (TOC) bodies.
2//!
3//! TOC posts (typically numbered part 0) list the files contained in a
4//! multi-part series, along with optional size information and part ranges.
5//! This module provides a best-effort parser that tolerates varied formatting
6//! and non-UTF-8 byte sequences.
7
8use std::ops::RangeInclusive;
9use std::sync::OnceLock;
10
11use regex::Regex;
12
13// ---------------------------------------------------------------------------
14// Public types
15// ---------------------------------------------------------------------------
16
/// A single file listed in a table-of-contents post.
///
/// Each entry names one file of a multi-part series. TOC formats differ in
/// how much they state, so `size_bytes` and `parts` are `None` whenever the
/// line carried no usable value for them.
///
/// The type is a plain value: it can be cloned, compared for equality and
/// used as a key in hashed collections.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"archive.tar.gz   1234567 bytes   parts 1-8\n";
/// let toc = parse_toc(body).unwrap();
/// let entry = &toc.entries[0];
/// assert_eq!(entry.filename, "archive.tar.gz");
/// assert_eq!(entry.size_bytes, Some(1_234_567));
/// assert_eq!(entry.parts, Some(1..=8));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TocEntry {
    /// Filename exactly as written on the TOC line.
    pub filename: String,
    /// Declared file size in bytes, if one was given. KB and MB figures are
    /// converted to bytes (1 KB = 1 024 bytes, 1 MB = 1 048 576 bytes).
    pub size_bytes: Option<u64>,
    /// Parts that carry this file, if a range was given. The range is
    /// inclusive on both ends (`lo..=hi`). An inverted range (`lo > hi`) is
    /// never stored in this field.
    pub parts: Option<RangeInclusive<u32>>,
}
47
48/// Result of parsing a TOC body.
49///
50/// `entries` may be a strict subset of the lines in the body: lines that do
51/// not look like TOC entries (plain text, comments, blank lines) are silently
52/// skipped. Inspect `raw_text` for the original body when debugging partial
53/// results.
54///
55/// # Example
56///
57/// ```
58/// use uuencoding_multi::parse_toc;
59///
60/// let body = b"# TOC\nfile.bin (512 bytes)\n";
61/// let toc = parse_toc(body).unwrap();
62/// assert_eq!(toc.entries.len(), 1);
63/// assert!(toc.raw_text.contains("# TOC"));
64/// ```
65#[derive(Debug)]
66pub struct ParsedToc {
67    /// Successfully parsed entries; may be a strict subset of all lines.
68    pub entries: Vec<TocEntry>,
69    /// Full body text kept verbatim for diagnostic use when parsing is partial.
70    /// Non-UTF-8 bytes are replaced with the Unicode replacement character
71    /// (`U+FFFD`) via lossy conversion.
72    pub raw_text: String,
73}
74
75// ---------------------------------------------------------------------------
76// Compiled-once regex patterns
77// ---------------------------------------------------------------------------
78
79/// Format 1: `filename.tar.gz   1234567 bytes   parts 1-8`
80/// Filename is the first whitespace-delimited token; size and "parts N-M" can
81/// appear in either order after it.
82fn re_format1() -> &'static Regex {
83    static RE: OnceLock<Regex> = OnceLock::new();
84    RE.get_or_init(|| {
85        // Captures: 1=filename, rest parsed manually for size/parts.
86        Regex::new(r"(?i)^(\S+)\s+(.+)$").unwrap()
87    })
88}
89
90/// Format 3 prefix: `01-08  filename.tar.gz  1234 KB`
91/// The line starts with a zero-padded (or plain) part range.
92fn re_format3_prefix() -> &'static Regex {
93    static RE: OnceLock<Regex> = OnceLock::new();
94    RE.get_or_init(|| Regex::new(r"^(\d{1,6})-(\d{1,6})\s+(\S+)\s*(.*)$").unwrap())
95}
96
97/// Format 2: `filename.tar.gz (1234567 bytes)` — parenthesised size.
98fn re_format2() -> &'static Regex {
99    static RE: OnceLock<Regex> = OnceLock::new();
100    RE.get_or_init(|| Regex::new(r"(?i)^(\S+)\s*\(\s*(\d+)\s*(bytes?|b|kb|mb)\s*\)\s*$").unwrap())
101}
102
103/// Size token: one or more digits followed by a unit.
104fn re_size_token() -> &'static Regex {
105    static RE: OnceLock<Regex> = OnceLock::new();
106    RE.get_or_init(|| Regex::new(r"(?i)\b(\d+)\s*(bytes?|b|kb|mb)\b").unwrap())
107}
108
109/// "parts N-M" token.
110fn re_parts_token() -> &'static Regex {
111    static RE: OnceLock<Regex> = OnceLock::new();
112    RE.get_or_init(|| Regex::new(r"(?i)\bparts?\s+(\d{1,6})-(\d{1,6})\b").unwrap())
113}
114
115// ---------------------------------------------------------------------------
116// Helpers
117// ---------------------------------------------------------------------------
118
/// Scale a numeric size by its unit to obtain a byte count.
///
/// `unit` is lower-cased and any trailing `s` characters are removed before
/// it is looked up, so `"Bytes"`, `"byte"` and `"B"` all mean plain bytes,
/// `"kb"` means 1 024 bytes and `"mb"` means 1 048 576 bytes.
///
/// Returns `None` when the unit is not one of those, or when the scaled
/// value does not fit in a `u64`.
fn parse_size(digits: u64, unit: &str) -> Option<u64> {
    let normalized = unit.to_lowercase();
    let bytes_per_unit: u64 = match normalized.trim_end_matches('s') {
        "byte" | "b" => 1,
        "kb" => 1024,
        "mb" => 1024 * 1024,
        _ => return None,
    };
    // Multiplying by 1 never overflows, so plain bytes always succeed.
    digits.checked_mul(bytes_per_unit)
}
129
130/// Try to extract a size value from an arbitrary text fragment.
131fn extract_size(text: &str) -> Option<u64> {
132    let caps = re_size_token().captures(text)?;
133    let digits: u64 = caps[1].parse().ok()?;
134    parse_size(digits, &caps[2])
135}
136
137/// Try to extract a part range from an arbitrary text fragment.
138fn extract_parts(text: &str) -> Option<RangeInclusive<u32>> {
139    let caps = re_parts_token().captures(text)?;
140    let lo: u32 = caps[1].parse().ok()?;
141    let hi: u32 = caps[2].parse().ok()?;
142    if lo <= hi {
143        Some(lo..=hi)
144    } else {
145        None
146    }
147}
148
149/// Attempt to parse a single TOC line into a [`TocEntry`].
150///
151/// Returns `None` if the line doesn't look like a TOC entry at all.
152fn parse_line(line: &str) -> Option<TocEntry> {
153    let line = line.trim();
154
155    // Skip blank lines and comments.
156    if line.is_empty() || line.starts_with('#') {
157        return None;
158    }
159
160    // Format 3: line starts with a part range, e.g. "01-08  filename  1234 KB"
161    if let Some(caps) = re_format3_prefix().captures(line) {
162        let lo: u32 = caps[1].parse().ok()?;
163        let hi: u32 = caps[2].parse().ok()?;
164        let filename = caps[3].to_string();
165        let remainder = &caps[4];
166        let size_bytes = extract_size(remainder);
167        // Validate the range is sensible and the filename looks real.
168        if lo > hi || !looks_like_filename(&filename) {
169            return None;
170        }
171        return Some(TocEntry {
172            filename,
173            size_bytes,
174            parts: Some(lo..=hi),
175        });
176    }
177
178    // Format 2: "filename (1234567 bytes)"
179    if let Some(caps) = re_format2().captures(line) {
180        let filename = caps[1].to_string();
181        if !looks_like_filename(&filename) {
182            return None;
183        }
184        let digits: u64 = caps[2].parse().ok()?;
185        let size_bytes = parse_size(digits, &caps[3]);
186        return Some(TocEntry {
187            filename,
188            size_bytes,
189            parts: None,
190        });
191    }
192
193    // Format 1 (and fallback): "filename  size_with_unit  [parts N-M]"
194    // The filename is the first non-whitespace token; the rest is parsed for
195    // size and parts tokens in any order.
196    if let Some(caps) = re_format1().captures(line) {
197        let filename = caps[1].to_string();
198        let remainder = &caps[2];
199
200        if !looks_like_filename(&filename) {
201            return None;
202        }
203
204        let size_bytes = extract_size(remainder);
205        let parts = extract_parts(remainder);
206
207        // A line only qualifies if it yields at least a size or a parts range
208        // — otherwise almost any two-token line would be accepted.
209        if size_bytes.is_none() && parts.is_none() {
210            return None;
211        }
212
213        return Some(TocEntry {
214            filename,
215            size_bytes,
216            parts,
217        });
218    }
219
220    None
221}
222
/// Heuristic filter that keeps prose words such as "garbage" or "just" from
/// being accepted as filenames.
///
/// A token passes when it contains at least one `.`, `/` or `\` anywhere.
/// This is a deliberately loose signal (an extension dot or a path
/// separator) rather than a real filename validation.
fn looks_like_filename(s: &str) -> bool {
    s.chars().any(|ch| matches!(ch, '.' | '/' | '\\'))
}
230
231// ---------------------------------------------------------------------------
232// Public API
233// ---------------------------------------------------------------------------
234
235/// Best-effort parse of a UUencode multi-part TOC body.
236///
237/// The input is treated as a sequence of lines. Each line is independently
238/// attempted against three recognised TOC formats (in priority order):
239///
240/// 1. **Part-range prefix**: `01-08  filename.tar.gz  1234 KB`
241/// 2. **Parenthesised size**: `filename.tar.gz (1234567 bytes)`
242/// 3. **Inline size/parts**: `filename.tar.gz   1234567 bytes   parts 1-8`
243///
244/// Lines that match none of these formats (comments starting with `#`, blank
245/// lines, plain prose) are silently ignored. This makes the parser tolerant
246/// of the varied free-form headers that real TOC posts contain.
247///
248/// # Return value
249///
250/// Returns `None` if no lines at all parse as TOC entries.
251/// Returns `Some(ParsedToc)` with partial `entries` if at least one line
252/// parses; unparseable lines are omitted without error.
253///
254/// # Format notes
255///
256/// - Size units `bytes`/`b`, `KB`, and `MB` are recognised case-insensitively.
257///   KB and MB are converted to bytes using powers of 1 024.
258/// - Part ranges use an inclusive dash notation (`N-M`). Inverted ranges
259///   where `N > M` are rejected and produce `None` for that field.
260/// - Non-UTF-8 bytes in `body_bytes` are replaced via lossy conversion and
261///   preserved verbatim in [`ParsedToc::raw_text`].
262///
263/// # Never panics
264///
265/// This function never panics on any input, including empty slices and
266/// byte sequences that are not valid UTF-8.
267///
268/// # Examples
269///
270/// ```
271/// use uuencoding_multi::parse_toc;
272///
273/// let body = b"archive.tar.gz   1234567 bytes   parts 1-8\n";
274/// let toc = parse_toc(body).expect("should parse");
275/// assert_eq!(toc.entries.len(), 1);
276/// assert_eq!(toc.entries[0].filename, "archive.tar.gz");
277/// assert_eq!(toc.entries[0].size_bytes, Some(1_234_567));
278/// assert_eq!(toc.entries[0].parts, Some(1..=8));
279/// ```
280///
281/// ```
282/// use uuencoding_multi::parse_toc;
283///
284/// // Body with no recognisable TOC lines → None.
285/// assert!(parse_toc(b"just plain text\n").is_none());
286/// ```
287///
288/// ```
289/// use uuencoding_multi::parse_toc;
290///
291/// // Empty input → None.
292/// assert!(parse_toc(b"").is_none());
293/// ```
294pub fn parse_toc(body_bytes: &[u8]) -> Option<ParsedToc> {
295    let raw_text = String::from_utf8_lossy(body_bytes).into_owned();
296
297    let entries: Vec<TocEntry> = raw_text.lines().filter_map(parse_line).collect();
298
299    if entries.is_empty() {
300        None
301    } else {
302        Some(ParsedToc { entries, raw_text })
303    }
304}
305
306// ---------------------------------------------------------------------------
307// Tests
308// ---------------------------------------------------------------------------
309
#[cfg(test)]
mod tests {
    use super::*;

    /// A body mixing a comment, two inline-layout lines and one
    /// parenthesised-size line yields one entry per non-comment line.
    #[test]
    fn full_toc_three_formats() {
        let body = b"# TOC\nfilename.tar.gz   1234567 bytes   parts 1-8\nother.zip   512 KB\nsome.bin (99 bytes)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 3);

        let e0 = &toc.entries[0];
        assert_eq!(e0.filename, "filename.tar.gz");
        assert_eq!(e0.size_bytes, Some(1234567));
        assert_eq!(e0.parts, Some(1..=8));

        let e1 = &toc.entries[1];
        assert_eq!(e1.filename, "other.zip");
        assert_eq!(e1.size_bytes, Some(512 * 1024));
        assert_eq!(e1.parts, None);

        let e2 = &toc.entries[2];
        assert_eq!(e2.filename, "some.bin");
        assert_eq!(e2.size_bytes, Some(99));
        assert_eq!(e2.parts, None);
    }

    /// Unparseable lines around a valid one are skipped; the valid line
    /// still produces its entry.
    #[test]
    fn garbage_lines_mixed_in() {
        let body = b"garbage\nfile.txt   100 bytes\ngibberish here\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    /// Prose without any entry-like line is not a TOC.
    #[test]
    fn not_a_toc_returns_none() {
        let body = b"just plain text body\nno entries at all\n";
        assert!(parse_toc(body).is_none());
    }

    /// Multi-byte UTF-8 filenames are carried through unchanged.
    #[test]
    fn utf8_filename_no_panic() {
        let body = "日本語.tar.gz   100 bytes\n".as_bytes();
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "日本語.tar.gz");
    }

    /// Inline layout: a "parts N-M" token after the size.
    #[test]
    fn parts_token_format1() {
        let body = b"file.tar.gz   100 bytes   parts 2-5\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
    }

    /// Prefix layout: a zero-padded range before the filename.
    #[test]
    fn parts_prefix_format3() {
        let body = b"02-05  file.tar.gz  100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    /// KB is converted with a factor of 1 024.
    #[test]
    fn size_kb() {
        let body = b"archive.zip   1 KB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024));
    }

    /// MB is converted with a factor of 1 024 * 1 024.
    #[test]
    fn size_mb() {
        let body = b"archive.zip   2 MB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
    }

    /// A bare `B` unit means bytes.
    #[test]
    fn size_bare_b_unit() {
        let body = b"file.bin   512 B\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(512));
    }

    /// Invalid UTF-8 must not panic. Lossy conversion is deterministic, so
    /// the outcome is pinned: the junk line becomes replacement characters
    /// and is skipped, while the valid line after it still parses.
    #[test]
    fn non_utf8_no_panic() {
        let mut body = vec![0xFF, 0xFE, b'\n'];
        body.extend_from_slice(b"file.tar.gz   100 bytes\n");

        let toc = parse_toc(&body).expect("valid line after the junk bytes should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert!(toc.raw_text.contains('\u{FFFD}'));
    }

    /// A body made only of comments has no entries.
    #[test]
    fn comment_only_returns_none() {
        let body = b"# just a comment\n# another comment\n";
        assert!(parse_toc(body).is_none());
    }

    /// Empty input has no entries.
    #[test]
    fn empty_input_returns_none() {
        assert!(parse_toc(b"").is_none());
    }

    /// `raw_text` keeps skipped lines as well as parsed ones.
    #[test]
    fn raw_text_preserved() {
        let body = b"# TOC\nfile.tar.gz   100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert!(toc.raw_text.contains("# TOC"));
        assert!(toc.raw_text.contains("file.tar.gz"));
    }

    /// Parenthesised-size layout with a KB unit.
    #[test]
    fn format2_kb() {
        let body = b"file.tar.gz (1024 KB)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024 * 1024));
    }

    /// Inline layout: "parts 8-1" is nonsensical, so the entry is kept (it
    /// has a size) but carries no parts range.
    #[test]
    fn inverted_parts_range_format1_ignored() {
        let body = b"file.tar.gz   100 bytes   parts 8-1\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert_eq!(toc.entries[0].parts, None);
    }

    /// The plural keyword "parts" is recognised.
    #[test]
    fn plural_parts_keyword() {
        let body = b"file.tar.gz   100 bytes   parts 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    /// The singular keyword "part" is recognised.
    #[test]
    fn singular_part_keyword() {
        let body = b"file.tar.gz   100 bytes   part 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }
}
uuencoding_multi/toc.rs

uuencoding_multi/
toc.rs