uuencoding_multi/toc.rs
1//! Parser for multi-part UUencode table-of-contents (TOC) bodies.
2//!
3//! TOC posts (typically numbered part 0) list the files contained in a
4//! multi-part series, along with optional size information and part ranges.
5//! This module provides a best-effort parser that tolerates varied formatting
6//! and non-UTF-8 byte sequences.
7
8use std::ops::RangeInclusive;
9use std::sync::OnceLock;
10
11use regex::Regex;
12
13// ---------------------------------------------------------------------------
14// Public types
15// ---------------------------------------------------------------------------
16
/// One entry in a table-of-contents post.
///
/// Each entry describes a single file of a multi-part series. TOC formats
/// differ in how much they say about a file, so `size_bytes` and `parts` are
/// `None` whenever the line did not yield a usable value for them.
///
/// The type implements `Clone` and `Eq` (in addition to `Debug` and
/// `PartialEq`) so that callers can duplicate and compare parse results.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"archive.tar.gz 1234567 bytes parts 1-8\n";
/// let toc = parse_toc(body).unwrap();
/// let entry = &toc.entries[0];
/// assert_eq!(entry.filename, "archive.tar.gz");
/// assert_eq!(entry.size_bytes, Some(1_234_567));
/// assert_eq!(entry.parts, Some(1..=8));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TocEntry {
    /// Filename exactly as it appears in the TOC line.
    pub filename: String,
    /// Declared file size in bytes, if present. KB and MB values are
    /// converted to bytes (1 KB = 1 024 bytes, 1 MB = 1 048 576 bytes).
    pub size_bytes: Option<u64>,
    /// Which parts carry this file, if a range was specified. The range is
    /// inclusive on both ends (`lo..=hi`). An inverted range (`lo > hi`) is
    /// never stored here.
    pub parts: Option<RangeInclusive<u32>>,
}
47
48/// Result of parsing a TOC body.
49///
50/// `entries` may be a strict subset of the lines in the body: lines that do
51/// not look like TOC entries (plain text, comments, blank lines) are silently
52/// skipped. Inspect `raw_text` for the original body when debugging partial
53/// results.
54///
55/// # Example
56///
57/// ```
58/// use uuencoding_multi::parse_toc;
59///
60/// let body = b"# TOC\nfile.bin (512 bytes)\n";
61/// let toc = parse_toc(body).unwrap();
62/// assert_eq!(toc.entries.len(), 1);
63/// assert!(toc.raw_text.contains("# TOC"));
64/// ```
65#[derive(Debug)]
66pub struct ParsedToc {
67 /// Successfully parsed entries; may be a strict subset of all lines.
68 pub entries: Vec<TocEntry>,
69 /// Full body text kept verbatim for diagnostic use when parsing is partial.
70 /// Non-UTF-8 bytes are replaced with the Unicode replacement character
71 /// (`U+FFFD`) via lossy conversion.
72 pub raw_text: String,
73}
74
75// ---------------------------------------------------------------------------
76// Compiled-once regex patterns
77// ---------------------------------------------------------------------------
78
79/// Format 1: `filename.tar.gz 1234567 bytes parts 1-8`
80/// Filename is the first whitespace-delimited token; size and "parts N-M" can
81/// appear in either order after it.
82fn re_format1() -> &'static Regex {
83 static RE: OnceLock<Regex> = OnceLock::new();
84 RE.get_or_init(|| {
85 // Captures: 1=filename, rest parsed manually for size/parts.
86 Regex::new(r"(?i)^(\S+)\s+(.+)$").unwrap()
87 })
88}
89
90/// Format 3 prefix: `01-08 filename.tar.gz 1234 KB`
91/// The line starts with a zero-padded (or plain) part range.
92fn re_format3_prefix() -> &'static Regex {
93 static RE: OnceLock<Regex> = OnceLock::new();
94 RE.get_or_init(|| Regex::new(r"^(\d{1,6})-(\d{1,6})\s+(\S+)\s*(.*)$").unwrap())
95}
96
97/// Format 2: `filename.tar.gz (1234567 bytes)` — parenthesised size.
98fn re_format2() -> &'static Regex {
99 static RE: OnceLock<Regex> = OnceLock::new();
100 RE.get_or_init(|| Regex::new(r"(?i)^(\S+)\s*\(\s*(\d+)\s*(bytes?|b|kb|mb)\s*\)\s*$").unwrap())
101}
102
103/// Size token: one or more digits followed by a unit.
104fn re_size_token() -> &'static Regex {
105 static RE: OnceLock<Regex> = OnceLock::new();
106 RE.get_or_init(|| Regex::new(r"(?i)\b(\d+)\s*(bytes?|b|kb|mb)\b").unwrap())
107}
108
109/// "parts N-M" token.
110fn re_parts_token() -> &'static Regex {
111 static RE: OnceLock<Regex> = OnceLock::new();
112 RE.get_or_init(|| Regex::new(r"(?i)\bparts?\s+(\d{1,6})-(\d{1,6})\b").unwrap())
113}
114
115// ---------------------------------------------------------------------------
116// Helpers
117// ---------------------------------------------------------------------------
118
/// Turn a numeric value and its unit label into a byte count.
///
/// The unit is lower-cased and any trailing `s` characters are dropped before
/// it is compared, so `"Bytes"`, `"byte"` and `"B"` all mean plain bytes.
/// `kb` multiplies by 1 024 and `mb` by 1 048 576.
///
/// Returns `None` when the unit is not recognised or when the multiplication
/// would overflow `u64`.
fn parse_size(digits: u64, unit: &str) -> Option<u64> {
    let lowered = unit.to_lowercase();
    let multiplier: u64 = match lowered.trim_end_matches('s') {
        "byte" | "b" => 1,
        "kb" => 1024,
        "mb" => 1024 * 1024,
        _ => return None,
    };
    digits.checked_mul(multiplier)
}
129
130/// Try to extract a size value from an arbitrary text fragment.
131fn extract_size(text: &str) -> Option<u64> {
132 let caps = re_size_token().captures(text)?;
133 let digits: u64 = caps[1].parse().ok()?;
134 parse_size(digits, &caps[2])
135}
136
137/// Try to extract a part range from an arbitrary text fragment.
138fn extract_parts(text: &str) -> Option<RangeInclusive<u32>> {
139 let caps = re_parts_token().captures(text)?;
140 let lo: u32 = caps[1].parse().ok()?;
141 let hi: u32 = caps[2].parse().ok()?;
142 if lo <= hi {
143 Some(lo..=hi)
144 } else {
145 None
146 }
147}
148
149/// Attempt to parse a single TOC line into a [`TocEntry`].
150///
151/// Returns `None` if the line doesn't look like a TOC entry at all.
152fn parse_line(line: &str) -> Option<TocEntry> {
153 let line = line.trim();
154
155 // Skip blank lines and comments.
156 if line.is_empty() || line.starts_with('#') {
157 return None;
158 }
159
160 // Format 3: line starts with a part range, e.g. "01-08 filename 1234 KB"
161 if let Some(caps) = re_format3_prefix().captures(line) {
162 let lo: u32 = caps[1].parse().ok()?;
163 let hi: u32 = caps[2].parse().ok()?;
164 let filename = caps[3].to_string();
165 let remainder = &caps[4];
166 let size_bytes = extract_size(remainder);
167 // Validate the range is sensible and the filename looks real.
168 if lo > hi || !looks_like_filename(&filename) {
169 return None;
170 }
171 return Some(TocEntry {
172 filename,
173 size_bytes,
174 parts: Some(lo..=hi),
175 });
176 }
177
178 // Format 2: "filename (1234567 bytes)"
179 if let Some(caps) = re_format2().captures(line) {
180 let filename = caps[1].to_string();
181 if !looks_like_filename(&filename) {
182 return None;
183 }
184 let digits: u64 = caps[2].parse().ok()?;
185 let size_bytes = parse_size(digits, &caps[3]);
186 return Some(TocEntry {
187 filename,
188 size_bytes,
189 parts: None,
190 });
191 }
192
193 // Format 1 (and fallback): "filename size_with_unit [parts N-M]"
194 // The filename is the first non-whitespace token; the rest is parsed for
195 // size and parts tokens in any order.
196 if let Some(caps) = re_format1().captures(line) {
197 let filename = caps[1].to_string();
198 let remainder = &caps[2];
199
200 if !looks_like_filename(&filename) {
201 return None;
202 }
203
204 let size_bytes = extract_size(remainder);
205 let parts = extract_parts(remainder);
206
207 // A line only qualifies if it yields at least a size or a parts range
208 // — otherwise almost any two-token line would be accepted.
209 if size_bytes.is_none() && parts.is_none() {
210 return None;
211 }
212
213 return Some(TocEntry {
214 filename,
215 size_bytes,
216 parts,
217 });
218 }
219
220 None
221}
222
/// Cheap plausibility check used to keep prose words out of the entry list.
///
/// A token counts as a filename when it contains a dot or a path separator
/// (`/` or `\`). Bare words such as "garbage" or "just" therefore fail the
/// check. The rule is deliberately loose: it is a practical signal, not a
/// validation of the name.
fn looks_like_filename(s: &str) -> bool {
    s.chars().any(|c| matches!(c, '.' | '/' | '\\'))
}
230
231// ---------------------------------------------------------------------------
232// Public API
233// ---------------------------------------------------------------------------
234
235/// Best-effort parse of a UUencode multi-part TOC body.
236///
237/// The input is treated as a sequence of lines. Each line is independently
238/// attempted against three recognised TOC formats (in priority order):
239///
240/// 1. **Part-range prefix**: `01-08 filename.tar.gz 1234 KB`
241/// 2. **Parenthesised size**: `filename.tar.gz (1234567 bytes)`
242/// 3. **Inline size/parts**: `filename.tar.gz 1234567 bytes parts 1-8`
243///
244/// Lines that match none of these formats (comments starting with `#`, blank
245/// lines, plain prose) are silently ignored. This makes the parser tolerant
246/// of the varied free-form headers that real TOC posts contain.
247///
248/// # Return value
249///
250/// Returns `None` if no lines at all parse as TOC entries.
251/// Returns `Some(ParsedToc)` with partial `entries` if at least one line
252/// parses; unparseable lines are omitted without error.
253///
254/// # Format notes
255///
256/// - Size units `bytes`/`b`, `KB`, and `MB` are recognised case-insensitively.
257/// KB and MB are converted to bytes using powers of 1 024.
258/// - Part ranges use an inclusive dash notation (`N-M`). Inverted ranges
259/// where `N > M` are rejected and produce `None` for that field.
260/// - Non-UTF-8 bytes in `body_bytes` are replaced via lossy conversion and
261/// preserved verbatim in [`ParsedToc::raw_text`].
262///
263/// # Never panics
264///
265/// This function never panics on any input, including empty slices and
266/// byte sequences that are not valid UTF-8.
267///
268/// # Examples
269///
270/// ```
271/// use uuencoding_multi::parse_toc;
272///
273/// let body = b"archive.tar.gz 1234567 bytes parts 1-8\n";
274/// let toc = parse_toc(body).expect("should parse");
275/// assert_eq!(toc.entries.len(), 1);
276/// assert_eq!(toc.entries[0].filename, "archive.tar.gz");
277/// assert_eq!(toc.entries[0].size_bytes, Some(1_234_567));
278/// assert_eq!(toc.entries[0].parts, Some(1..=8));
279/// ```
280///
281/// ```
282/// use uuencoding_multi::parse_toc;
283///
284/// // Body with no recognisable TOC lines → None.
285/// assert!(parse_toc(b"just plain text\n").is_none());
286/// ```
287///
288/// ```
289/// use uuencoding_multi::parse_toc;
290///
291/// // Empty input → None.
292/// assert!(parse_toc(b"").is_none());
293/// ```
294pub fn parse_toc(body_bytes: &[u8]) -> Option<ParsedToc> {
295 let raw_text = String::from_utf8_lossy(body_bytes).into_owned();
296
297 let entries: Vec<TocEntry> = raw_text.lines().filter_map(parse_line).collect();
298
299 if entries.is_empty() {
300 None
301 } else {
302 Some(ParsedToc { entries, raw_text })
303 }
304}
305
306// ---------------------------------------------------------------------------
307// Tests
308// ---------------------------------------------------------------------------
309
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `body` and unwrap the result. Used by every test that expects
    /// at least one entry, so failures share one message.
    fn parsed(body: &[u8]) -> ParsedToc {
        parse_toc(body).expect("should parse")
    }

    // ------------------------------------------------------------------
    // Full TOC with all three formats present
    // ------------------------------------------------------------------

    #[test]
    fn full_toc_three_formats() {
        // Inline size + parts, inline size only, parenthesised size, and a
        // range-prefixed line, so every documented format is exercised.
        let body = b"# TOC\nfilename.tar.gz 1234567 bytes parts 1-8\nother.zip 512 KB\nsome.bin (99 bytes)\n01-04 pre.rar 2 MB\n";
        let toc = parsed(body);
        assert_eq!(toc.entries.len(), 4);

        let e0 = &toc.entries[0];
        assert_eq!(e0.filename, "filename.tar.gz");
        assert_eq!(e0.size_bytes, Some(1234567));
        assert_eq!(e0.parts, Some(1..=8));

        let e1 = &toc.entries[1];
        assert_eq!(e1.filename, "other.zip");
        assert_eq!(e1.size_bytes, Some(512 * 1024));
        assert_eq!(e1.parts, None);

        let e2 = &toc.entries[2];
        assert_eq!(e2.filename, "some.bin");
        assert_eq!(e2.size_bytes, Some(99));
        assert_eq!(e2.parts, None);

        let e3 = &toc.entries[3];
        assert_eq!(e3.filename, "pre.rar");
        assert_eq!(e3.size_bytes, Some(2 * 1024 * 1024));
        assert_eq!(e3.parts, Some(1..=4));
    }

    // ------------------------------------------------------------------
    // Unparseable lines mixed in — no panic, still get 1 entry
    // ------------------------------------------------------------------

    #[test]
    fn garbage_lines_mixed_in() {
        let toc = parsed(b"garbage\nfile.txt 100 bytes\ngibberish here\n");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Not a TOC → None
    // ------------------------------------------------------------------

    #[test]
    fn not_a_toc_returns_none() {
        let body = b"just plain text body\nno entries at all\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // UTF-8 filename
    // ------------------------------------------------------------------

    #[test]
    fn utf8_filename_no_panic() {
        let toc = parsed("日本語.tar.gz 100 bytes\n".as_bytes());
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "日本語.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part range formats
    // ------------------------------------------------------------------

    #[test]
    fn parts_token_format1() {
        let toc = parsed(b"file.tar.gz 100 bytes parts 2-5\n");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
    }

    #[test]
    fn parts_prefix_format3() {
        let toc = parsed(b"02-05 file.tar.gz 100 bytes\n");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Size unit parsing
    // ------------------------------------------------------------------

    #[test]
    fn size_kb() {
        let toc = parsed(b"archive.zip 1 KB\n");
        assert_eq!(toc.entries[0].size_bytes, Some(1024));
    }

    #[test]
    fn size_mb() {
        let toc = parsed(b"archive.zip 2 MB\n");
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
    }

    #[test]
    fn size_bare_b_unit() {
        let toc = parsed(b"file.bin 512 B\n");
        assert_eq!(toc.entries[0].size_bytes, Some(512));
    }

    // ------------------------------------------------------------------
    // Non-UTF-8 input — must not panic, and the valid line still parses
    // ------------------------------------------------------------------

    #[test]
    fn non_utf8_no_panic() {
        // An invalid UTF-8 sequence on its own line, followed by a valid
        // TOC line. The lossy conversion turns the bad bytes into U+FFFD on
        // a line that matches no format, so exactly the valid line remains.
        let mut body = vec![0xFF, 0xFE, b'\n'];
        body.extend_from_slice(b"file.tar.gz 100 bytes\n");

        let toc = parsed(&body);
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert!(toc.raw_text.contains('\u{FFFD}'));
    }

    // ------------------------------------------------------------------
    // Comment-only body → None
    // ------------------------------------------------------------------

    #[test]
    fn comment_only_returns_none() {
        let body = b"# just a comment\n# another comment\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_input_returns_none() {
        assert!(parse_toc(b"").is_none());
    }

    // ------------------------------------------------------------------
    // raw_text keeps the whole body, including skipped lines
    // ------------------------------------------------------------------

    #[test]
    fn raw_text_preserved() {
        let toc = parsed(b"# TOC\nfile.tar.gz 100 bytes\n");
        assert!(toc.raw_text.contains("# TOC"));
        assert!(toc.raw_text.contains("file.tar.gz"));
    }

    // ------------------------------------------------------------------
    // Format 2 parenthesised size — various units
    // ------------------------------------------------------------------

    #[test]
    fn format2_kb() {
        let toc = parsed(b"file.tar.gz (1024 KB)\n");
        assert_eq!(toc.entries[0].size_bytes, Some(1024 * 1024));
    }

    // ------------------------------------------------------------------
    // Part range where lo > hi is ignored (invalid range)
    // ------------------------------------------------------------------

    #[test]
    fn inverted_parts_range_format1_ignored() {
        // "parts 8-1" is nonsensical — should still parse the entry but
        // produce no parts range.
        let toc = parsed(b"file.tar.gz 100 bytes parts 8-1\n");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert_eq!(toc.entries[0].parts, None);
    }

    // ------------------------------------------------------------------
    // Plural "parts" keyword
    // ------------------------------------------------------------------

    #[test]
    fn plural_parts_keyword() {
        let toc = parsed(b"file.tar.gz 100 bytes parts 3-6\n");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    // ------------------------------------------------------------------
    // Singular "part" keyword
    // ------------------------------------------------------------------

    #[test]
    fn singular_part_keyword() {
        let toc = parsed(b"file.tar.gz 100 bytes part 3-6\n");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }
}