uuencoding-multi 0.2.0

Multi-part UUencoded Usenet/email post reassembly
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
//! Parser for multi-part UUencode table-of-contents (TOC) bodies.
//!
//! TOC posts (typically numbered part 0) list the files contained in a
//! multi-part series, along with optional size information and part ranges.
//! This module provides a best-effort parser that tolerates varied formatting
//! and non-UTF-8 byte sequences.

use std::ops::RangeInclusive;
use std::sync::OnceLock;

use regex::Regex;

// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------

/// One entry in a table-of-contents post.
///
/// A TOC entry describes a single file within a multi-part series. Not all
/// fields are present in every TOC format; `size_bytes` and `parts` are
/// `None` when the corresponding information was absent from the line.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"archive.tar.gz   1234567 bytes   parts 1-8\n";
/// let toc = parse_toc(body).unwrap();
/// let entry = &toc.entries[0];
/// assert_eq!(entry.filename, "archive.tar.gz");
/// assert_eq!(entry.size_bytes, Some(1_234_567));
/// assert_eq!(entry.parts, Some(1..=8));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TocEntry {
    /// Filename token exactly as it appears in the TOC line; no
    /// normalisation is applied.
    pub filename: String,
    /// Declared file size in bytes, if present. KB and MB values are
    /// converted to bytes (1 KB = 1 024 bytes, 1 MB = 1 048 576 bytes).
    /// `None` when the line carried no usable size.
    pub size_bytes: Option<u64>,
    /// Which parts carry this file, if a range was specified. The range is
    /// inclusive on both ends (`lo..=hi`). The parser never stores an
    /// inverted range, so `lo <= hi` holds for every value it produces.
    pub parts: Option<RangeInclusive<u32>>,
}

/// Result of parsing a TOC body.
///
/// `entries` may be a strict subset of the lines in the body: lines that do
/// not look like TOC entries (plain text, comments, blank lines) are silently
/// skipped. Inspect `raw_text` for the original body when debugging partial
/// results.
///
/// # Example
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"# TOC\nfile.bin (512 bytes)\n";
/// let toc = parse_toc(body).unwrap();
/// assert_eq!(toc.entries.len(), 1);
/// assert!(toc.raw_text.contains("# TOC"));
/// ```
#[derive(Debug, PartialEq)]
pub struct ParsedToc {
    /// Successfully parsed entries, in the order their lines appear in the
    /// body; may be a strict subset of all lines.
    pub entries: Vec<TocEntry>,
    /// Full body text after lossy UTF-8 decoding, kept for diagnostic use
    /// when parsing is partial. Valid UTF-8 is preserved as-is; invalid byte
    /// sequences are replaced with the Unicode replacement character
    /// (`U+FFFD`).
    pub raw_text: String,
}

// ---------------------------------------------------------------------------
// Compiled-once regex patterns
// ---------------------------------------------------------------------------

/// Lazily compiled matcher for format 1 lines such as
/// `filename.tar.gz   1234567 bytes   parts 1-8`.
///
/// The pattern only splits the line into its first whitespace-delimited token
/// (the filename candidate) and everything after it; locating the size and
/// "parts N-M" tokens inside the remainder is left to the caller.
fn re_format1() -> &'static Regex {
    // Capture 1 = filename candidate, capture 2 = rest of the line.
    const PATTERN: &str = r"(?i)^(\S+)\s+(.+)$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled matcher for format 3 lines such as
/// `01-08  filename.tar.gz  1234 KB`, where the line opens with a
/// zero-padded (or plain) part range of up to six digits per bound.
fn re_format3_prefix() -> &'static Regex {
    // Captures: 1 = range start, 2 = range end, 3 = filename candidate,
    // 4 = whatever follows the filename (possibly empty).
    const PATTERN: &str = r"^(\d{1,6})-(\d{1,6})\s+(\S+)\s*(.*)$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled matcher for format 2 lines such as
/// `filename.tar.gz (1234567 bytes)` — a filename followed by a
/// parenthesised integer size and unit, with nothing after the closing
/// parenthesis.
fn re_format2() -> &'static Regex {
    // Captures: 1 = filename candidate, 2 = size digits, 3 = unit.
    const PATTERN: &str = r"(?i)^(\S+)\s*\(\s*(\d+)\s*(bytes?|b|kb|mb)\s*\)\s*$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled matcher for a size token anywhere in a fragment: one or
/// more digits, optional whitespace, then a unit (`byte`/`bytes`, `b`, `kb`
/// or `mb`, case-insensitive).
fn re_size_token() -> &'static Regex {
    // Captures: 1 = digits, 2 = unit.
    const PATTERN: &str = r"(?i)\b(\d+)\s*(bytes?|b|kb|mb)\b";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled matcher for a `part N-M` / `parts N-M` token anywhere in
/// a fragment (keyword is case-insensitive, each bound is 1–6 digits).
fn re_parts_token() -> &'static Regex {
    // Captures: 1 = range start, 2 = range end.
    const PATTERN: &str = r"(?i)\bparts?\s+(\d{1,6})-(\d{1,6})\b";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Scale a numeric size by its unit and return the result in bytes.
///
/// The unit is compared case-insensitively and any trailing `s` is ignored,
/// so `byte`, `bytes` and `B` all mean "bytes". `kb` multiplies by 1 024 and
/// `mb` by 1 048 576.
///
/// Returns `None` when the unit is not one of the recognised spellings or
/// when the scaled value does not fit in a `u64`.
fn parse_size(digits: u64, unit: &str) -> Option<u64> {
    let normalized = unit.to_lowercase();
    let multiplier: u64 = match normalized.trim_end_matches('s') {
        "byte" | "b" => 1,
        "kb" => 1024,
        "mb" => 1024 * 1024,
        _ => return None,
    };
    // Multiplying by 1 can never overflow, so plain byte counts always succeed.
    digits.checked_mul(multiplier)
}

/// Try to extract a size value (in bytes) from an arbitrary text fragment.
///
/// Size tokens are examined left to right and the first usable one wins.
/// A token is skipped when its digit run is only the tail of a longer
/// number — i.e. it is immediately preceded by a digit plus `.` or `,`, as
/// in `1.5 MB` or `1,234,567 bytes`. The size pattern anchors on a word
/// boundary, which also exists after `.` and `,`, so without this guard such
/// input would be reported as `5 MB` / `567 bytes`. Reporting no size is
/// preferred over reporting a wrong one.
///
/// Returns `None` when no usable token exists, including when the digits or
/// the scaled value do not fit in a `u64`.
fn extract_size(text: &str) -> Option<u64> {
    re_size_token().captures_iter(text).find_map(|caps| {
        let digits = caps.get(1)?;

        // Look at the two characters directly before the digit run.
        let mut preceding = text[..digits.start()].chars().rev();
        let is_number_tail = matches!(preceding.next(), Some('.' | ','))
            && matches!(preceding.next(), Some(c) if c.is_ascii_digit());
        if is_number_tail {
            return None;
        }

        let value: u64 = digits.as_str().parse().ok()?;
        parse_size(value, &caps[2])
    })
}

/// Look for a `part(s) N-M` token in `text` and turn it into an inclusive
/// range.
///
/// Returns `None` when no such token is present or when the range is
/// inverted (`N > M`).
fn extract_parts(text: &str) -> Option<RangeInclusive<u32>> {
    let found = re_parts_token().captures(text)?;
    let first = found[1].parse::<u32>().ok()?;
    let last = found[2].parse::<u32>().ok()?;
    (first <= last).then(|| first..=last)
}

/// Attempt to parse a single TOC line into a [`TocEntry`].
///
/// The line is trimmed, then tried against the three supported layouts in
/// priority order: part-range prefix, parenthesised size, and finally the
/// free-form "filename followed by size and/or parts" layout. The first
/// layout whose pattern matches decides the outcome — a later layout is
/// never consulted once an earlier one has matched, even if that earlier
/// layout ends up rejecting the line.
///
/// Returns `None` for blank lines, `#` comments, and anything that doesn't
/// look like a TOC entry.
fn parse_line(line: &str) -> Option<TocEntry> {
    let text = line.trim();
    if text.is_empty() || text.starts_with('#') {
        return None;
    }

    // Layout with a leading range, e.g. "01-08  filename  1234 KB".
    if let Some(m) = re_format3_prefix().captures(text) {
        let first: u32 = m[1].parse().ok()?;
        let last: u32 = m[2].parse().ok()?;
        let name = &m[3];
        // An inverted range or an implausible filename disqualifies the
        // whole line.
        if first > last || !looks_like_filename(name) {
            return None;
        }
        return Some(TocEntry {
            filename: name.to_string(),
            size_bytes: extract_size(&m[4]),
            parts: Some(first..=last),
        });
    }

    // Layout with a parenthesised size, e.g. "filename (1234567 bytes)".
    if let Some(m) = re_format2().captures(text) {
        let name = &m[1];
        if !looks_like_filename(name) {
            return None;
        }
        let amount: u64 = m[2].parse().ok()?;
        return Some(TocEntry {
            filename: name.to_string(),
            size_bytes: parse_size(amount, &m[3]),
            parts: None,
        });
    }

    // Fallback layout: first token is the filename, the remainder is scanned
    // for a size token and a "parts N-M" token in any order.
    let m = re_format1().captures(text)?;
    let name = &m[1];
    if !looks_like_filename(name) {
        return None;
    }
    let tail = &m[2];

    match (extract_size(tail), extract_parts(tail)) {
        // Without a size or a range there is no evidence this is a TOC
        // entry — otherwise almost any two-token line would be accepted.
        (None, None) => None,
        (size_bytes, parts) => Some(TocEntry {
            filename: name.to_string(),
            size_bytes,
            parts,
        }),
    }
}

/// Loose plausibility check for a filename candidate.
///
/// Prose words such as "garbage" or "just" must not be mistaken for
/// filenames, so a token only passes if it contains a dot or a path
/// separator (`/` or `\`). This is deliberately permissive: it is a signal,
/// not a validation.
fn looks_like_filename(s: &str) -> bool {
    s.chars().any(|ch| matches!(ch, '.' | '/' | '\\'))
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Best-effort parse of a UUencode multi-part TOC body.
///
/// The input is treated as a sequence of lines. Each line is independently
/// attempted against three recognised TOC formats (in priority order):
///
/// 1. **Part-range prefix**: `01-08  filename.tar.gz  1234 KB`
/// 2. **Parenthesised size**: `filename.tar.gz (1234567 bytes)`
/// 3. **Inline size/parts**: `filename.tar.gz   1234567 bytes   parts 1-8`
///
/// Lines that match none of these formats (comments starting with `#`, blank
/// lines, plain prose) are silently ignored. This makes the parser tolerant
/// of the varied free-form headers that real TOC posts contain.
///
/// # Return value
///
/// Returns `None` if no lines at all parse as TOC entries.
/// Returns `Some(ParsedToc)` with partial `entries` if at least one line
/// parses; unparseable lines are omitted without error.
///
/// # Format notes
///
/// - Size units `bytes`/`b`, `KB`, and `MB` are recognised case-insensitively.
///   KB and MB are converted to bytes using powers of 1 024.
/// - Part ranges use an inclusive dash notation (`N-M`). Inverted ranges
///   where `N > M` are rejected and produce `None` for that field.
/// - Non-UTF-8 bytes in `body_bytes` are replaced via lossy conversion and
///   preserved verbatim in [`ParsedToc::raw_text`].
///
/// # Never panics
///
/// This function never panics on any input, including empty slices and
/// byte sequences that are not valid UTF-8.
///
/// # Examples
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// let body = b"archive.tar.gz   1234567 bytes   parts 1-8\n";
/// let toc = parse_toc(body).expect("should parse");
/// assert_eq!(toc.entries.len(), 1);
/// assert_eq!(toc.entries[0].filename, "archive.tar.gz");
/// assert_eq!(toc.entries[0].size_bytes, Some(1_234_567));
/// assert_eq!(toc.entries[0].parts, Some(1..=8));
/// ```
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// // Body with no recognisable TOC lines → None.
/// assert!(parse_toc(b"just plain text\n").is_none());
/// ```
///
/// ```
/// use uuencoding_multi::parse_toc;
///
/// // Empty input → None.
/// assert!(parse_toc(b"").is_none());
/// ```
pub fn parse_toc(body_bytes: &[u8]) -> Option<ParsedToc> {
    let raw_text = String::from_utf8_lossy(body_bytes).into_owned();

    let entries: Vec<TocEntry> = raw_text.lines().filter_map(parse_line).collect();

    if entries.is_empty() {
        None
    } else {
        Some(ParsedToc { entries, raw_text })
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Full TOC with several formats present
    // ------------------------------------------------------------------

    #[test]
    fn full_toc_three_formats() {
        let body = b"# TOC\nfilename.tar.gz   1234567 bytes   parts 1-8\nother.zip   512 KB\nsome.bin (99 bytes)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 3);

        let e0 = &toc.entries[0];
        assert_eq!(e0.filename, "filename.tar.gz");
        assert_eq!(e0.size_bytes, Some(1234567));
        assert_eq!(e0.parts, Some(1..=8));

        let e1 = &toc.entries[1];
        assert_eq!(e1.filename, "other.zip");
        assert_eq!(e1.size_bytes, Some(512 * 1024));
        assert_eq!(e1.parts, None);

        let e2 = &toc.entries[2];
        assert_eq!(e2.filename, "some.bin");
        assert_eq!(e2.size_bytes, Some(99));
        assert_eq!(e2.parts, None);
    }

    // ------------------------------------------------------------------
    // Unparseable lines mixed in — no panic, still get 1 entry
    // ------------------------------------------------------------------

    #[test]
    fn garbage_lines_mixed_in() {
        let body = b"garbage\nfile.txt   100 bytes\ngibberish here\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Not a TOC → None
    // ------------------------------------------------------------------

    #[test]
    fn not_a_toc_returns_none() {
        let body = b"just plain text body\nno entries at all\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // UTF-8 filename
    // ------------------------------------------------------------------

    #[test]
    fn utf8_filename_no_panic() {
        let body = "日本語.tar.gz   100 bytes\n".as_bytes();
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "日本語.tar.gz");
    }

    // ------------------------------------------------------------------
    // Part range formats
    // ------------------------------------------------------------------

    #[test]
    fn parts_token_format1() {
        let body = b"file.tar.gz   100 bytes   parts 2-5\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
    }

    #[test]
    fn parts_before_size_format1() {
        // Size and parts tokens may appear in either order after the filename.
        let body = b"file.tar.gz   parts 1-8   100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert_eq!(toc.entries[0].parts, Some(1..=8));
    }

    #[test]
    fn parts_prefix_format3() {
        let body = b"02-05  file.tar.gz  100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Size unit parsing
    // ------------------------------------------------------------------

    #[test]
    fn size_kb() {
        let body = b"archive.zip   1 KB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024));
    }

    #[test]
    fn size_mb() {
        let body = b"archive.zip   2 MB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
    }

    #[test]
    fn size_bare_b_unit() {
        let body = b"file.bin   512 B\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(512));
    }

    #[test]
    fn size_units_are_case_insensitive() {
        let body = b"archive.zip   2 mb\nnotes.txt   100 Bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 2);
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
        assert_eq!(toc.entries[1].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Non-UTF-8 input — must not panic, and valid lines still parse
    // ------------------------------------------------------------------

    #[test]
    fn non_utf8_no_panic() {
        // Embed an invalid UTF-8 sequence followed by a valid TOC line.
        let mut body = vec![0xFF, 0xFE, b'\n'];
        body.extend_from_slice(b"file.tar.gz   100 bytes\n");
        // Lossy decoding turns the invalid bytes into U+FFFD on a line of
        // their own, so the valid line after it is still recognised.
        let toc = parse_toc(&body).expect("valid line should still parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
        assert!(toc.raw_text.contains('\u{FFFD}'));
    }

    // ------------------------------------------------------------------
    // CRLF line endings
    // ------------------------------------------------------------------

    #[test]
    fn crlf_line_endings() {
        let body = b"# TOC\r\nfile.txt   100 bytes\r\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    // ------------------------------------------------------------------
    // Comment-only body → None
    // ------------------------------------------------------------------

    #[test]
    fn comment_only_returns_none() {
        let body = b"# just a comment\n# another comment\n";
        assert!(parse_toc(body).is_none());
    }

    // ------------------------------------------------------------------
    // Empty input → None
    // ------------------------------------------------------------------

    #[test]
    fn empty_input_returns_none() {
        assert!(parse_toc(b"").is_none());
    }

    // ------------------------------------------------------------------
    // raw_text is preserved verbatim
    // ------------------------------------------------------------------

    #[test]
    fn raw_text_preserved() {
        let body = b"# TOC\nfile.tar.gz   100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert!(toc.raw_text.contains("# TOC"));
        assert!(toc.raw_text.contains("file.tar.gz"));
    }

    // ------------------------------------------------------------------
    // Format 2 parenthesised size — various units
    // ------------------------------------------------------------------

    #[test]
    fn format2_kb() {
        let body = b"file.tar.gz (1024 KB)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024 * 1024));
    }

    // ------------------------------------------------------------------
    // Part range where lo > hi is ignored (invalid range)
    // ------------------------------------------------------------------

    #[test]
    fn inverted_parts_range_format1_ignored() {
        // "parts 8-1" is nonsensical — should still parse the entry but
        // produce no parts range.
        let body = b"file.tar.gz   100 bytes   parts 8-1\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, None);
    }

    // ------------------------------------------------------------------
    // Plural "parts" keyword
    // ------------------------------------------------------------------

    #[test]
    fn plural_parts_keyword() {
        let body = b"file.tar.gz   100 bytes   parts 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    // ------------------------------------------------------------------
    // Singular "part" keyword
    // ------------------------------------------------------------------

    #[test]
    fn singular_part_keyword() {
        let body = b"file.tar.gz   100 bytes   part 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }
}