use std::ops::RangeInclusive;
use std::sync::OnceLock;
use regex::Regex;
/// One file described by a single table-of-contents line.
///
/// Every field except `filename` is optional because the recognised line
/// formats do not all carry the same information.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TocEntry {
    /// The filename token, copied verbatim from the line.
    pub filename: String,
    /// Size in bytes, if a size token was found and fit into a `u64`.
    pub size_bytes: Option<u64>,
    /// Inclusive part-number range (`lo..=hi`), if the line carried one.
    pub parts: Option<RangeInclusive<u32>>,
}
/// The result of successfully parsing a table of contents.
///
/// `PartialEq` is derived so whole parse results can be compared directly
/// (for example in `assert_eq!`).
#[derive(Debug, PartialEq)]
pub struct ParsedToc {
    /// Entries in the order their lines appeared in the input.
    pub entries: Vec<TocEntry>,
    /// The full input decoded as text (lossily, if it was not valid UTF-8).
    pub raw_text: String,
}
/// Lazily compiled pattern for the generic `<token> <rest>` line shape.
///
/// Capture 1 is the leading run of non-whitespace characters, capture 2 is
/// everything after the separating whitespace. The pattern contains no
/// literal letters, so the `(?i)` flag has nothing to act on.
fn re_format1() -> &'static Regex {
    const PATTERN: &str = r"(?i)^(\S+)\s+(.+)$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled pattern for lines that begin with a `<lo>-<hi>` prefix.
///
/// Captures: 1 = `lo` (1 to 6 digits), 2 = `hi` (1 to 6 digits), 3 = the
/// non-whitespace token that follows, 4 = the rest of the line (may be empty).
fn re_format3_prefix() -> &'static Regex {
    const PATTERN: &str = r"^(\d{1,6})-(\d{1,6})\s+(\S+)\s*(.*)$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled pattern for the `<token> (<digits> <unit>)` line shape.
///
/// Captures: 1 = the token before the parenthesis, 2 = the digit run,
/// 3 = the unit (`byte`, `bytes`, `b`, `kb` or `mb`, matched
/// case-insensitively). Nothing but whitespace may follow the closing
/// parenthesis.
fn re_format2() -> &'static Regex {
    const PATTERN: &str = r"(?i)^(\S+)\s*\(\s*(\d+)\s*(bytes?|b|kb|mb)\s*\)\s*$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled pattern for a `<digits> <unit>` size token anywhere in a
/// string.
///
/// Captures: 1 = the digit run, 2 = the unit (`byte`, `bytes`, `b`, `kb` or
/// `mb`, matched case-insensitively). The digit group is a plain run of
/// digits, so for a decimal such as `1.5 MB` the match is only `5 MB`.
fn re_size_token() -> &'static Regex {
    const PATTERN: &str = r"(?i)\b(\d+)\s*(bytes?|b|kb|mb)\b";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled pattern for a `part <lo>-<hi>` / `parts <lo>-<hi>` token
/// anywhere in a string.
///
/// The keyword is matched case-insensitively. Captures: 1 = `lo`, 2 = `hi`,
/// each 1 to 6 digits.
fn re_parts_token() -> &'static Regex {
    const PATTERN: &str = r"(?i)\bparts?\s+(\d{1,6})-(\d{1,6})\b";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Converts a count expressed in `unit` into a number of bytes.
///
/// The unit is lower-cased and any trailing `s` characters are removed before
/// it is looked up, so `"Bytes"`, `"byte"` and `"B"` all mean bytes. `kb` and
/// `mb` use binary multiples (1024 and 1024 * 1024).
///
/// Returns `None` for an unrecognised unit or when the multiplication would
/// overflow a `u64`.
fn parse_size(digits: u64, unit: &str) -> Option<u64> {
    let normalised = unit.to_lowercase();
    let bytes_per_unit: u64 = match normalised.trim_end_matches('s') {
        "byte" | "b" => 1,
        "kb" => 1024,
        "mb" => 1024 * 1024,
        _ => return None,
    };
    digits.checked_mul(bytes_per_unit)
}
/// Finds the first size token in `text` and converts it to a byte count.
///
/// A size token is a number followed by a unit, e.g. `100 bytes`, `512 KB`
/// or `1.5 MB`. Decimal values are converted exactly and rounded down to a
/// whole number of bytes.
///
/// Returns `None` when no token is present, when the number cannot be parsed,
/// or when the result does not fit into a `u64`.
///
/// NOTE(review): comma-grouped numbers such as `1,024 KB` are not recognised
/// as a single number; only the last digit group is seen by the token pattern.
fn extract_size(text: &str) -> Option<u64> {
    let caps = re_size_token().captures(text)?;
    let token_start = caps.get(0)?.start();
    let digits = &caps[1];
    let unit = &caps[2];

    // The token pattern captures a plain digit run, so for "1.5 MB" it matches
    // only "5 MB". Look behind the match for "<digits>." to recover the integer
    // part instead of silently reporting the fraction digits as the size.
    let whole_digits = text[..token_start]
        .strip_suffix('.')
        .and_then(|before_dot| {
            let n = before_dot
                .bytes()
                .rev()
                .take_while(u8::is_ascii_digit)
                .count();
            if n == 0 {
                None
            } else {
                // The last `n` bytes are ASCII, so this is a valid char boundary.
                Some(&before_dot[before_dot.len() - n..])
            }
        });

    match whole_digits {
        Some(whole) => scale_decimal(whole, digits, parse_size(1, unit)?),
        None => {
            let value: u64 = digits.parse().ok()?;
            parse_size(value, unit)
        }
    }
}

/// Computes `floor(<whole>.<frac> * unit_bytes)` from the two digit strings.
///
/// Returns `None` if either string is not a plain ASCII number or if any
/// intermediate value or the final result overflows.
fn scale_decimal(whole: &str, frac: &str, unit_bytes: u64) -> Option<u64> {
    let whole_value: u128 = whole.parse().ok()?;
    let frac_value: u128 = frac.parse().ok()?;
    // 10^(number of fraction digits); `frac` is ASCII here because it parsed.
    let denominator = frac
        .bytes()
        .try_fold(1u128, |acc, _| acc.checked_mul(10))?;
    let unit_bytes = u128::from(unit_bytes);

    let fraction_bytes = frac_value.checked_mul(unit_bytes)? / denominator;
    let total = whole_value
        .checked_mul(unit_bytes)?
        .checked_add(fraction_bytes)?;

    if total <= u128::from(u64::MAX) {
        // Guarded by the range check above, so the cast cannot truncate.
        Some(total as u64)
    } else {
        None
    }
}
/// Finds the first `part(s) <lo>-<hi>` token in `text` and returns it as an
/// inclusive range.
///
/// Returns `None` when no token is present, when either bound fails to parse
/// as a `u32`, or when the range is inverted (`lo > hi`).
fn extract_parts(text: &str) -> Option<RangeInclusive<u32>> {
    re_parts_token().captures(text).and_then(|m| {
        let first = m[1].parse::<u32>().ok()?;
        let last = m[2].parse::<u32>().ok()?;
        (first <= last).then(|| first..=last)
    })
}
/// Tries to interpret one line of text as a table-of-contents entry.
///
/// Blank lines and lines starting with `#` are ignored. Otherwise the line is
/// tested against three shapes, in this order, and the first shape whose
/// pattern matches decides the outcome (there is no fall-through to a later
/// shape once a pattern has matched):
///
/// 1. `<lo>-<hi> <filename> [rest]`: the part range comes from the prefix and
///    the size is searched for in the rest.
/// 2. `<filename> (<n> <unit>)`: size only.
/// 3. `<filename> <rest>`: size and/or part range are searched for in the
///    rest; at least one of them must be present.
///
/// Returns `None` when the line is not an entry.
fn parse_line(line: &str) -> Option<TocEntry> {
    let trimmed = line.trim();
    if trimmed.is_empty() || trimmed.starts_with('#') {
        return None;
    }

    // Shape 1: leading "<lo>-<hi>" part range.
    if let Some(m) = re_format3_prefix().captures(trimmed) {
        let first: u32 = m[1].parse().ok()?;
        let last: u32 = m[2].parse().ok()?;
        let name = &m[3];
        if first > last || !looks_like_filename(name) {
            return None;
        }
        return Some(TocEntry {
            filename: name.to_string(),
            size_bytes: extract_size(&m[4]),
            parts: Some(first..=last),
        });
    }

    // Shape 2: "<filename> (<n> <unit>)".
    if let Some(m) = re_format2().captures(trimmed) {
        let name = &m[1];
        if !looks_like_filename(name) {
            return None;
        }
        let count: u64 = m[2].parse().ok()?;
        return Some(TocEntry {
            filename: name.to_string(),
            // An overflowing size keeps the entry but leaves the size unknown.
            size_bytes: parse_size(count, &m[3]),
            parts: None,
        });
    }

    // Shape 3: "<filename> <rest>" with a size and/or parts token in the rest.
    let m = re_format1().captures(trimmed)?;
    let name = &m[1];
    if !looks_like_filename(name) {
        return None;
    }
    let rest = &m[2];
    match (extract_size(rest), extract_parts(rest)) {
        // Without either token there is no evidence this is an entry.
        (None, None) => None,
        (size_bytes, parts) => Some(TocEntry {
            filename: name.to_string(),
            size_bytes,
            parts,
        }),
    }
}
/// Heuristic check that a token could be a filename: it must contain a dot
/// or a path separator (`/` or `\`).
fn looks_like_filename(s: &str) -> bool {
    s.chars().any(|c| matches!(c, '.' | '/' | '\\'))
}
/// Parses `body_bytes` as a table of contents.
///
/// The bytes are decoded as UTF-8, with invalid sequences replaced by
/// U+FFFD, and every line is offered to the line parser. Lines that are not
/// entries are skipped.
///
/// Returns `None` when no line yields an entry. Otherwise returns the entries
/// in input order together with the complete decoded text.
pub fn parse_toc(body_bytes: &[u8]) -> Option<ParsedToc> {
    let raw_text = String::from_utf8_lossy(body_bytes).into_owned();

    // A leading byte-order mark (U+FEFF) is not whitespace, so trimming the
    // first line would leave it attached to the first token: a first-line
    // filename would carry the BOM and a first-line `#` comment would not be
    // recognised. Skip it for parsing only; `raw_text` keeps the input as-is.
    let parse_view = raw_text
        .strip_prefix('\u{FEFF}')
        .unwrap_or(raw_text.as_str());

    let entries: Vec<TocEntry> = parse_view.lines().filter_map(parse_line).collect();
    if entries.is_empty() {
        return None;
    }
    Some(ParsedToc { entries, raw_text })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One line of each recognised shape, plus a comment line that is skipped.
    #[test]
    fn full_toc_three_formats() {
        let body = b"# TOC\nfilename.tar.gz 1234567 bytes parts 1-8\nother.zip 512 KB\nsome.bin (99 bytes)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 3);
        let e0 = &toc.entries[0];
        assert_eq!(e0.filename, "filename.tar.gz");
        assert_eq!(e0.size_bytes, Some(1234567));
        assert_eq!(e0.parts, Some(1..=8));
        let e1 = &toc.entries[1];
        assert_eq!(e1.filename, "other.zip");
        assert_eq!(e1.size_bytes, Some(512 * 1024));
        assert_eq!(e1.parts, None);
        let e2 = &toc.entries[2];
        assert_eq!(e2.filename, "some.bin");
        assert_eq!(e2.size_bytes, Some(99));
        assert_eq!(e2.parts, None);
    }

    /// Lines that are not entries are skipped without affecting valid ones.
    #[test]
    fn garbage_lines_mixed_in() {
        let body = b"garbage\nfile.txt 100 bytes\ngibberish here\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.txt");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    #[test]
    fn not_a_toc_returns_none() {
        let body = b"just plain text body\nno entries at all\n";
        assert!(parse_toc(body).is_none());
    }

    #[test]
    fn utf8_filename_no_panic() {
        let body = "日本語.tar.gz 100 bytes\n".as_bytes();
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "日本語.tar.gz");
    }

    #[test]
    fn parts_token_format1() {
        let body = b"file.tar.gz 100 bytes parts 2-5\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
    }

    /// Leading zeros in the range prefix are accepted.
    #[test]
    fn parts_prefix_format3() {
        let body = b"02-05 file.tar.gz 100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].parts, Some(2..=5));
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    #[test]
    fn size_kb() {
        let body = b"archive.zip 1 KB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024));
    }

    #[test]
    fn size_mb() {
        let body = b"archive.zip 2 MB\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(2 * 1024 * 1024));
    }

    #[test]
    fn size_bare_b_unit() {
        let body = b"file.bin 512 B\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(512));
    }

    /// Invalid UTF-8 must not panic, and must not stop later valid lines from
    /// being parsed.
    #[test]
    fn non_utf8_no_panic() {
        let mut body = vec![0xFF, 0xFE, b'\n'];
        body.extend_from_slice(b"file.tar.gz 100 bytes\n");
        let toc = parse_toc(&body).expect("valid line after invalid bytes should parse");
        assert_eq!(toc.entries.len(), 1);
        assert_eq!(toc.entries[0].filename, "file.tar.gz");
        assert_eq!(toc.entries[0].size_bytes, Some(100));
    }

    #[test]
    fn comment_only_returns_none() {
        let body = b"# just a comment\n# another comment\n";
        assert!(parse_toc(body).is_none());
    }

    #[test]
    fn empty_input_returns_none() {
        assert!(parse_toc(b"").is_none());
    }

    /// `raw_text` keeps the whole input, including skipped lines.
    #[test]
    fn raw_text_preserved() {
        let body = b"# TOC\nfile.tar.gz 100 bytes\n";
        let toc = parse_toc(body).expect("should parse");
        assert!(toc.raw_text.contains("# TOC"));
        assert!(toc.raw_text.contains("file.tar.gz"));
    }

    #[test]
    fn format2_kb() {
        let body = b"file.tar.gz (1024 KB)\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].size_bytes, Some(1024 * 1024));
    }

    /// An inverted `parts` token is dropped, but the entry survives because
    /// the size token is still valid.
    #[test]
    fn inverted_parts_range_format1_ignored() {
        let body = b"file.tar.gz 100 bytes parts 8-1\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, None);
    }

    /// An inverted range in the line prefix rejects the whole line.
    #[test]
    fn inverted_parts_range_format3_rejected() {
        let body = b"08-01 file.tar.gz 100 bytes\n";
        assert!(parse_toc(body).is_none());
    }

    /// A leading token without a dot or path separator is not a filename,
    /// even when a valid size token follows.
    #[test]
    fn token_without_filename_shape_rejected() {
        let body = b"README 100 bytes\n";
        assert!(parse_toc(body).is_none());
    }

    #[test]
    fn plural_parts_keyword() {
        let body = b"file.tar.gz 100 bytes parts 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }

    #[test]
    fn singular_part_keyword() {
        let body = b"file.tar.gz 100 bytes part 3-6\n";
        let toc = parse_toc(body).expect("should parse");
        assert_eq!(toc.entries[0].parts, Some(3..=6));
    }
}