warc 0.4.0

A Rust library for reading and writing WARC files.
Documentation
use warc::WarcHeader;
use warc::WarcReader;

/// Builds a `std::io::Error` of kind `InvalidInput` whose message is the
/// `to_string()` rendering of the given expression.
///
/// Used to report command-line usage problems.
macro_rules! usage_err {
    ($msg:expr) => {{
        let text = $msg.to_string();
        std::io::Error::new(std::io::ErrorKind::InvalidInput, text)
    }};
}

/// Streams a gzip-compressed WARC file and prints the body of every record
/// whose target URI does not end in one of the given file names.
///
/// Command-line arguments (after the program name):
/// 1. the path of the compressed WARC file;
/// 2. one or more file names; records whose target URI's last path segment
///    equals one of them are skipped instead of printed.
///
/// A summary with the total and skipped record counts is printed at the end.
///
/// # Errors
///
/// Returns an `InvalidInput` error when an argument is missing, the error
/// from opening the WARC file, or an `InvalidData` error when a record's
/// headers or body cannot be read.
fn main() -> std::io::Result<()> {
    /// Wraps a record-reading failure in an `io::Error` so it can be returned
    /// from `main` instead of aborting the process with a panic.
    ///
    /// Formatting via `Debug` keeps this independent of which concrete error
    /// type the reader returns.
    fn read_err<E: std::fmt::Debug>(context: &str, err: E) -> std::io::Error {
        std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("{}: {:?}", context, err),
        )
    }

    let mut args = std::env::args_os().skip(1);

    let warc_name = args
        .next()
        .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;

    // Every remaining argument is a file name to filter out.
    let filtered_file_names: Vec<String> = args
        .map(|s| s.to_string_lossy().into_owned())
        .collect();
    if filtered_file_names.is_empty() {
        return Err(usage_err!("one or more filtered file names not supplied"));
    }

    let mut file = WarcReader::from_path_gzip(warc_name)?;

    let mut count = 0;
    let mut skipped = 0;
    let mut stream_iter = file.stream_records();
    while let Some(record) = stream_iter.next_item() {
        // The WARC file is external input: report a bad record as an error
        // rather than panicking.
        let record = record.map_err(|e| read_err("read of headers failed", e))?;
        count += 1;
        // The URI is copied into an owned `String` so the record itself can
        // still be consumed by `into_buffered` in the fall-through arm.
        match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
            Some(v) if has_matching_filename(&v, &filtered_file_names) => {
                println!("Matches filename, skipping record");
                skipped += 1;
            }
            _ => {
                // The body is only read for records that are actually printed.
                let buffered = record
                    .into_buffered()
                    .map_err(|e| read_err("read of record failed", e))?;
                println!(
                    "Found record. Data:\n{}",
                    String::from_utf8_lossy(buffered.body())
                );
            }
        }
    }

    println!("Total records: {}\nSkipped records: {}", count, skipped);

    Ok(())
}

/// Reports whether the last path segment of the URI `u` equals one of the
/// names in `matches`.
///
/// Returns `false` when `u` cannot be parsed as a URL or when the parsed URL
/// yields no path segments. The URI comes from the input file, so a malformed
/// value is treated as "no match" rather than aborting the whole run.
fn has_matching_filename(u: &str, matches: &[String]) -> bool {
    let url = match url::Url::parse(u) {
        Ok(url) => url,
        Err(_) => return false,
    };

    let last_segment = url.path_segments().and_then(|segments| segments.last());

    match last_segment {
        // Compare as `&str` to avoid allocating a `String` per call.
        Some(name) => matches.iter().any(|candidate| candidate.as_str() == name),
        None => false,
    }
}