use warc::WarcHeader;
use warc::WarcReader;
/// Builds a `std::io::Error` of kind `InvalidInput` whose message is the
/// given expression rendered through `to_string()`.
macro_rules! usage_err {
    ($msg:expr) => {{
        let text = $msg.to_string();
        ::std::io::Error::new(::std::io::ErrorKind::InvalidInput, text)
    }};
}
/// Streams the records of a gzip-compressed WARC file and prints the body of
/// every record whose `WarcHeader::TargetURI` does not end in one of the
/// filtered file names; matching records are counted as skipped.
///
/// Command-line arguments: the first is the path of the compressed WARC file,
/// all remaining arguments are the file names to filter out. At least one
/// filtered file name is required.
///
/// # Errors
///
/// Returns an `InvalidInput` error when the WARC path or the filtered file
/// names are missing, the error from opening the archive, and an
/// `InvalidData` error when a record's headers or body cannot be read.
fn main() -> std::io::Result<()> {
    let mut args = std::env::args_os().skip(1);
    let warc_name = args
        .next()
        .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
    let filtered_file_names: Vec<String> = args
        .map(|arg| arg.to_string_lossy().into_owned())
        .collect();
    if filtered_file_names.is_empty() {
        return Err(usage_err!("one or more filtered file names not supplied"));
    }

    let mut file = WarcReader::from_path_gzip(warc_name)?;
    let mut count = 0;
    let mut skipped = 0;
    let mut stream_iter = file.stream_records();
    while let Some(record) = stream_iter.next_item() {
        // Propagate read failures instead of panicking; `main` already returns a Result.
        let record = record.map_err(|e| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("read of headers failed: {:?}", e),
            )
        })?;
        count += 1;

        // Decide before consuming the record so the header borrow ends here
        // and no owned copy of the URI is needed.
        let skip = record
            .header(WarcHeader::TargetURI)
            .map_or(false, |uri| has_matching_filename(&uri, &filtered_file_names));

        if skip {
            println!("Matches filename, skipping record");
            skipped += 1;
        } else {
            let buffered = record.into_buffered().map_err(|e| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    format!("read of record failed: {:?}", e),
                )
            })?;
            println!(
                "Found record. Data:\n{}",
                String::from_utf8_lossy(buffered.body())
            );
        }
    }

    println!("Total records: {}\nSkipped records: {}", count, skipped);
    Ok(())
}
/// Reports whether the final path segment of the URI `u` equals one of the
/// names in `matches`.
///
/// Returns `false` when `u` cannot be parsed as a URL, when the URL has no
/// path segments (`path_segments()` yields `None`), or when the last segment
/// is not listed in `matches`. An unparseable URI is treated as a non-match
/// instead of a panic, so one malformed value does not abort the caller.
fn has_matching_filename(u: &str, matches: &[String]) -> bool {
    let url = match url::Url::parse(u) {
        Ok(url) => url,
        Err(_) => return false,
    };
    url.path_segments()
        // The segment iterator is double-ended, so take the last item directly
        // rather than walking the whole path.
        .and_then(|mut segments| segments.next_back())
        .map_or(false, |last| matches.iter().any(|name| name.as_str() == last))
}