read_filtered/
read_filtered.rs

1use warc::WarcHeader;
2use warc::WarcReader;
3
/// Builds a `std::io::Error` of kind `InvalidInput` for a command-line usage
/// problem; the error message is `$msg.to_string()`.
macro_rules! usage_err {
    ($msg:expr) => {{
        let text = $msg.to_string();
        std::io::Error::new(std::io::ErrorKind::InvalidInput, text)
    }};
}
9
10fn main() -> std::io::Result<()> {
11    let mut args = std::env::args_os().skip(1);
12
13    let warc_name = args
14        .next()
15        .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
16
17    let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect();
18    if filtered_file_names.is_empty() {
19        Err(usage_err!("one or more filtered file names not supplied"))?;
20    }
21
22    let mut file = WarcReader::from_path_gzip(warc_name)?;
23
24    let mut count = 0;
25    let mut skipped = 0;
26    let mut stream_iter = file.stream_records();
27    while let Some(record) = stream_iter.next_item() {
28        let record = record.expect("read of headers ok");
29        count += 1;
30        match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
31            Some(v) if has_matching_filename(&v, &filtered_file_names) => {
32                println!("Matches filename, skipping record");
33                skipped += 1;
34            }
35            _ => {
36                let buffered = record.into_buffered().expect("read of record ok");
37                println!(
38                    "Found record. Data:\n{}",
39                    String::from_utf8_lossy(buffered.body())
40                );
41            }
42        }
43    }
44
45    println!("Total records: {}\nSkipped records: {}", count, skipped);
46
47    Ok(())
48}
49
50fn has_matching_filename(u: &str, matches: &[String]) -> bool {
51    let url = url::Url::parse(u).expect("Target URI is not a URI!?");
52    let iter = match url.path_segments() {
53        None => return false,
54        Some(it) => it,
55    };
56    let last_segment = match iter.last() {
57        None => return false,
58        Some(s) => s.to_string(),
59    };
60    matches.contains(&last_segment)
61}