read_filtered/
read_filtered.rs1use warc::WarcHeader;
2use warc::WarcReader;
3
/// Builds a `std::io::Error` of kind `InvalidInput` describing a command-line usage mistake.
///
/// The argument may be any expression with a `to_string()` method; its string
/// form becomes the error's message.
macro_rules! usage_err {
    ($msg:expr) => {{
        let text = $msg.to_string();
        std::io::Error::new(std::io::ErrorKind::InvalidInput, text)
    }};
}
9
10fn main() -> std::io::Result<()> {
11 let mut args = std::env::args_os().skip(1);
12
13 let warc_name = args
14 .next()
15 .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
16
17 let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect();
18 if filtered_file_names.is_empty() {
19 Err(usage_err!("one or more filtered file names not supplied"))?;
20 }
21
22 let mut file = WarcReader::from_path_gzip(warc_name)?;
23
24 let mut count = 0;
25 let mut skipped = 0;
26 let mut stream_iter = file.stream_records();
27 while let Some(record) = stream_iter.next_item() {
28 let record = record.expect("read of headers ok");
29 count += 1;
30 match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
31 Some(v) if has_matching_filename(&v, &filtered_file_names) => {
32 println!("Matches filename, skipping record");
33 skipped += 1;
34 }
35 _ => {
36 let buffered = record.into_buffered().expect("read of record ok");
37 println!(
38 "Found record. Data:\n{}",
39 String::from_utf8_lossy(buffered.body())
40 );
41 }
42 }
43 }
44
45 println!("Total records: {}\nSkipped records: {}", count, skipped);
46
47 Ok(())
48}
49
50fn has_matching_filename(u: &str, matches: &[String]) -> bool {
51 let url = url::Url::parse(u).expect("Target URI is not a URI!?");
52 let iter = match url.path_segments() {
53 None => return false,
54 Some(it) => it,
55 };
56 let last_segment = match iter.last() {
57 None => return false,
58 Some(s) => s.to_string(),
59 };
60 matches.contains(&last_segment)
61}