WarcReader

Struct WarcReader 

Source
pub struct WarcReader<R> { /* private fields */ }
Expand description

A reader which iteratively parses WARC records from a stream.

Implementations§

Source§

impl<R: BufRead> WarcReader<R>

Source

pub fn new(r: R) -> Self

Create a new reader.

Source

pub fn iter_raw_records(self) -> RawRecordIter<R>

Create an iterator over all of the raw records read.

This only performs well-formedness checks on the headers. See RawRecordHeader for more information.

Examples found in repository?
examples/read_raw.rs (line 8)
4fn main() -> Result<(), std::io::Error> {
5    let file = WarcReader::from_path("warc_example.warc")?;
6
7    let mut count = 0;
8    for record in file.iter_raw_records() {
9        count += 1;
10        match record {
11            Err(err) => println!("ERROR: {}\r\n", err),
12            Ok((headers, _)) => {
13                println!(
14                    "{}: {}",
15                    WarcHeader::RecordID,
16                    String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::RecordID).unwrap())
17                );
18                println!(
19                    "{}: {}",
20                    WarcHeader::Date,
21                    String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::Date).unwrap())
22                );
23                println!();
24            }
25        }
26    }
27
28    println!("Total records: {}", count);
29
30    Ok(())
31}
Source

pub fn iter_records(self) -> RecordIter<R>

Create an iterator over all of the records read.

This will fully build each record and check it for semantic correctness. See the Record type for more information.

Examples found in repository?
examples/read_file.rs (line 8)
4fn main() -> Result<(), std::io::Error> {
5    let file = WarcReader::from_path("warc_example.warc")?;
6
7    let mut count = 0;
8    for record in file.iter_records() {
9        count += 1;
10        match record {
11            Err(err) => println!("ERROR: {}\r\n", err),
12            Ok(record) => {
13                println!("{}: {}", WarcHeader::RecordID, record.warc_id(),);
14                println!("{}: {}", WarcHeader::Date, record.date(),);
15                println!();
16            }
17        }
18    }
19
20    println!("Total records: {}", count);
21
22    Ok(())
23}
More examples
Hide additional examples
examples/read_gzip.rs (line 8)
4fn main() -> Result<(), std::io::Error> {
5    let file = WarcReader::from_path_gzip("warc_example.warc.gz")?;
6
7    let mut count = 0;
8    for record in file.iter_records() {
9        count += 1;
10        match record {
11            Err(err) => println!("ERROR: {}\r\n", err),
12            Ok(record) => {
13                println!("{}: {}", WarcHeader::RecordID, record.warc_id());
14                println!("{}: {}", WarcHeader::Date, record.date());
15                println!();
16            }
17        }
18    }
19
20    println!("Total records: {}", count);
21
22    Ok(())
23}
Source

pub fn stream_records(&mut self) -> StreamingIter<'_, R>

Create a streaming iterator over all of the records read.

This builds each record's header and lets the caller decide whether to read the body.

Examples found in repository?
examples/read_filtered.rs (line 26)
10fn main() -> std::io::Result<()> {
11    let mut args = std::env::args_os().skip(1);
12
13    let warc_name = args
14        .next()
15        .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
16
17    let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect();
18    if filtered_file_names.is_empty() {
19        Err(usage_err!("one or more filtered file names not supplied"))?;
20    }
21
22    let mut file = WarcReader::from_path_gzip(warc_name)?;
23
24    let mut count = 0;
25    let mut skipped = 0;
26    let mut stream_iter = file.stream_records();
27    while let Some(record) = stream_iter.next_item() {
28        let record = record.expect("read of headers ok");
29        count += 1;
30        match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
31            Some(v) if has_matching_filename(&v, &filtered_file_names) => {
32                println!("Matches filename, skipping record");
33                skipped += 1;
34            }
35            _ => {
36                let buffered = record.into_buffered().expect("read of record ok");
37                println!(
38                    "Found record. Data:\n{}",
39                    String::from_utf8_lossy(buffered.body())
40                );
41            }
42        }
43    }
44
45    println!("Total records: {}\nSkipped records: {}", count, skipped);
46
47    Ok(())
48}
Source§

impl WarcReader<BufReader<File>>

Source

pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self>

Create a new reader which reads from a file.

Examples found in repository?
examples/read_file.rs (line 5)
4fn main() -> Result<(), std::io::Error> {
5    let file = WarcReader::from_path("warc_example.warc")?;
6
7    let mut count = 0;
8    for record in file.iter_records() {
9        count += 1;
10        match record {
11            Err(err) => println!("ERROR: {}\r\n", err),
12            Ok(record) => {
13                println!("{}: {}", WarcHeader::RecordID, record.warc_id(),);
14                println!("{}: {}", WarcHeader::Date, record.date(),);
15                println!();
16            }
17        }
18    }
19
20    println!("Total records: {}", count);
21
22    Ok(())
23}
More examples
Hide additional examples
examples/read_raw.rs (line 5)
4fn main() -> Result<(), std::io::Error> {
5    let file = WarcReader::from_path("warc_example.warc")?;
6
7    let mut count = 0;
8    for record in file.iter_raw_records() {
9        count += 1;
10        match record {
11            Err(err) => println!("ERROR: {}\r\n", err),
12            Ok((headers, _)) => {
13                println!(
14                    "{}: {}",
15                    WarcHeader::RecordID,
16                    String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::RecordID).unwrap())
17                );
18                println!(
19                    "{}: {}",
20                    WarcHeader::Date,
21                    String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::Date).unwrap())
22                );
23                println!();
24            }
25        }
26    }
27
28    println!("Total records: {}", count);
29
30    Ok(())
31}
Source§

impl WarcReader<BufReader<MultiDecoder<BufReader<File>>>>

Source

pub fn from_path_gzip<P: AsRef<Path>>(path: P) -> Result<Self>

Create a new reader which reads from a compressed file.

Only GZIP compression is currently supported.

Examples found in repository?
examples/read_gzip.rs (line 5)
4fn main() -> Result<(), std::io::Error> {
5    let file = WarcReader::from_path_gzip("warc_example.warc.gz")?;
6
7    let mut count = 0;
8    for record in file.iter_records() {
9        count += 1;
10        match record {
11            Err(err) => println!("ERROR: {}\r\n", err),
12            Ok(record) => {
13                println!("{}: {}", WarcHeader::RecordID, record.warc_id());
14                println!("{}: {}", WarcHeader::Date, record.date());
15                println!();
16            }
17        }
18    }
19
20    println!("Total records: {}", count);
21
22    Ok(())
23}
More examples
Hide additional examples
examples/read_filtered.rs (line 22)
10fn main() -> std::io::Result<()> {
11    let mut args = std::env::args_os().skip(1);
12
13    let warc_name = args
14        .next()
15        .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
16
17    let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect();
18    if filtered_file_names.is_empty() {
19        Err(usage_err!("one or more filtered file names not supplied"))?;
20    }
21
22    let mut file = WarcReader::from_path_gzip(warc_name)?;
23
24    let mut count = 0;
25    let mut skipped = 0;
26    let mut stream_iter = file.stream_records();
27    while let Some(record) = stream_iter.next_item() {
28        let record = record.expect("read of headers ok");
29        count += 1;
30        match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
31            Some(v) if has_matching_filename(&v, &filtered_file_names) => {
32                println!("Matches filename, skipping record");
33                skipped += 1;
34            }
35            _ => {
36                let buffered = record.into_buffered().expect("read of record ok");
37                println!(
38                    "Found record. Data:\n{}",
39                    String::from_utf8_lossy(buffered.body())
40                );
41            }
42        }
43    }
44
45    println!("Total records: {}\nSkipped records: {}", count, skipped);
46
47    Ok(())
48}

Auto Trait Implementations§

§

impl<R> Freeze for WarcReader<R>
where R: Freeze,

§

impl<R> RefUnwindSafe for WarcReader<R>
where R: RefUnwindSafe,

§

impl<R> Send for WarcReader<R>
where R: Send,

§

impl<R> Sync for WarcReader<R>
where R: Sync,

§

impl<R> Unpin for WarcReader<R>
where R: Unpin,

§

impl<R> UnwindSafe for WarcReader<R>
where R: UnwindSafe,

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.