pub struct WarcReader<R> { /* private fields */ }
A reader which iteratively parses WARC records from a stream.
Implementations§
impl<R: BufRead> WarcReader<R>
pub fn iter_raw_records(self) -> RawRecordIter<R> ⓘ
Create an iterator over all of the raw records read.
This only checks the headers for well-formedness. See RawRecordHeader for more
information.
Examples found in repository?
examples/read_raw.rs (line 8)
4fn main() -> Result<(), std::io::Error> {
5 let file = WarcReader::from_path("warc_example.warc")?;
6
7 let mut count = 0;
8 for record in file.iter_raw_records() {
9 count += 1;
10 match record {
11 Err(err) => println!("ERROR: {}\r\n", err),
12 Ok((headers, _)) => {
13 println!(
14 "{}: {}",
15 WarcHeader::RecordID,
16 String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::RecordID).unwrap())
17 );
18 println!(
19 "{}: {}",
20 WarcHeader::Date,
21 String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::Date).unwrap())
22 );
23 println!();
24 }
25 }
26 }
27
28 println!("Total records: {}", count);
29
30 Ok(())
31}
pub fn iter_records(self) -> RecordIter<R> ⓘ
Create an iterator over all of the records read.
This will fully build each record and check it for semantic correctness. See the Record
type for more information.
Examples found in repository?
examples/read_file.rs (line 8)
4fn main() -> Result<(), std::io::Error> {
5 let file = WarcReader::from_path("warc_example.warc")?;
6
7 let mut count = 0;
8 for record in file.iter_records() {
9 count += 1;
10 match record {
11 Err(err) => println!("ERROR: {}\r\n", err),
12 Ok(record) => {
13 println!("{}: {}", WarcHeader::RecordID, record.warc_id(),);
14 println!("{}: {}", WarcHeader::Date, record.date(),);
15 println!();
16 }
17 }
18 }
19
20 println!("Total records: {}", count);
21
22 Ok(())
23}
More examples
examples/read_gzip.rs (line 8)
4fn main() -> Result<(), std::io::Error> {
5 let file = WarcReader::from_path_gzip("warc_example.warc.gz")?;
6
7 let mut count = 0;
8 for record in file.iter_records() {
9 count += 1;
10 match record {
11 Err(err) => println!("ERROR: {}\r\n", err),
12 Ok(record) => {
13 println!("{}: {}", WarcHeader::RecordID, record.warc_id());
14 println!("{}: {}", WarcHeader::Date, record.date());
15 println!();
16 }
17 }
18 }
19
20 println!("Total records: {}", count);
21
22 Ok(())
23}
pub fn stream_records(&mut self) -> StreamingIter<'_, R>
Create a streaming iterator over all of the records read.
This will build each record's header and let the caller decide whether to read the body.
Examples found in repository?
examples/read_filtered.rs (line 26)
10fn main() -> std::io::Result<()> {
11 let mut args = std::env::args_os().skip(1);
12
13 let warc_name = args
14 .next()
15 .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
16
17 let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect();
18 if filtered_file_names.is_empty() {
19 Err(usage_err!("one or more filtered file names not supplied"))?;
20 }
21
22 let mut file = WarcReader::from_path_gzip(warc_name)?;
23
24 let mut count = 0;
25 let mut skipped = 0;
26 let mut stream_iter = file.stream_records();
27 while let Some(record) = stream_iter.next_item() {
28 let record = record.expect("read of headers ok");
29 count += 1;
30 match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
31 Some(v) if has_matching_filename(&v, &filtered_file_names) => {
32 println!("Matches filename, skipping record");
33 skipped += 1;
34 }
35 _ => {
36 let buffered = record.into_buffered().expect("read of record ok");
37 println!(
38 "Found record. Data:\n{}",
39 String::from_utf8_lossy(buffered.body())
40 );
41 }
42 }
43 }
44
45 println!("Total records: {}\nSkipped records: {}", count, skipped);
46
47 Ok(())
48}
impl WarcReader<BufReader<File>>
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self>
Create a new reader which reads from a file.
Examples found in repository?
examples/read_file.rs (line 5)
4fn main() -> Result<(), std::io::Error> {
5 let file = WarcReader::from_path("warc_example.warc")?;
6
7 let mut count = 0;
8 for record in file.iter_records() {
9 count += 1;
10 match record {
11 Err(err) => println!("ERROR: {}\r\n", err),
12 Ok(record) => {
13 println!("{}: {}", WarcHeader::RecordID, record.warc_id(),);
14 println!("{}: {}", WarcHeader::Date, record.date(),);
15 println!();
16 }
17 }
18 }
19
20 println!("Total records: {}", count);
21
22 Ok(())
23}
More examples
examples/read_raw.rs (line 5)
4fn main() -> Result<(), std::io::Error> {
5 let file = WarcReader::from_path("warc_example.warc")?;
6
7 let mut count = 0;
8 for record in file.iter_raw_records() {
9 count += 1;
10 match record {
11 Err(err) => println!("ERROR: {}\r\n", err),
12 Ok((headers, _)) => {
13 println!(
14 "{}: {}",
15 WarcHeader::RecordID,
16 String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::RecordID).unwrap())
17 );
18 println!(
19 "{}: {}",
20 WarcHeader::Date,
21 String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::Date).unwrap())
22 );
23 println!();
24 }
25 }
26 }
27
28 println!("Total records: {}", count);
29
30 Ok(())
31}
impl WarcReader<BufReader<MultiDecoder<BufReader<File>>>>
pub fn from_path_gzip<P: AsRef<Path>>(path: P) -> Result<Self>
Create a new reader which reads from a compressed file.
Only GZIP compression is currently supported.
Examples found in repository?
examples/read_gzip.rs (line 5)
4fn main() -> Result<(), std::io::Error> {
5 let file = WarcReader::from_path_gzip("warc_example.warc.gz")?;
6
7 let mut count = 0;
8 for record in file.iter_records() {
9 count += 1;
10 match record {
11 Err(err) => println!("ERROR: {}\r\n", err),
12 Ok(record) => {
13 println!("{}: {}", WarcHeader::RecordID, record.warc_id());
14 println!("{}: {}", WarcHeader::Date, record.date());
15 println!();
16 }
17 }
18 }
19
20 println!("Total records: {}", count);
21
22 Ok(())
23}
More examples
examples/read_filtered.rs (line 22)
10fn main() -> std::io::Result<()> {
11 let mut args = std::env::args_os().skip(1);
12
13 let warc_name = args
14 .next()
15 .ok_or_else(|| usage_err!("compressed warc filename not supplied"))?;
16
17 let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect();
18 if filtered_file_names.is_empty() {
19 Err(usage_err!("one or more filtered file names not supplied"))?;
20 }
21
22 let mut file = WarcReader::from_path_gzip(warc_name)?;
23
24 let mut count = 0;
25 let mut skipped = 0;
26 let mut stream_iter = file.stream_records();
27 while let Some(record) = stream_iter.next_item() {
28 let record = record.expect("read of headers ok");
29 count += 1;
30 match record.header(WarcHeader::TargetURI).map(|s| s.to_string()) {
31 Some(v) if has_matching_filename(&v, &filtered_file_names) => {
32 println!("Matches filename, skipping record");
33 skipped += 1;
34 }
35 _ => {
36 let buffered = record.into_buffered().expect("read of record ok");
37 println!(
38 "Found record. Data:\n{}",
39 String::from_utf8_lossy(buffered.body())
40 );
41 }
42 }
43 }
44
45 println!("Total records: {}\nSkipped records: {}", count, skipped);
46
47 Ok(())
48}
Auto Trait Implementations§
impl<R> Freeze for WarcReader<R> where
    R: Freeze,
impl<R> RefUnwindSafe for WarcReader<R> where
    R: RefUnwindSafe,
impl<R> Send for WarcReader<R> where
    R: Send,
impl<R> Sync for WarcReader<R> where
    R: Sync,
impl<R> Unpin for WarcReader<R> where
    R: Unpin,
impl<R> UnwindSafe for WarcReader<R> where
    R: UnwindSafe,
Blanket Implementations§
impl<T> BorrowMut<T> for T where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more