use super::FileReader;
use anyhow::{Context, Result};
use std::path::Path;
pub struct WarcFileReader;
impl WarcFileReader {
pub fn new() -> Self {
Self
}
}
impl FileReader for WarcFileReader {
fn read_urls(&self, file_path: &Path) -> Result<Vec<String>> {
use std::fs::File;
use std::io::{BufRead, BufReader};
let file = File::open(file_path)
.with_context(|| format!("Failed to open WARC file: {}", file_path.display()))?;
let reader = BufReader::new(file);
let mut urls = Vec::new();
for line in reader.lines() {
let line = line?;
if line.starts_with("WARC-Target-URI:") {
if let Some(url) = line.strip_prefix("WARC-Target-URI:") {
let url = url.trim();
if url.starts_with("http://") || url.starts_with("https://") {
urls.push(url.to_string());
}
}
}
else if line.trim().starts_with("http://") || line.trim().starts_with("https://") {
let url = line.trim();
if url.contains("://") && !url.contains(' ') {
urls.push(url.to_string());
}
}
}
Ok(urls)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_warc_file_reader_creation() {
let reader = WarcFileReader::new();
assert_eq!(std::mem::size_of_val(&reader), 0); }
#[test]
fn test_read_warc_headers() -> Result<()> {
let mut temp_file = NamedTempFile::new()?;
writeln!(temp_file, "WARC/1.0")?;
writeln!(temp_file, "WARC-Type: response")?;
writeln!(temp_file, "WARC-Target-URI: https://example.com/page1")?;
writeln!(temp_file, "Content-Length: 100")?;
writeln!(temp_file)?;
writeln!(temp_file, "HTTP response content here")?;
writeln!(temp_file, "WARC-Target-URI: http://example.org/page2")?;
temp_file.flush()?;
let reader = WarcFileReader::new();
let urls = reader.read_urls(temp_file.path())?;
assert_eq!(urls.len(), 2);
assert!(urls.contains(&"https://example.com/page1".to_string()));
assert!(urls.contains(&"http://example.org/page2".to_string()));
Ok(())
}
}