#![allow(dead_code)]
use std::borrow::Cow;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
use chrono::{DateTime, Utc};
use uuid::Uuid;
use crate::error::CrawlError;
/// Version line emitted as the first line of every record by `write_record`.
const WARC_VERSION: &str = "WARC/1.1";
/// Writes WARC records to a file through a buffered writer.
pub struct WarcWriter {
    /// Buffered handle to the output file; every record is written through it.
    writer: BufWriter<File>,
    /// Record id of the most recently written warcinfo record.
    /// Empty until `write_warcinfo` has succeeded at least once.
    warcinfo_id: Box<str>,
}
impl WarcWriter {
pub fn new(path: &Path) -> Result<Self, CrawlError> {
let file = File::create(path).map_err(|e| CrawlError::Other(format!("create WARC file: {e}")))?;
Ok(Self {
writer: BufWriter::new(file),
warcinfo_id: String::new().into_boxed_str(),
})
}
pub fn write_warcinfo(&mut self, software: &str, hostname: &str) -> Result<(), CrawlError> {
let record_id = make_record_id();
let date = format_warc_date(Utc::now());
let payload = format!("software: {software}\r\nhostname: {hostname}\r\n");
let payload_bytes = payload.as_bytes();
write_record(
&mut self.writer,
&[
("WARC-Type", Cow::Borrowed("warcinfo")),
("WARC-Date", Cow::Owned(date)),
("WARC-Record-ID", Cow::Borrowed(&record_id)),
("Content-Type", Cow::Borrowed("application/warc-fields")),
("Content-Length", Cow::Owned(payload_bytes.len().to_string())),
],
payload_bytes,
)?;
self.warcinfo_id = record_id.into_boxed_str();
Ok(())
}
pub fn write_response(
&mut self,
url: &str,
status: u16,
headers: &[(&str, &str)],
body: &[u8],
fetch_time: DateTime<Utc>,
) -> Result<String, CrawlError> {
let record_id = make_record_id();
let date = format_warc_date(fetch_time);
let http_block = build_http_block(status, headers, body)?;
let mut warc_headers: Vec<(&str, Cow<'_, str>)> = vec![
("WARC-Type", Cow::Borrowed("response")),
("WARC-Date", Cow::Owned(date)),
("WARC-Target-URI", Cow::Borrowed(url)),
("WARC-Record-ID", Cow::Borrowed(&record_id)),
("Content-Type", Cow::Borrowed("application/http; msgtype=response")),
("Content-Length", Cow::Owned(http_block.len().to_string())),
];
if !self.warcinfo_id.is_empty() {
warc_headers.push(("WARC-Warcinfo-ID", Cow::Borrowed(&self.warcinfo_id)));
}
write_record(&mut self.writer, &warc_headers, &http_block)?;
Ok(record_id)
}
pub fn finish(mut self) -> Result<(), CrawlError> {
self.writer
.flush()
.map_err(|e| CrawlError::Other(format!("flush WARC file: {e}")))
}
}
/// Builds a fresh record identifier: a random (v4) UUID written as a URN and
/// wrapped in angle brackets, e.g. `<urn:uuid:…>`.
fn make_record_id() -> String {
    let uuid = Uuid::new_v4();
    format!("<urn:uuid:{uuid}>")
}
/// Renders `dt` as a second-precision timestamp of the form
/// `YYYY-MM-DDTHH:MM:SSZ`.
fn format_warc_date(dt: DateTime<Utc>) -> String {
    // The trailing `Z` is a literal in the pattern; `dt` is UTC by type.
    dt.format("%Y-%m-%dT%H:%M:%SZ").to_string()
}
/// Rejects header names and values that contain a carriage return or line feed.
///
/// The name is checked first, so when both are invalid the error reports the name.
///
/// # Errors
///
/// Returns [`CrawlError::InvalidConfig`] when `name` or `value` contains CR or LF.
fn validate_header_value(name: &str, value: &str) -> Result<(), CrawlError> {
    // CR and LF are ASCII, so scanning bytes is equivalent to scanning chars.
    let has_line_break = |text: &str| text.bytes().any(|b| matches!(b, b'\r' | b'\n'));

    let message = match (has_line_break(name), has_line_break(value)) {
        (true, _) => format!("header name contains invalid CR/LF characters: {name:?}"),
        (false, true) => {
            format!("header value contains invalid CR/LF characters for header {name:?}")
        }
        (false, false) => return Ok(()),
    };
    Err(CrawlError::InvalidConfig(message))
}
/// Serializes an HTTP/1.1 response (status line, headers, blank line, body)
/// into a byte vector.
///
/// All headers are validated before any bytes are produced.
///
/// # Errors
///
/// Propagates the error from `validate_header_value` for the first header
/// whose name or value is rejected.
fn build_http_block(status: u16, headers: &[(&str, &str)], body: &[u8]) -> Result<Vec<u8>, CrawlError> {
    headers
        .iter()
        .try_for_each(|(name, value)| validate_header_value(name, value))?;

    // Per header: name + ": " + value + "\r\n" => len(name) + len(value) + 4.
    let header_bytes: usize = headers.iter().map(|(n, v)| n.len() + v.len() + 4).sum();
    let mut block = Vec::with_capacity(32 + header_bytes + 2 + body.len());

    let status_line = format!(
        "HTTP/1.1 {status} {reason}\r\n",
        reason = http_reason_phrase(status)
    );
    block.extend_from_slice(status_line.as_bytes());

    for (name, value) in headers {
        block.extend_from_slice(name.as_bytes());
        block.extend_from_slice(b": ");
        block.extend_from_slice(value.as_bytes());
        block.extend_from_slice(b"\r\n");
    }

    // Blank line separating the header section from the body.
    block.extend_from_slice(b"\r\n");
    block.extend_from_slice(body);
    Ok(block)
}
/// Writes one complete record to `w`: the version line, each header as
/// `name: value`, a blank line, the payload, and the two-CRLF record terminator.
///
/// # Errors
///
/// Returns [`CrawlError::Other`] on the first failed write; earlier parts of
/// the record may already have been handed to the writer.
fn write_record(w: &mut BufWriter<File>, headers: &[(&str, Cow<'_, str>)], payload: &[u8]) -> Result<(), CrawlError> {
    const CRLF: &[u8] = b"\r\n";

    // Single place that performs a write and converts the I/O error.
    let mut emit = |chunk: &[u8]| {
        w.write_all(chunk)
            .map_err(|e| CrawlError::Other(format!("write WARC record: {e}")))
    };

    emit(WARC_VERSION.as_bytes())?;
    emit(CRLF)?;
    for (field, content) in headers {
        emit(field.as_bytes())?;
        emit(b": ")?;
        emit(content.as_bytes())?;
        emit(CRLF)?;
    }
    // Blank line ends the header section.
    emit(CRLF)?;
    emit(payload)?;
    emit(b"\r\n\r\n")
}
/// Returns the standard reason phrase for an HTTP status code.
///
/// Covers the status codes defined in RFC 9110 plus the widely used codes
/// from RFC 6585 (428, 429, 431) and RFC 7725 (451). Any other code yields
/// `"Unknown"`.
fn http_reason_phrase(status: u16) -> &'static str {
    match status {
        // 1xx informational
        100 => "Continue",
        101 => "Switching Protocols",
        // 2xx success
        200 => "OK",
        201 => "Created",
        202 => "Accepted",
        203 => "Non-Authoritative Information",
        204 => "No Content",
        205 => "Reset Content",
        206 => "Partial Content",
        // 3xx redirection
        300 => "Multiple Choices",
        301 => "Moved Permanently",
        302 => "Found",
        303 => "See Other",
        304 => "Not Modified",
        307 => "Temporary Redirect",
        308 => "Permanent Redirect",
        // 4xx client error
        400 => "Bad Request",
        401 => "Unauthorized",
        402 => "Payment Required",
        403 => "Forbidden",
        404 => "Not Found",
        405 => "Method Not Allowed",
        406 => "Not Acceptable",
        407 => "Proxy Authentication Required",
        408 => "Request Timeout",
        409 => "Conflict",
        410 => "Gone",
        411 => "Length Required",
        412 => "Precondition Failed",
        413 => "Content Too Large",
        414 => "URI Too Long",
        415 => "Unsupported Media Type",
        416 => "Range Not Satisfiable",
        417 => "Expectation Failed",
        421 => "Misdirected Request",
        422 => "Unprocessable Content",
        426 => "Upgrade Required",
        428 => "Precondition Required",
        429 => "Too Many Requests",
        431 => "Request Header Fields Too Large",
        451 => "Unavailable For Legal Reasons",
        // 5xx server error
        500 => "Internal Server Error",
        501 => "Not Implemented",
        502 => "Bad Gateway",
        503 => "Service Unavailable",
        504 => "Gateway Timeout",
        505 => "HTTP Version Not Supported",
        _ => "Unknown",
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Record ids must be wrapped as `<urn:uuid:…>`.
    #[test]
    fn test_make_record_id_format() {
        let id = make_record_id();
        assert!(id.starts_with("<urn:uuid:"));
        assert!(id.ends_with('>'));
    }

    /// Dates are rendered with second precision and a literal `Z` suffix.
    #[test]
    fn test_format_warc_date() {
        let dt = DateTime::parse_from_rfc3339("2026-04-09T12:00:00Z")
            .expect("valid date")
            .with_timezone(&Utc);
        assert_eq!(format_warc_date(dt), "2026-04-09T12:00:00Z");
    }

    /// A well-formed response is serialized as status line, headers, blank line, body.
    #[test]
    fn test_build_http_block() {
        let headers = vec![("Content-Type", "text/html")];
        let body = b"<html></html>";
        let block = build_http_block(200, &headers, body).expect("valid block");
        let text = String::from_utf8_lossy(&block);
        assert!(text.starts_with("HTTP/1.1 200 OK\r\n"));
        assert!(text.contains("Content-Type: text/html\r\n"));
        assert!(text.contains("\r\n\r\n<html></html>"));
    }

    /// With no headers and no body the block is just the status line and the blank line.
    #[test]
    fn test_build_http_block_empty_headers_and_body() {
        let block = build_http_block(204, &[], b"").expect("valid block");
        assert_eq!(block, b"HTTP/1.1 204 No Content\r\n\r\n".to_vec());
    }

    /// Line breaks in a header name or value must be rejected, not written out.
    #[test]
    fn test_build_http_block_rejects_crlf() {
        assert!(build_http_block(200, &[("X-Test", "a\r\nInjected: 1")], b"").is_err());
        assert!(build_http_block(200, &[("X-Test", "a\nb")], b"").is_err());
        assert!(build_http_block(200, &[("X-Te\rst", "a")], b"").is_err());
    }

    /// Known codes map to their phrase; anything else falls back to "Unknown".
    #[test]
    fn test_http_reason_phrase_known() {
        assert_eq!(http_reason_phrase(200), "OK");
        assert_eq!(http_reason_phrase(404), "Not Found");
        assert_eq!(http_reason_phrase(999), "Unknown");
    }
}