use std::{collections::BTreeMap, time::Duration};
/// Counters kept separately for each domain key of a crawl.
///
/// Instances are stored as the values of `CrawlStats::domains` and are
/// created with all counters at zero the first time a domain is recorded.
/// `PartialEq`/`Eq` are derived so two snapshots can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct DomainStats {
    /// Incremented by `CrawlStats::record_scheduled`.
    pub scheduled: u64,
    /// Incremented by `CrawlStats::record_deduped`.
    pub deduped: u64,
    /// Incremented by `CrawlStats::record_completed`.
    pub completed: u64,
    /// Incremented by `CrawlStats::record_failed` and, through it, by
    /// `CrawlStats::record_error`.
    pub failed: u64,
    /// Incremented by `CrawlStats::record_retry`.
    pub retries: u64,
    /// Incremented by `CrawlStats::record_robots_blocked`.
    pub robots_blocked: u64,
}
/// Reason recorded for the end of a crawl.
///
/// Each variant maps to a snake_case label through `StopReason::as_str`,
/// which is also what `CrawlReport::to_json_value` emits.
///
/// NOTE(review): the code that selects a variant is not part of this section
/// of the file; the per-variant meanings below are inferred from the names
/// and should be confirmed against the crawl loop.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StopReason {
/// Presumably: no URLs were left to crawl. Label: `"frontier_exhausted"`.
FrontierExhausted,
/// Presumably: the crawl was interrupted externally. Label: `"interrupted"`.
Interrupted,
/// Presumably: a page-count limit was reached. Label: `"max_pages"`.
MaxPages,
/// Presumably: an item-count limit was reached. Label: `"max_items"`.
MaxItems,
/// Presumably: a time limit was reached. Label: `"max_duration"`.
MaxDuration,
/// Presumably: an error-count limit was reached. Label: `"max_errors"`.
MaxErrors,
}
/// Mutable counters accumulated while a crawl runs.
///
/// `scheduled`, `deduped`, `errors`, `retries`, `robots_blocked` and `domains`
/// are maintained by the `record_*` methods of this type. No method in the
/// visible `impl` updates the remaining fields (`pages_crawled`,
/// `items_scraped`, `duration`, `bytes_downloaded`, `interrupted`,
/// `stop_reason`); since every field is public they are presumably written
/// directly by the crawl loop — confirm against the callers.
///
/// The `Default` value has every counter at zero, an empty `domains` map and
/// no stop reason. Converts into a `CrawlReport` via `From`.
#[derive(Debug, Default, Clone)]
pub struct CrawlStats {
/// Page counter; not touched by the `record_*` methods.
pub pages_crawled: u64,
/// Item counter; not touched by the `record_*` methods.
pub items_scraped: u64,
/// Global error count, incremented by `record_error`.
pub errors: u64,
/// Crawl duration; not touched by the `record_*` methods.
pub duration: Duration,
/// Byte counter; not touched by the `record_*` methods.
pub bytes_downloaded: u64,
/// Interruption flag; not touched by the `record_*` methods.
pub interrupted: bool,
/// Global count of `record_scheduled` calls.
pub scheduled: u64,
/// Global count of `record_deduped` calls.
pub deduped: u64,
/// Global count of `record_retry` calls.
pub retries: u64,
/// Global count of `record_robots_blocked` calls.
pub robots_blocked: u64,
/// Per-domain counters, keyed by the domain string passed to `record_*`.
/// Entries are created on first use; `BTreeMap` keeps keys sorted.
pub domains: BTreeMap<String, DomainStats>,
/// Why the crawl ended, if a reason has been set.
pub stop_reason: Option<StopReason>,
}
impl CrawlStats {
    /// Adds one to the global `scheduled` counter and to the `scheduled`
    /// counter of `domain`, creating the domain entry if it is missing.
    pub fn record_scheduled(&mut self, domain: &str) {
        self.scheduled += 1;
        let per_domain = self.domain_mut(domain);
        per_domain.scheduled += 1;
    }

    /// Adds one to the global `deduped` counter and to the `deduped` counter
    /// of `domain`, creating the domain entry if it is missing.
    pub fn record_deduped(&mut self, domain: &str) {
        self.deduped += 1;
        let per_domain = self.domain_mut(domain);
        per_domain.deduped += 1;
    }

    /// Adds one to the `completed` counter of `domain`.
    ///
    /// Only the per-domain entry changes; no global counter is updated here.
    pub fn record_completed(&mut self, domain: &str) {
        let per_domain = self.domain_mut(domain);
        per_domain.completed += 1;
    }

    /// Adds one to the `failed` counter of `domain`.
    ///
    /// Only the per-domain entry changes; use [`Self::record_error`] to also
    /// bump the global `errors` counter.
    pub fn record_failed(&mut self, domain: &str) {
        let per_domain = self.domain_mut(domain);
        per_domain.failed += 1;
    }

    /// Adds one to the global `errors` counter and to the `failed` counter of
    /// `domain`.
    pub fn record_error(&mut self, domain: &str) {
        self.errors += 1;
        let per_domain = self.domain_mut(domain);
        per_domain.failed += 1;
    }

    /// Adds one to the global `retries` counter and to the `retries` counter
    /// of `domain`, creating the domain entry if it is missing.
    pub fn record_retry(&mut self, domain: &str) {
        self.retries += 1;
        let per_domain = self.domain_mut(domain);
        per_domain.retries += 1;
    }

    /// Adds one to the global `robots_blocked` counter and to the
    /// `robots_blocked` counter of `domain`, creating the domain entry if it
    /// is missing.
    pub fn record_robots_blocked(&mut self, domain: &str) {
        self.robots_blocked += 1;
        let per_domain = self.domain_mut(domain);
        per_domain.robots_blocked += 1;
    }

    /// Returns the counters for `domain`, inserting a zeroed entry first when
    /// the domain has not been seen yet.
    fn domain_mut(&mut self, domain: &str) -> &mut DomainStats {
        let key = domain.to_owned();
        self.domains.entry(key).or_default()
    }
}
impl StopReason {
pub fn as_str(self) -> &'static str {
match self {
Self::FrontierExhausted => "frontier_exhausted",
Self::Interrupted => "interrupted",
Self::MaxPages => "max_pages",
Self::MaxItems => "max_items",
Self::MaxDuration => "max_duration",
Self::MaxErrors => "max_errors",
}
}
}
/// Report form of a crawl's statistics.
///
/// Has exactly the same fields as `CrawlStats` and is built from one through
/// `From<CrawlStats>`, which copies every field unchanged. It has no `Default`
/// implementation. `to_json_value` and the `to_json_string*` methods render it
/// as JSON.
#[derive(Debug, Clone)]
pub struct CrawlReport {
/// Taken from `CrawlStats::pages_crawled`.
pub pages_crawled: u64,
/// Taken from `CrawlStats::items_scraped`.
pub items_scraped: u64,
/// Taken from `CrawlStats::errors`.
pub errors: u64,
/// Taken from `CrawlStats::duration`; emitted in JSON as both
/// `duration_ms` and `duration_secs`.
pub duration: Duration,
/// Taken from `CrawlStats::bytes_downloaded`.
pub bytes_downloaded: u64,
/// Taken from `CrawlStats::interrupted`.
pub interrupted: bool,
/// Taken from `CrawlStats::scheduled`.
pub scheduled: u64,
/// Taken from `CrawlStats::deduped`.
pub deduped: u64,
/// Taken from `CrawlStats::retries`.
pub retries: u64,
/// Taken from `CrawlStats::robots_blocked`.
pub robots_blocked: u64,
/// Per-domain counters, keyed by domain string.
pub domains: BTreeMap<String, DomainStats>,
/// Why the crawl ended, if set; emitted in JSON as a string label or `null`.
pub stop_reason: Option<StopReason>,
}
impl From<CrawlStats> for CrawlReport {
fn from(stats: CrawlStats) -> Self {
Self {
pages_crawled: stats.pages_crawled,
items_scraped: stats.items_scraped,
errors: stats.errors,
duration: stats.duration,
bytes_downloaded: stats.bytes_downloaded,
interrupted: stats.interrupted,
scheduled: stats.scheduled,
deduped: stats.deduped,
retries: stats.retries,
robots_blocked: stats.robots_blocked,
domains: stats.domains,
stop_reason: stats.stop_reason,
}
}
}
impl CrawlReport {
    /// Builds the JSON representation of this report.
    ///
    /// The result is an object with the global counters, the duration both as
    /// whole milliseconds (`duration_ms`) and as fractional seconds
    /// (`duration_secs`), a `domains` object holding one counter object per
    /// domain key, and `stop_reason` as its string label (`null` when unset).
    ///
    /// `duration_ms` saturates at `u64::MAX` rather than overflowing.
    pub fn to_json_value(&self) -> serde_json::Value {
        let domains = self
            .domains
            .iter()
            .map(|(domain, stats)| {
                (
                    domain.clone(),
                    serde_json::json!({
                        "scheduled": stats.scheduled,
                        "deduped": stats.deduped,
                        "completed": stats.completed,
                        "failed": stats.failed,
                        "retries": stats.retries,
                        "robots_blocked": stats.robots_blocked,
                    }),
                )
            })
            .collect::<serde_json::Map<_, _>>();

        // `Duration::as_millis` returns a `u128`, and `json!` unwraps the
        // result of serializing each interpolated value, so a `u128` that the
        // `Value` serializer rejects would panic. Computing the same quantity
        // in `u64` (whole seconds * 1000 + sub-second milliseconds) keeps the
        // output identical for every in-range duration and removes that path.
        let duration_ms = self
            .duration
            .as_secs()
            .saturating_mul(1_000)
            .saturating_add(u64::from(self.duration.subsec_millis()));

        serde_json::json!({
            "pages_crawled": self.pages_crawled,
            "items_scraped": self.items_scraped,
            "errors": self.errors,
            "duration_ms": duration_ms,
            "duration_secs": self.duration.as_secs_f64(),
            "bytes_downloaded": self.bytes_downloaded,
            "interrupted": self.interrupted,
            "scheduled": self.scheduled,
            "deduped": self.deduped,
            "retries": self.retries,
            "robots_blocked": self.robots_blocked,
            "domains": domains,
            "stop_reason": self.stop_reason.map(StopReason::as_str),
        })
    }

    /// Renders the report as compact (single-line) JSON text.
    pub fn to_json_string(&self) -> String {
        self.to_json_value().to_string()
    }

    /// Renders the report as pretty-printed JSON text.
    ///
    /// # Panics
    ///
    /// Panics if `serde_json` fails to serialize the value built by
    /// [`Self::to_json_value`]; the code treats that as an invariant
    /// violation.
    pub fn to_json_string_pretty(&self) -> String {
        serde_json::to_string_pretty(&self.to_json_value())
            .expect("CrawlReport JSON value should always serialize")
    }
}
/// Derives the per-domain statistics key for `url`.
///
/// Returns the URL's host converted to ASCII lowercase. When `url` cannot be
/// parsed, or parses but has no host, the placeholder `"<unknown>"` is
/// returned instead. Only the host is used, so the scheme, port and path do
/// not affect the key.
pub(crate) fn domain_key(url: &str) -> String {
    if let Ok(parsed) = url::Url::parse(url) {
        if let Some(host) = parsed.host_str() {
            return host.to_ascii_lowercase();
        }
    }
    "<unknown>".to_string()
}