use std::collections::HashMap;
use std::path::Path;
use serde::{Deserialize, Serialize};
/// Statistics record for a crawl run.
///
/// Derives `Default`, so a fresh value starts with every numeric field at
/// zero and every collection empty. It can be serialized and deserialized
/// through serde.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CrawlStats {
/// Request counter; bumped by one on every `increment_requests_count` call.
pub requests_count: u64,
// NOTE(review): the two concurrency fields are not read or written by any
// method visible in this file; presumably configured limits. Confirm.
pub concurrent_requests: u32,
pub concurrent_requests_per_domain: u32,
// NOTE(review): the counters below are only declared here; whatever updates
// them lives outside this view.
pub failed_requests_count: u64,
pub offsite_requests_count: u64,
pub robots_disallowed_count: u64,
pub cache_hits: u64,
pub cache_misses: u64,
/// Running byte total; grows by `count` on every `increment_response_bytes` call.
pub response_bytes: u64,
pub items_scraped: u64,
pub items_dropped: u64,
/// Subtracted from `end_time` by `elapsed_seconds`.
// NOTE(review): unit is presumably seconds (per that method's name); the
// epoch/clock source is not shown here.
pub start_time: f64,
/// Minuend of the `elapsed_seconds` computation (`end_time - start_time`).
pub end_time: f64,
pub download_delay: f64,
pub blocked_requests_count: u64,
/// Free-form JSON values stored under string keys.
pub custom_stats: HashMap<String, serde_json::Value>,
/// Per-status counters; `increment_status` uses keys of the form `status_<code>`.
pub response_status_count: HashMap<String, u64>,
/// Byte totals keyed by the `domain` string passed to `increment_response_bytes`.
pub domains_response_bytes: HashMap<String, u64>,
/// Request counters keyed by the `sid` string passed to `increment_requests_count`.
pub sessions_requests_count: HashMap<String, u64>,
// NOTE(review): presumably the proxies used during the crawl; not touched
// by any code visible here.
pub proxies: Vec<String>,
// NOTE(review): presumably a count of emitted log records per level name;
// the key format is not shown in this file.
pub log_levels_counter: HashMap<String, u64>,
}
impl CrawlStats {
    /// Returns `end_time - start_time`.
    ///
    /// The value is not clamped: it is negative whenever `end_time` is
    /// smaller than `start_time`.
    pub fn elapsed_seconds(&self) -> f64 {
        self.end_time - self.start_time
    }

    /// Average number of requests per elapsed second.
    ///
    /// Returns `0.0` whenever the elapsed time is not strictly positive
    /// (zero, negative, or NaN), so the result is never a negative or NaN
    /// rate and never the outcome of a division by zero.
    pub fn requests_per_second(&self) -> f64 {
        let elapsed = self.elapsed_seconds();
        // `elapsed > 0.0` is false for NaN as well, which routes every
        // meaningless duration to the zero-rate branch.
        if elapsed > 0.0 {
            self.requests_count as f64 / elapsed
        } else {
            0.0
        }
    }

    /// Adds one to the counter stored under the key `status_<status>`,
    /// creating the entry at zero first if it does not exist yet.
    pub fn increment_status(&mut self, status: u16) {
        *self
            .response_status_count
            .entry(format!("status_{status}"))
            .or_default() += 1;
    }

    /// Adds `count` to the overall `response_bytes` total and to the
    /// per-domain total stored under `domain`.
    pub fn increment_response_bytes(&mut self, domain: &str, count: u64) {
        self.response_bytes += count;
        *self
            .domains_response_bytes
            .entry(domain.to_owned())
            .or_default() += count;
    }

    /// Adds one to the overall `requests_count` and to the per-session
    /// counter stored under `sid`.
    pub fn increment_requests_count(&mut self, sid: &str) {
        self.requests_count += 1;
        *self
            .sessions_requests_count
            .entry(sid.to_owned())
            .or_default() += 1;
    }
}
/// Newtype wrapper around a `Vec<serde_json::Value>`.
///
/// The inner vector is private to this module; the derived `Default` yields
/// an empty list.
#[derive(Debug, Default)]
pub struct ItemList(Vec<serde_json::Value>);
impl ItemList {
    /// Creates an empty list.
    pub fn new() -> Self {
        Self(Vec::new())
    }

    /// Appends `item` to the end of the list.
    pub fn push(&mut self, item: serde_json::Value) {
        self.0.push(item);
    }

    /// Number of items currently stored.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when the list holds no items.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Iterates over the items by reference, in insertion order.
    pub fn iter(&self) -> std::slice::Iter<'_, serde_json::Value> {
        self.0.iter()
    }

    /// Writes all items to `path` as a single JSON array.
    ///
    /// Missing parent directories are created first. With `indent` set the
    /// output is pretty-printed, otherwise it is compact.
    ///
    /// # Errors
    ///
    /// Returns an error if the parent directories cannot be created, if
    /// serialization fails, or if the file cannot be written. A
    /// serialization failure is converted into a `std::io::Error` instead of
    /// being discarded, so a failed export never leaves behind an empty file
    /// that was reported as a success.
    pub fn to_json(&self, path: &Path, indent: bool) -> std::io::Result<()> {
        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir)?;
        }
        let serialized = if indent {
            serde_json::to_vec_pretty(&self.0)
        } else {
            serde_json::to_vec(&self.0)
        };
        let data = serialized.map_err(std::io::Error::from)?;
        std::fs::write(path, data)
    }

    /// Writes the items to `path` in JSON Lines form: one compact JSON
    /// document per item, separated by `\n`, with no trailing newline.
    ///
    /// Missing parent directories are created first.
    ///
    /// # Errors
    ///
    /// Returns an error if the parent directories cannot be created, if any
    /// item fails to serialize, or if the file cannot be written. The first
    /// serialization failure aborts the export before anything is written.
    pub fn to_jsonl(&self, path: &Path) -> std::io::Result<()> {
        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir)?;
        }
        // Collecting into `Result` short-circuits on the first bad item.
        let lines = self
            .0
            .iter()
            .map(|item| serde_json::to_string(item))
            .collect::<Result<Vec<_>, _>>()
            .map_err(std::io::Error::from)?;
        std::fs::write(path, lines.join("\n"))
    }
}
impl IntoIterator for ItemList {
type Item = serde_json::Value;
type IntoIter = std::vec::IntoIter<serde_json::Value>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
impl<'a> IntoIterator for &'a ItemList {
type Item = &'a serde_json::Value;
type IntoIter = std::slice::Iter<'a, serde_json::Value>;
fn into_iter(self) -> Self::IntoIter {
self.0.iter()
}
}
impl std::ops::Index<usize> for ItemList {
type Output = serde_json::Value;
fn index(&self, idx: usize) -> &Self::Output {
&self.0[idx]
}
}
/// Bundles a `CrawlStats` record, an `ItemList`, and a `paused` flag.
pub struct CrawlResult {
/// Statistics record belonging to this result.
pub stats: CrawlStats,
/// Item list; `CrawlResult::len` and `CrawlResult::is_empty` delegate to it.
pub items: ItemList,
/// When `true`, `CrawlResult::completed` reports `false`.
pub paused: bool,
}
impl CrawlResult {
    /// Returns `true` exactly when the `paused` flag is `false`.
    pub fn completed(&self) -> bool {
        let Self { paused, .. } = self;
        !*paused
    }

    /// Number of entries in `items`.
    pub fn len(&self) -> usize {
        let Self { items, .. } = self;
        items.len()
    }

    /// Returns `true` when `items` holds no entries.
    pub fn is_empty(&self) -> bool {
        let Self { items, .. } = self;
        items.is_empty()
    }
}