#![allow(dead_code)]
use std::time::Duration;
use crate::error::CrawlError;
use crate::types::{CachedPage, CrawlPageResult, ScrapeResult};
use async_trait::async_trait;
/// A URL queued in the crawl frontier, together with scheduling metadata.
#[derive(Debug, Clone)]
pub struct FrontierEntry {
/// The URL to fetch.
pub url: String,
/// Link depth from the seed URL(s) — presumably 0 for seeds; TODO confirm.
pub depth: usize,
/// Scheduling priority; how it orders entries is up to the `Frontier` impl.
pub priority: f64,
}
/// Aggregate counters for a crawl run. `Default` yields all-zero stats.
#[derive(Debug, Clone, Default)]
pub struct CrawlStats {
/// Count of pages crawled successfully.
pub pages_crawled: usize,
/// Count of pages that ended in an error.
pub pages_failed: usize,
/// Count of URLs discovered during the crawl.
pub urls_discovered: usize,
/// Count of discovered URLs that were filtered out.
pub urls_filtered: usize,
/// Elapsed wall-clock time — NOTE(review): presumably measured from crawl
/// start; confirm where this is populated.
pub elapsed: Duration,
}
/// Payload for `EventEmitter::on_page`, describing one crawled page.
#[derive(Debug, Clone)]
pub struct PageEvent {
/// URL of the page this event refers to.
pub url: String,
/// HTTP status code returned for the fetch.
pub status_code: u16,
/// Link depth at which the page was reached.
pub depth: usize,
}
/// Payload for `EventEmitter::on_error`, describing a failed URL.
#[derive(Debug, Clone)]
pub struct ErrorEvent {
/// URL that failed.
pub url: String,
/// Error description as a plain string.
pub error: String,
}
/// Payload for `EventEmitter::on_complete`, emitted when a crawl finishes.
#[derive(Debug, Clone)]
pub struct CompleteEvent {
/// Total number of pages crawled over the whole run.
pub pages_crawled: usize,
}
/// Work queue for URLs awaiting crawl, plus seen-URL bookkeeping.
#[async_trait]
pub trait Frontier: Send + Sync {
    /// Enqueues an entry for future crawling.
    async fn push(&self, entry: FrontierEntry) -> Result<(), CrawlError>;
    /// Removes and returns the next entry, or `None` when the queue is drained.
    async fn pop(&self) -> Result<Option<FrontierEntry>, CrawlError>;
    /// Pops up to `n` entries by calling [`Frontier::pop`] repeatedly.
    ///
    /// Returns a shorter batch (possibly empty) as soon as `pop` yields
    /// `None`; any error from `pop` is propagated immediately.
    async fn pop_batch(&self, n: usize) -> Result<Vec<FrontierEntry>, CrawlError> {
        let mut entries = Vec::with_capacity(n);
        while entries.len() < n {
            if let Some(entry) = self.pop().await? {
                entries.push(entry);
            } else {
                break;
            }
        }
        Ok(entries)
    }
    /// Number of entries currently queued.
    async fn len(&self) -> Result<usize, CrawlError>;
    /// Convenience wrapper: `true` when [`Frontier::len`] reports zero.
    async fn is_empty(&self) -> Result<bool, CrawlError> {
        let count = self.len().await?;
        Ok(count == 0)
    }
    /// Reports whether `url` was already recorded via [`Frontier::mark_seen`].
    async fn is_seen(&self, url: &str) -> Result<bool, CrawlError>;
    /// Records `url` as seen — presumably to de-duplicate enqueues; the caller
    /// is responsible for checking `is_seen` first. TODO confirm at call sites.
    async fn mark_seen(&self, url: &str) -> Result<(), CrawlError>;
}
/// Per-domain request pacing.
#[async_trait]
pub trait RateLimiter: Send + Sync {
/// Waits (if necessary) until a request to `domain` may proceed.
async fn acquire(&self, domain: &str) -> Result<(), CrawlError>;
/// Feeds back the HTTP `status` of a completed request — presumably so
/// implementations can back off on 429/5xx responses; TODO confirm.
async fn record_response(&self, domain: &str, status: u16) -> Result<(), CrawlError>;
/// Sets an explicit per-domain delay (e.g. from a robots.txt `Crawl-delay`).
async fn set_crawl_delay(&self, domain: &str, delay: Duration) -> Result<(), CrawlError>;
}
/// Persistence sink for crawl output.
#[async_trait]
pub trait CrawlStore: Send + Sync {
/// Persists a scrape result for `url`.
async fn store_page(&self, url: &str, result: &ScrapeResult) -> Result<(), CrawlError>;
/// Persists a page result produced during a crawl.
async fn store_crawl_page(&self, url: &str, result: &CrawlPageResult) -> Result<(), CrawlError>;
/// Records that crawling `url` failed with `error`.
async fn store_error(&self, url: &str, error: &CrawlError) -> Result<(), CrawlError>;
/// Called with the final aggregate stats when the crawl finishes.
async fn on_complete(&self, stats: &CrawlStats) -> Result<(), CrawlError>;
}
/// Observer hooks for crawl progress. The methods return `()`, so an emitter
/// cannot signal failure to the crawler; it must handle its own errors.
#[async_trait]
pub trait EventEmitter: Send + Sync {
/// Invoked after a page has been crawled.
async fn on_page(&self, event: &PageEvent);
/// Invoked when crawling a URL fails.
async fn on_error(&self, event: &ErrorEvent);
/// Invoked once when the crawl finishes.
async fn on_complete(&self, event: &CompleteEvent);
/// Invoked when a new URL is discovered at the given link depth.
async fn on_discovered(&self, url: &str, depth: usize);
}
/// Pluggable policy controlling crawl ordering and termination.
pub trait CrawlStrategy: Send + Sync {
    /// Picks the index of the candidate to crawl next, or `None` to select nothing.
    fn select_next(&self, candidates: &[FrontierEntry]) -> Option<usize>;
    /// Scores a URL for prioritization. The default ignores the URL and
    /// decays with depth: 1 / (depth + 1), so shallower URLs score higher.
    fn score_url(&self, url: &str, depth: usize) -> f64 {
        let _ = url; // unused by the default implementation
        (depth as f64 + 1.0).recip()
    }
    /// Whether the crawl should keep going. The default never stops
    /// based on stats alone.
    fn should_continue(&self, stats: &CrawlStats) -> bool {
        let _ = stats; // unused by the default implementation
        true
    }
    /// Hook invoked after each page is processed; no-op by default.
    fn on_page_processed(&self, _page: &CrawlPageResult) {}
}
/// Post-crawl page filter / transformer.
#[async_trait]
pub trait ContentFilter: Send + Sync {
/// Consumes `page` and returns `Some` (possibly modified) to keep it,
/// `None` to drop it, or `Err` to report a filtering failure.
async fn filter(&self, page: CrawlPageResult) -> Result<Option<CrawlPageResult>, CrawlError>;
}
/// Key-value cache of previously fetched pages.
#[async_trait]
pub trait CrawlCache: Send + Sync {
    /// Looks up a cached page by `key`; `None` on a cache miss.
    async fn get(&self, key: &str) -> Result<Option<CachedPage>, CrawlError>;
    /// Stores `page` under `key`.
    async fn set(&self, key: &str, page: &CachedPage) -> Result<(), CrawlError>;
    /// Existence check. Defaults to delegating to [`CrawlCache::get`] (the same
    /// pattern `Frontier::is_empty` uses with `len`), which keeps `has` and
    /// `get` consistent by construction; implementations should override it
    /// when they can answer without retrieving the full page.
    async fn has(&self, key: &str) -> Result<bool, CrawlError> {
        Ok(self.get(key).await?.is_some())
    }
}