pub mod autothrottle;
pub mod default_headers;
pub mod proxy;
pub mod rate_limit;
pub mod status_retry;
pub mod user_agent;
pub use autothrottle::AutoThrottle;
pub use default_headers::DefaultHeaders;
pub use proxy::ProxyRotator;
pub use rate_limit::RateLimiter;
pub use status_retry::StatusRetry;
pub use user_agent::UserAgentRotator;
use crate::{error::KumoError, extract::Response};
use reqwest::{
Method,
header::{HeaderMap, HeaderName, HeaderValue},
};
use std::sync::atomic::{AtomicUsize, Ordering};
/// Strategy used to choose an index into a collection of a given length.
///
/// Each variant carries its state in an [`AtomicUsize`], which is what lets
/// `pick_index` advance that state through a shared `&self` reference.
pub(super) enum RotationStrategy {
/// Walks indices in order; the payload is a counter incremented on every pick.
RoundRobin(AtomicUsize),
/// Pseudo-random selection; the payload is the current xorshift generator state.
Random(AtomicUsize),
}
impl RotationStrategy {
    /// Creates a round-robin strategy whose counter starts at zero, so the
    /// first pick is index `0`.
    pub(super) fn round_robin() -> Self {
        Self::RoundRobin(AtomicUsize::new(0))
    }

    /// Creates a pseudo-random strategy seeded from the wall clock.
    ///
    /// The seed is the sub-second nanosecond component of the current system
    /// time, falling back to `42` if the clock reads earlier than the Unix
    /// epoch. This is a cheap rotation generator, not a cryptographic one.
    pub(super) fn random() -> Self {
        use std::time::{SystemTime, UNIX_EPOCH};

        let seed = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.subsec_nanos() as usize)
            .unwrap_or(42);
        // A xorshift generator maps 0 to 0 forever; forcing the low bit
        // guarantees a non-zero starting state.
        Self::Random(AtomicUsize::new(seed | 1))
    }

    /// Returns the next index in `0..len` according to the strategy.
    ///
    /// * `RoundRobin` — the counter value before this call, modulo `len`.
    /// * `Random` — the freshly advanced xorshift state, modulo `len`.
    ///
    /// The state is advanced with a single atomic read-modify-write in both
    /// variants, so concurrent callers never observe the same state twice.
    ///
    /// # Panics
    ///
    /// Panics if `len` is zero, because there is no valid index to return.
    pub(super) fn pick_index(&self, len: usize) -> usize {
        assert!(
            len > 0,
            "RotationStrategy::pick_index requires a non-empty collection (len == 0)"
        );
        match self {
            Self::RoundRobin(counter) => counter.fetch_add(1, Ordering::Relaxed) % len,
            Self::Random(state) => {
                // `fetch_update` retries on contention, so the step is applied
                // exactly once per call (a separate load + store could lose
                // updates when two threads pick at the same time).
                let previous = state
                    .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| {
                        Some(Self::xorshift(x))
                    })
                    // The closure always returns `Some`, so this is always
                    // `Ok`; both arms carry the previous value regardless.
                    .unwrap_or_else(|unchanged| unchanged);
                // Recompute the value that was stored: `fetch_update` hands
                // back the state *before* the step.
                Self::xorshift(previous) % len
            }
        }
    }

    /// Advances a xorshift state by one step (shift triple 13 / 7 / 17).
    fn xorshift(mut x: usize) -> usize {
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        x
    }
}
/// Mutable description of a request, as passed to
/// [`Middleware::before_request`].
///
/// `url` is the only private field: it is set at construction and exposed
/// read-only through `url()`, while every other field is public and can be
/// changed directly.
pub struct FetchRequest {
// Target URL; readable through `url()`, with no setter in this file.
url: String,
/// HTTP method; `new` defaults this to `GET`.
pub method: Method,
/// Request headers; `new` starts with an empty map.
pub headers: HeaderMap,
/// Optional raw request body bytes.
pub body: Option<Vec<u8>>,
/// Depth value supplied by the caller.
// NOTE(review): presumably the crawl depth at which this URL was discovered — confirm against callers.
pub depth: usize,
/// Optional proxy setting; both constructors leave it `None`.
// NOTE(review): presumably a proxy URL filled in by a proxy middleware — confirm.
pub proxy: Option<String>,
}
impl FetchRequest {
    /// Builds a body-less `GET` request for `url` at the given `depth`.
    ///
    /// The header map starts empty and no proxy is assigned.
    pub fn new(url: impl Into<String>, depth: usize) -> Self {
        let url: String = url.into();
        Self {
            url,
            depth,
            method: Method::GET,
            headers: HeaderMap::new(),
            body: None,
            proxy: None,
        }
    }

    /// Returns the target URL as a string slice.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Returns a mutable handle to the request headers.
    pub fn headers_mut(&mut self) -> &mut HeaderMap {
        &mut self.headers
    }

    /// Sets header `name` to `value` via `HeaderMap::insert` and returns
    /// `self` so calls can be chained.
    ///
    /// Whatever `insert` returns (the previous value, if any) is discarded.
    pub fn header(&mut self, name: HeaderName, value: HeaderValue) -> &mut Self {
        self.headers_mut().insert(name, value);
        self
    }

    /// Copies the URL, method, headers and body out of a `CrawlRequest`,
    /// attaching the supplied `depth`. The proxy is left unset.
    pub(crate) fn from_crawl_request(
        request: &crate::request::CrawlRequest,
        depth: usize,
    ) -> Self {
        // Bind in the same order as the struct fields so the accessor calls
        // on `request` happen in the original sequence.
        let url = request.url().to_string();
        let method = request.method_ref().clone();
        let headers = request.headers().clone();
        let body = request.body_bytes().map(ToOwned::to_owned);
        Self {
            url,
            method,
            headers,
            body,
            depth,
            proxy: None,
        }
    }
}
/// Asynchronous hooks around a fetch: one required hook that receives the
/// outgoing request, plus two optional hooks for the response and for errors.
///
/// Implementors must be `Send + Sync`. The `async fn` methods are made
/// possible by the `async_trait` attribute.
// NOTE(review): the call sites are not in this file; the ordering implied by
// the hook names (before the request is sent / after a response arrives / on
// failure) should be confirmed against the fetch pipeline.
#[async_trait::async_trait]
pub trait Middleware: Send + Sync {
/// Required hook. Receives the request mutably, so it may change the public
/// fields of [`FetchRequest`]; returning `Err` reports a [`KumoError`] to the caller.
async fn before_request(&self, request: &mut FetchRequest) -> Result<(), KumoError>;
/// Optional hook with mutable access to the response. The default
/// implementation does nothing and returns `Ok(())`.
async fn after_response(&self, _response: &mut Response) -> Result<(), KumoError> {
Ok(())
}
/// Optional hook that receives a URL and the associated error. The default
/// implementation is a no-op; the hook cannot fail or replace the error.
async fn on_error(&self, _url: &str, _error: &KumoError) {}
}