// kumo 0.3.12
//
// An async web crawling framework for Rust - Scrapy for Rust.
// Documentation
pub mod autothrottle;
pub mod default_headers;
pub mod proxy;
pub mod rate_limit;
pub mod status_retry;
pub mod user_agent;

pub use autothrottle::AutoThrottle;
pub use default_headers::DefaultHeaders;
pub use proxy::ProxyRotator;
pub use rate_limit::RateLimiter;
pub use status_retry::StatusRetry;
pub use user_agent::UserAgentRotator;

use crate::{error::KumoError, extract::Response};
use reqwest::{
    Method,
    header::{HeaderMap, HeaderName, HeaderValue},
};
use std::{
    sync::atomic::{AtomicUsize, Ordering},
    time::Duration,
};

/// Shared selection strategy used by `UserAgentRotator` and `ProxyRotator`.
///
/// Each variant keeps its state in an `AtomicUsize`, which is what lets
/// `pick_index` advance that state through a shared `&self` reference.
pub(super) enum RotationStrategy {
    /// Cycle through indices in order. The payload is a call counter that
    /// `round_robin()` starts at zero and `pick_index` reduces modulo the
    /// list length.
    RoundRobin(AtomicUsize),
    /// Pseudo-random selection. The payload is the XorShift generator state,
    /// which `random()` seeds with an odd (and therefore non-zero) value.
    Random(AtomicUsize),
}

impl RotationStrategy {
    /// Create a round-robin strategy whose counter starts at zero.
    pub(super) fn round_robin() -> Self {
        Self::RoundRobin(AtomicUsize::new(0))
    }

    /// Create a pseudo-random strategy seeded from the wall clock.
    ///
    /// The seed is the sub-second nanosecond component of the current system
    /// time, or `42` when the clock reports a time before the Unix epoch.
    /// This is a cheap, dependency-free generator for spreading load; it is
    /// not suitable for anything security-sensitive.
    pub(super) fn random() -> Self {
        use std::time::{SystemTime, UNIX_EPOCH};
        let seed = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.subsec_nanos() as usize)
            .unwrap_or(42);
        // XorShift maps zero to zero forever, so force the seed to be non-zero.
        Self::Random(AtomicUsize::new(seed | 1))
    }

    /// Advance an XorShift state by one step.
    ///
    /// Pure function of its input, so it is safe to call repeatedly from the
    /// retry loop inside `AtomicUsize::fetch_update`.
    fn xorshift(mut x: usize) -> usize {
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        x
    }

    /// Return the index to use from a list of length `len`.
    ///
    /// For a non-empty list the result is always in `0..len`. For `len == 0`
    /// there is no valid index: the function returns `0` without touching the
    /// internal state instead of panicking on a division by zero, and callers
    /// must not index an empty list with it.
    pub(super) fn pick_index(&self, len: usize) -> usize {
        if len == 0 {
            return 0;
        }
        match self {
            Self::RoundRobin(counter) => counter.fetch_add(1, Ordering::Relaxed) % len,
            Self::Random(state) => {
                // XorShift pseudo-random — no external dependency needed.
                // The state is advanced with one atomic read-modify-write so
                // that concurrent callers never observe the same state twice
                // or overwrite each other's update.
                let previous = state
                    .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| {
                        Some(Self::xorshift(x))
                    })
                    // The closure always returns `Some`, so `Err` cannot occur;
                    // fall back to the reported value rather than panicking.
                    .unwrap_or_else(|unchanged| unchanged);
                // Recompute the value that was just stored; `fetch_update`
                // only hands back the state from before the step.
                Self::xorshift(previous) % len
            }
        }
    }
}

/// A fetch-time HTTP request, passed through middleware before fetching.
///
/// The URL field is private and read through `url()`; the remaining fields
/// are public so middleware can adjust them in place.
pub struct FetchRequest {
    // Target URL; private, exposed read-only through `url()`.
    url: String,
    /// HTTP method. `FetchRequest::new` sets this to `GET`.
    pub method: Method,
    /// Request headers. Empty when the request is built with `FetchRequest::new`.
    pub headers: HeaderMap,
    /// Optional raw request body bytes; `None` when no body is set.
    pub body: Option<Vec<u8>>,
    /// Depth value supplied by whoever constructs the request
    /// (presumably the crawl depth of this URL — confirm against the engine).
    pub depth: usize,
    /// Proxy URL set by `ProxyRotator` middleware (e.g. `"http://user:pass@host:port"`).
    /// The `HttpFetcher` reads this field to route the request through the specified proxy.
    pub proxy: Option<String>,
}

impl FetchRequest {
    /// Build a body-less `GET` request for `url` with no headers and no proxy.
    ///
    /// `depth` is stored unchanged in the `depth` field.
    pub fn new(url: impl Into<String>, depth: usize) -> Self {
        let url = url.into();
        Self {
            url,
            depth,
            method: Method::GET,
            headers: HeaderMap::new(),
            body: None,
            proxy: None,
        }
    }

    /// Borrow the URL this request will fetch.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Borrow the header map mutably so it can be edited before the fetch.
    pub fn headers_mut(&mut self) -> &mut HeaderMap {
        let Self { headers, .. } = self;
        headers
    }

    /// Set `name` to `value`, replacing any value already stored under that
    /// name, and return `self` so calls can be chained.
    pub fn header(&mut self, name: HeaderName, value: HeaderValue) -> &mut Self {
        // `insert` hands back the value it displaced, if any; it is not needed.
        let _ = self.headers.insert(name, value);
        self
    }

    /// Copy the URL, method, headers and body out of a `CrawlRequest`.
    ///
    /// The proxy is left unset, and `depth` is taken from the argument rather
    /// than from `request`.
    pub(crate) fn from_crawl_request(request: &crate::request::CrawlRequest, depth: usize) -> Self {
        let mut fetch = Self::new(request.url().to_string(), depth);
        fetch.method = request.method_ref().clone();
        fetch.headers = request.headers().clone();
        fetch.body = request.body_bytes().map(|bytes| bytes.to_owned());
        fetch
    }
}

/// Wraps the fetch cycle with pre/post-request hooks.
/// Multiple middleware are applied in registration order.
///
/// Implementors must be `Send + Sync`. Only `before_request` is required;
/// every other method has a default body and can be left unimplemented.
#[async_trait::async_trait]
pub trait Middleware: Send + Sync {
    /// Called before the HTTP request is sent.
    /// Modify `request` in place (e.g., inject headers, enforce rate limits).
    ///
    /// # Errors
    ///
    /// Implementations may return a `KumoError`. How the engine reacts to it
    /// is decided outside this trait — NOTE(review): confirm against the engine.
    async fn before_request(&self, request: &mut FetchRequest) -> Result<(), KumoError>;

    /// Called after a successful HTTP response.
    /// Modify `response` in place, or return an error to trigger the spider's error policy.
    ///
    /// The default implementation leaves `response` untouched and returns `Ok(())`.
    async fn after_response(&self, _response: &mut Response) -> Result<(), KumoError> {
        Ok(())
    }

    /// Called when a URL permanently fails (after all retries are exhausted).
    /// Use this to log failures, mark proxies as bad, emit metrics, etc.
    /// Default implementation does nothing.
    ///
    /// This hook has no return value, so it cannot report a failure of its own.
    async fn on_error(&self, _url: &str, _error: &KumoError) {}

    /// Optional retry delay hint for an error observed by this middleware.
    ///
    /// Middleware can use this to pass server-provided backoff information,
    /// such as `Retry-After`, to the engine without changing `KumoError`.
    ///
    /// Unlike the other hooks this method is synchronous. The default
    /// implementation returns `None`, i.e. it offers no hint.
    fn retry_delay(&self, _url: &str, _error: &KumoError) -> Option<Duration> {
        None
    }
}