// kumo 0.2.6
//
// An async web crawling framework for Rust — Scrapy for Rust
// Documentation
use thiserror::Error;

/// Payload-free classification of a [`KumoError`].
///
/// There is exactly one kind per `KumoError` variant, carrying the same name;
/// [`KumoError::kind`] performs the mapping. Because the kind holds no data it
/// is `Copy` and can be compared or hashed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KumoErrorKind {
    /// Kind reported for [`KumoError::Fetch`].
    Fetch,
    /// Kind reported for [`KumoError::Parse`].
    Parse,
    /// Kind reported for [`KumoError::Store`].
    Store,
    /// Kind reported for [`KumoError::InvalidUrl`].
    InvalidUrl,
    /// Kind reported for [`KumoError::DepthExceeded`].
    DepthExceeded,
    /// Kind reported for [`KumoError::DomainNotAllowed`].
    DomainNotAllowed,
    /// Kind reported for [`KumoError::Llm`].
    Llm,
    /// Kind reported for [`KumoError::Browser`].
    Browser,
    /// Kind reported for [`KumoError::HttpStatus`].
    HttpStatus,
}

/// Error type of the crate.
///
/// `Display` and `std::error::Error` are generated by `thiserror` from the
/// `#[error(...)]` attribute on each variant. Use [`KumoError::kind`] to get a
/// payload-free [`KumoErrorKind`] for matching.
#[derive(Debug, Error)]
pub enum KumoError {
    /// An error raised by `reqwest`.
    ///
    /// `#[from]` generates `From<reqwest::Error>`, so `?` converts a
    /// `reqwest::Error` into this variant and exposes it as the source.
    #[error("fetch error: {0}")]
    Fetch(#[from] reqwest::Error),

    /// A parsing failure: a free-form `context` plus the underlying error.
    ///
    /// The source is both interpolated into the message (`{source}`) and
    /// exposed through `Error::source()`.
    #[error("parse error - {context}: {source}")]
    Parse {
        context: String,
        #[source]
        source: Box<dyn std::error::Error + Send + Sync>,
    },

    /// A storage failure: a free-form `context` plus the underlying error.
    ///
    /// The source is both interpolated into the message (`{source}`) and
    /// exposed through `Error::source()`.
    #[error("store error - {context}: {source}")]
    Store {
        context: String,
        #[source]
        source: Box<dyn std::error::Error + Send + Sync>,
    },

    /// A URL was rejected as invalid; the payload is what [`KumoError::url`]
    /// returns for this variant.
    #[error("invalid URL: {0}")]
    InvalidUrl(String),

    /// The maximum crawl depth was exceeded. Carries no payload.
    #[error("max crawl depth exceeded")]
    DepthExceeded,

    /// A domain was not allowed; the payload is what [`KumoError::url`]
    /// returns for this variant.
    #[error("domain not allowed: {0}")]
    DomainNotAllowed(String),

    /// An LLM-related failure described by a message string.
    #[error("llm error: {0}")]
    Llm(String),

    /// A browser-related failure described by a message string.
    #[error("browser error: {0}")]
    Browser(String),

    /// Returned by `StatusRetry` middleware when the response status code matches
    /// the retry set. Triggers the engine's exponential-backoff retry loop.
    #[error("HTTP {status} from {url}")]
    HttpStatus { status: u16, url: String },
}

/// Error type whose whole content is a message string.
///
/// It exists so that a bare `String` can be placed in a
/// `Box<dyn std::error::Error + Send + Sync>`. It reports no source of its own.
#[derive(Debug)]
struct Msg(String);

impl std::error::Error for Msg {}

impl std::fmt::Display for Msg {
    /// Writes the wrapped message verbatim; width and fill flags of the
    /// surrounding formatter are not applied.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Msg(text) = self;
        out.write_str(text)
    }
}

impl KumoError {
    /// Reports which [`KumoErrorKind`] this error belongs to, dropping any payload.
    pub fn kind(&self) -> KumoErrorKind {
        match self {
            Self::HttpStatus { .. } => KumoErrorKind::HttpStatus,
            Self::Browser(..) => KumoErrorKind::Browser,
            Self::Llm(..) => KumoErrorKind::Llm,
            Self::DomainNotAllowed(..) => KumoErrorKind::DomainNotAllowed,
            Self::DepthExceeded => KumoErrorKind::DepthExceeded,
            Self::InvalidUrl(..) => KumoErrorKind::InvalidUrl,
            Self::Store { .. } => KumoErrorKind::Store,
            Self::Parse { .. } => KumoErrorKind::Parse,
            Self::Fetch(..) => KumoErrorKind::Fetch,
        }
    }

    /// Builds an `InvalidUrl` error carrying `url`.
    pub fn invalid_url(url: impl Into<String>) -> Self {
        let url: String = url.into();
        Self::InvalidUrl(url)
    }

    /// Builds an `Llm` error carrying `message`.
    pub fn llm(message: impl Into<String>) -> Self {
        let message: String = message.into();
        Self::Llm(message)
    }

    /// Builds a `Browser` error carrying `message`.
    pub fn browser(message: impl Into<String>) -> Self {
        let message: String = message.into();
        Self::Browser(message)
    }

    /// Builds an `HttpStatus` error from a status code and the URL it came from.
    pub fn http_status(status: u16, url: impl Into<String>) -> Self {
        let url: String = url.into();
        Self::HttpStatus { status, url }
    }

    /// HTTP status code associated with this error, if there is one.
    ///
    /// `HttpStatus` always has a code; `Fetch` has one only when the wrapped
    /// `reqwest::Error` reports a status. Every other variant yields `None`.
    pub fn status_code(&self) -> Option<u16> {
        match self {
            Self::Fetch(cause) => {
                let code = cause.status()?;
                Some(code.as_u16())
            }
            Self::HttpStatus { status, .. } => Some(*status),
            Self::Parse { .. }
            | Self::Store { .. }
            | Self::InvalidUrl(_)
            | Self::DepthExceeded
            | Self::DomainNotAllowed(_)
            | Self::Llm(_)
            | Self::Browser(_) => None,
        }
    }

    /// URL associated with this error, if there is one.
    ///
    /// `HttpStatus`, `InvalidUrl` and `DomainNotAllowed` return their stored
    /// string; `Fetch` returns the URL of the wrapped `reqwest::Error` when it
    /// has one. Every other variant yields `None`.
    pub fn url(&self) -> Option<&str> {
        match self {
            Self::Fetch(cause) => cause.url().map(|target| target.as_str()),
            Self::HttpStatus { url, .. } | Self::InvalidUrl(url) | Self::DomainNotAllowed(url) => {
                Some(url.as_str())
            }
            Self::Parse { .. }
            | Self::Store { .. }
            | Self::DepthExceeded
            | Self::Llm(_)
            | Self::Browser(_) => None,
        }
    }

    /// Builds a `Parse` error from a context string and an underlying error,
    /// which is boxed and kept as the source.
    pub fn parse(
        context: impl Into<String>,
        source: impl std::error::Error + Send + Sync + 'static,
    ) -> Self {
        let context: String = context.into();
        let source: Box<dyn std::error::Error + Send + Sync> = Box::new(source);
        Self::Parse { context, source }
    }

    /// Builds a `Parse` error from a plain message when no underlying error exists.
    ///
    /// The message is stored as the context and, wrapped in `Msg`, also as the
    /// source, so the rendered error shows it on both sides of the colon.
    pub fn parse_msg(msg: impl Into<String>) -> Self {
        let text: String = msg.into();
        Self::parse(text.clone(), Msg(text))
    }

    /// Builds a `Store` error from a context string and an underlying error,
    /// which is boxed and kept as the source.
    pub fn store(
        context: impl Into<String>,
        source: impl std::error::Error + Send + Sync + 'static,
    ) -> Self {
        let context: String = context.into();
        let source: Box<dyn std::error::Error + Send + Sync> = Box::new(source);
        Self::Store { context, source }
    }

    /// Builds a `Store` error from a plain message when no underlying error exists.
    ///
    /// The message is stored as the context and, wrapped in `Msg`, also as the
    /// source, so the rendered error shows it on both sides of the colon.
    pub fn store_msg(msg: impl Into<String>) -> Self {
        let text: String = msg.into();
        Self::store(text.clone(), Msg(text))
    }
}

/// Determines what the engine does when Spider::parse or a fetch fails.
///
/// Policies implement `PartialEq`/`Eq`, so they can be compared with `==` and
/// used in `assert_eq!`; `Retry` values are equal only when their counts match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ErrorPolicy {
    /// Skip this URL and continue crawling. (default)
    ///
    /// NOTE(review): described as the default, but no `Default` impl is visible
    /// in this file; confirm where the default is actually chosen.
    Skip,
    /// Abort the entire crawl immediately.
    Abort,
    /// Retry this URL up to N more times (via the frontier).
    Retry(u32),
}