kumo 0.3.2

An async web crawling framework for Rust - Scrapy for Rust
Documentation
use crate::{error::KumoError, extract::selector::re_matches};

/// A single node extracted from an HTML document.
///
/// Kept separate from `Element` to allow future extractors (XPath, LLM)
/// to return richer data without touching the CSS-specific types.
///
/// The type is plain owned data, so it derives the usual value-type traits:
/// nodes can be logged with `{:?}`, cloned, and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedNode {
    /// HTML markup of the node. The accessor methods on this type re-parse
    /// this string as an HTML fragment each time they are called.
    pub outer_html: String,
}

impl ExtractedNode {
    /// Returns the concatenated text content of this node and all its descendants.
    ///
    /// `outer_html` is re-parsed as an HTML fragment on every call; the text
    /// pieces yielded by the parsed tree are joined with no separator.
    pub fn text(&self) -> String {
        let fragment = scraper::Html::parse_fragment(&self.outer_html);
        // `String` implements `FromIterator<&str>`, so the pieces can be
        // collected directly without building an intermediate `Vec`.
        fragment.root_element().text().collect()
    }

    /// Returns the value of the attribute `name`, or `None` when the attribute
    /// is absent or no suitable element is found in `outer_html`.
    ///
    /// `outer_html` is re-parsed as an HTML fragment on every call, and the
    /// attribute is read from the first element that is not named `html` or
    /// `body`.
    ///
    /// # Panics
    ///
    /// Only if the constant selector `*` failed to parse, which would be a bug
    /// in the selector parser rather than a recoverable condition.
    pub fn attr(&self, name: &str) -> Option<String> {
        let fragment = scraper::Html::parse_fragment(&self.outer_html);
        let any_element =
            scraper::Selector::parse("*").expect("`*` is always a valid CSS selector");
        fragment
            .select(&any_element)
            // Presumably `html`/`body` are wrapper elements introduced by
            // fragment parsing; skipping them lands on the node itself.
            // NOTE(review): a node that is itself `<html>` or `<body>` is also
            // skipped here, so its own attributes cannot be read this way.
            .find(|el| !matches!(el.value().name(), "html" | "body"))
            .and_then(|el| el.value().attr(name))
            .map(String::from)
    }
}

/// Extension point for pluggable extraction strategies.
///
/// The default implementation (`CssExtractor`) uses CSS selectors via `scraper`.
/// Future crates can implement XPath or LLM-based extraction without touching
/// core kumo code.
///
/// The `Send + Sync` supertraits mean every implementor must be safe to share
/// and send across threads.
pub trait Extractor: Send + Sync {
    /// Applies `selector` to the markup in `html` and returns the extracted nodes.
    ///
    /// The selector syntax is defined by the implementor (CSS selectors for
    /// `CssExtractor`).
    ///
    /// # Errors
    ///
    /// Returns a `KumoError` when extraction cannot be performed; `CssExtractor`,
    /// for example, does so when the selector fails to parse.
    fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError>;
}

/// Default CSS-selector extractor backed by the `scraper` crate.
///
/// This is a field-less unit struct: the selector is supplied on each call to
/// `Extractor::extract` rather than stored on the extractor.
pub struct CssExtractor;

impl Extractor for CssExtractor {
    /// Parses `html` as a complete document and returns one `ExtractedNode`
    /// for every element matched by the CSS `selector`, in the order the
    /// selection yields them. Each node carries the matched element's
    /// serialized HTML.
    ///
    /// # Errors
    ///
    /// Returns a `KumoError` built with `KumoError::parse_msg` when `selector`
    /// is not a valid CSS selector.
    fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError> {
        // Compile the selector up front so a bad selector is rejected before
        // any document parsing work is done.
        let matcher = scraper::Selector::parse(selector).map_err(|e| {
            KumoError::parse_msg(format!("invalid CSS selector '{selector}': {e:?}"))
        })?;

        let page = scraper::Html::parse_document(html);

        let mut found = Vec::new();
        for element in page.select(&matcher) {
            found.push(ExtractedNode {
                outer_html: element.html(),
            });
        }
        Ok(found)
    }
}

/// Extension trait for extractors that produce plain string values rather than HTML nodes.
///
/// Used by regex and JSONPath extractors where the result is not an HTML fragment.
///
/// The `Send + Sync` supertraits mean every implementor must be safe to share
/// and send across threads.
pub trait ValueExtractor: Send + Sync {
    /// Applies `selector` to `input` and returns the extracted values as strings.
    ///
    /// The meaning of `selector` is defined by the implementor: `RegexExtractor`
    /// treats it as a regular expression.
    ///
    /// # Errors
    ///
    /// Returns a `KumoError` when extraction cannot be performed; `RegexExtractor`,
    /// for example, does so when the pattern fails to compile.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError>;
}

/// Regex-based value extractor.
///
/// If the pattern contains capture group 1, returns group-1 matches.
/// Otherwise returns the full match.
///
/// This is a field-less unit struct: the pattern is supplied as the `selector`
/// argument on each call to `ValueExtractor::extract_values` rather than
/// stored on the extractor.
pub struct RegexExtractor;

impl ValueExtractor for RegexExtractor {
    /// Validates `selector` as a regular expression, then returns whatever
    /// `re_matches` produces for `input` and that pattern.
    ///
    /// # Errors
    ///
    /// Returns a `KumoError` built with `KumoError::parse` when `selector`
    /// does not compile as a regex.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
        // The pattern is compiled here purely as a validity check; the
        // compiled regex is dropped and matching is delegated to `re_matches`.
        // NOTE(review): `re_matches` is handed the pattern string, so it
        // presumably compiles the pattern a second time — confirm and consider
        // sharing the compiled regex.
        let validation = regex::Regex::new(selector);
        validation.map_err(|e| KumoError::parse(format!("invalid regex '{selector}'"), e))?;

        let values = re_matches(input, selector);
        Ok(values)
    }
}

/// JSONPath value extractor. Parses input as JSON and evaluates the path expression.
///
/// Results are serialized back to JSON strings.
///
/// Only compiled when the `jsonpath` cargo feature is enabled. This is a
/// field-less unit struct: the path expression is supplied as the `selector`
/// argument on each call to `ValueExtractor::extract_values`.
#[cfg(feature = "jsonpath")]
pub struct JsonPathExtractor;

#[cfg(feature = "jsonpath")]
impl ValueExtractor for JsonPathExtractor {
    /// Parses `input` as JSON, evaluates the JSONPath expression `selector`
    /// against it, and returns each matched value rendered with `to_string`.
    ///
    /// # Errors
    ///
    /// Returns a `KumoError` built with `KumoError::parse` when `input` is not
    /// valid JSON or when the path query reports an error.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
        // Brings the `query` method into scope for `serde_json::Value`.
        use jsonpath_rust::JsonPath;

        let parsed = serde_json::from_str::<serde_json::Value>(input)
            .map_err(|e| KumoError::parse("invalid JSON input", e))?;

        let hits = parsed
            .query(selector)
            .map_err(|e| KumoError::parse(format!("jsonpath '{selector}'"), e))?;

        let mut rendered = Vec::new();
        for hit in hits {
            rendered.push(hit.to_string());
        }
        Ok(rendered)
    }
}