Skip to main content

kumo/extract/
extractor.rs

1use crate::{error::KumoError, extract::selector::re_matches};
2
/// A single node extracted from an HTML document.
///
/// Kept separate from `Element` to allow future extractors (XPath, LLM)
/// to return richer data without touching the CSS-specific types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedNode {
    /// Serialized HTML of the node. `CssExtractor` fills this from the
    /// matched element's `html()` output.
    pub outer_html: String,
}
10
11impl ExtractedNode {
12    /// Get the concatenated text content of this node and all its descendants.
13    pub fn text(&self) -> String {
14        let fragment = scraper::Html::parse_fragment(&self.outer_html);
15        fragment.root_element().text().collect::<Vec<_>>().join("")
16    }
17
18    /// Get the value of an attribute by name.
19    pub fn attr(&self, name: &str) -> Option<String> {
20        let fragment = scraper::Html::parse_fragment(&self.outer_html);
21        let sel = scraper::Selector::parse("*").unwrap();
22        fragment
23            .select(&sel)
24            .find(|el| !matches!(el.value().name(), "html" | "body"))
25            .and_then(|el| el.value().attr(name))
26            .map(String::from)
27    }
28}
29
/// Extension point for pluggable extraction strategies.
///
/// The default implementation (`CssExtractor`) uses CSS selectors via `scraper`.
/// Future crates can implement XPath or LLM-based extraction without touching
/// core kumo code. Implementors must be `Send + Sync`.
pub trait Extractor: Send + Sync {
    /// Extract the nodes of `html` that `selector` selects.
    ///
    /// The selector syntax is up to the implementation; `CssExtractor`
    /// treats it as a CSS selector.
    ///
    /// # Errors
    ///
    /// Implementation-defined. `CssExtractor` returns a `KumoError` when the
    /// selector cannot be parsed.
    fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError>;
}
38
/// Default CSS-selector extractor backed by the `scraper` crate.
///
/// A stateless unit type: it is free to copy and can be built with
/// `CssExtractor` or `CssExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct CssExtractor;
41
42impl Extractor for CssExtractor {
43    fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError> {
44        let document = scraper::Html::parse_document(html);
45        let sel = scraper::Selector::parse(selector).map_err(|e| {
46            KumoError::parse_msg(format!("invalid CSS selector '{selector}': {e:?}"))
47        })?;
48        let nodes = document
49            .select(&sel)
50            .map(|el| ExtractedNode {
51                outer_html: el.html(),
52            })
53            .collect();
54        Ok(nodes)
55    }
56}
57
/// Extension trait for extractors that produce plain string values rather than HTML nodes.
///
/// Used by regex and JSONPath extractors where the result is not an HTML fragment.
/// Implementors must be `Send + Sync`.
pub trait ValueExtractor: Send + Sync {
    /// Extract string values from `input` using `selector`.
    ///
    /// How `selector` is interpreted is up to the implementation;
    /// `RegexExtractor` treats it as a regular expression.
    ///
    /// # Errors
    ///
    /// Implementation-defined. `RegexExtractor` returns a `KumoError` when the
    /// pattern does not compile.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError>;
}
64
/// Regex-based value extractor.
///
/// If the pattern contains capture group 1, returns group-1 matches.
/// Otherwise returns the full match.
///
/// A stateless unit type: it is free to copy and can be built with
/// `RegexExtractor` or `RegexExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct RegexExtractor;
70
71impl ValueExtractor for RegexExtractor {
72    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
73        regex::Regex::new(selector)
74            .map_err(|e| KumoError::parse(format!("invalid regex '{selector}'"), e))?;
75        Ok(re_matches(input, selector))
76    }
77}
78
/// JSONPath value extractor. Parses input as JSON and evaluates the path expression.
///
/// Results are serialized back to JSON strings.
///
/// Only available when the `jsonpath` feature is enabled. A stateless unit
/// type: it is free to copy and can be built with `JsonPathExtractor` or
/// `JsonPathExtractor::default()`.
#[cfg(feature = "jsonpath")]
#[derive(Debug, Clone, Copy, Default)]
pub struct JsonPathExtractor;
84
#[cfg(feature = "jsonpath")]
impl ValueExtractor for JsonPathExtractor {
    /// Parse `input` as JSON, evaluate the JSONPath expression `selector`
    /// against it, and return each result rendered with `to_string()`.
    ///
    /// # Errors
    ///
    /// Returns a `KumoError` when `input` is not valid JSON or when the
    /// query for `selector` fails.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
        use jsonpath_rust::JsonPath;

        let document: serde_json::Value = match serde_json::from_str(input) {
            Ok(parsed) => parsed,
            Err(err) => return Err(KumoError::parse("invalid JSON input", err)),
        };

        let hits = match document.query(selector) {
            Ok(found) => found,
            Err(err) => return Err(KumoError::parse(format!("jsonpath '{selector}'"), err)),
        };

        // Turn every matched value back into its string form.
        let rendered = hits.into_iter().map(|hit| hit.to_string()).collect();
        Ok(rendered)
    }
}