kumo/extract/
extractor.rs1use crate::{error::KumoError, extract::selector::re_matches};
2
/// A single node produced by an [`Extractor`], stored as serialized HTML.
///
/// The node keeps only the markup string; [`ExtractedNode::text`] and
/// [`ExtractedNode::attr`] re-parse it on demand.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedNode {
    /// Serialized HTML of the matched element.
    pub outer_html: String,
}
10
11impl ExtractedNode {
12 pub fn text(&self) -> String {
14 let fragment = scraper::Html::parse_fragment(&self.outer_html);
15 fragment.root_element().text().collect::<Vec<_>>().join("")
16 }
17
18 pub fn attr(&self, name: &str) -> Option<String> {
20 let fragment = scraper::Html::parse_fragment(&self.outer_html);
21 let sel = scraper::Selector::parse("*").unwrap();
22 fragment
23 .select(&sel)
24 .find(|el| !matches!(el.value().name(), "html" | "body"))
25 .and_then(|el| el.value().attr(name))
26 .map(String::from)
27 }
28}
29
30pub trait Extractor: Send + Sync {
36 fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError>;
37}
38
/// [`Extractor`] that interprets the selector as a CSS selector.
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// created with `CssExtractor` or `CssExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct CssExtractor;
41
42impl Extractor for CssExtractor {
43 fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError> {
44 let document = scraper::Html::parse_document(html);
45 let sel = scraper::Selector::parse(selector).map_err(|e| {
46 KumoError::parse_msg(format!("invalid CSS selector '{selector}': {e:?}"))
47 })?;
48 let nodes = document
49 .select(&sel)
50 .map(|el| ExtractedNode {
51 outer_html: el.html(),
52 })
53 .collect();
54 Ok(nodes)
55 }
56}
57
58pub trait ValueExtractor: Send + Sync {
62 fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError>;
63}
64
/// [`ValueExtractor`] that interprets the selector as a regular expression.
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// created with `RegexExtractor` or `RegexExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct RegexExtractor;
70
71impl ValueExtractor for RegexExtractor {
72 fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
73 regex::Regex::new(selector)
74 .map_err(|e| KumoError::parse(format!("invalid regex '{selector}'"), e))?;
75 Ok(re_matches(input, selector))
76 }
77}
78
/// [`ValueExtractor`] that interprets the selector as a JSONPath expression.
///
/// Only compiled when the `jsonpath` feature is enabled. The type is a
/// stateless unit struct, so it is cheap to copy and can be created with
/// `JsonPathExtractor` or `JsonPathExtractor::default()`.
#[cfg(feature = "jsonpath")]
#[derive(Debug, Clone, Copy, Default)]
pub struct JsonPathExtractor;
84
#[cfg(feature = "jsonpath")]
impl ValueExtractor for JsonPathExtractor {
    /// Parses `input` as JSON, evaluates the JSONPath `selector` against it,
    /// and returns each match rendered with `to_string()`.
    ///
    /// # Errors
    ///
    /// Returns a [`KumoError`] built with `KumoError::parse` when `input` is
    /// not valid JSON or when the JSONPath query fails.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
        use jsonpath_rust::JsonPath;

        let document: serde_json::Value = match serde_json::from_str(input) {
            Ok(parsed) => parsed,
            Err(err) => return Err(KumoError::parse("invalid JSON input", err)),
        };
        let matched = document
            .query(selector)
            .map_err(|err| KumoError::parse(format!("jsonpath '{selector}'"), err))?;
        // NOTE(review): `to_string` on a JSON value presumably yields its JSON
        // serialization, so string matches would keep their surrounding
        // quotes — confirm this is what callers expect.
        Ok(matched.into_iter().map(|hit| hit.to_string()).collect())
    }
}