use crate::{error::KumoError, extract::selector::re_matches};
/// A single node produced by an [`Extractor`], held as serialized HTML.
///
/// The node carries no parsed tree; the accessor methods re-parse
/// `outer_html` each time they are called.
pub struct ExtractedNode {
/// HTML markup of the node (for [`CssExtractor`] this is the matched
/// element's `html()` output).
pub outer_html: String,
}
impl ExtractedNode {
    /// Returns the text content of the node.
    ///
    /// `outer_html` is re-parsed as an HTML fragment on every call, and all
    /// text pieces yielded under the fragment's root element are concatenated
    /// with no separator.
    pub fn text(&self) -> String {
        let fragment = scraper::Html::parse_fragment(&self.outer_html);
        // Collect straight into a `String`; joining an intermediate
        // `Vec<&str>` with "" produces the same result with an extra allocation.
        fragment.root_element().text().collect::<String>()
    }

    /// Returns the value of attribute `name`, or `None` if it is absent.
    ///
    /// `outer_html` is re-parsed as an HTML fragment on every call. The
    /// attribute is read from the first element matched by `*` whose tag name
    /// is neither `html` nor `body`; if no such element exists the result is
    /// `None`.
    ///
    /// NOTE(review): this relies on the node's own element surviving fragment
    /// parsing. Elements that are only valid in a specific context (e.g. table
    /// cells) may not round-trip, in which case a descendant could be
    /// inspected instead — confirm against real inputs.
    pub fn attr(&self, name: &str) -> Option<String> {
        let fragment = scraper::Html::parse_fragment(&self.outer_html);
        let any_element =
            scraper::Selector::parse("*").expect("`*` is always a valid CSS selector");
        fragment
            .select(&any_element)
            // Skip `html`/`body` elements (presumably the wrappers introduced
            // by fragment parsing) so the first real element is inspected.
            .find(|el| !matches!(el.value().name(), "html" | "body"))
            .and_then(|el| el.value().attr(name))
            .map(String::from)
    }
}
/// Extracts nodes from HTML text using a selector string.
///
/// How `selector` is interpreted is up to the implementor. Implementors must
/// be `Send + Sync`.
pub trait Extractor: Send + Sync {
/// Returns the nodes of `html` selected by `selector`.
///
/// # Errors
///
/// Returns a [`KumoError`] when the implementor cannot carry out the
/// extraction (for example, when it rejects `selector`).
fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError>;
}
/// [`Extractor`] that interprets the selector as a CSS selector.
pub struct CssExtractor;
impl Extractor for CssExtractor {
    /// Returns one [`ExtractedNode`] per element of `html` matching the CSS
    /// `selector`, in the order yielded by the document's `select` iterator.
    ///
    /// Each node stores the matched element's `html()` output. An empty vector
    /// is returned when nothing matches.
    ///
    /// # Errors
    ///
    /// Returns a [`KumoError`] built with `KumoError::parse_msg` when
    /// `selector` is not a valid CSS selector.
    fn extract(&self, html: &str, selector: &str) -> Result<Vec<ExtractedNode>, KumoError> {
        // Validate the selector first so an invalid one is rejected without
        // paying for a full document parse.
        let sel = scraper::Selector::parse(selector).map_err(|e| {
            KumoError::parse_msg(format!("invalid CSS selector '{selector}': {e:?}"))
        })?;

        let document = scraper::Html::parse_document(html);
        let nodes = document
            .select(&sel)
            .map(|el| ExtractedNode {
                outer_html: el.html(),
            })
            .collect();
        Ok(nodes)
    }
}
/// Extracts plain string values from textual input using a selector string.
///
/// How `input` and `selector` are interpreted is up to the implementor.
/// Implementors must be `Send + Sync`.
pub trait ValueExtractor: Send + Sync {
/// Returns the values of `input` selected by `selector`.
///
/// # Errors
///
/// Returns a [`KumoError`] when the implementor cannot carry out the
/// extraction (for example, when it rejects `selector` or `input`).
fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError>;
}
/// [`ValueExtractor`] that interprets the selector as a regular expression.
pub struct RegexExtractor;
impl ValueExtractor for RegexExtractor {
    /// Validates `selector` as a regular expression, then returns whatever
    /// `re_matches(input, selector)` produces.
    ///
    /// The pattern is compiled here only to detect syntax errors; the compiled
    /// value is discarded and the pattern string is handed to `re_matches`,
    /// which defines what counts as a match.
    ///
    /// # Errors
    ///
    /// Returns a [`KumoError`] wrapping the compile error when `selector` is
    /// not a valid regular expression.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
        // Keep only the success/failure of compilation; the regex itself is
        // not used past this point.
        let validation = regex::Regex::new(selector).map(drop);
        validation
            .map_err(|source| KumoError::parse(format!("invalid regex '{selector}'"), source))?;

        Ok(re_matches(input, selector))
    }
}
/// [`ValueExtractor`] that parses the input as JSON and interprets the
/// selector as a JSONPath query.
///
/// Only compiled when the `jsonpath` feature is enabled.
#[cfg(feature = "jsonpath")]
pub struct JsonPathExtractor;
#[cfg(feature = "jsonpath")]
impl ValueExtractor for JsonPathExtractor {
    /// Parses `input` as JSON, runs the JSONPath `selector` against it and
    /// returns each result rendered with `to_string()`.
    ///
    /// NOTE(review): results are stringified JSON values, so string results
    /// presumably keep their surrounding quotes — confirm this is what callers
    /// expect.
    ///
    /// # Errors
    ///
    /// Returns a [`KumoError`] when `input` is not valid JSON or when the
    /// query for `selector` fails.
    fn extract_values(&self, input: &str, selector: &str) -> Result<Vec<String>, KumoError> {
        use jsonpath_rust::JsonPath;

        let document = serde_json::from_str::<serde_json::Value>(input)
            .map_err(|e| KumoError::parse("invalid JSON input", e))?;

        let hits = document
            .query(selector)
            .map_err(|e| KumoError::parse(format!("jsonpath '{selector}'"), e))?;

        let rendered: Vec<String> = hits.into_iter().map(|hit| hit.to_string()).collect();
        Ok(rendered)
    }
}