use super::error::EvalError;
use super::source::{Extract, ExtractOp, Via};
use dom_query::{Document, Matcher};
use fancy_regex::Regex;
use jsonpath_rust::JsonPath;
use serde_json::Value;
use std::sync::LazyLock;
/// Extracts a single string from `content` using the strategy selected by `via`.
///
/// * `select` – CSS selector, JSONPath or regex pattern, depending on `via`.
/// * `index` – which match to return when several are found; `None` picks the
///   first one and negative values count back from the last match.
/// * `ex` – what to read from the chosen match; only forwarded to the CSS and
///   JSON strategies.
///
/// `Via::Raw` hands back `content` unchanged and ignores the other arguments.
///
/// # Errors
///
/// Propagates whatever the chosen strategy reports, and always returns
/// `EvalError::Unsupported("xpath")` for `Via::Xpath`.
pub fn extract(
    via: Via,
    content: &str,
    select: Option<&str>,
    index: Option<i64>,
    ex: &Extract,
) -> Result<String, EvalError> {
    match via {
        // Strategies that need no helper come first.
        Via::Raw => Ok(content.to_string()),
        Via::Xpath => Err(EvalError::Unsupported("xpath")),
        // The regex strategy has no use for `ex`.
        Via::Regex => regex_extract(content, select, index),
        Via::Json => json_extract(content, select, index, ex),
        Via::Css => html_extract(content, select, index, ex),
    }
}
pub fn select_all(via: Via, content: &str, select: &str) -> Result<Vec<String>, EvalError> {
match via {
Via::Css => {
let doc = Document::from(content.to_string());
let matcher =
Matcher::new(select).map_err(|_| EvalError::Selector(select.to_string()))?;
let sel = doc.select_matcher(&matcher);
Ok(sel.nodes().iter().map(|n| n.html().to_string()).collect())
}
Via::Json => {
let value: Value =
serde_json::from_str(content).map_err(|e| EvalError::Json(e.to_string()))?;
let matched = value
.query(select)
.map_err(|e| EvalError::JsonPath(e.to_string()))?;
Ok(matched.into_iter().map(value_to_string).collect())
}
Via::Regex => {
let re = Regex::new(select).map_err(|e| EvalError::Regex(e.to_string()))?;
Ok(re
.find_iter(content)
.filter_map(|m| m.ok())
.map(|m| m.as_str().to_string())
.collect())
}
Via::Raw => Ok(vec![content.to_string()]),
Via::Xpath => Err(EvalError::Unsupported("xpath")),
}
}
/// Extracts one string from an HTML document.
///
/// Parses `content`, selects nodes with the CSS selector `select` (or `:root`
/// when no selector is given), picks the node designated by `index`, and reads
/// from it whatever `ex` asks for. Text, own-text and attribute results are
/// trimmed; a missing attribute yields an empty string.
///
/// Returns an empty string when nothing matches.
///
/// # Errors
///
/// Returns `EvalError::Selector` when `select` is not a valid selector.
fn html_extract(
    content: &str,
    select: Option<&str>,
    index: Option<i64>,
    ex: &Extract,
) -> Result<String, EvalError> {
    let doc = Document::from(content.to_string());
    let selection = if let Some(css) = select {
        let matcher = Matcher::new(css).map_err(|_| EvalError::Selector(css.to_string()))?;
        doc.select_matcher(&matcher)
    } else {
        doc.select(":root")
    };

    let nodes = selection.nodes();
    let node = match nodes.len() {
        0 => return Ok(String::new()),
        count => &nodes[resolve_index(index, count)],
    };

    let extracted = match ex {
        Extract::Attr { attr } => match node.attr(attr) {
            Some(value) => value.trim().to_string(),
            None => String::new(),
        },
        Extract::Op(op) => match op {
            ExtractOp::Text => node.text().trim().to_string(),
            ExtractOp::OwnText => node.immediate_text().trim().to_string(),
            // `Html` is the tag-stripped variant; the other two are verbatim markup.
            ExtractOp::Html => clean_html(&node.inner_html()),
            ExtractOp::InnerHtml => node.inner_html().to_string(),
            ExtractOp::OuterHtml => node.html().to_string(),
        },
    };
    Ok(extracted)
}
/// Reduces an HTML fragment to text.
///
/// Works in three regex passes followed by entity decoding:
/// 1. tags matched by the block-level pattern become a newline,
/// 2. HTML comments are removed,
/// 3. every remaining tag is removed.
///
/// The regexes are compiled once on first use.
fn clean_html(html: &str) -> String {
    static BLOCK_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"</?(?:div|p|br|hr|h[1-6]|article|section|dd|dl|li)[^>]*>").unwrap()
    });
    static COMMENT_RE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"<!--[\s\S]*?-->").unwrap());
    static ANY_TAG_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());

    let with_breaks = BLOCK_TAG_RE.replace_all(html, "\n");
    let without_comments = COMMENT_RE.replace_all(&with_breaks, "");
    let text_only = ANY_TAG_RE.replace_all(&without_comments, "");
    decode_entities(&text_only)
}
/// Decodes the small set of HTML entities that typically remain after tags
/// have been stripped.
///
/// Recognised entities: `&lt;`, `&gt;`, `&nbsp;` (becomes an ordinary space),
/// `&#39;` / `&#x27;` / `&apos;`, `&quot;` and `&amp;`. Anything else is left
/// untouched.
///
/// `&amp;` is deliberately decoded last. Decoding it first would turn an
/// escaped entity such as `&amp;lt;` into `&lt;` and then, in a later pass,
/// into `<`, i.e. the text would be unescaped twice.
fn decode_entities(s: &str) -> String {
    s.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&nbsp;", " ")
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
        .replace("&apos;", "'")
        .replace("&quot;", "\"")
        .replace("&amp;", "&")
}
/// Extracts one string from a JSON document.
///
/// Parses `content`, evaluates the JSONPath `select` (defaulting to `$`, the
/// document root), and renders the match designated by `index` as a string.
/// Returns an empty string when the path matches nothing.
///
/// # Errors
///
/// Returns `EvalError::Json` when `content` is not valid JSON and
/// `EvalError::JsonPath` when the path cannot be evaluated.
fn json_extract(
    content: &str,
    select: Option<&str>,
    index: Option<i64>,
    ex: &Extract,
) -> Result<String, EvalError> {
    // `ex` is accepted for signature parity with the HTML strategy but is not used here.
    let _ = ex;

    let root: Value =
        serde_json::from_str(content).map_err(|err| EvalError::Json(err.to_string()))?;
    let hits = root
        .query(select.unwrap_or("$"))
        .map_err(|err| EvalError::JsonPath(err.to_string()))?;

    match hits.len() {
        0 => Ok(String::new()),
        count => Ok(value_to_string(hits[resolve_index(index, count)])),
    }
}
/// Renders a JSON value as a plain string.
///
/// Strings are returned without surrounding quotes, `null` becomes the empty
/// string, and every other value uses its `Display` (JSON) representation.
fn value_to_string(v: &Value) -> String {
    match v {
        Value::Null => String::new(),
        Value::String(text) => text.to_owned(),
        _ => v.to_string(),
    }
}
/// Extracts one string from `content` with a regular expression.
///
/// Every successful match contributes one candidate: the text of capture
/// group 1 when that group took part in the match, otherwise the whole match.
/// `index` then chooses among the candidates. A missing `select` is treated as
/// the empty pattern, and an empty string is returned when nothing matches.
///
/// # Errors
///
/// Returns `EvalError::Regex` when the pattern does not compile.
fn regex_extract(
    content: &str,
    select: Option<&str>,
    index: Option<i64>,
) -> Result<String, EvalError> {
    let pattern = select.unwrap_or("");
    let regex = Regex::new(pattern).map_err(|err| EvalError::Regex(err.to_string()))?;

    let mut candidates: Vec<String> = Vec::new();
    // `flatten` skips matches that failed at evaluation time.
    for groups in regex.captures_iter(content).flatten() {
        let picked = match groups.get(1) {
            Some(group) => Some(group),
            None => groups.get(0),
        };
        candidates.push(picked.map_or_else(String::new, |m| m.as_str().to_string()));
    }

    match candidates.len() {
        0 => Ok(String::new()),
        count => Ok(candidates.swap_remove(resolve_index(index, count))),
    }
}
/// Maps an optional, possibly negative, index onto a position in a sequence
/// of `len` elements.
///
/// * `None` selects position 0.
/// * A non-negative index is clamped to the last element.
/// * A negative index counts back from the end (`-1` is the last element) and
///   is clamped to position 0 when it reaches past the start.
///
/// The function never panics. For `len == 0` it returns 0, which is not a
/// valid position, so callers must check for an empty sequence before indexing.
fn resolve_index(index: Option<i64>, len: usize) -> usize {
    // `saturating_sub` keeps an empty sequence from underflowing.
    let last = len.saturating_sub(1);
    // Comparing in u64 avoids truncating large indices on 32-bit targets;
    // widening `usize` to `u64` is lossless on every supported platform.
    let len_wide = len as u64;

    match index {
        None => 0,
        Some(i) if i >= 0 => {
            let pos = i.unsigned_abs();
            if pos < len_wide {
                // In range, so it fits in `usize`.
                pos as usize
            } else {
                last
            }
        }
        Some(i) => {
            // `unsigned_abs` is well defined even for `i64::MIN`, unlike `-i`.
            let from_end = i.unsigned_abs();
            if from_end < len_wide {
                len - from_end as usize
            } else {
                0
            }
        }
    }
}