use crate::{
XpathError,
fallback::{CssFallback, CustomFilters},
};
/// Stateless entry point for evaluating XPath expressions against HTML text.
///
/// The type is a field-less unit struct; the functions in its `impl` block in
/// this file are all associated functions (none take `self`), so callers use
/// it as a namespace, e.g. `XpathEngine::eval_text(html, xpath)`.
pub struct XpathEngine;
impl XpathEngine {
    /// Evaluates `xpath` against `html` and returns the matched text values.
    ///
    /// The expression is first run through the sxd-based evaluator. Only when
    /// that evaluator reports `XpathError::HtmlParseError` is the CSS-based
    /// fallback attempted; any other error is handed back unchanged.
    ///
    /// NOTE(review): `eval_text_sxd` as written in this file only constructs
    /// `CompileError` and `EvalError`, so the fallback arm looks unreachable
    /// from here. Confirm whether that is intended.
    ///
    /// # Errors
    ///
    /// Returns whatever error the primary evaluator produced, or the
    /// fallback's error when the fallback path was taken.
    pub fn eval_text(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
        Self::eval_text_sxd(html, xpath).or_else(|failure| match failure {
            XpathError::HtmlParseError => Self::eval_text_fallback(html, xpath),
            other => Err(other),
        })
    }

    /// Primary evaluator: parses `html` with `sxd_html`, compiles `xpath` with
    /// `sxd_xpath`, and evaluates it from the document root.
    ///
    /// Result shaping:
    /// * a node-set yields one string per node, in document order;
    /// * a string result yields a single-element vector holding that string;
    /// * any other result yields a single-element vector holding the value's
    ///   string form.
    ///
    /// # Errors
    ///
    /// * `XpathError::CompileError` when the expression fails to build (the
    ///   message is the debug rendering of the underlying error) or when the
    ///   builder yields no expression (`"empty expression"`).
    /// * `XpathError::EvalError` when evaluation fails (debug rendering of the
    ///   underlying error).
    fn eval_text_sxd(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
        let package = sxd_html::parse_html(html);
        let document = package.as_document();

        let builder = sxd_xpath::Factory::new();
        let compiled = match builder.build(xpath) {
            Ok(Some(expr)) => expr,
            Ok(None) => return Err(XpathError::CompileError("empty expression".into())),
            Err(err) => return Err(XpathError::CompileError(format!("{err:?}"))),
        };

        let context = sxd_xpath::Context::new();
        let value = match compiled.evaluate(&context, document.root()) {
            Ok(v) => v,
            Err(err) => return Err(XpathError::EvalError(format!("{err:?}"))),
        };

        let texts: Vec<String> = match value {
            sxd_xpath::Value::Nodeset(nodes) => nodes
                .document_order()
                .iter()
                .map(|node| node.string_value())
                .collect(),
            sxd_xpath::Value::String(text) => vec![text],
            // Remaining value kinds are reported through their string form.
            _ => vec![value.string()],
        };
        Ok(texts)
    }

    /// Secondary evaluator used when the primary one cannot handle the input.
    ///
    /// Strategy, in order:
    /// 1. Ask `CssFallback::eval_text`; if it produces a result (success or
    ///    error), return it as-is.
    /// 2. Otherwise, if the expression text contains `translate(`, delegate to
    ///    [`Self::eval_text_with_custom_filters`].
    /// 3. Otherwise fail with `XpathError::EvalError`.
    fn eval_text_fallback(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
        match CssFallback::eval_text(html, xpath) {
            Some(outcome) => outcome,
            None if xpath.contains("translate(") => {
                Self::eval_text_with_custom_filters(html, xpath)
            }
            None => Err(XpathError::EvalError(
                "XPath cannot be translated to CSS selector".into(),
            )),
        }
    }

    /// Evaluates an expression by selecting a base tag with `scraper` and then
    /// narrowing the candidates through `CustomFilters::apply_filters`.
    ///
    /// The returned vector holds, for every element that survives filtering,
    /// the concatenation of that element's text pieces. Note that
    /// `apply_filters` receives only the candidate elements, not `xpath`.
    ///
    /// # Errors
    ///
    /// * Propagates the error from [`Self::extract_base_selector`] when no
    ///   base tag can be derived.
    /// * `XpathError::CompileError` when the derived CSS selector fails to
    ///   parse.
    fn eval_text_with_custom_filters(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
        // Derive the tag first so an unsupported expression fails before the
        // document is parsed.
        let css = Self::extract_base_selector(xpath)?;
        let parsed = scraper::Html::parse_document(html);

        let selector = match scraper::Selector::parse(css) {
            Ok(sel) => sel,
            Err(err) => {
                return Err(XpathError::CompileError(format!(
                    "CSS selector error: {err}"
                )))
            }
        };

        let candidates: Vec<scraper::ElementRef> = parsed.select(&selector).collect();
        let kept = CustomFilters::apply_filters(candidates);

        let texts: Vec<String> = kept.iter().map(|el| el.text().collect()).collect();
        Ok(texts)
    }

    /// Picks the CSS tag selector implied by a `self::<tag>` test in `xpath`.
    ///
    /// This is a plain substring check. The candidates are tried in a fixed
    /// priority order (`article`, then `div`, then `section`), regardless of
    /// where they appear in the expression.
    ///
    /// # Errors
    ///
    /// `XpathError::CompileError` when none of the known `self::` tests occur
    /// in `xpath`.
    fn extract_base_selector(xpath: &str) -> Result<&str, XpathError> {
        // (needle searched for in the XPath, CSS selector to use) in priority order.
        const SELF_TESTS: [(&str, &str); 3] = [
            ("self::article", "article"),
            ("self::div", "div"),
            ("self::section", "section"),
        ];

        SELF_TESTS
            .iter()
            .find(|(needle, _)| xpath.contains(*needle))
            .map(|&(_, tag)| tag)
            .ok_or_else(|| {
                XpathError::CompileError("Cannot extract base selector from XPath".into())
            })
    }

    /// Reports whether evaluating `xpath` against `html` yields any result.
    ///
    /// Returns `true` only when [`Self::eval_text`] succeeds with a non-empty
    /// vector; every error is treated as "no match".
    ///
    /// NOTE(review): for non-node-set results the sxd evaluator always returns
    /// a one-element vector, so a scalar result counts as a match whatever its
    /// value. Confirm this is the intended meaning.
    pub fn has_match(html: &str, xpath: &str) -> bool {
        match Self::eval_text(html, xpath) {
            Ok(found) => !found.is_empty(),
            Err(_) => false,
        }
    }
}