Skip to main content

kawat_xpath/
eval.rs

1//! XPath evaluation engine.
2
3use crate::{
4    XpathError,
5    fallback::{CssFallback, CustomFilters},
6};
7
/// Evaluates XPath expressions against HTML documents.
///
/// This is a field-less unit type: it carries no state and only serves as a
/// namespace for the evaluation functions. The derives make it printable in
/// diagnostics and trivially constructible/copyable.
#[derive(Debug, Clone, Copy, Default)]
pub struct XpathEngine;
10
11impl XpathEngine {
12    /// Evaluate an XPath expression on HTML content, returning matching text fragments.
13    pub fn eval_text(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
14        // Try sxd_html first
15        match Self::eval_text_sxd(html, xpath) {
16            Ok(results) => Ok(results),
17            Err(XpathError::HtmlParseError) => {
18                // Fallback to CSS selectors if HTML parsing fails
19                Self::eval_text_fallback(html, xpath)
20            }
21            Err(e) => Err(e),
22        }
23    }
24
25    /// Evaluate using sxd_html (primary method).
26    fn eval_text_sxd(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
27        let package = sxd_html::parse_html(html);
28        let document = package.as_document();
29
30        let factory = sxd_xpath::Factory::new();
31        let expression = factory
32            .build(xpath)
33            .map_err(|e| XpathError::CompileError(format!("{e:?}")))?
34            .ok_or_else(|| XpathError::CompileError("empty expression".into()))?;
35
36        let context = sxd_xpath::Context::new();
37        let value = expression
38            .evaluate(&context, document.root())
39            .map_err(|e| XpathError::EvalError(format!("{e:?}")))?;
40
41        match value {
42            sxd_xpath::Value::Nodeset(nodes) => Ok(nodes
43                .document_order()
44                .iter()
45                .map(|n| n.string_value())
46                .collect()),
47            sxd_xpath::Value::String(s) => Ok(vec![s]),
48            _ => Ok(vec![value.string()]),
49        }
50    }
51
52    /// Fallback evaluation using CSS selectors.
53    fn eval_text_fallback(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
54        // Try CSS selector fallback first
55        if let Some(result) = CssFallback::eval_text(html, xpath) {
56            return result;
57        }
58
59        // If CSS fallback isn't available, try custom filters for translate() expressions
60        if xpath.contains("translate(") {
61            return Self::eval_text_with_custom_filters(html, xpath);
62        }
63
64        Err(XpathError::EvalError(
65            "XPath cannot be translated to CSS selector".into(),
66        ))
67    }
68
69    /// Evaluate using custom filters for complex XPath expressions.
70    fn eval_text_with_custom_filters(html: &str, xpath: &str) -> Result<Vec<String>, XpathError> {
71        // For translate() expressions, extract the base selector and apply custom filters
72        let base_selector = Self::extract_base_selector(xpath)?;
73
74        let html_doc = scraper::Html::parse_document(html);
75        let selector = scraper::Selector::parse(base_selector)
76            .map_err(|e| XpathError::CompileError(format!("CSS selector error: {e}")))?;
77
78        let elements: Vec<scraper::ElementRef> = html_doc.select(&selector).collect();
79        let filtered = CustomFilters::apply_filters(elements);
80
81        let results: Vec<String> = filtered.iter().map(|el| el.text().collect()).collect();
82
83        Ok(results)
84    }
85
86    /// Extract base CSS selector from complex XPath expression.
87    fn extract_base_selector(xpath: &str) -> Result<&str, XpathError> {
88        // Simple extraction for common patterns
89        if xpath.contains("self::article") {
90            return Ok("article");
91        }
92        if xpath.contains("self::div") {
93            return Ok("div");
94        }
95        if xpath.contains("self::section") {
96            return Ok("section");
97        }
98
99        Err(XpathError::CompileError(
100            "Cannot extract base selector from XPath".into(),
101        ))
102    }
103
104    /// Check if an XPath expression matches anything in the HTML.
105    pub fn has_match(html: &str, xpath: &str) -> bool {
106        Self::eval_text(html, xpath)
107            .map(|results| !results.is_empty())
108            .unwrap_or(false)
109    }
110}