Skip to main content

fhp_selector/xpath/
parser.rs

1//! XPath expression parser.
2//!
3//! Parses a subset of XPath 1.0 into [`XPathExpr`](crate::xpath::ast::XPathExpr) AST nodes.
4//!
//! # Supported syntax
//!
//! - `//tag` — descendant search
//! - `//tag[@attr='value']` — attribute equality predicate
//! - `//tag[@attr]` — attribute existence predicate
//! - `//tag[contains(@attr, 'substr')]` — contains predicate
//! - `//tag[position()=N]` — position predicate
//! - `//tag[N]` — shorthand for `position()=N`
//! - `//tag/text()` — text extraction
//! - `/path/to/tag` — absolute path (each step may carry one predicate)
//! - `//*` — descendant wildcard, optionally with `[@attr]` or `[@attr='value']`
//! - `..` — parent axis
15
16use fhp_core::error::XPathError;
17use fhp_core::tag::Tag;
18
19use super::ast::{PathStep, Predicate, XPathExpr};
20
21/// Parse an XPath expression string into an AST.
22///
23/// # Errors
24///
25/// Returns [`XPathError::Invalid`] if the expression syntax is invalid.
26pub fn parse_xpath(input: &str) -> Result<XPathExpr, XPathError> {
27    let input = input.trim();
28    if input.is_empty() {
29        return Err(XPathError::Invalid {
30            reason: "empty xpath expression".to_string(),
31        });
32    }
33
34    let mut parser = XPathParser::new(input);
35    let expr = parser.parse()?;
36
37    // Check for trailing /text()
38    if parser.remaining().starts_with("/text()") {
39        parser.advance(7);
40        parser.skip_whitespace();
41        if !parser.is_eof() {
42            return Err(XPathError::Invalid {
43                reason: format!("unexpected trailing: {}", parser.remaining()),
44            });
45        }
46        return Ok(XPathExpr::TextExtract(Box::new(expr)));
47    }
48
49    if !parser.is_eof() {
50        return Err(XPathError::Invalid {
51            reason: format!("unexpected trailing: {}", parser.remaining()),
52        });
53    }
54
55    Ok(expr)
56}
57
58/// Hand-rolled XPath parser.
59struct XPathParser<'a> {
60    input: &'a str,
61    pos: usize,
62}
63
64impl<'a> XPathParser<'a> {
65    fn new(input: &'a str) -> Self {
66        Self { input, pos: 0 }
67    }
68
69    fn remaining(&self) -> &'a str {
70        &self.input[self.pos..]
71    }
72
73    fn is_eof(&self) -> bool {
74        self.pos >= self.input.len()
75    }
76
77    fn peek(&self) -> Option<u8> {
78        self.input.as_bytes().get(self.pos).copied()
79    }
80
81    fn advance(&mut self, n: usize) {
82        self.pos = (self.pos + n).min(self.input.len());
83    }
84
85    fn skip_whitespace(&mut self) {
86        while self.pos < self.input.len() && self.input.as_bytes()[self.pos].is_ascii_whitespace() {
87            self.pos += 1;
88        }
89    }
90
91    fn parse(&mut self) -> Result<XPathExpr, XPathError> {
92        // Handle `..` (parent)
93        if self.remaining().starts_with("..") {
94            self.advance(2);
95            return Ok(XPathExpr::Parent);
96        }
97
98        // Handle `//` (descendant axis)
99        if self.remaining().starts_with("//") {
100            self.advance(2);
101            return self.parse_descendant();
102        }
103
104        // Handle `/` (absolute path)
105        if self.remaining().starts_with('/') {
106            self.advance(1);
107            return self.parse_absolute_path();
108        }
109
110        Err(XPathError::Invalid {
111            reason: format!("expected '/' or '//' at: {}", self.remaining()),
112        })
113    }
114
115    /// Parse `//tag[...]` or `//*[...]` expressions.
116    fn parse_descendant(&mut self) -> Result<XPathExpr, XPathError> {
117        self.skip_whitespace();
118
119        // Check for wildcard `*`
120        if self.peek() == Some(b'*') {
121            self.advance(1);
122            return self.parse_descendant_wildcard();
123        }
124
125        let tag = self.read_tag_name()?;
126
127        // Check for predicate `[...]`
128        if self.peek() == Some(b'[') {
129            let pred = self.parse_predicate()?;
130            return self.build_descendant_with_predicate(tag, pred);
131        }
132
133        // Check for `/text()` suffix (handled by caller)
134        Ok(XPathExpr::DescendantByTag(tag))
135    }
136
137    /// Parse wildcard descendant `//*` with optional predicate.
138    fn parse_descendant_wildcard(&mut self) -> Result<XPathExpr, XPathError> {
139        if self.peek() == Some(b'[') {
140            let pred = self.parse_predicate()?;
141            match pred {
142                Predicate::AttrEquals { attr, value } => {
143                    Ok(XPathExpr::DescendantWildcardByAttr { attr, value })
144                }
145                Predicate::AttrExists { attr } => {
146                    Ok(XPathExpr::DescendantWildcardByAttrExists { attr })
147                }
148                _ => Err(XPathError::Invalid {
149                    reason: "unsupported predicate on wildcard".to_string(),
150                }),
151            }
152        } else {
153            Ok(XPathExpr::DescendantWildcard)
154        }
155    }
156
157    /// Build a descendant expression with a predicate.
158    fn build_descendant_with_predicate(
159        &self,
160        tag: Tag,
161        pred: Predicate,
162    ) -> Result<XPathExpr, XPathError> {
163        match pred {
164            Predicate::AttrEquals { attr, value } => {
165                Ok(XPathExpr::DescendantByAttr { tag, attr, value })
166            }
167            Predicate::Contains { attr, substr } => {
168                Ok(XPathExpr::ContainsPredicate { tag, attr, substr })
169            }
170            Predicate::Position(pos) => Ok(XPathExpr::PositionPredicate { tag, pos }),
171            Predicate::AttrExists { attr } => Ok(XPathExpr::DescendantByAttrExists { tag, attr }),
172        }
173    }
174
175    /// Parse an absolute path `/step/step/...`
176    fn parse_absolute_path(&mut self) -> Result<XPathExpr, XPathError> {
177        let mut steps = Vec::new();
178        loop {
179            self.skip_whitespace();
180            if self.is_eof() || self.remaining().starts_with("/text()") {
181                break;
182            }
183
184            let tag = self.read_tag_name()?;
185            let predicate = if self.peek() == Some(b'[') {
186                Some(self.parse_predicate()?)
187            } else {
188                None
189            };
190
191            steps.push(PathStep { tag, predicate });
192
193            // Expect `/` separator or end
194            if self.peek() == Some(b'/') {
195                // Check for /text() which is handled by caller
196                if self.remaining().starts_with("/text()") {
197                    break;
198                }
199                self.advance(1);
200            } else {
201                break;
202            }
203        }
204
205        if steps.is_empty() {
206            return Err(XPathError::Invalid {
207                reason: "empty absolute path".to_string(),
208            });
209        }
210
211        Ok(XPathExpr::AbsolutePath(steps))
212    }
213
214    /// Parse a predicate `[...]`.
215    fn parse_predicate(&mut self) -> Result<Predicate, XPathError> {
216        self.expect(b'[')?;
217        self.skip_whitespace();
218
219        let pred = if self.remaining().starts_with("contains(") {
220            self.parse_contains_predicate()?
221        } else if self.remaining().starts_with("position()") {
222            self.parse_position_predicate()?
223        } else if self.peek() == Some(b'@') {
224            self.parse_attr_predicate()?
225        } else if self.peek().is_some_and(|b| b.is_ascii_digit()) {
226            // Shorthand [N] = [position()=N]
227            let n = self.read_number()?;
228            Predicate::Position(n)
229        } else {
230            return Err(XPathError::Invalid {
231                reason: format!("unsupported predicate at: {}", self.remaining()),
232            });
233        };
234
235        self.skip_whitespace();
236        self.expect(b']')?;
237        Ok(pred)
238    }
239
240    /// Parse `@attr='value'` or `@attr` inside a predicate.
241    fn parse_attr_predicate(&mut self) -> Result<Predicate, XPathError> {
242        self.expect(b'@')?;
243        let attr = self.read_ident()?;
244        self.skip_whitespace();
245
246        if self.peek() == Some(b'=') {
247            self.advance(1);
248            self.skip_whitespace();
249            let value = self.read_string_literal()?;
250            Ok(Predicate::AttrEquals { attr, value })
251        } else {
252            Ok(Predicate::AttrExists { attr })
253        }
254    }
255
256    /// Parse `contains(@attr, 'substr')`.
257    fn parse_contains_predicate(&mut self) -> Result<Predicate, XPathError> {
258        self.advance_str("contains(")?;
259        self.skip_whitespace();
260        self.expect(b'@')?;
261        let attr = self.read_ident()?;
262        self.skip_whitespace();
263        self.expect(b',')?;
264        self.skip_whitespace();
265        let substr = self.read_string_literal()?;
266        self.skip_whitespace();
267        self.expect(b')')?;
268        Ok(Predicate::Contains { attr, substr })
269    }
270
271    /// Parse `position()=N`.
272    fn parse_position_predicate(&mut self) -> Result<Predicate, XPathError> {
273        self.advance_str("position()")?;
274        self.skip_whitespace();
275        self.expect(b'=')?;
276        self.skip_whitespace();
277        let n = self.read_number()?;
278        Ok(Predicate::Position(n))
279    }
280
281    /// Read a tag name and resolve to a [`Tag`].
282    fn read_tag_name(&mut self) -> Result<Tag, XPathError> {
283        let name = self.read_ident()?;
284        let tag = Tag::from_bytes(name.as_bytes());
285        if tag == Tag::Unknown {
286            return Err(XPathError::Invalid {
287                reason: format!("unknown tag: {name}"),
288            });
289        }
290        Ok(tag)
291    }
292
293    /// Read an identifier (letters, digits, hyphens).
294    fn read_ident(&mut self) -> Result<String, XPathError> {
295        let start = self.pos;
296        while self.pos < self.input.len() {
297            let b = self.input.as_bytes()[self.pos];
298            if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
299                self.pos += 1;
300            } else {
301                break;
302            }
303        }
304        if self.pos == start {
305            return Err(XPathError::Invalid {
306                reason: format!("expected identifier at position {}", self.pos),
307            });
308        }
309        Ok(self.input[start..self.pos].to_string())
310    }
311
312    /// Read a quoted string literal (`'...'` or `"..."`).
313    fn read_string_literal(&mut self) -> Result<String, XPathError> {
314        let quote = self.peek().ok_or_else(|| XPathError::Invalid {
315            reason: "expected string literal, got EOF".to_string(),
316        })?;
317
318        if quote != b'\'' && quote != b'"' {
319            return Err(XPathError::Invalid {
320                reason: format!("expected quote, got '{}'", quote as char),
321            });
322        }
323
324        self.advance(1);
325        let start = self.pos;
326        while self.pos < self.input.len() && self.input.as_bytes()[self.pos] != quote {
327            self.pos += 1;
328        }
329        if self.pos >= self.input.len() {
330            return Err(XPathError::Invalid {
331                reason: "unclosed string literal".to_string(),
332            });
333        }
334        let value = self.input[start..self.pos].to_string();
335        self.advance(1); // skip closing quote
336        Ok(value)
337    }
338
339    /// Read a positive integer.
340    fn read_number(&mut self) -> Result<usize, XPathError> {
341        let start = self.pos;
342        while self.pos < self.input.len() && self.input.as_bytes()[self.pos].is_ascii_digit() {
343            self.pos += 1;
344        }
345        if self.pos == start {
346            return Err(XPathError::Invalid {
347                reason: format!("expected number at position {}", self.pos),
348            });
349        }
350        self.input[start..self.pos]
351            .parse::<usize>()
352            .map_err(|_| XPathError::Invalid {
353                reason: "invalid number".to_string(),
354            })
355    }
356
357    /// Expect and consume a specific byte.
358    fn expect(&mut self, expected: u8) -> Result<(), XPathError> {
359        if self.peek() == Some(expected) {
360            self.advance(1);
361            Ok(())
362        } else {
363            Err(XPathError::Invalid {
364                reason: format!(
365                    "expected '{}', got '{}'",
366                    expected as char,
367                    self.peek()
368                        .map_or("EOF".to_string(), |b| (b as char).to_string())
369                ),
370            })
371        }
372    }
373
374    /// Expect and consume a specific string prefix.
375    fn advance_str(&mut self, s: &str) -> Result<(), XPathError> {
376        if self.remaining().starts_with(s) {
377            self.advance(s.len());
378            Ok(())
379        } else {
380            Err(XPathError::Invalid {
381                reason: format!("expected '{}' at: {}", s, self.remaining()),
382            })
383        }
384    }
385}
386
#[cfg(test)]
mod tests {
    use super::*;
    use crate::xpath::ast::{Predicate, XPathExpr};

    /// Parse `xpath`, panicking if the parser rejects it.
    fn parsed(xpath: &str) -> XPathExpr {
        parse_xpath(xpath).unwrap()
    }

    /// Path step for `tag` without a predicate.
    fn bare_step(tag: Tag) -> PathStep {
        PathStep {
            tag,
            predicate: None,
        }
    }

    // --- descendant axis -------------------------------------------------

    #[test]
    fn parse_descendant_tag() {
        assert_eq!(parsed("//div"), XPathExpr::DescendantByTag(Tag::Div));
    }

    #[test]
    fn parse_descendant_p() {
        assert_eq!(parsed("//p"), XPathExpr::DescendantByTag(Tag::P));
    }

    #[test]
    fn parse_descendant_attr() {
        let expected = XPathExpr::DescendantByAttr {
            tag: Tag::A,
            attr: "href".to_owned(),
            value: "http://example.com".to_owned(),
        };
        assert_eq!(parsed("//a[@href='http://example.com']"), expected);
    }

    #[test]
    fn parse_descendant_attr_double_quote() {
        let expected = XPathExpr::DescendantByAttr {
            tag: Tag::A,
            attr: "href".to_owned(),
            value: "url".to_owned(),
        };
        assert_eq!(parsed("//a[@href=\"url\"]"), expected);
    }

    #[test]
    fn parse_descendant_attr_exists() {
        let expected = XPathExpr::DescendantByAttrExists {
            tag: Tag::A,
            attr: "href".to_owned(),
        };
        assert_eq!(parsed("//a[@href]"), expected);
    }

    #[test]
    fn parse_contains() {
        let expected = XPathExpr::ContainsPredicate {
            tag: Tag::A,
            attr: "class".to_owned(),
            substr: "nav".to_owned(),
        };
        assert_eq!(parsed("//a[contains(@class, 'nav')]"), expected);
    }

    #[test]
    fn parse_position() {
        let expected = XPathExpr::PositionPredicate {
            tag: Tag::Li,
            pos: 3,
        };
        assert_eq!(parsed("//li[position()=3]"), expected);
    }

    #[test]
    fn parse_position_shorthand() {
        let expected = XPathExpr::PositionPredicate {
            tag: Tag::Li,
            pos: 2,
        };
        assert_eq!(parsed("//li[2]"), expected);
    }

    #[test]
    fn parse_text_extract() {
        let inner = XPathExpr::DescendantByTag(Tag::P);
        assert_eq!(parsed("//p/text()"), XPathExpr::TextExtract(Box::new(inner)));
    }

    // --- absolute paths --------------------------------------------------

    #[test]
    fn parse_absolute_path() {
        let expected_steps = vec![
            bare_step(Tag::Html),
            bare_step(Tag::Body),
            bare_step(Tag::Div),
        ];
        assert_eq!(
            parsed("/html/body/div"),
            XPathExpr::AbsolutePath(expected_steps)
        );
    }

    #[test]
    fn parse_absolute_path_with_predicate() {
        let XPathExpr::AbsolutePath(steps) = parsed("/html/body/div[@class='main']") else {
            panic!("expected AbsolutePath");
        };

        assert_eq!(steps.len(), 3);
        let last = &steps[2];
        assert_eq!(last.tag, Tag::Div);
        assert_eq!(
            last.predicate,
            Some(Predicate::AttrEquals {
                attr: "class".to_owned(),
                value: "main".to_owned(),
            })
        );
    }

    #[test]
    fn parse_absolute_path_text() {
        let XPathExpr::TextExtract(inner) = parsed("/html/body/p/text()") else {
            panic!("expected TextExtract");
        };
        assert!(matches!(*inner, XPathExpr::AbsolutePath(_)));
    }

    // --- wildcard and parent ---------------------------------------------

    #[test]
    fn parse_wildcard() {
        assert_eq!(parsed("//*"), XPathExpr::DescendantWildcard);
    }

    #[test]
    fn parse_wildcard_attr() {
        let expected = XPathExpr::DescendantWildcardByAttr {
            attr: "id".to_owned(),
            value: "main".to_owned(),
        };
        assert_eq!(parsed("//*[@id='main']"), expected);
    }

    #[test]
    fn parse_wildcard_attr_exists() {
        let expected = XPathExpr::DescendantWildcardByAttrExists {
            attr: "id".to_owned(),
        };
        assert_eq!(parsed("//*[@id]"), expected);
    }

    #[test]
    fn parse_parent() {
        assert_eq!(parsed(".."), XPathExpr::Parent);
    }

    // --- rejected input --------------------------------------------------

    #[test]
    fn parse_empty_error() {
        let outcome = parse_xpath("");
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_unknown_tag_error() {
        let outcome = parse_xpath("//foobar");
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_unclosed_bracket_error() {
        let outcome = parse_xpath("//div[@class='x'");
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_trailing_garbage_error() {
        let outcome = parse_xpath("//div garbage");
        assert!(outcome.is_err());
    }
}
590    #[test]
591    fn parse_trailing_garbage_error() {
592        assert!(parse_xpath("//div garbage").is_err());
593    }
594}