1use fhp_core::error::XPathError;
17use fhp_core::tag::Tag;
18
19use super::ast::{PathStep, Predicate, XPathExpr};
20
21pub fn parse_xpath(input: &str) -> Result<XPathExpr, XPathError> {
27 let input = input.trim();
28 if input.is_empty() {
29 return Err(XPathError::Invalid {
30 reason: "empty xpath expression".to_string(),
31 });
32 }
33
34 let mut parser = XPathParser::new(input);
35 let expr = parser.parse()?;
36
37 if parser.remaining().starts_with("/text()") {
39 parser.advance(7);
40 parser.skip_whitespace();
41 if !parser.is_eof() {
42 return Err(XPathError::Invalid {
43 reason: format!("unexpected trailing: {}", parser.remaining()),
44 });
45 }
46 return Ok(XPathExpr::TextExtract(Box::new(expr)));
47 }
48
49 if !parser.is_eof() {
50 return Err(XPathError::Invalid {
51 reason: format!("unexpected trailing: {}", parser.remaining()),
52 });
53 }
54
55 Ok(expr)
56}
57
58struct XPathParser<'a> {
60 input: &'a str,
61 pos: usize,
62}
63
64impl<'a> XPathParser<'a> {
65 fn new(input: &'a str) -> Self {
66 Self { input, pos: 0 }
67 }
68
69 fn remaining(&self) -> &'a str {
70 &self.input[self.pos..]
71 }
72
73 fn is_eof(&self) -> bool {
74 self.pos >= self.input.len()
75 }
76
77 fn peek(&self) -> Option<u8> {
78 self.input.as_bytes().get(self.pos).copied()
79 }
80
81 fn advance(&mut self, n: usize) {
82 self.pos = (self.pos + n).min(self.input.len());
83 }
84
85 fn skip_whitespace(&mut self) {
86 while self.pos < self.input.len() && self.input.as_bytes()[self.pos].is_ascii_whitespace() {
87 self.pos += 1;
88 }
89 }
90
91 fn parse(&mut self) -> Result<XPathExpr, XPathError> {
92 if self.remaining().starts_with("..") {
94 self.advance(2);
95 return Ok(XPathExpr::Parent);
96 }
97
98 if self.remaining().starts_with("//") {
100 self.advance(2);
101 return self.parse_descendant();
102 }
103
104 if self.remaining().starts_with('/') {
106 self.advance(1);
107 return self.parse_absolute_path();
108 }
109
110 Err(XPathError::Invalid {
111 reason: format!("expected '/' or '//' at: {}", self.remaining()),
112 })
113 }
114
115 fn parse_descendant(&mut self) -> Result<XPathExpr, XPathError> {
117 self.skip_whitespace();
118
119 if self.peek() == Some(b'*') {
121 self.advance(1);
122 return self.parse_descendant_wildcard();
123 }
124
125 let tag = self.read_tag_name()?;
126
127 if self.peek() == Some(b'[') {
129 let pred = self.parse_predicate()?;
130 return self.build_descendant_with_predicate(tag, pred);
131 }
132
133 Ok(XPathExpr::DescendantByTag(tag))
135 }
136
137 fn parse_descendant_wildcard(&mut self) -> Result<XPathExpr, XPathError> {
139 if self.peek() == Some(b'[') {
140 let pred = self.parse_predicate()?;
141 match pred {
142 Predicate::AttrEquals { attr, value } => {
143 Ok(XPathExpr::DescendantWildcardByAttr { attr, value })
144 }
145 Predicate::AttrExists { attr } => {
146 Ok(XPathExpr::DescendantWildcardByAttrExists { attr })
147 }
148 _ => Err(XPathError::Invalid {
149 reason: "unsupported predicate on wildcard".to_string(),
150 }),
151 }
152 } else {
153 Ok(XPathExpr::DescendantWildcard)
154 }
155 }
156
157 fn build_descendant_with_predicate(
159 &self,
160 tag: Tag,
161 pred: Predicate,
162 ) -> Result<XPathExpr, XPathError> {
163 match pred {
164 Predicate::AttrEquals { attr, value } => {
165 Ok(XPathExpr::DescendantByAttr { tag, attr, value })
166 }
167 Predicate::Contains { attr, substr } => {
168 Ok(XPathExpr::ContainsPredicate { tag, attr, substr })
169 }
170 Predicate::Position(pos) => Ok(XPathExpr::PositionPredicate { tag, pos }),
171 Predicate::AttrExists { attr } => Ok(XPathExpr::DescendantByAttrExists { tag, attr }),
172 }
173 }
174
175 fn parse_absolute_path(&mut self) -> Result<XPathExpr, XPathError> {
177 let mut steps = Vec::new();
178 loop {
179 self.skip_whitespace();
180 if self.is_eof() || self.remaining().starts_with("/text()") {
181 break;
182 }
183
184 let tag = self.read_tag_name()?;
185 let predicate = if self.peek() == Some(b'[') {
186 Some(self.parse_predicate()?)
187 } else {
188 None
189 };
190
191 steps.push(PathStep { tag, predicate });
192
193 if self.peek() == Some(b'/') {
195 if self.remaining().starts_with("/text()") {
197 break;
198 }
199 self.advance(1);
200 } else {
201 break;
202 }
203 }
204
205 if steps.is_empty() {
206 return Err(XPathError::Invalid {
207 reason: "empty absolute path".to_string(),
208 });
209 }
210
211 Ok(XPathExpr::AbsolutePath(steps))
212 }
213
214 fn parse_predicate(&mut self) -> Result<Predicate, XPathError> {
216 self.expect(b'[')?;
217 self.skip_whitespace();
218
219 let pred = if self.remaining().starts_with("contains(") {
220 self.parse_contains_predicate()?
221 } else if self.remaining().starts_with("position()") {
222 self.parse_position_predicate()?
223 } else if self.peek() == Some(b'@') {
224 self.parse_attr_predicate()?
225 } else if self.peek().is_some_and(|b| b.is_ascii_digit()) {
226 let n = self.read_number()?;
228 Predicate::Position(n)
229 } else {
230 return Err(XPathError::Invalid {
231 reason: format!("unsupported predicate at: {}", self.remaining()),
232 });
233 };
234
235 self.skip_whitespace();
236 self.expect(b']')?;
237 Ok(pred)
238 }
239
240 fn parse_attr_predicate(&mut self) -> Result<Predicate, XPathError> {
242 self.expect(b'@')?;
243 let attr = self.read_ident()?;
244 self.skip_whitespace();
245
246 if self.peek() == Some(b'=') {
247 self.advance(1);
248 self.skip_whitespace();
249 let value = self.read_string_literal()?;
250 Ok(Predicate::AttrEquals { attr, value })
251 } else {
252 Ok(Predicate::AttrExists { attr })
253 }
254 }
255
256 fn parse_contains_predicate(&mut self) -> Result<Predicate, XPathError> {
258 self.advance_str("contains(")?;
259 self.skip_whitespace();
260 self.expect(b'@')?;
261 let attr = self.read_ident()?;
262 self.skip_whitespace();
263 self.expect(b',')?;
264 self.skip_whitespace();
265 let substr = self.read_string_literal()?;
266 self.skip_whitespace();
267 self.expect(b')')?;
268 Ok(Predicate::Contains { attr, substr })
269 }
270
271 fn parse_position_predicate(&mut self) -> Result<Predicate, XPathError> {
273 self.advance_str("position()")?;
274 self.skip_whitespace();
275 self.expect(b'=')?;
276 self.skip_whitespace();
277 let n = self.read_number()?;
278 Ok(Predicate::Position(n))
279 }
280
281 fn read_tag_name(&mut self) -> Result<Tag, XPathError> {
283 let name = self.read_ident()?;
284 let tag = Tag::from_bytes(name.as_bytes());
285 if tag == Tag::Unknown {
286 return Err(XPathError::Invalid {
287 reason: format!("unknown tag: {name}"),
288 });
289 }
290 Ok(tag)
291 }
292
293 fn read_ident(&mut self) -> Result<String, XPathError> {
295 let start = self.pos;
296 while self.pos < self.input.len() {
297 let b = self.input.as_bytes()[self.pos];
298 if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
299 self.pos += 1;
300 } else {
301 break;
302 }
303 }
304 if self.pos == start {
305 return Err(XPathError::Invalid {
306 reason: format!("expected identifier at position {}", self.pos),
307 });
308 }
309 Ok(self.input[start..self.pos].to_string())
310 }
311
312 fn read_string_literal(&mut self) -> Result<String, XPathError> {
314 let quote = self.peek().ok_or_else(|| XPathError::Invalid {
315 reason: "expected string literal, got EOF".to_string(),
316 })?;
317
318 if quote != b'\'' && quote != b'"' {
319 return Err(XPathError::Invalid {
320 reason: format!("expected quote, got '{}'", quote as char),
321 });
322 }
323
324 self.advance(1);
325 let start = self.pos;
326 while self.pos < self.input.len() && self.input.as_bytes()[self.pos] != quote {
327 self.pos += 1;
328 }
329 if self.pos >= self.input.len() {
330 return Err(XPathError::Invalid {
331 reason: "unclosed string literal".to_string(),
332 });
333 }
334 let value = self.input[start..self.pos].to_string();
335 self.advance(1); Ok(value)
337 }
338
339 fn read_number(&mut self) -> Result<usize, XPathError> {
341 let start = self.pos;
342 while self.pos < self.input.len() && self.input.as_bytes()[self.pos].is_ascii_digit() {
343 self.pos += 1;
344 }
345 if self.pos == start {
346 return Err(XPathError::Invalid {
347 reason: format!("expected number at position {}", self.pos),
348 });
349 }
350 self.input[start..self.pos]
351 .parse::<usize>()
352 .map_err(|_| XPathError::Invalid {
353 reason: "invalid number".to_string(),
354 })
355 }
356
357 fn expect(&mut self, expected: u8) -> Result<(), XPathError> {
359 if self.peek() == Some(expected) {
360 self.advance(1);
361 Ok(())
362 } else {
363 Err(XPathError::Invalid {
364 reason: format!(
365 "expected '{}', got '{}'",
366 expected as char,
367 self.peek()
368 .map_or("EOF".to_string(), |b| (b as char).to_string())
369 ),
370 })
371 }
372 }
373
374 fn advance_str(&mut self, s: &str) -> Result<(), XPathError> {
376 if self.remaining().starts_with(s) {
377 self.advance(s.len());
378 Ok(())
379 } else {
380 Err(XPathError::Invalid {
381 reason: format!("expected '{}' at: {}", s, self.remaining()),
382 })
383 }
384 }
385}
386
#[cfg(test)]
mod tests {
    use super::*;
    use crate::xpath::ast::{Predicate, XPathExpr};

    /// Parses `src`, panicking if the parser rejects it.
    fn parsed(src: &str) -> XPathExpr {
        parse_xpath(src).unwrap()
    }

    /// Builds an absolute-path step without a predicate.
    fn bare_step(tag: Tag) -> PathStep {
        PathStep {
            tag,
            predicate: None,
        }
    }

    #[test]
    fn parse_descendant_tag() {
        assert_eq!(parsed("//div"), XPathExpr::DescendantByTag(Tag::Div));
    }

    #[test]
    fn parse_descendant_p() {
        assert_eq!(parsed("//p"), XPathExpr::DescendantByTag(Tag::P));
    }

    #[test]
    fn parse_descendant_attr() {
        let expected = XPathExpr::DescendantByAttr {
            tag: Tag::A,
            attr: "href".to_string(),
            value: "http://example.com".to_string(),
        };
        assert_eq!(parsed("//a[@href='http://example.com']"), expected);
    }

    #[test]
    fn parse_descendant_attr_double_quote() {
        let expected = XPathExpr::DescendantByAttr {
            tag: Tag::A,
            attr: "href".to_string(),
            value: "url".to_string(),
        };
        assert_eq!(parsed("//a[@href=\"url\"]"), expected);
    }

    #[test]
    fn parse_descendant_attr_exists() {
        let expected = XPathExpr::DescendantByAttrExists {
            tag: Tag::A,
            attr: "href".to_string(),
        };
        assert_eq!(parsed("//a[@href]"), expected);
    }

    #[test]
    fn parse_contains() {
        let expected = XPathExpr::ContainsPredicate {
            tag: Tag::A,
            attr: "class".to_string(),
            substr: "nav".to_string(),
        };
        assert_eq!(parsed("//a[contains(@class, 'nav')]"), expected);
    }

    #[test]
    fn parse_position() {
        let expected = XPathExpr::PositionPredicate {
            tag: Tag::Li,
            pos: 3,
        };
        assert_eq!(parsed("//li[position()=3]"), expected);
    }

    #[test]
    fn parse_position_shorthand() {
        let expected = XPathExpr::PositionPredicate {
            tag: Tag::Li,
            pos: 2,
        };
        assert_eq!(parsed("//li[2]"), expected);
    }

    #[test]
    fn parse_text_extract() {
        let inner = XPathExpr::DescendantByTag(Tag::P);
        assert_eq!(parsed("//p/text()"), XPathExpr::TextExtract(Box::new(inner)));
    }

    #[test]
    fn parse_absolute_path() {
        let expected = XPathExpr::AbsolutePath(vec![
            bare_step(Tag::Html),
            bare_step(Tag::Body),
            bare_step(Tag::Div),
        ]);
        assert_eq!(parsed("/html/body/div"), expected);
    }

    #[test]
    fn parse_absolute_path_with_predicate() {
        let XPathExpr::AbsolutePath(steps) = parsed("/html/body/div[@class='main']") else {
            panic!("expected AbsolutePath");
        };

        assert_eq!(steps.len(), 3);
        assert_eq!(steps[2].tag, Tag::Div);
        assert_eq!(
            steps[2].predicate,
            Some(Predicate::AttrEquals {
                attr: "class".to_string(),
                value: "main".to_string(),
            })
        );
    }

    #[test]
    fn parse_absolute_path_text() {
        let XPathExpr::TextExtract(inner) = parsed("/html/body/p/text()") else {
            panic!("expected TextExtract");
        };
        assert!(matches!(*inner, XPathExpr::AbsolutePath(_)));
    }

    #[test]
    fn parse_wildcard() {
        assert_eq!(parsed("//*"), XPathExpr::DescendantWildcard);
    }

    #[test]
    fn parse_wildcard_attr() {
        let expected = XPathExpr::DescendantWildcardByAttr {
            attr: "id".to_string(),
            value: "main".to_string(),
        };
        assert_eq!(parsed("//*[@id='main']"), expected);
    }

    #[test]
    fn parse_wildcard_attr_exists() {
        let expected = XPathExpr::DescendantWildcardByAttrExists {
            attr: "id".to_string(),
        };
        assert_eq!(parsed("//*[@id]"), expected);
    }

    #[test]
    fn parse_parent() {
        assert_eq!(parsed(".."), XPathExpr::Parent);
    }

    #[test]
    fn parse_empty_error() {
        assert!(parse_xpath("").is_err());
    }

    #[test]
    fn parse_unknown_tag_error() {
        assert!(parse_xpath("//foobar").is_err());
    }

    #[test]
    fn parse_unclosed_bracket_error() {
        assert!(parse_xpath("//div[@class='x'").is_err());
    }

    #[test]
    fn parse_trailing_garbage_error() {
        assert!(parse_xpath("//div garbage").is_err());
    }
}