1use std::collections::VecDeque;
23use std::fmt;
24
25use super::XPathMode;
26
/// A lexical token produced by the XPath [`Lexer`].
///
/// Literal and name variants carry the source text they were built from;
/// every other variant is a fixed keyword, axis or punctuation mark whose
/// textual form is given by the `Display` implementation.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- literals ----------------------------------------------------
    /// String literal; holds the value without the surrounding quotes.
    StringLiteral(String),
    /// Integer literal, kept as its source text.
    IntegerLiteral(String),
    /// Decimal literal (contains a `.`), kept as its source text.
    DecimalLiteral(String),
    /// Double literal (contains an exponent), kept as its source text.
    DoubleLiteral(String),

    // ---- names -------------------------------------------------------
    /// Unprefixed name.
    NCName(String),
    /// Possibly prefixed name, stored as written (`prefix:local` or `local`).
    QName(String),
    /// Name that is displayed with a trailing `?`.
    QNameNillable(String),
    /// Variable name following `$`; `prefix` is empty when none was written.
    VarName {
        prefix: String,
        local: String,
    },

    // ---- expression keywords -----------------------------------------
    For,
    In,
    Return,
    If,
    Then,
    Else,
    Some,
    Every,
    Satisfies,
    And,
    Or,
    To,
    Div,
    IDiv,
    Mod,
    Union,
    Except,
    Intersect,

    // ---- two-word type operators -------------------------------------
    /// `instance of`
    InstanceOf,
    /// `treat as`
    TreatAs,
    /// `cast as`
    CastAs,
    /// `castable as`
    CastableAs,

    // ---- kind tests and sequence types -------------------------------
    Element,
    Attribute,
    Text,
    Comment,
    Node,
    DocumentNode,
    ProcessingInstruction,
    SchemaElement,
    SchemaAttribute,
    Item,
    EmptySequence,

    // ---- axes (each includes the trailing `::`) ------------------------
    AxisChild,
    AxisDescendant,
    AxisAttribute,
    AxisSelf,
    AxisDescendantOrSelf,
    AxisFollowingSibling,
    AxisFollowing,
    AxisParent,
    AxisAncestor,
    AxisPrecedingSibling,
    AxisPreceding,
    AxisAncestorOrSelf,
    AxisNamespace,

    // ---- value / node comparison keywords ----------------------------
    Eq,
    Ne,
    Lt,
    Le,
    Gt,
    Ge,
    Is,

    // ---- occurrence indicators ---------------------------------------
    /// `?`
    OccurrenceZeroOrOne,
    /// `+`
    OccurrenceOneOrMore,
    /// `*`
    OccurrenceZeroOrMore,

    // ---- multi-character punctuation ---------------------------------
    /// `..`
    DoublePeriod,
    /// `//`
    DoubleSlash,
    /// `!=`
    NotEquals,
    /// `<=`
    LessEquals,
    /// `>=`
    GreaterEquals,
    /// `<<`
    DoubleLess,
    /// `>>`
    DoubleGreater,

    // ---- single-character punctuation --------------------------------
    LParen,
    RParen,
    LBracket,
    RBracket,
    Comma,
    Colon,
    At,
    Dollar,
    /// `/` that is followed by something that can begin a relative path.
    Slash,
    /// `/` that is not followed by the start of a relative path.
    SlashOnly,
    Pipe,
    /// `+` seen where an operator is expected.
    Plus,
    /// `-` seen where an operator is expected.
    Minus,
    /// `-` seen where an operand is expected, XPath 1.0 mode.
    Minus10,
    /// `+` seen where an operand is expected, XPath 1.0 mode.
    Plus10,
    /// `-` seen where an operand is expected, XPath 2.0 mode.
    Minus20,
    /// `+` seen where an operand is expected, XPath 2.0 mode.
    Plus20,
    Star,
    Equals,
    LessThan,
    GreaterThan,
    Question,
    Dot,

    /// End of input.
    Eof,
}
155
156impl fmt::Display for Token {
157 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158 match self {
159 Token::StringLiteral(s) => write!(f, "\"{}\"", s),
160 Token::IntegerLiteral(s) => write!(f, "{}", s),
161 Token::DecimalLiteral(s) => write!(f, "{}", s),
162 Token::DoubleLiteral(s) => write!(f, "{}", s),
163 Token::NCName(s) => write!(f, "{}", s),
164 Token::QName(s) => write!(f, "{}", s),
165 Token::QNameNillable(s) => write!(f, "{}?", s),
166 Token::VarName { prefix, local } => {
167 if prefix.is_empty() {
168 write!(f, "${}", local)
169 } else {
170 write!(f, "${}:{}", prefix, local)
171 }
172 }
173 Token::For => write!(f, "for"),
174 Token::In => write!(f, "in"),
175 Token::Return => write!(f, "return"),
176 Token::If => write!(f, "if"),
177 Token::Then => write!(f, "then"),
178 Token::Else => write!(f, "else"),
179 Token::Some => write!(f, "some"),
180 Token::Every => write!(f, "every"),
181 Token::Satisfies => write!(f, "satisfies"),
182 Token::And => write!(f, "and"),
183 Token::Or => write!(f, "or"),
184 Token::To => write!(f, "to"),
185 Token::Div => write!(f, "div"),
186 Token::IDiv => write!(f, "idiv"),
187 Token::Mod => write!(f, "mod"),
188 Token::Union => write!(f, "union"),
189 Token::Except => write!(f, "except"),
190 Token::Intersect => write!(f, "intersect"),
191 Token::InstanceOf => write!(f, "instance of"),
192 Token::TreatAs => write!(f, "treat as"),
193 Token::CastAs => write!(f, "cast as"),
194 Token::CastableAs => write!(f, "castable as"),
195 Token::Element => write!(f, "element"),
196 Token::Attribute => write!(f, "attribute"),
197 Token::Text => write!(f, "text"),
198 Token::Comment => write!(f, "comment"),
199 Token::Node => write!(f, "node"),
200 Token::DocumentNode => write!(f, "document-node"),
201 Token::ProcessingInstruction => write!(f, "processing-instruction"),
202 Token::SchemaElement => write!(f, "schema-element"),
203 Token::SchemaAttribute => write!(f, "schema-attribute"),
204 Token::Item => write!(f, "item"),
205 Token::EmptySequence => write!(f, "empty-sequence"),
206 Token::AxisChild => write!(f, "child::"),
207 Token::AxisDescendant => write!(f, "descendant::"),
208 Token::AxisAttribute => write!(f, "attribute::"),
209 Token::AxisSelf => write!(f, "self::"),
210 Token::AxisDescendantOrSelf => write!(f, "descendant-or-self::"),
211 Token::AxisFollowingSibling => write!(f, "following-sibling::"),
212 Token::AxisFollowing => write!(f, "following::"),
213 Token::AxisParent => write!(f, "parent::"),
214 Token::AxisAncestor => write!(f, "ancestor::"),
215 Token::AxisPrecedingSibling => write!(f, "preceding-sibling::"),
216 Token::AxisPreceding => write!(f, "preceding::"),
217 Token::AxisAncestorOrSelf => write!(f, "ancestor-or-self::"),
218 Token::AxisNamespace => write!(f, "namespace::"),
219 Token::Eq => write!(f, "eq"),
220 Token::Ne => write!(f, "ne"),
221 Token::Lt => write!(f, "lt"),
222 Token::Le => write!(f, "le"),
223 Token::Gt => write!(f, "gt"),
224 Token::Ge => write!(f, "ge"),
225 Token::Is => write!(f, "is"),
226 Token::OccurrenceZeroOrOne => write!(f, "?"),
227 Token::OccurrenceOneOrMore => write!(f, "+"),
228 Token::OccurrenceZeroOrMore => write!(f, "*"),
229 Token::DoublePeriod => write!(f, ".."),
230 Token::DoubleSlash => write!(f, "//"),
231 Token::NotEquals => write!(f, "!="),
232 Token::LessEquals => write!(f, "<="),
233 Token::GreaterEquals => write!(f, ">="),
234 Token::DoubleLess => write!(f, "<<"),
235 Token::DoubleGreater => write!(f, ">>"),
236 Token::LParen => write!(f, "("),
237 Token::RParen => write!(f, ")"),
238 Token::LBracket => write!(f, "["),
239 Token::RBracket => write!(f, "]"),
240 Token::Comma => write!(f, ","),
241 Token::Colon => write!(f, ":"),
242 Token::At => write!(f, "@"),
243 Token::Dollar => write!(f, "$"),
244 Token::Slash => write!(f, "/"),
245 Token::SlashOnly => write!(f, "/"),
246 Token::Pipe => write!(f, "|"),
247 Token::Plus => write!(f, "+"),
248 Token::Minus => write!(f, "-"),
249 Token::Minus10 | Token::Minus20 => write!(f, "-"),
250 Token::Plus10 | Token::Plus20 => write!(f, "+"),
251 Token::Star => write!(f, "*"),
252 Token::Equals => write!(f, "="),
253 Token::LessThan => write!(f, "<"),
254 Token::GreaterThan => write!(f, ">"),
255 Token::Question => write!(f, "?"),
256 Token::Dot => write!(f, "."),
257 Token::Eof => write!(f, "EOF"),
258 }
259 }
260}
261
/// Error reported when the lexer cannot tokenize its input.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Position in the input at which the problem was detected.
    pub position: usize,
}
268
269impl fmt::Display for LexerError {
270 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
271 write!(
272 f,
273 "Lexer error at position {}: {}",
274 self.position, self.message
275 )
276 }
277}
278
// Marker impl: lets `LexerError` be used wherever a `std::error::Error` is
// expected; the trait's default methods are sufficient.
impl std::error::Error for LexerError {}
280
/// A token together with its `(start, token, end)` positions in the input.
pub type Spanned = (usize, Token, usize);
283
/// Lexical state of the [`Lexer`]; selects which tokens are recognised next.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerState {
    /// Initial state: an operand (name, literal, path step, ...) is expected.
    Default,
    /// An operand was just read: an operator or closing punctuation is expected.
    Operator,
    /// Entered after `cast as` / `castable as`.
    SingleType,
    /// Entered after `instance of` / `treat as`.
    ItemType,
    /// Entered after the `(` of a kind test such as `element(`.
    KindTest,
    /// Entered after `processing-instruction(`.
    KindTestForPi,
    /// NOTE(review): handled outside the section reviewed here.
    CloseKindTest,
    /// NOTE(review): handled outside the section reviewed here.
    TypeNameInKindTest,
    /// Entered after the type name read in the `SingleType` state.
    OccurrenceIndicator,
    /// Entered after `$`: a variable name is expected.
    VarName,
}
308
309pub struct Lexer<'input> {
311 #[allow(dead_code)]
312 input: &'input str,
313 chars: Vec<char>,
314 pos: usize,
315 state: LexerState,
316 state_stack: Vec<LexerState>,
317 token_queue: VecDeque<Spanned>,
318 finished: bool,
319 mode: XPathMode,
320}
321
322impl<'input> Lexer<'input> {
323 pub fn new(input: &'input str) -> Self {
325 Self {
326 input,
327 chars: input.chars().collect(),
328 pos: 0,
329 state: LexerState::Default,
330 state_stack: Vec::new(),
331 token_queue: VecDeque::new(),
332 finished: false,
333 mode: XPathMode::XPath20,
334 }
335 }
336
337 pub fn new_with_mode(input: &'input str, mode: XPathMode) -> Self {
339 Self {
340 input,
341 chars: input.chars().collect(),
342 pos: 0,
343 state: LexerState::Default,
344 state_stack: Vec::new(),
345 token_queue: VecDeque::new(),
346 finished: false,
347 mode,
348 }
349 }
350
351 fn is_xpath10(&self) -> bool {
353 self.mode == XPathMode::XPath10
354 }
355
356 #[inline]
358 fn peek(&self, offset: usize) -> Option<char> {
359 self.chars.get(self.pos + offset).copied()
360 }
361
362 #[inline]
364 fn current(&self) -> Option<char> {
365 self.peek(0)
366 }
367
368 #[inline]
370 fn advance(&mut self, n: usize) {
371 self.pos += n;
372 }
373
374 #[inline]
376 fn read(&mut self) -> Option<char> {
377 let c = self.current();
378 if c.is_some() {
379 self.pos += 1;
380 }
381 c
382 }
383
/// Returns `true` if `c` may begin a name: an underscore or any alphabetic
/// character.
fn is_ncname_start(c: char) -> bool {
    matches!(c, '_') || c.is_alphabetic()
}
388
/// Returns `true` if `c` may appear inside a name after its first character.
///
/// Accepts alphanumerics, `_`, `-`, `.`, U+00B7, and the ranges
/// U+0300..=U+036F and U+203F..=U+2040.
fn is_ncname_char(c: char) -> bool {
    match c {
        '_' | '-' | '.' | '\u{B7}' => true,
        '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}' => true,
        other => other.is_alphanumeric(),
    }
}
399
/// Returns `true` for the ASCII digits `0` through `9`.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
404
/// Returns `true` for space, tab, carriage return and line feed.
fn is_whitespace(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\r' || c == '\n'
}
409
410 fn can_start_relative_path(&self) -> bool {
413 let mut i = 0;
414
415 loop {
417 while self.pos + i < self.chars.len() && Self::is_whitespace(self.chars[self.pos + i]) {
418 i += 1;
419 }
420
421 if self.pos + i + 1 < self.chars.len()
423 && self.chars[self.pos + i] == '('
424 && self.chars[self.pos + i + 1] == ':'
425 {
426 i += 2;
427 let mut depth = 1;
428 while depth > 0 && self.pos + i < self.chars.len() {
429 if self.pos + i + 1 < self.chars.len()
430 && self.chars[self.pos + i] == '('
431 && self.chars[self.pos + i + 1] == ':'
432 {
433 i += 2;
434 depth += 1;
435 } else if self.pos + i + 1 < self.chars.len()
436 && self.chars[self.pos + i] == ':'
437 && self.chars[self.pos + i + 1] == ')'
438 {
439 i += 2;
440 depth -= 1;
441 } else {
442 i += 1;
443 }
444 }
445 continue;
446 }
447 break;
448 }
449
450 if self.pos + i >= self.chars.len() {
452 return false; }
454
455 let c = self.chars[self.pos + i];
456
457 matches!(c,
467 'a'..='z' | 'A'..='Z' | '_' | '@' | '*' | '.' | '$' | '(' | '"' | '\''
469 ) || c.is_ascii_digit()
470 || c.is_alphabetic()
471 }
472
473 fn skip_whitespace_and_comments(&mut self) {
475 loop {
476 while let Some(c) = self.current() {
478 if Self::is_whitespace(c) {
479 self.advance(1);
480 } else {
481 break;
482 }
483 }
484
485 if self.peek(0) == Some('(') && self.peek(1) == Some(':') {
487 self.advance(2);
488 let mut depth = 1;
489 while depth > 0 {
490 match (self.peek(0), self.peek(1)) {
491 (Some('('), Some(':')) => {
492 self.advance(2);
493 depth += 1;
494 }
495 (Some(':'), Some(')')) => {
496 self.advance(2);
497 depth -= 1;
498 }
499 (Some(_), _) => {
500 self.advance(1);
501 }
502 (None, _) => break, }
504 }
505 continue;
506 }
507
508 break;
509 }
510 }
511
512 fn match_identifier(&mut self, parts: &[&str]) -> bool {
516 let start_pos = self.pos;
517 let mut i = 0;
518
519 for part in parts {
520 while i < self.chars.len() - self.pos {
522 let idx = self.pos + i;
523 if idx >= self.chars.len() {
524 break;
525 }
526 let c = self.chars[idx];
527 if Self::is_whitespace(c) {
528 i += 1;
529 continue;
530 }
531 if idx + 1 < self.chars.len()
533 && self.chars[idx] == '('
534 && self.chars[idx + 1] == ':'
535 {
536 i += 2;
537 let mut depth = 1;
538 while depth > 0 && self.pos + i < self.chars.len() {
539 if self.pos + i + 1 < self.chars.len()
540 && self.chars[self.pos + i] == '('
541 && self.chars[self.pos + i + 1] == ':'
542 {
543 i += 2;
544 depth += 1;
545 } else if self.pos + i + 1 < self.chars.len()
546 && self.chars[self.pos + i] == ':'
547 && self.chars[self.pos + i + 1] == ')'
548 {
549 i += 2;
550 depth -= 1;
551 } else {
552 i += 1;
553 }
554 }
555 continue;
556 }
557 break;
558 }
559
560 let part_chars: Vec<char> = part.chars().collect();
562 for (j, &pc) in part_chars.iter().enumerate() {
563 let idx = self.pos + i + j;
564 if idx >= self.chars.len() || self.chars[idx] != pc {
565 self.pos = start_pos;
566 return false;
567 }
568 }
569
570 if !part.is_empty() && Self::is_ncname_start(part_chars[0]) {
572 let after_idx = self.pos + i + part_chars.len();
573 if after_idx < self.chars.len() && Self::is_ncname_char(self.chars[after_idx]) {
574 self.pos = start_pos;
575 return false;
576 }
577 }
578
579 i += part_chars.len();
580 }
581
582 self.pos += i;
584 true
585 }
586
587 fn try_match_identifier(&self, parts: &[&str]) -> bool {
589 let mut i = 0;
590
591 for part in parts {
592 while self.pos + i < self.chars.len() {
594 let idx = self.pos + i;
595 let c = self.chars[idx];
596 if Self::is_whitespace(c) {
597 i += 1;
598 continue;
599 }
600 if idx + 1 < self.chars.len()
602 && self.chars[idx] == '('
603 && self.chars[idx + 1] == ':'
604 {
605 i += 2;
606 let mut depth = 1;
607 while depth > 0 && self.pos + i < self.chars.len() {
608 if self.pos + i + 1 < self.chars.len()
609 && self.chars[self.pos + i] == '('
610 && self.chars[self.pos + i + 1] == ':'
611 {
612 i += 2;
613 depth += 1;
614 } else if self.pos + i + 1 < self.chars.len()
615 && self.chars[self.pos + i] == ':'
616 && self.chars[self.pos + i + 1] == ')'
617 {
618 i += 2;
619 depth -= 1;
620 } else {
621 i += 1;
622 }
623 }
624 continue;
625 }
626 break;
627 }
628
629 let part_chars: Vec<char> = part.chars().collect();
631 for (j, &pc) in part_chars.iter().enumerate() {
632 let idx = self.pos + i + j;
633 if idx >= self.chars.len() || self.chars[idx] != pc {
634 return false;
635 }
636 }
637
638 if !part.is_empty() && Self::is_ncname_start(part_chars[0]) {
640 let after_idx = self.pos + i + part_chars.len();
641 if after_idx < self.chars.len() && Self::is_ncname_char(self.chars[after_idx]) {
642 return false;
643 }
644 }
645
646 i += part_chars.len();
647 }
648
649 true
650 }
651
652 fn consume_ncname(&mut self) -> String {
654 let start = self.pos;
655 while let Some(c) = self.current() {
656 if Self::is_ncname_char(c) {
657 self.advance(1);
658 } else {
659 break;
660 }
661 }
662 self.chars[start..self.pos].iter().collect()
663 }
664
665 fn consume_qname(&mut self) -> String {
667 let start = self.pos;
668 while let Some(c) = self.current() {
669 if (c == ':' && self.peek(1).map(Self::is_ncname_char).unwrap_or(false))
671 || Self::is_ncname_char(c)
672 {
673 self.advance(1);
674 } else {
675 break;
676 }
677 }
678 self.chars[start..self.pos].iter().collect()
679 }
680
681 fn consume_number(&mut self) -> (Token, usize, usize) {
683 let start = self.pos;
684 let mut is_decimal = false;
685 let mut is_double = false;
686
687 while let Some(c) = self.current() {
689 if Self::is_digit(c) {
690 self.advance(1);
691 } else {
692 break;
693 }
694 }
695
696 if self.current() == Some('.') && self.peek(1).map(Self::is_digit).unwrap_or(false) {
698 is_decimal = true;
699 self.advance(1);
700 while let Some(c) = self.current() {
701 if Self::is_digit(c) {
702 self.advance(1);
703 } else {
704 break;
705 }
706 }
707 } else if self.current() == Some('.') && start == self.pos {
708 is_decimal = true;
710 self.advance(1);
711 while let Some(c) = self.current() {
712 if Self::is_digit(c) {
713 self.advance(1);
714 } else {
715 break;
716 }
717 }
718 }
719
720 if let Some(c) = self.current() {
722 if c == 'e' || c == 'E' {
723 is_double = true;
724 self.advance(1);
725 if let Some(sign) = self.current() {
726 if sign == '+' || sign == '-' {
727 self.advance(1);
728 }
729 }
730 while let Some(c) = self.current() {
731 if Self::is_digit(c) {
732 self.advance(1);
733 } else {
734 break;
735 }
736 }
737 }
738 }
739
740 let value: String = self.chars[start..self.pos].iter().collect();
741 let token = if is_double {
742 Token::DoubleLiteral(value)
743 } else if is_decimal {
744 Token::DecimalLiteral(value)
745 } else {
746 Token::IntegerLiteral(value)
747 };
748
749 (token, start, self.pos)
750 }
751
752 fn consume_string(&mut self) -> Result<(Token, usize, usize), LexerError> {
754 let start = self.pos;
755 let quote = self.read().unwrap();
756 let mut value = String::new();
757
758 loop {
759 match self.current() {
760 None => {
761 return Err(LexerError {
762 message: "Unterminated string literal".to_string(),
763 position: start,
764 });
765 }
766 Some(c) if c == quote => {
767 self.advance(1);
768 if self.current() == Some(quote) {
770 value.push(quote);
771 self.advance(1);
772 } else {
773 break;
774 }
775 }
776 Some(c) => {
777 value.push(c);
778 self.advance(1);
779 }
780 }
781 }
782
783 let value =
786 crate::xpath::string_ops::normalize_string_value(&value, false, true).map_err(|e| {
787 LexerError {
788 message: format!("{}", e),
789 position: start,
790 }
791 })?;
792
793 Ok((Token::StringLiteral(value), start, self.pos))
794 }
795
796 fn enqueue(&mut self, token: Token, start: usize, end: usize) {
798 self.token_queue.push_back((start, token, end));
799 }
800
801 fn default_state(&mut self) -> Result<(), LexerError> {
803 self.skip_whitespace_and_comments();
804 let start = self.pos;
805
806 match self.current() {
807 None => {
808 self.enqueue(Token::Eof, start, start);
809 }
810
811 Some('.') => {
812 if self.peek(1) == Some('.') {
813 self.advance(2);
814 self.enqueue(Token::DoublePeriod, start, self.pos);
815 } else if self.peek(1).map(Self::is_digit).unwrap_or(false) {
816 let (tok, s, e) = self.consume_number();
817 self.enqueue(tok, s, e);
818 } else {
819 self.advance(1);
820 self.enqueue(Token::Dot, start, self.pos);
821 }
822 self.state = LexerState::Operator;
823 }
824
825 Some(')') => {
826 self.advance(1);
827 self.enqueue(Token::RParen, start, self.pos);
828 self.state = LexerState::Operator;
829 }
830
831 Some('*') => {
832 self.advance(1);
833 let star_end = self.pos;
834 if self.current() == Some(':') {
836 self.advance(1);
837 if self.current().map(Self::is_ncname_start).unwrap_or(false) {
838 self.enqueue(Token::Star, start, star_end);
839 let colon_start = star_end;
840 self.enqueue(Token::Colon, colon_start, colon_start + 1);
841 let ncname_start = self.pos;
842 let ncname = self.consume_ncname();
843 self.enqueue(Token::NCName(ncname), ncname_start, self.pos);
844 } else {
845 self.pos = star_end; self.enqueue(Token::Star, start, star_end);
848 }
849 } else {
850 self.enqueue(Token::Star, start, self.pos);
851 }
852 self.state = LexerState::Operator;
853 }
854
855 Some(c @ (';' | ',' | '(' | '-' | '+' | '@' | '~')) => {
856 self.advance(1);
857 let token = match c {
858 ';' => Token::Comma, ',' => Token::Comma,
860 '(' => Token::LParen,
861 '-' => {
862 if self.is_xpath10() {
863 Token::Minus10
864 } else {
865 Token::Minus20
866 }
867 }
868 '+' => {
869 if self.is_xpath10() {
870 Token::Plus10
871 } else {
872 Token::Plus20
873 }
874 }
875 '@' => Token::At,
876 '~' => {
877 if self.is_xpath10() {
878 Token::Minus10
879 } else {
880 Token::Minus20
881 }
882 }
883 _ => unreachable!(),
884 };
885 self.enqueue(token, start, self.pos);
886 }
887
888 Some('/') => {
889 if self.peek(1) == Some('/') {
890 self.advance(2);
891 self.enqueue(Token::DoubleSlash, start, self.pos);
892 } else {
893 self.advance(1);
894 if self.can_start_relative_path() {
896 self.enqueue(Token::Slash, start, self.pos);
897 } else {
898 self.enqueue(Token::SlashOnly, start, self.pos);
900 self.state = LexerState::Operator;
901 }
902 }
903 }
904
905 Some('$') => {
906 self.advance(1);
907 self.enqueue(Token::Dollar, start, self.pos);
908 self.state = LexerState::VarName;
909 }
910
911 Some('[') => {
912 self.advance(1);
913 self.enqueue(Token::LBracket, start, self.pos);
914 self.state_stack.push(self.state);
915 }
916
917 Some(']') => {
918 self.advance(1);
919 self.enqueue(Token::RBracket, start, self.pos);
920 if let Some(s) = self.state_stack.pop() {
921 self.state = s;
922 }
923 }
924
925 Some('"') | Some('\'') => {
926 let (tok, s, e) = self.consume_string()?;
927 self.enqueue(tok, s, e);
928 self.state = LexerState::Operator;
929 }
930
931 Some(c) if Self::is_digit(c) => {
932 let (tok, s, e) = self.consume_number();
933 self.enqueue(tok, s, e);
934 self.state = LexerState::Operator;
935 }
936
937 Some(c) if Self::is_ncname_start(c) => {
938 self.process_name_in_default_state(start)?;
939 }
940
941 Some(c) => {
942 return Err(LexerError {
943 message: format!("Unexpected character: '{}'", c),
944 position: start,
945 });
946 }
947 }
948
949 Ok(())
950 }
951
952 fn process_name_in_default_state(&mut self, start: usize) -> Result<(), LexerError> {
954 if !self.is_xpath10() && self.match_identifier(&["if", "("]) {
958 self.enqueue(Token::If, start, self.pos - 1);
959 self.enqueue(Token::LParen, self.pos - 1, self.pos);
960 return Ok(());
961 }
962
963 if !self.is_xpath10() && self.try_match_identifier(&["for"]) {
965 self.match_identifier(&["for"]);
966 self.enqueue(Token::For, start, self.pos);
967 self.skip_whitespace_and_comments();
968 if self.current() == Some('$') {
969 let dollar_start = self.pos;
970 self.advance(1);
971 self.enqueue(Token::Dollar, dollar_start, self.pos);
972 self.state = LexerState::VarName;
973 }
974 return Ok(());
975 }
976
977 if !self.is_xpath10() && self.try_match_identifier(&["some"]) {
979 self.match_identifier(&["some"]);
980 self.enqueue(Token::Some, start, self.pos);
981 self.skip_whitespace_and_comments();
982 if self.current() == Some('$') {
983 let dollar_start = self.pos;
984 self.advance(1);
985 self.enqueue(Token::Dollar, dollar_start, self.pos);
986 self.state = LexerState::VarName;
987 }
988 return Ok(());
989 }
990
991 if !self.is_xpath10() && self.try_match_identifier(&["every"]) {
993 self.match_identifier(&["every"]);
994 self.enqueue(Token::Every, start, self.pos);
995 self.skip_whitespace_and_comments();
996 if self.current() == Some('$') {
997 let dollar_start = self.pos;
998 self.advance(1);
999 self.enqueue(Token::Dollar, dollar_start, self.pos);
1000 self.state = LexerState::VarName;
1001 }
1002 return Ok(());
1003 }
1004
1005 if !self.is_xpath10() && self.match_identifier(&["element", "("]) {
1009 self.enqueue(Token::Element, start, self.pos - 1);
1010 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1011 self.state_stack.push(LexerState::Operator);
1012 self.state = LexerState::KindTest;
1013 return Ok(());
1014 }
1015 if !self.is_xpath10() && self.match_identifier(&["attribute", "("]) {
1016 self.enqueue(Token::Attribute, start, self.pos - 1);
1017 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1018 self.state_stack.push(LexerState::Operator);
1019 self.state = LexerState::KindTest;
1020 return Ok(());
1021 }
1022 if !self.is_xpath10() && self.match_identifier(&["schema-element", "("]) {
1023 self.enqueue(Token::SchemaElement, start, self.pos - 1);
1024 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1025 self.state_stack.push(LexerState::Operator);
1026 self.state = LexerState::KindTest;
1027 return Ok(());
1028 }
1029 if !self.is_xpath10() && self.match_identifier(&["schema-attribute", "("]) {
1030 self.enqueue(Token::SchemaAttribute, start, self.pos - 1);
1031 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1032 self.state_stack.push(LexerState::Operator);
1033 self.state = LexerState::KindTest;
1034 return Ok(());
1035 }
1036 if self.match_identifier(&["comment", "("]) {
1038 self.enqueue(Token::Comment, start, self.pos - 1);
1039 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1040 self.state_stack.push(LexerState::Operator);
1041 self.state = LexerState::KindTest;
1042 return Ok(());
1043 }
1044 if self.match_identifier(&["text", "("]) {
1045 self.enqueue(Token::Text, start, self.pos - 1);
1046 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1047 self.state_stack.push(LexerState::Operator);
1048 self.state = LexerState::KindTest;
1049 return Ok(());
1050 }
1051 if self.match_identifier(&["node", "("]) {
1052 self.enqueue(Token::Node, start, self.pos - 1);
1053 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1054 self.state_stack.push(LexerState::Operator);
1055 self.state = LexerState::KindTest;
1056 return Ok(());
1057 }
1058 if !self.is_xpath10() && self.match_identifier(&["document-node", "("]) {
1059 self.enqueue(Token::DocumentNode, start, self.pos - 1);
1060 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1061 self.state_stack.push(LexerState::Operator);
1062 self.state = LexerState::KindTest;
1063 return Ok(());
1064 }
1065 if self.match_identifier(&["processing-instruction", "("]) {
1066 self.enqueue(Token::ProcessingInstruction, start, self.pos - 1);
1067 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1068 self.state_stack.push(LexerState::Operator);
1069 self.state = LexerState::KindTestForPi;
1070 return Ok(());
1071 }
1072
1073 if self.match_identifier(&["ancestor-or-self", "::"]) {
1075 self.enqueue(Token::AxisAncestorOrSelf, start, self.pos);
1076 return Ok(());
1077 }
1078 if self.match_identifier(&["ancestor", "::"]) {
1079 self.enqueue(Token::AxisAncestor, start, self.pos);
1080 return Ok(());
1081 }
1082 if self.match_identifier(&["attribute", "::"]) {
1083 self.enqueue(Token::AxisAttribute, start, self.pos);
1084 return Ok(());
1085 }
1086 if self.match_identifier(&["child", "::"]) {
1087 self.enqueue(Token::AxisChild, start, self.pos);
1088 return Ok(());
1089 }
1090 if self.match_identifier(&["descendant-or-self", "::"]) {
1091 self.enqueue(Token::AxisDescendantOrSelf, start, self.pos);
1092 return Ok(());
1093 }
1094 if self.match_identifier(&["descendant", "::"]) {
1095 self.enqueue(Token::AxisDescendant, start, self.pos);
1096 return Ok(());
1097 }
1098 if self.match_identifier(&["following-sibling", "::"]) {
1099 self.enqueue(Token::AxisFollowingSibling, start, self.pos);
1100 return Ok(());
1101 }
1102 if self.match_identifier(&["following", "::"]) {
1103 self.enqueue(Token::AxisFollowing, start, self.pos);
1104 return Ok(());
1105 }
1106 if self.match_identifier(&["parent", "::"]) {
1107 self.enqueue(Token::AxisParent, start, self.pos);
1108 return Ok(());
1109 }
1110 if self.match_identifier(&["preceding-sibling", "::"]) {
1111 self.enqueue(Token::AxisPrecedingSibling, start, self.pos);
1112 return Ok(());
1113 }
1114 if self.match_identifier(&["preceding", "::"]) {
1115 self.enqueue(Token::AxisPreceding, start, self.pos);
1116 return Ok(());
1117 }
1118 if self.match_identifier(&["self", "::"]) {
1119 self.enqueue(Token::AxisSelf, start, self.pos);
1120 return Ok(());
1121 }
1122 if self.match_identifier(&["namespace", "::"]) {
1123 self.enqueue(Token::AxisNamespace, start, self.pos);
1124 return Ok(());
1125 }
1126
1127 let name = self.consume_qname();
1129 let end = self.pos;
1130
1131 if name.contains(':') && self.current() == Some('*') {
1133 }
1136
1137 self.skip_whitespace_and_comments();
1138
1139 if self.current() != Some('(') {
1141 self.state = LexerState::Operator;
1142 }
1143
1144 self.enqueue(Token::QName(name), start, end);
1145
1146 Ok(())
1147 }
1148
1149 fn varname_state(&mut self) -> Result<(), LexerError> {
1151 self.skip_whitespace_and_comments();
1152 let start = self.pos;
1153
1154 if self.current().map(Self::is_ncname_start).unwrap_or(false) {
1155 let first = self.consume_ncname();
1156 let prefix;
1157 let local;
1158
1159 if self.current() == Some(':')
1160 && self.peek(1).map(Self::is_ncname_start).unwrap_or(false)
1161 {
1162 self.advance(1);
1163 prefix = first;
1164 local = self.consume_ncname();
1165 } else {
1166 prefix = String::new();
1167 local = first;
1168 }
1169
1170 self.enqueue(Token::VarName { prefix, local }, start, self.pos);
1171 self.state = LexerState::Operator;
1172 }
1173
1174 Ok(())
1175 }
1176
1177 fn operator_state(&mut self) -> Result<(), LexerError> {
1179 self.skip_whitespace_and_comments();
1180 let start = self.pos;
1181
1182 match self.current() {
1183 None => {
1184 self.enqueue(Token::Eof, start, start);
1185 }
1186
1187 Some(c @ (',' | '=' | '+' | '-' | '[' | '|')) => {
1188 self.advance(1);
1189 let token = match c {
1190 ',' => Token::Comma,
1191 '=' => Token::Equals,
1192 '+' => Token::Plus,
1193 '-' => Token::Minus,
1194 '[' => {
1195 self.state_stack.push(self.state);
1196 Token::LBracket
1197 }
1198 '|' => Token::Pipe,
1199 _ => unreachable!(),
1200 };
1201 self.enqueue(token, start, self.pos);
1202 self.state = LexerState::Default;
1203 }
1204
1205 Some('*') => {
1206 self.advance(1);
1207 self.enqueue(Token::Star, start, self.pos);
1208 self.state = LexerState::Default;
1209 }
1210
1211 Some('!') if self.peek(1) == Some('=') => {
1212 self.advance(2);
1213 self.enqueue(Token::NotEquals, start, self.pos);
1214 self.state = LexerState::Default;
1215 }
1216
1217 Some('>') => {
1218 if self.peek(1) == Some('=') {
1219 self.advance(2);
1220 self.enqueue(Token::GreaterEquals, start, self.pos);
1221 } else if !self.is_xpath10() && self.peek(1) == Some('>') {
1222 self.advance(2);
1223 self.enqueue(Token::DoubleGreater, start, self.pos);
1224 } else {
1225 self.advance(1);
1226 self.enqueue(Token::GreaterThan, start, self.pos);
1227 }
1228 self.state = LexerState::Default;
1229 }
1230
1231 Some('<') => {
1232 if self.peek(1) == Some('=') {
1233 self.advance(2);
1234 self.enqueue(Token::LessEquals, start, self.pos);
1235 } else if !self.is_xpath10() && self.peek(1) == Some('<') {
1236 self.advance(2);
1237 self.enqueue(Token::DoubleLess, start, self.pos);
1238 } else {
1239 self.advance(1);
1240 self.enqueue(Token::LessThan, start, self.pos);
1241 }
1242 self.state = LexerState::Default;
1243 }
1244
1245 Some('/') => {
1246 if self.peek(1) == Some('/') {
1247 self.advance(2);
1248 self.enqueue(Token::DoubleSlash, start, self.pos);
1249 self.state = LexerState::Default;
1250 } else {
1251 self.advance(1);
1252 if self.can_start_relative_path() {
1254 self.enqueue(Token::Slash, start, self.pos);
1255 self.state = LexerState::Default;
1256 } else {
1257 self.enqueue(Token::SlashOnly, start, self.pos);
1260 }
1262 }
1263 }
1264
1265 Some(')') => {
1266 self.advance(1);
1267 self.enqueue(Token::RParen, start, self.pos);
1268 }
1269
1270 Some('?') => {
1271 self.advance(1);
1272 self.enqueue(Token::Question, start, self.pos);
1273 }
1274
1275 Some(']') => {
1276 self.advance(1);
1277 self.enqueue(Token::RBracket, start, self.pos);
1278 if let Some(s) = self.state_stack.pop() {
1279 self.state = s;
1280 }
1281 }
1282
1283 Some('$') => {
1284 self.advance(1);
1285 self.enqueue(Token::Dollar, start, self.pos);
1286 self.state = LexerState::VarName;
1287 }
1288
1289 Some('"') | Some('\'') => {
1290 let (tok, s, e) = self.consume_string()?;
1291 self.enqueue(tok, s, e);
1292 }
1293
1294 Some(c) if Self::is_ncname_start(c) => {
1295 self.process_keyword_in_operator_state(start)?;
1296 }
1297
1298 Some(c) => {
1299 return Err(LexerError {
1300 message: format!("Unexpected character in operator context: '{}'", c),
1301 position: start,
1302 });
1303 }
1304 }
1305
1306 Ok(())
1307 }
1308
1309 fn process_keyword_in_operator_state(&mut self, start: usize) -> Result<(), LexerError> {
1311 if !self.is_xpath10() {
1313 if self.match_identifier(&["castable", "as"]) {
1314 self.enqueue(Token::CastableAs, start, self.pos);
1315 self.state = LexerState::SingleType;
1316 return Ok(());
1317 }
1318 if self.match_identifier(&["cast", "as"]) {
1319 self.enqueue(Token::CastAs, start, self.pos);
1320 self.state = LexerState::SingleType;
1321 return Ok(());
1322 }
1323 if self.match_identifier(&["instance", "of"]) {
1324 self.enqueue(Token::InstanceOf, start, self.pos);
1325 self.state = LexerState::ItemType;
1326 return Ok(());
1327 }
1328 if self.match_identifier(&["treat", "as"]) {
1329 self.enqueue(Token::TreatAs, start, self.pos);
1330 self.state = LexerState::ItemType;
1331 return Ok(());
1332 }
1333 }
1334
1335 let shared_keywords: &[(&str, Token, LexerState)] = &[
1337 ("and", Token::And, LexerState::Default),
1338 ("or", Token::Or, LexerState::Default),
1339 ("div", Token::Div, LexerState::Default),
1340 ("mod", Token::Mod, LexerState::Default),
1341 ];
1342
1343 for (kw, tok, next_state) in shared_keywords {
1344 if self.match_identifier(&[kw]) {
1345 self.enqueue(tok.clone(), start, self.pos);
1346 self.state = *next_state;
1347 return Ok(());
1348 }
1349 }
1350
1351 if !self.is_xpath10() {
1353 let xpath20_keywords: &[(&str, Token, LexerState)] = &[
1354 ("then", Token::Then, LexerState::Default),
1355 ("else", Token::Else, LexerState::Default),
1356 ("idiv", Token::IDiv, LexerState::Default),
1357 ("except", Token::Except, LexerState::Default),
1358 ("intersect", Token::Intersect, LexerState::Default),
1359 ("union", Token::Union, LexerState::Default),
1360 ("return", Token::Return, LexerState::Default),
1361 ("satisfies", Token::Satisfies, LexerState::Default),
1362 ("to", Token::To, LexerState::Default),
1363 ("in", Token::In, LexerState::Default),
1364 ("is", Token::Is, LexerState::Default),
1365 ("eq", Token::Eq, LexerState::Default),
1366 ("ne", Token::Ne, LexerState::Default),
1367 ("lt", Token::Lt, LexerState::Default),
1368 ("le", Token::Le, LexerState::Default),
1369 ("gt", Token::Gt, LexerState::Default),
1370 ("ge", Token::Ge, LexerState::Default),
1371 ];
1372
1373 for (kw, tok, next_state) in xpath20_keywords {
1374 if self.match_identifier(&[kw]) {
1375 self.enqueue(tok.clone(), start, self.pos);
1376 self.state = *next_state;
1377 return Ok(());
1378 }
1379 }
1380 }
1381
1382 if !self.is_xpath10() && self.try_match_identifier(&["for"]) {
1384 self.match_identifier(&["for"]);
1385 self.enqueue(Token::For, start, self.pos);
1386 self.skip_whitespace_and_comments();
1387 if self.current() == Some('$') {
1388 let dollar_start = self.pos;
1389 self.advance(1);
1390 self.enqueue(Token::Dollar, dollar_start, self.pos);
1391 self.state = LexerState::VarName;
1392 } else {
1393 self.state = LexerState::Default;
1394 }
1395 return Ok(());
1396 }
1397
1398 if self.is_xpath10() {
1400 let name = self.consume_qname();
1401 let end = self.pos;
1402 self.skip_whitespace_and_comments();
1403 if self.current() != Some('(') {
1405 self.state = LexerState::Operator;
1406 } else {
1407 self.state = LexerState::Default;
1408 }
1409 self.enqueue(Token::QName(name), start, end);
1410 return Ok(());
1411 }
1412
1413 let name = self.consume_qname();
1415 Err(LexerError {
1416 message: format!("Unexpected identifier in operator context: '{}'", name),
1417 position: start,
1418 })
1419 }
1420
1421 fn single_type_state(&mut self) -> Result<(), LexerError> {
1423 self.skip_whitespace_and_comments();
1424 let start = self.pos;
1425
1426 if self.current().map(Self::is_ncname_start).unwrap_or(false) {
1427 let qname = self.consume_qname();
1428 self.enqueue(Token::QName(qname), start, self.pos);
1429 self.state = LexerState::OccurrenceIndicator;
1431 }
1432
1433 Ok(())
1434 }
1435
1436 fn item_type_state(&mut self) -> Result<(), LexerError> {
1438 self.skip_whitespace_and_comments();
1439 let start = self.pos;
1440
1441 match self.current() {
1442 None => {
1443 self.enqueue(Token::Eof, start, start);
1444 }
1445
1446 Some('$') => {
1447 self.advance(1);
1448 self.enqueue(Token::Dollar, start, self.pos);
1449 self.state = LexerState::VarName;
1450 }
1451
1452 Some(')') => {
1453 self.advance(1);
1454 self.enqueue(Token::RParen, start, self.pos);
1455 }
1456
1457 Some(c) if Self::is_ncname_start(c) => {
1458 if self.match_identifier(&["empty-sequence", "(", ")"]) {
1460 self.enqueue(Token::EmptySequence, start, self.pos);
1461 self.state = LexerState::Operator;
1462 return Ok(());
1463 }
1464 if self.match_identifier(&["item", "(", ")"]) {
1465 self.enqueue(Token::Item, start, self.pos);
1466 self.state = LexerState::OccurrenceIndicator;
1467 return Ok(());
1468 }
1469
1470 if self.match_identifier(&["element", "("]) {
1472 self.enqueue(Token::Element, start, self.pos - 1);
1473 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1474 self.state_stack.push(LexerState::OccurrenceIndicator);
1475 self.state = LexerState::KindTest;
1476 return Ok(());
1477 }
1478 if self.match_identifier(&["attribute", "("]) {
1479 self.enqueue(Token::Attribute, start, self.pos - 1);
1480 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1481 self.state_stack.push(LexerState::OccurrenceIndicator);
1482 self.state = LexerState::KindTest;
1483 return Ok(());
1484 }
1485 if self.match_identifier(&["schema-element", "("]) {
1486 self.enqueue(Token::SchemaElement, start, self.pos - 1);
1487 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1488 self.state_stack.push(LexerState::OccurrenceIndicator);
1489 self.state = LexerState::KindTest;
1490 return Ok(());
1491 }
1492 if self.match_identifier(&["schema-attribute", "("]) {
1493 self.enqueue(Token::SchemaAttribute, start, self.pos - 1);
1494 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1495 self.state_stack.push(LexerState::OccurrenceIndicator);
1496 self.state = LexerState::KindTest;
1497 return Ok(());
1498 }
1499 if self.match_identifier(&["comment", "("]) {
1500 self.enqueue(Token::Comment, start, self.pos - 1);
1501 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1502 self.state_stack.push(LexerState::OccurrenceIndicator);
1503 self.state = LexerState::KindTest;
1504 return Ok(());
1505 }
1506 if self.match_identifier(&["text", "("]) {
1507 self.enqueue(Token::Text, start, self.pos - 1);
1508 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1509 self.state_stack.push(LexerState::OccurrenceIndicator);
1510 self.state = LexerState::KindTest;
1511 return Ok(());
1512 }
1513 if self.match_identifier(&["node", "("]) {
1514 self.enqueue(Token::Node, start, self.pos - 1);
1515 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1516 self.state_stack.push(LexerState::OccurrenceIndicator);
1517 self.state = LexerState::KindTest;
1518 return Ok(());
1519 }
1520 if self.match_identifier(&["document-node", "("]) {
1521 self.enqueue(Token::DocumentNode, start, self.pos - 1);
1522 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1523 self.state_stack.push(LexerState::OccurrenceIndicator);
1524 self.state = LexerState::KindTest;
1525 return Ok(());
1526 }
1527 if self.match_identifier(&["processing-instruction", "("]) {
1528 self.enqueue(Token::ProcessingInstruction, start, self.pos - 1);
1529 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1530 self.state_stack.push(LexerState::OccurrenceIndicator);
1531 self.state = LexerState::KindTestForPi;
1532 return Ok(());
1533 }
1534
1535 let qname = self.consume_qname();
1537 self.enqueue(Token::QName(qname), start, self.pos);
1538 self.state = LexerState::OccurrenceIndicator;
1539 }
1540
1541 _ => {}
1542 }
1543
1544 Ok(())
1545 }
1546
1547 fn kind_test_state(&mut self) -> Result<(), LexerError> {
1549 self.skip_whitespace_and_comments();
1550 let start = self.pos;
1551
1552 match self.current() {
1553 None => {}
1554
1555 Some(')') => {
1556 self.advance(1);
1557 self.enqueue(Token::RParen, start, self.pos);
1558 if let Some(s) = self.state_stack.pop() {
1559 self.state = s;
1560 }
1561 }
1562
1563 Some('*') => {
1564 self.advance(1);
1565 self.enqueue(Token::Star, start, self.pos);
1566 self.state = LexerState::CloseKindTest;
1567 }
1568
1569 Some(c) if Self::is_ncname_start(c) => {
1570 if self.match_identifier(&["element", "("]) {
1572 self.enqueue(Token::Element, start, self.pos - 1);
1573 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1574 self.state_stack.push(LexerState::KindTest);
1575 return Ok(());
1576 }
1577 if self.match_identifier(&["schema-element", "("]) {
1578 self.enqueue(Token::SchemaElement, start, self.pos - 1);
1579 self.enqueue(Token::LParen, self.pos - 1, self.pos);
1580 self.state_stack.push(LexerState::KindTest);
1581 return Ok(());
1582 }
1583
1584 let qname = self.consume_qname();
1585 self.enqueue(Token::QName(qname), start, self.pos);
1586 self.state = LexerState::CloseKindTest;
1587 }
1588
1589 _ => {}
1590 }
1591
1592 Ok(())
1593 }
1594
1595 fn kind_test_for_pi_state(&mut self) -> Result<(), LexerError> {
1597 self.skip_whitespace_and_comments();
1598 let start = self.pos;
1599
1600 match self.current() {
1601 Some(')') => {
1602 self.advance(1);
1603 self.enqueue(Token::RParen, start, self.pos);
1604 if let Some(s) = self.state_stack.pop() {
1605 self.state = s;
1606 }
1607 }
1608
1609 Some(c) if Self::is_ncname_start(c) => {
1610 let ncname = self.consume_ncname();
1611 self.enqueue(Token::NCName(ncname), start, self.pos);
1612 }
1613
1614 Some('"') | Some('\'') => {
1615 let (tok, s, e) = self.consume_string()?;
1616 self.enqueue(tok, s, e);
1617 }
1618
1619 _ => {}
1620 }
1621
1622 Ok(())
1623 }
1624
1625 fn close_kind_test_state(&mut self) -> Result<(), LexerError> {
1627 self.skip_whitespace_and_comments();
1628 let start = self.pos;
1629
1630 match self.current() {
1631 Some(')') => {
1632 self.advance(1);
1633 self.enqueue(Token::RParen, start, self.pos);
1634 if let Some(s) = self.state_stack.pop() {
1635 self.state = s;
1636 }
1637 }
1638
1639 Some(',') => {
1640 self.advance(1);
1641 self.enqueue(Token::Comma, start, self.pos);
1642 self.state = LexerState::TypeNameInKindTest;
1644 }
1645
1646 _ => {}
1647 }
1648
1649 Ok(())
1650 }
1651
1652 fn type_name_in_kind_test_state(&mut self) -> Result<(), LexerError> {
1654 self.skip_whitespace_and_comments();
1655 let start = self.pos;
1656
1657 match self.current() {
1658 Some(')') => {
1659 self.advance(1);
1660 self.enqueue(Token::RParen, start, self.pos);
1661 if let Some(s) = self.state_stack.pop() {
1662 self.state = s;
1663 }
1664 }
1665
1666 Some(c) if Self::is_ncname_start(c) => {
1667 let qname = self.consume_qname();
1668 let qname_end = self.pos;
1669
1670 self.skip_whitespace_and_comments();
1672 if self.current() == Some('?') {
1673 self.advance(1);
1674 self.enqueue(Token::QNameNillable(qname), start, self.pos);
1675 } else {
1676 self.enqueue(Token::QName(qname), start, qname_end);
1677 }
1678 self.state = LexerState::CloseKindTest;
1679 }
1680
1681 _ => {}
1682 }
1683
1684 Ok(())
1685 }
1686
1687 fn occurrence_indicator_state(&mut self) -> Result<(), LexerError> {
1689 self.skip_whitespace_and_comments();
1690 let start = self.pos;
1691
1692 match self.current() {
1693 Some('?') => {
1694 self.advance(1);
1695 self.enqueue(Token::OccurrenceZeroOrOne, start, self.pos);
1696 }
1697 Some('+') => {
1698 self.advance(1);
1699 self.enqueue(Token::OccurrenceOneOrMore, start, self.pos);
1700 }
1701 Some('*') => {
1702 self.advance(1);
1703 self.enqueue(Token::OccurrenceZeroOrMore, start, self.pos);
1704 }
1705 _ => {}
1706 }
1707
1708 self.state = LexerState::Operator;
1709 self.operator_state()?;
1711
1712 Ok(())
1713 }
1714
1715 fn enter_state(&mut self) -> Result<(), LexerError> {
1717 match self.state {
1718 LexerState::Default => self.default_state(),
1719 LexerState::Operator => self.operator_state(),
1720 LexerState::VarName => self.varname_state(),
1721 LexerState::SingleType => self.single_type_state(),
1722 LexerState::ItemType => self.item_type_state(),
1723 LexerState::KindTest => self.kind_test_state(),
1724 LexerState::KindTestForPi => self.kind_test_for_pi_state(),
1725 LexerState::CloseKindTest => self.close_kind_test_state(),
1726 LexerState::TypeNameInKindTest => self.type_name_in_kind_test_state(),
1727 LexerState::OccurrenceIndicator => self.occurrence_indicator_state(),
1728 }
1729 }
1730}
1731
1732impl<'input> Iterator for Lexer<'input> {
1733 type Item = Result<Spanned, LexerError>;
1734
1735 fn next(&mut self) -> Option<Self::Item> {
1736 if self.finished {
1737 return None;
1738 }
1739
1740 if let Some(spanned) = self.token_queue.pop_front() {
1742 if spanned.1 == Token::Eof {
1743 self.finished = true;
1744 }
1745 return Some(Ok(spanned));
1746 }
1747
1748 if let Err(e) = self.enter_state() {
1750 self.finished = true;
1751 return Some(Err(e));
1752 }
1753
1754 if let Some(spanned) = self.token_queue.pop_front() {
1756 if spanned.1 == Token::Eof {
1757 self.finished = true;
1758 }
1759 Some(Ok(spanned))
1760 } else {
1761 self.finished = true;
1762 None
1763 }
1764 }
1765}
1766
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the tokens of a lexer run, panicking on the first lexer error
    /// and leaving out `Eof`.
    fn collect_tokens(lexer: impl Iterator<Item = Result<Spanned, LexerError>>) -> Vec<Token> {
        let mut tokens = Vec::new();
        for item in lexer {
            let token = item.unwrap().1;
            if token != Token::Eof {
                tokens.push(token);
            }
        }
        tokens
    }

    /// Like `collect_tokens`, but silently drops lexer errors.
    fn collect_tokens_lenient(
        lexer: impl Iterator<Item = Result<Spanned, LexerError>>,
    ) -> Vec<Token> {
        lexer
            .filter_map(|r| r.ok().map(|t| t.1))
            .filter(|t| *t != Token::Eof)
            .collect()
    }

    /// Tokenizes `input` with `Lexer::new`.
    fn tokenize(input: &str) -> Vec<Token> {
        collect_tokens(Lexer::new(input))
    }

    /// Tokenizes `input` in XPath 1.0 mode.
    fn tokenize_10(input: &str) -> Vec<Token> {
        collect_tokens(Lexer::new_with_mode(input, XPathMode::XPath10))
    }

    /// Shorthand for `Token::QName`.
    fn qname(name: &str) -> Token {
        Token::QName(name.to_string())
    }

    /// Shorthand for `Token::IntegerLiteral`.
    fn int(digits: &str) -> Token {
        Token::IntegerLiteral(digits.to_string())
    }

    /// Shorthand for `Token::VarName`.
    fn var(prefix: &str, local: &str) -> Token {
        Token::VarName {
            prefix: prefix.to_string(),
            local: local.to_string(),
        }
    }

    #[test]
    fn test_simple_number() {
        assert_eq!(tokenize("42"), vec![int("42")]);
        assert_eq!(
            tokenize("2.5"),
            vec![Token::DecimalLiteral("2.5".to_string())]
        );
        assert_eq!(
            tokenize("1e10"),
            vec![Token::DoubleLiteral("1e10".to_string())]
        );
    }

    #[test]
    fn test_string_literal() {
        let cases = [
            ("'hello'", "hello"),
            ("\"world\"", "world"),
            ("'it''s'", "it's"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(
                tokenize(input),
                vec![Token::StringLiteral(expected.to_string())]
            );
        }
    }

    #[test]
    fn test_variable() {
        assert_eq!(tokenize("$x"), vec![Token::Dollar, var("", "x")]);
        assert_eq!(tokenize("$ns:var"), vec![Token::Dollar, var("ns", "var")]);
    }

    #[test]
    fn test_axis_specifiers() {
        assert_eq!(tokenize("child::"), vec![Token::AxisChild]);
        assert_eq!(
            tokenize("ancestor-or-self::"),
            vec![Token::AxisAncestorOrSelf]
        );
        assert_eq!(
            tokenize("descendant-or-self::"),
            vec![Token::AxisDescendantOrSelf]
        );
    }

    #[test]
    fn test_path_expression() {
        let expected = vec![Token::Slash, qname("a"), Token::Slash, qname("b")];
        assert_eq!(tokenize("/a/b"), expected);
    }

    #[test]
    fn test_double_slash() {
        assert_eq!(tokenize("//a"), vec![Token::DoubleSlash, qname("a")]);
    }

    #[test]
    fn test_if_expression() {
        assert_eq!(tokenize("if ("), vec![Token::If, Token::LParen]);
    }

    #[test]
    fn test_for_expression() {
        assert_eq!(
            tokenize("for $x"),
            vec![Token::For, Token::Dollar, var("", "x")]
        );
    }

    #[test]
    fn test_comparison_operators() {
        assert_eq!(tokenize("1 eq 2"), vec![int("1"), Token::Eq, int("2")]);
    }

    #[test]
    fn test_instance_of() {
        let expected = vec![
            Token::Dollar,
            var("", "x"),
            Token::InstanceOf,
            qname("xs:integer"),
        ];
        assert_eq!(tokenize("$x instance of xs:integer"), expected);
    }

    #[test]
    fn test_cast_as() {
        let expected = vec![
            Token::Dollar,
            var("", "x"),
            Token::CastAs,
            qname("xs:string"),
        ];
        assert_eq!(tokenize("$x cast as xs:string"), expected);
    }

    #[test]
    fn test_kind_test() {
        assert_eq!(
            tokenize("node()"),
            vec![Token::Node, Token::LParen, Token::RParen]
        );
    }

    #[test]
    fn test_element_test() {
        let expected = vec![Token::Element, Token::LParen, qname("foo"), Token::RParen];
        assert_eq!(tokenize("element(foo)"), expected);
    }

    #[test]
    fn test_element_test_with_type() {
        let expected = vec![
            Token::Element,
            Token::LParen,
            qname("foo"),
            Token::Comma,
            qname("xs:string"),
            Token::RParen,
        ];
        assert_eq!(tokenize("element(foo, xs:string)"), expected);
    }

    #[test]
    fn test_element_test_with_nillable_type() {
        let expected = vec![
            Token::Element,
            Token::LParen,
            qname("foo"),
            Token::Comma,
            Token::QNameNillable("xs:string".to_string()),
            Token::RParen,
        ];
        assert_eq!(tokenize("element(foo, xs:string?)"), expected);
    }

    #[test]
    fn test_element_test_nillable_with_whitespace() {
        let expected = vec![
            Token::Element,
            Token::LParen,
            qname("foo"),
            Token::Comma,
            Token::QNameNillable("xs:string".to_string()),
            Token::RParen,
        ];
        assert_eq!(tokenize("element(foo, xs:string ?)"), expected);
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            tokenize("1 (: comment :) + 2"),
            vec![int("1"), Token::Plus, int("2")]
        );
    }

    #[test]
    fn test_nested_comments() {
        assert_eq!(
            tokenize("1 (: outer (: inner :) outer :) + 2"),
            vec![int("1"), Token::Plus, int("2")]
        );
    }

    #[test]
    fn test_arithmetic() {
        let expected = vec![int("1"), Token::Plus, int("2"), Token::Star, int("3")];
        assert_eq!(tokenize("1 + 2 * 3"), expected);
    }

    #[test]
    fn test_predicates() {
        let expected = vec![qname("a"), Token::LBracket, int("1"), Token::RBracket];
        assert_eq!(tokenize("a[1]"), expected);
    }

    #[test]
    fn test_double_period() {
        assert_eq!(tokenize(".."), vec![Token::DoublePeriod]);
    }

    #[test]
    fn test_context_item() {
        assert_eq!(tokenize("."), vec![Token::Dot]);
    }

    #[test]
    fn test_slash_only() {
        assert_eq!(tokenize("/"), vec![Token::SlashOnly]);
    }

    #[test]
    fn test_slash_only_with_trailing_whitespace() {
        assert_eq!(tokenize("/ "), vec![Token::SlashOnly]);
    }

    #[test]
    fn test_slash_only_with_comment() {
        assert_eq!(tokenize("/ (: comment :)"), vec![Token::SlashOnly]);
    }

    #[test]
    fn test_slash_with_path() {
        assert_eq!(tokenize("/a"), vec![Token::Slash, qname("a")]);
    }

    #[test]
    fn test_slash_with_whitespace_then_path() {
        assert_eq!(tokenize("/ a"), vec![Token::Slash, qname("a")]);
    }

    #[test]
    fn test_slash_with_comment_then_path() {
        assert_eq!(
            tokenize("/ (: comment :) a"),
            vec![Token::Slash, qname("a")]
        );
    }

    #[test]
    fn test_attribute_shorthand() {
        assert_eq!(tokenize("@id"), vec![Token::At, qname("id")]);
    }

    #[test]
    fn test_wildcard() {
        assert_eq!(tokenize("*"), vec![Token::Star]);
    }

    #[test]
    fn test_namespace_wildcard() {
        let expected = vec![
            Token::Star,
            Token::Colon,
            Token::NCName("local".to_string()),
        ];
        assert_eq!(tokenize("*:local"), expected);
    }

    #[test]
    fn test_occurrence_indicators() {
        let tokens = tokenize("$x instance of xs:integer?");
        assert!(tokens.contains(&Token::OccurrenceZeroOrOne));
    }

    #[test]
    fn test_xpath10_if_as_element_name() {
        assert_eq!(tokenize_10("//if"), vec![Token::DoubleSlash, qname("if")]);
    }

    #[test]
    fn test_xpath10_for_as_element_name() {
        assert_eq!(
            tokenize_10("//for"),
            vec![Token::DoubleSlash, qname("for")]
        );
    }

    #[test]
    fn test_xpath10_every_as_element_name() {
        assert_eq!(
            tokenize_10("//every"),
            vec![Token::DoubleSlash, qname("every")]
        );
    }

    #[test]
    fn test_xpath10_some_as_element_name() {
        assert_eq!(
            tokenize_10("//some"),
            vec![Token::DoubleSlash, qname("some")]
        );
    }

    #[test]
    fn test_xpath10_xpath20_keywords_as_qnames() {
        let xpath20_keywords = [
            "then",
            "else",
            "return",
            "to",
            "union",
            "except",
            "intersect",
            "eq",
            "ne",
            "lt",
            "le",
            "gt",
            "ge",
            "is",
            "idiv",
            "satisfies",
            "in",
        ];
        for name in xpath20_keywords.iter() {
            let input = format!("a/{}", name);
            assert_eq!(
                tokenize_10(&input),
                vec![qname("a"), Token::Slash, qname(name)],
                "Failed for keyword: {}",
                name
            );
        }
    }

    #[test]
    fn test_xpath10_unary_minus() {
        assert_eq!(tokenize_10("-a"), vec![Token::Minus10, qname("a")]);
    }

    #[test]
    fn test_xpath10_unary_plus() {
        assert_eq!(tokenize_10("+a"), vec![Token::Plus10, qname("a")]);
    }

    #[test]
    fn test_xpath20_unary_minus() {
        assert_eq!(tokenize("-a"), vec![Token::Minus20, qname("a")]);
    }

    #[test]
    fn test_xpath10_binary_minus() {
        assert_eq!(
            tokenize_10("a - b"),
            vec![qname("a"), Token::Minus, qname("b")]
        );
    }

    #[test]
    fn test_xpath10_binary_plus() {
        assert_eq!(
            tokenize_10("a + b"),
            vec![qname("a"), Token::Plus, qname("b")]
        );
    }

    #[test]
    fn test_xpath10_and_or_div_mod() {
        let operators = [
            ("a and b", Token::And),
            ("a or b", Token::Or),
            ("a div b", Token::Div),
            ("a mod b", Token::Mod),
        ];
        for (input, operator) in operators.iter() {
            assert_eq!(
                tokenize_10(input),
                vec![qname("a"), operator.clone(), qname("b")]
            );
        }
    }

    #[test]
    fn test_xpath10_node_kind_test() {
        assert_eq!(
            tokenize_10("node()"),
            vec![Token::Node, Token::LParen, Token::RParen]
        );
    }

    #[test]
    fn test_xpath10_comment_kind_test() {
        assert_eq!(
            tokenize_10("comment()"),
            vec![Token::Comment, Token::LParen, Token::RParen]
        );
    }

    #[test]
    fn test_xpath10_text_kind_test() {
        assert_eq!(
            tokenize_10("text()"),
            vec![Token::Text, Token::LParen, Token::RParen]
        );
    }

    #[test]
    fn test_xpath10_processing_instruction_kind_test() {
        assert_eq!(
            tokenize_10("processing-instruction()"),
            vec![Token::ProcessingInstruction, Token::LParen, Token::RParen]
        );
    }

    #[test]
    fn test_xpath10_element_as_function() {
        let expected = vec![
            qname("element"),
            Token::LParen,
            qname("foo"),
            Token::RParen,
        ];
        assert_eq!(tokenize_10("element(foo)"), expected);
    }

    #[test]
    fn test_xpath10_axis_specifiers() {
        assert_eq!(
            tokenize_10("child::*"),
            vec![Token::AxisChild, Token::Star]
        );
    }

    #[test]
    fn test_xpath10_ancestor_or_self_axis() {
        let expected = vec![
            Token::AxisAncestorOrSelf,
            Token::Node,
            Token::LParen,
            Token::RParen,
        ];
        assert_eq!(tokenize_10("ancestor-or-self::node()"), expected);
    }

    #[test]
    fn test_xpath10_no_double_less() {
        let tokens = collect_tokens_lenient(Lexer::new_with_mode("a<<b", XPathMode::XPath10));
        assert!(!tokens.contains(&Token::DoubleLess));
    }

    #[test]
    fn test_xpath10_no_double_greater() {
        let tokens = collect_tokens_lenient(Lexer::new_with_mode("a>>b", XPathMode::XPath10));
        assert!(!tokens.contains(&Token::DoubleGreater));
    }

    #[test]
    fn test_xpath20_double_less_still_works() {
        assert!(tokenize("a<<b").contains(&Token::DoubleLess));
    }

    #[test]
    fn test_xpath20_double_greater_still_works() {
        assert!(tokenize("a>>b").contains(&Token::DoubleGreater));
    }
}