use std::collections::VecDeque;
use std::fmt;
use super::XPathMode;
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
StringLiteral(String),
IntegerLiteral(String),
DecimalLiteral(String),
DoubleLiteral(String),
NCName(String),
QName(String),
QNameNillable(String),
VarName {
prefix: String,
local: String,
},
For,
In,
Return,
If,
Then,
Else,
Some,
Every,
Satisfies,
And,
Or,
To,
Div,
IDiv,
Mod,
Union,
Except,
Intersect,
InstanceOf,
TreatAs,
CastAs,
CastableAs,
Element,
Attribute,
Text,
Comment,
Node,
DocumentNode,
ProcessingInstruction,
SchemaElement,
SchemaAttribute,
Item,
EmptySequence,
AxisChild,
AxisDescendant,
AxisAttribute,
AxisSelf,
AxisDescendantOrSelf,
AxisFollowingSibling,
AxisFollowing,
AxisParent,
AxisAncestor,
AxisPrecedingSibling,
AxisPreceding,
AxisAncestorOrSelf,
AxisNamespace,
Eq,
Ne,
Lt,
Le,
Gt,
Ge,
Is,
OccurrenceZeroOrOne, OccurrenceOneOrMore, OccurrenceZeroOrMore,
DoublePeriod, DoubleSlash, NotEquals, LessEquals, GreaterEquals, DoubleLess, DoubleGreater,
LParen,
RParen,
LBracket,
RBracket,
Comma,
Colon,
At,
Dollar,
Slash,
SlashOnly,
Pipe,
Plus,
Minus,
Minus10, Plus10, Minus20, Plus20, Star,
Equals,
LessThan,
GreaterThan,
Question,
Dot,
Eof,
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Token::StringLiteral(s) => write!(f, "\"{}\"", s),
Token::IntegerLiteral(s) => write!(f, "{}", s),
Token::DecimalLiteral(s) => write!(f, "{}", s),
Token::DoubleLiteral(s) => write!(f, "{}", s),
Token::NCName(s) => write!(f, "{}", s),
Token::QName(s) => write!(f, "{}", s),
Token::QNameNillable(s) => write!(f, "{}?", s),
Token::VarName { prefix, local } => {
if prefix.is_empty() {
write!(f, "${}", local)
} else {
write!(f, "${}:{}", prefix, local)
}
}
Token::For => write!(f, "for"),
Token::In => write!(f, "in"),
Token::Return => write!(f, "return"),
Token::If => write!(f, "if"),
Token::Then => write!(f, "then"),
Token::Else => write!(f, "else"),
Token::Some => write!(f, "some"),
Token::Every => write!(f, "every"),
Token::Satisfies => write!(f, "satisfies"),
Token::And => write!(f, "and"),
Token::Or => write!(f, "or"),
Token::To => write!(f, "to"),
Token::Div => write!(f, "div"),
Token::IDiv => write!(f, "idiv"),
Token::Mod => write!(f, "mod"),
Token::Union => write!(f, "union"),
Token::Except => write!(f, "except"),
Token::Intersect => write!(f, "intersect"),
Token::InstanceOf => write!(f, "instance of"),
Token::TreatAs => write!(f, "treat as"),
Token::CastAs => write!(f, "cast as"),
Token::CastableAs => write!(f, "castable as"),
Token::Element => write!(f, "element"),
Token::Attribute => write!(f, "attribute"),
Token::Text => write!(f, "text"),
Token::Comment => write!(f, "comment"),
Token::Node => write!(f, "node"),
Token::DocumentNode => write!(f, "document-node"),
Token::ProcessingInstruction => write!(f, "processing-instruction"),
Token::SchemaElement => write!(f, "schema-element"),
Token::SchemaAttribute => write!(f, "schema-attribute"),
Token::Item => write!(f, "item"),
Token::EmptySequence => write!(f, "empty-sequence"),
Token::AxisChild => write!(f, "child::"),
Token::AxisDescendant => write!(f, "descendant::"),
Token::AxisAttribute => write!(f, "attribute::"),
Token::AxisSelf => write!(f, "self::"),
Token::AxisDescendantOrSelf => write!(f, "descendant-or-self::"),
Token::AxisFollowingSibling => write!(f, "following-sibling::"),
Token::AxisFollowing => write!(f, "following::"),
Token::AxisParent => write!(f, "parent::"),
Token::AxisAncestor => write!(f, "ancestor::"),
Token::AxisPrecedingSibling => write!(f, "preceding-sibling::"),
Token::AxisPreceding => write!(f, "preceding::"),
Token::AxisAncestorOrSelf => write!(f, "ancestor-or-self::"),
Token::AxisNamespace => write!(f, "namespace::"),
Token::Eq => write!(f, "eq"),
Token::Ne => write!(f, "ne"),
Token::Lt => write!(f, "lt"),
Token::Le => write!(f, "le"),
Token::Gt => write!(f, "gt"),
Token::Ge => write!(f, "ge"),
Token::Is => write!(f, "is"),
Token::OccurrenceZeroOrOne => write!(f, "?"),
Token::OccurrenceOneOrMore => write!(f, "+"),
Token::OccurrenceZeroOrMore => write!(f, "*"),
Token::DoublePeriod => write!(f, ".."),
Token::DoubleSlash => write!(f, "//"),
Token::NotEquals => write!(f, "!="),
Token::LessEquals => write!(f, "<="),
Token::GreaterEquals => write!(f, ">="),
Token::DoubleLess => write!(f, "<<"),
Token::DoubleGreater => write!(f, ">>"),
Token::LParen => write!(f, "("),
Token::RParen => write!(f, ")"),
Token::LBracket => write!(f, "["),
Token::RBracket => write!(f, "]"),
Token::Comma => write!(f, ","),
Token::Colon => write!(f, ":"),
Token::At => write!(f, "@"),
Token::Dollar => write!(f, "$"),
Token::Slash => write!(f, "/"),
Token::SlashOnly => write!(f, "/"),
Token::Pipe => write!(f, "|"),
Token::Plus => write!(f, "+"),
Token::Minus => write!(f, "-"),
Token::Minus10 | Token::Minus20 => write!(f, "-"),
Token::Plus10 | Token::Plus20 => write!(f, "+"),
Token::Star => write!(f, "*"),
Token::Equals => write!(f, "="),
Token::LessThan => write!(f, "<"),
Token::GreaterThan => write!(f, ">"),
Token::Question => write!(f, "?"),
Token::Dot => write!(f, "."),
Token::Eof => write!(f, "EOF"),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
pub message: String,
pub position: usize,
}
impl fmt::Display for LexerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"Lexer error at position {}: {}",
self.position, self.message
)
}
}
impl std::error::Error for LexerError {}
pub type Spanned = (usize, Token, usize);
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerState {
Default,
Operator,
SingleType,
ItemType,
KindTest,
KindTestForPi,
CloseKindTest,
TypeNameInKindTest,
OccurrenceIndicator,
VarName,
}
pub struct Lexer<'input> {
#[allow(dead_code)]
input: &'input str,
chars: Vec<char>,
pos: usize,
state: LexerState,
state_stack: Vec<LexerState>,
token_queue: VecDeque<Spanned>,
finished: bool,
mode: XPathMode,
}
impl<'input> Lexer<'input> {
pub fn new(input: &'input str) -> Self {
Self {
input,
chars: input.chars().collect(),
pos: 0,
state: LexerState::Default,
state_stack: Vec::new(),
token_queue: VecDeque::new(),
finished: false,
mode: XPathMode::XPath20,
}
}
pub fn new_with_mode(input: &'input str, mode: XPathMode) -> Self {
Self {
input,
chars: input.chars().collect(),
pos: 0,
state: LexerState::Default,
state_stack: Vec::new(),
token_queue: VecDeque::new(),
finished: false,
mode,
}
}
fn is_xpath10(&self) -> bool {
self.mode == XPathMode::XPath10
}
#[inline]
fn peek(&self, offset: usize) -> Option<char> {
self.chars.get(self.pos + offset).copied()
}
#[inline]
fn current(&self) -> Option<char> {
self.peek(0)
}
#[inline]
fn advance(&mut self, n: usize) {
self.pos += n;
}
#[inline]
fn read(&mut self) -> Option<char> {
let c = self.current();
if c.is_some() {
self.pos += 1;
}
c
}
fn is_ncname_start(c: char) -> bool {
c.is_alphabetic() || c == '_'
}
fn is_ncname_char(c: char) -> bool {
c.is_alphanumeric()
|| c == '_'
|| c == '-'
|| c == '.'
|| c == '\u{B7}'
|| ('\u{0300}'..='\u{036F}').contains(&c)
|| ('\u{203F}'..='\u{2040}').contains(&c)
}
fn is_digit(c: char) -> bool {
c.is_ascii_digit()
}
fn is_whitespace(c: char) -> bool {
matches!(c, ' ' | '\t' | '\r' | '\n')
}
fn can_start_relative_path(&self) -> bool {
let mut i = 0;
loop {
while self.pos + i < self.chars.len() && Self::is_whitespace(self.chars[self.pos + i]) {
i += 1;
}
if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == '('
&& self.chars[self.pos + i + 1] == ':'
{
i += 2;
let mut depth = 1;
while depth > 0 && self.pos + i < self.chars.len() {
if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == '('
&& self.chars[self.pos + i + 1] == ':'
{
i += 2;
depth += 1;
} else if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == ':'
&& self.chars[self.pos + i + 1] == ')'
{
i += 2;
depth -= 1;
} else {
i += 1;
}
}
continue;
}
break;
}
if self.pos + i >= self.chars.len() {
return false; }
let c = self.chars[self.pos + i];
matches!(c,
'a'..='z' | 'A'..='Z' | '_' | '@' | '*' | '.' | '$' | '(' | '"' | '\''
) || c.is_ascii_digit()
|| c.is_alphabetic()
}
fn skip_whitespace_and_comments(&mut self) {
loop {
while let Some(c) = self.current() {
if Self::is_whitespace(c) {
self.advance(1);
} else {
break;
}
}
if self.peek(0) == Some('(') && self.peek(1) == Some(':') {
self.advance(2);
let mut depth = 1;
while depth > 0 {
match (self.peek(0), self.peek(1)) {
(Some('('), Some(':')) => {
self.advance(2);
depth += 1;
}
(Some(':'), Some(')')) => {
self.advance(2);
depth -= 1;
}
(Some(_), _) => {
self.advance(1);
}
(None, _) => break, }
}
continue;
}
break;
}
}
fn match_identifier(&mut self, parts: &[&str]) -> bool {
let start_pos = self.pos;
let mut i = 0;
for part in parts {
while i < self.chars.len() - self.pos {
let idx = self.pos + i;
if idx >= self.chars.len() {
break;
}
let c = self.chars[idx];
if Self::is_whitespace(c) {
i += 1;
continue;
}
if idx + 1 < self.chars.len()
&& self.chars[idx] == '('
&& self.chars[idx + 1] == ':'
{
i += 2;
let mut depth = 1;
while depth > 0 && self.pos + i < self.chars.len() {
if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == '('
&& self.chars[self.pos + i + 1] == ':'
{
i += 2;
depth += 1;
} else if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == ':'
&& self.chars[self.pos + i + 1] == ')'
{
i += 2;
depth -= 1;
} else {
i += 1;
}
}
continue;
}
break;
}
let part_chars: Vec<char> = part.chars().collect();
for (j, &pc) in part_chars.iter().enumerate() {
let idx = self.pos + i + j;
if idx >= self.chars.len() || self.chars[idx] != pc {
self.pos = start_pos;
return false;
}
}
if !part.is_empty() && Self::is_ncname_start(part_chars[0]) {
let after_idx = self.pos + i + part_chars.len();
if after_idx < self.chars.len() && Self::is_ncname_char(self.chars[after_idx]) {
self.pos = start_pos;
return false;
}
}
i += part_chars.len();
}
self.pos += i;
true
}
fn try_match_identifier(&self, parts: &[&str]) -> bool {
let mut i = 0;
for part in parts {
while self.pos + i < self.chars.len() {
let idx = self.pos + i;
let c = self.chars[idx];
if Self::is_whitespace(c) {
i += 1;
continue;
}
if idx + 1 < self.chars.len()
&& self.chars[idx] == '('
&& self.chars[idx + 1] == ':'
{
i += 2;
let mut depth = 1;
while depth > 0 && self.pos + i < self.chars.len() {
if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == '('
&& self.chars[self.pos + i + 1] == ':'
{
i += 2;
depth += 1;
} else if self.pos + i + 1 < self.chars.len()
&& self.chars[self.pos + i] == ':'
&& self.chars[self.pos + i + 1] == ')'
{
i += 2;
depth -= 1;
} else {
i += 1;
}
}
continue;
}
break;
}
let part_chars: Vec<char> = part.chars().collect();
for (j, &pc) in part_chars.iter().enumerate() {
let idx = self.pos + i + j;
if idx >= self.chars.len() || self.chars[idx] != pc {
return false;
}
}
if !part.is_empty() && Self::is_ncname_start(part_chars[0]) {
let after_idx = self.pos + i + part_chars.len();
if after_idx < self.chars.len() && Self::is_ncname_char(self.chars[after_idx]) {
return false;
}
}
i += part_chars.len();
}
true
}
fn consume_ncname(&mut self) -> String {
let start = self.pos;
while let Some(c) = self.current() {
if Self::is_ncname_char(c) {
self.advance(1);
} else {
break;
}
}
self.chars[start..self.pos].iter().collect()
}
fn consume_qname(&mut self) -> String {
let start = self.pos;
while let Some(c) = self.current() {
if (c == ':' && self.peek(1).map(Self::is_ncname_char).unwrap_or(false))
|| Self::is_ncname_char(c)
{
self.advance(1);
} else {
break;
}
}
self.chars[start..self.pos].iter().collect()
}
fn consume_number(&mut self) -> (Token, usize, usize) {
let start = self.pos;
let mut is_decimal = false;
let mut is_double = false;
while let Some(c) = self.current() {
if Self::is_digit(c) {
self.advance(1);
} else {
break;
}
}
if self.current() == Some('.') && self.peek(1).map(Self::is_digit).unwrap_or(false) {
is_decimal = true;
self.advance(1);
while let Some(c) = self.current() {
if Self::is_digit(c) {
self.advance(1);
} else {
break;
}
}
} else if self.current() == Some('.') && start == self.pos {
is_decimal = true;
self.advance(1);
while let Some(c) = self.current() {
if Self::is_digit(c) {
self.advance(1);
} else {
break;
}
}
}
if let Some(c) = self.current() {
if c == 'e' || c == 'E' {
is_double = true;
self.advance(1);
if let Some(sign) = self.current() {
if sign == '+' || sign == '-' {
self.advance(1);
}
}
while let Some(c) = self.current() {
if Self::is_digit(c) {
self.advance(1);
} else {
break;
}
}
}
}
let value: String = self.chars[start..self.pos].iter().collect();
let token = if is_double {
Token::DoubleLiteral(value)
} else if is_decimal {
Token::DecimalLiteral(value)
} else {
Token::IntegerLiteral(value)
};
(token, start, self.pos)
}
fn consume_string(&mut self) -> Result<(Token, usize, usize), LexerError> {
let start = self.pos;
let quote = self.read().unwrap();
let mut value = String::new();
loop {
match self.current() {
None => {
return Err(LexerError {
message: "Unterminated string literal".to_string(),
position: start,
});
}
Some(c) if c == quote => {
self.advance(1);
if self.current() == Some(quote) {
value.push(quote);
self.advance(1);
} else {
break;
}
}
Some(c) => {
value.push(c);
self.advance(1);
}
}
}
let value =
crate::xpath::string_ops::normalize_string_value(&value, false, true).map_err(|e| {
LexerError {
message: format!("{}", e),
position: start,
}
})?;
Ok((Token::StringLiteral(value), start, self.pos))
}
fn enqueue(&mut self, token: Token, start: usize, end: usize) {
self.token_queue.push_back((start, token, end));
}
fn default_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
None => {
self.enqueue(Token::Eof, start, start);
}
Some('.') => {
if self.peek(1) == Some('.') {
self.advance(2);
self.enqueue(Token::DoublePeriod, start, self.pos);
} else if self.peek(1).map(Self::is_digit).unwrap_or(false) {
let (tok, s, e) = self.consume_number();
self.enqueue(tok, s, e);
} else {
self.advance(1);
self.enqueue(Token::Dot, start, self.pos);
}
self.state = LexerState::Operator;
}
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
self.state = LexerState::Operator;
}
Some('*') => {
self.advance(1);
let star_end = self.pos;
if self.current() == Some(':') {
self.advance(1);
if self.current().map(Self::is_ncname_start).unwrap_or(false) {
self.enqueue(Token::Star, start, star_end);
let colon_start = star_end;
self.enqueue(Token::Colon, colon_start, colon_start + 1);
let ncname_start = self.pos;
let ncname = self.consume_ncname();
self.enqueue(Token::NCName(ncname), ncname_start, self.pos);
} else {
self.pos = star_end; self.enqueue(Token::Star, start, star_end);
}
} else {
self.enqueue(Token::Star, start, self.pos);
}
self.state = LexerState::Operator;
}
Some(c @ (';' | ',' | '(' | '-' | '+' | '@' | '~')) => {
self.advance(1);
let token = match c {
';' => Token::Comma, ',' => Token::Comma,
'(' => Token::LParen,
'-' => {
if self.is_xpath10() {
Token::Minus10
} else {
Token::Minus20
}
}
'+' => {
if self.is_xpath10() {
Token::Plus10
} else {
Token::Plus20
}
}
'@' => Token::At,
'~' => {
if self.is_xpath10() {
Token::Minus10
} else {
Token::Minus20
}
}
_ => unreachable!(),
};
self.enqueue(token, start, self.pos);
}
Some('/') => {
if self.peek(1) == Some('/') {
self.advance(2);
self.enqueue(Token::DoubleSlash, start, self.pos);
} else {
self.advance(1);
if self.can_start_relative_path() {
self.enqueue(Token::Slash, start, self.pos);
} else {
self.enqueue(Token::SlashOnly, start, self.pos);
self.state = LexerState::Operator;
}
}
}
Some('$') => {
self.advance(1);
self.enqueue(Token::Dollar, start, self.pos);
self.state = LexerState::VarName;
}
Some('[') => {
self.advance(1);
self.enqueue(Token::LBracket, start, self.pos);
self.state_stack.push(self.state);
}
Some(']') => {
self.advance(1);
self.enqueue(Token::RBracket, start, self.pos);
if let Some(s) = self.state_stack.pop() {
self.state = s;
}
}
Some('"') | Some('\'') => {
let (tok, s, e) = self.consume_string()?;
self.enqueue(tok, s, e);
self.state = LexerState::Operator;
}
Some(c) if Self::is_digit(c) => {
let (tok, s, e) = self.consume_number();
self.enqueue(tok, s, e);
self.state = LexerState::Operator;
}
Some(c) if Self::is_ncname_start(c) => {
self.process_name_in_default_state(start)?;
}
Some(c) => {
return Err(LexerError {
message: format!("Unexpected character: '{}'", c),
position: start,
});
}
}
Ok(())
}
fn process_name_in_default_state(&mut self, start: usize) -> Result<(), LexerError> {
if !self.is_xpath10() && self.match_identifier(&["if", "("]) {
self.enqueue(Token::If, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
return Ok(());
}
if !self.is_xpath10() && self.try_match_identifier(&["for"]) {
self.match_identifier(&["for"]);
self.enqueue(Token::For, start, self.pos);
self.skip_whitespace_and_comments();
if self.current() == Some('$') {
let dollar_start = self.pos;
self.advance(1);
self.enqueue(Token::Dollar, dollar_start, self.pos);
self.state = LexerState::VarName;
}
return Ok(());
}
if !self.is_xpath10() && self.try_match_identifier(&["some"]) {
self.match_identifier(&["some"]);
self.enqueue(Token::Some, start, self.pos);
self.skip_whitespace_and_comments();
if self.current() == Some('$') {
let dollar_start = self.pos;
self.advance(1);
self.enqueue(Token::Dollar, dollar_start, self.pos);
self.state = LexerState::VarName;
}
return Ok(());
}
if !self.is_xpath10() && self.try_match_identifier(&["every"]) {
self.match_identifier(&["every"]);
self.enqueue(Token::Every, start, self.pos);
self.skip_whitespace_and_comments();
if self.current() == Some('$') {
let dollar_start = self.pos;
self.advance(1);
self.enqueue(Token::Dollar, dollar_start, self.pos);
self.state = LexerState::VarName;
}
return Ok(());
}
if !self.is_xpath10() && self.match_identifier(&["element", "("]) {
self.enqueue(Token::Element, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if !self.is_xpath10() && self.match_identifier(&["attribute", "("]) {
self.enqueue(Token::Attribute, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if !self.is_xpath10() && self.match_identifier(&["schema-element", "("]) {
self.enqueue(Token::SchemaElement, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if !self.is_xpath10() && self.match_identifier(&["schema-attribute", "("]) {
self.enqueue(Token::SchemaAttribute, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["comment", "("]) {
self.enqueue(Token::Comment, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["text", "("]) {
self.enqueue(Token::Text, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["node", "("]) {
self.enqueue(Token::Node, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if !self.is_xpath10() && self.match_identifier(&["document-node", "("]) {
self.enqueue(Token::DocumentNode, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["processing-instruction", "("]) {
self.enqueue(Token::ProcessingInstruction, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::Operator);
self.state = LexerState::KindTestForPi;
return Ok(());
}
if self.match_identifier(&["ancestor-or-self", "::"]) {
self.enqueue(Token::AxisAncestorOrSelf, start, self.pos);
return Ok(());
}
if self.match_identifier(&["ancestor", "::"]) {
self.enqueue(Token::AxisAncestor, start, self.pos);
return Ok(());
}
if self.match_identifier(&["attribute", "::"]) {
self.enqueue(Token::AxisAttribute, start, self.pos);
return Ok(());
}
if self.match_identifier(&["child", "::"]) {
self.enqueue(Token::AxisChild, start, self.pos);
return Ok(());
}
if self.match_identifier(&["descendant-or-self", "::"]) {
self.enqueue(Token::AxisDescendantOrSelf, start, self.pos);
return Ok(());
}
if self.match_identifier(&["descendant", "::"]) {
self.enqueue(Token::AxisDescendant, start, self.pos);
return Ok(());
}
if self.match_identifier(&["following-sibling", "::"]) {
self.enqueue(Token::AxisFollowingSibling, start, self.pos);
return Ok(());
}
if self.match_identifier(&["following", "::"]) {
self.enqueue(Token::AxisFollowing, start, self.pos);
return Ok(());
}
if self.match_identifier(&["parent", "::"]) {
self.enqueue(Token::AxisParent, start, self.pos);
return Ok(());
}
if self.match_identifier(&["preceding-sibling", "::"]) {
self.enqueue(Token::AxisPrecedingSibling, start, self.pos);
return Ok(());
}
if self.match_identifier(&["preceding", "::"]) {
self.enqueue(Token::AxisPreceding, start, self.pos);
return Ok(());
}
if self.match_identifier(&["self", "::"]) {
self.enqueue(Token::AxisSelf, start, self.pos);
return Ok(());
}
if self.match_identifier(&["namespace", "::"]) {
self.enqueue(Token::AxisNamespace, start, self.pos);
return Ok(());
}
let name = self.consume_qname();
let end = self.pos;
if name.contains(':') && self.current() == Some('*') {
}
self.skip_whitespace_and_comments();
if self.current() != Some('(') {
self.state = LexerState::Operator;
}
self.enqueue(Token::QName(name), start, end);
Ok(())
}
fn varname_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
if self.current().map(Self::is_ncname_start).unwrap_or(false) {
let first = self.consume_ncname();
let prefix;
let local;
if self.current() == Some(':')
&& self.peek(1).map(Self::is_ncname_start).unwrap_or(false)
{
self.advance(1);
prefix = first;
local = self.consume_ncname();
} else {
prefix = String::new();
local = first;
}
self.enqueue(Token::VarName { prefix, local }, start, self.pos);
self.state = LexerState::Operator;
}
Ok(())
}
fn operator_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
None => {
self.enqueue(Token::Eof, start, start);
}
Some(c @ (',' | '=' | '+' | '-' | '[' | '|')) => {
self.advance(1);
let token = match c {
',' => Token::Comma,
'=' => Token::Equals,
'+' => Token::Plus,
'-' => Token::Minus,
'[' => {
self.state_stack.push(self.state);
Token::LBracket
}
'|' => Token::Pipe,
_ => unreachable!(),
};
self.enqueue(token, start, self.pos);
self.state = LexerState::Default;
}
Some('*') => {
self.advance(1);
self.enqueue(Token::Star, start, self.pos);
self.state = LexerState::Default;
}
Some('!') if self.peek(1) == Some('=') => {
self.advance(2);
self.enqueue(Token::NotEquals, start, self.pos);
self.state = LexerState::Default;
}
Some('>') => {
if self.peek(1) == Some('=') {
self.advance(2);
self.enqueue(Token::GreaterEquals, start, self.pos);
} else if !self.is_xpath10() && self.peek(1) == Some('>') {
self.advance(2);
self.enqueue(Token::DoubleGreater, start, self.pos);
} else {
self.advance(1);
self.enqueue(Token::GreaterThan, start, self.pos);
}
self.state = LexerState::Default;
}
Some('<') => {
if self.peek(1) == Some('=') {
self.advance(2);
self.enqueue(Token::LessEquals, start, self.pos);
} else if !self.is_xpath10() && self.peek(1) == Some('<') {
self.advance(2);
self.enqueue(Token::DoubleLess, start, self.pos);
} else {
self.advance(1);
self.enqueue(Token::LessThan, start, self.pos);
}
self.state = LexerState::Default;
}
Some('/') => {
if self.peek(1) == Some('/') {
self.advance(2);
self.enqueue(Token::DoubleSlash, start, self.pos);
self.state = LexerState::Default;
} else {
self.advance(1);
if self.can_start_relative_path() {
self.enqueue(Token::Slash, start, self.pos);
self.state = LexerState::Default;
} else {
self.enqueue(Token::SlashOnly, start, self.pos);
}
}
}
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
}
Some('?') => {
self.advance(1);
self.enqueue(Token::Question, start, self.pos);
}
Some(']') => {
self.advance(1);
self.enqueue(Token::RBracket, start, self.pos);
if let Some(s) = self.state_stack.pop() {
self.state = s;
}
}
Some('$') => {
self.advance(1);
self.enqueue(Token::Dollar, start, self.pos);
self.state = LexerState::VarName;
}
Some('"') | Some('\'') => {
let (tok, s, e) = self.consume_string()?;
self.enqueue(tok, s, e);
}
Some(c) if Self::is_ncname_start(c) => {
self.process_keyword_in_operator_state(start)?;
}
Some(c) => {
return Err(LexerError {
message: format!("Unexpected character in operator context: '{}'", c),
position: start,
});
}
}
Ok(())
}
fn process_keyword_in_operator_state(&mut self, start: usize) -> Result<(), LexerError> {
if !self.is_xpath10() {
if self.match_identifier(&["castable", "as"]) {
self.enqueue(Token::CastableAs, start, self.pos);
self.state = LexerState::SingleType;
return Ok(());
}
if self.match_identifier(&["cast", "as"]) {
self.enqueue(Token::CastAs, start, self.pos);
self.state = LexerState::SingleType;
return Ok(());
}
if self.match_identifier(&["instance", "of"]) {
self.enqueue(Token::InstanceOf, start, self.pos);
self.state = LexerState::ItemType;
return Ok(());
}
if self.match_identifier(&["treat", "as"]) {
self.enqueue(Token::TreatAs, start, self.pos);
self.state = LexerState::ItemType;
return Ok(());
}
}
let shared_keywords: &[(&str, Token, LexerState)] = &[
("and", Token::And, LexerState::Default),
("or", Token::Or, LexerState::Default),
("div", Token::Div, LexerState::Default),
("mod", Token::Mod, LexerState::Default),
];
for (kw, tok, next_state) in shared_keywords {
if self.match_identifier(&[kw]) {
self.enqueue(tok.clone(), start, self.pos);
self.state = *next_state;
return Ok(());
}
}
if !self.is_xpath10() {
let xpath20_keywords: &[(&str, Token, LexerState)] = &[
("then", Token::Then, LexerState::Default),
("else", Token::Else, LexerState::Default),
("idiv", Token::IDiv, LexerState::Default),
("except", Token::Except, LexerState::Default),
("intersect", Token::Intersect, LexerState::Default),
("union", Token::Union, LexerState::Default),
("return", Token::Return, LexerState::Default),
("satisfies", Token::Satisfies, LexerState::Default),
("to", Token::To, LexerState::Default),
("in", Token::In, LexerState::Default),
("is", Token::Is, LexerState::Default),
("eq", Token::Eq, LexerState::Default),
("ne", Token::Ne, LexerState::Default),
("lt", Token::Lt, LexerState::Default),
("le", Token::Le, LexerState::Default),
("gt", Token::Gt, LexerState::Default),
("ge", Token::Ge, LexerState::Default),
];
for (kw, tok, next_state) in xpath20_keywords {
if self.match_identifier(&[kw]) {
self.enqueue(tok.clone(), start, self.pos);
self.state = *next_state;
return Ok(());
}
}
}
if !self.is_xpath10() && self.try_match_identifier(&["for"]) {
self.match_identifier(&["for"]);
self.enqueue(Token::For, start, self.pos);
self.skip_whitespace_and_comments();
if self.current() == Some('$') {
let dollar_start = self.pos;
self.advance(1);
self.enqueue(Token::Dollar, dollar_start, self.pos);
self.state = LexerState::VarName;
} else {
self.state = LexerState::Default;
}
return Ok(());
}
if self.is_xpath10() {
let name = self.consume_qname();
let end = self.pos;
self.skip_whitespace_and_comments();
if self.current() != Some('(') {
self.state = LexerState::Operator;
} else {
self.state = LexerState::Default;
}
self.enqueue(Token::QName(name), start, end);
return Ok(());
}
let name = self.consume_qname();
Err(LexerError {
message: format!("Unexpected identifier in operator context: '{}'", name),
position: start,
})
}
fn single_type_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
if self.current().map(Self::is_ncname_start).unwrap_or(false) {
let qname = self.consume_qname();
self.enqueue(Token::QName(qname), start, self.pos);
self.state = LexerState::OccurrenceIndicator;
}
Ok(())
}
fn item_type_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
None => {
self.enqueue(Token::Eof, start, start);
}
Some('$') => {
self.advance(1);
self.enqueue(Token::Dollar, start, self.pos);
self.state = LexerState::VarName;
}
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
}
Some(c) if Self::is_ncname_start(c) => {
if self.match_identifier(&["empty-sequence", "(", ")"]) {
self.enqueue(Token::EmptySequence, start, self.pos);
self.state = LexerState::Operator;
return Ok(());
}
if self.match_identifier(&["item", "(", ")"]) {
self.enqueue(Token::Item, start, self.pos);
self.state = LexerState::OccurrenceIndicator;
return Ok(());
}
if self.match_identifier(&["element", "("]) {
self.enqueue(Token::Element, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["attribute", "("]) {
self.enqueue(Token::Attribute, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["schema-element", "("]) {
self.enqueue(Token::SchemaElement, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["schema-attribute", "("]) {
self.enqueue(Token::SchemaAttribute, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["comment", "("]) {
self.enqueue(Token::Comment, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["text", "("]) {
self.enqueue(Token::Text, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["node", "("]) {
self.enqueue(Token::Node, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["document-node", "("]) {
self.enqueue(Token::DocumentNode, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTest;
return Ok(());
}
if self.match_identifier(&["processing-instruction", "("]) {
self.enqueue(Token::ProcessingInstruction, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::OccurrenceIndicator);
self.state = LexerState::KindTestForPi;
return Ok(());
}
let qname = self.consume_qname();
self.enqueue(Token::QName(qname), start, self.pos);
self.state = LexerState::OccurrenceIndicator;
}
_ => {}
}
Ok(())
}
fn kind_test_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
None => {}
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
if let Some(s) = self.state_stack.pop() {
self.state = s;
}
}
Some('*') => {
self.advance(1);
self.enqueue(Token::Star, start, self.pos);
self.state = LexerState::CloseKindTest;
}
Some(c) if Self::is_ncname_start(c) => {
if self.match_identifier(&["element", "("]) {
self.enqueue(Token::Element, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::KindTest);
return Ok(());
}
if self.match_identifier(&["schema-element", "("]) {
self.enqueue(Token::SchemaElement, start, self.pos - 1);
self.enqueue(Token::LParen, self.pos - 1, self.pos);
self.state_stack.push(LexerState::KindTest);
return Ok(());
}
let qname = self.consume_qname();
self.enqueue(Token::QName(qname), start, self.pos);
self.state = LexerState::CloseKindTest;
}
_ => {}
}
Ok(())
}
fn kind_test_for_pi_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
if let Some(s) = self.state_stack.pop() {
self.state = s;
}
}
Some(c) if Self::is_ncname_start(c) => {
let ncname = self.consume_ncname();
self.enqueue(Token::NCName(ncname), start, self.pos);
}
Some('"') | Some('\'') => {
let (tok, s, e) = self.consume_string()?;
self.enqueue(tok, s, e);
}
_ => {}
}
Ok(())
}
fn close_kind_test_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
if let Some(s) = self.state_stack.pop() {
self.state = s;
}
}
Some(',') => {
self.advance(1);
self.enqueue(Token::Comma, start, self.pos);
self.state = LexerState::TypeNameInKindTest;
}
_ => {}
}
Ok(())
}
fn type_name_in_kind_test_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
Some(')') => {
self.advance(1);
self.enqueue(Token::RParen, start, self.pos);
if let Some(s) = self.state_stack.pop() {
self.state = s;
}
}
Some(c) if Self::is_ncname_start(c) => {
let qname = self.consume_qname();
let qname_end = self.pos;
self.skip_whitespace_and_comments();
if self.current() == Some('?') {
self.advance(1);
self.enqueue(Token::QNameNillable(qname), start, self.pos);
} else {
self.enqueue(Token::QName(qname), start, qname_end);
}
self.state = LexerState::CloseKindTest;
}
_ => {}
}
Ok(())
}
fn occurrence_indicator_state(&mut self) -> Result<(), LexerError> {
self.skip_whitespace_and_comments();
let start = self.pos;
match self.current() {
Some('?') => {
self.advance(1);
self.enqueue(Token::OccurrenceZeroOrOne, start, self.pos);
}
Some('+') => {
self.advance(1);
self.enqueue(Token::OccurrenceOneOrMore, start, self.pos);
}
Some('*') => {
self.advance(1);
self.enqueue(Token::OccurrenceZeroOrMore, start, self.pos);
}
_ => {}
}
self.state = LexerState::Operator;
self.operator_state()?;
Ok(())
}
fn enter_state(&mut self) -> Result<(), LexerError> {
match self.state {
LexerState::Default => self.default_state(),
LexerState::Operator => self.operator_state(),
LexerState::VarName => self.varname_state(),
LexerState::SingleType => self.single_type_state(),
LexerState::ItemType => self.item_type_state(),
LexerState::KindTest => self.kind_test_state(),
LexerState::KindTestForPi => self.kind_test_for_pi_state(),
LexerState::CloseKindTest => self.close_kind_test_state(),
LexerState::TypeNameInKindTest => self.type_name_in_kind_test_state(),
LexerState::OccurrenceIndicator => self.occurrence_indicator_state(),
}
}
}
impl<'input> Iterator for Lexer<'input> {
type Item = Result<Spanned, LexerError>;
fn next(&mut self) -> Option<Self::Item> {
if self.finished {
return None;
}
if let Some(spanned) = self.token_queue.pop_front() {
if spanned.1 == Token::Eof {
self.finished = true;
}
return Some(Ok(spanned));
}
if let Err(e) = self.enter_state() {
self.finished = true;
return Some(Err(e));
}
if let Some(spanned) = self.token_queue.pop_front() {
if spanned.1 == Token::Eof {
self.finished = true;
}
Some(Ok(spanned))
} else {
self.finished = true;
None
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn tokenize(input: &str) -> Vec<Token> {
Lexer::new(input)
.map(|r| r.unwrap().1)
.filter(|t| *t != Token::Eof)
.collect()
}
#[test]
fn test_simple_number() {
assert_eq!(
tokenize("42"),
vec![Token::IntegerLiteral("42".to_string())]
);
assert_eq!(
tokenize("2.5"),
vec![Token::DecimalLiteral("2.5".to_string())]
);
assert_eq!(
tokenize("1e10"),
vec![Token::DoubleLiteral("1e10".to_string())]
);
}
#[test]
fn test_string_literal() {
assert_eq!(
tokenize("'hello'"),
vec![Token::StringLiteral("hello".to_string())]
);
assert_eq!(
tokenize("\"world\""),
vec![Token::StringLiteral("world".to_string())]
);
assert_eq!(
tokenize("'it''s'"),
vec![Token::StringLiteral("it's".to_string())]
);
}
#[test]
fn test_variable() {
assert_eq!(
tokenize("$x"),
vec![
Token::Dollar,
Token::VarName {
prefix: String::new(),
local: "x".to_string()
}
]
);
assert_eq!(
tokenize("$ns:var"),
vec![
Token::Dollar,
Token::VarName {
prefix: "ns".to_string(),
local: "var".to_string()
}
]
);
}
#[test]
fn test_axis_specifiers() {
assert_eq!(tokenize("child::"), vec![Token::AxisChild]);
assert_eq!(
tokenize("ancestor-or-self::"),
vec![Token::AxisAncestorOrSelf]
);
assert_eq!(
tokenize("descendant-or-self::"),
vec![Token::AxisDescendantOrSelf]
);
}
#[test]
fn test_path_expression() {
let tokens = tokenize("/a/b");
assert_eq!(
tokens,
vec![
Token::Slash,
Token::QName("a".to_string()),
Token::Slash,
Token::QName("b".to_string())
]
);
}
#[test]
fn test_double_slash() {
let tokens = tokenize("//a");
assert_eq!(
tokens,
vec![Token::DoubleSlash, Token::QName("a".to_string())]
);
}
#[test]
fn test_if_expression() {
let tokens = tokenize("if (");
assert_eq!(tokens, vec![Token::If, Token::LParen]);
}
#[test]
fn test_for_expression() {
let tokens = tokenize("for $x");
assert_eq!(
tokens,
vec![
Token::For,
Token::Dollar,
Token::VarName {
prefix: String::new(),
local: "x".to_string()
}
]
);
}
#[test]
fn test_comparison_operators() {
let tokens = tokenize("1 eq 2");
assert_eq!(
tokens,
vec![
Token::IntegerLiteral("1".to_string()),
Token::Eq,
Token::IntegerLiteral("2".to_string())
]
);
}
#[test]
fn test_instance_of() {
let tokens = tokenize("$x instance of xs:integer");
assert_eq!(
tokens,
vec![
Token::Dollar,
Token::VarName {
prefix: String::new(),
local: "x".to_string()
},
Token::InstanceOf,
Token::QName("xs:integer".to_string())
]
);
}
#[test]
fn test_cast_as() {
let tokens = tokenize("$x cast as xs:string");
assert_eq!(
tokens,
vec![
Token::Dollar,
Token::VarName {
prefix: String::new(),
local: "x".to_string()
},
Token::CastAs,
Token::QName("xs:string".to_string())
]
);
}
#[test]
fn test_kind_test() {
let tokens = tokenize("node()");
assert_eq!(tokens, vec![Token::Node, Token::LParen, Token::RParen]);
}
#[test]
fn test_element_test() {
let tokens = tokenize("element(foo)");
assert_eq!(
tokens,
vec![
Token::Element,
Token::LParen,
Token::QName("foo".to_string()),
Token::RParen
]
);
}
#[test]
fn test_element_test_with_type() {
let tokens = tokenize("element(foo, xs:string)");
assert_eq!(
tokens,
vec![
Token::Element,
Token::LParen,
Token::QName("foo".to_string()),
Token::Comma,
Token::QName("xs:string".to_string()),
Token::RParen
]
);
}
#[test]
fn test_element_test_with_nillable_type() {
let tokens = tokenize("element(foo, xs:string?)");
assert_eq!(
tokens,
vec![
Token::Element,
Token::LParen,
Token::QName("foo".to_string()),
Token::Comma,
Token::QNameNillable("xs:string".to_string()),
Token::RParen
]
);
}
#[test]
fn test_element_test_nillable_with_whitespace() {
let tokens = tokenize("element(foo, xs:string ?)");
assert_eq!(
tokens,
vec![
Token::Element,
Token::LParen,
Token::QName("foo".to_string()),
Token::Comma,
Token::QNameNillable("xs:string".to_string()),
Token::RParen
]
);
}
#[test]
fn test_comments() {
let tokens = tokenize("1 (: comment :) + 2");
assert_eq!(
tokens,
vec![
Token::IntegerLiteral("1".to_string()),
Token::Plus,
Token::IntegerLiteral("2".to_string())
]
);
}
#[test]
fn test_nested_comments() {
let tokens = tokenize("1 (: outer (: inner :) outer :) + 2");
assert_eq!(
tokens,
vec![
Token::IntegerLiteral("1".to_string()),
Token::Plus,
Token::IntegerLiteral("2".to_string())
]
);
}
#[test]
fn test_arithmetic() {
let tokens = tokenize("1 + 2 * 3");
assert_eq!(
tokens,
vec![
Token::IntegerLiteral("1".to_string()),
Token::Plus,
Token::IntegerLiteral("2".to_string()),
Token::Star,
Token::IntegerLiteral("3".to_string())
]
);
}
#[test]
fn test_predicates() {
let tokens = tokenize("a[1]");
assert_eq!(
tokens,
vec![
Token::QName("a".to_string()),
Token::LBracket,
Token::IntegerLiteral("1".to_string()),
Token::RBracket
]
);
}
#[test]
fn test_double_period() {
let tokens = tokenize("..");
assert_eq!(tokens, vec![Token::DoublePeriod]);
}
#[test]
fn test_context_item() {
let tokens = tokenize(".");
assert_eq!(tokens, vec![Token::Dot]);
}
#[test]
fn test_slash_only() {
let tokens = tokenize("/");
assert_eq!(tokens, vec![Token::SlashOnly]);
}
#[test]
fn test_slash_only_with_trailing_whitespace() {
let tokens = tokenize("/ ");
assert_eq!(tokens, vec![Token::SlashOnly]);
}
#[test]
fn test_slash_only_with_comment() {
let tokens = tokenize("/ (: comment :)");
assert_eq!(tokens, vec![Token::SlashOnly]);
}
#[test]
fn test_slash_with_path() {
let tokens = tokenize("/a");
assert_eq!(tokens, vec![Token::Slash, Token::QName("a".to_string())]);
}
#[test]
fn test_slash_with_whitespace_then_path() {
let tokens = tokenize("/ a");
assert_eq!(tokens, vec![Token::Slash, Token::QName("a".to_string())]);
}
#[test]
fn test_slash_with_comment_then_path() {
let tokens = tokenize("/ (: comment :) a");
assert_eq!(tokens, vec![Token::Slash, Token::QName("a".to_string())]);
}
#[test]
fn test_attribute_shorthand() {
let tokens = tokenize("@id");
assert_eq!(tokens, vec![Token::At, Token::QName("id".to_string())]);
}
#[test]
fn test_wildcard() {
let tokens = tokenize("*");
assert_eq!(tokens, vec![Token::Star]);
}
#[test]
fn test_namespace_wildcard() {
let tokens = tokenize("*:local");
assert_eq!(
tokens,
vec![
Token::Star,
Token::Colon,
Token::NCName("local".to_string())
]
);
}
#[test]
fn test_occurrence_indicators() {
let tokens = tokenize("$x instance of xs:integer?");
assert!(tokens.contains(&Token::OccurrenceZeroOrOne));
}
fn tokenize_10(input: &str) -> Vec<Token> {
Lexer::new_with_mode(input, XPathMode::XPath10)
.map(|r| r.unwrap().1)
.filter(|t| *t != Token::Eof)
.collect()
}
#[test]
fn test_xpath10_if_as_element_name() {
let tokens = tokenize_10("//if");
assert_eq!(tokens, vec![Token::DoubleSlash, Token::QName("if".into())]);
}
#[test]
fn test_xpath10_for_as_element_name() {
let tokens = tokenize_10("//for");
assert_eq!(tokens, vec![Token::DoubleSlash, Token::QName("for".into())]);
}
#[test]
fn test_xpath10_every_as_element_name() {
let tokens = tokenize_10("//every");
assert_eq!(
tokens,
vec![Token::DoubleSlash, Token::QName("every".into())]
);
}
#[test]
fn test_xpath10_some_as_element_name() {
let tokens = tokenize_10("//some");
assert_eq!(
tokens,
vec![Token::DoubleSlash, Token::QName("some".into())]
);
}
#[test]
fn test_xpath10_xpath20_keywords_as_qnames() {
for name in &[
"then",
"else",
"return",
"to",
"union",
"except",
"intersect",
"eq",
"ne",
"lt",
"le",
"gt",
"ge",
"is",
"idiv",
"satisfies",
"in",
] {
let input = format!("a/{}", name);
let tokens = tokenize_10(&input);
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::Slash,
Token::QName((*name).into()),
],
"Failed for keyword: {}",
name
);
}
}
#[test]
fn test_xpath10_unary_minus() {
let tokens = tokenize_10("-a");
assert_eq!(tokens, vec![Token::Minus10, Token::QName("a".into())]);
}
#[test]
fn test_xpath10_unary_plus() {
let tokens = tokenize_10("+a");
assert_eq!(tokens, vec![Token::Plus10, Token::QName("a".into())]);
}
#[test]
fn test_xpath20_unary_minus() {
let tokens = tokenize("-a");
assert_eq!(tokens, vec![Token::Minus20, Token::QName("a".into())]);
}
#[test]
fn test_xpath10_binary_minus() {
let tokens = tokenize_10("a - b");
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::Minus,
Token::QName("b".into())
]
);
}
#[test]
fn test_xpath10_binary_plus() {
let tokens = tokenize_10("a + b");
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::Plus,
Token::QName("b".into())
]
);
}
#[test]
fn test_xpath10_and_or_div_mod() {
let tokens = tokenize_10("a and b");
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::And,
Token::QName("b".into())
]
);
let tokens = tokenize_10("a or b");
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::Or,
Token::QName("b".into())
]
);
let tokens = tokenize_10("a div b");
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::Div,
Token::QName("b".into())
]
);
let tokens = tokenize_10("a mod b");
assert_eq!(
tokens,
vec![
Token::QName("a".into()),
Token::Mod,
Token::QName("b".into())
]
);
}
#[test]
fn test_xpath10_node_kind_test() {
let tokens = tokenize_10("node()");
assert_eq!(tokens, vec![Token::Node, Token::LParen, Token::RParen]);
}
#[test]
fn test_xpath10_comment_kind_test() {
let tokens = tokenize_10("comment()");
assert_eq!(tokens, vec![Token::Comment, Token::LParen, Token::RParen]);
}
#[test]
fn test_xpath10_text_kind_test() {
let tokens = tokenize_10("text()");
assert_eq!(tokens, vec![Token::Text, Token::LParen, Token::RParen]);
}
#[test]
fn test_xpath10_processing_instruction_kind_test() {
let tokens = tokenize_10("processing-instruction()");
assert_eq!(
tokens,
vec![Token::ProcessingInstruction, Token::LParen, Token::RParen]
);
}
#[test]
fn test_xpath10_element_as_function() {
let tokens = tokenize_10("element(foo)");
assert_eq!(
tokens,
vec![
Token::QName("element".into()),
Token::LParen,
Token::QName("foo".into()),
Token::RParen
]
);
}
#[test]
fn test_xpath10_axis_specifiers() {
let tokens = tokenize_10("child::*");
assert_eq!(tokens, vec![Token::AxisChild, Token::Star]);
}
#[test]
fn test_xpath10_ancestor_or_self_axis() {
let tokens = tokenize_10("ancestor-or-self::node()");
assert_eq!(
tokens,
vec![
Token::AxisAncestorOrSelf,
Token::Node,
Token::LParen,
Token::RParen
]
);
}
#[test]
fn test_xpath10_no_double_less() {
let tokens: Vec<Token> = Lexer::new_with_mode("a<<b", XPathMode::XPath10)
.filter_map(|r| r.ok().map(|t| t.1))
.filter(|t| *t != Token::Eof)
.collect();
assert!(!tokens.contains(&Token::DoubleLess));
}
#[test]
fn test_xpath10_no_double_greater() {
let tokens: Vec<Token> = Lexer::new_with_mode("a>>b", XPathMode::XPath10)
.filter_map(|r| r.ok().map(|t| t.1))
.filter(|t| *t != Token::Eof)
.collect();
assert!(!tokens.contains(&Token::DoubleGreater));
}
#[test]
fn test_xpath20_double_less_still_works() {
let tokens = tokenize("a<<b");
assert!(tokens.contains(&Token::DoubleLess));
}
#[test]
fn test_xpath20_double_greater_still_works() {
let tokens = tokenize("a>>b");
assert!(tokens.contains(&Token::DoubleGreater));
}
}