1use std::fmt;
9
/// A single lexical token produced by the identity-constraint XPath lexer.
///
/// Tokens borrow from the lexed input, so they are cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IdXPathToken<'a> {
    /// A name, borrowed as a slice of the original input.
    NCName(&'a str),
    /// The wildcard `*`.
    Star,
    /// A single `:`.
    Colon,
    /// A single `/`.
    Slash,
    /// The two-character sequence `//`.
    DoubleSlash,
    /// The two-character sequence `::`.
    DoubleColon,
    /// The `|` character.
    Pipe,
    /// A single `.` (the sequence `..` is rejected by the lexer).
    Dot,
    /// The `@` character.
    At,
}
32
/// A token together with its span: `(start, token, end)`, where `start` and
/// `end` are byte offsets into the lexed input and `end` is exclusive.
pub type IdXPathSpanned<'a> = (usize, IdXPathToken<'a>, usize);
35
/// An error reported by the lexer when it meets input it cannot tokenize.
#[derive(Debug, Clone)]
pub struct IdXPathLexError {
    /// Human-readable description of what was rejected.
    pub message: String,
    /// Byte offset in the input at which the offending text starts.
    pub position: usize,
}
44
45impl fmt::Display for IdXPathLexError {
46 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47 write!(
48 f,
49 "identity XPath lex error at position {}: {}",
50 self.position, self.message
51 )
52 }
53}
54
// Marker impl: the error type relies entirely on the trait's default methods
// together with its `Debug` and `Display` implementations.
impl std::error::Error for IdXPathLexError {}
56
/// Lexer state: the borrowed input and a cursor into it.
pub struct IdXPathLexer<'a> {
    /// The complete text being lexed; tokens borrow slices of it.
    input: &'a str,
    /// Current byte offset into `input`; it is advanced by whole characters.
    pos: usize,
}
62
/// Reports whether `c` may begin a name: an underscore or any character that
/// `char::is_alphabetic` accepts.
fn is_ncname_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
67
/// Reports whether `c` may appear after the first character of a name.
///
/// Accepted are `_`, `-`, `.`, U+00B7, the ranges U+0300..=U+036F and
/// U+203F..=U+2040, and anything `char::is_alphanumeric` accepts.
fn is_ncname_char(c: char) -> bool {
    match c {
        '_' | '-' | '.' | '\u{B7}' => true,
        '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}' => true,
        other => other.is_alphanumeric(),
    }
}
78
79impl<'a> IdXPathLexer<'a> {
80 pub fn new(input: &'a str) -> Self {
82 Self { input, pos: 0 }
83 }
84
85 fn current(&self) -> Option<char> {
87 self.input[self.pos..].chars().next()
88 }
89
90 fn peek_next(&self) -> Option<char> {
92 let mut chars = self.input[self.pos..].chars();
93 chars.next();
94 chars.next()
95 }
96
97 fn advance(&mut self) -> Option<char> {
99 let c = self.current()?;
100 self.pos += c.len_utf8();
101 Some(c)
102 }
103
104 fn skip_whitespace(&mut self) {
106 while let Some(c) = self.current() {
107 if matches!(c, ' ' | '\t' | '\r' | '\n') {
108 self.pos += c.len_utf8();
109 } else {
110 break;
111 }
112 }
113 }
114
115 fn lex_ncname(&mut self) -> IdXPathToken<'a> {
117 let start = self.pos;
118 self.advance(); while let Some(c) = self.current() {
120 if is_ncname_char(c) {
121 self.advance();
122 } else {
123 break;
124 }
125 }
126 IdXPathToken::NCName(&self.input[start..self.pos])
127 }
128
129 fn next_token(&mut self) -> Option<Result<IdXPathSpanned<'a>, IdXPathLexError>> {
131 self.skip_whitespace();
132 let start = self.pos;
133 let c = self.current()?;
134
135 if is_ncname_start(c) {
137 let tok = self.lex_ncname();
138 return Some(Ok((start, tok, self.pos)));
139 }
140
141 match c {
142 '/' => {
143 self.advance();
144 if self.current() == Some('/') {
145 self.advance();
146 Some(Ok((start, IdXPathToken::DoubleSlash, self.pos)))
147 } else {
148 Some(Ok((start, IdXPathToken::Slash, self.pos)))
149 }
150 }
151 ':' => {
152 self.advance();
153 if self.current() == Some(':') {
154 self.advance();
155 Some(Ok((start, IdXPathToken::DoubleColon, self.pos)))
156 } else {
157 Some(Ok((start, IdXPathToken::Colon, self.pos)))
158 }
159 }
160 '.' => {
161 if self.peek_next() == Some('.') {
162 Some(Err(IdXPathLexError {
163 message: "parent axis `..` is not allowed in identity-constraint XPath"
164 .into(),
165 position: start,
166 }))
167 } else {
168 self.advance();
169 Some(Ok((start, IdXPathToken::Dot, self.pos)))
170 }
171 }
172 '*' => {
173 self.advance();
174 Some(Ok((start, IdXPathToken::Star, self.pos)))
175 }
176 '|' => {
177 self.advance();
178 Some(Ok((start, IdXPathToken::Pipe, self.pos)))
179 }
180 '@' => {
181 self.advance();
182 Some(Ok((start, IdXPathToken::At, self.pos)))
183 }
184 '[' => Some(Err(IdXPathLexError {
185 message: "predicates `[...]` are not allowed in identity-constraint XPath".into(),
186 position: start,
187 })),
188 '(' => Some(Err(IdXPathLexError {
189 message: "function calls are not allowed in identity-constraint XPath".into(),
190 position: start,
191 })),
192 _ => Some(Err(IdXPathLexError {
193 message: format!("unexpected character `{c}`"),
194 position: start,
195 })),
196 }
197 }
198}
199
/// Iterating the lexer yields spanned tokens or lex errors, one per call.
impl<'a> Iterator for IdXPathLexer<'a> {
    type Item = Result<IdXPathSpanned<'a>, IdXPathLexError>;

    /// Delegates to `next_token`; `None` means the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` to completion and returns every token, panicking with the
    /// error's display text if the lexer reports an error.
    fn lex_ok(input: &str) -> Vec<IdXPathSpanned<'_>> {
        IdXPathLexer::new(input)
            .collect::<Result<Vec<_>, _>>()
            .unwrap_or_else(|e| panic!("unexpected lex error: {e}"))
    }

    /// Lexes `input` and returns the first error, panicking if lexing succeeds.
    fn lex_err(input: &str) -> IdXPathLexError {
        IdXPathLexer::new(input)
            .collect::<Result<Vec<_>, _>>()
            .expect_err("expected a lex error")
    }

    // Single-token inputs: each token kind with its exact span.

    #[test]
    fn ncname_simple() {
        let tokens = lex_ok("foo");
        assert_eq!(tokens, vec![(0, IdXPathToken::NCName("foo"), 3)]);
    }

    #[test]
    fn star() {
        let tokens = lex_ok("*");
        assert_eq!(tokens, vec![(0, IdXPathToken::Star, 1)]);
    }

    #[test]
    fn colon() {
        let tokens = lex_ok("ns:local");
        // The colon is its own token; the lexer does not build qualified names.
        assert_eq!(
            tokens,
            vec![
                (0, IdXPathToken::NCName("ns"), 2),
                (2, IdXPathToken::Colon, 3),
                (3, IdXPathToken::NCName("local"), 8),
            ]
        );
    }

    #[test]
    fn slash() {
        let tokens = lex_ok("/");
        assert_eq!(tokens, vec![(0, IdXPathToken::Slash, 1)]);
    }

    #[test]
    fn double_slash() {
        let tokens = lex_ok("//");
        assert_eq!(tokens, vec![(0, IdXPathToken::DoubleSlash, 2)]);
    }

    #[test]
    fn double_colon() {
        let tokens = lex_ok("::");
        assert_eq!(tokens, vec![(0, IdXPathToken::DoubleColon, 2)]);
    }

    #[test]
    fn pipe() {
        let tokens = lex_ok("|");
        assert_eq!(tokens, vec![(0, IdXPathToken::Pipe, 1)]);
    }

    #[test]
    fn dot() {
        let tokens = lex_ok(".");
        assert_eq!(tokens, vec![(0, IdXPathToken::Dot, 1)]);
    }

    #[test]
    fn at() {
        let tokens = lex_ok("@");
        assert_eq!(tokens, vec![(0, IdXPathToken::At, 1)]);
    }

    // Multi-token inputs: whole path expressions.

    #[test]
    fn descendant_path() {
        let tokens = lex_ok(".//foo/bar");
        // `.` followed by `//` must lex as Dot + DoubleSlash.
        assert_eq!(
            tokens,
            vec![
                (0, IdXPathToken::Dot, 1),
                (1, IdXPathToken::DoubleSlash, 3),
                (3, IdXPathToken::NCName("foo"), 6),
                (6, IdXPathToken::Slash, 7),
                (7, IdXPathToken::NCName("bar"), 10),
            ]
        );
    }

    #[test]
    fn child_axis() {
        let tokens = lex_ok("child::foo");
        // The axis name is an ordinary NCName followed by `::`.
        assert_eq!(
            tokens,
            vec![
                (0, IdXPathToken::NCName("child"), 5),
                (5, IdXPathToken::DoubleColon, 7),
                (7, IdXPathToken::NCName("foo"), 10),
            ]
        );
    }

    #[test]
    fn namespace_wildcard() {
        let tokens = lex_ok("ns:*");
        // Prefix, colon and star are three separate tokens.
        assert_eq!(
            tokens,
            vec![
                (0, IdXPathToken::NCName("ns"), 2),
                (2, IdXPathToken::Colon, 3),
                (3, IdXPathToken::Star, 4),
            ]
        );
    }

    #[test]
    fn attribute_path() {
        let tokens = lex_ok(".//foo/@bar");
        // `@` is emitted separately from the attribute name that follows it.
        assert_eq!(
            tokens,
            vec![
                (0, IdXPathToken::Dot, 1),
                (1, IdXPathToken::DoubleSlash, 3),
                (3, IdXPathToken::NCName("foo"), 6),
                (6, IdXPathToken::Slash, 7),
                (7, IdXPathToken::At, 8),
                (8, IdXPathToken::NCName("bar"), 11),
            ]
        );
    }

    // Words that look like keywords are lexed as plain names.

    #[test]
    fn keywords_as_ncname() {
        for kw in &["and", "or", "div", "mod", "child", "attribute"] {
            let tokens = lex_ok(kw);
            assert_eq!(tokens, vec![(0, IdXPathToken::NCName(kw), kw.len())]);
        }
    }

    // Rejected constructs: the error message and its reported position.

    #[test]
    fn error_parent_axis() {
        let err = lex_err("..");
        assert!(
            err.message.contains("parent axis"),
            "message: {}",
            err.message
        );
        assert_eq!(err.position, 0);
    }

    #[test]
    fn error_predicate() {
        let err = lex_err("foo[1]");
        assert!(
            err.message.contains("predicates"),
            "message: {}",
            err.message
        );
        assert_eq!(err.position, 3);
    }

    #[test]
    fn error_function_call() {
        let err = lex_err("fn(");
        assert!(
            err.message.contains("function calls"),
            "message: {}",
            err.message
        );
        assert_eq!(err.position, 2);
    }

    // Edge cases: empty input, whitespace-only input, spans around whitespace.

    #[test]
    fn empty_input() {
        let tokens = lex_ok("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn whitespace_only() {
        let tokens = lex_ok("  \t\n  ");
        assert!(tokens.is_empty());
    }

    #[test]
    fn span_correctness_with_whitespace() {
        let tokens = lex_ok("  foo / bar ");
        // Spans are offsets into the original input, so skipped whitespace
        // still counts toward each token's start and end.
        assert_eq!(
            tokens,
            vec![
                (2, IdXPathToken::NCName("foo"), 5),
                (6, IdXPathToken::Slash, 7),
                (8, IdXPathToken::NCName("bar"), 11),
            ]
        );
    }
}