Skip to main content

xsd_schema/validation/
identity_lexer.rs

//! Lexer for the XSD identity-constraint XPath subset (selector/field expressions).
//!
//! This subset is much simpler than full XPath 2.0: no predicates, no function calls,
//! no parent axis, and no operators other than the `|` union. All names (including
//! `child`, `attribute`, `and`, `or`) are emitted as plain `NCName` tokens, avoiding
//! reserved-word conflicts with the main XPath 2.0 lexer.
7
8use std::fmt;
9
/// A single lexical token of the identity-constraint XPath subset.
///
/// Name tokens borrow their text from the lexer input for the lifetime `'a`;
/// every other variant stands for one fixed piece of punctuation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IdXPathToken<'a> {
    /// An unqualified XML name (NCName), as a slice of the input.
    NCName(&'a str),
    /// The `*` wildcard.
    Star,
    /// A single `:`.
    Colon,
    /// A single `/`.
    Slash,
    /// The two-character `//`.
    DoubleSlash,
    /// The two-character `::`.
    DoubleColon,
    /// The `|` separator.
    Pipe,
    /// A single `.`.
    Dot,
    /// The `@` sign.
    At,
}
32
33/// A spanned token: `(start, token, end)` where positions are byte offsets.
34pub type IdXPathSpanned<'a> = (usize, IdXPathToken<'a>, usize);
35
/// Failure reported by the identity-constraint XPath lexer.
#[derive(Debug, Clone)]
pub struct IdXPathLexError {
    /// Description of the problem, written for humans.
    pub message: String,
    /// Byte offset into the input at which the problem was detected.
    pub position: usize,
}

impl fmt::Display for IdXPathLexError {
    /// Renders as `identity XPath lex error at position <position>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { message, position } = self;
        write!(f, "identity XPath lex error at position {position}: {message}")
    }
}

impl std::error::Error for IdXPathLexError {}
56
/// Tokenizer for the restricted XPath subset used in XSD identity constraints.
pub struct IdXPathLexer<'a> {
    /// The text being tokenized.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    pos: usize,
}
62
/// Check if a character is an XML NCName start character.
///
/// Implements the `NameStartChar` production of XML 1.0 (Fifth Edition) with
/// `:` removed, as Namespaces in XML requires for NCNames. The ranges are
/// spelled out because `char::is_alphabetic` is only an approximation: it
/// accepts characters such as U+00AA and U+00B5 that the production excludes,
/// and rejects allowed ones such as U+2070.
fn is_ncname_start(c: char) -> bool {
    matches!(
        c,
        'A'..='Z'
            | 'a'..='z'
            | '_'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}

/// Check if a character is an XML NCName character.
///
/// Implements the `NameChar` production of XML 1.0 (Fifth Edition) with `:`
/// removed: any NCName start character, plus `-`, `.`, the ASCII digits,
/// U+00B7, U+0300..=U+036F and U+203F..=U+2040. Only ASCII digits count as
/// digits here, so characters like U+00B2 (superscript two) are rejected.
fn is_ncname_char(c: char) -> bool {
    is_ncname_start(c)
        || matches!(
            c,
            '-' | '.'
                | '0'..='9'
                | '\u{B7}'
                | '\u{300}'..='\u{36F}'
                | '\u{203F}'..='\u{2040}'
        )
}
78
79impl<'a> IdXPathLexer<'a> {
80    /// Create a new lexer for the given input string.
81    pub fn new(input: &'a str) -> Self {
82        Self { input, pos: 0 }
83    }
84
85    /// Peek at the current character without advancing.
86    fn current(&self) -> Option<char> {
87        self.input[self.pos..].chars().next()
88    }
89
90    /// Peek at the character after the current one.
91    fn peek_next(&self) -> Option<char> {
92        let mut chars = self.input[self.pos..].chars();
93        chars.next();
94        chars.next()
95    }
96
97    /// Advance past the current character and return it.
98    fn advance(&mut self) -> Option<char> {
99        let c = self.current()?;
100        self.pos += c.len_utf8();
101        Some(c)
102    }
103
104    /// Skip whitespace characters.
105    fn skip_whitespace(&mut self) {
106        while let Some(c) = self.current() {
107            if matches!(c, ' ' | '\t' | '\r' | '\n') {
108                self.pos += c.len_utf8();
109            } else {
110                break;
111            }
112        }
113    }
114
115    /// Lex an NCName starting at the current position.
116    fn lex_ncname(&mut self) -> IdXPathToken<'a> {
117        let start = self.pos;
118        self.advance(); // consume the start character
119        while let Some(c) = self.current() {
120            if is_ncname_char(c) {
121                self.advance();
122            } else {
123                break;
124            }
125        }
126        IdXPathToken::NCName(&self.input[start..self.pos])
127    }
128
129    /// Produce the next token or an error.
130    fn next_token(&mut self) -> Option<Result<IdXPathSpanned<'a>, IdXPathLexError>> {
131        self.skip_whitespace();
132        let start = self.pos;
133        let c = self.current()?;
134
135        // NCName
136        if is_ncname_start(c) {
137            let tok = self.lex_ncname();
138            return Some(Ok((start, tok, self.pos)));
139        }
140
141        match c {
142            '/' => {
143                self.advance();
144                if self.current() == Some('/') {
145                    self.advance();
146                    Some(Ok((start, IdXPathToken::DoubleSlash, self.pos)))
147                } else {
148                    Some(Ok((start, IdXPathToken::Slash, self.pos)))
149                }
150            }
151            ':' => {
152                self.advance();
153                if self.current() == Some(':') {
154                    self.advance();
155                    Some(Ok((start, IdXPathToken::DoubleColon, self.pos)))
156                } else {
157                    Some(Ok((start, IdXPathToken::Colon, self.pos)))
158                }
159            }
160            '.' => {
161                if self.peek_next() == Some('.') {
162                    Some(Err(IdXPathLexError {
163                        message: "parent axis `..` is not allowed in identity-constraint XPath"
164                            .into(),
165                        position: start,
166                    }))
167                } else {
168                    self.advance();
169                    Some(Ok((start, IdXPathToken::Dot, self.pos)))
170                }
171            }
172            '*' => {
173                self.advance();
174                Some(Ok((start, IdXPathToken::Star, self.pos)))
175            }
176            '|' => {
177                self.advance();
178                Some(Ok((start, IdXPathToken::Pipe, self.pos)))
179            }
180            '@' => {
181                self.advance();
182                Some(Ok((start, IdXPathToken::At, self.pos)))
183            }
184            '[' => Some(Err(IdXPathLexError {
185                message: "predicates `[...]` are not allowed in identity-constraint XPath".into(),
186                position: start,
187            })),
188            '(' => Some(Err(IdXPathLexError {
189                message: "function calls are not allowed in identity-constraint XPath".into(),
190                position: start,
191            })),
192            _ => Some(Err(IdXPathLexError {
193                message: format!("unexpected character `{c}`"),
194                position: start,
195            })),
196        }
197    }
198}
199
200impl<'a> Iterator for IdXPathLexer<'a> {
201    type Item = Result<IdXPathSpanned<'a>, IdXPathLexError>;
202
203    fn next(&mut self) -> Option<Self::Item> {
204        self.next_token()
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `input` to completion, panicking on the first lex error.
    fn lex_ok(input: &str) -> Vec<IdXPathSpanned<'_>> {
        match IdXPathLexer::new(input).collect::<Result<Vec<_>, _>>() {
            Ok(tokens) => tokens,
            Err(e) => panic!("unexpected lex error: {e}"),
        }
    }

    /// Lex `input` and return the first error, panicking if lexing succeeds.
    fn lex_err(input: &str) -> IdXPathLexError {
        IdXPathLexer::new(input)
            .collect::<Result<Vec<_>, _>>()
            .expect_err("expected a lex error")
    }

    /// Assert that lexing `input` fails at `position` with a message containing `needle`.
    fn assert_lex_err(input: &str, needle: &str, position: usize) {
        let err = lex_err(input);
        assert!(err.message.contains(needle), "message: {}", err.message);
        assert_eq!(err.position, position);
    }

    // --- Individual token types ---

    #[test]
    fn ncname_simple() {
        let expected = vec![(0, IdXPathToken::NCName("foo"), 3)];
        assert_eq!(lex_ok("foo"), expected);
    }

    #[test]
    fn star() {
        let expected = vec![(0, IdXPathToken::Star, 1)];
        assert_eq!(lex_ok("*"), expected);
    }

    #[test]
    fn colon() {
        // A single colon between two names (namespace separator context).
        let expected = vec![
            (0, IdXPathToken::NCName("ns"), 2),
            (2, IdXPathToken::Colon, 3),
            (3, IdXPathToken::NCName("local"), 8),
        ];
        assert_eq!(lex_ok("ns:local"), expected);
    }

    #[test]
    fn slash() {
        let expected = vec![(0, IdXPathToken::Slash, 1)];
        assert_eq!(lex_ok("/"), expected);
    }

    #[test]
    fn double_slash() {
        let expected = vec![(0, IdXPathToken::DoubleSlash, 2)];
        assert_eq!(lex_ok("//"), expected);
    }

    #[test]
    fn double_colon() {
        let expected = vec![(0, IdXPathToken::DoubleColon, 2)];
        assert_eq!(lex_ok("::"), expected);
    }

    #[test]
    fn pipe() {
        let expected = vec![(0, IdXPathToken::Pipe, 1)];
        assert_eq!(lex_ok("|"), expected);
    }

    #[test]
    fn dot() {
        let expected = vec![(0, IdXPathToken::Dot, 1)];
        assert_eq!(lex_ok("."), expected);
    }

    #[test]
    fn at() {
        let expected = vec![(0, IdXPathToken::At, 1)];
        assert_eq!(lex_ok("@"), expected);
    }

    // --- Full expressions ---

    #[test]
    fn descendant_path() {
        let expected = vec![
            (0, IdXPathToken::Dot, 1),
            (1, IdXPathToken::DoubleSlash, 3),
            (3, IdXPathToken::NCName("foo"), 6),
            (6, IdXPathToken::Slash, 7),
            (7, IdXPathToken::NCName("bar"), 10),
        ];
        assert_eq!(lex_ok(".//foo/bar"), expected);
    }

    #[test]
    fn child_axis() {
        let expected = vec![
            (0, IdXPathToken::NCName("child"), 5),
            (5, IdXPathToken::DoubleColon, 7),
            (7, IdXPathToken::NCName("foo"), 10),
        ];
        assert_eq!(lex_ok("child::foo"), expected);
    }

    #[test]
    fn namespace_wildcard() {
        let expected = vec![
            (0, IdXPathToken::NCName("ns"), 2),
            (2, IdXPathToken::Colon, 3),
            (3, IdXPathToken::Star, 4),
        ];
        assert_eq!(lex_ok("ns:*"), expected);
    }

    #[test]
    fn attribute_path() {
        let expected = vec![
            (0, IdXPathToken::Dot, 1),
            (1, IdXPathToken::DoubleSlash, 3),
            (3, IdXPathToken::NCName("foo"), 6),
            (6, IdXPathToken::Slash, 7),
            (7, IdXPathToken::At, 8),
            (8, IdXPathToken::NCName("bar"), 11),
        ];
        assert_eq!(lex_ok(".//foo/@bar"), expected);
    }

    // --- XPath keywords emitted as NCName ---

    #[test]
    fn keywords_as_ncname() {
        const KEYWORDS: [&str; 6] = ["and", "or", "div", "mod", "child", "attribute"];
        for &kw in KEYWORDS.iter() {
            let expected = vec![(0, IdXPathToken::NCName(kw), kw.len())];
            assert_eq!(lex_ok(kw), expected);
        }
    }

    // --- Error cases ---

    #[test]
    fn error_parent_axis() {
        assert_lex_err("..", "parent axis", 0);
    }

    #[test]
    fn error_predicate() {
        assert_lex_err("foo[1]", "predicates", 3);
    }

    #[test]
    fn error_function_call() {
        assert_lex_err("fn(", "function calls", 2);
    }

    // --- Edge cases ---

    #[test]
    fn empty_input() {
        assert!(lex_ok("").is_empty());
    }

    #[test]
    fn whitespace_only() {
        assert!(lex_ok("   \t\n  ").is_empty());
    }

    #[test]
    fn span_correctness_with_whitespace() {
        // Spans must exclude the surrounding whitespace.
        let expected = vec![
            (2, IdXPathToken::NCName("foo"), 5),
            (6, IdXPathToken::Slash, 7),
            (8, IdXPathToken::NCName("bar"), 11),
        ];
        assert_eq!(lex_ok("  foo / bar  "), expected);
    }
}