udled_tokenizers/
ident.rs

1use udled::{any, token::Spanned, Lex, Span, StringExt, Tokenizer};
2
/// Tokenizer for identifiers such as `foo_bar1`.
///
/// The type carries no state; the exact character rules live in its
/// [`Tokenizer`] implementation.
#[derive(Clone, Copy, Debug, Default)]
pub struct Ident;
6
7impl Tokenizer for Ident {
8    type Token<'a> = Lex<'a>;
9
10    fn to_token<'a>(
11        &self,
12        reader: &mut udled::Reader<'_, 'a>,
13    ) -> Result<Self::Token<'a>, udled::Error> {
14        let start_idx = reader.position();
15
16        let mut end_idx = start_idx;
17
18        let Some(first) = reader.peek_ch() else {
19            return Err(reader.error("expected identifier"));
20        };
21
22        if !first.is_alphabetic() && first != "_" {
23            return Err(reader.error("expected identifier"));
24        }
25
26        loop {
27            let Some(ch) = reader.peek_ch() else {
28                break;
29            };
30
31            if ch == "\0" {
32                break;
33            }
34
35            if !ch.is_ascii_alphanumeric() && ch != "_" {
36                break;
37            }
38
39            end_idx += 1;
40
41            reader.eat_ch()?;
42        }
43
44        if start_idx == end_idx {
45            return Err(reader.error("expected identifier"));
46        }
47
48        let ret = &reader.source()[start_idx..reader.position()];
49
50        Ok(Lex::new(ret, Span::new(start_idx, reader.position())))
51    }
52
53    fn peek<'a>(&self, reader: &mut udled::Reader<'_, '_>) -> Result<bool, udled::Error> {
54        let ch = reader.eat_ch()?;
55        Ok(ch.is_alphabetic() || ch == "_")
56    }
57}
58
/// Tokenizer for XML-style tag and attribute names such as `custom-tag`
/// or `data-id2`.
#[derive(Debug, Clone, Copy, Default)]
pub struct XmlIdent;
63
64impl Tokenizer for XmlIdent {
65    type Token<'a> = Lex<'a>;
66
67    fn to_token<'a>(
68        &self,
69        reader: &mut udled::Reader<'_, 'a>,
70    ) -> Result<Self::Token<'a>, udled::Error> {
71        let start_tokenizer = any!(
72            ':',
73            'a'..='z',
74            'A'..='Z',
75            '\u{2070}'..='\u{218F}',
76            '\u{2C00}'..='\u{2FEF}',
77            '\u{3001}'..='\u{D7FF}',
78            '\u{F900}'..='\u{FDCF}',
79            '\u{FDF0}'..='\u{FFFD}'
80        );
81        let rest_tokenizer = any!(
82            '0'..='9',
83            '-',
84            ".",
85            '_',
86            '\u{00B7}',
87            '\u{0300}'..='\u{036F}',
88            '\u{203F}'..='\u{2040}'
89        );
90
91        let all = any!(&start_tokenizer, rest_tokenizer);
92
93        let start = reader.parse(Spanned(&start_tokenizer))?;
94        let mut end = start;
95
96        loop {
97            if reader.eof() {
98                break;
99            }
100
101            if !reader.peek(&all)? {
102                break;
103            }
104
105            end = reader.parse(Spanned(&all))?;
106        }
107
108        let span = start + end;
109
110        if let Some(content) = span.slice(reader.source()) {
111            Ok(Lex::new(content, span))
112        } else {
113            Err(reader.error("Invalid range"))
114        }
115    }
116
117    fn peek(&self, reader: &mut udled::Reader<'_, '_>) -> Result<bool, udled::Error> {
118        reader.peek(any!(
119            ':',
120            'a'..='z',
121            'A'..='Z',
122            '\u{2070}'..='\u{218F}',
123            '\u{2C00}'..='\u{2FEF}',
124            '\u{3001}'..='\u{D7FF}',
125            '\u{F900}'..='\u{FDCF}',
126            '\u{FDF0}'..='\u{FFFD}'
127        ))
128    }
129}
130
#[cfg(test)]
mod test {
    use udled::{token::Ws, Input, Lex, Span};

    use super::{Ident, XmlIdent};

    /// Three whitespace-separated XML names are lexed with the expected spans.
    #[test]
    fn xml_ident() {
        let mut input = Input::new("div custom-tag data-id2");

        let parsed = input.parse((XmlIdent, Ws, XmlIdent, Ws, XmlIdent)).unwrap();
        let expected = (
            Lex::new("div", Span::new(0, 3)),
            Span::new(3, 4),
            Lex::new("custom-tag", Span::new(4, 14)),
            Span::new(14, 15),
            Lex::new("data-id2", Span::new(15, 23)),
        );

        assert_eq!(parsed, expected);
    }

    /// Only the leading identifier is consumed; lexing stops at the space.
    #[test]
    fn ident() {
        let mut input = Input::new("Ident other");

        let token = input.parse(Ident).unwrap();

        assert_eq!(token, Lex::new("Ident", Span::new(0, 5)));
    }
}