1use std::iter::Peekable;
4use std::str::Chars;
5
/// A lexical token of the compact schema syntax.
///
/// Derives `Eq` alongside `PartialEq` (every payload is `Eq`), so tokens
/// can be used in hash-based collections and compared exhaustively.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// The `namespace` keyword.
    Namespace,
    /// The `default` keyword.
    Default,
    /// The `element` keyword.
    Element,
    /// The `attribute` keyword.
    Attribute,
    /// The `empty` keyword.
    Empty,
    /// The `string` keyword.
    String,
    /// The `mixed` keyword.
    Mixed,
    /// The `list` keyword.
    List,
    /// The `text` keyword.
    Text,
    /// A non-keyword identifier (letters, digits, `_`, `-`).
    Ident(String),
    /// A double-quoted string literal, with escapes resolved.
    QuotedString(String),
    /// `=`
    Equals,
    /// `,`
    Comma,
    /// `|`
    Pipe,
    /// `&`
    Ampersand,
    /// `?`
    Question,
    /// `*`
    Star,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `:`
    Colon,
    /// A `##` documentation comment's text.
    DocComment(String),
    /// End of input; always the final token of a successful lex.
    Eof,
}
41
/// A streaming lexer over schema source text.
///
/// Construct with [`Lexer::new`] and consume with `tokenize`.
pub struct Lexer<'a> {
    // One-character-lookahead iterator over the source text.
    input: Peekable<Chars<'a>>,
    // 1-based line counter, advanced on every '\n'; used in errors.
    current_line: usize,
}
47
48impl<'a> Lexer<'a> {
49 pub fn new(input: &'a str) -> Self {
50 Self {
51 input: input.chars().peekable(),
52 current_line: 1,
53 }
54 }
55
56 pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
57 let mut tokens = Vec::new();
58 loop {
59 let token = self.next_token()?;
60 if token == Token::Eof {
61 tokens.push(token);
62 break;
63 }
64 tokens.push(token);
65 }
66 Ok(tokens)
67 }
68
69 fn next_token(&mut self) -> Result<Token, LexError> {
70 self.skip_whitespace_and_comments();
71
72 let Some(ch) = self.input.peek().copied() else {
73 return Ok(Token::Eof);
74 };
75
76 match ch {
77 '=' => {
78 self.input.next();
79 Ok(Token::Equals)
80 }
81 ',' => {
82 self.input.next();
83 Ok(Token::Comma)
84 }
85 '|' => {
86 self.input.next();
87 Ok(Token::Pipe)
88 }
89 '&' => {
90 self.input.next();
91 Ok(Token::Ampersand)
92 }
93 '?' => {
94 self.input.next();
95 Ok(Token::Question)
96 }
97 '*' => {
98 self.input.next();
99 Ok(Token::Star)
100 }
101 '+' => {
102 self.input.next();
103 Ok(Token::Plus)
104 }
105 '-' => {
106 self.input.next();
107 Ok(Token::Minus)
108 }
109 '{' => {
110 self.input.next();
111 Ok(Token::LBrace)
112 }
113 '}' => {
114 self.input.next();
115 Ok(Token::RBrace)
116 }
117 '(' => {
118 self.input.next();
119 Ok(Token::LParen)
120 }
121 ')' => {
122 self.input.next();
123 Ok(Token::RParen)
124 }
125 ':' => {
126 self.input.next();
127 Ok(Token::Colon)
128 }
129 '"' => self.read_quoted_string(),
130 _ if ch.is_alphabetic() || ch == '_' => self.read_ident(),
131 _ => Err(LexError::UnexpectedChar(ch, self.current_line)),
132 }
133 }
134
135 fn skip_whitespace_and_comments(&mut self) {
136 loop {
137 while let Some(&ch) = self.input.peek() {
139 if ch == '\n' {
140 self.current_line += 1;
141 self.input.next();
142 } else if ch.is_whitespace() {
143 self.input.next();
144 } else {
145 break;
146 }
147 }
148
149 if self.input.peek() == Some(&'#') {
151 self.input.next(); let _is_doc = self.input.peek() == Some(&'#');
154 if _is_doc {
155 self.input.next();
156 }
157 while let Some(&ch) = self.input.peek() {
159 if ch == '\n' {
160 self.current_line += 1;
161 self.input.next();
162 break;
163 }
164 self.input.next();
165 }
166 } else {
167 break;
168 }
169 }
170 }
171
172 fn read_quoted_string(&mut self) -> Result<Token, LexError> {
173 self.input.next(); let mut s = String::new();
175 loop {
176 match self.input.next() {
177 Some('"') => break,
178 Some('\\') => {
179 match self.input.next() {
181 Some('n') => s.push('\n'),
182 Some('t') => s.push('\t'),
183 Some('\\') => s.push('\\'),
184 Some('"') => s.push('"'),
185 Some(ch) => s.push(ch),
186 None => return Err(LexError::UnterminatedString(self.current_line)),
187 }
188 }
189 Some('\n') => {
190 self.current_line += 1;
191 s.push('\n');
192 }
193 Some(ch) => s.push(ch),
194 None => return Err(LexError::UnterminatedString(self.current_line)),
195 }
196 }
197 Ok(Token::QuotedString(s))
198 }
199
200 fn read_ident(&mut self) -> Result<Token, LexError> {
201 let mut s = String::new();
202 while let Some(&ch) = self.input.peek() {
203 if ch.is_alphanumeric() || ch == '_' || ch == '-' {
204 s.push(ch);
205 self.input.next();
206 } else {
207 break;
208 }
209 }
210 let token = match s.as_str() {
211 "namespace" => Token::Namespace,
212 "default" => Token::Default,
213 "element" => Token::Element,
214 "attribute" => Token::Attribute,
215 "empty" => Token::Empty,
216 "string" => Token::String,
217 "mixed" => Token::Mixed,
218 "list" => Token::List,
219 "text" => Token::Text,
220 _ => Token::Ident(s),
221 };
222 Ok(token)
223 }
224}
225
/// Errors produced while tokenizing schema input.
///
/// Line numbers in the payloads are 1-based.
#[derive(Debug, thiserror::Error)]
pub enum LexError {
    /// A character that cannot begin any token was encountered.
    #[error("unexpected character '{0}' at line {1}")]
    UnexpectedChar(char, usize),
    /// Input ended inside a double-quoted string literal.
    #[error("unterminated string at line {0}")]
    UnterminatedString(usize),
}
233
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input`, panicking on any lexer error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).tokenize().expect("lexing should succeed")
    }

    #[test]
    fn test_simple_definition() {
        assert_eq!(
            lex("w_CT_Empty = empty"),
            [
                Token::Ident("w_CT_Empty".to_string()),
                Token::Equals,
                Token::Empty,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_attribute() {
        assert_eq!(
            lex("attribute w:val { s_ST_String }"),
            [
                Token::Attribute,
                Token::Ident("w".to_string()),
                Token::Colon,
                Token::Ident("val".to_string()),
                Token::LBrace,
                Token::Ident("s_ST_String".to_string()),
                Token::RBrace,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_choice() {
        assert_eq!(
            lex(r#"string "foo" | string "bar""#),
            [
                Token::String,
                Token::QuotedString("foo".to_string()),
                Token::Pipe,
                Token::String,
                Token::QuotedString("bar".to_string()),
                Token::Eof,
            ]
        );
    }
}