/// Every kind of token the lexer can produce. Payload-carrying variants
/// keep the scanned text (or parsed value) inline.
///
/// NOTE: `Eq` cannot be derived because `Number` holds an `f64`.
#[derive(Debug, PartialEq, Clone)]
pub enum TokenType {
    // Stream control / trivia.
    Eof,
    Whitespace,
    /// A `//` line comment; payload is the trimmed comment text.
    Comment(String),
    /// Any character (or sequence) the lexer could not classify.
    Unknown,

    // Literals and names.
    Identifier(String),
    /// A double-quoted string with escapes already resolved.
    String(String),
    Number(f64),

    // Keywords (`on`/`off` are aliases for `true`/`false` — see `read_identifier`).
    True,
    False,
    Null,
    Import,
    From,
    As,

    // Punctuation and operators.
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    LParen,
    RParen,
    Comma,
    Colon,
    /// `::`
    DoubleColon,
    Dot,
    Equals,
    Hash,
    Dollar,
    Ampersand,
    Asterisk,
    /// `...`
    Spread,
}
74
75#[derive(Debug, Clone)]
77pub struct Token {
78 pub ttype: TokenType,
79 pub pos_start: usize,
80 pub pos_end: usize,
81}
82
83impl Token {
84 #[must_use]
85 pub fn new(ttype: TokenType, pos_start: usize, pos_end: usize) -> Token {
86 Token {
87 ttype,
88 pos_start,
89 pos_end,
90 }
91 }
92}
93
/// Streaming lexer over a borrowed input string.
pub struct Lexer<'a> {
    // Peekable character stream over the input; `peek` gives one-char lookahead.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    // Current byte offset into the input (advanced by `len_utf8` per char),
    // used for token start/end positions.
    position: usize,
}
98
99impl<'a> Lexer<'a> {
100 #[must_use]
101 pub fn new(input: &'a str) -> Self {
102 Self {
103 chars: input.chars().peekable(),
104 position: 0,
105 }
106 }
107
108 pub fn lex(&mut self) -> Vec<Token> {
109 let mut tokens = Vec::new();
110 loop {
111 let token = self.next_token();
112 if token.ttype == TokenType::Eof {
113 tokens.push(token);
114 break;
115 }
116 tokens.push(token);
117 }
118 tokens
119 }
120
121 pub fn next_token(&mut self) -> Token {
122 let start_pos = self.position;
123
124 let ttype = if let Some(char) = self.advance() {
125 match char {
126 '{' => TokenType::LBrace,
127 '}' => TokenType::RBrace,
128 '[' => TokenType::LBracket,
129 ']' => TokenType::RBracket,
130 '(' => TokenType::LParen,
131 ')' => TokenType::RParen,
132 ',' => TokenType::Comma,
133 '#' => TokenType::Hash,
134 '$' => TokenType::Dollar,
135 '&' => TokenType::Ampersand,
136 '*' => TokenType::Asterisk,
137 '=' => TokenType::Equals,
138
139 ':' => {
140 if self.peek() == Some(&':') {
141 self.advance();
142 TokenType::DoubleColon
143 } else {
144 TokenType::Colon
145 }
146 }
147 '.' => {
148 if self.peek() == Some(&'.') {
149 self.advance();
150 if self.peek() == Some(&'.') {
151 self.advance();
152 TokenType::Spread
153 } else {
154 TokenType::Unknown
155 }
156 } else {
157 TokenType::Dot
158 }
159 }
160 '/' => {
161 if self.peek() == Some(&'/') {
162 self.read_comment()
163 } else {
164 TokenType::Unknown
165 }
166 }
167 '"' => self.read_string(),
168 c if c.is_whitespace() => self.read_whitespace(),
169 c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier(c),
170 c if c.is_ascii_digit()
171 || (c == '-' && self.peek().is_some_and(char::is_ascii_digit)) =>
172 {
173 self.read_number(c)
174 }
175
176 _ => TokenType::Unknown,
177 }
178 } else {
179 TokenType::Eof
180 };
181
182 Token::new(ttype, start_pos, self.position)
183 }
184
185 fn advance(&mut self) -> Option<char> {
186 let char = self.chars.next();
187 if let Some(c) = char {
188 self.position += c.len_utf8();
189 }
190 char
191 }
192
193 fn peek(&mut self) -> Option<&char> {
194 self.chars.peek()
195 }
196
197 fn read_whitespace(&mut self) -> TokenType {
198 while let Some(c) = self.peek() {
199 if c.is_whitespace() {
200 self.advance();
201 } else {
202 break;
203 }
204 }
205 TokenType::Whitespace
206 }
207
208 fn read_comment(&mut self) -> TokenType {
209 self.advance(); let mut comment_text = String::new();
211 while let Some(c) = self.peek() {
212 if *c == '\n' {
213 break;
214 }
215 comment_text.push(self.advance().unwrap());
216 }
217 TokenType::Comment(comment_text.trim().to_string())
218 }
219
220 fn read_string(&mut self) -> TokenType {
221 let mut value = String::new();
222 loop {
223 match self.peek() {
224 Some('\"') => {
225 self.advance(); return TokenType::String(value);
227 }
228 Some('\\') => {
229 self.advance(); match self.advance() {
231 Some('\"') => value.push('\"'),
232 Some('\\') => value.push('\\'),
233 Some('n') => value.push('\n'),
234 Some('r') => value.push('\r'),
235 Some('t') => value.push('\t'),
236 Some(other) => {
237 value.push('\\');
238 value.push(other);
239 }
240 None => return TokenType::Unknown, }
242 }
243 Some(c) => {
244 value.push(*c);
245 self.advance();
246 }
247 None => return TokenType::Unknown, }
249 }
250 }
251
252 fn read_identifier(&mut self, first_char: char) -> TokenType {
253 let mut ident = String::new();
254 ident.push(first_char);
255
256 while let Some(c) = self.peek() {
257 if c.is_ascii_alphanumeric() || *c == '_' {
258 ident.push(self.advance().unwrap());
259 } else {
260 break;
261 }
262 }
263
264 match ident.as_str() {
265 "true" | "on" => TokenType::True,
266 "false" | "off" => TokenType::False,
267 "null" => TokenType::Null,
268 "import" => TokenType::Import,
269 "from" => TokenType::From,
270 "as" => TokenType::As,
271 _ => TokenType::Identifier(ident),
272 }
273 }
274
275 fn read_number(&mut self, first_char: char) -> TokenType {
276 let mut number_str = String::new();
277 number_str.push(first_char);
278 let mut has_dot = first_char == '.';
279 let mut has_exponent = false;
280
281 while let Some(c) = self.peek() {
282 if c.is_ascii_digit() {
283 number_str.push(self.advance().unwrap());
284 } else if *c == '.' && !has_dot {
285 has_dot = true;
286 number_str.push(self.advance().unwrap());
287 } else if (*c == 'e' || *c == 'E') && !has_exponent {
288 has_exponent = true;
289 number_str.push(self.advance().unwrap());
290 if let Some(sign_char) = self.peek() {
292 if *sign_char == '+' || *sign_char == '-' {
293 number_str.push(self.advance().unwrap());
294 }
295 }
296 } else {
297 break;
298 }
299 }
300
301 if let Ok(num) = number_str.parse::<f64>() {
302 TokenType::Number(num)
303 } else {
304 TokenType::Unknown
305 }
306 }
307}
308
309#[allow(dead_code)]
311pub(crate) fn tokens_to_pretty_string(tokens: &[Token]) -> String {
312 let mut buff: Vec<String> = Vec::with_capacity(tokens.len());
313
314 for token in tokens {
315 buff.push(format!(
316 "{:?}, {}, {}",
317 token.ttype, token.pos_start, token.pos_end
318 ));
319 }
320
321 buff.join("\n")
322}
323
#[cfg(test)]
// Allowed for test ergonomics: fixtures are passed by value and some
// asserts auto-deref explicitly.
#[allow(clippy::needless_pass_by_value)]
#[allow(clippy::explicit_auto_deref)]
mod tests {
    use super::*;

    /// Lexes `input` and asserts the token-type sequence equals `expected`,
    /// ignoring `Whitespace` and `Comment` trivia.
    fn assert_tokens(input: &str, expected: &[TokenType]) {
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        // Drop trivia so fixtures only list meaningful tokens.
        let filtered_tokens: Vec<TokenType> = token_types
            .into_iter()
            .filter(|t| !matches!(t, TokenType::Whitespace | TokenType::Comment(_)))
            .collect();

        assert_eq!(filtered_tokens, expected);
    }

    // Empty input still yields a terminating Eof token.
    #[test]
    fn test_eof() {
        assert_tokens("", &[TokenType::Eof]);
    }

    #[test]
    fn test_single_char_tokens() {
        let input = "{}[](),:#{new_string}*";
        let expected = vec![
            TokenType::LBrace,
            TokenType::RBrace,
            TokenType::LBracket,
            TokenType::RBracket,
            TokenType::LParen,
            TokenType::RParen,
            TokenType::Comma,
            TokenType::Colon,
            TokenType::Hash,
            TokenType::LBrace,
            TokenType::Identifier("new_string".to_string()),
            TokenType::RBrace,
            TokenType::Asterisk,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    // `::` and `...` must win over `:` and `.`.
    #[test]
    fn test_multi_char_operators() {
        let input = ":: ...";
        let expected = vec![TokenType::DoubleColon, TokenType::Spread, TokenType::Eof];
        assert_tokens(input, &expected);
    }

    // Includes the `on`/`off` aliases for `true`/`false`.
    #[test]
    fn test_keywords() {
        let input = "true on false off null import from as";
        let expected = vec![
            TokenType::True,
            TokenType::True,
            TokenType::False,
            TokenType::False,
            TokenType::Null,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers() {
        let input = "foo bar_123 _baz";
        let expected = vec![
            TokenType::Identifier("foo".to_string()),
            TokenType::Identifier("bar_123".to_string()),
            TokenType::Identifier("_baz".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    // Integers, decimals, and a negative literal.
    #[test]
    fn test_numbers() {
        let input = "123 45.67 -10 0.5";
        let expected = vec![
            TokenType::Number(123.0),
            TokenType::Number(45.67),
            TokenType::Number(-10.0),
            TokenType::Number(0.5),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_strings() {
        let input = r#""hello world" "" "another""#;
        let expected = vec![
            TokenType::String("hello world".to_string()),
            TokenType::String(String::new()),
            TokenType::String("another".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    // Escape sequences must be resolved into the payload string.
    #[test]
    fn test_strings_with_escapes() {
        let input = r#""hello \"world\"\t\n\r""#;
        let expected = vec![
            TokenType::String("hello \"world\"\t\n\r".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    // This test checks the RAW stream (no filtering), so trivia tokens
    // and trimmed comment payloads are asserted explicitly.
    #[test]
    fn test_comments_and_whitespace() {
        let input = " // this is a comment\n key: value // another one";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.lex();
        let token_types: Vec<TokenType> = tokens.into_iter().map(|t| t.ttype).collect();

        let expected = vec![
            TokenType::Whitespace,
            TokenType::Comment("this is a comment".to_string()),
            TokenType::Whitespace,
            TokenType::Identifier("key".to_string()),
            TokenType::Colon,
            TokenType::Whitespace,
            TokenType::Identifier("value".to_string()),
            TokenType::Whitespace,
            TokenType::Comment("another one".to_string()),
            TokenType::Eof,
        ];

        assert_eq!(token_types, expected);
    }

    // End-to-end fixture covering anchors (&), references (*), spread,
    // typed assignment (:: ... =), and nested containers.
    #[test]
    fn test_complex_mon_structure() {
        let input = r#"
{
    // Config settings
    service_name: "My App",
    port: 8080,
    is_enabled: on,

    &default_user: {
        permissions: ["READ", "WRITE"],
    },

    admin :: User = {
        ...*default_user,
        name: "Admin",
    }
}
        "#;
        let expected = vec![
            TokenType::LBrace,
            TokenType::Identifier("service_name".to_string()),
            TokenType::Colon,
            TokenType::String("My App".to_string()),
            TokenType::Comma,
            TokenType::Identifier("port".to_string()),
            TokenType::Colon,
            TokenType::Number(8080.0),
            TokenType::Comma,
            TokenType::Identifier("is_enabled".to_string()),
            TokenType::Colon,
            TokenType::True,
            TokenType::Comma,
            TokenType::Ampersand,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Colon,
            TokenType::LBrace,
            TokenType::Identifier("permissions".to_string()),
            TokenType::Colon,
            TokenType::LBracket,
            TokenType::String("READ".to_string()),
            TokenType::Comma,
            TokenType::String("WRITE".to_string()),
            TokenType::RBracket,
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::Comma,
            TokenType::Identifier("admin".to_string()),
            TokenType::DoubleColon,
            TokenType::Identifier("User".to_string()),
            TokenType::Equals,
            TokenType::LBrace,
            TokenType::Spread,
            TokenType::Asterisk,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Comma,
            TokenType::Identifier("name".to_string()),
            TokenType::Colon,
            TokenType::String("Admin".to_string()),
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::RBrace,
            TokenType::Eof,
        ];
        print!("{input}");
        assert_tokens(input, &expected);
    }
}