1use super::XetoError;
4
/// The kinds of lexical tokens produced by the Xeto lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenType {
    /// `:`
    Colon,
    /// `::` (qualifier separator, e.g. `ph::Ahu`)
    ColonColon,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `<`
    LAngle,
    /// `>`
    RAngle,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `?`
    Question,
    /// `*`
    Star,

    /// Identifier: letters, digits, underscores; starts with a letter or `_`.
    Ident,
    /// Double-quoted string literal; the token value holds the unescaped text.
    Str,
    /// Numeric literal: optional sign, fraction, exponent, and unit suffix.
    Number,
    /// `//` line comment; the token value holds the text after the slashes.
    Comment,

    /// One or more consecutive line breaks, collapsed to a single token.
    Newline,
    /// End of input; always the final token of a stream.
    Eof,
}
46
/// A single lexical token together with its source position.
#[derive(Debug, Clone)]
pub struct Token {
    /// The kind of token.
    pub typ: TokenType,
    /// The token text. For `Str` this is the unescaped contents (no quotes);
    /// for `Comment` it is the text after `//` with one leading space stripped.
    pub val: String,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column at which the token starts.
    pub col: usize,
}
59
/// Hand-written lexer for Xeto source text.
///
/// Operates over a pre-collected `Vec<char>` so that each character
/// (including multi-byte ones) advances the column count by exactly one.
pub struct XetoLexer {
    // Source text as individual characters.
    chars: Vec<char>,
    // Index of the next unread character in `chars`.
    pos: usize,
    // Current 1-based line number.
    line: usize,
    // Current 1-based column number.
    col: usize,
}
67
68impl XetoLexer {
69 pub fn new(source: &str) -> Self {
71 Self {
72 chars: source.chars().collect(),
73 pos: 0,
74 line: 1,
75 col: 1,
76 }
77 }
78
79 pub fn tokenize(&mut self) -> Result<Vec<Token>, XetoError> {
81 let mut tokens = Vec::new();
82 let mut last_was_newline = false;
83
84 loop {
85 self.skip_spaces();
86
87 if self.at_end() {
88 tokens.push(Token {
89 typ: TokenType::Eof,
90 val: String::new(),
91 line: self.line,
92 col: self.col,
93 });
94 break;
95 }
96
97 let ch = self.peek();
98
99 if ch == '\n' || ch == '\r' {
101 self.consume_newline();
102 if !last_was_newline {
103 tokens.push(Token {
104 typ: TokenType::Newline,
105 val: "\n".to_string(),
106 line: self.line - 1,
107 col: 1,
108 });
109 last_was_newline = true;
110 }
111 continue;
112 }
113
114 last_was_newline = false;
115
116 if ch == '/' && self.peek_at(1) == Some('/') {
118 let tok = self.read_comment();
119 tokens.push(tok);
120 continue;
121 }
122
123 if ch == '"' {
125 let tok = self.read_string()?;
126 tokens.push(tok);
127 continue;
128 }
129
130 if ch.is_ascii_digit()
132 || (ch == '-' && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()))
133 {
134 let tok = self.read_number();
135 tokens.push(tok);
136 continue;
137 }
138
139 if ch.is_alphabetic() || ch == '_' {
141 let tok = self.read_ident();
142 tokens.push(tok);
143 continue;
144 }
145
146 let (typ, val) = match ch {
148 ':' => {
149 if self.peek_at(1) == Some(':') {
150 self.advance();
151 self.advance();
152 (TokenType::ColonColon, "::".to_string())
153 } else {
154 self.advance();
155 (TokenType::Colon, ":".to_string())
156 }
157 }
158 '{' => {
159 self.advance();
160 (TokenType::LBrace, "{".to_string())
161 }
162 '}' => {
163 self.advance();
164 (TokenType::RBrace, "}".to_string())
165 }
166 '<' => {
167 self.advance();
168 (TokenType::LAngle, "<".to_string())
169 }
170 '>' => {
171 self.advance();
172 (TokenType::RAngle, ">".to_string())
173 }
174 ',' => {
175 self.advance();
176 (TokenType::Comma, ",".to_string())
177 }
178 '.' => {
179 self.advance();
180 (TokenType::Dot, ".".to_string())
181 }
182 '?' => {
183 self.advance();
184 (TokenType::Question, "?".to_string())
185 }
186 '*' => {
187 self.advance();
188 (TokenType::Star, "*".to_string())
189 }
190 other => {
191 return Err(XetoError::Parse {
192 line: self.line,
193 col: self.col,
194 message: format!("unexpected character: '{}'", other),
195 });
196 }
197 };
198
199 let line = self.line;
200 let col_start = self.col - val.len();
202 tokens.push(Token {
203 typ,
204 val,
205 line,
206 col: col_start,
207 });
208 }
209
210 Ok(tokens)
211 }
212
213 fn at_end(&self) -> bool {
216 self.pos >= self.chars.len()
217 }
218
219 fn peek(&self) -> char {
220 self.chars[self.pos]
221 }
222
223 fn peek_at(&self, offset: usize) -> Option<char> {
224 self.chars.get(self.pos + offset).copied()
225 }
226
227 fn advance(&mut self) -> char {
228 let ch = self.chars[self.pos];
229 self.pos += 1;
230 if ch == '\n' {
231 self.line += 1;
232 self.col = 1;
233 } else {
234 self.col += 1;
235 }
236 ch
237 }
238
239 fn skip_spaces(&mut self) {
240 while !self.at_end() {
241 let ch = self.peek();
242 if ch == ' ' || ch == '\t' || ch == '\r' {
243 self.advance();
244 } else {
245 break;
246 }
247 }
248 }
249
250 fn consume_newline(&mut self) {
251 if self.peek() == '\r' {
252 self.advance();
253 if !self.at_end() && self.peek() == '\n' {
254 self.advance();
255 }
256 } else {
257 self.advance();
258 }
259 }
260
261 fn read_comment(&mut self) -> Token {
262 let line = self.line;
263 let col = self.col;
264 self.advance();
266 self.advance();
267 if !self.at_end() && self.peek() == ' ' {
269 self.advance();
270 }
271 let mut text = String::new();
272 while !self.at_end() && self.peek() != '\n' && self.peek() != '\r' {
273 text.push(self.advance());
274 }
275 Token {
276 typ: TokenType::Comment,
277 val: text,
278 line,
279 col,
280 }
281 }
282
283 fn read_string(&mut self) -> Result<Token, XetoError> {
284 let line = self.line;
285 let col = self.col;
286 self.advance(); let mut text = String::new();
288 loop {
289 if self.at_end() {
290 return Err(XetoError::Parse {
291 line,
292 col,
293 message: "unterminated string literal".to_string(),
294 });
295 }
296 let ch = self.advance();
297 if ch == '"' {
298 break;
299 }
300 if ch == '\\' {
301 if self.at_end() {
302 return Err(XetoError::Parse {
303 line,
304 col,
305 message: "unterminated escape sequence".to_string(),
306 });
307 }
308 let esc = self.advance();
309 match esc {
310 'n' => text.push('\n'),
311 't' => text.push('\t'),
312 '\\' => text.push('\\'),
313 '"' => text.push('"'),
314 other => {
315 text.push('\\');
316 text.push(other);
317 }
318 }
319 } else {
320 text.push(ch);
321 }
322 }
323 Ok(Token {
324 typ: TokenType::Str,
325 val: text,
326 line,
327 col,
328 })
329 }
330
331 fn read_number(&mut self) -> Token {
332 let line = self.line;
333 let col = self.col;
334 let mut text = String::new();
335
336 if !self.at_end() && self.peek() == '-' {
338 text.push(self.advance());
339 }
340
341 while !self.at_end() && self.peek().is_ascii_digit() {
343 text.push(self.advance());
344 }
345
346 if !self.at_end()
348 && self.peek() == '.'
349 && self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
350 {
351 text.push(self.advance()); while !self.at_end() && self.peek().is_ascii_digit() {
353 text.push(self.advance());
354 }
355 }
356
357 if !self.at_end() && (self.peek() == 'e' || self.peek() == 'E') {
359 text.push(self.advance());
360 if !self.at_end() && (self.peek() == '+' || self.peek() == '-') {
361 text.push(self.advance());
362 }
363 while !self.at_end() && self.peek().is_ascii_digit() {
364 text.push(self.advance());
365 }
366 }
367
368 while !self.at_end() {
370 let ch = self.peek();
371 if ch.is_alphabetic() || ch == '%' || ch == '/' || ch == '\u{00b0}' {
372 text.push(self.advance());
373 } else {
374 break;
375 }
376 }
377
378 Token {
379 typ: TokenType::Number,
380 val: text,
381 line,
382 col,
383 }
384 }
385
386 fn read_ident(&mut self) -> Token {
387 let line = self.line;
388 let col = self.col;
389 let mut text = String::new();
390
391 while !self.at_end() {
392 let ch = self.peek();
393 if ch.is_alphanumeric() || ch == '_' {
394 text.push(self.advance());
395 } else {
396 break;
397 }
398 }
399
400 Token {
401 typ: TokenType::Ident,
402 val: text,
403 line,
404 col,
405 }
406 }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    // Lexes `source`, panicking on any lexer error.
    fn lex(source: &str) -> Vec<Token> {
        let mut lexer = XetoLexer::new(source);
        lexer.tokenize().unwrap()
    }

    // Projects a token slice down to its token types.
    fn types(tokens: &[Token]) -> Vec<&TokenType> {
        tokens.iter().map(|t| &t.typ).collect()
    }

    #[test]
    fn tokenize_identifiers() {
        let tokens = lex("foo bar_baz Ahu123");
        let idents: Vec<&str> = tokens
            .iter()
            .filter(|t| t.typ == TokenType::Ident)
            .map(|t| t.val.as_str())
            .collect();
        assert_eq!(idents, vec!["foo", "bar_baz", "Ahu123"]);
    }

    #[test]
    fn tokenize_strings() {
        let tokens = lex(r#""hello" "world""#);
        let strs: Vec<&str> = tokens
            .iter()
            .filter(|t| t.typ == TokenType::Str)
            .map(|t| t.val.as_str())
            .collect();
        assert_eq!(strs, vec!["hello", "world"]);
    }

    #[test]
    fn string_escape_sequences() {
        let tokens = lex(r#""line\nnew\ttab\\back\"quote""#);
        let s = &tokens[0];
        assert_eq!(s.typ, TokenType::Str);
        assert_eq!(s.val, "line\nnew\ttab\\back\"quote");
    }

    #[test]
    fn tokenize_numbers() {
        let tokens = lex("42 72.5 -10");
        let nums: Vec<&str> = tokens
            .iter()
            .filter(|t| t.typ == TokenType::Number)
            .map(|t| t.val.as_str())
            .collect();
        assert_eq!(nums, vec!["42", "72.5", "-10"]);
    }

    #[test]
    fn token_positions() {
        let tokens = lex("foo : bar");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
        assert_eq!(tokens[0].typ, TokenType::Ident);

        assert_eq!(tokens[1].typ, TokenType::Colon);
        assert_eq!(tokens[1].col, 5);

        assert_eq!(tokens[2].typ, TokenType::Ident);
        assert_eq!(tokens[2].col, 7);
    }

    #[test]
    fn comments() {
        let tokens = lex("// this is a comment\nfoo");
        assert_eq!(tokens[0].typ, TokenType::Comment);
        assert_eq!(tokens[0].val, "this is a comment");
        assert_eq!(tokens[1].typ, TokenType::Newline);
        assert_eq!(tokens[2].typ, TokenType::Ident);
        assert_eq!(tokens[2].val, "foo");
    }

    #[test]
    fn newlines_collapsed() {
        let tokens = lex("foo\n\n\nbar");
        let typs = types(&tokens);
        assert_eq!(
            typs,
            vec![
                &TokenType::Ident,
                &TokenType::Newline,
                &TokenType::Ident,
                &TokenType::Eof,
            ]
        );
    }

    #[test]
    fn delimiters() {
        let tokens = lex(": :: { } < > , . ? *");
        let typs: Vec<&TokenType> = tokens
            .iter()
            .filter(|t| t.typ != TokenType::Eof)
            .map(|t| &t.typ)
            .collect();
        assert_eq!(
            typs,
            vec![
                &TokenType::Colon,
                &TokenType::ColonColon,
                &TokenType::LBrace,
                &TokenType::RBrace,
                &TokenType::LAngle,
                &TokenType::RAngle,
                &TokenType::Comma,
                &TokenType::Dot,
                &TokenType::Question,
                &TokenType::Star,
            ]
        );
    }

    #[test]
    fn complex_sequence() {
        let tokens = lex("Ahu : Equip <abstract> {\n discharge\n}");
        let typs: Vec<&TokenType> = tokens
            .iter()
            .filter(|t| t.typ != TokenType::Eof)
            .map(|t| &t.typ)
            .collect();
        assert_eq!(
            typs,
            vec![
                &TokenType::Ident,
                &TokenType::Colon,
                &TokenType::Ident,
                &TokenType::LAngle,
                &TokenType::Ident,
                &TokenType::RAngle,
                &TokenType::LBrace,
                &TokenType::Newline,
                &TokenType::Ident,
                &TokenType::Newline,
                &TokenType::RBrace,
            ]
        );
    }

    #[test]
    fn colon_colon_qualified_name() {
        let tokens = lex("ph::Ahu");
        assert_eq!(tokens[0].typ, TokenType::Ident);
        assert_eq!(tokens[0].val, "ph");
        assert_eq!(tokens[1].typ, TokenType::ColonColon);
        assert_eq!(tokens[2].typ, TokenType::Ident);
        assert_eq!(tokens[2].val, "Ahu");
    }

    #[test]
    fn unterminated_string_error() {
        let mut lexer = XetoLexer::new(r#""hello"#);
        let result = lexer.tokenize();
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("unterminated string"));
    }

    #[test]
    fn unexpected_character_error() {
        let mut lexer = XetoLexer::new("foo @ bar");
        let result = lexer.tokenize();
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("unexpected character"));
    }

    #[test]
    fn number_with_unit() {
        let tokens = lex("72.5kW");
        assert_eq!(tokens[0].typ, TokenType::Number);
        assert_eq!(tokens[0].val, "72.5kW");
    }

    #[test]
    fn number_with_exponent() {
        let tokens = lex("1e3 2.5E-4 1E+10");
        let nums: Vec<&str> = tokens
            .iter()
            .filter(|t| t.typ == TokenType::Number)
            .map(|t| t.val.as_str())
            .collect();
        assert_eq!(nums, vec!["1e3", "2.5E-4", "1E+10"]);
    }

    #[test]
    fn number_exponent_without_fraction() {
        let tokens = lex("1e3");
        assert_eq!(tokens[0].typ, TokenType::Number);
        assert_eq!(tokens[0].val, "1e3");
    }

    #[test]
    fn bare_cr_as_whitespace() {
        let tokens = lex("foo\rbar");
        let idents: Vec<&str> = tokens
            .iter()
            .filter(|t| t.typ == TokenType::Ident)
            .map(|t| t.val.as_str())
            .collect();
        assert_eq!(idents, vec!["foo", "bar"]);
    }
}