Skip to main content

haystack_core/xeto/
lexer.rs

1// Xeto lexer -- tokenizes Xeto source text.
2
3use super::XetoError;
4
/// The kinds of token the Xeto lexer can emit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenType {
    // Delimiters
    /// A single colon `:`.
    Colon,
    /// A double colon `::`.
    ColonColon,
    /// An opening brace `{`.
    LBrace,
    /// A closing brace `}`.
    RBrace,
    /// An opening angle bracket `<`.
    LAngle,
    /// A closing angle bracket `>`.
    RAngle,
    /// A comma `,`.
    Comma,
    /// A dot `.`.
    Dot,
    /// A question mark `?`.
    Question,
    /// An asterisk `*`.
    Star,

    // Literals
    /// An identifier: a letter or underscore followed by letters, digits,
    /// or underscores.
    Ident,
    /// A double-quoted string literal.
    Str,
    /// A numeric literal: integer or float, optionally followed by a unit
    /// suffix.
    Number,
    /// A line comment (`// ...` through end of line).
    Comment,

    // Special
    /// A line break; runs of consecutive newlines collapse into one.
    Newline,
    /// End of input.
    Eof,
}
46
/// A single token produced by the Xeto lexer.
#[derive(Debug, Clone)]
pub struct Token {
    /// Token type.
    pub typ: TokenType,
    /// Token text. For `Str` this is the unescaped contents without the
    /// surrounding quotes; for `Comment` it is the text after `//` and one
    /// optional leading space; it is empty for `Eof`.
    pub val: String,
    /// 1-based line number where the token starts.
    pub line: usize,
    /// 1-based column number where the token starts.
    pub col: usize,
}
59
/// Tokenizer for Xeto source text.
///
/// Walks a pre-collected `Vec<char>` by index while tracking 1-based
/// line/column positions for token locations and error reporting.
pub struct XetoLexer {
    // Source text as individual chars; enables O(1) indexed peeking.
    chars: Vec<char>,
    // Index of the next unread char in `chars`.
    pos: usize,
    // Current 1-based line number.
    line: usize,
    // Current 1-based column number.
    col: usize,
}
67
68impl XetoLexer {
69    /// Create a new lexer for the given source text.
70    pub fn new(source: &str) -> Self {
71        Self {
72            chars: source.chars().collect(),
73            pos: 0,
74            line: 1,
75            col: 1,
76        }
77    }
78
79    /// Tokenize the entire source, returning a list of tokens.
80    pub fn tokenize(&mut self) -> Result<Vec<Token>, XetoError> {
81        let mut tokens = Vec::new();
82        let mut last_was_newline = false;
83
84        loop {
85            self.skip_spaces();
86
87            if self.at_end() {
88                tokens.push(Token {
89                    typ: TokenType::Eof,
90                    val: String::new(),
91                    line: self.line,
92                    col: self.col,
93                });
94                break;
95            }
96
97            let ch = self.peek();
98
99            // Newlines
100            if ch == '\n' || ch == '\r' {
101                self.consume_newline();
102                if !last_was_newline {
103                    tokens.push(Token {
104                        typ: TokenType::Newline,
105                        val: "\n".to_string(),
106                        line: self.line - 1,
107                        col: 1,
108                    });
109                    last_was_newline = true;
110                }
111                continue;
112            }
113
114            last_was_newline = false;
115
116            // Comments
117            if ch == '/' && self.peek_at(1) == Some('/') {
118                let tok = self.read_comment();
119                tokens.push(tok);
120                continue;
121            }
122
123            // String literals
124            if ch == '"' {
125                let tok = self.read_string()?;
126                tokens.push(tok);
127                continue;
128            }
129
130            // Numbers
131            if ch.is_ascii_digit()
132                || (ch == '-' && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()))
133            {
134                let tok = self.read_number();
135                tokens.push(tok);
136                continue;
137            }
138
139            // Identifiers
140            if ch.is_alphabetic() || ch == '_' {
141                let tok = self.read_ident();
142                tokens.push(tok);
143                continue;
144            }
145
146            // Delimiters
147            let (typ, val) = match ch {
148                ':' => {
149                    if self.peek_at(1) == Some(':') {
150                        self.advance();
151                        self.advance();
152                        (TokenType::ColonColon, "::".to_string())
153                    } else {
154                        self.advance();
155                        (TokenType::Colon, ":".to_string())
156                    }
157                }
158                '{' => {
159                    self.advance();
160                    (TokenType::LBrace, "{".to_string())
161                }
162                '}' => {
163                    self.advance();
164                    (TokenType::RBrace, "}".to_string())
165                }
166                '<' => {
167                    self.advance();
168                    (TokenType::LAngle, "<".to_string())
169                }
170                '>' => {
171                    self.advance();
172                    (TokenType::RAngle, ">".to_string())
173                }
174                ',' => {
175                    self.advance();
176                    (TokenType::Comma, ",".to_string())
177                }
178                '.' => {
179                    self.advance();
180                    (TokenType::Dot, ".".to_string())
181                }
182                '?' => {
183                    self.advance();
184                    (TokenType::Question, "?".to_string())
185                }
186                '*' => {
187                    self.advance();
188                    (TokenType::Star, "*".to_string())
189                }
190                other => {
191                    return Err(XetoError::Parse {
192                        line: self.line,
193                        col: self.col,
194                        message: format!("unexpected character: '{}'", other),
195                    });
196                }
197            };
198
199            let line = self.line;
200            // col for delimiter was before the advance(s); compute it
201            let col_start = self.col - val.len();
202            tokens.push(Token {
203                typ,
204                val,
205                line,
206                col: col_start,
207            });
208        }
209
210        Ok(tokens)
211    }
212
213    // --- internal helpers ---
214
215    fn at_end(&self) -> bool {
216        self.pos >= self.chars.len()
217    }
218
219    fn peek(&self) -> char {
220        self.chars[self.pos]
221    }
222
223    fn peek_at(&self, offset: usize) -> Option<char> {
224        self.chars.get(self.pos + offset).copied()
225    }
226
227    fn advance(&mut self) -> char {
228        let ch = self.chars[self.pos];
229        self.pos += 1;
230        if ch == '\n' {
231            self.line += 1;
232            self.col = 1;
233        } else {
234            self.col += 1;
235        }
236        ch
237    }
238
239    fn skip_spaces(&mut self) {
240        while !self.at_end() {
241            let ch = self.peek();
242            if ch == ' ' || ch == '\t' || ch == '\r' {
243                self.advance();
244            } else {
245                break;
246            }
247        }
248    }
249
250    fn consume_newline(&mut self) {
251        if self.peek() == '\r' {
252            self.advance();
253            if !self.at_end() && self.peek() == '\n' {
254                self.advance();
255            }
256        } else {
257            self.advance();
258        }
259    }
260
261    fn read_comment(&mut self) -> Token {
262        let line = self.line;
263        let col = self.col;
264        // Skip the two slashes
265        self.advance();
266        self.advance();
267        // Skip optional leading space
268        if !self.at_end() && self.peek() == ' ' {
269            self.advance();
270        }
271        let mut text = String::new();
272        while !self.at_end() && self.peek() != '\n' && self.peek() != '\r' {
273            text.push(self.advance());
274        }
275        Token {
276            typ: TokenType::Comment,
277            val: text,
278            line,
279            col,
280        }
281    }
282
283    fn read_string(&mut self) -> Result<Token, XetoError> {
284        let line = self.line;
285        let col = self.col;
286        self.advance(); // opening quote
287        let mut text = String::new();
288        loop {
289            if self.at_end() {
290                return Err(XetoError::Parse {
291                    line,
292                    col,
293                    message: "unterminated string literal".to_string(),
294                });
295            }
296            let ch = self.advance();
297            if ch == '"' {
298                break;
299            }
300            if ch == '\\' {
301                if self.at_end() {
302                    return Err(XetoError::Parse {
303                        line,
304                        col,
305                        message: "unterminated escape sequence".to_string(),
306                    });
307                }
308                let esc = self.advance();
309                match esc {
310                    'n' => text.push('\n'),
311                    't' => text.push('\t'),
312                    '\\' => text.push('\\'),
313                    '"' => text.push('"'),
314                    other => {
315                        text.push('\\');
316                        text.push(other);
317                    }
318                }
319            } else {
320                text.push(ch);
321            }
322        }
323        Ok(Token {
324            typ: TokenType::Str,
325            val: text,
326            line,
327            col,
328        })
329    }
330
331    fn read_number(&mut self) -> Token {
332        let line = self.line;
333        let col = self.col;
334        let mut text = String::new();
335
336        // Optional leading minus
337        if !self.at_end() && self.peek() == '-' {
338            text.push(self.advance());
339        }
340
341        // Integer part
342        while !self.at_end() && self.peek().is_ascii_digit() {
343            text.push(self.advance());
344        }
345
346        // Fractional part
347        if !self.at_end()
348            && self.peek() == '.'
349            && self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
350        {
351            text.push(self.advance()); // '.'
352            while !self.at_end() && self.peek().is_ascii_digit() {
353                text.push(self.advance());
354            }
355        }
356
357        // Exponent part (scientific notation)
358        if !self.at_end() && (self.peek() == 'e' || self.peek() == 'E') {
359            text.push(self.advance());
360            if !self.at_end() && (self.peek() == '+' || self.peek() == '-') {
361                text.push(self.advance());
362            }
363            while !self.at_end() && self.peek().is_ascii_digit() {
364                text.push(self.advance());
365            }
366        }
367
368        // Optional unit suffix (letters and special chars like degree, percent, etc.)
369        while !self.at_end() {
370            let ch = self.peek();
371            if ch.is_alphabetic() || ch == '%' || ch == '/' || ch == '\u{00b0}' {
372                text.push(self.advance());
373            } else {
374                break;
375            }
376        }
377
378        Token {
379            typ: TokenType::Number,
380            val: text,
381            line,
382            col,
383        }
384    }
385
386    fn read_ident(&mut self) -> Token {
387        let line = self.line;
388        let col = self.col;
389        let mut text = String::new();
390
391        while !self.at_end() {
392            let ch = self.peek();
393            if ch.is_alphanumeric() || ch == '_' {
394                text.push(self.advance());
395            } else {
396                break;
397            }
398        }
399
400        Token {
401            typ: TokenType::Ident,
402            val: text,
403            line,
404            col,
405        }
406    }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `source`, panicking on any lexer error.
    fn lex(source: &str) -> Vec<Token> {
        XetoLexer::new(source).tokenize().expect("tokenize failed")
    }

    /// Token types, in order, as owned values.
    fn types(tokens: &[Token]) -> Vec<TokenType> {
        tokens.iter().map(|t| t.typ.clone()).collect()
    }

    /// Values of every token of the given type, in order.
    fn vals(tokens: &[Token], typ: TokenType) -> Vec<String> {
        tokens
            .iter()
            .filter(|t| t.typ == typ)
            .map(|t| t.val.clone())
            .collect()
    }

    #[test]
    fn tokenize_identifiers() {
        let toks = lex("foo bar_baz Ahu123");
        assert_eq!(
            vals(&toks, TokenType::Ident),
            vec!["foo", "bar_baz", "Ahu123"]
        );
    }

    #[test]
    fn tokenize_strings() {
        let toks = lex(r#""hello" "world""#);
        assert_eq!(vals(&toks, TokenType::Str), vec!["hello", "world"]);
    }

    #[test]
    fn string_escape_sequences() {
        let toks = lex(r#""line\nnew\ttab\\back\"quote""#);
        assert_eq!(toks[0].typ, TokenType::Str);
        assert_eq!(toks[0].val, "line\nnew\ttab\\back\"quote");
    }

    #[test]
    fn tokenize_numbers() {
        let toks = lex("42 72.5 -10");
        assert_eq!(vals(&toks, TokenType::Number), vec!["42", "72.5", "-10"]);
    }

    #[test]
    fn token_positions() {
        // "foo" at col 1, ':' at col 5, "bar" at col 7.
        let toks = lex("foo : bar");

        assert_eq!(toks[0].typ, TokenType::Ident);
        assert_eq!((toks[0].line, toks[0].col), (1, 1));

        assert_eq!(toks[1].typ, TokenType::Colon);
        assert_eq!(toks[1].col, 5);

        assert_eq!(toks[2].typ, TokenType::Ident);
        assert_eq!(toks[2].col, 7);
    }

    #[test]
    fn comments() {
        let toks = lex("// this is a comment\nfoo");
        assert_eq!(toks[0].typ, TokenType::Comment);
        assert_eq!(toks[0].val, "this is a comment");
        assert_eq!(toks[1].typ, TokenType::Newline);
        assert_eq!(toks[2].typ, TokenType::Ident);
        assert_eq!(toks[2].val, "foo");
    }

    #[test]
    fn newlines_collapsed() {
        // foo, one collapsed newline, bar, eof.
        let toks = lex("foo\n\n\nbar");
        assert_eq!(
            types(&toks),
            vec![
                TokenType::Ident,
                TokenType::Newline,
                TokenType::Ident,
                TokenType::Eof,
            ]
        );
    }

    #[test]
    fn delimiters() {
        let toks = lex(": :: { } < > , . ? *");
        let mut typs = types(&toks);
        assert_eq!(typs.pop(), Some(TokenType::Eof));
        assert_eq!(
            typs,
            vec![
                TokenType::Colon,
                TokenType::ColonColon,
                TokenType::LBrace,
                TokenType::RBrace,
                TokenType::LAngle,
                TokenType::RAngle,
                TokenType::Comma,
                TokenType::Dot,
                TokenType::Question,
                TokenType::Star,
            ]
        );
    }

    #[test]
    fn complex_sequence() {
        let toks = lex("Ahu : Equip <abstract> {\n  discharge\n}");
        let mut typs = types(&toks);
        assert_eq!(typs.pop(), Some(TokenType::Eof));
        assert_eq!(
            typs,
            vec![
                TokenType::Ident,   // Ahu
                TokenType::Colon,   // :
                TokenType::Ident,   // Equip
                TokenType::LAngle,  // <
                TokenType::Ident,   // abstract
                TokenType::RAngle,  // >
                TokenType::LBrace,  // {
                TokenType::Newline, // \n
                TokenType::Ident,   // discharge
                TokenType::Newline, // \n
                TokenType::RBrace,  // }
            ]
        );
    }

    #[test]
    fn colon_colon_qualified_name() {
        let toks = lex("ph::Ahu");
        assert_eq!(vals(&toks, TokenType::Ident), vec!["ph", "Ahu"]);
        assert_eq!(toks[1].typ, TokenType::ColonColon);
    }

    #[test]
    fn unterminated_string_error() {
        let err = XetoLexer::new(r#""hello"#).tokenize().unwrap_err();
        assert!(err.to_string().contains("unterminated string"));
    }

    #[test]
    fn number_with_unit() {
        let toks = lex("72.5kW");
        assert_eq!(toks[0].typ, TokenType::Number);
        assert_eq!(toks[0].val, "72.5kW");
    }

    #[test]
    fn number_with_exponent() {
        let toks = lex("1e3 2.5E-4 1E+10");
        assert_eq!(
            vals(&toks, TokenType::Number),
            vec!["1e3", "2.5E-4", "1E+10"]
        );
    }

    #[test]
    fn number_exponent_without_fraction() {
        let toks = lex("1e3");
        assert_eq!(toks[0].typ, TokenType::Number);
        assert_eq!(toks[0].val, "1e3");
    }

    #[test]
    fn bare_cr_as_whitespace() {
        let toks = lex("foo\rbar");
        assert_eq!(vals(&toks, TokenType::Ident), vec!["foo", "bar"]);
    }
}