//! `snc_core/lexer.rs`
//!
//! Hand-written, byte-oriented lexer that turns source text into a vector of
//! position-tagged tokens.

1use crate::error::{CompileError, CompileResult, Span};
2
/// A single lexical token.
///
/// Variants whose natural name collides with another variant or reads like a
/// Rust item carry a trailing underscore (`Option_`, `String_`, `Assign_`).
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords ----
    /// `program`
    Program,
    /// `ss`
    Ss,
    /// `state`
    State,
    /// `when`
    When,
    /// `entry`
    Entry,
    /// `exit`
    Exit,
    /// `option`
    Option_,
    /// `assign` keyword (the `=` operator is [`Token::Assign_`]).
    Assign,
    /// `to`
    To,
    /// `monitor`
    Monitor,
    /// `sync`
    Sync,
    /// `evflag`
    EvFlag,
    /// `if`
    If,
    /// `else`
    Else,
    /// `while`
    While,
    /// `for`
    For,
    /// `break`
    Break,
    /// `return`
    Return,

    // ---- Type names ----
    /// `int`
    Int,
    /// `short`
    Short,
    /// `long`
    Long,
    /// `float`
    Float,
    /// `double`
    Double,
    /// `string`
    String_,
    /// `char`
    Char,
    /// `unsigned`
    Unsigned,

    // ---- Literals ----
    /// Integer literal; the lexer in this file also uses it for character
    /// literals and for `TRUE`/`true` (1) and `FALSE`/`false` (0).
    IntLit(i64),
    /// Floating-point literal.
    FloatLit(f64),
    /// String literal with its escape sequences already processed.
    StringLit(String),

    // ---- Identifiers ----
    /// Any identifier that is not a keyword or type name.
    Ident(String),

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `->`
    Arrow,

    // ---- Operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `==`
    Eq,
    /// `!=`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `&&`
    And,
    /// `||`
    Or,
    /// `!`
    Not,
    /// `&`
    BitAnd,
    /// `|`
    BitOr,
    /// `^`
    BitXor,
    /// `~`
    BitNot,
    /// `<<`
    Shl,
    /// `>>`
    Shr,
    /// `=` (the `assign` keyword is [`Token::Assign`]).
    Assign_,
    /// `+=`
    PlusEq,
    /// `-=`
    MinusEq,
    /// `*=`
    StarEq,
    /// `/=`
    SlashEq,
    /// `++`
    PlusPlus,
    /// `--`
    MinusMinus,
    /// `?`
    Question,
    /// `:`
    Colon,

    // ---- Special ----
    /// `#`. The lexer in this file skips `#` lines instead of emitting this.
    Hash,
    /// `%%`. The lexer in this file emits [`Token::EmbeddedLine`] instead.
    DoublePercent,
    /// Verbatim embedded code: the rest of a `%%` line, or the text between
    /// `%{` and `}%`.
    EmbeddedLine(String),
    /// End of input; always the last token produced.
    Eof,
}
86
87#[derive(Debug, Clone)]
88pub struct SpannedToken {
89    pub token: Token,
90    pub span: Span,
91}
92
/// Cursor over the raw bytes of a source string, tracking the position used
/// to tag tokens and errors.
pub struct Lexer<'a> {
    /// Bytes of the source text being scanned.
    input: &'a [u8],
    /// Byte offset of the next unread byte.
    pos: usize,
    /// Line of the next unread byte; starts at 1.
    line: usize,
    /// Column of the next unread byte; starts at 1 and advances per byte.
    col: usize,
}
99
100impl<'a> Lexer<'a> {
101    pub fn new(input: &'a str) -> Self {
102        Self {
103            input: input.as_bytes(),
104            pos: 0,
105            line: 1,
106            col: 1,
107        }
108    }
109
110    fn span(&self) -> Span {
111        Span {
112            offset: self.pos,
113            line: self.line,
114            column: self.col,
115        }
116    }
117
118    fn peek(&self) -> Option<u8> {
119        self.input.get(self.pos).copied()
120    }
121
122    fn peek2(&self) -> Option<u8> {
123        self.input.get(self.pos + 1).copied()
124    }
125
126    fn advance(&mut self) -> Option<u8> {
127        let ch = self.input.get(self.pos).copied()?;
128        self.pos += 1;
129        if ch == b'\n' {
130            self.line += 1;
131            self.col = 1;
132        } else {
133            self.col += 1;
134        }
135        Some(ch)
136    }
137
138    fn skip_whitespace_and_comments(&mut self) {
139        loop {
140            // Skip whitespace
141            while self.peek().map_or(false, |c| c.is_ascii_whitespace()) {
142                self.advance();
143            }
144
145            // Skip // comments
146            if self.peek() == Some(b'/') && self.peek2() == Some(b'/') {
147                while self.peek().map_or(false, |c| c != b'\n') {
148                    self.advance();
149                }
150                continue;
151            }
152
153            // Skip /* */ comments
154            if self.peek() == Some(b'/') && self.peek2() == Some(b'*') {
155                self.advance();
156                self.advance();
157                let mut depth = 1;
158                while depth > 0 {
159                    match self.advance() {
160                        Some(b'*') if self.peek() == Some(b'/') => {
161                            self.advance();
162                            depth -= 1;
163                        }
164                        Some(b'/') if self.peek() == Some(b'*') => {
165                            self.advance();
166                            depth += 1;
167                        }
168                        None => break,
169                        _ => {}
170                    }
171                }
172                continue;
173            }
174
175            break;
176        }
177    }
178
179    fn read_string(&mut self) -> CompileResult<String> {
180        let span = self.span();
181        self.advance(); // consume opening "
182        let mut s = String::new();
183        loop {
184            match self.advance() {
185                Some(b'"') => return Ok(s),
186                Some(b'\\') => match self.advance() {
187                    Some(b'n') => s.push('\n'),
188                    Some(b't') => s.push('\t'),
189                    Some(b'\\') => s.push('\\'),
190                    Some(b'"') => s.push('"'),
191                    Some(b'0') => s.push('\0'),
192                    Some(c) => {
193                        s.push('\\');
194                        s.push(c as char);
195                    }
196                    None => return Err(CompileError::syntax(span, "unterminated string")),
197                },
198                Some(c) => s.push(c as char),
199                None => return Err(CompileError::syntax(span, "unterminated string")),
200            }
201        }
202    }
203
204    fn read_char_literal(&mut self) -> CompileResult<i64> {
205        let span = self.span();
206        self.advance(); // consume opening '
207        let ch = match self.advance() {
208            Some(b'\\') => match self.advance() {
209                Some(b'n') => b'\n',
210                Some(b't') => b'\t',
211                Some(b'\\') => b'\\',
212                Some(b'\'') => b'\'',
213                Some(b'0') => b'\0',
214                Some(b'r') => b'\r',
215                Some(b'a') => 7, // bell
216                Some(b'b') => 8, // backspace
217                Some(c) => c,
218                None => return Err(CompileError::syntax(span, "unterminated char literal")),
219            },
220            Some(c) => c,
221            None => return Err(CompileError::syntax(span, "unterminated char literal")),
222        };
223        match self.advance() {
224            Some(b'\'') => Ok(ch as i64),
225            _ => Err(CompileError::syntax(span, "unterminated char literal")),
226        }
227    }
228
229    fn read_number(&mut self) -> Token {
230        let start = self.pos;
231        let mut is_float = false;
232
233        // Handle 0x prefix
234        if self.peek() == Some(b'0')
235            && self.input.get(self.pos + 1).map_or(false, |&c| c == b'x' || c == b'X')
236        {
237            self.advance();
238            self.advance();
239            while self.peek().map_or(false, |c| c.is_ascii_hexdigit()) {
240                self.advance();
241            }
242            let s = std::str::from_utf8(&self.input[start..self.pos]).unwrap();
243            return Token::IntLit(i64::from_str_radix(&s[2..], 16).unwrap_or(0));
244        }
245
246        while self.peek().map_or(false, |c| c.is_ascii_digit()) {
247            self.advance();
248        }
249
250        if self.peek() == Some(b'.') && self.peek2().map_or(false, |c| c.is_ascii_digit()) {
251            is_float = true;
252            self.advance();
253            while self.peek().map_or(false, |c| c.is_ascii_digit()) {
254                self.advance();
255            }
256        }
257
258        // Scientific notation
259        if self.peek().map_or(false, |c| c == b'e' || c == b'E') {
260            is_float = true;
261            self.advance();
262            if self.peek().map_or(false, |c| c == b'+' || c == b'-') {
263                self.advance();
264            }
265            while self.peek().map_or(false, |c| c.is_ascii_digit()) {
266                self.advance();
267            }
268        }
269
270        // Suffix
271        if self.peek().map_or(false, |c| c == b'f' || c == b'F') {
272            is_float = true;
273            self.advance();
274        }
275
276        let s = std::str::from_utf8(&self.input[start..self.pos]).unwrap();
277        let s = s.trim_end_matches(|c: char| c == 'f' || c == 'F');
278
279        if is_float {
280            Token::FloatLit(s.parse().unwrap_or(0.0))
281        } else {
282            Token::IntLit(s.parse().unwrap_or(0))
283        }
284    }
285
286    fn read_ident(&mut self) -> String {
287        let start = self.pos;
288        while self
289            .peek()
290            .map_or(false, |c| c.is_ascii_alphanumeric() || c == b'_')
291        {
292            self.advance();
293        }
294        std::str::from_utf8(&self.input[start..self.pos])
295            .unwrap()
296            .to_string()
297    }
298
299    pub fn tokenize(&mut self) -> CompileResult<Vec<SpannedToken>> {
300        let mut tokens = Vec::new();
301
302        loop {
303            self.skip_whitespace_and_comments();
304
305            let span = self.span();
306
307            let Some(ch) = self.peek() else {
308                tokens.push(SpannedToken {
309                    token: Token::Eof,
310                    span,
311                });
312                break;
313            };
314
315            let token = match ch {
316                b'(' => { self.advance(); Token::LParen }
317                b')' => { self.advance(); Token::RParen }
318                b'{' => { self.advance(); Token::LBrace }
319                b'}' => { self.advance(); Token::RBrace }
320                b'[' => { self.advance(); Token::LBracket }
321                b']' => { self.advance(); Token::RBracket }
322                b';' => { self.advance(); Token::Semi }
323                b',' => { self.advance(); Token::Comma }
324                b'.' => { self.advance(); Token::Dot }
325                b'~' => { self.advance(); Token::BitNot }
326                b'?' => { self.advance(); Token::Question }
327                b':' => { self.advance(); Token::Colon }
328                b'"' => Token::StringLit(self.read_string()?),
329                b'\'' => Token::IntLit(self.read_char_literal()?),
330                b'#' => {
331                    self.advance();
332                    // Skip preprocessor lines
333                    let start = self.pos;
334                    while self.peek().map_or(false, |c| c != b'\n') {
335                        self.advance();
336                    }
337                    let _line = std::str::from_utf8(&self.input[start..self.pos]).unwrap();
338                    continue; // skip preprocessor directives
339                }
340                b'%' if self.peek2() == Some(b'%') => {
341                    self.advance();
342                    self.advance();
343                    // Read rest of line as embedded code
344                    let start = self.pos;
345                    while self.peek().map_or(false, |c| c != b'\n') {
346                        self.advance();
347                    }
348                    let code = std::str::from_utf8(&self.input[start..self.pos]).unwrap().to_string();
349                    tokens.push(SpannedToken {
350                        token: Token::EmbeddedLine(code),
351                        span,
352                    });
353                    continue;
354                }
355                b'+' => {
356                    self.advance();
357                    match self.peek() {
358                        Some(b'+') => { self.advance(); Token::PlusPlus }
359                        Some(b'=') => { self.advance(); Token::PlusEq }
360                        _ => Token::Plus,
361                    }
362                }
363                b'-' => {
364                    self.advance();
365                    match self.peek() {
366                        Some(b'-') => { self.advance(); Token::MinusMinus }
367                        Some(b'=') => { self.advance(); Token::MinusEq }
368                        Some(b'>') => { self.advance(); Token::Arrow }
369                        _ => Token::Minus,
370                    }
371                }
372                b'*' => {
373                    self.advance();
374                    if self.peek() == Some(b'=') { self.advance(); Token::StarEq }
375                    else { Token::Star }
376                }
377                b'/' => {
378                    self.advance();
379                    if self.peek() == Some(b'=') { self.advance(); Token::SlashEq }
380                    else { Token::Slash }
381                }
382                b'%' if self.peek2() == Some(b'{') => {
383                    self.advance();
384                    self.advance();
385                    // Read until }%
386                    let start = self.pos;
387                    loop {
388                        match self.peek() {
389                            Some(b'}') if self.input.get(self.pos + 1) == Some(&b'%') => {
390                                let code = std::str::from_utf8(&self.input[start..self.pos]).unwrap().to_string();
391                                self.advance();
392                                self.advance();
393                                tokens.push(SpannedToken {
394                                    token: Token::EmbeddedLine(code),
395                                    span,
396                                });
397                                break;
398                            }
399                            Some(_) => { self.advance(); }
400                            None => {
401                                return Err(CompileError::syntax(span, "unterminated %{ }% block"));
402                            }
403                        }
404                    }
405                    continue;
406                }
407                b'%' => {
408                    self.advance();
409                    Token::Percent
410                }
411                b'=' => {
412                    self.advance();
413                    if self.peek() == Some(b'=') { self.advance(); Token::Eq }
414                    else { Token::Assign_ }
415                }
416                b'!' => {
417                    self.advance();
418                    if self.peek() == Some(b'=') { self.advance(); Token::Ne }
419                    else { Token::Not }
420                }
421                b'<' => {
422                    self.advance();
423                    match self.peek() {
424                        Some(b'=') => { self.advance(); Token::Le }
425                        Some(b'<') => { self.advance(); Token::Shl }
426                        _ => Token::Lt,
427                    }
428                }
429                b'>' => {
430                    self.advance();
431                    match self.peek() {
432                        Some(b'=') => { self.advance(); Token::Ge }
433                        Some(b'>') => { self.advance(); Token::Shr }
434                        _ => Token::Gt,
435                    }
436                }
437                b'&' => {
438                    self.advance();
439                    if self.peek() == Some(b'&') { self.advance(); Token::And }
440                    else { Token::BitAnd }
441                }
442                b'|' => {
443                    self.advance();
444                    if self.peek() == Some(b'|') { self.advance(); Token::Or }
445                    else { Token::BitOr }
446                }
447                b'^' => { self.advance(); Token::BitXor }
448                c if c.is_ascii_digit() => self.read_number(),
449                c if c.is_ascii_alphabetic() || c == b'_' => {
450                    let ident = self.read_ident();
451                    match ident.as_str() {
452                        "program" => Token::Program,
453                        "ss" => Token::Ss,
454                        "state" => Token::State,
455                        "when" => Token::When,
456                        "entry" => Token::Entry,
457                        "exit" => Token::Exit,
458                        "option" => Token::Option_,
459                        "assign" => Token::Assign,
460                        "to" => Token::To,
461                        "monitor" => Token::Monitor,
462                        "sync" => Token::Sync,
463                        "evflag" => Token::EvFlag,
464                        "if" => Token::If,
465                        "else" => Token::Else,
466                        "while" => Token::While,
467                        "for" => Token::For,
468                        "break" => Token::Break,
469                        "return" => Token::Return,
470                        "int" => Token::Int,
471                        "short" => Token::Short,
472                        "long" => Token::Long,
473                        "float" => Token::Float,
474                        "double" => Token::Double,
475                        "string" => Token::String_,
476                        "char" => Token::Char,
477                        "unsigned" => Token::Unsigned,
478                        "TRUE" | "true" => Token::IntLit(1),
479                        "FALSE" | "false" => Token::IntLit(0),
480                        _ => Token::Ident(ident),
481                    }
482                }
483                _ => {
484                    return Err(CompileError::syntax(
485                        span,
486                        format!("unexpected character: '{}'", ch as char),
487                    ));
488                }
489            };
490
491            tokens.push(SpannedToken { token, span });
492        }
493
494        Ok(tokens)
495    }
496}
497
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` and drops the spans; panics if lexing fails.
    fn lex(input: &str) -> Vec<Token> {
        let spanned = Lexer::new(input).tokenize().unwrap();
        spanned.into_iter().map(|entry| entry.token).collect()
    }

    #[test]
    fn test_keywords() {
        let expected = [
            Token::Program,
            Token::Ss,
            Token::State,
            Token::When,
            Token::Entry,
            Token::Exit,
            Token::Eof,
        ];
        assert_eq!(lex("program ss state when entry exit"), expected);
    }

    #[test]
    fn test_operators() {
        let expected = [
            Token::Plus,
            Token::Minus,
            Token::Star,
            Token::Slash,
            Token::Eq,
            Token::Ne,
            Token::Le,
            Token::Ge,
            Token::And,
            Token::Or,
            Token::PlusPlus,
            Token::MinusMinus,
            Token::PlusEq,
            Token::MinusEq,
            Token::Eof,
        ];
        assert_eq!(lex("+ - * / == != <= >= && || ++ -- += -="), expected);
    }

    #[test]
    fn test_numbers() {
        let expected = [
            Token::IntLit(42),
            Token::FloatLit(3.14),
            Token::IntLit(255),
            Token::FloatLit(1e5),
            Token::Eof,
        ];
        assert_eq!(lex("42 3.14 0xFF 1e5"), expected);
    }

    #[test]
    fn test_string() {
        let expected = [Token::StringLit("hello\nworld".to_string()), Token::Eof];
        assert_eq!(lex(r#""hello\nworld""#), expected);
    }

    #[test]
    fn test_comment_skip() {
        // Both comment forms disappear; only the identifiers remain.
        let expected = [
            Token::Ident("a".into()),
            Token::Ident("b".into()),
            Token::Ident("c".into()),
            Token::Eof,
        ];
        assert_eq!(lex("a /* comment */ b // line\nc"), expected);
    }

    #[test]
    fn test_simple_program() {
        let source = r#"
            program test
            option +s;
            double x;
            assign x to "PV:x";
            monitor x;
        "#;
        let tokens = lex(source);
        // Only the head of the token stream is pinned down here.
        assert_eq!(tokens[0], Token::Program);
        assert_eq!(tokens[1], Token::Ident("test".into()));
        assert_eq!(tokens[2], Token::Option_);
    }

    #[test]
    fn test_preprocessor_skipped() {
        let expected = [
            Token::Int,
            Token::Ident("x".into()),
            Token::Semi,
            Token::Eof,
        ];
        assert_eq!(lex("#include \"foo.h\"\nint x;"), expected);
    }

    #[test]
    fn test_char_literal() {
        // Character literals come back as their integer byte values.
        let expected = [
            Token::IntLit(65),
            Token::IntLit(10),
            Token::IntLit(0),
            Token::Eof,
        ];
        assert_eq!(lex("'A' '\\n' '\\0'"), expected);
    }

    #[test]
    fn test_embedded_line() {
        let tokens = lex("%% use std::io;\nint x;");
        // EmbeddedLine, Int, Ident, Semi, Eof
        assert_eq!(tokens.len(), 5);
        let first_is_embedded =
            matches!(&tokens[0], Token::EmbeddedLine(s) if s.contains("use std::io"));
        assert!(first_is_embedded);
    }

    #[test]
    fn test_embedded_block() {
        let tokens = lex("%{ some code }%\nint x;");
        // EmbeddedLine, Int, Ident, Semi, Eof
        assert_eq!(tokens.len(), 5);
        let first_is_embedded =
            matches!(&tokens[0], Token::EmbeddedLine(s) if s.contains("some code"));
        assert!(first_is_embedded);
    }

    #[test]
    fn test_true_false() {
        // Boolean words are lexed as the integers 1 and 0 in either case.
        let expected = [
            Token::IntLit(1),
            Token::IntLit(0),
            Token::IntLit(1),
            Token::IntLit(0),
            Token::Eof,
        ];
        assert_eq!(lex("TRUE FALSE true false"), expected);
    }
}