Skip to main content

lashlang/
lexer.rs

1use compact_str::CompactString;
2use serde::{Deserialize, Serialize};
3use thiserror::Error;
4
5#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
6pub struct Span {
7    pub start: usize,
8    pub end: usize,
9}
10
11#[derive(Clone, Debug, PartialEq)]
12pub struct Token {
13    pub kind: TokenKind,
14    pub span: Span,
15}
16
17#[derive(Clone, Debug, PartialEq)]
18pub enum TokenKind {
19    Ident(CompactString),
20    String(CompactString),
21    Number(f64),
22    LBrace,
23    RBrace,
24    LParen,
25    RParen,
26    LBracket,
27    RBracket,
28    Comma,
29    Colon,
30    At,
31    Question,
32    Dot,
33    Bang,
34    Equal,
35    DoubleEqual,
36    BangEqual,
37    AndAnd,
38    OrOr,
39    Pipe,
40    Less,
41    LessEqual,
42    Greater,
43    GreaterEqual,
44    Plus,
45    Minus,
46    Star,
47    Slash,
48    Percent,
49    If,
50    Else,
51    For,
52    In,
53    Await,
54    Cancel,
55    Submit,
56    Print,
57    Call,
58    And,
59    Or,
60    Not,
61    True,
62    False,
63    Null,
64    Eof,
65}
66
67#[derive(Debug, Error, PartialEq)]
68pub enum LexError {
69    #[error("unexpected `{ch}`")]
70    UnexpectedChar { ch: char, offset: usize },
71    #[error("unterminated string")]
72    UnterminatedString { offset: usize },
73    #[error("invalid number `{lexeme}`")]
74    InvalidNumber { lexeme: String, offset: usize },
75}
76
77impl LexError {
78    pub fn offset(&self) -> usize {
79        match self {
80            Self::UnexpectedChar { offset, .. }
81            | Self::UnterminatedString { offset }
82            | Self::InvalidNumber { offset, .. } => *offset,
83        }
84    }
85}
86
87pub fn lex(source: &str) -> Result<Vec<Token>, LexError> {
88    let mut lexer = Lexer {
89        source,
90        chars: source.char_indices().peekable(),
91    };
92    lexer.lex_all()
93}
94
/// Scanner state: the full input plus a peekable cursor over its characters.
struct Lexer<'src> {
    /// Complete source text; sliced by byte offset to extract lexemes.
    source: &'src str,
    /// Cursor yielding `(byte_offset, char)` pairs with one character of lookahead.
    chars: std::iter::Peekable<std::str::CharIndices<'src>>,
}
99
100impl<'a> Lexer<'a> {
101    fn lex_all(&mut self) -> Result<Vec<Token>, LexError> {
102        let mut tokens = Vec::with_capacity((self.source.len() / 4).max(8));
103        while let Some((offset, ch)) = self.peek() {
104            if ch.is_whitespace() || ch == ';' {
105                self.bump();
106                continue;
107            }
108            if ch == '#' {
109                self.skip_comment();
110                continue;
111            }
112            if ch == '/' && self.peek_second() == Some('/') {
113                self.bump();
114                self.bump();
115                self.skip_comment();
116                continue;
117            }
118
119            let token = match ch {
120                '{' => self.single(TokenKind::LBrace),
121                '}' => self.single(TokenKind::RBrace),
122                '(' => self.single(TokenKind::LParen),
123                ')' => self.single(TokenKind::RParen),
124                '[' => self.single(TokenKind::LBracket),
125                ']' => self.single(TokenKind::RBracket),
126                ',' => self.single(TokenKind::Comma),
127                ':' => self.single(TokenKind::Colon),
128                '@' => self.single(TokenKind::At),
129                '?' => self.single(TokenKind::Question),
130                '.' => self.single(TokenKind::Dot),
131                '+' => self.single(TokenKind::Plus),
132                '-' => self.single(TokenKind::Minus),
133                '*' => self.single(TokenKind::Star),
134                '/' => self.single(TokenKind::Slash),
135                '%' => self.single(TokenKind::Percent),
136                '=' => self.double_or_single('=', TokenKind::DoubleEqual, TokenKind::Equal),
137                '!' => self.double_or_single('=', TokenKind::BangEqual, TokenKind::Bang),
138                '&' => self.required_double('&', TokenKind::AndAnd)?,
139                '|' => self.double_or_single('|', TokenKind::OrOr, TokenKind::Pipe),
140                '<' => self.double_or_single('=', TokenKind::LessEqual, TokenKind::Less),
141                '>' => self.double_or_single('=', TokenKind::GreaterEqual, TokenKind::Greater),
142                '"' if self.starts_with_at(offset, "\"\"\"") => {
143                    self.triple_string(false, "\"\"\"")?
144                }
145                '"' => self.string()?,
146                'r' if self.starts_with_at(offset, "r\"\"\"") => {
147                    self.triple_string(true, "\"\"\"")?
148                }
149                'r' if self.starts_with_at(offset, "r'''") => self.triple_string(true, "'''")?,
150                c if is_ident_start(c) => self.ident_or_keyword(),
151                c if c.is_ascii_digit() => self.number()?,
152                _ => return Err(LexError::UnexpectedChar { ch, offset }),
153            };
154            tokens.push(token);
155        }
156
157        let end = self.source.len();
158        tokens.push(Token {
159            kind: TokenKind::Eof,
160            span: Span { start: end, end },
161        });
162        Ok(tokens)
163    }
164
165    fn single(&mut self, kind: TokenKind) -> Token {
166        let (start, ch) = self.bump().expect("single token requires input");
167        Token {
168            kind,
169            span: Span {
170                start,
171                end: start + ch.len_utf8(),
172            },
173        }
174    }
175
176    fn double_or_single(
177        &mut self,
178        second: char,
179        double_kind: TokenKind,
180        single_kind: TokenKind,
181    ) -> Token {
182        let (start, ch) = self.bump().expect("double token requires input");
183        let end = if self.consume_if(second) {
184            start + ch.len_utf8() + second.len_utf8()
185        } else {
186            start + ch.len_utf8()
187        };
188        Token {
189            kind: if end > start + ch.len_utf8() {
190                double_kind
191            } else {
192                single_kind
193            },
194            span: Span { start, end },
195        }
196    }
197
198    fn string(&mut self) -> Result<Token, LexError> {
199        let (start, _) = self.bump().expect("string requires quote");
200        let content_start = start + 1;
201        let mut value: Option<String> = None;
202        while let Some((offset, ch)) = self.bump() {
203            match ch {
204                '"' => {
205                    let value = match value {
206                        Some(value) => CompactString::from(value),
207                        None => CompactString::from(&self.source[content_start..offset]),
208                    };
209                    return Ok(Token {
210                        kind: TokenKind::String(value),
211                        span: Span {
212                            start,
213                            end: offset + 1,
214                        },
215                    });
216                }
217                '\\' => {
218                    let value =
219                        value.get_or_insert_with(|| self.source[content_start..offset].to_string());
220                    let Some((_, escaped)) = self.bump() else {
221                        return Err(LexError::UnterminatedString { offset: start });
222                    };
223                    let translated = match escaped {
224                        '"' => '"',
225                        '\\' => '\\',
226                        'n' => '\n',
227                        'r' => '\r',
228                        't' => '\t',
229                        other => other,
230                    };
231                    value.push(translated);
232                }
233                other => {
234                    if let Some(value) = &mut value {
235                        value.push(other);
236                    }
237                }
238            }
239        }
240        Err(LexError::UnterminatedString { offset: start })
241    }
242
243    fn triple_string(&mut self, raw: bool, delimiter: &str) -> Result<Token, LexError> {
244        let (start, _) = self.peek().expect("triple string requires input");
245        let delimiter_start = start + usize::from(raw);
246        let content_start = delimiter_start + delimiter.len();
247        self.consume_until_byte(content_start);
248
249        if raw {
250            let Some(relative_end) = self.source[content_start..].find(delimiter) else {
251                return Err(LexError::UnterminatedString { offset: start });
252            };
253            let content_end = content_start + relative_end;
254            let end = content_end + delimiter.len();
255            let value = CompactString::from(&self.source[content_start..content_end]);
256            self.consume_until_byte(end);
257            return Ok(Token {
258                kind: TokenKind::String(value),
259                span: Span { start, end },
260            });
261        }
262
263        let mut value = String::new();
264        while let Some((offset, ch)) = self.bump() {
265            if self.starts_with_at(offset, delimiter) {
266                self.consume_until_byte(offset + delimiter.len());
267                return Ok(Token {
268                    kind: TokenKind::String(value.into()),
269                    span: Span {
270                        start,
271                        end: offset + delimiter.len(),
272                    },
273                });
274            }
275            if ch == '\\' {
276                let Some((_, escaped)) = self.bump() else {
277                    return Err(LexError::UnterminatedString { offset: start });
278                };
279                let translated = match escaped {
280                    '"' => '"',
281                    '\\' => '\\',
282                    'n' => '\n',
283                    'r' => '\r',
284                    't' => '\t',
285                    other => other,
286                };
287                value.push(translated);
288            } else {
289                value.push(ch);
290            }
291        }
292
293        Err(LexError::UnterminatedString { offset: start })
294    }
295
296    fn required_double(&mut self, expected: char, kind: TokenKind) -> Result<Token, LexError> {
297        let (start, ch) = self.bump().expect("double token requires input");
298        if !self.consume_if(expected) {
299            return Err(LexError::UnexpectedChar { ch, offset: start });
300        }
301        Ok(Token {
302            kind,
303            span: Span {
304                start,
305                end: start + ch.len_utf8() + expected.len_utf8(),
306            },
307        })
308    }
309
310    fn ident_or_keyword(&mut self) -> Token {
311        let (start, _) = self.peek().expect("identifier requires input");
312        let mut end = start;
313        while let Some((offset, ch)) = self.peek() {
314            if !is_ident_continue(ch) {
315                break;
316            }
317            end = offset + ch.len_utf8();
318            self.bump();
319        }
320        let text = &self.source[start..end];
321        let kind = match text {
322            "if" => TokenKind::If,
323            "else" => TokenKind::Else,
324            "for" => TokenKind::For,
325            "in" => TokenKind::In,
326            "await" => TokenKind::Await,
327            "cancel" => TokenKind::Cancel,
328            "submit" => TokenKind::Submit,
329            "print" => TokenKind::Print,
330            "call" => TokenKind::Call,
331            "and" => TokenKind::And,
332            "or" => TokenKind::Or,
333            "not" => TokenKind::Not,
334            "true" => TokenKind::True,
335            "false" => TokenKind::False,
336            "null" => TokenKind::Null,
337            _ => TokenKind::Ident(text.into()),
338        };
339        Token {
340            kind,
341            span: Span { start, end },
342        }
343    }
344
345    fn number(&mut self) -> Result<Token, LexError> {
346        let (start, _) = self.peek().expect("number requires input");
347        let mut end = start;
348        let mut seen_dot = false;
349        while let Some((offset, ch)) = self.peek() {
350            if ch == '.' && !seen_dot {
351                seen_dot = true;
352                end = offset + 1;
353                self.bump();
354                continue;
355            }
356            if !ch.is_ascii_digit() {
357                break;
358            }
359            end = offset + ch.len_utf8();
360            self.bump();
361        }
362        let lexeme = &self.source[start..end];
363        let value = lexeme.parse::<f64>().map_err(|_| LexError::InvalidNumber {
364            lexeme: lexeme.to_string(),
365            offset: start,
366        })?;
367        Ok(Token {
368            kind: TokenKind::Number(value),
369            span: Span { start, end },
370        })
371    }
372
373    fn skip_comment(&mut self) {
374        while let Some((_, ch)) = self.bump() {
375            if ch == '\n' {
376                break;
377            }
378        }
379    }
380
381    fn bump(&mut self) -> Option<(usize, char)> {
382        self.chars.next()
383    }
384
385    fn peek(&mut self) -> Option<(usize, char)> {
386        self.chars.peek().copied()
387    }
388
389    fn peek_second(&self) -> Option<char> {
390        let mut chars = self.chars.clone();
391        chars.next()?;
392        chars.next().map(|(_, ch)| ch)
393    }
394
395    fn starts_with_at(&self, offset: usize, needle: &str) -> bool {
396        self.source[offset..].starts_with(needle)
397    }
398
399    fn consume_until_byte(&mut self, end: usize) {
400        while let Some((offset, _)) = self.peek() {
401            if offset >= end {
402                break;
403            }
404            self.bump();
405        }
406    }
407
408    fn consume_if(&mut self, expected: char) -> bool {
409        match self.peek() {
410            Some((_, ch)) if ch == expected => {
411                self.bump();
412                true
413            }
414            _ => false,
415        }
416    }
417}
418
/// Reports whether `ch` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(ch: char) -> bool {
    matches!(ch, '_' | 'a'..='z' | 'A'..='Z')
}
422
423fn is_ident_continue(ch: char) -> bool {
424    is_ident_start(ch) || ch.is_ascii_digit()
425}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source`, panicking if the lexer reports an error.
    fn lex_ok(source: &str) -> Vec<Token> {
        lex(source).expect("lexing should succeed")
    }

    /// Discards spans, keeping the token kinds in source order.
    fn kinds_of(tokens: Vec<Token>) -> Vec<TokenKind> {
        tokens.into_iter().map(|token| token.kind).collect()
    }

    /// Collects the payload of every string token, in source order.
    fn strings_of(tokens: Vec<Token>) -> Vec<CompactString> {
        tokens
            .into_iter()
            .filter_map(|token| match token.kind {
                TokenKind::String(value) => Some(value),
                _ => None,
            })
            .collect()
    }

    #[test]
    fn lexes_all_token_classes_and_comments() {
        let kinds = kinds_of(lex_ok(r#"
            # comment
            // comment
            if else for in await cancel submit print call and or not true false null start
            name _x a1 "hi\n\t\"\\\r\q" 12 3.5 { } ( ) [ ] , : @ ? . ! = == != && || | < <= > >= + - * / %
            "#));

        assert_eq!(
            kinds,
            vec![
                TokenKind::If,
                TokenKind::Else,
                TokenKind::For,
                TokenKind::In,
                TokenKind::Await,
                TokenKind::Cancel,
                TokenKind::Submit,
                TokenKind::Print,
                TokenKind::Call,
                TokenKind::And,
                TokenKind::Or,
                TokenKind::Not,
                TokenKind::True,
                TokenKind::False,
                TokenKind::Null,
                TokenKind::Ident("start".into()),
                TokenKind::Ident("name".into()),
                TokenKind::Ident("_x".into()),
                TokenKind::Ident("a1".into()),
                TokenKind::String("hi\n\t\"\\\rq".into()),
                TokenKind::Number(12.0),
                TokenKind::Number(3.5),
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::LBracket,
                TokenKind::RBracket,
                TokenKind::Comma,
                TokenKind::Colon,
                TokenKind::At,
                TokenKind::Question,
                TokenKind::Dot,
                TokenKind::Bang,
                TokenKind::Equal,
                TokenKind::DoubleEqual,
                TokenKind::BangEqual,
                TokenKind::AndAnd,
                TokenKind::OrOr,
                TokenKind::Pipe,
                TokenKind::Less,
                TokenKind::LessEqual,
                TokenKind::Greater,
                TokenKind::GreaterEqual,
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Slash,
                TokenKind::Percent,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn rejects_unexpected_characters() {
        let err = lex("`").expect_err("lexing should fail");
        assert_eq!(err, LexError::UnexpectedChar { ch: '`', offset: 0 });
    }

    #[test]
    fn lexes_multiline_and_raw_multiline_strings() {
        let strings = strings_of(lex_ok(r####"
            normal = """line1\n"quoted"
line2"""
            raw = r"""*** Begin Patch
@@
\n { untouched }
*** End Patch"""
            "####));

        assert_eq!(
            strings,
            vec![
                CompactString::from("line1\n\"quoted\"\nline2"),
                CompactString::from("*** Begin Patch\n@@\n\\n { untouched }\n*** End Patch"),
            ]
        );
    }

    #[test]
    fn lexes_raw_triple_single_quoted_strings() {
        let strings = strings_of(lex_ok(r####"
            script = r'''python3 - <<'PY'
print("""double quotes are preserved""")
\n { braces stay raw }
PY'''
            "####));

        assert_eq!(
            strings,
            vec![CompactString::from(
                "python3 - <<'PY'\nprint(\"\"\"double quotes are preserved\"\"\")\n\\n { braces stay raw }\nPY"
            )]
        );
    }

    #[test]
    fn lexes_label_annotation_text_inside_strings_as_strings() {
        let tokens = lex_ok(r####"
            regular = "@label(title: \"plain\")"
            multiline = """@label(title: "plain")
finish null"""
            raw = r'''@label(title: "plain")
@label(title: "still plain") finish null'''
            "####);

        assert!(
            tokens
                .iter()
                .all(|token| !matches!(token.kind, TokenKind::At)),
            "`@` inside strings must not lex as annotation syntax"
        );
        assert_eq!(
            strings_of(tokens),
            vec![
                CompactString::from("@label(title: \"plain\")"),
                CompactString::from("@label(title: \"plain\")\nfinish null"),
                CompactString::from(
                    "@label(title: \"plain\")\n@label(title: \"still plain\") finish null"
                ),
            ]
        );
    }

    #[test]
    fn lexes_double_slash_comments_without_breaking_division() {
        let kinds = kinds_of(lex_ok(r#"
            value = 6 / 2
            // trailing comment
            submit value
            "#));

        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident("value".into()),
                TokenKind::Equal,
                TokenKind::Number(6.0),
                TokenKind::Slash,
                TokenKind::Number(2.0),
                TokenKind::Submit,
                TokenKind::Ident("value".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn rejects_unterminated_strings() {
        // Plain, escape-at-end, triple, raw triple-double and raw triple-single forms.
        for source in ["\"abc", "\"abc\\", "\"\"\"abc", "r\"\"\"abc", "r'''abc"] {
            let err = lex(source).expect_err("lexing should fail");
            assert_eq!(err, LexError::UnterminatedString { offset: 0 });
        }
    }

    #[test]
    fn internal_number_error_path_is_covered() {
        // `lex` never routes a leading `.` to `number`, so drive the method directly.
        let source = ".";
        let mut lexer = Lexer {
            source,
            chars: source.char_indices().peekable(),
        };
        let err = lexer.number().expect_err("number parsing should fail");
        assert_eq!(
            err,
            LexError::InvalidNumber {
                lexeme: ".".to_string(),
                offset: 0
            }
        );
    }

    #[test]
    fn identifier_helpers_cover_true_and_false_cases() {
        assert!(is_ident_start('_'));
        assert!(is_ident_start('a'));
        assert!(!is_ident_start('1'));

        assert!(is_ident_continue('9'));
        assert!(is_ident_continue('_'));
        assert!(!is_ident_continue('-'));
    }
}
663}