Skip to main content

lashlang/
lexer.rs

1use compact_str::CompactString;
2use serde::{Deserialize, Serialize};
3use thiserror::Error;
4
5#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
6pub struct Span {
7    pub start: usize,
8    pub end: usize,
9}
10
11#[derive(Clone, Debug, PartialEq)]
12pub struct Token {
13    pub kind: TokenKind,
14    pub span: Span,
15}
16
17#[derive(Clone, Debug, PartialEq)]
18pub enum TokenKind {
19    Ident(CompactString),
20    String(CompactString),
21    Number(f64),
22    LBrace,
23    RBrace,
24    LParen,
25    RParen,
26    LBracket,
27    RBracket,
28    Comma,
29    Colon,
30    At,
31    Question,
32    Dot,
33    Bang,
34    Equal,
35    DoubleEqual,
36    BangEqual,
37    AndAnd,
38    OrOr,
39    Pipe,
40    Less,
41    LessEqual,
42    Greater,
43    GreaterEqual,
44    Plus,
45    Minus,
46    Star,
47    Slash,
48    Percent,
49    If,
50    Else,
51    For,
52    In,
53    Await,
54    Cancel,
55    Submit,
56    Print,
57    Call,
58    And,
59    Or,
60    Not,
61    True,
62    False,
63    Null,
64    Eof,
65}
66
67#[derive(Debug, Error, PartialEq)]
68pub enum LexError {
69    #[error("unexpected `{ch}`")]
70    UnexpectedChar { ch: char, offset: usize },
71    #[error("unterminated string")]
72    UnterminatedString { offset: usize },
73    #[error("invalid number `{lexeme}`")]
74    InvalidNumber { lexeme: String, offset: usize },
75}
76
77impl LexError {
78    pub fn offset(&self) -> usize {
79        match self {
80            Self::UnexpectedChar { offset, .. }
81            | Self::UnterminatedString { offset }
82            | Self::InvalidNumber { offset, .. } => *offset,
83        }
84    }
85}
86
87pub fn lex(source: &str) -> Result<Vec<Token>, LexError> {
88    let mut lexer = Lexer {
89        source,
90        chars: source.char_indices().peekable(),
91    };
92    lexer.lex_all()
93}
94
/// Cursor state for one lexing pass over a source string.
struct Lexer<'a> {
    /// The complete input; used for slicing lexemes and for look-ahead by
    /// byte offset.
    source: &'a str,
    /// Character cursor over `source`, yielding `(byte_offset, char)` pairs
    /// with one item of look-ahead.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
}
99
100impl<'a> Lexer<'a> {
101    fn lex_all(&mut self) -> Result<Vec<Token>, LexError> {
102        let mut tokens = Vec::with_capacity((self.source.len() / 4).max(8));
103        while let Some((offset, ch)) = self.peek() {
104            if ch.is_whitespace() || ch == ';' {
105                self.bump();
106                continue;
107            }
108            if ch == '#' {
109                self.skip_comment();
110                continue;
111            }
112            if ch == '/' && self.peek_second() == Some('/') {
113                self.bump();
114                self.bump();
115                self.skip_comment();
116                continue;
117            }
118
119            let token = match ch {
120                '{' => self.single(TokenKind::LBrace),
121                '}' => self.single(TokenKind::RBrace),
122                '(' => self.single(TokenKind::LParen),
123                ')' => self.single(TokenKind::RParen),
124                '[' => self.single(TokenKind::LBracket),
125                ']' => self.single(TokenKind::RBracket),
126                ',' => self.single(TokenKind::Comma),
127                ':' => self.single(TokenKind::Colon),
128                '@' => self.single(TokenKind::At),
129                '?' => self.single(TokenKind::Question),
130                '.' => self.single(TokenKind::Dot),
131                '+' => self.single(TokenKind::Plus),
132                '-' => self.single(TokenKind::Minus),
133                '*' => self.single(TokenKind::Star),
134                '/' => self.single(TokenKind::Slash),
135                '%' => self.single(TokenKind::Percent),
136                '=' => self.double_or_single('=', TokenKind::DoubleEqual, TokenKind::Equal),
137                '!' => self.double_or_single('=', TokenKind::BangEqual, TokenKind::Bang),
138                '&' => self.required_double('&', TokenKind::AndAnd)?,
139                '|' => self.double_or_single('|', TokenKind::OrOr, TokenKind::Pipe),
140                '<' => self.double_or_single('=', TokenKind::LessEqual, TokenKind::Less),
141                '>' => self.double_or_single('=', TokenKind::GreaterEqual, TokenKind::Greater),
142                '"' | '\'' => self.quoted_string(ch)?,
143                'r' | 'R' if self.raw_string_delimiter(offset).is_some() => self.raw_string()?,
144                c if is_ident_start(c) => self.ident_or_keyword(),
145                c if c.is_ascii_digit() => self.number()?,
146                _ => return Err(LexError::UnexpectedChar { ch, offset }),
147            };
148            tokens.push(token);
149        }
150
151        let end = self.source.len();
152        tokens.push(Token {
153            kind: TokenKind::Eof,
154            span: Span { start: end, end },
155        });
156        Ok(tokens)
157    }
158
159    fn single(&mut self, kind: TokenKind) -> Token {
160        let (start, ch) = self.bump().expect("single token requires input");
161        Token {
162            kind,
163            span: Span {
164                start,
165                end: start + ch.len_utf8(),
166            },
167        }
168    }
169
170    fn double_or_single(
171        &mut self,
172        second: char,
173        double_kind: TokenKind,
174        single_kind: TokenKind,
175    ) -> Token {
176        let (start, ch) = self.bump().expect("double token requires input");
177        let end = if self.consume_if(second) {
178            start + ch.len_utf8() + second.len_utf8()
179        } else {
180            start + ch.len_utf8()
181        };
182        Token {
183            kind: if end > start + ch.len_utf8() {
184                double_kind
185            } else {
186                single_kind
187            },
188            span: Span { start, end },
189        }
190    }
191
192    fn quoted_string(&mut self, quote: char) -> Result<Token, LexError> {
193        let (start, _) = self.peek().expect("string requires quote");
194        let delimiter = string_delimiter(quote, self.starts_with_triple_quote_at(start, quote));
195        let content_start = start + delimiter.len();
196        self.consume_until_byte(content_start);
197
198        let mut value = String::new();
199        while let Some((offset, ch)) = self.bump() {
200            if delimiter.len() == 1 {
201                if ch == quote {
202                    return Ok(Token {
203                        kind: TokenKind::String(value.into()),
204                        span: Span {
205                            start,
206                            end: offset + quote.len_utf8(),
207                        },
208                    });
209                }
210            } else if self.starts_with_at(offset, &delimiter) {
211                self.consume_until_byte(offset + delimiter.len());
212                return Ok(Token {
213                    kind: TokenKind::String(value.into()),
214                    span: Span {
215                        start,
216                        end: offset + delimiter.len(),
217                    },
218                });
219            }
220
221            if ch == '\\' {
222                let Some((_, escaped)) = self.bump() else {
223                    return Err(LexError::UnterminatedString { offset: start });
224                };
225                value.push(translate_escape(escaped, quote));
226            } else {
227                value.push(ch);
228            }
229        }
230        Err(LexError::UnterminatedString { offset: start })
231    }
232
233    fn raw_string_delimiter(&self, offset: usize) -> Option<String> {
234        let rest = self.source.get(offset..)?;
235        let mut chars = rest.chars();
236        match chars.next()? {
237            'r' | 'R' => {}
238            _ => return None,
239        }
240        let quote = chars.next()?;
241        if quote != '"' && quote != '\'' {
242            return None;
243        }
244        let after_prefix = offset + 1;
245        Some(string_delimiter(
246            quote,
247            self.starts_with_triple_quote_at(after_prefix, quote),
248        ))
249    }
250
251    fn raw_string(&mut self) -> Result<Token, LexError> {
252        let (start, _) = self.peek().expect("raw string requires input");
253        let delimiter = self
254            .raw_string_delimiter(start)
255            .expect("raw string branch requires valid opener");
256        let content_start = start + 1 + delimiter.len();
257
258        let Some(relative_end) = self.source[content_start..].find(&delimiter) else {
259            return Err(LexError::UnterminatedString { offset: start });
260        };
261        let content_end = content_start + relative_end;
262        let end = content_end + delimiter.len();
263        let value = CompactString::from(&self.source[content_start..content_end]);
264        self.consume_until_byte(end);
265        Ok(Token {
266            kind: TokenKind::String(value),
267            span: Span { start, end },
268        })
269    }
270
271    fn required_double(&mut self, expected: char, kind: TokenKind) -> Result<Token, LexError> {
272        let (start, ch) = self.bump().expect("double token requires input");
273        if !self.consume_if(expected) {
274            return Err(LexError::UnexpectedChar { ch, offset: start });
275        }
276        Ok(Token {
277            kind,
278            span: Span {
279                start,
280                end: start + ch.len_utf8() + expected.len_utf8(),
281            },
282        })
283    }
284
285    fn ident_or_keyword(&mut self) -> Token {
286        let (start, _) = self.peek().expect("identifier requires input");
287        let mut end = start;
288        while let Some((offset, ch)) = self.peek() {
289            if !is_ident_continue(ch) {
290                break;
291            }
292            end = offset + ch.len_utf8();
293            self.bump();
294        }
295        let text = &self.source[start..end];
296        let kind = match text {
297            "if" => TokenKind::If,
298            "else" => TokenKind::Else,
299            "for" => TokenKind::For,
300            "in" => TokenKind::In,
301            "await" => TokenKind::Await,
302            "cancel" => TokenKind::Cancel,
303            "submit" => TokenKind::Submit,
304            "print" => TokenKind::Print,
305            "call" => TokenKind::Call,
306            "and" => TokenKind::And,
307            "or" => TokenKind::Or,
308            "not" => TokenKind::Not,
309            "true" => TokenKind::True,
310            "false" => TokenKind::False,
311            "null" => TokenKind::Null,
312            _ => TokenKind::Ident(text.into()),
313        };
314        Token {
315            kind,
316            span: Span { start, end },
317        }
318    }
319
320    fn number(&mut self) -> Result<Token, LexError> {
321        let (start, _) = self.peek().expect("number requires input");
322        let mut end = start;
323        let mut seen_dot = false;
324        while let Some((offset, ch)) = self.peek() {
325            if ch == '.' && !seen_dot {
326                seen_dot = true;
327                end = offset + 1;
328                self.bump();
329                continue;
330            }
331            if !ch.is_ascii_digit() {
332                break;
333            }
334            end = offset + ch.len_utf8();
335            self.bump();
336        }
337        let lexeme = &self.source[start..end];
338        let value = lexeme.parse::<f64>().map_err(|_| LexError::InvalidNumber {
339            lexeme: lexeme.to_string(),
340            offset: start,
341        })?;
342        Ok(Token {
343            kind: TokenKind::Number(value),
344            span: Span { start, end },
345        })
346    }
347
348    fn skip_comment(&mut self) {
349        while let Some((_, ch)) = self.bump() {
350            if ch == '\n' {
351                break;
352            }
353        }
354    }
355
356    fn bump(&mut self) -> Option<(usize, char)> {
357        self.chars.next()
358    }
359
360    fn peek(&mut self) -> Option<(usize, char)> {
361        self.chars.peek().copied()
362    }
363
364    fn peek_second(&self) -> Option<char> {
365        let mut chars = self.chars.clone();
366        chars.next()?;
367        chars.next().map(|(_, ch)| ch)
368    }
369
370    fn starts_with_at(&self, offset: usize, needle: &str) -> bool {
371        self.source[offset..].starts_with(needle)
372    }
373
374    fn starts_with_triple_quote_at(&self, offset: usize, quote: char) -> bool {
375        let mut delimiter = String::with_capacity(3);
376        delimiter.push(quote);
377        delimiter.push(quote);
378        delimiter.push(quote);
379        self.starts_with_at(offset, &delimiter)
380    }
381
382    fn consume_until_byte(&mut self, end: usize) {
383        while let Some((offset, _)) = self.peek() {
384            if offset >= end {
385                break;
386            }
387            self.bump();
388        }
389    }
390
391    fn consume_if(&mut self, expected: char) -> bool {
392        match self.peek() {
393            Some((_, ch)) if ch == expected => {
394                self.bump();
395                true
396            }
397            _ => false,
398        }
399    }
400}
401
/// Returns `true` when `ch` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(ch: char) -> bool {
    matches!(ch, '_' | 'a'..='z' | 'A'..='Z')
}
405
406fn is_ident_continue(ch: char) -> bool {
407    is_ident_start(ch) || ch.is_ascii_digit()
408}
409
/// Builds the delimiter text for a string literal: `quote` repeated three
/// times when `triple` is set, otherwise once.
fn string_delimiter(quote: char, triple: bool) -> String {
    let repeats = if triple { 3 } else { 1 };
    quote.to_string().repeat(repeats)
}
414
/// Maps the character following a backslash to the character it stands for.
///
/// `n`, `r` and `t` become newline, carriage return and tab. Every other
/// character, including `\\`, the active `quote` and unrecognised escapes,
/// stands for itself.
fn translate_escape(escaped: char, quote: char) -> char {
    let control = match escaped {
        'n' => Some('\n'),
        'r' => Some('\r'),
        't' => Some('\t'),
        _ => None,
    };
    match control {
        Some(ch) => ch,
        // An escaped delimiter yields the delimiter itself.
        None if escaped == quote => quote,
        // Backslash and unknown escapes are taken literally.
        None => escaped,
    }
}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// Drops the spans and keeps only the kind of every token.
    fn kinds_of(tokens: Vec<Token>) -> Vec<TokenKind> {
        tokens.into_iter().map(|token| token.kind).collect()
    }

    /// Collects the payload of every string token, in source order.
    fn strings_of(tokens: Vec<Token>) -> Vec<CompactString> {
        tokens
            .into_iter()
            .filter_map(|token| match token.kind {
                TokenKind::String(value) => Some(value),
                _ => None,
            })
            .collect()
    }

    /// Converts expected string literals into the lexer's string payload type.
    fn expected_strings<const N: usize>(values: [&str; N]) -> Vec<CompactString> {
        values.into_iter().map(CompactString::from).collect()
    }

    /// One input exercising every keyword, literal form, operator and both
    /// comment styles.
    #[test]
    fn lexes_all_token_classes_and_comments() {
        let tokens = lex(r#"
            # comment
            // comment
            if else for in await cancel submit print call and or not true false null start
            name _x a1 "hi\n\t\"\\\r\q" 12 3.5 { } ( ) [ ] , : @ ? . ! = == != && || | < <= > >= + - * / %
            "#)
        .expect("lexing should succeed");

        let expected = vec![
            TokenKind::If,
            TokenKind::Else,
            TokenKind::For,
            TokenKind::In,
            TokenKind::Await,
            TokenKind::Cancel,
            TokenKind::Submit,
            TokenKind::Print,
            TokenKind::Call,
            TokenKind::And,
            TokenKind::Or,
            TokenKind::Not,
            TokenKind::True,
            TokenKind::False,
            TokenKind::Null,
            TokenKind::Ident("start".into()),
            TokenKind::Ident("name".into()),
            TokenKind::Ident("_x".into()),
            TokenKind::Ident("a1".into()),
            TokenKind::String("hi\n\t\"\\\rq".into()),
            TokenKind::Number(12.0),
            TokenKind::Number(3.5),
            TokenKind::LBrace,
            TokenKind::RBrace,
            TokenKind::LParen,
            TokenKind::RParen,
            TokenKind::LBracket,
            TokenKind::RBracket,
            TokenKind::Comma,
            TokenKind::Colon,
            TokenKind::At,
            TokenKind::Question,
            TokenKind::Dot,
            TokenKind::Bang,
            TokenKind::Equal,
            TokenKind::DoubleEqual,
            TokenKind::BangEqual,
            TokenKind::AndAnd,
            TokenKind::OrOr,
            TokenKind::Pipe,
            TokenKind::Less,
            TokenKind::LessEqual,
            TokenKind::Greater,
            TokenKind::GreaterEqual,
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Eof,
        ];
        assert_eq!(kinds_of(tokens), expected);
    }

    /// A character that starts no token is reported with its offset.
    #[test]
    fn rejects_unexpected_characters() {
        let err = lex("`").expect_err("lexing should fail");
        assert_eq!(err, LexError::UnexpectedChar { ch: '`', offset: 0 });
    }

    /// Single-, triple- and raw-quoted literals in both quote styles.
    #[test]
    fn lexes_python_shaped_string_literals() {
        let tokens = lex(r####"
            double = "hi\n\t\"\\\r\q"
            single = 'it\'s ok\n'
            triple_double = """line1\n"quoted"
line2"""
            triple_single = '''line1\n'quoted'
line2'''
            raw_double = r"*** Begin Patch
@@
\n { untouched }
*** End Patch"
            raw_single = r'path\to\file'
            "####)
        .expect("lexing should succeed");

        let expected = expected_strings([
            "hi\n\t\"\\\rq",
            "it's ok\n",
            "line1\n\"quoted\"\nline2",
            "line1\n'quoted'\nline2",
            "*** Begin Patch\n@@\n\\n { untouched }\n*** End Patch",
            "path\\to\\file",
        ]);
        assert_eq!(strings_of(tokens), expected);
    }

    /// Text that looks like operators, comments or annotations stays inside
    /// the string token that contains it.
    #[test]
    fn lexes_shell_and_formatter_shaped_string_literals() {
        let tokens = lex(r####"
            date = "date '+%Y-%m-%d %H:%M:%S %Z (%z)'"
            printf = 'printf "%s\\n" "$value"'
            json = "{\"cmd\":\"echo 'ok'\"}"
            shell = "${HOME:-/tmp} && echo %done"
            comment_text = "// not a comment # also not a comment"
            label_text = "@label(title: \"plain\")"
            "####)
        .expect("lexing should succeed");

        let expected = expected_strings([
            "date '+%Y-%m-%d %H:%M:%S %Z (%z)'",
            "printf \"%s\\n\" \"$value\"",
            "{\"cmd\":\"echo 'ok'\"}",
            "${HOME:-/tmp} && echo %done",
            "// not a comment # also not a comment",
            "@label(title: \"plain\")",
        ]);
        assert_eq!(strings_of(tokens), expected);
    }

    /// Raw triple-quoted literals keep quotes and backslashes verbatim.
    #[test]
    fn lexes_raw_triple_strings() {
        let tokens = lex(r#####"
            script = r'''python3 - <<'PY'
print("""double quotes are preserved""")
\n { braces stay raw }
PY'''
            markdown = R"""This body can mention " and ' without escaping.
It ends only at three double quotes."""
            "#####)
        .expect("lexing should succeed");

        let expected = expected_strings([
            "python3 - <<'PY'\nprint(\"\"\"double quotes are preserved\"\"\")\n\\n { braces stay raw }\nPY",
            "This body can mention \" and ' without escaping.\nIt ends only at three double quotes.",
        ]);
        assert_eq!(strings_of(tokens), expected);
    }

    /// `@` inside any string form must never surface as an `At` token.
    #[test]
    fn lexes_label_annotation_text_inside_strings_as_strings() {
        let tokens = lex(r####"
            regular = "@label(title: \"plain\")"
            multiline = """@label(title: "plain")
finish null"""
            raw = r"""@label(title: "plain")
@label(title: "still plain") finish null"""
            "####)
        .expect("lexing should succeed");

        let has_at_token = tokens
            .iter()
            .any(|token| matches!(token.kind, TokenKind::At));
        assert!(
            !has_at_token,
            "`@` inside strings must not lex as annotation syntax"
        );

        let expected = expected_strings([
            "@label(title: \"plain\")",
            "@label(title: \"plain\")\nfinish null",
            "@label(title: \"plain\")\n@label(title: \"still plain\") finish null",
        ]);
        assert_eq!(strings_of(tokens), expected);
    }

    /// A single `/` is division; `//` starts a comment.
    #[test]
    fn lexes_double_slash_comments_without_breaking_division() {
        let tokens = lex(r#"
            value = 6 / 2
            // trailing comment
            submit value
            "#)
        .expect("lexing should succeed");

        let expected = vec![
            TokenKind::Ident("value".into()),
            TokenKind::Equal,
            TokenKind::Number(6.0),
            TokenKind::Slash,
            TokenKind::Number(2.0),
            TokenKind::Submit,
            TokenKind::Ident("value".into()),
            TokenKind::Eof,
        ];
        assert_eq!(kinds_of(tokens), expected);
    }

    /// Every literal form reports the offset of its opening character when
    /// the closing delimiter is missing.
    #[test]
    fn rejects_unterminated_strings() {
        let unterminated = [
            "\"abc",
            "\"abc\\",
            "\"\"\"abc",
            "'abc",
            "'''abc",
            "r\"abc",
            "r'''abc",
        ];
        for source in unterminated {
            let err = lex(source).expect_err("lexing should fail");
            assert_eq!(err, LexError::UnterminatedString { offset: 0 });
        }
    }

    /// `r#"…"#` is not a raw string here: `r` is an identifier and `#`
    /// starts a comment.
    #[test]
    fn rust_style_raw_strings_are_not_recognized() {
        let tokens = lex("submit r#\"abc\"#").expect("lexing treats hash as comment");
        let expected = vec![
            TokenKind::Submit,
            TokenKind::Ident("r".into()),
            TokenKind::Eof,
        ];
        assert_eq!(kinds_of(tokens), expected);
    }

    /// `number` is normally entered on a digit; calling it directly on `.`
    /// reaches the parse-failure branch.
    #[test]
    fn internal_number_error_path_is_covered() {
        let source = ".";
        let mut lexer = Lexer {
            source,
            chars: source.char_indices().peekable(),
        };
        let err = lexer.number().expect_err("number parsing should fail");
        let expected = LexError::InvalidNumber {
            lexeme: ".".to_string(),
            offset: 0,
        };
        assert_eq!(err, expected);
    }

    /// Accepting and rejecting cases for both identifier predicates.
    #[test]
    fn identifier_helpers_cover_true_and_false_cases() {
        assert!(is_ident_start('_'));
        assert!(is_ident_start('a'));
        assert!(!is_ident_start('1'));

        assert!(is_ident_continue('9'));
        assert!(is_ident_continue('_'));
        assert!(!is_ident_continue('-'));
    }
}