
maya_mel/lexer.rs

#![forbid(unsafe_code)]
//! MEL lexer entry points.
//!
//! Most users do not need to call the lexer directly. Prefer [`crate::parse_source`]
//! unless you specifically need token streams or lexical diagnostics.
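//!
//! A minimal usage sketch (marked `ignore`: the `maya_mel::lexer` import path
//! is an assumption based on this file's location, not confirmed here):
//!
//! ```ignore
//! let lexed = maya_mel::lexer::lex("$foo = 1;");
//! assert!(lexed.diagnostics.is_empty());
//! // Dollar, Ident, Whitespace, Assign, Whitespace, IntLiteral, Semi, Eof.
//! assert_eq!(lexed.tokens.len(), 8);
//! ```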

use mel_syntax::{LexDiagnostic, Lexed, Token, TokenKind, text_range};

/// Streaming MEL lexer over a UTF-8 source string.
///
/// Implements [`Iterator`] over [`Token`]s; call [`Lexer::finish`] after
/// draining the stream to retrieve any collected [`LexDiagnostic`]s.
pub struct Lexer<'a> {
    input: &'a str,
    bytes: &'a [u8],
    offset: usize,
    emitted_eof: bool,
    diagnostics: Vec<LexDiagnostic>,
    /// When set, trivia tokens (whitespace and comments) are skipped.
    significant_only: bool,
    /// When set, block comments are reported with a diagnostic (expression mode).
    reject_block_comments: bool,
}

/// Crate-internal lexing policy knobs (currently just expression-mode
/// block-comment rejection).
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct LexerPolicy {
    pub(crate) reject_block_comments: bool,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer that yields every token, trivia included.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self::with_options(input, false)
    }

    /// Creates a lexer that skips trivia (whitespace and comments).
    #[must_use]
    pub fn significant(input: &'a str) -> Self {
        Self::with_options(input, true)
    }

    /// Like [`Lexer::significant`], but with a crate-internal [`LexerPolicy`].
    #[must_use]
    pub(crate) fn significant_with_policy(input: &'a str, policy: LexerPolicy) -> Self {
        Self::with_options_and_policy(input, true, policy)
    }

    fn with_options(input: &'a str, significant_only: bool) -> Self {
        Self::with_options_and_policy(input, significant_only, LexerPolicy::default())
    }

    fn with_options_and_policy(
        input: &'a str,
        significant_only: bool,
        policy: LexerPolicy,
    ) -> Self {
        Self {
            input,
            bytes: input.as_bytes(),
            offset: 0,
            emitted_eof: false,
            diagnostics: Vec::new(),
            significant_only,
            reject_block_comments: policy.reject_block_comments,
        }
    }

    /// Consumes the lexer and returns the diagnostics collected so far.
    #[must_use]
    pub fn finish(self) -> Vec<LexDiagnostic> {
        self.diagnostics
    }

    /// Core scanning loop. Emits a final [`TokenKind::Eof`] exactly once,
    /// then yields `None`; trivia is skipped when `significant_only` is set.
    fn next_token_internal(&mut self) -> Option<Token> {
        if self.emitted_eof {
            return None;
        }

        loop {
            if self.offset >= self.bytes.len() {
                self.emitted_eof = true;
                let eof = self.input.len() as u32;
                return Some(Token::new(TokenKind::Eof, text_range(eof, eof)));
            }

            let bytes = self.bytes;
            let mut i = self.offset;
            let token = match bytes[i] {
                b' ' | b'\t' | b'\r' | b'\n' => {
                    let start = i;
                    i = lex_whitespace(bytes, i);
                    Token::new(TokenKind::Whitespace, text_range(start as u32, i as u32))
                }
                b'/' if matches!(bytes.get(i + 1), Some(b'/')) => {
                    let start = i;
                    i = lex_line_comment(bytes, i);
                    Token::new(TokenKind::LineComment, text_range(start as u32, i as u32))
                }
                b'/' if matches!(bytes.get(i + 1), Some(b'*')) => {
                    let start = i;
                    let (end, terminated) = lex_block_comment(bytes, i);
                    i = end;
                    if self.reject_block_comments {
                        self.diagnostics.push(LexDiagnostic::new(
                            "block comments are not allowed in expression mode",
                            text_range(start as u32, end as u32),
                        ));
                    }
                    if !terminated {
                        self.diagnostics.push(LexDiagnostic::new(
                            "unterminated block comment",
                            text_range(start as u32, end as u32),
                        ));
                    }
                    Token::new(
                        TokenKind::BlockComment,
                        text_range(start as u32, end as u32),
                    )
                }
                b';' => advance_token(TokenKind::Semi, i, i + 1, &mut i),
                b'(' => advance_token(TokenKind::LParen, i, i + 1, &mut i),
                b')' => advance_token(TokenKind::RParen, i, i + 1, &mut i),
                b'[' => advance_token(TokenKind::LBracket, i, i + 1, &mut i),
                b']' => advance_token(TokenKind::RBracket, i, i + 1, &mut i),
                b'{' => advance_token(TokenKind::LBrace, i, i + 1, &mut i),
                b'}' => advance_token(TokenKind::RBrace, i, i + 1, &mut i),
                // Leading-dot float literal such as `.5` or `.5e+2`.
                b'.' if bytes
                    .get(i + 1)
                    .copied()
                    .is_some_and(|b| b.is_ascii_digit()) =>
                {
                    let start = i;
                    i += 1;
                    while bytes.get(i).copied().is_some_and(|b| b.is_ascii_digit()) {
                        i += 1;
                    }

                    if let Some(end) = lex_exponent_suffix(bytes, i) {
                        i = end;
                    }

                    Token::new(TokenKind::FloatLiteral, text_range(start as u32, i as u32))
                }
                b'.' => advance_token(TokenKind::Dot, i, i + 1, &mut i),
                b',' => advance_token(TokenKind::Comma, i, i + 1, &mut i),
                b'$' => advance_token(TokenKind::Dollar, i, i + 1, &mut i),
                b'`' => advance_token(TokenKind::Backquote, i, i + 1, &mut i),
                b'?' => advance_token(TokenKind::Question, i, i + 1, &mut i),
                b':' => advance_token(TokenKind::Colon, i, i + 1, &mut i),
                b'+' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::PlusEq, i, i + 2, &mut i)
                }
                b'+' if matches!(bytes.get(i + 1), Some(b'+')) => {
                    advance_token(TokenKind::PlusPlus, i, i + 2, &mut i)
                }
                b'+' => advance_token(TokenKind::Plus, i, i + 1, &mut i),
                b'*' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::StarEq, i, i + 2, &mut i)
                }
                b'*' => advance_token(TokenKind::Star, i, i + 1, &mut i),
                b'/' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::SlashEq, i, i + 2, &mut i)
                }
                b'/' => advance_token(TokenKind::Slash, i, i + 1, &mut i),
                b'%' => advance_token(TokenKind::Percent, i, i + 1, &mut i),
                b'^' => advance_token(TokenKind::Caret, i, i + 1, &mut i),
                b'!' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::NotEq, i, i + 2, &mut i)
                }
                b'!' => advance_token(TokenKind::Bang, i, i + 1, &mut i),
                b'=' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::EqEq, i, i + 2, &mut i)
                }
                b'=' => advance_token(TokenKind::Assign, i, i + 1, &mut i),
                b'<' if matches!(bytes.get(i + 1), Some(b'<')) => {
                    advance_token(TokenKind::LtLt, i, i + 2, &mut i)
                }
                b'<' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::Le, i, i + 2, &mut i)
                }
                b'<' => advance_token(TokenKind::Lt, i, i + 1, &mut i),
                b'>' if matches!(bytes.get(i + 1), Some(b'>')) => {
                    advance_token(TokenKind::GtGt, i, i + 2, &mut i)
                }
                b'>' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::Ge, i, i + 2, &mut i)
                }
                b'>' => advance_token(TokenKind::Gt, i, i + 1, &mut i),
                b'&' if matches!(bytes.get(i + 1), Some(b'&')) => {
                    advance_token(TokenKind::AndAnd, i, i + 2, &mut i)
                }
                b'|' if matches!(bytes.get(i + 1), Some(b'|')) => {
                    advance_token(TokenKind::OrOr, i, i + 2, &mut i)
                }
                b'|' => advance_token(TokenKind::Pipe, i, i + 1, &mut i),
                b'-' if matches!(bytes.get(i + 1), Some(b'-')) => {
                    advance_token(TokenKind::MinusMinus, i, i + 2, &mut i)
                }
                b'-' if matches!(bytes.get(i + 1), Some(b'=')) => {
                    advance_token(TokenKind::MinusEq, i, i + 2, &mut i)
                }
                // Command flag such as `-sl`: only when the `-` follows
                // whitespace, so `size($a)-size($b)` still lexes as Minus.
                b'-' if bytes.get(i + 1).copied().is_some_and(is_ident_start_byte)
                    && can_start_flag(bytes, i) =>
                {
                    let start = i;
                    i += 1;
                    while bytes.get(i).copied().is_some_and(is_ident_continue_byte) {
                        i += 1;
                    }
                    Token::new(TokenKind::Flag, text_range(start as u32, i as u32))
                }
                b'-' => advance_token(TokenKind::Minus, i, i + 1, &mut i),
                b'"' => {
                    let start = i;
                    i += 1;
                    let mut terminated = false;
                    while i < bytes.len() {
                        match bytes[i] {
                            b'\\' => {
                                // Skip the escaped byte, unless the backslash
                                // is the final byte of the input.
                                i += if i + 1 < bytes.len() { 2 } else { 1 };
                            }
                            b'"' => {
                                i += 1;
                                terminated = true;
                                break;
                            }
                            _ => i += 1,
                        }
                    }
                    if !terminated {
                        self.diagnostics.push(LexDiagnostic::new(
                            "unterminated string literal",
                            text_range(start as u32, i as u32),
                        ));
                    }
                    Token::new(TokenKind::StringLiteral, text_range(start as u32, i as u32))
                }
                b'0'..=b'9' => {
                    let start = i;

                    // Hex integer literal such as `0x8000`: the prefix must
                    // be followed by at least one hex digit.
                    if bytes[i] == b'0'
                        && matches!(bytes.get(i + 1), Some(b'x' | b'X'))
                        && bytes
                            .get(i + 2)
                            .copied()
                            .is_some_and(|b| b.is_ascii_hexdigit())
                    {
                        i += 2;
                        while bytes.get(i).copied().is_some_and(|b| b.is_ascii_hexdigit()) {
                            i += 1;
                        }
                        self.offset = i;
                        let token =
                            Token::new(TokenKind::IntLiteral, text_range(start as u32, i as u32));
                        if self.significant_only && token.kind.is_trivia() {
                            continue;
                        }
                        return Some(token);
                    }

                    i += 1;
                    while bytes.get(i).copied().is_some_and(|b| b.is_ascii_digit()) {
                        i += 1;
                    }

                    let mut kind = TokenKind::IntLiteral;

                    if matches!(bytes.get(i), Some(b'.')) {
                        if bytes
                            .get(i + 1)
                            .copied()
                            .is_some_and(|b| b.is_ascii_digit())
                        {
                            i += 1;
                            while bytes.get(i).copied().is_some_and(|b| b.is_ascii_digit()) {
                                i += 1;
                            }
                            kind = TokenKind::FloatLiteral;
                        } else if can_end_with_trailing_dot_float(bytes, i + 1) {
                            // Trailing-dot float such as `1000.`.
                            i += 1;
                            kind = TokenKind::FloatLiteral;
                        }
                    }

                    if let Some(end) = lex_exponent_suffix(bytes, i) {
                        i = end;
                        kind = TokenKind::FloatLiteral;
                    }

                    Token::new(kind, text_range(start as u32, i as u32))
                }
                b if is_ident_start_byte(b) => {
                    let start = i;
                    i += 1;
                    while bytes.get(i).copied().is_some_and(is_ident_continue_byte) {
                        i += 1;
                    }
                    Token::new(TokenKind::Ident, text_range(start as u32, i as u32))
                }
                _ => {
                    let start = i;
                    let end = next_codepoint_boundary(self.input, i);
                    self.diagnostics.push(LexDiagnostic::new(
                        "unknown character",
                        text_range(start as u32, end as u32),
                    ));
                    i = end;
                    Token::new(TokenKind::Unknown, text_range(start as u32, end as u32))
                }
            };

            self.offset = i;
            if self.significant_only && token.kind.is_trivia() {
                continue;
            }
            return Some(token);
        }
    }
}

impl Iterator for Lexer<'_> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_token_internal()
    }
}
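
/// Constructs a full-fidelity [`Lexer`]; trivia tokens are included.
///
/// A small streaming sketch (marked `ignore`: the `maya_mel::lexer` import
/// path is an assumption, as in the module example):
///
/// ```ignore
/// let mut lexer = maya_mel::lexer::lexer("$a;");
/// let kinds: Vec<_> = lexer.by_ref().map(|token| token.kind).collect();
/// let diagnostics = lexer.finish(); // diagnostics surface after draining
/// assert!(diagnostics.is_empty());
/// ```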
#[must_use]
pub fn lexer(input: &str) -> Lexer<'_> {
    Lexer::new(input)
}

/// Constructs a trivia-skipping [`Lexer`].
#[must_use]
pub fn significant_lexer(input: &str) -> Lexer<'_> {
    Lexer::significant(input)
}

/// Lexes `input` into a [`Lexed`] bundle of tokens (trivia and a final
/// [`TokenKind::Eof`] included) plus any diagnostics.
#[must_use]
pub fn lex(input: &str) -> Lexed {
    let mut lexer = lexer(input);
    let tokens = lexer.by_ref().collect();
    let diagnostics = lexer.finish();
    Lexed {
        tokens,
        diagnostics,
    }
}
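
/// Like [`lex`], but drops trivia (whitespace and comments).
///
/// A small sketch (marked `ignore`; import path assumed as in the module
/// example):
///
/// ```ignore
/// let lexed = maya_mel::lexer::lex_significant("$foo = 1; // note");
/// // Trivia is gone, but the final EOF token is still emitted.
/// assert!(lexed.tokens.iter().all(|token| !token.kind.is_trivia()));
/// ```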
#[must_use]
pub fn lex_significant(input: &str) -> Lexed {
    let mut lexer = significant_lexer(input);
    let tokens = lexer.by_ref().collect();
    let diagnostics = lexer.finish();
    Lexed {
        tokens,
        diagnostics,
    }
}

/// Builds a single token spanning `start..end` and advances the cursor.
fn advance_token(kind: TokenKind, start: usize, end: usize, index: &mut usize) -> Token {
    *index = end;
    Token::new(kind, text_range(start as u32, end as u32))
}

/// Returns the offset just past the UTF-8 code point starting at `start`.
fn next_codepoint_boundary(input: &str, start: usize) -> usize {
    debug_assert!(input.is_char_boundary(start));
    input[start..]
        .chars()
        .next()
        .map_or(input.len(), |ch| start + ch.len_utf8())
}

fn lex_whitespace(bytes: &[u8], start: usize) -> usize {
    let mut i = start;
    while matches!(bytes.get(i), Some(b' ' | b'\t' | b'\r' | b'\n')) {
        i += 1;
    }
    i
}

/// Consumes a `//` comment up to (but not including) the newline.
fn lex_line_comment(bytes: &[u8], start: usize) -> usize {
    let mut i = start + 2;
    while let Some(byte) = bytes.get(i) {
        if *byte == b'\n' {
            break;
        }
        i += 1;
    }
    i
}

/// Consumes a `/* ... */` comment; returns `(end, terminated)`.
fn lex_block_comment(bytes: &[u8], start: usize) -> (usize, bool) {
    let mut i = start + 2;
    while i + 1 < bytes.len() {
        if bytes[i] == b'*' && bytes[i + 1] == b'/' {
            return (i + 2, true);
        }
        i += 1;
    }
    (bytes.len(), false)
}

/// A `-` can begin a command flag only when it follows whitespace, e.g.
/// `ls -sl`; this keeps `size($a)-size($b)` lexing as subtraction.
fn can_start_flag(bytes: &[u8], index: usize) -> bool {
    index > 0 && bytes[index - 1].is_ascii_whitespace()
}

fn is_ident_start_byte(byte: u8) -> bool {
    byte.is_ascii_alphabetic() || byte == b'_'
}

fn is_ident_continue_byte(byte: u8) -> bool {
    is_ident_start_byte(byte) || byte.is_ascii_digit()
}

/// A trailing `.` closes a float literal (`1000.`) only at end of input or
/// before whitespace, a delimiter, or an operator byte; `1.foo` and `1..`
/// stay split.
fn can_end_with_trailing_dot_float(bytes: &[u8], index: usize) -> bool {
    match bytes.get(index).copied() {
        None => true,
        Some(byte) if byte.is_ascii_whitespace() => true,
        Some(
            b';' | b',' | b')' | b']' | b'}' | b'?' | b':' | b'+' | b'-' | b'*' | b'/' | b'%'
            | b'=' | b'!' | b'<' | b'>' | b'&' | b'|',
        ) => true,
        _ => false,
    }
}

/// Consumes a well-formed exponent suffix (`e`/`E`, optional sign, one or
/// more digits) and returns its end offset, or `None` if malformed.
fn lex_exponent_suffix(bytes: &[u8], start: usize) -> Option<usize> {
    let exponent = bytes.get(start).copied()?;
    if !matches!(exponent, b'e' | b'E') {
        return None;
    }

    let mut index = start + 1;
    if matches!(bytes.get(index), Some(b'+' | b'-')) {
        index += 1;
    }

    let first_digit = bytes.get(index).copied()?;
    if !first_digit.is_ascii_digit() {
        return None;
    }

    index += 1;
    while bytes
        .get(index)
        .copied()
        .is_some_and(|byte| byte.is_ascii_digit())
    {
        index += 1;
    }

    Some(index)
}

#[cfg(test)]
mod tests {
    use super::lex;
    use mel_syntax::{TokenKind, range_end, range_start, text_range};

    fn token_kinds(input: &str) -> Vec<TokenKind> {
        lex(input)
            .tokens
            .into_iter()
            .map(|token| token.kind)
            .collect()
    }

    #[test]
    fn lexes_basic_statement() {
        let kinds = token_kinds(r#"$foo = 1;"#);
        assert_eq!(
            kinds,
            vec![
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Assign,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn lexes_compound_assignment_and_updates() {
        let kinds = token_kinds(r#"$foo += 1; $bar -= 2; $baz *= 3; $qux /= 4; $foo++; $foo--;"#);
        assert_eq!(
            kinds,
            vec![
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::PlusEq,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Semi,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::MinusEq,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Semi,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::StarEq,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Semi,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::SlashEq,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Semi,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::PlusPlus,
                TokenKind::Semi,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::MinusMinus,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn lexes_backquoted_command() {
        let kinds = token_kinds(r#"`ls -sl`;"#);
        assert_eq!(
            kinds,
            vec![
                TokenKind::Backquote,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Flag,
                TokenKind::Backquote,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn lexes_minus_before_ident_in_expression_as_minus() {
        let kinds = token_kinds(r#"size($path)-size($sceneName);"#);
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident,
                TokenKind::LParen,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::RParen,
                TokenKind::Minus,
                TokenKind::Ident,
                TokenKind::LParen,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::RParen,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn keeps_minus_ident_after_whitespace_as_flag() {
        let kinds = token_kinds("optionVar -q Foo;");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Flag,
                TokenKind::Whitespace,
                TokenKind::Ident,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn lexes_exponent_float_literals() {
        let input = "1.0e-3 1e+3 0.0e0 1E-9";
        let lexed = lex(input);
        let texts: Vec<_> = lexed
            .tokens
            .iter()
            .filter(|token| !token.kind.is_trivia() && token.kind != TokenKind::Eof)
            .map(|token| {
                (
                    &input[range_start(token.range) as usize..range_end(token.range) as usize],
                    token.kind,
                )
            })
            .collect();

        assert_eq!(
            texts,
            vec![
                ("1.0e-3", TokenKind::FloatLiteral),
                ("1e+3", TokenKind::FloatLiteral),
                ("0.0e0", TokenKind::FloatLiteral),
                ("1E-9", TokenKind::FloatLiteral),
            ]
        );
    }

    #[test]
    fn lexes_leading_dot_float_literals() {
        let input = ".7 .001 .5e+2 .";
        let lexed = lex(input);
        let texts: Vec<_> = lexed
            .tokens
            .iter()
            .filter(|token| !token.kind.is_trivia() && token.kind != TokenKind::Eof)
            .map(|token| {
                (
                    &input[range_start(token.range) as usize..range_end(token.range) as usize],
                    token.kind,
                )
            })
            .collect();

        assert_eq!(
            texts,
            vec![
                (".7", TokenKind::FloatLiteral),
                (".001", TokenKind::FloatLiteral),
                (".5e+2", TokenKind::FloatLiteral),
                (".", TokenKind::Dot),
            ]
        );
    }

    #[test]
    fn lexes_trailing_dot_float_literals() {
        let input = "1000. 0. -1000. 1.. 1.foo";
        let lexed = lex(input);
        let texts: Vec<_> = lexed
            .tokens
            .iter()
            .filter(|token| !token.kind.is_trivia() && token.kind != TokenKind::Eof)
            .map(|token| {
                (
                    &input[range_start(token.range) as usize..range_end(token.range) as usize],
                    token.kind,
                )
            })
            .collect();

        assert_eq!(
            texts,
            vec![
                ("1000.", TokenKind::FloatLiteral),
                ("0.", TokenKind::FloatLiteral),
                ("-", TokenKind::Minus),
                ("1000.", TokenKind::FloatLiteral),
                ("1", TokenKind::IntLiteral),
                (".", TokenKind::Dot),
                (".", TokenKind::Dot),
                ("1", TokenKind::IntLiteral),
                (".", TokenKind::Dot),
                ("foo", TokenKind::Ident),
            ]
        );
    }

    #[test]
    fn lexes_hex_integer_literals() {
        let input = "0x8000 0X0001 42";
        let lexed = lex(input);
        let texts: Vec<_> = lexed
            .tokens
            .iter()
            .filter(|token| !token.kind.is_trivia() && token.kind != TokenKind::Eof)
            .map(|token| {
                (
                    &input[range_start(token.range) as usize..range_end(token.range) as usize],
                    token.kind,
                )
            })
            .collect();

        assert_eq!(
            texts,
            vec![
                ("0x8000", TokenKind::IntLiteral),
                ("0X0001", TokenKind::IntLiteral),
                ("42", TokenKind::IntLiteral),
            ]
        );
    }

    #[test]
    fn lexes_caret_operator() {
        let kinds = token_kinds("vector $cross = $a ^ $b;");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Assign,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Caret,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn malformed_exponent_suffix_stays_split() {
        let kinds = token_kinds("1e+ 1.0e 1e-");
        assert_eq!(
            kinds,
            vec![
                TokenKind::IntLiteral,
                TokenKind::Ident,
                TokenKind::Plus,
                TokenKind::Whitespace,
                TokenKind::FloatLiteral,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Ident,
                TokenKind::Minus,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn lexes_vector_literals_and_components() {
        let kinds = token_kinds(r#"$dir = <<1, 2, 3>>; $x = $dir.x;"#);
        assert_eq!(
            kinds,
            vec![
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Assign,
                TokenKind::Whitespace,
                TokenKind::LtLt,
                TokenKind::IntLiteral,
                TokenKind::Comma,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Comma,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::GtGt,
                TokenKind::Semi,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::Assign,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Dot,
                TokenKind::Ident,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn lexes_single_pipe_for_dag_paths() {
        let kinds = token_kinds("|pSphere1|pSphereShape1.instObjGroups[0]");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Pipe,
                TokenKind::Ident,
                TokenKind::Pipe,
                TokenKind::Ident,
                TokenKind::Dot,
                TokenKind::Ident,
                TokenKind::LBracket,
                TokenKind::IntLiteral,
                TokenKind::RBracket,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn keeps_double_pipe_as_boolean_or() {
        let kinds = token_kinds("$a || $b");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::OrOr,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn retains_trivia_tokens() {
        let kinds = token_kinds("// lead\n$foo /* mid */ = 1;");
        assert_eq!(
            kinds,
            vec![
                TokenKind::LineComment,
                TokenKind::Whitespace,
                TokenKind::Dollar,
                TokenKind::Ident,
                TokenKind::Whitespace,
                TokenKind::BlockComment,
                TokenKind::Whitespace,
                TokenKind::Assign,
                TokenKind::Whitespace,
                TokenKind::IntLiteral,
                TokenKind::Semi,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn unterminated_string_produces_diagnostic() {
        let lexed = lex("\"unterminated");
        assert_eq!(lexed.tokens.len(), 2);
        assert_eq!(lexed.tokens[0].kind, TokenKind::StringLiteral);
        assert_eq!(lexed.tokens[0].range, text_range(0, 13));
        assert_eq!(lexed.tokens[1].kind, TokenKind::Eof);
        assert_eq!(lexed.tokens[1].range, text_range(13, 13));
        assert_eq!(lexed.diagnostics.len(), 1);
        assert_eq!(lexed.diagnostics[0].message, "unterminated string literal");
        assert_eq!(lexed.diagnostics[0].range, text_range(0, 13));
    }

    #[test]
    fn unterminated_block_comment_produces_diagnostic() {
        let lexed = lex("/* unterminated");
        assert_eq!(lexed.tokens.len(), 2);
        assert_eq!(lexed.tokens[0].kind, TokenKind::BlockComment);
        assert_eq!(lexed.tokens[0].range, text_range(0, 15));
        assert_eq!(lexed.tokens[1].kind, TokenKind::Eof);
        assert_eq!(lexed.tokens[1].range, text_range(15, 15));
        assert_eq!(lexed.diagnostics.len(), 1);
        assert_eq!(lexed.diagnostics[0].message, "unterminated block comment");
        assert_eq!(lexed.diagnostics[0].range, text_range(0, 15));
    }

    #[test]
    fn unknown_character_produces_token_and_diagnostic() {
        let lexed = lex("@");
        assert_eq!(lexed.tokens.len(), 2);
        assert_eq!(lexed.tokens[0].kind, TokenKind::Unknown);
        assert_eq!(lexed.tokens[0].range, text_range(0, 1));
        assert_eq!(lexed.diagnostics.len(), 1);
        assert_eq!(lexed.diagnostics[0].message, "unknown character");
        assert_eq!(lexed.diagnostics[0].range, text_range(0, 1));
    }

    #[test]
    fn unknown_utf8_codepoint_produces_single_token_and_diagnostic() {
        let lexed = lex("😀");
        assert_eq!(lexed.tokens.len(), 2);
        assert_eq!(lexed.tokens[0].kind, TokenKind::Unknown);
        assert_eq!(lexed.tokens[0].range, text_range(0, 4));
        assert_eq!(lexed.tokens[1].kind, TokenKind::Eof);
        assert_eq!(lexed.tokens[1].range, text_range(4, 4));
        assert_eq!(lexed.diagnostics.len(), 1);
        assert_eq!(lexed.diagnostics[0].message, "unknown character");
        assert_eq!(lexed.diagnostics[0].range, text_range(0, 4));
    }
}