Skip to main content

leo_parser_rowan/
lexer.rs

1// Copyright (C) 2019-2026 Provable Inc.
2// This file is part of the Leo library.
3
4// The Leo library is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// The Leo library is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13
14// You should have received a copy of the GNU General Public License
15// along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
16
17//! Lexer for the rowan-based Leo parser.
18//!
19//! This module provides a logos-based lexer that produces tokens suitable for
20//! use with rowan's GreenNodeBuilder. All tokens, including whitespace and
21//! comments (trivia), are explicitly represented.
22
23use crate::{SyntaxKind, SyntaxKind::*};
24use logos::Logos;
25use rowan::{TextRange, TextSize};
26
27/// A token produced by the lexer.
28#[derive(Debug, Clone, Copy, PartialEq, Eq)]
29pub struct Token {
30    /// The kind of token.
31    pub kind: SyntaxKind,
32    /// The length in bytes of the token text.
33    pub len: u32,
34}
35
/// What went wrong while lexing, together with the data a diagnostic needs.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum LexErrorKind {
    /// An integer literal contains a digit that is not valid in its radix
    /// (for example `9` inside an octal literal).
    InvalidDigit { digit: char, radix: u32, token: String },
    /// The input at this position does not form any token.
    CouldNotLex { content: String },
    /// A Unicode bidirectional override code point appeared in the source.
    BidiOverride,
}
46
47/// An error encountered during lexing.
48#[derive(Debug, Clone, PartialEq, Eq)]
49pub struct LexError {
50    /// The text range where the error occurred.
51    pub range: TextRange,
52    /// Structured error kind.
53    pub kind: LexErrorKind,
54}
55
56/// Callback for parsing block comments.
57///
58/// Block comments can't be matched with a simple regex due to the need to find
59/// the closing `*/`. This also detects bidi override characters for security.
60/// Always returns true to produce a token; unterminated comments are detected
61/// by checking if the slice ends with `*/` in the lex() function.
62fn comment_block(lex: &mut logos::Lexer<LogosToken>) -> bool {
63    let mut last_asterisk = false;
64    for (index, c) in lex.remainder().char_indices() {
65        if c == '*' {
66            last_asterisk = true;
67        } else if c == '/' && last_asterisk {
68            lex.bump(index + 1);
69            return true;
70        } else if matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}') {
71            // Bidi character detected - end the comment token here
72            // so we can report that error separately.
73            lex.bump(index);
74            return true;
75        } else {
76            last_asterisk = false;
77        }
78    }
79    // Unterminated block comment - consume all remaining input
80    let remaining = lex.remainder().len();
81    lex.bump(remaining);
82    true
83}
84
85/// Internal logos token enum.
86///
87/// This is mapped to `SyntaxKind` during lexing. We use a separate enum here
88/// because logos requires ownership of the token type during lexing.
89#[derive(Clone, Copy, Debug, PartialEq, Eq, Logos)]
90enum LogosToken {
91    // =========================================================================
92    // Trivia
93    // =========================================================================
94    #[regex(r"[ \t\f]+")]
95    Whitespace,
96
97    #[regex(r"\r?\n")]
98    Linebreak,
99
100    // Comments don't include line breaks or bidi characters
101    #[regex(r"//[^\r\n\u{202A}-\u{202E}\u{2066}-\u{2069}]*")]
102    CommentLine,
103
104    #[token(r"/*", comment_block)]
105    CommentBlock,
106
107    // =========================================================================
108    // Literals
109    // =========================================================================
110    // Address literals: aleo1...
111    // We lex any length and validate later
112    #[regex(r"aleo1[a-z0-9]*")]
113    AddressLiteral,
114
115    // Integer literals with various radixes
116    // The regex includes type suffixes (u8, i32, field, etc.) to lex as a single token.
117    // We use permissive patterns that allow invalid digits (e.g., G in hex) so we can
118    // report specific validation errors rather than failing to lex.
119    #[regex(r"0x[0-9A-Za-z_]+([ui](8|16|32|64|128)|field|group|scalar)?", priority = 3)]
120    #[regex(r"0o[0-9A-Za-z_]+([ui](8|16|32|64|128)|field|group|scalar)?", priority = 3)]
121    #[regex(r"0b[0-9A-Za-z_]+([ui](8|16|32|64|128)|field|group|scalar)?", priority = 3)]
122    #[regex(r"[0-9][0-9A-Za-z_]*([ui](8|16|32|64|128)|field|group|scalar)?")]
123    Integer,
124
125    #[regex(r#""[^"]*""#)]
126    StaticString,
127
128    // Identifier literals: 'foo', 'bar_baz'
129    #[regex(r"'[a-zA-Z][a-zA-Z0-9_]*'")]
130    IdentifierLiteral,
131
132    // =========================================================================
133    // Identifiers and Keywords
134    // =========================================================================
135    // Note: Complex identifiers (paths like foo::bar, program IDs like foo.aleo,
136    // locators like foo.aleo::bar) are deferred to Phase 2. The lexer produces
137    // simple tokens; the parser handles disambiguation.
138    //
139    // We need special cases for `group::abc`, `signature::abc`, and `Future::abc`
140    // as otherwise these would be keywords followed by ::.
141    #[regex(r"group::[a-zA-Z][a-zA-Z0-9_]*")]
142    #[regex(r"signature::[a-zA-Z][a-zA-Z0-9_]*")]
143    #[regex(r"Future::[a-zA-Z][a-zA-Z0-9_]*")]
144    PathSpecial,
145
146    // Identifiers starting with underscore (intrinsic names)
147    #[regex(r"_[a-zA-Z][a-zA-Z0-9_]*")]
148    IdentIntrinsic,
149
150    // Regular identifiers (keywords are matched by checking the slice)
151    #[regex(r"[a-zA-Z][a-zA-Z0-9_]*")]
152    Ident,
153
154    // =========================================================================
155    // Operators (multi-character first for correct priority)
156    // =========================================================================
157    #[token("**=")]
158    PowAssign,
159    #[token("&&=")]
160    AndAssign,
161    #[token("||=")]
162    OrAssign,
163    #[token("<<=")]
164    ShlAssign,
165    #[token(">>=")]
166    ShrAssign,
167
168    #[token("**")]
169    Pow,
170    #[token("&&")]
171    And,
172    #[token("||")]
173    Or,
174    #[token("<<")]
175    Shl,
176    #[token(">>")]
177    Shr,
178    #[token("==")]
179    EqEq,
180    #[token("!=")]
181    NotEq,
182    #[token("<=")]
183    LtEq,
184    #[token(">=")]
185    GtEq,
186    #[token("+=")]
187    AddAssign,
188    #[token("-=")]
189    SubAssign,
190    #[token("*=")]
191    MulAssign,
192    #[token("/=")]
193    DivAssign,
194    #[token("%=")]
195    RemAssign,
196    #[token("&=")]
197    BitAndAssign,
198    #[token("|=")]
199    BitOrAssign,
200    #[token("^=")]
201    BitXorAssign,
202
203    #[token("->")]
204    Arrow,
205    #[token("=>")]
206    FatArrow,
207    #[token("..=")]
208    DotDotEq,
209    #[token("..")]
210    DotDot,
211    #[token("::")]
212    ColonColon,
213
214    // Single character operators and punctuation
215    #[token("=")]
216    Eq,
217    #[token("!")]
218    Bang,
219    #[token("<")]
220    Lt,
221    #[token(">")]
222    Gt,
223    #[token("+")]
224    Plus,
225    #[token("-")]
226    Minus,
227    #[token("*")]
228    Star,
229    #[token("/")]
230    Slash,
231    #[token("%")]
232    Percent,
233    #[token("&")]
234    Amp,
235    #[token("|")]
236    Pipe,
237    #[token("^")]
238    Caret,
239
240    // =========================================================================
241    // Punctuation
242    // =========================================================================
243    #[token("(")]
244    LParen,
245    #[token(")")]
246    RParen,
247    #[token("[")]
248    LBracket,
249    #[token("]")]
250    RBracket,
251    #[token("{")]
252    LBrace,
253    #[token("}")]
254    RBrace,
255    #[token(",")]
256    Comma,
257    #[token(".")]
258    Dot,
259    #[token(";")]
260    Semicolon,
261    #[token(":")]
262    Colon,
263    #[token("?")]
264    Question,
265    #[token("_")]
266    Underscore,
267    #[token("@")]
268    At,
269
270    // =========================================================================
271    // Security
272    // =========================================================================
273    // Unicode bidirectional control characters are a security risk.
274    // We detect them so we can report an error.
275    #[regex(r"[\u{202A}-\u{202E}\u{2066}-\u{2069}]")]
276    Bidi,
277}
278
279/// Convert an identifier slice to the appropriate SyntaxKind (keyword or IDENT).
280fn ident_to_kind(s: &str) -> SyntaxKind {
281    match s {
282        // Literal keywords
283        "true" => KW_TRUE,
284        "false" => KW_FALSE,
285        "none" => KW_NONE,
286        // Type keywords
287        "address" => KW_ADDRESS,
288        "bool" => KW_BOOL,
289        "field" => KW_FIELD,
290        "group" => KW_GROUP,
291        "scalar" => KW_SCALAR,
292        "signature" => KW_SIGNATURE,
293        "string" => KW_STRING,
294        "record" => KW_RECORD,
295        "dyn" => KW_DYN,
296        "identifier" => KW_IDENTIFIER,
297        "i8" => KW_I8,
298        "i16" => KW_I16,
299        "i32" => KW_I32,
300        "i64" => KW_I64,
301        "i128" => KW_I128,
302        "u8" => KW_U8,
303        "u16" => KW_U16,
304        "u32" => KW_U32,
305        "u64" => KW_U64,
306        "u128" => KW_U128,
307        // Control flow keywords
308        "if" => KW_IF,
309        "else" => KW_ELSE,
310        "for" => KW_FOR,
311        "in" => KW_IN,
312        "return" => KW_RETURN,
313        // Declaration keywords
314        "let" => KW_LET,
315        "const" => KW_CONST,
316        "constant" => KW_CONSTANT,
317        "final" => KW_FINAL,
318        "Final" => KW_FINAL_UPPER,
319        "view" => KW_VIEW,
320        "fn" => KW_FN,
321        "Fn" => KW_FN_UPPER,
322        "struct" => KW_STRUCT,
323        "constructor" => KW_CONSTRUCTOR,
324        "interface" => KW_INTERFACE,
325        // Program structure keywords
326        "program" => KW_PROGRAM,
327        "import" => KW_IMPORT,
328        "mapping" => KW_MAPPING,
329        "storage" => KW_STORAGE,
330        "network" => KW_NETWORK,
331        "aleo" => KW_ALEO,
332        "script" => KW_SCRIPT,
333        "block" => KW_BLOCK,
334        // Visibility & assertion keywords
335        "public" => KW_PUBLIC,
336        "private" => KW_PRIVATE,
337        "as" => KW_AS,
338        "self" => KW_SELF,
339        "assert" => KW_ASSERT,
340        "assert_eq" => KW_ASSERT_EQ,
341        "assert_neq" => KW_ASSERT_NEQ,
342        // Not a keyword
343        _ => IDENT,
344    }
345}
346
/// Remove a trailing integer type suffix (`u8` … `u128`, `i8` … `i128`).
///
/// Returns the text in front of the suffix, or `None` when `s` does not end
/// in one of the ten integer type names.
fn strip_int_suffix(s: &str) -> Option<&str> {
    // Candidates are tried in this order, longest names first.
    const INT_SUFFIXES: [&str; 10] = ["u128", "i128", "u64", "i64", "u32", "i32", "u16", "i16", "u8", "i8"];

    INT_SUFFIXES.iter().find_map(|suffix| s.strip_suffix(*suffix))
}
358
359/// Validate integer literal digits for the appropriate radix.
360/// Adds errors to the error vector if invalid digits are found.
361fn validate_integer_digits(text: &str, offset: usize, errors: &mut Vec<LexError>) {
362    // Strip type suffix if present (field, group, scalar, or integer types)
363    let num_part = text
364        .strip_suffix("field")
365        .or_else(|| text.strip_suffix("group"))
366        .or_else(|| text.strip_suffix("scalar"))
367        .or_else(|| strip_int_suffix(text))
368        .unwrap_or(text);
369
370    // Determine the radix and get the digit part
371    let (digits, radix, _prefix_len): (&str, u32, usize) = if let Some(s) = num_part.strip_prefix("0x") {
372        (s, 16, 2)
373    } else if let Some(s) = num_part.strip_prefix("0X") {
374        (s, 16, 2)
375    } else if let Some(s) = num_part.strip_prefix("0o") {
376        (s, 8, 2)
377    } else if let Some(s) = num_part.strip_prefix("0O") {
378        (s, 8, 2)
379    } else if let Some(s) = num_part.strip_prefix("0b") {
380        (s, 2, 2)
381    } else if let Some(s) = num_part.strip_prefix("0B") {
382        (s, 2, 2)
383    } else {
384        // Decimal - no prefix
385        (num_part, 10, 0)
386    };
387
388    // Find the first invalid digit
389    for (_, c) in digits.char_indices() {
390        if c == '_' {
391            continue; // Underscores are allowed
392        }
393        if !c.is_digit(radix) {
394            // Found an invalid digit - span covers the entire numeric part (like LALRPOP)
395            let error_end = offset + num_part.len();
396            errors.push(LexError {
397                range: TextRange::new(TextSize::new(offset as u32), TextSize::new(error_end as u32)),
398                kind: LexErrorKind::InvalidDigit { digit: c, radix, token: num_part.to_string() },
399            });
400            return; // Only report the first invalid digit
401        }
402    }
403}
404
405/// Lex the given source text into a sequence of tokens.
406///
407/// Returns a vector of tokens and any errors encountered. Even if errors
408/// occur, tokens are still produced to enable error recovery in the parser.
409pub fn lex(source: &str) -> (Vec<Token>, Vec<LexError>) {
410    let mut tokens = Vec::new();
411    let mut errors = Vec::new();
412    let mut lexer = LogosToken::lexer(source);
413
414    while let Some(result) = lexer.next() {
415        let span = lexer.span();
416        let len = (span.end - span.start) as u32;
417        let slice = lexer.slice();
418
419        let kind = match result {
420            Ok(token) => match token {
421                // Trivia
422                LogosToken::Whitespace => WHITESPACE,
423                LogosToken::Linebreak => LINEBREAK,
424                LogosToken::CommentLine => COMMENT_LINE,
425                LogosToken::CommentBlock => {
426                    // Check if block comment was properly terminated
427                    if !slice.ends_with("*/") {
428                        let preview_len = slice.len().min(10);
429                        let preview = &slice[..preview_len];
430                        errors.push(LexError {
431                            range: TextRange::new(
432                                TextSize::new(span.start as u32),
433                                TextSize::new((span.start + 2) as u32), // Just the /*
434                            ),
435                            kind: LexErrorKind::CouldNotLex { content: preview.to_string() },
436                        });
437                    }
438                    COMMENT_BLOCK
439                }
440
441                // Literals
442                LogosToken::AddressLiteral => ADDRESS_LIT,
443                LogosToken::Integer => INTEGER,
444                LogosToken::StaticString => STRING,
445                LogosToken::IdentifierLiteral => IDENT_LIT,
446
447                // Identifiers (check for keywords)
448                LogosToken::Ident => ident_to_kind(slice),
449                LogosToken::IdentIntrinsic => IDENT,
450                LogosToken::PathSpecial => IDENT, // Treat as IDENT for now (Phase 2)
451
452                // Multi-char operators
453                LogosToken::PowAssign => STAR2_EQ,
454                LogosToken::AndAssign => AMP2_EQ,
455                LogosToken::OrAssign => PIPE2_EQ,
456                LogosToken::ShlAssign => SHL_EQ,
457                LogosToken::ShrAssign => SHR_EQ,
458                LogosToken::Pow => STAR2,
459                LogosToken::And => AMP2,
460                LogosToken::Or => PIPE2,
461                LogosToken::Shl => SHL,
462                LogosToken::Shr => SHR,
463                LogosToken::EqEq => EQ2,
464                LogosToken::NotEq => BANG_EQ,
465                LogosToken::LtEq => LT_EQ,
466                LogosToken::GtEq => GT_EQ,
467                LogosToken::AddAssign => PLUS_EQ,
468                LogosToken::SubAssign => MINUS_EQ,
469                LogosToken::MulAssign => STAR_EQ,
470                LogosToken::DivAssign => SLASH_EQ,
471                LogosToken::RemAssign => PERCENT_EQ,
472                LogosToken::BitAndAssign => AMP_EQ,
473                LogosToken::BitOrAssign => PIPE_EQ,
474                LogosToken::BitXorAssign => CARET_EQ,
475                LogosToken::Arrow => ARROW,
476                LogosToken::FatArrow => FAT_ARROW,
477                LogosToken::DotDotEq => DOT_DOT_EQ,
478                LogosToken::DotDot => DOT_DOT,
479                LogosToken::ColonColon => COLON_COLON,
480
481                // Single-char operators
482                LogosToken::Eq => EQ,
483                LogosToken::Bang => BANG,
484                LogosToken::Lt => LT,
485                LogosToken::Gt => GT,
486                LogosToken::Plus => PLUS,
487                LogosToken::Minus => MINUS,
488                LogosToken::Star => STAR,
489                LogosToken::Slash => SLASH,
490                LogosToken::Percent => PERCENT,
491                LogosToken::Amp => AMP,
492                LogosToken::Pipe => PIPE,
493                LogosToken::Caret => CARET,
494
495                // Punctuation
496                LogosToken::LParen => L_PAREN,
497                LogosToken::RParen => R_PAREN,
498                LogosToken::LBracket => L_BRACKET,
499                LogosToken::RBracket => R_BRACKET,
500                LogosToken::LBrace => L_BRACE,
501                LogosToken::RBrace => R_BRACE,
502                LogosToken::Comma => COMMA,
503                LogosToken::Dot => DOT,
504                LogosToken::Semicolon => SEMICOLON,
505                LogosToken::Colon => COLON,
506                LogosToken::Question => QUESTION,
507                LogosToken::Underscore => UNDERSCORE,
508                LogosToken::At => AT,
509
510                // Security: bidi characters
511                LogosToken::Bidi => {
512                    errors.push(LexError {
513                        range: TextRange::new(TextSize::new(span.start as u32), TextSize::new(span.end as u32)),
514                        kind: LexErrorKind::BidiOverride,
515                    });
516                    ERROR
517                }
518            },
519            Err(()) => {
520                errors.push(LexError {
521                    range: TextRange::new(TextSize::new(span.start as u32), TextSize::new(span.end as u32)),
522                    kind: LexErrorKind::CouldNotLex { content: slice.to_string() },
523                });
524                ERROR
525            }
526        };
527
528        // Validate integer literal digits for the appropriate radix
529        if kind == INTEGER {
530            validate_integer_digits(slice, span.start, &mut errors);
531        }
532
533        tokens.push(Token { kind, len });
534    }
535
536    // Add EOF token
537    tokens.push(Token { kind: EOF, len: 0 });
538
539    (tokens, errors)
540}
541
542#[cfg(test)]
543mod tests {
544    use super::*;
545    use expect_test::{Expect, expect};
546
547    /// Helper to format tokens for snapshot testing.
548    fn check_lex(input: &str, expect: Expect) {
549        let (tokens, _errors) = lex(input);
550        let mut output = String::new();
551        let mut offset = 0usize;
552        for token in &tokens {
553            let text = &input[offset..offset + token.len as usize];
554            output.push_str(&format!("{:?} {:?}\n", token.kind, text));
555            offset += token.len as usize;
556        }
557        expect.assert_eq(&output);
558    }
559
560    /// Helper to check that lexing produces expected errors.
561    fn check_lex_errors(input: &str, expect: Expect) {
562        let (_tokens, errors) = lex(input);
563        let output = errors
564            .iter()
565            .map(|e| format!("{}..{}:{:?}", u32::from(e.range.start()), u32::from(e.range.end()), e.kind))
566            .collect::<Vec<_>>()
567            .join("\n");
568        expect.assert_eq(&output);
569    }
570
571    #[test]
572    fn lex_empty() {
573        check_lex("", expect![[r#"
574            EOF ""
575        "#]]);
576    }
577
578    #[test]
579    fn lex_whitespace() {
580        check_lex("  \t  ", expect![[r#"
581            WHITESPACE "  \t  "
582            EOF ""
583        "#]]);
584    }
585
586    #[test]
587    fn lex_linebreaks() {
588        check_lex("\n\r\n\n", expect![[r#"
589            LINEBREAK "\n"
590            LINEBREAK "\r\n"
591            LINEBREAK "\n"
592            EOF ""
593"#]]);
594    }
595
596    #[test]
597    fn lex_mixed_whitespace() {
598        check_lex("  \n  \t\n", expect![[r#"
599            WHITESPACE "  "
600            LINEBREAK "\n"
601            WHITESPACE "  \t"
602            LINEBREAK "\n"
603            EOF ""
604        "#]]);
605    }
606
607    #[test]
608    fn lex_line_comments() {
609        check_lex("// hello\n// world", expect![[r#"
610            COMMENT_LINE "// hello"
611            LINEBREAK "\n"
612            COMMENT_LINE "// world"
613            EOF ""
614        "#]]);
615    }
616
617    #[test]
618    fn lex_block_comments() {
619        check_lex("/* hello */ /* multi\nline */", expect![[r#"
620            COMMENT_BLOCK "/* hello */"
621            WHITESPACE " "
622            COMMENT_BLOCK "/* multi\nline */"
623            EOF ""
624        "#]]);
625    }
626
627    #[test]
628    fn lex_identifiers() {
629        check_lex("foo Bar _baz x123", expect![[r#"
630            IDENT "foo"
631            WHITESPACE " "
632            IDENT "Bar"
633            WHITESPACE " "
634            IDENT "_baz"
635            WHITESPACE " "
636            IDENT "x123"
637            EOF ""
638        "#]]);
639    }
640
641    #[test]
642    fn lex_keywords() {
643        check_lex("let fn if return true false", expect![[r#"
644            KW_LET "let"
645            WHITESPACE " "
646            KW_FN "fn"
647            WHITESPACE " "
648            KW_IF "if"
649            WHITESPACE " "
650            KW_RETURN "return"
651            WHITESPACE " "
652            KW_TRUE "true"
653            WHITESPACE " "
654            KW_FALSE "false"
655            EOF ""
656        "#]]);
657    }
658
659    #[test]
660    fn lex_type_keywords() {
661        check_lex("u8 u16 u32 u64 u128 i8 i16 i32 i64 i128", expect![[r#"
662            KW_U8 "u8"
663            WHITESPACE " "
664            KW_U16 "u16"
665            WHITESPACE " "
666            KW_U32 "u32"
667            WHITESPACE " "
668            KW_U64 "u64"
669            WHITESPACE " "
670            KW_U128 "u128"
671            WHITESPACE " "
672            KW_I8 "i8"
673            WHITESPACE " "
674            KW_I16 "i16"
675            WHITESPACE " "
676            KW_I32 "i32"
677            WHITESPACE " "
678            KW_I64 "i64"
679            WHITESPACE " "
680            KW_I128 "i128"
681            EOF ""
682        "#]]);
683    }
684
685    #[test]
686    fn lex_more_type_keywords() {
687        check_lex("bool field group scalar address signature string record", expect![[r#"
688            KW_BOOL "bool"
689            WHITESPACE " "
690            KW_FIELD "field"
691            WHITESPACE " "
692            KW_GROUP "group"
693            WHITESPACE " "
694            KW_SCALAR "scalar"
695            WHITESPACE " "
696            KW_ADDRESS "address"
697            WHITESPACE " "
698            KW_SIGNATURE "signature"
699            WHITESPACE " "
700            KW_STRING "string"
701            WHITESPACE " "
702            KW_RECORD "record"
703            EOF ""
704        "#]]);
705    }
706
707    #[test]
708    fn lex_identifier_literal() {
709        check_lex("'foo' 'bar_baz' 'x'", expect![[r#"
710            IDENT_LIT "'foo'"
711            WHITESPACE " "
712            IDENT_LIT "'bar_baz'"
713            WHITESPACE " "
714            IDENT_LIT "'x'"
715            EOF ""
716        "#]]);
717    }
718
719    #[test]
720    fn lex_identifier_keyword() {
721        check_lex("identifier", expect![[r#"
722            KW_IDENTIFIER "identifier"
723            EOF ""
724        "#]]);
725    }
726
727    #[test]
728    fn lex_integers() {
729        check_lex("123 0xFF 0b101 0o77", expect![[r#"
730            INTEGER "123"
731            WHITESPACE " "
732            INTEGER "0xFF"
733            WHITESPACE " "
734            INTEGER "0b101"
735            WHITESPACE " "
736            INTEGER "0o77"
737            EOF ""
738        "#]]);
739    }
740
741    #[test]
742    fn lex_integers_with_underscores() {
743        check_lex("1_000_000 0xFF_FF", expect![[r#"
744            INTEGER "1_000_000"
745            WHITESPACE " "
746            INTEGER "0xFF_FF"
747            EOF ""
748        "#]]);
749    }
750
751    #[test]
752    fn lex_address_literal() {
753        check_lex("aleo1abc123", expect![[r#"
754            ADDRESS_LIT "aleo1abc123"
755            EOF ""
756        "#]]);
757    }
758
759    #[test]
760    fn lex_strings() {
761        check_lex(r#""hello" "world""#, expect![[r#"
762            STRING "\"hello\""
763            WHITESPACE " "
764            STRING "\"world\""
765            EOF ""
766        "#]]);
767    }
768
769    #[test]
770    fn lex_punctuation() {
771        check_lex("( ) [ ] { } , . ; : :: ? -> => _ @", expect![[r#"
772            L_PAREN "("
773            WHITESPACE " "
774            R_PAREN ")"
775            WHITESPACE " "
776            L_BRACKET "["
777            WHITESPACE " "
778            R_BRACKET "]"
779            WHITESPACE " "
780            L_BRACE "{"
781            WHITESPACE " "
782            R_BRACE "}"
783            WHITESPACE " "
784            COMMA ","
785            WHITESPACE " "
786            DOT "."
787            WHITESPACE " "
788            SEMICOLON ";"
789            WHITESPACE " "
790            COLON ":"
791            WHITESPACE " "
792            COLON_COLON "::"
793            WHITESPACE " "
794            QUESTION "?"
795            WHITESPACE " "
796            ARROW "->"
797            WHITESPACE " "
798            FAT_ARROW "=>"
799            WHITESPACE " "
800            UNDERSCORE "_"
801            WHITESPACE " "
802            AT "@"
803            EOF ""
804        "#]]);
805    }
806
807    #[test]
808    fn lex_arithmetic_operators() {
809        check_lex("+ - * / % **", expect![[r#"
810            PLUS "+"
811            WHITESPACE " "
812            MINUS "-"
813            WHITESPACE " "
814            STAR "*"
815            WHITESPACE " "
816            SLASH "/"
817            WHITESPACE " "
818            PERCENT "%"
819            WHITESPACE " "
820            STAR2 "**"
821            EOF ""
822        "#]]);
823    }
824
825    #[test]
826    fn lex_comparison_operators() {
827        check_lex("== != < <= > >=", expect![[r#"
828            EQ2 "=="
829            WHITESPACE " "
830            BANG_EQ "!="
831            WHITESPACE " "
832            LT "<"
833            WHITESPACE " "
834            LT_EQ "<="
835            WHITESPACE " "
836            GT ">"
837            WHITESPACE " "
838            GT_EQ ">="
839            EOF ""
840        "#]]);
841    }
842
843    #[test]
844    fn lex_logical_operators() {
845        check_lex("&& || !", expect![[r#"
846            AMP2 "&&"
847            WHITESPACE " "
848            PIPE2 "||"
849            WHITESPACE " "
850            BANG "!"
851            EOF ""
852        "#]]);
853    }
854
855    #[test]
856    fn lex_bitwise_operators() {
857        check_lex("& | ^ << >>", expect![[r#"
858            AMP "&"
859            WHITESPACE " "
860            PIPE "|"
861            WHITESPACE " "
862            CARET "^"
863            WHITESPACE " "
864            SHL "<<"
865            WHITESPACE " "
866            SHR ">>"
867            EOF ""
868        "#]]);
869    }
870
871    #[test]
872    fn lex_assignment_operators() {
873        check_lex("= += -= *= /= %= **= &&= ||=", expect![[r#"
874            EQ "="
875            WHITESPACE " "
876            PLUS_EQ "+="
877            WHITESPACE " "
878            MINUS_EQ "-="
879            WHITESPACE " "
880            STAR_EQ "*="
881            WHITESPACE " "
882            SLASH_EQ "/="
883            WHITESPACE " "
884            PERCENT_EQ "%="
885            WHITESPACE " "
886            STAR2_EQ "**="
887            WHITESPACE " "
888            AMP2_EQ "&&="
889            WHITESPACE " "
890            PIPE2_EQ "||="
891            EOF ""
892        "#]]);
893    }
894
895    #[test]
896    fn lex_more_assignment_operators() {
897        check_lex("&= |= ^= <<= >>=", expect![[r#"
898            AMP_EQ "&="
899            WHITESPACE " "
900            PIPE_EQ "|="
901            WHITESPACE " "
902            CARET_EQ "^="
903            WHITESPACE " "
904            SHL_EQ "<<="
905            WHITESPACE " "
906            SHR_EQ ">>="
907            EOF ""
908        "#]]);
909    }
910
911    #[test]
912    fn lex_dot_dot() {
913        check_lex("0..10", expect![[r#"
914            INTEGER "0"
915            DOT_DOT ".."
916            INTEGER "10"
917            EOF ""
918        "#]]);
919    }
920
921    #[test]
922    fn lex_simple_expression() {
923        check_lex("x + y * 2", expect![[r#"
924            IDENT "x"
925            WHITESPACE " "
926            PLUS "+"
927            WHITESPACE " "
928            IDENT "y"
929            WHITESPACE " "
930            STAR "*"
931            WHITESPACE " "
932            INTEGER "2"
933            EOF ""
934        "#]]);
935    }
936
937    #[test]
938    fn lex_function_call() {
939        check_lex("foo(a, b)", expect![[r#"
940            IDENT "foo"
941            L_PAREN "("
942            IDENT "a"
943            COMMA ","
944            WHITESPACE " "
945            IDENT "b"
946            R_PAREN ")"
947            EOF ""
948        "#]]);
949    }
950
951    #[test]
952    fn lex_function_definition() {
953        check_lex("fn add(x: u32) -> u32 {", expect![[r#"
954            KW_FN "fn"
955            WHITESPACE " "
956            IDENT "add"
957            L_PAREN "("
958            IDENT "x"
959            COLON ":"
960            WHITESPACE " "
961            KW_U32 "u32"
962            R_PAREN ")"
963            WHITESPACE " "
964            ARROW "->"
965            WHITESPACE " "
966            KW_U32 "u32"
967            WHITESPACE " "
968            L_BRACE "{"
969            EOF ""
970        "#]]);
971    }
972
973    #[test]
974    fn lex_let_statement() {
975        check_lex("let x: u32 = 42;", expect![[r#"
976            KW_LET "let"
977            WHITESPACE " "
978            IDENT "x"
979            COLON ":"
980            WHITESPACE " "
981            KW_U32 "u32"
982            WHITESPACE " "
983            EQ "="
984            WHITESPACE " "
985            INTEGER "42"
986            SEMICOLON ";"
987            EOF ""
988        "#]]);
989    }
990
991    #[test]
992    fn lex_typed_integers() {
993        // Integer literals with type suffixes should be lexed as single tokens
994        check_lex("1000u32 42i64 0u8 255u128", expect![[r#"
995            INTEGER "1000u32"
996            WHITESPACE " "
997            INTEGER "42i64"
998            WHITESPACE " "
999            INTEGER "0u8"
1000            WHITESPACE " "
1001            INTEGER "255u128"
1002            EOF ""
1003        "#]]);
1004    }
1005
1006    #[test]
1007    fn lex_typed_integers_field() {
1008        // Field, group, and scalar suffixes
1009        check_lex("123field 456group 789scalar", expect![[r#"
1010            INTEGER "123field"
1011            WHITESPACE " "
1012            INTEGER "456group"
1013            WHITESPACE " "
1014            INTEGER "789scalar"
1015            EOF ""
1016        "#]]);
1017    }
1018
1019    #[test]
1020    fn lex_special_paths() {
1021        // These are special cases where keywords are followed by ::
1022        check_lex("group::GEN signature::verify Future::await", expect![[r#"
1023            IDENT "group::GEN"
1024            WHITESPACE " "
1025            IDENT "signature::verify"
1026            WHITESPACE " "
1027            IDENT "Future::await"
1028            EOF ""
1029        "#]]);
1030    }
1031
1032    #[test]
1033    fn lex_typed_integer_range() {
1034        // Integer with type suffix followed by range operator
1035        check_lex("0u8..STOP", expect![[r#"
1036            INTEGER "0u8"
1037            DOT_DOT ".."
1038            IDENT "STOP"
1039            EOF ""
1040        "#]]);
1041    }
1042
1043    #[test]
1044    fn lex_error_unknown_char() {
1045        check_lex_errors("hello $ world", expect![[r#"6..7:CouldNotLex { content: "$" }"#]]);
1046    }
1047
1048    #[test]
1049    fn lex_invalid_hex_digit() {
1050        let (tokens, errors) = lex("0xGAu32");
1051        assert_eq!(tokens.len(), 2); // INTEGER + EOF
1052        assert!(!errors.is_empty());
1053        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: 'G', radix: 16, .. }));
1054    }
1055
1056    #[test]
1057    fn lex_invalid_octal_digit() {
1058        let (_, errors) = lex("0o9u32");
1059        assert!(!errors.is_empty());
1060        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: '9', radix: 8, .. }));
1061    }
1062
1063    #[test]
1064    fn lex_invalid_binary_digit() {
1065        let (_, errors) = lex("0b2u32");
1066        assert!(!errors.is_empty());
1067        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: '2', radix: 2, .. }));
1068    }
1069
1070    #[test]
1071    fn lex_valid_hex_is_ok() {
1072        let (_, errors) = lex("0xDEADBEEFu64");
1073        assert!(errors.is_empty());
1074    }
1075
1076    #[test]
1077    fn lex_invalid_hex_lowercase() {
1078        // Lowercase hex digits beyond f are invalid
1079        let (_, errors) = lex("0xghu32");
1080        assert!(!errors.is_empty());
1081        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: 'g', radix: 16, .. }));
1082    }
1083
1084    #[test]
1085    fn lex_bidi_override_error() {
1086        let (_, errors) = lex("let x\u{202E} = 1;");
1087        assert!(!errors.is_empty());
1088        assert!(matches!(errors[0].kind, LexErrorKind::BidiOverride));
1089    }
1090
1091    #[test]
1092    fn lex_unclosed_block_comment() {
1093        let (tokens, errors) = lex("/* unclosed");
1094        assert!(!errors.is_empty());
1095        assert!(matches!(errors[0].kind, LexErrorKind::CouldNotLex { .. }));
1096        // Should still produce a COMMENT_BLOCK token
1097        assert!(tokens.iter().any(|t| t.kind == COMMENT_BLOCK));
1098    }
1099
1100    #[test]
1101    fn lex_nested_comment_not_supported() {
1102        // Leo doesn't support nested comments - the first */ closes the comment
1103        let (tokens, errors) = lex("/* outer /* inner */");
1104        // No errors - the first */ terminates the comment
1105        assert!(errors.is_empty());
1106        // The "outer /* inner " part is the comment, then "*/" closes it
1107        // But actually let's check what happens...
1108        // "/* outer /* inner */" - this finds the first */ which is at position 18
1109        // So the comment is "/* outer /* inner */" which IS terminated
1110        assert!(tokens.iter().any(|t| t.kind == COMMENT_BLOCK));
1111    }
1112
1113    #[test]
1114    fn lex_closed_comment_ok() {
1115        let (_, errors) = lex("/* closed */");
1116        assert!(errors.is_empty());
1117    }
1118}