Skip to main content

leo_parser_rowan/
lexer.rs

1// Copyright (C) 2019-2026 Provable Inc.
2// This file is part of the Leo library.
3
4// The Leo library is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// The Leo library is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13
14// You should have received a copy of the GNU General Public License
15// along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
16
17//! Lexer for the rowan-based Leo parser.
18//!
19//! This module provides a logos-based lexer that produces tokens suitable for
20//! use with rowan's GreenNodeBuilder. All tokens, including whitespace and
21//! comments (trivia), are explicitly represented.
22
23use crate::{SyntaxKind, SyntaxKind::*};
24use logos::Logos;
25use rowan::{TextRange, TextSize};
26
/// A token produced by the lexer.
///
/// A token stores only its kind and byte length, not its position; consumers
/// recover absolute offsets by summing the lengths of preceding tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token {
    /// The kind of token.
    pub kind: SyntaxKind,
    /// The length in bytes of the token text.
    pub len: u32,
}
35
/// The kind of lexer error, carrying any structured data needed for diagnostics.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A digit was invalid for the given radix (e.g. `9` in an octal literal).
    /// `token` is the numeric part of the offending literal (prefix included,
    /// type suffix stripped).
    InvalidDigit { digit: char, radix: u32, token: String },
    /// A token could not be lexed at all; `content` is the offending text
    /// (possibly truncated to a short preview).
    CouldNotLex { content: String },
    /// A Unicode bidi override code point was encountered (a security risk;
    /// see the `Bidi` lexer rule).
    BidiOverride,
}
46
/// An error encountered during lexing.
///
/// Errors do not stop lexing: `lex()` keeps producing tokens so the parser
/// can recover, and collects these alongside.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// The text range where the error occurred.
    pub range: TextRange,
    /// Structured error kind.
    pub kind: LexErrorKind,
}
55
56/// Callback for parsing block comments.
57///
58/// Block comments can't be matched with a simple regex due to the need to find
59/// the closing `*/`. This also detects bidi override characters for security.
60/// Always returns true to produce a token; unterminated comments are detected
61/// by checking if the slice ends with `*/` in the lex() function.
62fn comment_block(lex: &mut logos::Lexer<LogosToken>) -> bool {
63    let mut last_asterisk = false;
64    for (index, c) in lex.remainder().char_indices() {
65        if c == '*' {
66            last_asterisk = true;
67        } else if c == '/' && last_asterisk {
68            lex.bump(index + 1);
69            return true;
70        } else if matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}') {
71            // Bidi character detected - end the comment token here
72            // so we can report that error separately.
73            lex.bump(index);
74            return true;
75        } else {
76            last_asterisk = false;
77        }
78    }
79    // Unterminated block comment - consume all remaining input
80    let remaining = lex.remainder().len();
81    lex.bump(remaining);
82    true
83}
84
/// Internal logos token enum.
///
/// This is mapped to `SyntaxKind` during lexing. We use a separate enum here
/// because logos requires ownership of the token type during lexing.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Logos)]
enum LogosToken {
    // =========================================================================
    // Trivia
    // =========================================================================
    // Horizontal whitespace only; line breaks are a separate token.
    #[regex(r"[ \t\f]+")]
    Whitespace,

    #[regex(r"\r?\n")]
    Linebreak,

    // Comments don't include line breaks or bidi characters
    // (bidi code points end the comment so they lex as their own token).
    #[regex(r"//[^\r\n\u{202A}-\u{202E}\u{2066}-\u{2069}]*")]
    CommentLine,

    // Delegates to the callback, since finding the closing `*/` needs a scan.
    #[token(r"/*", comment_block)]
    CommentBlock,

    // =========================================================================
    // Literals
    // =========================================================================
    // Address literals: aleo1...
    // We lex any length and validate later
    #[regex(r"aleo1[a-z0-9]*")]
    AddressLiteral,

    // Integer literals with various radixes
    // The regex includes type suffixes (u8, i32, field, etc.) to lex as a single token.
    // We use permissive patterns that allow invalid digits (e.g., G in hex) so we can
    // report specific validation errors rather than failing to lex.
    // priority = 3 ensures the prefixed forms win over the decimal rule below.
    #[regex(r"0x[0-9A-Za-z_]+([ui](8|16|32|64|128)|field|group|scalar)?", priority = 3)]
    #[regex(r"0o[0-9A-Za-z_]+([ui](8|16|32|64|128)|field|group|scalar)?", priority = 3)]
    #[regex(r"0b[0-9A-Za-z_]+([ui](8|16|32|64|128)|field|group|scalar)?", priority = 3)]
    #[regex(r"[0-9][0-9A-Za-z_]*([ui](8|16|32|64|128)|field|group|scalar)?")]
    Integer,

    // NOTE(review): no escape handling and `[^"]` permits newlines inside the
    // literal — presumably validated later; confirm against the grammar.
    #[regex(r#""[^"]*""#)]
    StaticString,

    // Identifier literals: 'foo', 'bar_baz'
    #[regex(r"'[a-zA-Z][a-zA-Z0-9_]*'")]
    IdentifierLiteral,

    // =========================================================================
    // Identifiers and Keywords
    // =========================================================================
    // Note: Complex identifiers (paths like foo::bar, program IDs like foo.aleo,
    // locators like foo.aleo::bar) are deferred to Phase 2. The lexer produces
    // simple tokens; the parser handles disambiguation.
    //
    // We need special cases for `group::abc`, `signature::abc`, and `Future::abc`
    // as otherwise these would be keywords followed by ::.
    #[regex(r"group::[a-zA-Z][a-zA-Z0-9_]*")]
    #[regex(r"signature::[a-zA-Z][a-zA-Z0-9_]*")]
    #[regex(r"Future::[a-zA-Z][a-zA-Z0-9_]*")]
    PathSpecial,

    // Identifiers starting with underscore (intrinsic names)
    #[regex(r"_[a-zA-Z][a-zA-Z0-9_]*")]
    IdentIntrinsic,

    // Regular identifiers (keywords are matched by checking the slice
    // against the table in `ident_to_kind`)
    #[regex(r"[a-zA-Z][a-zA-Z0-9_]*")]
    Ident,

    // =========================================================================
    // Operators (multi-character first for correct priority)
    // =========================================================================
    #[token("**=")]
    PowAssign,
    #[token("&&=")]
    AndAssign,
    #[token("||=")]
    OrAssign,
    #[token("<<=")]
    ShlAssign,
    #[token(">>=")]
    ShrAssign,

    #[token("**")]
    Pow,
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("+=")]
    AddAssign,
    #[token("-=")]
    SubAssign,
    #[token("*=")]
    MulAssign,
    #[token("/=")]
    DivAssign,
    #[token("%=")]
    RemAssign,
    #[token("&=")]
    BitAndAssign,
    #[token("|=")]
    BitOrAssign,
    #[token("^=")]
    BitXorAssign,

    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("..=")]
    DotDotEq,
    #[token("..")]
    DotDot,
    #[token("::")]
    ColonColon,

    // Single character operators and punctuation
    #[token("=")]
    Eq,
    #[token("!")]
    Bang,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("&")]
    Amp,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,

    // =========================================================================
    // Punctuation
    // =========================================================================
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token(";")]
    Semicolon,
    #[token(":")]
    Colon,
    #[token("?")]
    Question,
    #[token("_")]
    Underscore,
    #[token("@")]
    At,

    // =========================================================================
    // Security
    // =========================================================================
    // Unicode bidirectional control characters are a security risk.
    // We detect them so we can report an error.
    #[regex(r"[\u{202A}-\u{202E}\u{2066}-\u{2069}]")]
    Bidi,
}
278
/// Convert an identifier slice to the appropriate SyntaxKind (keyword or IDENT).
///
/// Keyword recognition happens here, on the lexed slice, rather than with
/// per-keyword lexer rules, so the lexer needs only a single `Ident` rule.
/// Any string not in this table is a plain `IDENT`.
fn ident_to_kind(s: &str) -> SyntaxKind {
    match s {
        // Literal keywords
        "true" => KW_TRUE,
        "false" => KW_FALSE,
        "none" => KW_NONE,
        // Type keywords
        "address" => KW_ADDRESS,
        "bool" => KW_BOOL,
        "field" => KW_FIELD,
        "group" => KW_GROUP,
        "scalar" => KW_SCALAR,
        "signature" => KW_SIGNATURE,
        "string" => KW_STRING,
        "record" => KW_RECORD,
        "dyn" => KW_DYN,
        "identifier" => KW_IDENTIFIER,
        "i8" => KW_I8,
        "i16" => KW_I16,
        "i32" => KW_I32,
        "i64" => KW_I64,
        "i128" => KW_I128,
        "u8" => KW_U8,
        "u16" => KW_U16,
        "u32" => KW_U32,
        "u64" => KW_U64,
        "u128" => KW_U128,
        // Control flow keywords
        "if" => KW_IF,
        "else" => KW_ELSE,
        "for" => KW_FOR,
        "in" => KW_IN,
        "return" => KW_RETURN,
        // Declaration keywords
        "let" => KW_LET,
        "const" => KW_CONST,
        "constant" => KW_CONSTANT,
        "final" => KW_FINAL,
        "Final" => KW_FINAL_UPPER,   // capitalized variant is a distinct kind
        "fn" => KW_FN,
        "Fn" => KW_FN_UPPER,         // capitalized variant is a distinct kind
        "struct" => KW_STRUCT,
        "constructor" => KW_CONSTRUCTOR,
        "interface" => KW_INTERFACE,
        // Program structure keywords
        "program" => KW_PROGRAM,
        "import" => KW_IMPORT,
        "mapping" => KW_MAPPING,
        "storage" => KW_STORAGE,
        "network" => KW_NETWORK,
        "aleo" => KW_ALEO,
        "script" => KW_SCRIPT,
        "block" => KW_BLOCK,
        // Visibility & assertion keywords
        "public" => KW_PUBLIC,
        "private" => KW_PRIVATE,
        "as" => KW_AS,
        "self" => KW_SELF,
        "assert" => KW_ASSERT,
        "assert_eq" => KW_ASSERT_EQ,
        "assert_neq" => KW_ASSERT_NEQ,
        // Not a keyword
        _ => IDENT,
    }
}
345
/// Strip an integer type suffix (`u8`..`u128` / `i8`..`i128`) from `s`,
/// returning the numeric part, or `None` if no integer suffix is present.
fn strip_int_suffix(s: &str) -> Option<&str> {
    // Longest suffixes first to match correctly.
    ["u128", "i128", "u64", "i64", "u32", "i32", "u16", "i16", "u8", "i8"]
        .iter()
        .find_map(|suffix| s.strip_suffix(suffix))
}
357
358/// Validate integer literal digits for the appropriate radix.
359/// Adds errors to the error vector if invalid digits are found.
360fn validate_integer_digits(text: &str, offset: usize, errors: &mut Vec<LexError>) {
361    // Strip type suffix if present (field, group, scalar, or integer types)
362    let num_part = text
363        .strip_suffix("field")
364        .or_else(|| text.strip_suffix("group"))
365        .or_else(|| text.strip_suffix("scalar"))
366        .or_else(|| strip_int_suffix(text))
367        .unwrap_or(text);
368
369    // Determine the radix and get the digit part
370    let (digits, radix, _prefix_len): (&str, u32, usize) = if let Some(s) = num_part.strip_prefix("0x") {
371        (s, 16, 2)
372    } else if let Some(s) = num_part.strip_prefix("0X") {
373        (s, 16, 2)
374    } else if let Some(s) = num_part.strip_prefix("0o") {
375        (s, 8, 2)
376    } else if let Some(s) = num_part.strip_prefix("0O") {
377        (s, 8, 2)
378    } else if let Some(s) = num_part.strip_prefix("0b") {
379        (s, 2, 2)
380    } else if let Some(s) = num_part.strip_prefix("0B") {
381        (s, 2, 2)
382    } else {
383        // Decimal - no prefix
384        (num_part, 10, 0)
385    };
386
387    // Find the first invalid digit
388    for (_, c) in digits.char_indices() {
389        if c == '_' {
390            continue; // Underscores are allowed
391        }
392        if !c.is_digit(radix) {
393            // Found an invalid digit - span covers the entire numeric part (like LALRPOP)
394            let error_end = offset + num_part.len();
395            errors.push(LexError {
396                range: TextRange::new(TextSize::new(offset as u32), TextSize::new(error_end as u32)),
397                kind: LexErrorKind::InvalidDigit { digit: c, radix, token: num_part.to_string() },
398            });
399            return; // Only report the first invalid digit
400        }
401    }
402}
403
404/// Lex the given source text into a sequence of tokens.
405///
406/// Returns a vector of tokens and any errors encountered. Even if errors
407/// occur, tokens are still produced to enable error recovery in the parser.
408pub fn lex(source: &str) -> (Vec<Token>, Vec<LexError>) {
409    let mut tokens = Vec::new();
410    let mut errors = Vec::new();
411    let mut lexer = LogosToken::lexer(source);
412
413    while let Some(result) = lexer.next() {
414        let span = lexer.span();
415        let len = (span.end - span.start) as u32;
416        let slice = lexer.slice();
417
418        let kind = match result {
419            Ok(token) => match token {
420                // Trivia
421                LogosToken::Whitespace => WHITESPACE,
422                LogosToken::Linebreak => LINEBREAK,
423                LogosToken::CommentLine => COMMENT_LINE,
424                LogosToken::CommentBlock => {
425                    // Check if block comment was properly terminated
426                    if !slice.ends_with("*/") {
427                        let preview_len = slice.len().min(10);
428                        let preview = &slice[..preview_len];
429                        errors.push(LexError {
430                            range: TextRange::new(
431                                TextSize::new(span.start as u32),
432                                TextSize::new((span.start + 2) as u32), // Just the /*
433                            ),
434                            kind: LexErrorKind::CouldNotLex { content: preview.to_string() },
435                        });
436                    }
437                    COMMENT_BLOCK
438                }
439
440                // Literals
441                LogosToken::AddressLiteral => ADDRESS_LIT,
442                LogosToken::Integer => INTEGER,
443                LogosToken::StaticString => STRING,
444                LogosToken::IdentifierLiteral => IDENT_LIT,
445
446                // Identifiers (check for keywords)
447                LogosToken::Ident => ident_to_kind(slice),
448                LogosToken::IdentIntrinsic => IDENT,
449                LogosToken::PathSpecial => IDENT, // Treat as IDENT for now (Phase 2)
450
451                // Multi-char operators
452                LogosToken::PowAssign => STAR2_EQ,
453                LogosToken::AndAssign => AMP2_EQ,
454                LogosToken::OrAssign => PIPE2_EQ,
455                LogosToken::ShlAssign => SHL_EQ,
456                LogosToken::ShrAssign => SHR_EQ,
457                LogosToken::Pow => STAR2,
458                LogosToken::And => AMP2,
459                LogosToken::Or => PIPE2,
460                LogosToken::Shl => SHL,
461                LogosToken::Shr => SHR,
462                LogosToken::EqEq => EQ2,
463                LogosToken::NotEq => BANG_EQ,
464                LogosToken::LtEq => LT_EQ,
465                LogosToken::GtEq => GT_EQ,
466                LogosToken::AddAssign => PLUS_EQ,
467                LogosToken::SubAssign => MINUS_EQ,
468                LogosToken::MulAssign => STAR_EQ,
469                LogosToken::DivAssign => SLASH_EQ,
470                LogosToken::RemAssign => PERCENT_EQ,
471                LogosToken::BitAndAssign => AMP_EQ,
472                LogosToken::BitOrAssign => PIPE_EQ,
473                LogosToken::BitXorAssign => CARET_EQ,
474                LogosToken::Arrow => ARROW,
475                LogosToken::FatArrow => FAT_ARROW,
476                LogosToken::DotDotEq => DOT_DOT_EQ,
477                LogosToken::DotDot => DOT_DOT,
478                LogosToken::ColonColon => COLON_COLON,
479
480                // Single-char operators
481                LogosToken::Eq => EQ,
482                LogosToken::Bang => BANG,
483                LogosToken::Lt => LT,
484                LogosToken::Gt => GT,
485                LogosToken::Plus => PLUS,
486                LogosToken::Minus => MINUS,
487                LogosToken::Star => STAR,
488                LogosToken::Slash => SLASH,
489                LogosToken::Percent => PERCENT,
490                LogosToken::Amp => AMP,
491                LogosToken::Pipe => PIPE,
492                LogosToken::Caret => CARET,
493
494                // Punctuation
495                LogosToken::LParen => L_PAREN,
496                LogosToken::RParen => R_PAREN,
497                LogosToken::LBracket => L_BRACKET,
498                LogosToken::RBracket => R_BRACKET,
499                LogosToken::LBrace => L_BRACE,
500                LogosToken::RBrace => R_BRACE,
501                LogosToken::Comma => COMMA,
502                LogosToken::Dot => DOT,
503                LogosToken::Semicolon => SEMICOLON,
504                LogosToken::Colon => COLON,
505                LogosToken::Question => QUESTION,
506                LogosToken::Underscore => UNDERSCORE,
507                LogosToken::At => AT,
508
509                // Security: bidi characters
510                LogosToken::Bidi => {
511                    errors.push(LexError {
512                        range: TextRange::new(TextSize::new(span.start as u32), TextSize::new(span.end as u32)),
513                        kind: LexErrorKind::BidiOverride,
514                    });
515                    ERROR
516                }
517            },
518            Err(()) => {
519                errors.push(LexError {
520                    range: TextRange::new(TextSize::new(span.start as u32), TextSize::new(span.end as u32)),
521                    kind: LexErrorKind::CouldNotLex { content: slice.to_string() },
522                });
523                ERROR
524            }
525        };
526
527        // Validate integer literal digits for the appropriate radix
528        if kind == INTEGER {
529            validate_integer_digits(slice, span.start, &mut errors);
530        }
531
532        tokens.push(Token { kind, len });
533    }
534
535    // Add EOF token
536    tokens.push(Token { kind: EOF, len: 0 });
537
538    (tokens, errors)
539}
540
541#[cfg(test)]
542mod tests {
543    use super::*;
544    use expect_test::{Expect, expect};
545
546    /// Helper to format tokens for snapshot testing.
547    fn check_lex(input: &str, expect: Expect) {
548        let (tokens, _errors) = lex(input);
549        let mut output = String::new();
550        let mut offset = 0usize;
551        for token in &tokens {
552            let text = &input[offset..offset + token.len as usize];
553            output.push_str(&format!("{:?} {:?}\n", token.kind, text));
554            offset += token.len as usize;
555        }
556        expect.assert_eq(&output);
557    }
558
559    /// Helper to check that lexing produces expected errors.
560    fn check_lex_errors(input: &str, expect: Expect) {
561        let (_tokens, errors) = lex(input);
562        let output = errors
563            .iter()
564            .map(|e| format!("{}..{}:{:?}", u32::from(e.range.start()), u32::from(e.range.end()), e.kind))
565            .collect::<Vec<_>>()
566            .join("\n");
567        expect.assert_eq(&output);
568    }
569
570    #[test]
571    fn lex_empty() {
572        check_lex("", expect![[r#"
573            EOF ""
574        "#]]);
575    }
576
577    #[test]
578    fn lex_whitespace() {
579        check_lex("  \t  ", expect![[r#"
580            WHITESPACE "  \t  "
581            EOF ""
582        "#]]);
583    }
584
585    #[test]
586    fn lex_linebreaks() {
587        check_lex("\n\r\n\n", expect![[r#"
588            LINEBREAK "\n"
589            LINEBREAK "\r\n"
590            LINEBREAK "\n"
591            EOF ""
592"#]]);
593    }
594
595    #[test]
596    fn lex_mixed_whitespace() {
597        check_lex("  \n  \t\n", expect![[r#"
598            WHITESPACE "  "
599            LINEBREAK "\n"
600            WHITESPACE "  \t"
601            LINEBREAK "\n"
602            EOF ""
603        "#]]);
604    }
605
606    #[test]
607    fn lex_line_comments() {
608        check_lex("// hello\n// world", expect![[r#"
609            COMMENT_LINE "// hello"
610            LINEBREAK "\n"
611            COMMENT_LINE "// world"
612            EOF ""
613        "#]]);
614    }
615
616    #[test]
617    fn lex_block_comments() {
618        check_lex("/* hello */ /* multi\nline */", expect![[r#"
619            COMMENT_BLOCK "/* hello */"
620            WHITESPACE " "
621            COMMENT_BLOCK "/* multi\nline */"
622            EOF ""
623        "#]]);
624    }
625
626    #[test]
627    fn lex_identifiers() {
628        check_lex("foo Bar _baz x123", expect![[r#"
629            IDENT "foo"
630            WHITESPACE " "
631            IDENT "Bar"
632            WHITESPACE " "
633            IDENT "_baz"
634            WHITESPACE " "
635            IDENT "x123"
636            EOF ""
637        "#]]);
638    }
639
640    #[test]
641    fn lex_keywords() {
642        check_lex("let fn if return true false", expect![[r#"
643            KW_LET "let"
644            WHITESPACE " "
645            KW_FN "fn"
646            WHITESPACE " "
647            KW_IF "if"
648            WHITESPACE " "
649            KW_RETURN "return"
650            WHITESPACE " "
651            KW_TRUE "true"
652            WHITESPACE " "
653            KW_FALSE "false"
654            EOF ""
655        "#]]);
656    }
657
658    #[test]
659    fn lex_type_keywords() {
660        check_lex("u8 u16 u32 u64 u128 i8 i16 i32 i64 i128", expect![[r#"
661            KW_U8 "u8"
662            WHITESPACE " "
663            KW_U16 "u16"
664            WHITESPACE " "
665            KW_U32 "u32"
666            WHITESPACE " "
667            KW_U64 "u64"
668            WHITESPACE " "
669            KW_U128 "u128"
670            WHITESPACE " "
671            KW_I8 "i8"
672            WHITESPACE " "
673            KW_I16 "i16"
674            WHITESPACE " "
675            KW_I32 "i32"
676            WHITESPACE " "
677            KW_I64 "i64"
678            WHITESPACE " "
679            KW_I128 "i128"
680            EOF ""
681        "#]]);
682    }
683
684    #[test]
685    fn lex_more_type_keywords() {
686        check_lex("bool field group scalar address signature string record", expect![[r#"
687            KW_BOOL "bool"
688            WHITESPACE " "
689            KW_FIELD "field"
690            WHITESPACE " "
691            KW_GROUP "group"
692            WHITESPACE " "
693            KW_SCALAR "scalar"
694            WHITESPACE " "
695            KW_ADDRESS "address"
696            WHITESPACE " "
697            KW_SIGNATURE "signature"
698            WHITESPACE " "
699            KW_STRING "string"
700            WHITESPACE " "
701            KW_RECORD "record"
702            EOF ""
703        "#]]);
704    }
705
706    #[test]
707    fn lex_identifier_literal() {
708        check_lex("'foo' 'bar_baz' 'x'", expect![[r#"
709            IDENT_LIT "'foo'"
710            WHITESPACE " "
711            IDENT_LIT "'bar_baz'"
712            WHITESPACE " "
713            IDENT_LIT "'x'"
714            EOF ""
715        "#]]);
716    }
717
718    #[test]
719    fn lex_identifier_keyword() {
720        check_lex("identifier", expect![[r#"
721            KW_IDENTIFIER "identifier"
722            EOF ""
723        "#]]);
724    }
725
726    #[test]
727    fn lex_integers() {
728        check_lex("123 0xFF 0b101 0o77", expect![[r#"
729            INTEGER "123"
730            WHITESPACE " "
731            INTEGER "0xFF"
732            WHITESPACE " "
733            INTEGER "0b101"
734            WHITESPACE " "
735            INTEGER "0o77"
736            EOF ""
737        "#]]);
738    }
739
740    #[test]
741    fn lex_integers_with_underscores() {
742        check_lex("1_000_000 0xFF_FF", expect![[r#"
743            INTEGER "1_000_000"
744            WHITESPACE " "
745            INTEGER "0xFF_FF"
746            EOF ""
747        "#]]);
748    }
749
750    #[test]
751    fn lex_address_literal() {
752        check_lex("aleo1abc123", expect![[r#"
753            ADDRESS_LIT "aleo1abc123"
754            EOF ""
755        "#]]);
756    }
757
758    #[test]
759    fn lex_strings() {
760        check_lex(r#""hello" "world""#, expect![[r#"
761            STRING "\"hello\""
762            WHITESPACE " "
763            STRING "\"world\""
764            EOF ""
765        "#]]);
766    }
767
768    #[test]
769    fn lex_punctuation() {
770        check_lex("( ) [ ] { } , . ; : :: ? -> => _ @", expect![[r#"
771            L_PAREN "("
772            WHITESPACE " "
773            R_PAREN ")"
774            WHITESPACE " "
775            L_BRACKET "["
776            WHITESPACE " "
777            R_BRACKET "]"
778            WHITESPACE " "
779            L_BRACE "{"
780            WHITESPACE " "
781            R_BRACE "}"
782            WHITESPACE " "
783            COMMA ","
784            WHITESPACE " "
785            DOT "."
786            WHITESPACE " "
787            SEMICOLON ";"
788            WHITESPACE " "
789            COLON ":"
790            WHITESPACE " "
791            COLON_COLON "::"
792            WHITESPACE " "
793            QUESTION "?"
794            WHITESPACE " "
795            ARROW "->"
796            WHITESPACE " "
797            FAT_ARROW "=>"
798            WHITESPACE " "
799            UNDERSCORE "_"
800            WHITESPACE " "
801            AT "@"
802            EOF ""
803        "#]]);
804    }
805
806    #[test]
807    fn lex_arithmetic_operators() {
808        check_lex("+ - * / % **", expect![[r#"
809            PLUS "+"
810            WHITESPACE " "
811            MINUS "-"
812            WHITESPACE " "
813            STAR "*"
814            WHITESPACE " "
815            SLASH "/"
816            WHITESPACE " "
817            PERCENT "%"
818            WHITESPACE " "
819            STAR2 "**"
820            EOF ""
821        "#]]);
822    }
823
824    #[test]
825    fn lex_comparison_operators() {
826        check_lex("== != < <= > >=", expect![[r#"
827            EQ2 "=="
828            WHITESPACE " "
829            BANG_EQ "!="
830            WHITESPACE " "
831            LT "<"
832            WHITESPACE " "
833            LT_EQ "<="
834            WHITESPACE " "
835            GT ">"
836            WHITESPACE " "
837            GT_EQ ">="
838            EOF ""
839        "#]]);
840    }
841
842    #[test]
843    fn lex_logical_operators() {
844        check_lex("&& || !", expect![[r#"
845            AMP2 "&&"
846            WHITESPACE " "
847            PIPE2 "||"
848            WHITESPACE " "
849            BANG "!"
850            EOF ""
851        "#]]);
852    }
853
854    #[test]
855    fn lex_bitwise_operators() {
856        check_lex("& | ^ << >>", expect![[r#"
857            AMP "&"
858            WHITESPACE " "
859            PIPE "|"
860            WHITESPACE " "
861            CARET "^"
862            WHITESPACE " "
863            SHL "<<"
864            WHITESPACE " "
865            SHR ">>"
866            EOF ""
867        "#]]);
868    }
869
870    #[test]
871    fn lex_assignment_operators() {
872        check_lex("= += -= *= /= %= **= &&= ||=", expect![[r#"
873            EQ "="
874            WHITESPACE " "
875            PLUS_EQ "+="
876            WHITESPACE " "
877            MINUS_EQ "-="
878            WHITESPACE " "
879            STAR_EQ "*="
880            WHITESPACE " "
881            SLASH_EQ "/="
882            WHITESPACE " "
883            PERCENT_EQ "%="
884            WHITESPACE " "
885            STAR2_EQ "**="
886            WHITESPACE " "
887            AMP2_EQ "&&="
888            WHITESPACE " "
889            PIPE2_EQ "||="
890            EOF ""
891        "#]]);
892    }
893
894    #[test]
895    fn lex_more_assignment_operators() {
896        check_lex("&= |= ^= <<= >>=", expect![[r#"
897            AMP_EQ "&="
898            WHITESPACE " "
899            PIPE_EQ "|="
900            WHITESPACE " "
901            CARET_EQ "^="
902            WHITESPACE " "
903            SHL_EQ "<<="
904            WHITESPACE " "
905            SHR_EQ ">>="
906            EOF ""
907        "#]]);
908    }
909
910    #[test]
911    fn lex_dot_dot() {
912        check_lex("0..10", expect![[r#"
913            INTEGER "0"
914            DOT_DOT ".."
915            INTEGER "10"
916            EOF ""
917        "#]]);
918    }
919
920    #[test]
921    fn lex_simple_expression() {
922        check_lex("x + y * 2", expect![[r#"
923            IDENT "x"
924            WHITESPACE " "
925            PLUS "+"
926            WHITESPACE " "
927            IDENT "y"
928            WHITESPACE " "
929            STAR "*"
930            WHITESPACE " "
931            INTEGER "2"
932            EOF ""
933        "#]]);
934    }
935
936    #[test]
937    fn lex_function_call() {
938        check_lex("foo(a, b)", expect![[r#"
939            IDENT "foo"
940            L_PAREN "("
941            IDENT "a"
942            COMMA ","
943            WHITESPACE " "
944            IDENT "b"
945            R_PAREN ")"
946            EOF ""
947        "#]]);
948    }
949
950    #[test]
951    fn lex_function_definition() {
952        check_lex("fn add(x: u32) -> u32 {", expect![[r#"
953            KW_FN "fn"
954            WHITESPACE " "
955            IDENT "add"
956            L_PAREN "("
957            IDENT "x"
958            COLON ":"
959            WHITESPACE " "
960            KW_U32 "u32"
961            R_PAREN ")"
962            WHITESPACE " "
963            ARROW "->"
964            WHITESPACE " "
965            KW_U32 "u32"
966            WHITESPACE " "
967            L_BRACE "{"
968            EOF ""
969        "#]]);
970    }
971
972    #[test]
973    fn lex_let_statement() {
974        check_lex("let x: u32 = 42;", expect![[r#"
975            KW_LET "let"
976            WHITESPACE " "
977            IDENT "x"
978            COLON ":"
979            WHITESPACE " "
980            KW_U32 "u32"
981            WHITESPACE " "
982            EQ "="
983            WHITESPACE " "
984            INTEGER "42"
985            SEMICOLON ";"
986            EOF ""
987        "#]]);
988    }
989
990    #[test]
991    fn lex_typed_integers() {
992        // Integer literals with type suffixes should be lexed as single tokens
993        check_lex("1000u32 42i64 0u8 255u128", expect![[r#"
994            INTEGER "1000u32"
995            WHITESPACE " "
996            INTEGER "42i64"
997            WHITESPACE " "
998            INTEGER "0u8"
999            WHITESPACE " "
1000            INTEGER "255u128"
1001            EOF ""
1002        "#]]);
1003    }
1004
1005    #[test]
1006    fn lex_typed_integers_field() {
1007        // Field, group, and scalar suffixes
1008        check_lex("123field 456group 789scalar", expect![[r#"
1009            INTEGER "123field"
1010            WHITESPACE " "
1011            INTEGER "456group"
1012            WHITESPACE " "
1013            INTEGER "789scalar"
1014            EOF ""
1015        "#]]);
1016    }
1017
1018    #[test]
1019    fn lex_special_paths() {
1020        // These are special cases where keywords are followed by ::
1021        check_lex("group::GEN signature::verify Future::await", expect![[r#"
1022            IDENT "group::GEN"
1023            WHITESPACE " "
1024            IDENT "signature::verify"
1025            WHITESPACE " "
1026            IDENT "Future::await"
1027            EOF ""
1028        "#]]);
1029    }
1030
1031    #[test]
1032    fn lex_typed_integer_range() {
1033        // Integer with type suffix followed by range operator
1034        check_lex("0u8..STOP", expect![[r#"
1035            INTEGER "0u8"
1036            DOT_DOT ".."
1037            IDENT "STOP"
1038            EOF ""
1039        "#]]);
1040    }
1041
1042    #[test]
1043    fn lex_error_unknown_char() {
1044        check_lex_errors("hello $ world", expect![[r#"6..7:CouldNotLex { content: "$" }"#]]);
1045    }
1046
1047    #[test]
1048    fn lex_invalid_hex_digit() {
1049        let (tokens, errors) = lex("0xGAu32");
1050        assert_eq!(tokens.len(), 2); // INTEGER + EOF
1051        assert!(!errors.is_empty());
1052        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: 'G', radix: 16, .. }));
1053    }
1054
1055    #[test]
1056    fn lex_invalid_octal_digit() {
1057        let (_, errors) = lex("0o9u32");
1058        assert!(!errors.is_empty());
1059        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: '9', radix: 8, .. }));
1060    }
1061
1062    #[test]
1063    fn lex_invalid_binary_digit() {
1064        let (_, errors) = lex("0b2u32");
1065        assert!(!errors.is_empty());
1066        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: '2', radix: 2, .. }));
1067    }
1068
1069    #[test]
1070    fn lex_valid_hex_is_ok() {
1071        let (_, errors) = lex("0xDEADBEEFu64");
1072        assert!(errors.is_empty());
1073    }
1074
1075    #[test]
1076    fn lex_invalid_hex_lowercase() {
1077        // Lowercase hex digits beyond f are invalid
1078        let (_, errors) = lex("0xghu32");
1079        assert!(!errors.is_empty());
1080        assert!(matches!(errors[0].kind, LexErrorKind::InvalidDigit { digit: 'g', radix: 16, .. }));
1081    }
1082
1083    #[test]
1084    fn lex_bidi_override_error() {
1085        let (_, errors) = lex("let x\u{202E} = 1;");
1086        assert!(!errors.is_empty());
1087        assert!(matches!(errors[0].kind, LexErrorKind::BidiOverride));
1088    }
1089
1090    #[test]
1091    fn lex_unclosed_block_comment() {
1092        let (tokens, errors) = lex("/* unclosed");
1093        assert!(!errors.is_empty());
1094        assert!(matches!(errors[0].kind, LexErrorKind::CouldNotLex { .. }));
1095        // Should still produce a COMMENT_BLOCK token
1096        assert!(tokens.iter().any(|t| t.kind == COMMENT_BLOCK));
1097    }
1098
1099    #[test]
1100    fn lex_nested_comment_not_supported() {
1101        // Leo doesn't support nested comments - the first */ closes the comment
1102        let (tokens, errors) = lex("/* outer /* inner */");
1103        // No errors - the first */ terminates the comment
1104        assert!(errors.is_empty());
1105        // The "outer /* inner " part is the comment, then "*/" closes it
1106        // But actually let's check what happens...
1107        // "/* outer /* inner */" - this finds the first */ which is at position 18
1108        // So the comment is "/* outer /* inner */" which IS terminated
1109        assert!(tokens.iter().any(|t| t.kind == COMMENT_BLOCK));
1110    }
1111
1112    #[test]
1113    fn lex_closed_comment_ok() {
1114        let (_, errors) = lex("/* closed */");
1115        assert!(errors.is_empty());
1116    }
1117}