// luaparse_rs — lexer.rs
1//! Turns source text into a stream of tokens.
2//!
3//! You normally won't use this module directly. The [`Parser`](crate::Parser)
4//! calls [`lex_for_version`] internally. But the types here are public so you
5//! can inspect tokens if you need to.
6
7use alloc::{string::String, vec::Vec, format};
8
9use logos::Logos;
10use crate::{Span, LexError};
11
/// A single token produced by the lexer.
///
/// Includes Lua keywords, operators, literals, and punctuation.
/// The lexer is version agnostic; version specific keyword demotion
/// (e.g. treating `continue` as an identifier in Lua 5.1) happens
/// in [`lex_for_version`].
///
/// Whitespace (spaces, tabs, CR, LF) is skipped by logos and never
/// produces a token.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum Token {
    // --- Literal keywords ---
    #[token("true")]
    True,

    #[token("false")]
    False,

    #[token("nil")]
    Nil,

    // --- Core Lua keywords (present in every supported version) ---
    #[token("and")]
    And,

    #[token("break")]
    Break,

    #[token("do")]
    Do,

    #[token("else")]
    Else,

    #[token("elseif")]
    Elseif,

    #[token("end")]
    End,

    #[token("for")]
    For,

    #[token("function")]
    Function,

    #[token("if")]
    If,

    #[token("in")]
    In,

    #[token("local")]
    Local,

    #[token("not")]
    Not,

    #[token("or")]
    Or,

    #[token("repeat")]
    Repeat,

    #[token("return")]
    Return,

    #[token("then")]
    Then,

    #[token("until")]
    Until,

    #[token("while")]
    While,

    // --- Version specific keywords ---
    // Always lexed as keywords here; [`lex_for_version`] demotes them back
    // to identifiers for versions that don't have them.
    #[token("continue")]
    Continue,

    #[token("export")]
    Export,

    #[token("type")]
    Type,

    #[token("goto")]
    Goto,

    #[token("const")]
    Const,

    // here, we use \p{L} for any unicode letter and \p{N} for any unicode number
    // as lua 5.3+ allows for unicode identifiers to be used
    // see: https://www.lua.org/manual/5.4/manual.html#3.1
    #[regex(r"[\p{L}_][\p{L}\p{N}_]*", |lex| lex.slice().to_string())]
    Identifier(String),

    // Deliberately permissive (e.g. accepts `0x` with no digits); the raw
    // text is checked by `validate_number` in [`lex`].
    #[regex(r"0[xX][0-9a-fA-F_]*(\.[0-9a-fA-F_]*)?([pP][+-]?\d*)?|\d[0-9_]*(\.\d[0-9_]*)?([eE][+-]?\d*)?|\.\d[0-9_]*([eE][+-]?\d*)?|0[bB][01_]*", |lex| lex.slice().to_string())]
    Number(String),

    // Quoted strings; the callback strips quotes and resolves escapes.
    #[regex(r#""([^"\\]|\\.)*""#, parse_string)]
    #[regex(r#"'([^'\\]|\\.)*'"#, parse_string)]
    String(String),

    // Luau backtick interpolated string; the callback scans to the closing
    // backtick itself, splitting text from embedded `{expr}` spans.
    #[token("`", parse_interpolation_parts)]
    InterpolatedString(Vec<InterpolationPart>),

    // Matches only the leading `--`; the callback consumes the rest of the
    // line (or the whole `--[[ ... ]]` block).
    #[regex(r"--", parse_comment)]
    Comment(String),

    // Long string `[[ ... ]]` / `[=[ ... ]=]`; the callback finds the
    // matching closing bracket.
    #[regex(r"\[[=]*\[", parse_long_string)]
    LongString(String),

    // --- Operators ---
    #[token("+")]
    Plus,

    #[token("-")]
    Minus,

    #[token("*")]
    Star,

    #[token("/")]
    Slash,

    #[token("//")]
    FloorDiv,

    #[token("%")]
    Percent,

    #[token("^")]
    Caret,

    #[token("#")]
    Hash,

    #[token("==")]
    EqEq,

    #[token("~=")]
    NotEq,

    #[token("<=")]
    LessEq,

    #[token(">=")]
    GreaterEq,

    #[token("<")]
    Less,

    #[token(">")]
    Greater,

    #[token("=")]
    Eq,

    // --- Compound assignment operators (Luau) ---
    #[token("+=")]
    PlusEq,

    #[token("-=")]
    MinusEq,

    #[token("*=")]
    StarEq,

    #[token("/=")]
    SlashEq,

    #[token("//=")]
    FloorDivEq,

    #[token("%=")]
    PercentEq,

    #[token("^=")]
    CaretEq,

    #[token("..=")]
    ConcatEq,

    // --- Punctuation ---
    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("::")]
    ColonColon,

    #[token(":")]
    Colon,

    #[token(";")]
    Semi,

    #[token(",")]
    Comma,

    // Logos picks the longest match, so `...` wins over `..` and `.`.
    #[token("...")]
    Dot3,

    #[token("..")]
    Dot2,

    #[token(".")]
    Dot,

    // --- Tokens used by type-annotation / bitwise syntax ---
    // NOTE(review): `->`, `|`, `?`, `@` look like Luau type syntax and
    // `<<`/`>>`/`~`/`&` like Lua 5.3 bitwise operators — confirm against
    // the parser.
    #[token("->")]
    Arrow,

    #[token("|")]
    Pipe,

    #[token("&")]
    Ampersand,

    #[token("?")]
    Question,

    #[token("@")]
    At,

    #[token("<<")]
    LeftShift,

    #[token(">>")]
    RightShift,

    #[token("~")]
    Tilde,

    // Synthetic end-of-input token; never matched by logos, appended by
    // [`lex`] with a zero-width span.
    Eof,
}
255
/// A piece of a Luau interpolated string token.
///
/// The lexer splits `` `hello {expr} world` `` into a sequence of these parts
/// so the parser can handle the embedded expressions.
#[derive(Debug, Clone, PartialEq)]
pub enum InterpolationPart {
    /// A literal text segment (escape sequences already resolved).
    Text(String),
    /// The byte range of an embedded expression.
    ///
    /// Offsets index into the source string the lexer was run on and cover
    /// the text strictly between the `{` and the matching `}`.
    ExprSpan { start: usize, end: usize },
}
267
268fn parse_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
269    let slice = lex.slice();
270    let content = &slice[1..slice.len() - 1];
271    Some(unescape_string(content))
272}
273
274fn parse_interpolation_parts(lex: &mut logos::Lexer<Token>) -> Option<Vec<InterpolationPart>> {
275    let start = lex.span().end;
276    let source = lex.source();
277    let bytes = source.as_bytes();
278    
279    let mut parts = Vec::new();
280    let mut current_text = String::new();
281    let mut pos = start;
282    
283    while pos < bytes.len() {
284        match bytes[pos] {
285            b'`' => {
286                if !current_text.is_empty() {
287                    parts.push(InterpolationPart::Text(current_text));
288                }
289                lex.bump(pos - start + 1);
290                return Some(parts);
291            }
292            b'{' => {
293                if !current_text.is_empty() {
294                    parts.push(InterpolationPart::Text(current_text.clone()));
295                    current_text.clear();
296                }
297                
298                let expr_start = pos + 1;
299                let mut depth = 1;
300                pos += 1;
301                
302                while pos < bytes.len() && depth > 0 {
303                    match bytes[pos] {
304                        b'{' => depth += 1,
305                        b'}' => depth -= 1,
306                        _ => {}
307                    }
308                    pos += 1;
309                }
310                
311                if depth != 0 {
312                    return None;
313                }
314                
315                let expr_end = pos - 1;
316                parts.push(InterpolationPart::ExprSpan {
317                    start: expr_start,
318                    end: expr_end,
319                });
320            }
321            b'\\' if pos + 1 < bytes.len() => {
322                match bytes[pos + 1] {
323                    b'n' => {
324                        current_text.push('\n');
325                        pos += 2;
326                    }
327                    b't' => {
328                        current_text.push('\t');
329                        pos += 2;
330                    }
331                    b'r' => {
332                        current_text.push('\r');
333                        pos += 2;
334                    }
335                    b'\\' | b'`' | b'{' | b'}' => {
336                        current_text.push(bytes[pos + 1] as char);
337                        pos += 2;
338                    }
339                    _ => {
340                        current_text.push(bytes[pos] as char);
341                        pos += 1;
342                    }
343                }
344            }
345            b => {
346                current_text.push(b as char);
347                pos += 1;
348            }
349        }
350    }
351    
352    None
353}
354
/// Logos callback for `--` comments.
///
/// Runs after logos has matched only the leading `--`. Decides between a
/// line comment and a block comment (`--[[ ... ]]`, with optional `=`
/// padding) and returns the comment text without the delimiters.
fn parse_comment(lex: &mut logos::Lexer<Token>) -> Option<String> {
    // Byte offset just past the `--` logos already consumed.
    let start = lex.span().end;
    let source = lex.source();
    let rest = &source[start..];
    
    // Check if this is a block comment: --[[ or --[=*[
    if rest.starts_with('[') {
        let after_bracket = &rest[1..];
        let eq_count = after_bracket.chars().take_while(|&c| c == '=').count();
        if after_bracket.len() > eq_count && after_bracket[eq_count..].starts_with('[') {
            // It's a block comment; find the matching closing ]=*]
            // (same number of `=` as the opening bracket).
            let closing = format!("]{}]", "=".repeat(eq_count));
            let block_start = 1 + eq_count + 1; // skip [=*[
            let content_start = start + block_start;
            
            if let Some(end_pos) = source[content_start..].find(&closing) {
                let content = source[content_start..content_start + end_pos].to_string();
                // Advance past the entire `[=*[ ... ]=*]` body.
                lex.bump(block_start + end_pos + closing.len());
                return Some(content);
            } else {
                // Unterminated block comment; consume rest as comment
                // (best effort rather than a lex error).
                let content = source[content_start..].to_string();
                lex.bump(source.len() - start);
                return Some(content);
            }
        }
    }
    
    // Regular line comment: consume until newline or EOF. The newline
    // itself is left for the whitespace skipper; content is trimmed.
    if let Some(newline_pos) = rest.find('\n') {
        let content = rest[..newline_pos].trim().to_string();
        lex.bump(newline_pos);
        Some(content)
    } else {
        let content = rest.trim().to_string();
        lex.bump(rest.len());
        Some(content)
    }
}
394
/// Logos callback for long strings `[[ ... ]]` / `[=[ ... ]=]`.
///
/// Runs after logos has matched the opening `[=*[`. Returns the content
/// between the brackets, or `None` (a lex error) when the matching
/// closing bracket is missing.
fn parse_long_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    let slice = lex.slice();
    
    // The closing delimiter must carry the same number of `=` as the
    // opening one (e.g. `[==[` closes with `]==]`).
    let equals_count = slice.chars().filter(|&c| c == '=').count();
    let closing = format!("]{}]", "=".repeat(equals_count));
    
    let start = lex.span().end;
    let source = lex.source();
    
    // A newline immediately after the opening bracket is not part of the
    // content. `\r\n` is caught by the second branch (it doesn't start
    // with `\n`). NOTE(review): a lone `\r` is NOT skipped — confirm
    // whether that line-ending style needs handling.
    let actual_start = if source[start..].starts_with('\n') {
        start + 1
    } else if source[start..].starts_with("\r\n") {
        start + 2
    } else {
        start
    };
    
    if let Some(end_pos) = source[actual_start..].find(&closing) {
        let content = source[actual_start..actual_start + end_pos].to_string();
        // Consume from the span end through the closing bracket.
        lex.bump(actual_start - start + end_pos + closing.len());
        Some(content)
    } else {
        None
    }
}
420
/// Resolves Lua escape sequences (`\n`, `\xHH`, `\u{…}`, `\ddd`, `\z`, …)
/// in the body of a quoted string literal.
///
/// Unrecognized or malformed escapes are kept literally (backslash
/// included) rather than reported as errors.
fn unescape_string(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s.chars().peekable();

    while let Some(c) = rest.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }

        match rest.next() {
            // Lone trailing backslash: keep it.
            None => out.push('\\'),

            // Single-character escapes.
            Some('n') => out.push('\n'),
            Some('t') => out.push('\t'),
            Some('r') => out.push('\r'),
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
            Some('\'') => out.push('\''),
            Some('0') => out.push('\0'),
            Some('a') => out.push('\x07'), // bell
            Some('b') => out.push('\x08'), // backspace
            Some('f') => out.push('\x0C'), // form feed
            Some('v') => out.push('\x0B'), // vertical tab

            // \xHH — up to two hex digits.
            Some('x') => {
                let mut digits = String::new();
                for _ in 0..2 {
                    match rest.peek() {
                        Some(h) if h.is_ascii_hexdigit() => {
                            digits.push(rest.next().unwrap());
                        }
                        _ => break,
                    }
                }
                match u8::from_str_radix(&digits, 16) {
                    // NOTE(review): values >= 0x80 become the matching
                    // Latin-1 code point, not a raw byte — confirm that is
                    // the intended semantics for a UTF-8 output string.
                    Ok(byte) => out.push(byte as char),
                    // No hex digits at all: keep the escape literally.
                    Err(_) => {
                        out.push('\\');
                        out.push('x');
                        out.push_str(&digits);
                    }
                }
            }

            // \u{XXXX} — Unicode code point escape.
            Some('u') => {
                if rest.peek() == Some(&'{') {
                    rest.next(); // consume '{'
                    let mut digits = String::new();

                    while let Some(&next) = rest.peek() {
                        if next == '}' {
                            rest.next();
                            break;
                        }
                        if !next.is_ascii_hexdigit() {
                            break;
                        }
                        digits.push(rest.next().unwrap());
                    }

                    // Invalid or out-of-range code points are silently
                    // dropped.
                    if let Ok(code) = u32::from_str_radix(&digits, 16) {
                        if let Some(decoded) = char::from_u32(code) {
                            out.push(decoded);
                        }
                    }
                } else {
                    // `\u` not followed by `{`: keep it literally.
                    out.push('\\');
                    out.push('u');
                }
            }

            // \z — skip all following whitespace (Lua line continuation).
            Some('z') => {
                while matches!(rest.peek(), Some(w) if w.is_whitespace()) {
                    rest.next();
                }
            }

            // \ddd — up to three decimal digits giving a byte value.
            Some(d) if d.is_ascii_digit() => {
                let mut digits = String::from(d);
                for _ in 0..2 {
                    match rest.peek() {
                        Some(n) if n.is_ascii_digit() => {
                            digits.push(rest.next().unwrap());
                        }
                        _ => break,
                    }
                }
                match digits.parse::<u8>() {
                    Ok(byte) => out.push(byte as char),
                    // Values over 255 fall back to the literal text.
                    Err(_) => {
                        out.push('\\');
                        out.push_str(&digits);
                    }
                }
            }

            // Anything else: not a known escape, keep it verbatim.
            Some(other) => {
                out.push('\\');
                out.push(other);
            }
        }
    }

    out
}
538
/// Tokenizes source code into a list of `(Token, Span)` pairs.
///
/// This is the version agnostic entry point. If the source starts with a
/// `#!` shebang line, it is silently skipped. For version aware tokenization
/// (which demotes certain keywords to identifiers based on the Lua version),
/// use [`lex_for_version`] instead.
///
/// # Errors
///
/// Returns [`LexError::InvalidNumber`] for malformed numeric literals —
/// and currently also for any other input the lexer cannot match (see
/// note inside).
pub fn lex(source: &str) -> Result<Vec<(Token, Span)>, LexError> {
    // skip shebang line if present as this is a unix execution hint, not a language token
    // NOTE(review): after trimming, all spans below are relative to the
    // trimmed source, i.e. offset from the original input by the shebang
    // length — confirm downstream consumers account for this.
    let source = if source.starts_with("#!") {
        match source.find('\n') {
            Some(pos) => &source[pos + 1..],
            None => "",
        }
    } else {
        source
    };

    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    
    while let Some(token_result) = lexer.next() {
        let span = lexer.span();
        match token_result {
            Ok(token) => {
                // The Number regex is deliberately permissive; reject
                // malformed literals (e.g. `0x`, `1e`) here.
                if let Token::Number(ref num) = token {
                    if !validate_number(num) {
                        return Err(LexError::InvalidNumber { span });
                    }
                }
                tokens.push((token, span));
            }
            Err(_) => {
                // NOTE(review): every lexer failure (e.g. a stray `$`) is
                // reported as InvalidNumber — a dedicated "unexpected
                // character" variant would be clearer if LexError allows it.
                return Err(LexError::InvalidNumber { span });
            }
        }
    }
    
    // Append a synthetic zero-width EOF token so the parser need not
    // special-case end of input.
    let eof_pos = source.len();
    tokens.push((Token::Eof, eof_pos..eof_pos));
    
    Ok(tokens)
}
581
/// Validates a numeric literal captured by the (deliberately permissive)
/// `Token::Number` regex.
///
/// Returns `false` for malformed literals such as `0x`, `0x_`, `0x.`,
/// `0x1.2.3`, `0b`, or a dangling `e`/`p` exponent. Underscore digit
/// separators are allowed everywhere and stripped before checking.
fn validate_number(s: &str) -> bool {
    if let Some(body) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) {
        // HEX: optional fractional part and optional p/P binary exponent.
        let parts: Vec<&str> = body.split(|c| c == 'p' || c == 'P').collect();
        if parts.len() > 2 {
            // More than one exponent marker.
            return false;
        }
        
        // The mantissa must contain at least one hex digit (this rejects
        // "0x", "0x_", and "0x."), at most one decimal point, and nothing
        // else.
        let mantissa = parts[0].replace('_', "");
        if !mantissa.chars().any(|c| c.is_ascii_hexdigit()) {
            return false;
        }
        if mantissa.chars().filter(|&c| c == '.').count() > 1 {
            return false;
        }
        if !mantissa.chars().all(|c| c.is_ascii_hexdigit() || c == '.') {
            return false;
        }
        
        // If an exponent is present it needs at least one decimal digit.
        if parts.len() == 2 {
            let exp = parts[1].replace('_', "");
            let exp = exp.trim_start_matches('+').trim_start_matches('-');
            if exp.is_empty() || !exp.chars().all(|c| c.is_ascii_digit()) {
                return false;
            }
        }
    } else if let Some(body) = s.strip_prefix("0b").or_else(|| s.strip_prefix("0B")) {
        // BINARY: at least one 0/1 digit after the prefix.
        let digits = body.replace('_', "");
        if digits.is_empty() || !digits.chars().all(|c| c == '0' || c == '1') {
            return false;
        }
    } else {
        // DECIMAL: needs at least one digit somewhere, and a well-formed
        // e/E exponent when one is present.
        let cleaned = s.replace('_', "");
        if !cleaned.chars().any(|c| c.is_ascii_digit()) {
            return false;
        }
        
        if cleaned.contains('e') || cleaned.contains('E') {
            let parts: Vec<&str> = cleaned.split(|c| c == 'e' || c == 'E').collect();
            if parts.len() != 2 {
                return false;
            }
            
            let exp = parts[1].trim_start_matches('+').trim_start_matches('-');
            if exp.is_empty() || !exp.chars().all(|c| c.is_ascii_digit()) {
                return false;
            }
        }
    }
    
    true
}
642
643/// Tokenizes source code with version aware keyword handling.
644///
645/// Calls [`lex`] first, then demotes keywords that don't exist in version `V`
646/// back to plain identifiers. For example, `continue` becomes
647/// `Token::Identifier("continue")` when parsing as [`Lua51`](crate::Lua51).
648pub fn lex_for_version<V: crate::marker::LuaVersion>(
649    source: &str,
650) -> Result<Vec<(Token, Span)>, LexError> {
651    let tokens = lex(source)?;
652
653    Ok(tokens
654        .into_iter()
655        .map(|(token, span)| {
656            let t = match token {
657                Token::Continue if !V::HAS_CONTINUE => Token::Identifier("continue".to_string()),
658                Token::Export if !V::HAS_EXPORT => Token::Identifier("export".to_string()),
659                Token::Type if !V::HAS_TYPE_ANNOTATIONS => Token::Identifier("type".to_string()),
660                Token::Goto if !V::HAS_GOTO => Token::Identifier("goto".to_string()),
661                Token::Const if !V::HAS_CONST => Token::Identifier("const".to_string()),
662                t => t,
663            };
664            (t, span)
665        })
666        .collect()) 
667}