Skip to main content

oak_lua/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for Lua.
3pub mod token_type;
4
5/// Lua lexer implementation.
6///
7/// Implements lexical analysis for the Lua language, converting source code into a sequence of tokens.
8use crate::language::LuaLanguage;
9pub use crate::lexer::token_type::LuaTokenType;
10use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
11
/// Lexer state used throughout this module: [`LexerState`] with the language parameter
/// fixed to [`LuaLanguage`]. The lifetime `'a` and the source type `S` are forwarded unchanged.
pub(crate) type State<'a, S> = LexerState<'a, S, LuaLanguage>;
13
14/// Lua lexer.
15#[derive(Clone)]
16pub struct LuaLexer<'config> {
17    config: &'config LuaLanguage,
18}
19
20impl<'config> LuaLexer<'config> {
21    /// Creates a new Lua lexer.
22    pub fn new(config: &'config LuaLanguage) -> Self {
23        Self { config }
24    }
25
26    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
27        while state.not_at_end() {
28            let safe_point = state.get_position();
29
30            // Try various lexical rules
31            if self.skip_whitespace(state) {
32                continue;
33            }
34
35            if self.lex_newline(state) {
36                continue;
37            }
38
39            if self.lex_comment(state) {
40                continue;
41            }
42
43            if self.lex_string(state) {
44                continue;
45            }
46
47            if self.lex_number(state) {
48                continue;
49            }
50
51            if self.lex_identifier_or_keyword(state) {
52                continue;
53            }
54
55            if self.lex_operator_or_delimiter(state) {
56                continue;
57            }
58
59            // If all rules do not match, skip the current character and mark as error
60            let start_pos = state.get_position();
61            if let Some(ch) = state.peek() {
62                state.advance(ch.len_utf8());
63                state.add_token(LuaTokenType::Error, start_pos, state.get_position())
64            }
65
66            state.advance_if_dead_lock(safe_point)
67        }
68
69        Ok(())
70    }
71
72    /// Skips whitespace characters.
73    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
74        let start_pos = state.get_position();
75
76        while let Some(ch) = state.peek() {
77            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
78        }
79
80        if state.get_position() > start_pos {
81            state.add_token(LuaTokenType::Whitespace, start_pos, state.get_position());
82            true
83        }
84        else {
85            false
86        }
87    }
88
89    /// Handles newline characters.
90    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        let start_pos = state.get_position();
92
93        if let Some('\n') = state.peek() {
94            state.advance(1);
95            state.add_token(LuaTokenType::Newline, start_pos, state.get_position());
96            true
97        }
98        else if let Some('\r') = state.peek() {
99            state.advance(1);
100            if let Some('\n') = state.peek() {
101                state.advance(1)
102            }
103            state.add_token(LuaTokenType::Newline, start_pos, state.get_position());
104            true
105        }
106        else {
107            false
108        }
109    }
110
111    /// Handles comments.
112    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113        let start_pos = state.get_position();
114
115        if let Some('-') = state.current() {
116            if let Some('-') = state.peek() {
117                state.advance(1); // First '-'
118                state.advance(1); // Second '-'
119
120                // Check if it's a long comment --[[
121                if let Some('[') = state.current() {
122                    if let Some('[') = state.peek() {
123                        state.advance(1); // '['
124                        state.advance(1); // '['
125
126                        // Find ]]
127                        while let Some(ch) = state.current() {
128                            if ch == ']' {
129                                if let Some(']') = state.peek() {
130                                    state.advance(1); // ']'
131                                    state.advance(1); // ']'
132                                    break;
133                                }
134                            }
135                            state.advance(ch.len_utf8())
136                        }
137                    }
138                    else {
139                        // Single-line comment, read until the end of the line
140                        while let Some(ch) = state.current() {
141                            if ch == '\n' || ch == '\r' {
142                                break;
143                            }
144                            state.advance(ch.len_utf8())
145                        }
146                    }
147                }
148                else {
149                    // Single-line comment, read until the end of the line
150                    while let Some(ch) = state.current() {
151                        if ch == '\n' || ch == '\r' {
152                            break;
153                        }
154                        state.advance(ch.len_utf8())
155                    }
156                }
157
158                state.add_token(LuaTokenType::Comment, start_pos, state.get_position());
159                true
160            }
161            else {
162                false
163            }
164        }
165        else {
166            false
167        }
168    }
169
170    /// Handles string literals.
171    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
172        let start_pos = state.get_position();
173
174        if let Some(quote_char) = state.current() {
175            if quote_char == '"' || quote_char == '\'' {
176                state.advance(1); // Skip start quote
177
178                let mut escaped = false;
179                while let Some(ch) = state.current() {
180                    if escaped {
181                        escaped = false;
182                        state.advance(ch.len_utf8())
183                    }
184                    else if ch == '\\' {
185                        escaped = true;
186                        state.advance(1)
187                    }
188                    else if ch == quote_char {
189                        state.advance(1); // Skip end quote
190                        break;
191                    }
192                    else if ch == '\n' || ch == '\r' {
193                        // Strings cannot span lines unless escaped
194                        break;
195                    }
196                    else {
197                        state.advance(ch.len_utf8())
198                    }
199                }
200
201                state.add_token(LuaTokenType::String, start_pos, state.get_position());
202                true
203            }
204            else if quote_char == '[' {
205                // Long string [[...]]
206                if let Some('[') = state.peek() {
207                    state.advance(1); // '['
208                    state.advance(1); // '['
209
210                    // Find ]]
211                    while let Some(ch) = state.current() {
212                        if ch == ']' {
213                            if let Some(']') = state.peek() {
214                                state.advance(1); // ']'
215                                state.advance(1); // ']'
216                                break;
217                            }
218                        }
219                        state.advance(ch.len_utf8())
220                    }
221
222                    state.add_token(LuaTokenType::String, start_pos, state.get_position());
223                    true
224                }
225                else {
226                    false
227                }
228            }
229            else {
230                false
231            }
232        }
233        else {
234            false
235        }
236    }
237
238    /// Handles numbers.
239    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
240        let start_pos = state.get_position();
241
242        if let Some(ch) = state.current() {
243            if ch.is_ascii_digit() {
244                // Check if it's hexadecimal
245                if ch == '0' {
246                    if let Some(next_ch) = state.peek() {
247                        if next_ch == 'x' || next_ch == 'X' {
248                            state.advance(1); // '0'
249                            state.advance(1); // 'x' 'X'
250
251                            // Read hexadecimal digits
252                            while let Some(hex_ch) = state.current() {
253                                if hex_ch.is_ascii_hexdigit() { state.advance(1) } else { break }
254                            }
255
256                            state.add_token(LuaTokenType::Number, start_pos, state.get_position());
257                            return true;
258                        }
259                    }
260                }
261
262                // Normal number
263                let mut has_dot = false;
264                let mut has_exp = false;
265
266                while let Some(num_ch) = state.current() {
267                    if num_ch.is_ascii_digit() {
268                        state.advance(1)
269                    }
270                    else if num_ch == '.' && !has_dot && !has_exp {
271                        has_dot = true;
272                        state.advance(1)
273                    }
274                    else if (num_ch == 'e' || num_ch == 'E') && !has_exp {
275                        has_exp = true;
276                        state.advance(1);
277
278                        // Optional sign
279                        if let Some(sign_ch) = state.current() {
280                            if sign_ch == '+' || sign_ch == '-' {
281                                state.advance(1)
282                            }
283                        }
284                    }
285                    else {
286                        break;
287                    }
288                }
289
290                state.add_token(LuaTokenType::Number, start_pos, state.get_position());
291                true
292            }
293            else {
294                false
295            }
296        }
297        else {
298            false
299        }
300    }
301
302    /// Handles identifiers or keywords.
303    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
304        if let Some(ch) = state.current() {
305            if ch.is_ascii_alphabetic() || ch == '_' {
306                let range = state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
307                // Use the get_text_in method of the Source trait
308                let text = state.get_text_in(range.clone().into());
309                let token_kind = self.keyword_or_identifier(&text);
310                state.add_token(token_kind, range.start, range.end);
311                true
312            }
313            else {
314                false
315            }
316        }
317        else {
318            false
319        }
320    }
321
322    /// Recognizes keywords.
323    fn keyword_or_identifier(&self, text: &str) -> LuaTokenType {
324        match text {
325            "and" => LuaTokenType::And,
326            "break" => LuaTokenType::Break,
327            "do" => LuaTokenType::Do,
328            "else" => LuaTokenType::Else,
329            "elseif" => LuaTokenType::Elseif,
330            "end" => LuaTokenType::End,
331            "false" => LuaTokenType::False,
332            "for" => LuaTokenType::For,
333            "function" => LuaTokenType::Function,
334            "goto" => LuaTokenType::Goto,
335            "if" => LuaTokenType::If,
336            "in" => LuaTokenType::In,
337            "local" => LuaTokenType::Local,
338            "nil" => LuaTokenType::Nil,
339            "not" => LuaTokenType::Not,
340            "or" => LuaTokenType::Or,
341            "repeat" => LuaTokenType::Repeat,
342            "return" => LuaTokenType::Return,
343            "then" => LuaTokenType::Then,
344            "true" => LuaTokenType::True,
345            "until" => LuaTokenType::Until,
346            "while" => LuaTokenType::While,
347            _ => LuaTokenType::Identifier,
348        }
349    }
350
351    /// Handles operators and delimiters.
352    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
353        let start_pos = state.get_position();
354
355        if let Some(ch) = state.peek() {
356            let token_kind = match ch {
357                '=' => {
358                    state.advance(1);
359                    if let Some('=') = state.peek() {
360                        state.advance(1);
361                        LuaTokenType::EqEq
362                    }
363                    else {
364                        LuaTokenType::Eq
365                    }
366                }
367                '~' => {
368                    state.advance(1);
369                    if let Some('=') = state.peek() {
370                        state.advance(1);
371                        LuaTokenType::TildeEq
372                    }
373                    else {
374                        LuaTokenType::Tilde
375                    }
376                }
377                '<' => {
378                    state.advance(1);
379                    if let Some('=') = state.peek() {
380                        state.advance(1);
381                        LuaTokenType::LtEq
382                    }
383                    else if let Some('<') = state.peek() {
384                        state.advance(1);
385                        LuaTokenType::LtLt
386                    }
387                    else {
388                        LuaTokenType::Lt
389                    }
390                }
391                '>' => {
392                    state.advance(1);
393                    if let Some('=') = state.peek() {
394                        state.advance(1);
395                        LuaTokenType::GtEq
396                    }
397                    else if let Some('>') = state.peek() {
398                        state.advance(1);
399                        LuaTokenType::GtGt
400                    }
401                    else {
402                        LuaTokenType::Gt
403                    }
404                }
405                '.' => {
406                    state.advance(1);
407                    if let Some('.') = state.peek() {
408                        state.advance(1);
409                        if let Some('.') = state.peek() {
410                            state.advance(1);
411                            LuaTokenType::DotDotDot
412                        }
413                        else {
414                            LuaTokenType::DotDot
415                        }
416                    }
417                    else {
418                        LuaTokenType::Dot
419                    }
420                }
421                ':' => {
422                    state.advance(1);
423                    if let Some(':') = state.peek() {
424                        state.advance(1);
425                        LuaTokenType::ColonColon
426                    }
427                    else {
428                        LuaTokenType::Colon
429                    }
430                }
431                '/' => {
432                    state.advance(1);
433                    if let Some('/') = state.peek() {
434                        state.advance(1);
435                        LuaTokenType::SlashSlash
436                    }
437                    else {
438                        LuaTokenType::Slash
439                    }
440                }
441                '+' => {
442                    state.advance(1);
443                    LuaTokenType::Plus
444                }
445                '-' => {
446                    state.advance(1);
447                    LuaTokenType::Minus
448                }
449                '*' => {
450                    state.advance(1);
451                    LuaTokenType::Star
452                }
453                '%' => {
454                    state.advance(1);
455                    LuaTokenType::Percent
456                }
457                '^' => {
458                    state.advance(1);
459                    LuaTokenType::Caret
460                }
461                '#' => {
462                    state.advance(1);
463                    LuaTokenType::Hash
464                }
465                '&' => {
466                    state.advance(1);
467                    LuaTokenType::Ampersand
468                }
469                '|' => {
470                    state.advance(1);
471                    LuaTokenType::Pipe
472                }
473                '(' => {
474                    state.advance(1);
475                    LuaTokenType::LeftParen
476                }
477                ')' => {
478                    state.advance(1);
479                    LuaTokenType::RightParen
480                }
481                '{' => {
482                    state.advance(1);
483                    LuaTokenType::LeftBrace
484                }
485                '}' => {
486                    state.advance(1);
487                    LuaTokenType::RightBrace
488                }
489                '[' => {
490                    state.advance(1);
491                    LuaTokenType::LeftBracket
492                }
493                ']' => {
494                    state.advance(1);
495                    LuaTokenType::RightBracket
496                }
497                ';' => {
498                    state.advance(1);
499                    LuaTokenType::Semicolon
500                }
501                ',' => {
502                    state.advance(1);
503                    LuaTokenType::Comma
504                }
505                _ => return false,
506            };
507
508            state.add_token(token_kind, start_pos, state.get_position());
509            true
510        }
511        else {
512            false
513        }
514    }
515}
516
517impl<'config> Lexer<LuaLanguage> for LuaLexer<'config> {
518    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<LuaLanguage>) -> LexOutput<LuaLanguage> {
519        let mut state = State::new_with_cache(source, 0, cache);
520        let result = self.run(&mut state);
521        if result.is_ok() {
522            state.add_eof()
523        }
524        state.finish_with_cache(result, cache)
525    }
526}