Skip to main content

oak_mojo/lexer/
mod.rs

1/// Token type definitions for the Mojo lexer.
2pub mod token_type;
3pub use token_type::MojoTokenType;
4
5use crate::MojoLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::LexOutput,
9    source::{Source, TextEdit},
10};
11
/// Lexer state specialised for [`MojoLanguage`], generic over the source type `S`.
pub(crate) type State<'a, S> = LexerState<'a, S, MojoLanguage>;
13
/// Lexer for Mojo source text.
///
/// Holds a borrowed [`MojoLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct MojoLexer<'config> {
    /// Language configuration this lexer was created with.
    config: &'config MojoLanguage,
}
19
20impl<'config> Lexer<MojoLanguage> for MojoLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MojoLanguage>) -> LexOutput<MojoLanguage> {
22        let mut state = State::new_with_cache(source, 0, cache);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> MojoLexer<'config> {
32    /// Creates a new Mojo lexer
33    pub fn new(config: &'config MojoLanguage) -> Self {
34        Self { config }
35    }
36
37    pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
38        let mut indent_stack = vec![0];
39        let mut bracket_level: usize = 0;
40        let mut at_line_start = true;
41
42        while state.not_at_end() {
43            let safe_point = state.get_position();
44
45            if at_line_start && bracket_level == 0 {
46                self.handle_indentation(state, &mut indent_stack);
47                at_line_start = false;
48                continue;
49            }
50
51            if let Some(ch) = state.current() {
52                match ch {
53                    ' ' | '\t' => {
54                        self.skip_whitespace(state);
55                    }
56                    '\n' | '\r' => {
57                        self.lex_newline(state, bracket_level);
58                        at_line_start = true;
59                    }
60                    '#' => {
61                        self.lex_comment(state);
62                    }
63                    '"' | '\'' => {
64                        self.lex_string(state);
65                    }
66                    '0'..='9' => {
67                        self.lex_number(state);
68                    }
69                    'a'..='z' | 'A'..='Z' | '_' => {
70                        self.lex_identifier_or_keyword(state);
71                    }
72                    '(' | '[' | '{' => {
73                        bracket_level += 1;
74                        self.lex_delimiter(state);
75                    }
76                    ')' | ']' | '}' => {
77                        bracket_level = bracket_level.saturating_sub(1);
78                        self.lex_delimiter(state);
79                    }
80                    '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '!' | '&' | '|' => {
81                        self.lex_operator(state);
82                    }
83                    ',' | ':' | ';' | '.' => {
84                        self.lex_delimiter(state);
85                    }
86                    _ => {
87                        state.advance(ch.len_utf8());
88                        state.add_token(MojoTokenType::Error, safe_point, state.get_position())
89                    }
90                }
91            }
92
93            state.advance_if_dead_lock(safe_point)
94        }
95
96        // Emit remaining dedents
97        while indent_stack.len() > 1 {
98            indent_stack.pop();
99            let pos = state.get_position();
100            state.add_token(MojoTokenType::Dedent, pos, pos)
101        }
102
103        Ok(())
104    }
105
106    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
107        let start_pos = state.get_position();
108        while let Some(ch) = state.current() {
109            if ch == ' ' || ch == '\t' {
110                state.advance(ch.len_utf8())
111            }
112            else {
113                break;
114            }
115        }
116        if state.get_position() > start_pos {
117            state.add_token(MojoTokenType::Whitespace, start_pos, state.get_position());
118        }
119    }
120
121    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, bracket_level: usize) {
122        let start_pos = state.get_position();
123        let kind = if bracket_level > 0 { MojoTokenType::Whitespace } else { MojoTokenType::Newline };
124
125        if let Some('\n') = state.current() {
126            state.advance(1);
127            state.add_token(kind, start_pos, state.get_position());
128        }
129        else if let Some('\r') = state.current() {
130            state.advance(1);
131            if let Some('\n') = state.current() {
132                state.advance(1);
133            }
134            state.add_token(kind, start_pos, state.get_position());
135        }
136    }
137
138    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
139        let start_pos = state.get_position();
140        state.advance(1); // skip '#'
141        while let Some(ch) = state.current() {
142            if ch == '\n' || ch == '\r' {
143                break;
144            }
145            state.advance(ch.len_utf8())
146        }
147        state.add_token(MojoTokenType::Comment, start_pos, state.get_position());
148    }
149
150    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
151        let start_pos = state.get_position();
152        let quote = state.current().unwrap();
153        state.advance(1);
154        let mut escaped = false;
155        while let Some(ch) = state.current() {
156            if escaped {
157                escaped = false;
158                state.advance(ch.len_utf8());
159                continue;
160            }
161            if ch == '\\' {
162                escaped = true;
163                state.advance(1);
164                continue;
165            }
166            if ch == quote {
167                state.advance(1);
168                break;
169            }
170            state.advance(ch.len_utf8());
171        }
172        state.add_token(MojoTokenType::String, start_pos, state.get_position());
173    }
174
175    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
176        let start_pos = state.get_position();
177        let mut is_float = false;
178        while let Some(ch) = state.current() {
179            if ch.is_ascii_digit() {
180                state.advance(1);
181            }
182            else if ch == '.' && !is_float {
183                is_float = true;
184                state.advance(1);
185            }
186            else {
187                break;
188            }
189        }
190        let kind = if is_float { MojoTokenType::Float } else { MojoTokenType::Integer };
191        state.add_token(kind, start_pos, state.get_position());
192    }
193
194    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
195        let start_pos = state.get_position();
196        while let Some(ch) = state.current() {
197            if ch.is_alphanumeric() || ch == '_' {
198                state.advance(ch.len_utf8());
199            }
200            else {
201                break;
202            }
203        }
204        let text = state.get_text_in(oak_core::Range { start: start_pos, end: state.get_position() });
205        let kind = match text.as_ref() {
206            "fn" => MojoTokenType::Fn,
207            "struct" => MojoTokenType::Struct,
208            "var" => MojoTokenType::Var,
209            "let" => MojoTokenType::Let,
210            "if" => MojoTokenType::If,
211            "else" => MojoTokenType::Else,
212            "while" => MojoTokenType::While,
213            "for" => MojoTokenType::For,
214            "in" => MojoTokenType::In,
215            "return" => MojoTokenType::Return,
216            "break" => MojoTokenType::Break,
217            "continue" => MojoTokenType::Continue,
218            "import" => MojoTokenType::Import,
219            "from" => MojoTokenType::From,
220            "True" => MojoTokenType::True,
221            "False" => MojoTokenType::False,
222            "None" => MojoTokenType::None,
223            _ => MojoTokenType::Identifier,
224        };
225        state.add_token(kind, start_pos, state.get_position());
226    }
227
228    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
229        let start_pos = state.get_position();
230        let ch = state.current().unwrap();
231        state.advance(1);
232        let kind = match ch {
233            '+' => MojoTokenType::Plus,
234            '-' => {
235                if let Some('>') = state.current() {
236                    state.advance(1);
237                    MojoTokenType::Arrow
238                }
239                else {
240                    MojoTokenType::Minus
241                }
242            }
243            '*' => MojoTokenType::Star,
244            '/' => MojoTokenType::Slash,
245            '%' => MojoTokenType::Percent,
246            '=' => {
247                if let Some('=') = state.current() {
248                    state.advance(1);
249                    MojoTokenType::EqualEqual
250                }
251                else {
252                    MojoTokenType::Equal
253                }
254            }
255            '<' => {
256                if let Some('=') = state.current() {
257                    state.advance(1);
258                    MojoTokenType::LessEqual
259                }
260                else {
261                    MojoTokenType::Less
262                }
263            }
264            '>' => {
265                if let Some('=') = state.current() {
266                    state.advance(1);
267                    MojoTokenType::GreaterEqual
268                }
269                else {
270                    MojoTokenType::Greater
271                }
272            }
273            '!' => {
274                if let Some('=') = state.current() {
275                    state.advance(1);
276                    MojoTokenType::NotEqual
277                }
278                else {
279                    MojoTokenType::Error
280                }
281            }
282            _ => MojoTokenType::Error,
283        };
284        state.add_token(kind, start_pos, state.get_position());
285    }
286
287    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
288        let start_pos = state.get_position();
289        let ch = state.current().unwrap();
290        state.advance(1);
291        let kind = match ch {
292            '(' => MojoTokenType::LeftParen,
293            ')' => MojoTokenType::RightParen,
294            '[' => MojoTokenType::LeftBracket,
295            ']' => MojoTokenType::RightBracket,
296            '{' => MojoTokenType::LeftBrace,
297            '}' => MojoTokenType::RightBrace,
298            ',' => MojoTokenType::Comma,
299            ':' => MojoTokenType::Colon,
300            ';' => MojoTokenType::Semicolon,
301            '.' => MojoTokenType::Dot,
302            _ => MojoTokenType::Error,
303        };
304        state.add_token(kind, start_pos, state.get_position());
305    }
306
307    fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
308        let start_pos = state.get_position();
309        let mut indent = 0;
310        let mut temp_pos = start_pos;
311
312        while let Some(ch) = state.get_char_at(temp_pos) {
313            if ch == ' ' {
314                indent += 1;
315            }
316            else if ch == '\t' {
317                indent += 4; // Mojo usually uses 4 spaces for tabs
318            }
319            else {
320                break;
321            }
322            temp_pos += ch.len_utf8();
323        }
324
325        match state.get_char_at(temp_pos) {
326            Some('\n') | Some('\r') | Some('#') => {
327                // Empty line or comment, don't change indentation
328                return;
329            }
330            None => return, // EOF
331            _ => {}
332        }
333
334        state.advance(temp_pos - start_pos);
335        if state.get_position() > start_pos {
336            state.add_token(MojoTokenType::Whitespace, start_pos, state.get_position());
337        }
338
339        let last_indent = *stack.last().unwrap();
340        if indent > last_indent {
341            stack.push(indent);
342            state.add_token(MojoTokenType::Indent, state.get_position(), state.get_position());
343        }
344        else {
345            while indent < *stack.last().unwrap() {
346                stack.pop();
347                state.add_token(MojoTokenType::Dedent, state.get_position(), state.get_position());
348            }
349        }
350    }
351}