Skip to main content

oak_mojo/lexer/
mod.rs

1pub mod token_type;
2pub use token_type::MojoTokenType;
3
4use crate::MojoLanguage;
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::LexOutput,
8    source::{Source, TextEdit},
9};
10
/// Shorthand for the `oak_core` lexer state specialised to [`MojoLanguage`]; every lexing
/// routine in this module operates on it.
type State<'a, S> = LexerState<'a, S, MojoLanguage>;
12
/// Lexer for the Mojo language.
///
/// The type has no fields, so values are trivially cheap to create and clone.
#[derive(Clone, Debug, Default)]
pub struct MojoLexer {}
16
17impl Lexer<MojoLanguage> for MojoLexer {
18    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MojoLanguage>) -> LexOutput<MojoLanguage> {
19        let mut state = State::new_with_cache(source, 0, cache);
20        let result = self.run(&mut state);
21        if result.is_ok() {
22            state.add_eof();
23        }
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl MojoLexer {
29    /// 创建新的词法分析器
30    pub fn new() -> Self {
31        Self {}
32    }
33
34    pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35        let mut indent_stack = vec![0];
36        let mut bracket_level: usize = 0;
37        let mut at_line_start = true;
38
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if at_line_start && bracket_level == 0 {
43                self.handle_indentation(state, &mut indent_stack);
44                at_line_start = false;
45                continue;
46            }
47
48            if let Some(ch) = state.current() {
49                match ch {
50                    ' ' | '\t' => {
51                        self.skip_whitespace(state);
52                    }
53                    '\n' | '\r' => {
54                        self.lex_newline(state, bracket_level);
55                        at_line_start = true;
56                    }
57                    '#' => {
58                        self.lex_comment(state);
59                    }
60                    '"' | '\'' => {
61                        self.lex_string(state);
62                    }
63                    '0'..='9' => {
64                        self.lex_number(state);
65                    }
66                    'a'..='z' | 'A'..='Z' | '_' => {
67                        self.lex_identifier_or_keyword(state);
68                    }
69                    '(' | '[' | '{' => {
70                        bracket_level += 1;
71                        self.lex_delimiter(state);
72                    }
73                    ')' | ']' | '}' => {
74                        bracket_level = bracket_level.saturating_sub(1);
75                        self.lex_delimiter(state);
76                    }
77                    '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '!' | '&' | '|' => {
78                        self.lex_operator(state);
79                    }
80                    ',' | ':' | ';' | '.' => {
81                        self.lex_delimiter(state);
82                    }
83                    _ => {
84                        state.advance(ch.len_utf8());
85                        state.add_token(MojoTokenType::Error, safe_point, state.get_position())
86                    }
87                }
88            }
89
90            state.advance_if_dead_lock(safe_point)
91        }
92
93        // Emit remaining dedents
94        while indent_stack.len() > 1 {
95            indent_stack.pop();
96            let pos = state.get_position();
97            state.add_token(MojoTokenType::Dedent, pos, pos)
98        }
99
100        Ok(())
101    }
102
103    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
104        let start_pos = state.get_position();
105        while let Some(ch) = state.current() {
106            if ch == ' ' || ch == '\t' {
107                state.advance(ch.len_utf8())
108            }
109            else {
110                break;
111            }
112        }
113        if state.get_position() > start_pos {
114            state.add_token(MojoTokenType::Whitespace, start_pos, state.get_position());
115        }
116    }
117
118    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, bracket_level: usize) {
119        let start_pos = state.get_position();
120        let kind = if bracket_level > 0 { MojoTokenType::Whitespace } else { MojoTokenType::Newline };
121
122        if let Some('\n') = state.current() {
123            state.advance(1);
124            state.add_token(kind, start_pos, state.get_position());
125        }
126        else if let Some('\r') = state.current() {
127            state.advance(1);
128            if let Some('\n') = state.current() {
129                state.advance(1);
130            }
131            state.add_token(kind, start_pos, state.get_position());
132        }
133    }
134
135    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
136        let start_pos = state.get_position();
137        state.advance(1); // skip '#'
138        while let Some(ch) = state.current() {
139            if ch == '\n' || ch == '\r' {
140                break;
141            }
142            state.advance(ch.len_utf8())
143        }
144        state.add_token(MojoTokenType::Comment, start_pos, state.get_position());
145    }
146
147    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
148        let start_pos = state.get_position();
149        let quote = state.current().unwrap();
150        state.advance(1);
151        let mut escaped = false;
152        while let Some(ch) = state.current() {
153            if escaped {
154                escaped = false;
155                state.advance(ch.len_utf8());
156                continue;
157            }
158            if ch == '\\' {
159                escaped = true;
160                state.advance(1);
161                continue;
162            }
163            if ch == quote {
164                state.advance(1);
165                break;
166            }
167            state.advance(ch.len_utf8());
168        }
169        state.add_token(MojoTokenType::String, start_pos, state.get_position());
170    }
171
172    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
173        let start_pos = state.get_position();
174        let mut is_float = false;
175        while let Some(ch) = state.current() {
176            if ch.is_ascii_digit() {
177                state.advance(1);
178            }
179            else if ch == '.' && !is_float {
180                is_float = true;
181                state.advance(1);
182            }
183            else {
184                break;
185            }
186        }
187        let kind = if is_float { MojoTokenType::Float } else { MojoTokenType::Integer };
188        state.add_token(kind, start_pos, state.get_position());
189    }
190
191    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
192        let start_pos = state.get_position();
193        while let Some(ch) = state.current() {
194            if ch.is_alphanumeric() || ch == '_' {
195                state.advance(ch.len_utf8());
196            }
197            else {
198                break;
199            }
200        }
201        let text = state.get_text_in(oak_core::Range { start: start_pos, end: state.get_position() });
202        let kind = match text.as_ref() {
203            "fn" => MojoTokenType::Fn,
204            "struct" => MojoTokenType::Struct,
205            "var" => MojoTokenType::Var,
206            "let" => MojoTokenType::Let,
207            "if" => MojoTokenType::If,
208            "else" => MojoTokenType::Else,
209            "while" => MojoTokenType::While,
210            "for" => MojoTokenType::For,
211            "in" => MojoTokenType::In,
212            "return" => MojoTokenType::Return,
213            "break" => MojoTokenType::Break,
214            "continue" => MojoTokenType::Continue,
215            "import" => MojoTokenType::Import,
216            "from" => MojoTokenType::From,
217            "True" => MojoTokenType::True,
218            "False" => MojoTokenType::False,
219            "None" => MojoTokenType::None,
220            _ => MojoTokenType::Identifier,
221        };
222        state.add_token(kind, start_pos, state.get_position());
223    }
224
225    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
226        let start_pos = state.get_position();
227        let ch = state.current().unwrap();
228        state.advance(1);
229        let kind = match ch {
230            '+' => MojoTokenType::Plus,
231            '-' => {
232                if let Some('>') = state.current() {
233                    state.advance(1);
234                    MojoTokenType::Arrow
235                }
236                else {
237                    MojoTokenType::Minus
238                }
239            }
240            '*' => MojoTokenType::Star,
241            '/' => MojoTokenType::Slash,
242            '%' => MojoTokenType::Percent,
243            '=' => {
244                if let Some('=') = state.current() {
245                    state.advance(1);
246                    MojoTokenType::EqualEqual
247                }
248                else {
249                    MojoTokenType::Equal
250                }
251            }
252            '<' => {
253                if let Some('=') = state.current() {
254                    state.advance(1);
255                    MojoTokenType::LessEqual
256                }
257                else {
258                    MojoTokenType::Less
259                }
260            }
261            '>' => {
262                if let Some('=') = state.current() {
263                    state.advance(1);
264                    MojoTokenType::GreaterEqual
265                }
266                else {
267                    MojoTokenType::Greater
268                }
269            }
270            '!' => {
271                if let Some('=') = state.current() {
272                    state.advance(1);
273                    MojoTokenType::NotEqual
274                }
275                else {
276                    MojoTokenType::Error
277                }
278            }
279            _ => MojoTokenType::Error,
280        };
281        state.add_token(kind, start_pos, state.get_position());
282    }
283
284    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) {
285        let start_pos = state.get_position();
286        let ch = state.current().unwrap();
287        state.advance(1);
288        let kind = match ch {
289            '(' => MojoTokenType::LeftParen,
290            ')' => MojoTokenType::RightParen,
291            '[' => MojoTokenType::LeftBracket,
292            ']' => MojoTokenType::RightBracket,
293            '{' => MojoTokenType::LeftBrace,
294            '}' => MojoTokenType::RightBrace,
295            ',' => MojoTokenType::Comma,
296            ':' => MojoTokenType::Colon,
297            ';' => MojoTokenType::Semicolon,
298            '.' => MojoTokenType::Dot,
299            _ => MojoTokenType::Error,
300        };
301        state.add_token(kind, start_pos, state.get_position());
302    }
303
304    fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
305        let start_pos = state.get_position();
306        let mut indent = 0;
307        let mut temp_pos = start_pos;
308
309        while let Some(ch) = state.get_char_at(temp_pos) {
310            if ch == ' ' {
311                indent += 1;
312            }
313            else if ch == '\t' {
314                indent += 4; // Mojo usually uses 4 spaces for tabs
315            }
316            else {
317                break;
318            }
319            temp_pos += ch.len_utf8();
320        }
321
322        match state.get_char_at(temp_pos) {
323            Some('\n') | Some('\r') | Some('#') => {
324                // Empty line or comment, don't change indentation
325                return;
326            }
327            None => return, // EOF
328            _ => {}
329        }
330
331        state.advance(temp_pos - start_pos);
332        if state.get_position() > start_pos {
333            state.add_token(MojoTokenType::Whitespace, start_pos, state.get_position());
334        }
335
336        let last_indent = *stack.last().unwrap();
337        if indent > last_indent {
338            stack.push(indent);
339            state.add_token(MojoTokenType::Indent, state.get_position(), state.get_position());
340        }
341        else {
342            while indent < *stack.last().unwrap() {
343                stack.pop();
344                state.add_token(MojoTokenType::Dedent, state.get_position(), state.get_position());
345            }
346        }
347    }
348}