oak_matlab/lexer/
mod.rs

1use crate::{kind::MatlabSyntaxKind, language::MatlabLanguage};
2
3use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
4
/// Shorthand for the `oak_core` lexer state specialised to [`MatlabLanguage`];
/// `S` is the source type being lexed.
type State<S> = LexerState<S, MatlabLanguage>;
6
/// Lexer that turns MATLAB source text into a stream of `MatlabSyntaxKind` tokens.
///
/// The lexer borrows its language configuration for the lifetime `'config`.
pub struct MatlabLexer<'config> {
    /// Language configuration handed to [`MatlabLexer::new`].
    // NOTE(review): none of the lexing routines visible in this file read this
    // field; it appears to be kept for future configuration — confirm.
    config: &'config MatlabLanguage,
}
10
11impl<'config> MatlabLexer<'config> {
12    pub fn new(config: &'config MatlabLanguage) -> Self {
13        Self { config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(MatlabSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(MatlabSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(MatlabSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理标识符和关键
61    fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
62        let start_pos = state.get_position();
63
64        if let Some(ch) = state.peek() {
65            if !ch.is_ascii_alphabetic() && ch != '_' {
66                return false;
67            }
68
69            // 收集标识符字
70            let mut identifier = String::new();
71            while let Some(ch) = state.peek() {
72                if ch.is_ascii_alphanumeric() || ch == '_' {
73                    identifier.push(ch);
74                    state.advance(1);
75                }
76                else {
77                    break;
78                }
79            }
80
81            // 检查是否是关键
82            let token_kind = match identifier.as_str() {
83                "function" => MatlabSyntaxKind::Function,
84                "end" => MatlabSyntaxKind::End,
85                "if" => MatlabSyntaxKind::If,
86                "else" => MatlabSyntaxKind::Else,
87                "elseif" => MatlabSyntaxKind::Elseif,
88                "while" => MatlabSyntaxKind::While,
89                "for" => MatlabSyntaxKind::For,
90                "break" => MatlabSyntaxKind::Break,
91                "continue" => MatlabSyntaxKind::Continue,
92                "return" => MatlabSyntaxKind::Return,
93                "switch" => MatlabSyntaxKind::Switch,
94                "case" => MatlabSyntaxKind::Case,
95                "otherwise" => MatlabSyntaxKind::Otherwise,
96                "try" => MatlabSyntaxKind::Try,
97                "catch" => MatlabSyntaxKind::Catch,
98                "global" => MatlabSyntaxKind::Global,
99                "persistent" => MatlabSyntaxKind::Persistent,
100                "classdef" => MatlabSyntaxKind::Classdef,
101                "properties" => MatlabSyntaxKind::Properties,
102                "methods" => MatlabSyntaxKind::Methods,
103                "events" => MatlabSyntaxKind::Events,
104                _ => MatlabSyntaxKind::Identifier,
105            };
106
107            state.add_token(token_kind, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    /// 处理数字
116    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
117        let start_pos = state.get_position();
118
119        if let Some(ch) = state.peek() {
120            if !ch.is_ascii_digit() {
121                return false;
122            }
123
124            // 处理整数部分
125            while let Some(ch) = state.peek() {
126                if ch.is_ascii_digit() {
127                    state.advance(ch.len_utf8());
128                }
129                else {
130                    break;
131                }
132            }
133
134            // 处理小数
135            if let Some('.') = state.peek() {
136                if let Some(next_ch) = state.peek_next_n(1) {
137                    if next_ch.is_ascii_digit() {
138                        state.advance(1); // 跳过小数
139                        while let Some(ch) = state.peek() {
140                            if ch.is_ascii_digit() {
141                                state.advance(ch.len_utf8());
142                            }
143                            else {
144                                break;
145                            }
146                        }
147                    }
148                }
149            }
150
151            // 处理科学记数
152            if let Some(ch) = state.peek() {
153                if ch == 'e' || ch == 'E' {
154                    state.advance(1);
155                    if let Some(sign) = state.peek() {
156                        if sign == '+' || sign == '-' {
157                            state.advance(1);
158                        }
159                    }
160                    while let Some(ch) = state.peek() {
161                        if ch.is_ascii_digit() {
162                            state.advance(ch.len_utf8());
163                        }
164                        else {
165                            break;
166                        }
167                    }
168                }
169            }
170
171            state.add_token(MatlabSyntaxKind::Number, start_pos, state.get_position());
172            true
173        }
174        else {
175            false
176        }
177    }
178
179    /// 处理字符
180    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
181        let start_pos = state.get_position();
182
183        if let Some(quote) = state.peek() {
184            if quote != '\'' && quote != '"' {
185                return false;
186            }
187
188            state.advance(1); // 跳过开始引
189            while let Some(ch) = state.peek() {
190                if ch == quote {
191                    state.advance(1); // 跳过结束引号
192                    break;
193                }
194                else if ch == '\\' {
195                    state.advance(1); // 跳过转义字符
196                    if state.peek().is_some() {
197                        state.advance(state.peek().unwrap().len_utf8());
198                    }
199                }
200                else {
201                    state.advance(ch.len_utf8());
202                }
203            }
204
205            let token_kind = if quote == '\'' { MatlabSyntaxKind::Character } else { MatlabSyntaxKind::String };
206
207            state.add_token(token_kind, start_pos, state.get_position());
208            true
209        }
210        else {
211            false
212        }
213    }
214
215    /// 处理注释
216    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
217        let start_pos = state.get_position();
218
219        if let Some('%') = state.peek() {
220            state.advance(1);
221
222            // 检查是否是块注
223            if let Some('{') = state.peek() {
224                state.advance(1);
225
226                // 查找块注释结
227                while let Some(ch) = state.peek() {
228                    if ch == '%' {
229                        if let Some('}') = state.peek_next_n(1) {
230                            state.advance(2); // 跳过 %}
231                            break;
232                        }
233                    }
234                    state.advance(ch.len_utf8());
235                }
236
237                state.add_token(MatlabSyntaxKind::BlockComment, start_pos, state.get_position());
238            }
239            else {
240                // 行注释,读取到行
241                while let Some(ch) = state.peek() {
242                    if ch == '\n' || ch == '\r' {
243                        break;
244                    }
245                    state.advance(ch.len_utf8());
246                }
247
248                state.add_token(MatlabSyntaxKind::Comment, start_pos, state.get_position());
249            }
250            true
251        }
252        else {
253            false
254        }
255    }
256
257    /// 处理运算
258    fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
259        let start_pos = state.get_position();
260
261        if let Some(ch) = state.peek() {
262            let token_kind = match ch {
263                '+' => {
264                    state.advance(1);
265                    MatlabSyntaxKind::Plus
266                }
267                '-' => {
268                    state.advance(1);
269                    MatlabSyntaxKind::Minus
270                }
271                '*' => {
272                    state.advance(1);
273                    MatlabSyntaxKind::Times
274                }
275                '/' => {
276                    state.advance(1);
277                    MatlabSyntaxKind::Divide
278                }
279                '^' => {
280                    state.advance(1);
281                    MatlabSyntaxKind::Power
282                }
283                '\\' => {
284                    state.advance(1);
285                    MatlabSyntaxKind::LeftDivide
286                }
287                '=' => {
288                    state.advance(1);
289                    if let Some('=') = state.peek() {
290                        state.advance(1);
291                        MatlabSyntaxKind::Equal
292                    }
293                    else {
294                        MatlabSyntaxKind::Assign
295                    }
296                }
297                '~' => {
298                    state.advance(1);
299                    if let Some('=') = state.peek() {
300                        state.advance(1);
301                        MatlabSyntaxKind::NotEqual
302                    }
303                    else {
304                        MatlabSyntaxKind::Not
305                    }
306                }
307                '<' => {
308                    state.advance(1);
309                    if let Some('=') = state.peek() {
310                        state.advance(1);
311                        MatlabSyntaxKind::LessEqual
312                    }
313                    else {
314                        MatlabSyntaxKind::Less
315                    }
316                }
317                '>' => {
318                    state.advance(1);
319                    if let Some('=') = state.peek() {
320                        state.advance(1);
321                        MatlabSyntaxKind::GreaterEqual
322                    }
323                    else {
324                        MatlabSyntaxKind::Greater
325                    }
326                }
327                '&' => {
328                    state.advance(1);
329                    if let Some('&') = state.peek() {
330                        state.advance(1);
331                        MatlabSyntaxKind::AndAnd
332                    }
333                    else {
334                        MatlabSyntaxKind::And
335                    }
336                }
337                '|' => {
338                    state.advance(1);
339                    if let Some('|') = state.peek() {
340                        state.advance(1);
341                        MatlabSyntaxKind::OrOr
342                    }
343                    else {
344                        MatlabSyntaxKind::Or
345                    }
346                }
347                '.' => {
348                    state.advance(1);
349                    if let Some(next_ch) = state.peek() {
350                        match next_ch {
351                            '*' => {
352                                state.advance(1);
353                                MatlabSyntaxKind::DotTimes
354                            }
355                            '/' => {
356                                state.advance(1);
357                                MatlabSyntaxKind::DotDivide
358                            }
359                            '^' => {
360                                state.advance(1);
361                                MatlabSyntaxKind::DotPower
362                            }
363                            '\\' => {
364                                state.advance(1);
365                                MatlabSyntaxKind::DotLeftDivide
366                            }
367                            '\'' => {
368                                state.advance(1);
369                                MatlabSyntaxKind::DotTranspose
370                            }
371                            _ => MatlabSyntaxKind::Dot,
372                        }
373                    }
374                    else {
375                        MatlabSyntaxKind::Dot
376                    }
377                }
378                '\'' => {
379                    state.advance(1);
380                    MatlabSyntaxKind::Transpose
381                }
382                _ => return false,
383            };
384
385            state.add_token(token_kind, start_pos, state.get_position());
386            true
387        }
388        else {
389            false
390        }
391    }
392
393    /// 处理分隔
394    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
395        let start_pos = state.get_position();
396
397        if let Some(ch) = state.peek() {
398            let token_kind = match ch {
399                '(' => MatlabSyntaxKind::LeftParen,
400                ')' => MatlabSyntaxKind::RightParen,
401                '[' => MatlabSyntaxKind::LeftBracket,
402                ']' => MatlabSyntaxKind::RightBracket,
403                '{' => MatlabSyntaxKind::LeftBrace,
404                '}' => MatlabSyntaxKind::RightBrace,
405                ';' => MatlabSyntaxKind::Semicolon,
406                ',' => MatlabSyntaxKind::Comma,
407                ':' => MatlabSyntaxKind::Colon,
408                '?' => MatlabSyntaxKind::Question,
409                '@' => MatlabSyntaxKind::At,
410                _ => return false,
411            };
412
413            state.advance(ch.len_utf8());
414            state.add_token(token_kind, start_pos, state.get_position());
415            true
416        }
417        else {
418            false
419        }
420    }
421}
422
423impl<'config> Lexer<MatlabLanguage> for MatlabLexer<'config> {
424    fn lex_incremental(
425        &self,
426        source: impl Source,
427        _start_offset: usize,
428        _cache: IncrementalCache<'_, MatlabLanguage>,
429    ) -> LexOutput<MatlabLanguage> {
430        self.lex_internal(source)
431    }
432
433    fn lex(&self, source: impl Source) -> LexOutput<MatlabLanguage> {
434        self.lex_internal(source)
435    }
436}
437
438impl<'config> MatlabLexer<'config> {
439    fn lex_internal<S: Source>(&self, source: S) -> LexOutput<MatlabLanguage> {
440        let mut state = LexerState::new(source);
441
442        while state.not_at_end() {
443            // 跳过空白字符
444            if self.skip_whitespace(&mut state) {
445                continue;
446            }
447
448            // 处理换行
449            if self.lex_newline(&mut state) {
450                continue;
451            }
452
453            // 处理注释
454            if self.lex_comment(&mut state) {
455                continue;
456            }
457
458            // 处理字符
459            if self.lex_string(&mut state) {
460                continue;
461            }
462
463            // 处理数字
464            if self.lex_number(&mut state) {
465                continue;
466            }
467
468            // 处理标识符和关键
469            if self.lex_identifier(&mut state) {
470                continue;
471            }
472
473            // 处理运算
474            if self.lex_operator(&mut state) {
475                continue;
476            }
477
478            // 处理分隔
479            if self.lex_delimiter(&mut state) {
480                continue;
481            }
482
483            // 如果都不匹配,跳过当前字符并标记为错
484            let start_pos = state.get_position();
485            if let Some(_ch) = state.peek() {
486                state.advance(1);
487                state.add_token(MatlabSyntaxKind::Error, start_pos, state.get_position());
488            }
489        }
490
491        state.finish(Ok(()))
492    }
493}