// oak_julia/lexer/mod.rs

1use crate::{kind::JuliaSyntaxKind, language::JuliaLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, JuliaLanguage>;
5
6#[derive(Clone, Debug)]
7pub struct JuliaLexer<'config> {
8    config: &'config JuliaLanguage,
9}
10
11impl<'config> JuliaLexer<'config> {
12    pub fn new(config: &'config JuliaLanguage) -> Self {
13        Self { config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(JuliaSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(JuliaSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(JuliaSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理标识符和关键
61    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
62        let start_pos = state.get_position();
63
64        if let Some(ch) = state.peek() {
65            if ch.is_ascii_alphabetic() || ch == '_' {
66                state.advance(ch.len_utf8());
67
68                while let Some(ch) = state.peek() {
69                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '!' || ch == '?' {
70                        state.advance(ch.len_utf8());
71                    }
72                    else {
73                        break;
74                    }
75                }
76
77                let end_pos = state.get_position();
78                let identifier_str = state.get_text_in((start_pos..end_pos).into());
79
80                // 检查是否是关键
81                if let Some(keyword_kind) = JuliaSyntaxKind::from_str(identifier_str) {
82                    state.add_token(keyword_kind, start_pos, end_pos);
83                }
84                else {
85                    state.add_token(JuliaSyntaxKind::Identifier, start_pos, end_pos);
86                }
87                true
88            }
89            else {
90                false
91            }
92        }
93        else {
94            false
95        }
96    }
97
98    /// 处理数字字面
99    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
100        let start_pos = state.get_position();
101
102        if let Some(ch) = state.peek() {
103            if ch.is_ascii_digit() {
104                state.advance(1);
105
106                // 处理整数部分
107                while let Some(ch) = state.peek() {
108                    if ch.is_ascii_digit() || ch == '_' {
109                        state.advance(1);
110                    }
111                    else {
112                        break;
113                    }
114                }
115
116                let mut is_float = false;
117
118                // 检查小数点
119                if let Some('.') = state.peek() {
120                    // 检查下一个字符是否是数字,避免与范围操作符混淆
121                    if let Some(next_ch) = state.peek_next_n(1) {
122                        if next_ch.is_ascii_digit() {
123                            is_float = true;
124                            state.advance(1); // 跳过小数
125                            // 处理小数部分
126                            while let Some(ch) = state.peek() {
127                                if ch.is_ascii_digit() || ch == '_' {
128                                    state.advance(1);
129                                }
130                                else {
131                                    break;
132                                }
133                            }
134                        }
135                    }
136                }
137
138                // 检查科学计数法
139                if let Some(ch) = state.peek() {
140                    if ch == 'e' || ch == 'E' {
141                        is_float = true;
142                        state.advance(1);
143
144                        // 可选的符号
145                        if let Some(sign) = state.peek() {
146                            if sign == '+' || sign == '-' {
147                                state.advance(1);
148                            }
149                        }
150
151                        // 指数部分
152                        while let Some(ch) = state.peek() {
153                            if ch.is_ascii_digit() {
154                                state.advance(1);
155                            }
156                            else {
157                                break;
158                            }
159                        }
160                    }
161                }
162
163                // 检查类型后缀 (f32, f64, i32, i64
164                if let Some(ch) = state.peek() {
165                    if ch.is_ascii_alphabetic() {
166                        while let Some(ch) = state.peek() {
167                            if ch.is_ascii_alphanumeric() {
168                                state.advance(1);
169                            }
170                            else {
171                                break;
172                            }
173                        }
174                    }
175                }
176
177                let token_kind = if is_float { JuliaSyntaxKind::FloatLiteral } else { JuliaSyntaxKind::IntegerLiteral };
178
179                state.add_token(token_kind, start_pos, state.get_position());
180                true
181            }
182            else {
183                false
184            }
185        }
186        else {
187            false
188        }
189    }
190
191    /// 处理字符串字面量
192    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
193        let start_pos = state.get_position();
194
195        if let Some(quote) = state.peek() {
196            if quote == '"' || quote == '\'' {
197                state.advance(1);
198                let mut found_end = false;
199
200                while let Some(ch) = state.peek() {
201                    if ch == quote {
202                        state.advance(1);
203                        found_end = true;
204                        break;
205                    }
206                    else if ch == '\\' {
207                        // 处理转义字符
208                        state.advance(1);
209                        if let Some(_) = state.peek() {
210                            state.advance(1);
211                        }
212                    }
213                    else {
214                        state.advance(ch.len_utf8());
215                    }
216                }
217
218                if found_end {
219                    let token_kind = if quote == '\'' { JuliaSyntaxKind::CharLiteral } else { JuliaSyntaxKind::StringLiteral };
220                    state.add_token(token_kind, start_pos, state.get_position());
221                    true
222                }
223                else {
224                    // 未找到结束引号,回退到开始位                    state.set_position(start_pos);
225                    false
226                }
227            }
228            else {
229                false
230            }
231        }
232        else {
233            false
234        }
235    }
236
237    /// 处理三重引号字符
238    fn lex_triple_string<S: Source>(&self, state: &mut State<S>) -> bool {
239        let start_pos = state.get_position();
240
241        // 检查是否是三重引号
242        if let Some('"') = state.peek() {
243            if let Some('"') = state.peek_next_n(1) {
244                if let Some('"') = state.peek_next_n(2) {
245                    state.advance(3);
246
247                    // 寻找结束的三重引号
248                    while let Some(ch) = state.peek() {
249                        if ch == '"' {
250                            if let Some('"') = state.peek_next_n(1) {
251                                if let Some('"') = state.peek_next_n(2) {
252                                    state.advance(3);
253                                    state.add_token(JuliaSyntaxKind::StringLiteral, start_pos, state.get_position());
254                                    return true;
255                                }
256                            }
257                        }
258                        state.advance(ch.len_utf8());
259                    }
260
261                    // 未找到结束的三重引号,回退
262                    state.set_position(start_pos);
263                }
264            }
265        }
266        false
267    }
268
269    /// 处理注释
270    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
271        let start_pos = state.get_position();
272
273        if let Some('#') = state.peek() {
274            // 检查是否是多行注释 #=
275            if let Some('=') = state.peek_next_n(1) {
276                state.advance(2);
277                let mut depth = 1;
278
279                while let Some(ch) = state.peek()
280                    && depth > 0
281                {
282                    if ch == '#' && state.peek_next_n(1) == Some('=') {
283                        depth += 1;
284                        state.advance(2);
285                    }
286                    else if ch == '=' && state.peek_next_n(1) == Some('#') {
287                        depth -= 1;
288                        state.advance(2);
289                    }
290                    else {
291                        state.advance(ch.len_utf8());
292                    }
293                }
294
295                state.add_token(JuliaSyntaxKind::Comment, start_pos, state.get_position());
296                true
297            }
298            else {
299                // 单行注释
300                state.advance(1);
301
302                while let Some(ch) = state.peek() {
303                    if ch == '\n' || ch == '\r' {
304                        break;
305                    }
306                    state.advance(ch.len_utf8());
307                }
308
309                state.add_token(JuliaSyntaxKind::Comment, start_pos, state.get_position());
310                true
311            }
312        }
313        else {
314            false
315        }
316    }
317
318    /// 处理操作
319    fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
320        let start_pos = state.get_position();
321
322        if let Some(ch) = state.peek() {
323            let token_kind = match ch {
324                '+' => {
325                    state.advance(1);
326                    if let Some('=') = state.peek() {
327                        state.advance(1);
328                        JuliaSyntaxKind::PlusAssign
329                    }
330                    else {
331                        JuliaSyntaxKind::Plus
332                    }
333                }
334                '-' => {
335                    state.advance(1);
336                    if let Some('=') = state.peek() {
337                        state.advance(1);
338                        JuliaSyntaxKind::MinusAssign
339                    }
340                    else if let Some('>') = state.peek() {
341                        state.advance(1);
342                        JuliaSyntaxKind::Arrow
343                    }
344                    else {
345                        JuliaSyntaxKind::Minus
346                    }
347                }
348                '*' => {
349                    state.advance(1);
350                    if let Some('=') = state.peek() {
351                        state.advance(1);
352                        JuliaSyntaxKind::StarAssign
353                    }
354                    else {
355                        JuliaSyntaxKind::Star
356                    }
357                }
358                '/' => {
359                    state.advance(1);
360                    if let Some('=') = state.peek() {
361                        state.advance(1);
362                        JuliaSyntaxKind::SlashAssign
363                    }
364                    else {
365                        JuliaSyntaxKind::Slash
366                    }
367                }
368                '%' => {
369                    state.advance(1);
370                    if let Some('=') = state.peek() {
371                        state.advance(1);
372                        JuliaSyntaxKind::PercentAssign
373                    }
374                    else {
375                        JuliaSyntaxKind::Percent
376                    }
377                }
378                '^' => {
379                    state.advance(1);
380                    if let Some('=') = state.peek() {
381                        state.advance(1);
382                        JuliaSyntaxKind::CaretAssign
383                    }
384                    else {
385                        JuliaSyntaxKind::Caret
386                    }
387                }
388                '=' => {
389                    state.advance(1);
390                    if let Some('=') = state.peek() {
391                        state.advance(1);
392                        JuliaSyntaxKind::Equal
393                    }
394                    else if let Some('>') = state.peek() {
395                        state.advance(1);
396                        JuliaSyntaxKind::FatArrow
397                    }
398                    else {
399                        JuliaSyntaxKind::Assign
400                    }
401                }
402                '!' => {
403                    state.advance(1);
404                    if let Some('=') = state.peek() {
405                        state.advance(1);
406                        JuliaSyntaxKind::NotEqual
407                    }
408                    else {
409                        JuliaSyntaxKind::Not
410                    }
411                }
412                '<' => {
413                    state.advance(1);
414                    if let Some('=') = state.peek() {
415                        state.advance(1);
416                        JuliaSyntaxKind::LessEqual
417                    }
418                    else if let Some('<') = state.peek() {
419                        state.advance(1);
420                        JuliaSyntaxKind::LeftShift
421                    }
422                    else {
423                        JuliaSyntaxKind::LessThan
424                    }
425                }
426                '>' => {
427                    state.advance(1);
428                    if let Some('=') = state.peek() {
429                        state.advance(1);
430                        JuliaSyntaxKind::GreaterEqual
431                    }
432                    else if let Some('>') = state.peek() {
433                        state.advance(1);
434                        JuliaSyntaxKind::RightShift
435                    }
436                    else {
437                        JuliaSyntaxKind::GreaterThan
438                    }
439                }
440                '&' => {
441                    state.advance(1);
442                    if let Some('&') = state.peek() {
443                        state.advance(1);
444                        JuliaSyntaxKind::And
445                    }
446                    else {
447                        JuliaSyntaxKind::BitAnd
448                    }
449                }
450                '|' => {
451                    state.advance(1);
452                    if let Some('|') = state.peek() {
453                        state.advance(1);
454                        JuliaSyntaxKind::Or
455                    }
456                    else {
457                        JuliaSyntaxKind::BitOr
458                    }
459                }
460                '~' => {
461                    state.advance(1);
462                    JuliaSyntaxKind::BitNot
463                }
464                ':' => {
465                    state.advance(1);
466                    JuliaSyntaxKind::Colon
467                }
468                '.' => {
469                    state.advance(1);
470                    if let Some('.') = state.peek() {
471                        state.advance(1);
472                        JuliaSyntaxKind::Range
473                    }
474                    else {
475                        JuliaSyntaxKind::Dot
476                    }
477                }
478                _ => return false,
479            };
480
481            state.add_token(token_kind, start_pos, state.get_position());
482            true
483        }
484        else {
485            false
486        }
487    }
488
489    /// 处理分隔
490    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
491        let start_pos = state.get_position();
492
493        if let Some(ch) = state.peek() {
494            let token_kind = match ch {
495                '(' => JuliaSyntaxKind::LeftParen,
496                ')' => JuliaSyntaxKind::RightParen,
497                '[' => JuliaSyntaxKind::LeftBracket,
498                ']' => JuliaSyntaxKind::RightBracket,
499                '{' => JuliaSyntaxKind::LeftBrace,
500                '}' => JuliaSyntaxKind::RightBrace,
501                ',' => JuliaSyntaxKind::Comma,
502                ';' => JuliaSyntaxKind::Semicolon,
503                _ => return false,
504            };
505
506            state.advance(ch.len_utf8());
507            state.add_token(token_kind, start_pos, state.get_position());
508            true
509        }
510        else {
511            false
512        }
513    }
514}
515
516impl<'config> Lexer<JuliaLanguage> for JuliaLexer<'config> {
517    fn lex_incremental(
518        &self,
519        source: impl Source,
520        changed: usize,
521        cache: IncrementalCache<JuliaLanguage>,
522    ) -> LexOutput<JuliaLanguage> {
523        let mut state = LexerState::new_with_cache(source, changed, cache);
524        let result = self.run(&mut state);
525        state.finish(result)
526    }
527}
528
529impl<'config> JuliaLexer<'config> {
530    /// 主要的词法分析循环
531    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), oak_core::OakError> {
532        while state.not_at_end() {
533            // 尝试各种词法规则
534            if self.skip_whitespace(state) {
535                continue;
536            }
537
538            if self.lex_newline(state) {
539                continue;
540            }
541
542            if self.lex_comment(state) {
543                continue;
544            }
545
546            if self.lex_triple_string(state) {
547                continue;
548            }
549
550            if self.lex_string(state) {
551                continue;
552            }
553
554            if self.lex_number(state) {
555                continue;
556            }
557
558            if self.lex_identifier_or_keyword(state) {
559                continue;
560            }
561
562            if self.lex_operator(state) {
563                continue;
564            }
565
566            if self.lex_delimiter(state) {
567                continue;
568            }
569
570            // 如果所有规则都不匹配,跳过当前字符并标记为错误
571            let start_pos = state.get_position();
572            if let Some(ch) = state.peek() {
573                state.advance(ch.len_utf8());
574                state.add_token(JuliaSyntaxKind::Error, start_pos, state.get_position());
575            }
576        }
577
578        // 添加 EOF kind
579        let eof_pos = state.get_position();
580        state.add_token(JuliaSyntaxKind::Eof, eof_pos, eof_pos);
581
582        Ok(())
583    }
584}