oak_zig/lexer/
mod.rs

1use crate::{kind::ZigSyntaxKind, language::ZigLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, ZigLanguage>;
10
// Lazily-initialised scanner configurations for Zig lexical elements.
// NOTE(review): only `ZIG_WHITESPACE` is referenced in the visible part of this file;
// comments, strings and character literals are lexed by hand below.

/// Whitespace scanner configuration with Unicode whitespace enabled.
static ZIG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment configuration whose only marker is `//`.
static ZIG_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
/// String configuration: `"` as the quote character and `\` as the escape character.
static ZIG_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
/// Character-literal configuration: `'` as the quote character and `\` as the escape character.
static ZIG_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
15
/// Lexer that turns Zig source text into `ZigSyntaxKind` tokens.
///
/// The lexer borrows its [`ZigLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct ZigLexer<'config> {
    /// Borrowed language configuration.
    /// NOTE(review): not read by any method in the visible part of this file.
    config: &'config ZigLanguage,
}
20
21impl<'config> Lexer<ZigLanguage> for ZigLexer<'config> {
22    fn lex_incremental(
23        &self,
24        source: impl Source,
25        changed: usize,
26        cache: IncrementalCache<ZigLanguage>,
27    ) -> LexOutput<ZigLanguage> {
28        let mut state = LexerState::new_with_cache(source, changed, cache);
29        let result = self.run(&mut state);
30        state.finish(result)
31    }
32}
33
34impl<'config> ZigLexer<'config> {
35    pub fn new(config: &'config ZigLanguage) -> Self {
36        Self { config }
37    }
38
39    /// 主要的词法分析循环
40    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_char_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier_or_keyword(state) {
65                continue;
66            }
67
68            if self.lex_builtin(state) {
69                continue;
70            }
71
72            if self.lex_operators(state) {
73                continue;
74            }
75
76            if self.lex_single_char_tokens(state) {
77                continue;
78            }
79
80            state.safe_check(safe_point);
81        }
82
83        // 添加 EOF token
84        let eof_pos = state.get_position();
85        state.add_token(ZigSyntaxKind::Eof, eof_pos, eof_pos);
86        Ok(())
87    }
88
89    /// 跳过空白字符
90    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
91        match ZIG_WHITESPACE.scan(state.rest(), state.get_position(), ZigSyntaxKind::Whitespace) {
92            Some(token) => {
93                state.advance_with(token);
94                return true;
95            }
96            None => {}
97        }
98        false
99    }
100
101    /// 跳过注释
102    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
103        let start = state.get_position();
104        let rest = state.rest();
105
106        // 行注释: // ... 直到换行
107        if rest.starts_with("//") {
108            state.advance(2);
109
110            // 检查是否是文档注释 ///
111            let is_doc_comment = if state.peek() == Some('/') {
112                state.advance(1);
113                true
114            }
115            else {
116                false
117            };
118
119            while let Some(ch) = state.peek() {
120                if ch == '\n' || ch == '\r' {
121                    break;
122                }
123                state.advance(ch.len_utf8());
124            }
125
126            let kind = if is_doc_comment { ZigSyntaxKind::DocComment } else { ZigSyntaxKind::Comment };
127            state.add_token(kind, start, state.get_position());
128            return true;
129        }
130
131        false
132    }
133
134    /// 解析字符串字面量
135    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
136        let start = state.get_position();
137
138        // 多行字符串: \\...
139        if state.rest().starts_with("\\\\") {
140            state.advance(2);
141
142            // 跳过到行尾
143            while let Some(ch) = state.peek() {
144                if ch == '\n' {
145                    state.advance(1);
146                    break;
147                }
148                state.advance(ch.len_utf8());
149            }
150
151            // 读取多行字符串内容
152            while state.not_at_end() {
153                let _line_start = state.get_position();
154
155                // 检查是否是续行
156                if !state.rest().starts_with("\\\\") {
157                    break;
158                }
159
160                state.advance(2);
161
162                // 读取到行尾
163                while let Some(ch) = state.peek() {
164                    if ch == '\n' {
165                        state.advance(1);
166                        break;
167                    }
168                    state.advance(ch.len_utf8());
169                }
170            }
171
172            state.add_token(ZigSyntaxKind::StringLiteral, start, state.get_position());
173            return true;
174        }
175
176        // 普通字符串: "..."
177        if state.current() == Some('"') {
178            state.advance(1);
179            let mut escaped = false;
180
181            while let Some(ch) = state.peek() {
182                if ch == '"' && !escaped {
183                    state.advance(1); // consume closing quote
184                    break;
185                }
186
187                state.advance(ch.len_utf8());
188
189                if escaped {
190                    escaped = false;
191                    continue;
192                }
193
194                if ch == '\\' {
195                    escaped = true;
196                    continue;
197                }
198
199                if ch == '\n' || ch == '\r' {
200                    break;
201                }
202            }
203
204            state.add_token(ZigSyntaxKind::StringLiteral, start, state.get_position());
205            return true;
206        }
207
208        false
209    }
210
211    /// 解析字符字面量
212    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
213        let start = state.get_position();
214
215        if state.current() != Some('\'') {
216            return false;
217        }
218
219        state.advance(1); // opening '
220
221        if let Some('\\') = state.peek() {
222            state.advance(1);
223            if let Some(c) = state.peek() {
224                state.advance(c.len_utf8());
225            }
226        }
227        else if let Some(c) = state.peek() {
228            state.advance(c.len_utf8());
229        }
230        else {
231            state.set_position(start);
232            return false;
233        }
234
235        if state.peek() == Some('\'') {
236            state.advance(1);
237            state.add_token(ZigSyntaxKind::CharLiteral, start, state.get_position());
238            return true;
239        }
240
241        state.set_position(start);
242        false
243    }
244
245    /// 解析数字字面量
246    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
247        let start = state.get_position();
248        let first = match state.current() {
249            Some(c) => c,
250            None => return false,
251        };
252
253        if !first.is_ascii_digit() {
254            return false;
255        }
256
257        let mut is_float = false;
258
259        // 处理不同进制
260        if first == '0' {
261            match state.peek_next_n(1) {
262                Some('x') | Some('X') => {
263                    state.advance(2);
264                    while let Some(c) = state.peek() {
265                        if c.is_ascii_hexdigit() || c == '_' {
266                            state.advance(1);
267                        }
268                        else {
269                            break;
270                        }
271                    }
272                }
273                Some('b') | Some('B') => {
274                    state.advance(2);
275                    while let Some(c) = state.peek() {
276                        if c == '0' || c == '1' || c == '_' {
277                            state.advance(1);
278                        }
279                        else {
280                            break;
281                        }
282                    }
283                }
284                Some('o') | Some('O') => {
285                    state.advance(2);
286                    while let Some(c) = state.peek() {
287                        if ('0'..='7').contains(&c) || c == '_' {
288                            state.advance(1);
289                        }
290                        else {
291                            break;
292                        }
293                    }
294                }
295                _ => {
296                    state.advance(1);
297                    while let Some(c) = state.peek() {
298                        if c.is_ascii_digit() || c == '_' {
299                            state.advance(1);
300                        }
301                        else {
302                            break;
303                        }
304                    }
305                }
306            }
307        }
308        else {
309            state.advance(1);
310            while let Some(c) = state.peek() {
311                if c.is_ascii_digit() || c == '_' {
312                    state.advance(1);
313                }
314                else {
315                    break;
316                }
317            }
318        }
319
320        // 小数部分
321        if state.peek() == Some('.') {
322            let n1 = state.peek_next_n(1);
323            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
324                is_float = true;
325                state.advance(1); // consume '.'
326                while let Some(c) = state.peek() {
327                    if c.is_ascii_digit() || c == '_' {
328                        state.advance(1);
329                    }
330                    else {
331                        break;
332                    }
333                }
334            }
335        }
336
337        // 指数部分
338        if let Some(c) = state.peek() {
339            if c == 'e' || c == 'E' {
340                let n1 = state.peek_next_n(1);
341                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
342                    is_float = true;
343                    state.advance(1);
344                    if let Some(sign) = state.peek() {
345                        if sign == '+' || sign == '-' {
346                            state.advance(1);
347                        }
348                    }
349                    while let Some(d) = state.peek() {
350                        if d.is_ascii_digit() || d == '_' {
351                            state.advance(1);
352                        }
353                        else {
354                            break;
355                        }
356                    }
357                }
358            }
359        }
360
361        let end = state.get_position();
362        state.add_token(if is_float { ZigSyntaxKind::FloatLiteral } else { ZigSyntaxKind::IntegerLiteral }, start, end);
363        true
364    }
365
366    /// 解析标识符或关键字
367    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
368        let start = state.get_position();
369        let ch = match state.current() {
370            Some(c) => c,
371            None => return false,
372        };
373
374        if !(ch.is_ascii_alphabetic() || ch == '_') {
375            return false;
376        }
377
378        state.advance(1);
379        while let Some(c) = state.current() {
380            if c.is_ascii_alphanumeric() || c == '_' {
381                state.advance(1);
382            }
383            else {
384                break;
385            }
386        }
387
388        let end = state.get_position();
389        let text = state.get_text_in((start..end).into());
390        let kind = self.get_keyword_or_identifier(text);
391        state.add_token(kind, start, state.get_position());
392        true
393    }
394
395    /// 获取关键字或标识符类型
396    fn get_keyword_or_identifier(&self, text: &str) -> ZigSyntaxKind {
397        match text {
398            // 基本结构
399            "const" => ZigSyntaxKind::Const,
400            "var" => ZigSyntaxKind::Var,
401            "fn" => ZigSyntaxKind::Fn,
402            "struct" => ZigSyntaxKind::Struct,
403            "union" => ZigSyntaxKind::Union,
404            "enum" => ZigSyntaxKind::Enum,
405            "opaque" => ZigSyntaxKind::Opaque,
406            "type" => ZigSyntaxKind::Type,
407            "comptime" => ZigSyntaxKind::Comptime,
408            "inline" => ZigSyntaxKind::Inline,
409            "noinline" => ZigSyntaxKind::NoInline,
410            "pub" => ZigSyntaxKind::Pub,
411            "export" => ZigSyntaxKind::Export,
412            "extern" => ZigSyntaxKind::Extern,
413            "packed" => ZigSyntaxKind::Packed,
414            "align" => ZigSyntaxKind::Align,
415            "callconv" => ZigSyntaxKind::CallConv,
416            "linksection" => ZigSyntaxKind::LinkSection,
417
418            // 控制流
419            "if" => ZigSyntaxKind::If,
420            "else" => ZigSyntaxKind::Else,
421            "switch" => ZigSyntaxKind::Switch,
422            "while" => ZigSyntaxKind::While,
423            "for" => ZigSyntaxKind::For,
424            "break" => ZigSyntaxKind::Break,
425            "continue" => ZigSyntaxKind::Continue,
426            "return" => ZigSyntaxKind::Return,
427            "defer" => ZigSyntaxKind::Defer,
428            "errdefer" => ZigSyntaxKind::ErrDefer,
429            "unreachable" => ZigSyntaxKind::Unreachable,
430            "noreturn" => ZigSyntaxKind::NoReturn,
431
432            // 错误处理
433            "try" => ZigSyntaxKind::TryKeyword,
434            "catch" => ZigSyntaxKind::CatchKeyword,
435            "orelse" => ZigSyntaxKind::OrElse,
436            "error" => ZigSyntaxKind::ErrorKeyword,
437
438            // 测试和异步
439            "test" => ZigSyntaxKind::Test,
440            "async" => ZigSyntaxKind::Async,
441            "await" => ZigSyntaxKind::AwaitKeyword,
442            "suspend" => ZigSyntaxKind::Suspend,
443            "resume" => ZigSyntaxKind::Resume,
444            "cancel" => ZigSyntaxKind::Cancel,
445
446            // 内存管理
447            "undefined" => ZigSyntaxKind::Undefined,
448            "null" => ZigSyntaxKind::Null,
449            "volatile" => ZigSyntaxKind::Volatile,
450            "allowzero" => ZigSyntaxKind::AllowZero,
451            "noalias" => ZigSyntaxKind::NoAlias,
452
453            // 逻辑运算
454            "and" => ZigSyntaxKind::And,
455            "or" => ZigSyntaxKind::Or,
456
457            // 其他
458            "anyframe" => ZigSyntaxKind::AnyFrame,
459            "anytype" => ZigSyntaxKind::AnyType,
460            "threadlocal" => ZigSyntaxKind::ThreadLocal,
461
462            // 基本类型
463            "bool" => ZigSyntaxKind::Bool,
464            "i8" => ZigSyntaxKind::I8,
465            "i16" => ZigSyntaxKind::I16,
466            "i32" => ZigSyntaxKind::I32,
467            "i64" => ZigSyntaxKind::I64,
468            "i128" => ZigSyntaxKind::I128,
469            "isize" => ZigSyntaxKind::Isize,
470            "u8" => ZigSyntaxKind::U8,
471            "u16" => ZigSyntaxKind::U16,
472            "u32" => ZigSyntaxKind::U32,
473            "u64" => ZigSyntaxKind::U64,
474            "u128" => ZigSyntaxKind::U128,
475            "usize" => ZigSyntaxKind::Usize,
476            "f16" => ZigSyntaxKind::F16,
477            "f32" => ZigSyntaxKind::F32,
478            "f64" => ZigSyntaxKind::F64,
479            "f80" => ZigSyntaxKind::F80,
480            "f128" => ZigSyntaxKind::F128,
481            "c_short" => ZigSyntaxKind::C_Short,
482            "c_ushort" => ZigSyntaxKind::C_UShort,
483            "c_int" => ZigSyntaxKind::C_Int,
484            "c_uint" => ZigSyntaxKind::C_UInt,
485            "c_long" => ZigSyntaxKind::C_Long,
486            "c_ulong" => ZigSyntaxKind::C_ULong,
487            "c_longlong" => ZigSyntaxKind::C_LongLong,
488            "c_ulonglong" => ZigSyntaxKind::C_ULongLong,
489            "c_longdouble" => ZigSyntaxKind::C_LongDouble,
490            "c_void" => ZigSyntaxKind::C_Void,
491            "void" => ZigSyntaxKind::Void,
492            "comptime_int" => ZigSyntaxKind::Comptime_Int,
493            "comptime_float" => ZigSyntaxKind::Comptime_Float,
494
495            // 布尔字面量
496            "true" | "false" => ZigSyntaxKind::BooleanLiteral,
497
498            _ => ZigSyntaxKind::Identifier,
499        }
500    }
501
502    /// 解析内置函数 @...
503    fn lex_builtin<S: Source>(&self, state: &mut State<S>) -> bool {
504        let start = state.get_position();
505
506        if state.current() != Some('@') {
507            return false;
508        }
509
510        state.advance(1); // consume '@'
511
512        // 读取内置函数名
513        while let Some(c) = state.peek() {
514            if c.is_ascii_alphanumeric() || c == '_' {
515                state.advance(1);
516            }
517            else {
518                break;
519            }
520        }
521
522        state.add_token(ZigSyntaxKind::At, start, state.get_position());
523        true
524    }
525
526    /// 解析操作符
527    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
528        let start = state.get_position();
529        let rest = state.rest();
530
531        // 优先匹配较长的操作符
532        let patterns: &[(&str, ZigSyntaxKind)] = &[
533            ("**", ZigSyntaxKind::StarStar),
534            ("+%", ZigSyntaxKind::PlusPercent),
535            ("-%", ZigSyntaxKind::MinusPercent),
536            ("*%", ZigSyntaxKind::StarPercent),
537            ("++", ZigSyntaxKind::PlusPlus),
538            ("<<", ZigSyntaxKind::LessLess),
539            (">>", ZigSyntaxKind::GreaterGreater),
540            ("==", ZigSyntaxKind::Equal),
541            ("!=", ZigSyntaxKind::NotEqual),
542            ("<=", ZigSyntaxKind::LessEqual),
543            (">=", ZigSyntaxKind::GreaterEqual),
544            ("+=", ZigSyntaxKind::PlusAssign),
545            ("-=", ZigSyntaxKind::MinusAssign),
546            ("*=", ZigSyntaxKind::StarAssign),
547            ("/=", ZigSyntaxKind::SlashAssign),
548            ("%=", ZigSyntaxKind::PercentAssign),
549            ("&=", ZigSyntaxKind::AmpersandAssign),
550            ("|=", ZigSyntaxKind::PipeAssign),
551            ("^=", ZigSyntaxKind::CaretAssign),
552            ("<<=", ZigSyntaxKind::LessLessAssign),
553            (">>=", ZigSyntaxKind::GreaterGreaterAssign),
554            ("...", ZigSyntaxKind::DotDotDot),
555            ("..", ZigSyntaxKind::DotDot),
556            ("=>", ZigSyntaxKind::FatArrow),
557        ];
558
559        for (pat, kind) in patterns {
560            if rest.starts_with(pat) {
561                state.advance(pat.len());
562                state.add_token(*kind, start, state.get_position());
563                return true;
564            }
565        }
566
567        // 单字符操作符
568        if let Some(ch) = state.current() {
569            let kind = match ch {
570                '+' => Some(ZigSyntaxKind::Plus),
571                '-' => Some(ZigSyntaxKind::Minus),
572                '*' => Some(ZigSyntaxKind::Star),
573                '/' => Some(ZigSyntaxKind::Slash),
574                '%' => Some(ZigSyntaxKind::Percent),
575                '&' => Some(ZigSyntaxKind::Ampersand),
576                '|' => Some(ZigSyntaxKind::Pipe),
577                '^' => Some(ZigSyntaxKind::Caret),
578                '~' => Some(ZigSyntaxKind::Tilde),
579                '=' => Some(ZigSyntaxKind::Assign),
580                '<' => Some(ZigSyntaxKind::Less),
581                '>' => Some(ZigSyntaxKind::Greater),
582                '.' => Some(ZigSyntaxKind::Dot),
583                '!' => Some(ZigSyntaxKind::Exclamation),
584                '?' => Some(ZigSyntaxKind::Question),
585                _ => None,
586            };
587
588            if let Some(k) = kind {
589                state.advance(ch.len_utf8());
590                state.add_token(k, start, state.get_position());
591                return true;
592            }
593        }
594
595        false
596    }
597
598    /// 解析单字符token
599    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
600        let start = state.get_position();
601
602        if let Some(ch) = state.current() {
603            let kind = match ch {
604                '(' => ZigSyntaxKind::LeftParen,
605                ')' => ZigSyntaxKind::RightParen,
606                '{' => ZigSyntaxKind::LeftBrace,
607                '}' => ZigSyntaxKind::RightBrace,
608                '[' => ZigSyntaxKind::LeftBracket,
609                ']' => ZigSyntaxKind::RightBracket,
610                ',' => ZigSyntaxKind::Comma,
611                ';' => ZigSyntaxKind::Semicolon,
612                ':' => ZigSyntaxKind::Colon,
613                _ => return false,
614            };
615
616            state.advance(ch.len_utf8());
617            state.add_token(kind, start, state.get_position());
618            return true;
619        }
620
621        false
622    }
623}