// oak_zig/lexer/mod.rs

1use crate::{kind::ZigSyntaxKind, language::ZigLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, ZigLanguage>;
10
11static ZIG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12
/// Stateless lexer for Zig source text.
///
/// The type carries no data, so it is free to copy, and it can be printed
/// for diagnostics or created through `Default` as well as [`ZigLexer::new`].
#[derive(Clone, Copy, Debug, Default)]
pub struct ZigLexer;
15
16impl Lexer<ZigLanguage> for ZigLexer {
17    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<ZigLanguage>) -> LexOutput<ZigLanguage> {
18        let mut state = LexerState::new(source);
19        let result = self.run(&mut state);
20        if result.is_ok() {
21            state.add_eof();
22        }
23        state.finish_with_cache(result, cache)
24    }
25}
26
27impl ZigLexer {
28    pub fn new(_config: &ZigLanguage) -> Self {
29        Self
30    }
31
32    /// 主要的词法分析循环
33    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            let safe_point = state.get_position();
36
37            if self.skip_whitespace(state) {
38                continue;
39            }
40
41            if self.skip_comment(state) {
42                continue;
43            }
44
45            if self.lex_string_literal(state) {
46                continue;
47            }
48
49            if self.lex_char_literal(state) {
50                continue;
51            }
52
53            if self.lex_number_literal(state) {
54                continue;
55            }
56
57            if self.lex_identifier_or_keyword(state) {
58                continue;
59            }
60
61            if self.lex_builtin(state) {
62                continue;
63            }
64
65            if self.lex_operators(state) {
66                continue;
67            }
68
69            if self.lex_single_char_tokens(state) {
70                continue;
71            }
72
73            // 如果没有匹配到任何规则,前进一个字符并标记为错误
74            let start_pos = state.get_position();
75            if let Some(ch) = state.peek() {
76                state.advance(ch.len_utf8());
77                state.add_token(ZigSyntaxKind::Error, start_pos, state.get_position());
78            }
79
80            state.advance_if_dead_lock(safe_point);
81        }
82
83        Ok(())
84    }
85
86    /// 跳过空白字符
87    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        ZIG_WHITESPACE.scan(state, ZigSyntaxKind::Whitespace)
89    }
90
91    /// 跳过注释
92    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
93        let start = state.get_position();
94        let rest = state.rest();
95
96        // 行注释: // ... 直到换行
97        if rest.starts_with("//") {
98            state.advance(2);
99
100            // 检查是否是文档注释 ///
101            let is_doc_comment = if state.peek() == Some('/') {
102                state.advance(1);
103                true
104            }
105            else {
106                false
107            };
108
109            while let Some(ch) = state.peek() {
110                if ch == '\n' || ch == '\r' {
111                    break;
112                }
113                state.advance(ch.len_utf8());
114            }
115
116            let kind = if is_doc_comment { ZigSyntaxKind::DocComment } else { ZigSyntaxKind::Comment };
117            state.add_token(kind, start, state.get_position());
118            return true;
119        }
120
121        false
122    }
123
124    /// 解析字符串字面量
125    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
126        let start = state.get_position();
127
128        // 多行字符串: \\...
129        if state.rest().starts_with("\\\\") {
130            state.advance(2);
131
132            // 跳过到行尾
133            while let Some(ch) = state.peek() {
134                if ch == '\n' {
135                    state.advance(1);
136                    break;
137                }
138                state.advance(ch.len_utf8());
139            }
140
141            // 读取多行字符串内容
142            while state.not_at_end() {
143                let _line_start = state.get_position();
144
145                // 检查是否是续行
146                if !state.rest().starts_with("\\\\") {
147                    break;
148                }
149
150                state.advance(2);
151
152                // 读取到行尾
153                while let Some(ch) = state.peek() {
154                    if ch == '\n' {
155                        state.advance(1);
156                        break;
157                    }
158                    state.advance(ch.len_utf8());
159                }
160            }
161
162            state.add_token(ZigSyntaxKind::StringLiteral, start, state.get_position());
163            return true;
164        }
165
166        // 普通字符串: "..."
167        if state.current() == Some('"') {
168            state.advance(1);
169            while let Some(ch) = state.peek() {
170                if ch == '"' {
171                    state.advance(1);
172                    break;
173                }
174                if ch == '\\' {
175                    state.advance(1);
176                    if let Some(next) = state.peek() {
177                        state.advance(next.len_utf8());
178                    }
179                    continue;
180                }
181                state.advance(ch.len_utf8());
182            }
183            state.add_token(ZigSyntaxKind::StringLiteral, start, state.get_position());
184            return true;
185        }
186
187        false
188    }
189
190    /// 解析字符字面量
191    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
192        let start = state.get_position();
193        if state.current() == Some('\'') {
194            state.advance(1);
195            while let Some(ch) = state.peek() {
196                if ch == '\'' {
197                    state.advance(1);
198                    break;
199                }
200                if ch == '\\' {
201                    state.advance(1);
202                    if let Some(next) = state.peek() {
203                        state.advance(next.len_utf8());
204                    }
205                    continue;
206                }
207                state.advance(ch.len_utf8());
208            }
209            state.add_token(ZigSyntaxKind::CharLiteral, start, state.get_position());
210            return true;
211        }
212        false
213    }
214
215    /// 解析数字字面量
216    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
217        let start = state.get_position();
218        let ch = state.current();
219        let mut is_float = false;
220
221        if let Some(ch) = ch {
222            if ch.is_ascii_digit() {
223                state.advance(1);
224                // 处理十六进制、二进制、八进制
225                if ch == '0' {
226                    if let Some(next) = state.peek() {
227                        match next {
228                            'x' | 'X' => {
229                                state.advance(1);
230                                state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
231                            }
232                            'b' | 'B' => {
233                                state.advance(1);
234                                state.take_while(|c| c == '0' || c == '1' || c == '_');
235                            }
236                            'o' | 'O' => {
237                                state.advance(1);
238                                state.take_while(|c| ('0'..='7').contains(&c) || c == '_');
239                            }
240                            _ => {
241                                state.take_while(|c| c.is_ascii_digit() || c == '_');
242                            }
243                        }
244                    }
245                }
246                else {
247                    state.take_while(|c| c.is_ascii_digit() || c == '_');
248                }
249
250                // 处理小数点
251                if state.current() == Some('.') {
252                    if let Some(next) = state.peek() {
253                        if next.is_ascii_digit() {
254                            is_float = true;
255                            state.advance(1);
256                            state.take_while(|c| c.is_ascii_digit() || c == '_');
257                        }
258                    }
259                }
260
261                // 处理指数
262                if let Some(c) = state.current() {
263                    if c == 'e' || c == 'E' || c == 'p' || c == 'P' {
264                        is_float = true;
265                        state.advance(1);
266                        if let Some(next) = state.peek() {
267                            if next == '+' || next == '-' {
268                                state.advance(1);
269                            }
270                        }
271                        state.take_while(|c| c.is_ascii_digit() || c == '_');
272                    }
273                }
274
275                let kind = if is_float { ZigSyntaxKind::FloatLiteral } else { ZigSyntaxKind::IntegerLiteral };
276                state.add_token(kind, start, state.get_position());
277                return true;
278            }
279        }
280        false
281    }
282
283    /// 解析标识符或关键字
284    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
285        let start = state.get_position();
286        if let Some(ch) = state.current() {
287            if ch.is_ascii_alphabetic() || ch == '_' {
288                state.advance(ch.len_utf8());
289                state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
290
291                let end = state.get_position();
292                let text = state.get_text_in((start..end).into());
293                let kind = self.get_keyword_or_identifier(&text);
294                state.add_token(kind, start, state.get_position());
295                return true;
296            }
297        }
298        false
299    }
300
301    /// 获取关键字或标识符类型
302    fn get_keyword_or_identifier(&self, text: &str) -> ZigSyntaxKind {
303        match text {
304            // 基本结构
305            "const" => ZigSyntaxKind::Const,
306            "var" => ZigSyntaxKind::Var,
307            "fn" => ZigSyntaxKind::Fn,
308            "struct" => ZigSyntaxKind::Struct,
309            "union" => ZigSyntaxKind::Union,
310            "enum" => ZigSyntaxKind::Enum,
311            "opaque" => ZigSyntaxKind::Opaque,
312            "type" => ZigSyntaxKind::Type,
313            "comptime" => ZigSyntaxKind::Comptime,
314            "inline" => ZigSyntaxKind::Inline,
315            "noinline" => ZigSyntaxKind::NoInline,
316            "pub" => ZigSyntaxKind::Pub,
317            "export" => ZigSyntaxKind::Export,
318            "extern" => ZigSyntaxKind::Extern,
319            "packed" => ZigSyntaxKind::Packed,
320            "align" => ZigSyntaxKind::Align,
321            "callconv" => ZigSyntaxKind::CallConv,
322            "linksection" => ZigSyntaxKind::LinkSection,
323
324            // 控制流
325            "if" => ZigSyntaxKind::If,
326            "else" => ZigSyntaxKind::Else,
327            "switch" => ZigSyntaxKind::Switch,
328            "while" => ZigSyntaxKind::While,
329            "for" => ZigSyntaxKind::For,
330            "break" => ZigSyntaxKind::Break,
331            "continue" => ZigSyntaxKind::Continue,
332            "return" => ZigSyntaxKind::Return,
333            "defer" => ZigSyntaxKind::Defer,
334            "errdefer" => ZigSyntaxKind::ErrDefer,
335            "unreachable" => ZigSyntaxKind::Unreachable,
336            "noreturn" => ZigSyntaxKind::NoReturn,
337
338            // 错误处理
339            "try" => ZigSyntaxKind::TryKeyword,
340            "catch" => ZigSyntaxKind::CatchKeyword,
341            "orelse" => ZigSyntaxKind::OrElse,
342            "error" => ZigSyntaxKind::ErrorKeyword,
343
344            // 测试和异步
345            "test" => ZigSyntaxKind::Test,
346            "async" => ZigSyntaxKind::Async,
347            "await" => ZigSyntaxKind::AwaitKeyword,
348            "suspend" => ZigSyntaxKind::Suspend,
349            "resume" => ZigSyntaxKind::Resume,
350            "cancel" => ZigSyntaxKind::Cancel,
351
352            // 内存管理
353            "undefined" => ZigSyntaxKind::Undefined,
354            "null" => ZigSyntaxKind::Null,
355            "volatile" => ZigSyntaxKind::Volatile,
356            "allowzero" => ZigSyntaxKind::AllowZero,
357            "noalias" => ZigSyntaxKind::NoAlias,
358
359            // 逻辑运算
360            "and" => ZigSyntaxKind::And,
361            "or" => ZigSyntaxKind::Or,
362
363            // 其他
364            "anyframe" => ZigSyntaxKind::AnyFrame,
365            "anytype" => ZigSyntaxKind::AnyType,
366            "threadlocal" => ZigSyntaxKind::ThreadLocal,
367
368            // 基本类型
369            "bool" => ZigSyntaxKind::Bool,
370            "i8" => ZigSyntaxKind::I8,
371            "i16" => ZigSyntaxKind::I16,
372            "i32" => ZigSyntaxKind::I32,
373            "i64" => ZigSyntaxKind::I64,
374            "i128" => ZigSyntaxKind::I128,
375            "isize" => ZigSyntaxKind::Isize,
376            "u8" => ZigSyntaxKind::U8,
377            "u16" => ZigSyntaxKind::U16,
378            "u32" => ZigSyntaxKind::U32,
379            "u64" => ZigSyntaxKind::U64,
380            "u128" => ZigSyntaxKind::U128,
381            "usize" => ZigSyntaxKind::Usize,
382            "f16" => ZigSyntaxKind::F16,
383            "f32" => ZigSyntaxKind::F32,
384            "f64" => ZigSyntaxKind::F64,
385            "f80" => ZigSyntaxKind::F80,
386            "f128" => ZigSyntaxKind::F128,
387            "c_short" => ZigSyntaxKind::CShort,
388            "c_ushort" => ZigSyntaxKind::CUshort,
389            "c_int" => ZigSyntaxKind::CInt,
390            "c_uint" => ZigSyntaxKind::CUint,
391            "c_long" => ZigSyntaxKind::CLong,
392            "c_ulong" => ZigSyntaxKind::CUlong,
393            "c_longlong" => ZigSyntaxKind::CLongLong,
394            "c_ulonglong" => ZigSyntaxKind::CUlongLong,
395            "c_longdouble" => ZigSyntaxKind::CLongDouble,
396            "c_void" => ZigSyntaxKind::CVoid,
397            "void" => ZigSyntaxKind::Void,
398            "comptime_int" => ZigSyntaxKind::ComptimeInt,
399            "comptime_float" => ZigSyntaxKind::ComptimeFloat,
400
401            // 布尔字面量
402            "true" | "false" => ZigSyntaxKind::BooleanLiteral,
403
404            _ => ZigSyntaxKind::Identifier,
405        }
406    }
407
408    /// 解析内置标识符 (@import 等)
409    fn lex_builtin<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
410        let start = state.get_position();
411        if state.current() == Some('@') {
412            state.advance(1);
413            if let Some(ch) = state.current() {
414                if ch.is_ascii_alphabetic() || ch == '_' {
415                    state.advance(ch.len_utf8());
416                    state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
417                    state.add_token(ZigSyntaxKind::BuiltinIdentifier, start, state.get_position());
418                    return true;
419                }
420            }
421        }
422        false
423    }
424
425    /// 解析操作符
426    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
427        let start = state.get_position();
428        let rest = state.rest();
429
430        // 尝试匹配最长的操作符
431        let ops = [
432            ("<<=", ZigSyntaxKind::LessLessAssign),
433            (">>=", ZigSyntaxKind::GreaterGreaterAssign),
434            ("...", ZigSyntaxKind::DotDotDot),
435            ("==", ZigSyntaxKind::Equal),
436            ("!=", ZigSyntaxKind::NotEqual),
437            ("<=", ZigSyntaxKind::LessEqual),
438            (">=", ZigSyntaxKind::GreaterEqual),
439            ("&&", ZigSyntaxKind::AndAnd),
440            ("||", ZigSyntaxKind::OrOr),
441            ("+=", ZigSyntaxKind::PlusAssign),
442            ("-=", ZigSyntaxKind::MinusAssign),
443            ("*=", ZigSyntaxKind::StarAssign),
444            ("/=", ZigSyntaxKind::SlashAssign),
445            ("%=", ZigSyntaxKind::PercentAssign),
446            ("&=", ZigSyntaxKind::AmpersandAssign),
447            ("|=", ZigSyntaxKind::PipeAssign),
448            ("^=", ZigSyntaxKind::CaretAssign),
449            ("++", ZigSyntaxKind::PlusPlus),
450            ("--", ZigSyntaxKind::MinusMinus),
451            ("**", ZigSyntaxKind::StarStar),
452            ("->", ZigSyntaxKind::Arrow),
453            ("=>", ZigSyntaxKind::FatArrow),
454            ("<<", ZigSyntaxKind::LessLess),
455            (">>", ZigSyntaxKind::GreaterGreater),
456            (".?", ZigSyntaxKind::DotQuestion),
457            (".*", ZigSyntaxKind::DotStar),
458        ];
459
460        for (op, kind) in ops {
461            if rest.starts_with(op) {
462                state.advance(op.len());
463                state.add_token(kind, start, state.get_position());
464                return true;
465            }
466        }
467
468        false
469    }
470
471    /// 解析单字符标记
472    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
473        let start = state.get_position();
474        if let Some(ch) = state.current() {
475            let kind = match ch {
476                '(' => ZigSyntaxKind::LeftParen,
477                ')' => ZigSyntaxKind::RightParen,
478                '{' => ZigSyntaxKind::LeftBrace,
479                '}' => ZigSyntaxKind::RightBrace,
480                '[' => ZigSyntaxKind::LeftBracket,
481                ']' => ZigSyntaxKind::RightBracket,
482                ',' => ZigSyntaxKind::Comma,
483                '.' => ZigSyntaxKind::Dot,
484                ':' => ZigSyntaxKind::Colon,
485                ';' => ZigSyntaxKind::Semicolon,
486                '+' => ZigSyntaxKind::Plus,
487                '-' => ZigSyntaxKind::Minus,
488                '*' => ZigSyntaxKind::Star,
489                '/' => ZigSyntaxKind::Slash,
490                '%' => ZigSyntaxKind::Percent,
491                '&' => ZigSyntaxKind::Ampersand,
492                '|' => ZigSyntaxKind::Pipe,
493                '^' => ZigSyntaxKind::Caret,
494                '~' => ZigSyntaxKind::Tilde,
495                '!' => ZigSyntaxKind::Exclamation,
496                '?' => ZigSyntaxKind::Question,
497                '<' => ZigSyntaxKind::Less,
498                '>' => ZigSyntaxKind::Greater,
499                '=' => ZigSyntaxKind::Assign,
500                _ => return false,
501            };
502            state.advance(1);
503            state.add_token(kind, start, state.get_position());
504            return true;
505        }
506        false
507    }
508}