Skip to main content

oak_zig/lexer/
mod.rs

1use crate::{kind::ZigSyntaxKind, language::ZigLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, ZigLanguage>;
10
11static ZIG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12
/// Lexer that turns Zig source text into a stream of `ZigSyntaxKind` tokens.
///
/// The lexer only borrows its language configuration, so cloning it is cheap.
#[derive(Clone)]
pub struct ZigLexer<'config> {
    /// Language configuration handed to `new`. None of the lexing routines
    /// visible in this file read it yet, hence the leading underscore.
    _config: &'config ZigLanguage,
}
17
18impl<'config> Lexer<ZigLanguage> for ZigLexer<'config> {
19    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ZigLanguage>) -> LexOutput<ZigLanguage> {
20        let mut state = State::new_with_cache(source, 0, cache);
21        let result = self.run(&mut state);
22        if result.is_ok() {
23            state.add_eof();
24        }
25        state.finish_with_cache(result, cache)
26    }
27}
28
29impl<'config> ZigLexer<'config> {
30    pub fn new(config: &'config ZigLanguage) -> Self {
31        Self { _config: config }
32    }
33
34    /// 主要的词法分析循环
35    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            let safe_point = state.get_position();
38
39            if self.skip_whitespace(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_string_literal(state) {
48                continue;
49            }
50
51            if self.lex_char_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_builtin(state) {
64                continue;
65            }
66
67            if self.lex_operators(state) {
68                continue;
69            }
70
71            if self.lex_single_char_tokens(state) {
72                continue;
73            }
74
75            // 如果没有匹配到任何规则,前进一个字符并标记为错误
76            let start_pos = state.get_position();
77            if let Some(ch) = state.peek() {
78                state.advance(ch.len_utf8());
79                state.add_token(ZigSyntaxKind::Error, start_pos, state.get_position());
80            }
81
82            state.advance_if_dead_lock(safe_point);
83        }
84
85        Ok(())
86    }
87
88    /// 跳过空白字符
89    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
90        ZIG_WHITESPACE.scan(state, ZigSyntaxKind::Whitespace)
91    }
92
93    /// 跳过注释
94    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
95        let start = state.get_position();
96        let rest = state.rest();
97
98        // 行注释: // ... 直到换行
99        if rest.starts_with("//") {
100            state.advance(2);
101
102            // 检查是否是文档注释 ///
103            let is_doc_comment = if state.peek() == Some('/') {
104                state.advance(1);
105                true
106            }
107            else {
108                false
109            };
110
111            while let Some(ch) = state.peek() {
112                if ch == '\n' || ch == '\r' {
113                    break;
114                }
115                state.advance(ch.len_utf8());
116            }
117
118            let kind = if is_doc_comment { ZigSyntaxKind::DocComment } else { ZigSyntaxKind::Comment };
119            state.add_token(kind, start, state.get_position());
120            return true;
121        }
122
123        false
124    }
125
126    /// 解析字符串字面量
127    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
128        let start = state.get_position();
129
130        // 多行字符串: \\...
131        if state.rest().starts_with("\\\\") {
132            state.advance(2);
133
134            // 跳过到行尾
135            while let Some(ch) = state.peek() {
136                if ch == '\n' {
137                    state.advance(1);
138                    break;
139                }
140                state.advance(ch.len_utf8());
141            }
142
143            // 读取多行字符串内容
144            while state.not_at_end() {
145                let _line_start = state.get_position();
146
147                // 检查是否是续行
148                if !state.rest().starts_with("\\\\") {
149                    break;
150                }
151
152                state.advance(2);
153
154                // 读取到行尾
155                while let Some(ch) = state.peek() {
156                    if ch == '\n' {
157                        state.advance(1);
158                        break;
159                    }
160                    state.advance(ch.len_utf8());
161                }
162            }
163
164            state.add_token(ZigSyntaxKind::StringLiteral, start, state.get_position());
165            return true;
166        }
167
168        // 普通字符串: "..."
169        if state.current() == Some('"') {
170            state.advance(1);
171            while let Some(ch) = state.peek() {
172                if ch == '"' {
173                    state.advance(1);
174                    break;
175                }
176                if ch == '\\' {
177                    state.advance(1);
178                    if let Some(next) = state.peek() {
179                        state.advance(next.len_utf8());
180                    }
181                    continue;
182                }
183                state.advance(ch.len_utf8());
184            }
185            state.add_token(ZigSyntaxKind::StringLiteral, start, state.get_position());
186            return true;
187        }
188
189        false
190    }
191
192    /// 解析字符字面量
193    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
194        let start = state.get_position();
195        if state.current() == Some('\'') {
196            state.advance(1);
197            while let Some(ch) = state.peek() {
198                if ch == '\'' {
199                    state.advance(1);
200                    break;
201                }
202                if ch == '\\' {
203                    state.advance(1);
204                    if let Some(next) = state.peek() {
205                        state.advance(next.len_utf8());
206                    }
207                    continue;
208                }
209                state.advance(ch.len_utf8());
210            }
211            state.add_token(ZigSyntaxKind::CharLiteral, start, state.get_position());
212            return true;
213        }
214        false
215    }
216
217    /// 解析数字字面量
218    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219        let start = state.get_position();
220        let ch = state.current();
221        let mut is_float = false;
222
223        if let Some(ch) = ch {
224            if ch.is_ascii_digit() {
225                state.advance(1);
226                // 处理十六进制、二进制、八进制
227                if ch == '0' {
228                    if let Some(next) = state.peek() {
229                        match next {
230                            'x' | 'X' => {
231                                state.advance(1);
232                                state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
233                            }
234                            'b' | 'B' => {
235                                state.advance(1);
236                                state.take_while(|c| c == '0' || c == '1' || c == '_');
237                            }
238                            'o' | 'O' => {
239                                state.advance(1);
240                                state.take_while(|c| ('0'..='7').contains(&c) || c == '_');
241                            }
242                            _ => {
243                                state.take_while(|c| c.is_ascii_digit() || c == '_');
244                            }
245                        }
246                    }
247                }
248                else {
249                    state.take_while(|c| c.is_ascii_digit() || c == '_');
250                }
251
252                // 处理小数点
253                if state.current() == Some('.') {
254                    if let Some(next) = state.peek() {
255                        if next.is_ascii_digit() {
256                            is_float = true;
257                            state.advance(1);
258                            state.take_while(|c| c.is_ascii_digit() || c == '_');
259                        }
260                    }
261                }
262
263                // 处理指数
264                if let Some(c) = state.current() {
265                    if c == 'e' || c == 'E' || c == 'p' || c == 'P' {
266                        is_float = true;
267                        state.advance(1);
268                        if let Some(next) = state.peek() {
269                            if next == '+' || next == '-' {
270                                state.advance(1);
271                            }
272                        }
273                        state.take_while(|c| c.is_ascii_digit() || c == '_');
274                    }
275                }
276
277                let kind = if is_float { ZigSyntaxKind::FloatLiteral } else { ZigSyntaxKind::IntegerLiteral };
278                state.add_token(kind, start, state.get_position());
279                return true;
280            }
281        }
282        false
283    }
284
285    /// 解析标识符或关键字
286    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
287        let start = state.get_position();
288        if let Some(ch) = state.current() {
289            if ch.is_ascii_alphabetic() || ch == '_' {
290                state.advance(ch.len_utf8());
291                state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
292
293                let end = state.get_position();
294                let text = state.get_text_in((start..end).into());
295                let kind = self.get_keyword_or_identifier(&text);
296                state.add_token(kind, start, state.get_position());
297                return true;
298            }
299        }
300        false
301    }
302
303    /// 获取关键字或标识符类型
304    fn get_keyword_or_identifier(&self, text: &str) -> ZigSyntaxKind {
305        match text {
306            // 基本结构
307            "const" => ZigSyntaxKind::Const,
308            "var" => ZigSyntaxKind::Var,
309            "fn" => ZigSyntaxKind::Fn,
310            "struct" => ZigSyntaxKind::Struct,
311            "union" => ZigSyntaxKind::Union,
312            "enum" => ZigSyntaxKind::Enum,
313            "opaque" => ZigSyntaxKind::Opaque,
314            "type" => ZigSyntaxKind::Type,
315            "comptime" => ZigSyntaxKind::Comptime,
316            "inline" => ZigSyntaxKind::Inline,
317            "noinline" => ZigSyntaxKind::NoInline,
318            "pub" => ZigSyntaxKind::Pub,
319            "export" => ZigSyntaxKind::Export,
320            "extern" => ZigSyntaxKind::Extern,
321            "packed" => ZigSyntaxKind::Packed,
322            "align" => ZigSyntaxKind::Align,
323            "callconv" => ZigSyntaxKind::CallConv,
324            "linksection" => ZigSyntaxKind::LinkSection,
325
326            // 控制流
327            "if" => ZigSyntaxKind::If,
328            "else" => ZigSyntaxKind::Else,
329            "switch" => ZigSyntaxKind::Switch,
330            "while" => ZigSyntaxKind::While,
331            "for" => ZigSyntaxKind::For,
332            "break" => ZigSyntaxKind::Break,
333            "continue" => ZigSyntaxKind::Continue,
334            "return" => ZigSyntaxKind::Return,
335            "defer" => ZigSyntaxKind::Defer,
336            "errdefer" => ZigSyntaxKind::ErrDefer,
337            "unreachable" => ZigSyntaxKind::Unreachable,
338            "noreturn" => ZigSyntaxKind::NoReturn,
339
340            // 错误处理
341            "try" => ZigSyntaxKind::TryKeyword,
342            "catch" => ZigSyntaxKind::CatchKeyword,
343            "orelse" => ZigSyntaxKind::OrElse,
344            "error" => ZigSyntaxKind::ErrorKeyword,
345
346            // 测试和异步
347            "test" => ZigSyntaxKind::Test,
348            "async" => ZigSyntaxKind::Async,
349            "await" => ZigSyntaxKind::AwaitKeyword,
350            "suspend" => ZigSyntaxKind::Suspend,
351            "resume" => ZigSyntaxKind::Resume,
352            "cancel" => ZigSyntaxKind::Cancel,
353
354            // 内存管理
355            "undefined" => ZigSyntaxKind::Undefined,
356            "null" => ZigSyntaxKind::Null,
357            "volatile" => ZigSyntaxKind::Volatile,
358            "allowzero" => ZigSyntaxKind::AllowZero,
359            "noalias" => ZigSyntaxKind::NoAlias,
360
361            // 逻辑运算
362            "and" => ZigSyntaxKind::And,
363            "or" => ZigSyntaxKind::Or,
364
365            // 其他
366            "anyframe" => ZigSyntaxKind::AnyFrame,
367            "anytype" => ZigSyntaxKind::AnyType,
368            "threadlocal" => ZigSyntaxKind::ThreadLocal,
369
370            // 基本类型
371            "bool" => ZigSyntaxKind::Bool,
372            "i8" => ZigSyntaxKind::I8,
373            "i16" => ZigSyntaxKind::I16,
374            "i32" => ZigSyntaxKind::I32,
375            "i64" => ZigSyntaxKind::I64,
376            "i128" => ZigSyntaxKind::I128,
377            "isize" => ZigSyntaxKind::Isize,
378            "u8" => ZigSyntaxKind::U8,
379            "u16" => ZigSyntaxKind::U16,
380            "u32" => ZigSyntaxKind::U32,
381            "u64" => ZigSyntaxKind::U64,
382            "u128" => ZigSyntaxKind::U128,
383            "usize" => ZigSyntaxKind::Usize,
384            "f16" => ZigSyntaxKind::F16,
385            "f32" => ZigSyntaxKind::F32,
386            "f64" => ZigSyntaxKind::F64,
387            "f80" => ZigSyntaxKind::F80,
388            "f128" => ZigSyntaxKind::F128,
389            "c_short" => ZigSyntaxKind::CShort,
390            "c_ushort" => ZigSyntaxKind::CUshort,
391            "c_int" => ZigSyntaxKind::CInt,
392            "c_uint" => ZigSyntaxKind::CUint,
393            "c_long" => ZigSyntaxKind::CLong,
394            "c_ulong" => ZigSyntaxKind::CUlong,
395            "c_longlong" => ZigSyntaxKind::CLongLong,
396            "c_ulonglong" => ZigSyntaxKind::CUlongLong,
397            "c_longdouble" => ZigSyntaxKind::CLongDouble,
398            "c_void" => ZigSyntaxKind::CVoid,
399            "void" => ZigSyntaxKind::Void,
400            "comptime_int" => ZigSyntaxKind::ComptimeInt,
401            "comptime_float" => ZigSyntaxKind::ComptimeFloat,
402
403            // 布尔字面量
404            "true" | "false" => ZigSyntaxKind::BooleanLiteral,
405
406            _ => ZigSyntaxKind::Identifier,
407        }
408    }
409
410    /// 解析内置标识符 (@import 等)
411    fn lex_builtin<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
412        let start = state.get_position();
413        if state.current() == Some('@') {
414            state.advance(1);
415            if let Some(ch) = state.current() {
416                if ch.is_ascii_alphabetic() || ch == '_' {
417                    state.advance(ch.len_utf8());
418                    state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
419                    state.add_token(ZigSyntaxKind::BuiltinIdentifier, start, state.get_position());
420                    return true;
421                }
422            }
423        }
424        false
425    }
426
427    /// 解析操作符
428    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
429        let start = state.get_position();
430        let rest = state.rest();
431
432        // 尝试匹配最长的操作符
433        let ops = [
434            ("<<=", ZigSyntaxKind::LessLessAssign),
435            (">>=", ZigSyntaxKind::GreaterGreaterAssign),
436            ("...", ZigSyntaxKind::DotDotDot),
437            ("==", ZigSyntaxKind::Equal),
438            ("!=", ZigSyntaxKind::NotEqual),
439            ("<=", ZigSyntaxKind::LessEqual),
440            (">=", ZigSyntaxKind::GreaterEqual),
441            ("&&", ZigSyntaxKind::AndAnd),
442            ("||", ZigSyntaxKind::OrOr),
443            ("+=", ZigSyntaxKind::PlusAssign),
444            ("-=", ZigSyntaxKind::MinusAssign),
445            ("*=", ZigSyntaxKind::StarAssign),
446            ("/=", ZigSyntaxKind::SlashAssign),
447            ("%=", ZigSyntaxKind::PercentAssign),
448            ("&=", ZigSyntaxKind::AmpersandAssign),
449            ("|=", ZigSyntaxKind::PipeAssign),
450            ("^=", ZigSyntaxKind::CaretAssign),
451            ("++", ZigSyntaxKind::PlusPlus),
452            ("--", ZigSyntaxKind::MinusMinus),
453            ("**", ZigSyntaxKind::StarStar),
454            ("->", ZigSyntaxKind::Arrow),
455            ("=>", ZigSyntaxKind::FatArrow),
456            ("<<", ZigSyntaxKind::LessLess),
457            (">>", ZigSyntaxKind::GreaterGreater),
458            (".?", ZigSyntaxKind::DotQuestion),
459            (".*", ZigSyntaxKind::DotStar),
460        ];
461
462        for (op, kind) in ops {
463            if rest.starts_with(op) {
464                state.advance(op.len());
465                state.add_token(kind, start, state.get_position());
466                return true;
467            }
468        }
469
470        false
471    }
472
473    /// 解析单字符标记
474    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
475        let start = state.get_position();
476        if let Some(ch) = state.current() {
477            let kind = match ch {
478                '(' => ZigSyntaxKind::LeftParen,
479                ')' => ZigSyntaxKind::RightParen,
480                '{' => ZigSyntaxKind::LeftBrace,
481                '}' => ZigSyntaxKind::RightBrace,
482                '[' => ZigSyntaxKind::LeftBracket,
483                ']' => ZigSyntaxKind::RightBracket,
484                ',' => ZigSyntaxKind::Comma,
485                '.' => ZigSyntaxKind::Dot,
486                ':' => ZigSyntaxKind::Colon,
487                ';' => ZigSyntaxKind::Semicolon,
488                '+' => ZigSyntaxKind::Plus,
489                '-' => ZigSyntaxKind::Minus,
490                '*' => ZigSyntaxKind::Star,
491                '/' => ZigSyntaxKind::Slash,
492                '%' => ZigSyntaxKind::Percent,
493                '&' => ZigSyntaxKind::Ampersand,
494                '|' => ZigSyntaxKind::Pipe,
495                '^' => ZigSyntaxKind::Caret,
496                '~' => ZigSyntaxKind::Tilde,
497                '!' => ZigSyntaxKind::Exclamation,
498                '?' => ZigSyntaxKind::Question,
499                '<' => ZigSyntaxKind::Less,
500                '>' => ZigSyntaxKind::Greater,
501                '=' => ZigSyntaxKind::Assign,
502                _ => return false,
503            };
504            state.advance(1);
505            state.add_token(kind, start, state.get_position());
506            return true;
507        }
508        false
509    }
510}