Skip to main content

oak_zig/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use crate::{language::ZigLanguage, lexer::token_type::ZigTokenType};
3pub mod token_type;
4use oak_core::{
5    Lexer, LexerCache, LexerState, OakError, Source,
6    lexer::{LexOutput, WhitespaceConfig},
7};
8use std::sync::LazyLock;
9
10type State<'a, S> = LexerState<'a, S, ZigLanguage>;
11
12static ZIG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
13
14#[derive(Clone)]
15pub struct ZigLexer<'config> {
16    _config: &'config ZigLanguage,
17}
18
19impl<'config> Lexer<ZigLanguage> for ZigLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ZigLanguage>) -> LexOutput<ZigLanguage> {
21        let mut state = State::new_with_cache(source, 0, cache);
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof()
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> ZigLexer<'config> {
31    pub fn new(config: &'config ZigLanguage) -> Self {
32        Self { _config: config }
33    }
34
35    /// 主要的词法分析循环
36    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.skip_comment(state) {
45                continue;
46            }
47
48            if self.lex_string_literal(state) {
49                continue;
50            }
51
52            if self.lex_char_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier_or_keyword(state) {
61                continue;
62            }
63
64            if self.lex_builtin(state) {
65                continue;
66            }
67
68            if self.lex_operators(state) {
69                continue;
70            }
71
72            if self.lex_single_char_tokens(state) {
73                continue;
74            }
75
76            // 如果没有匹配到任何规则,前进一个字符并标记为错误
77            let start_pos = state.get_position();
78            if let Some(ch) = state.peek() {
79                state.advance(ch.len_utf8());
80                state.add_token(ZigTokenType::Error, start_pos, state.get_position())
81            }
82
83            state.advance_if_dead_lock(safe_point)
84        }
85
86        Ok(())
87    }
88
89    /// 跳过空白字符
90    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
91        ZIG_WHITESPACE.scan(state, ZigTokenType::Whitespace)
92    }
93
94    /// 跳过注释
95    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
96        let start = state.get_position();
97        let rest = state.rest();
98
99        // 行注释: // ... 直到换行
100        if rest.starts_with("//") {
101            state.advance(2);
102
103            // 检查是否是文档注释 ///
104            let is_doc_comment = if state.peek() == Some('/') {
105                state.advance(1);
106                true
107            }
108            else {
109                false
110            };
111
112            while let Some(ch) = state.peek() {
113                if ch == '\n' || ch == '\r' {
114                    break;
115                }
116                state.advance(ch.len_utf8())
117            }
118
119            let kind = if is_doc_comment { ZigTokenType::DocComment } else { ZigTokenType::Comment };
120            state.add_token(kind, start, state.get_position());
121            return true;
122        }
123
124        false
125    }
126
127    /// 解析字符串字面量
128    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
129        let start = state.get_position();
130
131        // 多行字符串: \\...
132        if state.rest().starts_with("\\\\") {
133            state.advance(2);
134
135            // 跳过到行尾
136            while let Some(ch) = state.peek() {
137                if ch == '\n' {
138                    state.advance(1);
139                    break;
140                }
141                state.advance(ch.len_utf8())
142            }
143
144            // 读取多行字符串内容
145            while state.not_at_end() {
146                let _line_start = state.get_position();
147
148                // 检查是否是续行
149                if !state.rest().starts_with("\\\\") {
150                    break;
151                }
152
153                state.advance(2);
154
155                // 读取到行尾
156                while let Some(ch) = state.peek() {
157                    if ch == '\n' {
158                        state.advance(1);
159                        break;
160                    }
161                    state.advance(ch.len_utf8())
162                }
163            }
164
165            state.add_token(ZigTokenType::StringLiteral, start, state.get_position());
166            return true;
167        }
168
169        // 普通字符串: "..."
170        if state.current() == Some('"') {
171            state.advance(1);
172            while let Some(ch) = state.peek() {
173                if ch == '"' {
174                    state.advance(1);
175                    break;
176                }
177                if ch == '\\' {
178                    state.advance(1);
179                    if let Some(next) = state.peek() {
180                        state.advance(next.len_utf8())
181                    }
182                    continue;
183                }
184                state.advance(ch.len_utf8())
185            }
186            state.add_token(ZigTokenType::StringLiteral, start, state.get_position());
187            return true;
188        }
189
190        false
191    }
192
193    /// 解析字符字面量
194    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
195        let start = state.get_position();
196        if state.current() == Some('\'') {
197            state.advance(1);
198            while let Some(ch) = state.peek() {
199                if ch == '\'' {
200                    state.advance(1);
201                    break;
202                }
203                if ch == '\\' {
204                    state.advance(1);
205                    if let Some(next) = state.peek() {
206                        state.advance(next.len_utf8())
207                    }
208                    continue;
209                }
210                state.advance(ch.len_utf8())
211            }
212            state.add_token(ZigTokenType::CharLiteral, start, state.get_position());
213            return true;
214        }
215        false
216    }
217
218    /// 解析数字字面量
219    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
220        let start = state.get_position();
221        let ch = state.current();
222        let mut is_float = false;
223
224        if let Some(ch) = ch {
225            if ch.is_ascii_digit() {
226                state.advance(1);
227                // 处理十六进制、二进制、八进制
228                if ch == '0' {
229                    if let Some(next) = state.peek() {
230                        match next {
231                            'x' | 'X' => {
232                                state.advance(1);
233                                state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
234                            }
235                            'b' | 'B' => {
236                                state.advance(1);
237                                state.take_while(|c| c == '0' || c == '1' || c == '_');
238                            }
239                            'o' | 'O' => {
240                                state.advance(1);
241                                state.take_while(|c| ('0'..='7').contains(&c) || c == '_');
242                            }
243                            _ => {
244                                state.take_while(|c| c.is_ascii_digit() || c == '_');
245                            }
246                        }
247                    }
248                }
249                else {
250                    state.take_while(|c| c.is_ascii_digit() || c == '_');
251                }
252
253                // 处理小数点
254                if state.current() == Some('.') {
255                    if let Some(next) = state.peek() {
256                        if next.is_ascii_digit() {
257                            is_float = true;
258                            state.advance(1);
259                            state.take_while(|c| c.is_ascii_digit() || c == '_');
260                        }
261                    }
262                }
263
264                // 处理指数
265                if let Some(c) = state.current() {
266                    if c == 'e' || c == 'E' || c == 'p' || c == 'P' {
267                        is_float = true;
268                        state.advance(1);
269                        if let Some(next) = state.peek() {
270                            if next == '+' || next == '-' {
271                                state.advance(1);
272                            }
273                        }
274                        state.take_while(|c| c.is_ascii_digit() || c == '_');
275                    }
276                }
277
278                let kind = if is_float { ZigTokenType::FloatLiteral } else { ZigTokenType::IntegerLiteral };
279                state.add_token(kind, start, state.get_position());
280                return true;
281            }
282        }
283        false
284    }
285
286    /// 解析标识符或关键字
287    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
288        let start = state.get_position();
289        if let Some(ch) = state.current() {
290            if ch.is_ascii_alphabetic() || ch == '_' {
291                state.advance(ch.len_utf8());
292                state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
293
294                let end = state.get_position();
295                let text = state.get_text_in((start..end).into());
296                let kind = self.get_keyword_or_identifier(&text);
297                state.add_token(kind, start, state.get_position());
298                return true;
299            }
300        }
301        false
302    }
303
304    /// 获取关键字或标识符类型
305    fn get_keyword_or_identifier(&self, text: &str) -> ZigTokenType {
306        match text {
307            // 基本结构
308            "const" => ZigTokenType::Const,
309            "var" => ZigTokenType::Var,
310            "fn" => ZigTokenType::Fn,
311            "struct" => ZigTokenType::Struct,
312            "union" => ZigTokenType::Union,
313            "enum" => ZigTokenType::Enum,
314            "opaque" => ZigTokenType::Opaque,
315            "type" => ZigTokenType::Type,
316            "comptime" => ZigTokenType::Comptime,
317            "inline" => ZigTokenType::Inline,
318            "noinline" => ZigTokenType::NoInline,
319            "pub" => ZigTokenType::Pub,
320            "export" => ZigTokenType::Export,
321            "extern" => ZigTokenType::Extern,
322            "packed" => ZigTokenType::Packed,
323            "align" => ZigTokenType::Align,
324            "callconv" => ZigTokenType::CallConv,
325            "linksection" => ZigTokenType::LinkSection,
326
327            // 控制流
328            "if" => ZigTokenType::If,
329            "else" => ZigTokenType::Else,
330            "switch" => ZigTokenType::Switch,
331            "while" => ZigTokenType::While,
332            "for" => ZigTokenType::For,
333            "break" => ZigTokenType::Break,
334            "continue" => ZigTokenType::Continue,
335            "return" => ZigTokenType::Return,
336            "defer" => ZigTokenType::Defer,
337            "errdefer" => ZigTokenType::ErrDefer,
338            "unreachable" => ZigTokenType::Unreachable,
339            "noreturn" => ZigTokenType::NoReturn,
340
341            // 错误处理
342            "try" => ZigTokenType::TryKeyword,
343            "catch" => ZigTokenType::CatchKeyword,
344            "orelse" => ZigTokenType::OrElse,
345            "error" => ZigTokenType::ErrorKeyword,
346
347            // 测试和异步
348            "test" => ZigTokenType::Test,
349            "async" => ZigTokenType::Async,
350            "await" => ZigTokenType::AwaitKeyword,
351            "suspend" => ZigTokenType::Suspend,
352            "resume" => ZigTokenType::Resume,
353            "cancel" => ZigTokenType::Cancel,
354
355            // 内存管理
356            "undefined" => ZigTokenType::Undefined,
357            "null" => ZigTokenType::Null,
358            "volatile" => ZigTokenType::Volatile,
359            "allowzero" => ZigTokenType::AllowZero,
360            "noalias" => ZigTokenType::NoAlias,
361
362            // 逻辑运算
363            "and" => ZigTokenType::And,
364            "or" => ZigTokenType::Or,
365
366            // 其他
367            "anyframe" => ZigTokenType::AnyFrame,
368            "anytype" => ZigTokenType::AnyType,
369            "threadlocal" => ZigTokenType::ThreadLocal,
370
371            // 基本类型
372            "bool" => ZigTokenType::Bool,
373            "i8" => ZigTokenType::I8,
374            "i16" => ZigTokenType::I16,
375            "i32" => ZigTokenType::I32,
376            "i64" => ZigTokenType::I64,
377            "i128" => ZigTokenType::I128,
378            "isize" => ZigTokenType::Isize,
379            "u8" => ZigTokenType::U8,
380            "u16" => ZigTokenType::U16,
381            "u32" => ZigTokenType::U32,
382            "u64" => ZigTokenType::U64,
383            "u128" => ZigTokenType::U128,
384            "usize" => ZigTokenType::Usize,
385            "f16" => ZigTokenType::F16,
386            "f32" => ZigTokenType::F32,
387            "f64" => ZigTokenType::F64,
388            "f80" => ZigTokenType::F80,
389            "f128" => ZigTokenType::F128,
390            "c_short" => ZigTokenType::CShort,
391            "c_ushort" => ZigTokenType::CUshort,
392            "c_int" => ZigTokenType::CInt,
393            "c_uint" => ZigTokenType::CUint,
394            "c_long" => ZigTokenType::CLong,
395            "c_ulong" => ZigTokenType::CUlong,
396            "c_longlong" => ZigTokenType::CLongLong,
397            "c_ulonglong" => ZigTokenType::CUlongLong,
398            "c_longdouble" => ZigTokenType::CLongDouble,
399            "c_void" => ZigTokenType::CVoid,
400            "void" => ZigTokenType::Void,
401            "comptime_int" => ZigTokenType::ComptimeInt,
402            "comptime_float" => ZigTokenType::ComptimeFloat,
403
404            // 布尔字面量
405            "true" | "false" => ZigTokenType::BooleanLiteral,
406
407            _ => ZigTokenType::Identifier,
408        }
409    }
410
411    /// 解析内置标识符 (↯import 等)
412    fn lex_builtin<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
413        let start = state.get_position();
414        if state.current() == Some('↯') {
415            state.advance(1);
416            if let Some(ch) = state.current() {
417                if ch.is_ascii_alphabetic() || ch == '_' {
418                    state.advance(ch.len_utf8());
419                    state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
420                    state.add_token(ZigTokenType::BuiltinIdentifier, start, state.get_position());
421                    return true;
422                }
423            }
424        }
425        false
426    }
427
428    /// 解析操作符
429    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
430        let start = state.get_position();
431        let rest = state.rest();
432
433        // 尝试匹配最长的操作符
434        let ops = [
435            ("<<=", ZigTokenType::LessLessAssign),
436            (">>=", ZigTokenType::GreaterGreaterAssign),
437            ("...", ZigTokenType::DotDotDot),
438            ("==", ZigTokenType::Equal),
439            ("!=", ZigTokenType::NotEqual),
440            ("<=", ZigTokenType::LessEqual),
441            (">=", ZigTokenType::GreaterEqual),
442            ("&&", ZigTokenType::AndAnd),
443            ("||", ZigTokenType::OrOr),
444            ("+=", ZigTokenType::PlusAssign),
445            ("-=", ZigTokenType::MinusAssign),
446            ("*=", ZigTokenType::StarAssign),
447            ("/=", ZigTokenType::SlashAssign),
448            ("%=", ZigTokenType::PercentAssign),
449            ("&=", ZigTokenType::AmpersandAssign),
450            ("|=", ZigTokenType::PipeAssign),
451            ("^=", ZigTokenType::CaretAssign),
452            ("++", ZigTokenType::PlusPlus),
453            ("--", ZigTokenType::MinusMinus),
454            ("**", ZigTokenType::StarStar),
455            ("->", ZigTokenType::Arrow),
456            ("=>", ZigTokenType::FatArrow),
457            ("<<", ZigTokenType::LessLess),
458            (">>", ZigTokenType::GreaterGreater),
459            (".?", ZigTokenType::DotQuestion),
460            (".*", ZigTokenType::DotStar),
461        ];
462
463        for (op, kind) in ops {
464            if rest.starts_with(op) {
465                state.advance(op.len());
466                state.add_token(kind, start, state.get_position());
467                return true;
468            }
469        }
470
471        false
472    }
473
474    /// 解析单字符标记
475    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
476        let start = state.get_position();
477        if let Some(ch) = state.current() {
478            let kind = match ch {
479                '(' => ZigTokenType::LeftParen,
480                ')' => ZigTokenType::RightParen,
481                '{' => ZigTokenType::LeftBrace,
482                '}' => ZigTokenType::RightBrace,
483                '[' => ZigTokenType::LeftBracket,
484                ']' => ZigTokenType::RightBracket,
485                ',' => ZigTokenType::Comma,
486                '.' => ZigTokenType::Dot,
487                ':' => ZigTokenType::Colon,
488                ';' => ZigTokenType::Semicolon,
489                '+' => ZigTokenType::Plus,
490                '-' => ZigTokenType::Minus,
491                '*' => ZigTokenType::Star,
492                '/' => ZigTokenType::Slash,
493                '%' => ZigTokenType::Percent,
494                '&' => ZigTokenType::Ampersand,
495                '|' => ZigTokenType::Pipe,
496                '^' => ZigTokenType::Caret,
497                '~' => ZigTokenType::Tilde,
498                '!' => ZigTokenType::Exclamation,
499                '?' => ZigTokenType::Question,
500                '<' => ZigTokenType::Less,
501                '>' => ZigTokenType::Greater,
502                '=' => ZigTokenType::Assign,
503                _ => return false,
504            };
505            state.advance(1);
506            state.add_token(kind, start, state.get_position());
507            return true;
508        }
509        false
510    }
511}