// oak_wit/lexer/mod.rs

1use crate::{kind::WitSyntaxKind, language::WitLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError, TextEdit,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, WitLanguage>;
10
11static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WIT_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
13static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct WitLexer<'config> {
17    _config: &'config WitLanguage,
18}
19
20impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WitLanguage>) -> LexOutput<WitLanguage> {
22        let mut state: State<'_, S> = LexerState::new(source);
23        let result = self.run(&mut state);
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> WitLexer<'config> {
29    pub fn new(config: &'config WitLanguage) -> Self {
30        Self { _config: config }
31    }
32
33    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            if self.skip_whitespace(state) {
36                continue;
37            }
38
39            if self.skip_comment(state) {
40                continue;
41            }
42
43            if self.lex_string_literal(state) {
44                continue;
45            }
46
47            if self.lex_number_literal(state) {
48                continue;
49            }
50
51            if self.lex_identifier_or_keyword(state) {
52                continue;
53            }
54
55            if self.lex_punctuation(state) {
56                continue;
57            }
58
59            if self.lex_text(state) {
60                continue;
61            }
62
63            // 如果所有规则都不匹配,跳过当前字符并标记为错误
64            let start_pos = state.get_position();
65            if let Some(ch) = state.peek() {
66                state.advance(ch.len_utf8());
67                state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
68            }
69        }
70
71        // 添加 EOF token
72        let eof_pos = state.get_position();
73        state.add_token(WitSyntaxKind::Eof, eof_pos, eof_pos);
74        Ok(())
75    }
76
77    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
78        WIT_WHITESPACE.scan(state, WitSyntaxKind::Whitespace)
79    }
80
81    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
82        WIT_COMMENT.scan(state, WitSyntaxKind::Comment, WitSyntaxKind::Comment)
83    }
84
85    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
86        WIT_STRING.scan(state, WitSyntaxKind::StringLiteral)
87    }
88
89    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
90        let start_pos = state.get_position();
91        let mut has_digits = false;
92
93        // 处理数字
94        while let Some(ch) = state.peek() {
95            if ch.is_ascii_digit() {
96                state.advance(1);
97                has_digits = true;
98            }
99            else {
100                break;
101            }
102        }
103
104        if has_digits {
105            state.add_token(WitSyntaxKind::IntegerLiteral, start_pos, state.get_position());
106            return true;
107        }
108
109        false
110    }
111
112    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
113        let start_pos = state.get_position();
114
115        if let Some(ch) = state.peek() {
116            if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
117                state.advance(ch.len_utf8());
118
119                // 继续读取标识符字符
120                while let Some(ch) = state.peek() {
121                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
122                        state.advance(ch.len_utf8());
123                    }
124                    else {
125                        break;
126                    }
127                }
128
129                let text = state.get_text_from(start_pos);
130                let token_kind = match text.as_ref() {
131                    // WIT 关键字
132                    "world" => WitSyntaxKind::WorldKw,
133                    "interface" => WitSyntaxKind::InterfaceKw,
134                    "package" => WitSyntaxKind::PackageKw,
135                    "component" => WitSyntaxKind::ComponentKw,
136                    "instance" => WitSyntaxKind::InstanceKw,
137                    "module" => WitSyntaxKind::ModuleKw,
138                    "core" => WitSyntaxKind::CoreKw,
139                    "func" => WitSyntaxKind::FuncKw,
140                    "type" => WitSyntaxKind::TypeKw,
141                    "record" => WitSyntaxKind::RecordKw,
142                    "variant" => WitSyntaxKind::VariantKw,
143                    "enum" => WitSyntaxKind::EnumKw,
144                    "flags" => WitSyntaxKind::FlagsKw,
145                    "union" => WitSyntaxKind::UnionKw,
146                    "tuple" => WitSyntaxKind::TupleKw,
147                    "list" => WitSyntaxKind::ListKw,
148                    "option" => WitSyntaxKind::OptionKw,
149                    "result" => WitSyntaxKind::ResultKw,
150                    "static" => WitSyntaxKind::StaticKw,
151                    "constructor" => WitSyntaxKind::ConstructorKw,
152                    "method" => WitSyntaxKind::MethodKw,
153                    "import" => WitSyntaxKind::ImportKw,
154                    "export" => WitSyntaxKind::ExportKw,
155                    "use" => WitSyntaxKind::UseKw,
156                    "include" => WitSyntaxKind::IncludeKw,
157                    "with" => WitSyntaxKind::WithKw,
158                    "resource" => WitSyntaxKind::ResourceKw,
159                    "bool" => WitSyntaxKind::BoolKw,
160                    "u8" => WitSyntaxKind::U8Kw,
161                    "u16" => WitSyntaxKind::U16Kw,
162                    "u32" => WitSyntaxKind::U32Kw,
163                    "u64" => WitSyntaxKind::U64Kw,
164                    "s8" => WitSyntaxKind::S8Kw,
165                    "s16" => WitSyntaxKind::S16Kw,
166                    "s32" => WitSyntaxKind::S32Kw,
167                    "s64" => WitSyntaxKind::S64Kw,
168                    "f32" => WitSyntaxKind::F32Kw,
169                    "f64" => WitSyntaxKind::F64Kw,
170                    "char" => WitSyntaxKind::CharKw,
171                    "string" => WitSyntaxKind::StringKw,
172                    _ => WitSyntaxKind::Identifier,
173                };
174
175                state.add_token(token_kind, start_pos, state.get_position());
176                return true;
177            }
178        }
179
180        false
181    }
182
183    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
184        let start_pos = state.get_position();
185
186        if let Some(ch) = state.peek() {
187            let token_kind = match ch {
188                '(' => {
189                    state.advance(1);
190                    WitSyntaxKind::LeftParen
191                }
192                ')' => {
193                    state.advance(1);
194                    WitSyntaxKind::RightParen
195                }
196                '{' => {
197                    state.advance(1);
198                    WitSyntaxKind::LeftBrace
199                }
200                '}' => {
201                    state.advance(1);
202                    WitSyntaxKind::RightBrace
203                }
204                '[' => {
205                    state.advance(1);
206                    WitSyntaxKind::LeftBracket
207                }
208                ']' => {
209                    state.advance(1);
210                    WitSyntaxKind::RightBracket
211                }
212                '<' => {
213                    state.advance(1);
214                    WitSyntaxKind::Lt
215                }
216                '>' => {
217                    state.advance(1);
218                    WitSyntaxKind::Gt
219                }
220                ',' => {
221                    state.advance(1);
222                    WitSyntaxKind::Comma
223                }
224                ';' => {
225                    state.advance(1);
226                    WitSyntaxKind::Semicolon
227                }
228                ':' => {
229                    state.advance(1);
230                    WitSyntaxKind::Colon
231                }
232                '=' => {
233                    state.advance(1);
234                    WitSyntaxKind::Assign
235                }
236                '.' => {
237                    state.advance(1);
238                    WitSyntaxKind::Dot
239                }
240                '*' => {
241                    state.advance(1);
242                    WitSyntaxKind::Star
243                }
244                '/' => {
245                    state.advance(1);
246                    WitSyntaxKind::Slash
247                }
248                '@' => {
249                    state.advance(1);
250                    WitSyntaxKind::At
251                }
252                '-' => {
253                    state.advance(1);
254                    if state.peek() == Some('>') {
255                        state.advance(1);
256                        WitSyntaxKind::Arrow
257                    }
258                    else {
259                        WitSyntaxKind::Minus
260                    }
261                }
262                _ => return false,
263            };
264
265            state.add_token(token_kind, start_pos, state.get_position());
266            return true;
267        }
268
269        false
270    }
271
272    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
273        let start_pos = state.get_position();
274
275        if let Some(ch) = state.peek() {
276            state.advance(ch.len_utf8());
277            state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
278            return true;
279        }
280
281        false
282    }
283}