// oak_wit/lexer/mod.rs

1use crate::{kind::WitSyntaxKind, language::WitLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, WitLanguage>;
10
11static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WIT_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
13static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct WitLexer<'config> {
17    config: &'config WitLanguage,
18}
19
20impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<WitLanguage>,
26    ) -> LexOutput<WitLanguage> {
27        let mut state = LexerState::new(source);
28        let _ = self.run(&mut state);
29        state.finish(Ok(()))
30    }
31}
32
33impl<'config> WitLexer<'config> {
34    pub fn new(config: &'config WitLanguage) -> Self {
35        Self { config }
36    }
37
38    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.skip_comment(state) {
45                continue;
46            }
47
48            if self.lex_string_literal(state) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_punctuation(state) {
61                continue;
62            }
63
64            if self.lex_text(state) {
65                continue;
66            }
67
68            // 如果所有规则都不匹配,跳过当前字符并标记为错误
69            let start_pos = state.get_position();
70            if let Some(ch) = state.peek() {
71                state.advance(ch.len_utf8());
72                state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
73            }
74        }
75
76        // 添加 EOF token
77        let eof_pos = state.get_position();
78        state.add_token(WitSyntaxKind::Eof, eof_pos, eof_pos);
79        Ok(())
80    }
81
82    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
83        match WIT_WHITESPACE.scan(state.rest(), state.get_position(), WitSyntaxKind::Whitespace) {
84            Some(token) => {
85                state.advance_with(token);
86                return true;
87            }
88            None => {}
89        }
90        false
91    }
92
93    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
94        match WIT_COMMENT.scan(state.rest(), state.get_position(), WitSyntaxKind::Comment) {
95            Some(token) => {
96                state.advance_with(token);
97                return true;
98            }
99            None => {}
100        }
101        false
102    }
103
104    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
105        let start = state.get_position();
106        match WIT_STRING.scan(state.rest(), start, WitSyntaxKind::StringLiteral) {
107            Some(token) => {
108                state.advance_with(token);
109                return true;
110            }
111            None => {}
112        }
113        false
114    }
115
116    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
117        let start_pos = state.get_position();
118        let mut has_digits = false;
119
120        // 处理数字
121        while let Some(ch) = state.peek() {
122            if ch.is_ascii_digit() {
123                state.advance(1);
124                has_digits = true;
125            }
126            else {
127                break;
128            }
129        }
130
131        if has_digits {
132            state.add_token(WitSyntaxKind::IntegerLiteral, start_pos, state.get_position());
133            return true;
134        }
135
136        false
137    }
138
139    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
140        let start_pos = state.get_position();
141
142        if let Some(ch) = state.peek() {
143            if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
144                state.advance(ch.len_utf8());
145
146                // 继续读取标识符字符
147                while let Some(ch) = state.peek() {
148                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
149                        state.advance(ch.len_utf8());
150                    }
151                    else {
152                        break;
153                    }
154                }
155
156                let text = state.get_text_in((start_pos..state.get_position()).into());
157                let token_kind = match text {
158                    // WIT 关键字
159                    "world" => WitSyntaxKind::WorldKw,
160                    "interface" => WitSyntaxKind::InterfaceKw,
161                    "package" => WitSyntaxKind::PackageKw,
162                    "component" => WitSyntaxKind::ComponentKw,
163                    "instance" => WitSyntaxKind::InstanceKw,
164                    "module" => WitSyntaxKind::ModuleKw,
165                    "core" => WitSyntaxKind::CoreKw,
166                    "func" => WitSyntaxKind::FuncKw,
167                    "type" => WitSyntaxKind::TypeKw,
168                    "record" => WitSyntaxKind::RecordKw,
169                    "variant" => WitSyntaxKind::VariantKw,
170                    "enum" => WitSyntaxKind::EnumKw,
171                    "flags" => WitSyntaxKind::FlagsKw,
172                    "union" => WitSyntaxKind::UnionKw,
173                    "tuple" => WitSyntaxKind::TupleKw,
174                    "list" => WitSyntaxKind::ListKw,
175                    "option" => WitSyntaxKind::OptionKw,
176                    "result" => WitSyntaxKind::ResultKw,
177                    "static" => WitSyntaxKind::StaticKw,
178                    "constructor" => WitSyntaxKind::ConstructorKw,
179                    "method" => WitSyntaxKind::MethodKw,
180                    "import" => WitSyntaxKind::ImportKw,
181                    "export" => WitSyntaxKind::ExportKw,
182                    "use" => WitSyntaxKind::UseKw,
183                    "include" => WitSyntaxKind::IncludeKw,
184                    "with" => WitSyntaxKind::WithKw,
185                    "resource" => WitSyntaxKind::ResourceKw,
186                    "bool" => WitSyntaxKind::BoolKw,
187                    "u8" => WitSyntaxKind::U8Kw,
188                    "u16" => WitSyntaxKind::U16Kw,
189                    "u32" => WitSyntaxKind::U32Kw,
190                    "u64" => WitSyntaxKind::U64Kw,
191                    "s8" => WitSyntaxKind::S8Kw,
192                    "s16" => WitSyntaxKind::S16Kw,
193                    "s32" => WitSyntaxKind::S32Kw,
194                    "s64" => WitSyntaxKind::S64Kw,
195                    "f32" => WitSyntaxKind::F32Kw,
196                    "f64" => WitSyntaxKind::F64Kw,
197                    "char" => WitSyntaxKind::CharKw,
198                    "string" => WitSyntaxKind::StringKw,
199                    _ => WitSyntaxKind::Identifier,
200                };
201
202                state.add_token(token_kind, start_pos, state.get_position());
203                return true;
204            }
205        }
206
207        false
208    }
209
210    fn lex_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
211        let start_pos = state.get_position();
212
213        if let Some(ch) = state.peek() {
214            let token_kind = match ch {
215                '(' => {
216                    state.advance(1);
217                    WitSyntaxKind::LeftParen
218                }
219                ')' => {
220                    state.advance(1);
221                    WitSyntaxKind::RightParen
222                }
223                '{' => {
224                    state.advance(1);
225                    WitSyntaxKind::LeftBrace
226                }
227                '}' => {
228                    state.advance(1);
229                    WitSyntaxKind::RightBrace
230                }
231                '[' => {
232                    state.advance(1);
233                    WitSyntaxKind::LeftBracket
234                }
235                ']' => {
236                    state.advance(1);
237                    WitSyntaxKind::RightBracket
238                }
239                '<' => {
240                    state.advance(1);
241                    WitSyntaxKind::Lt
242                }
243                '>' => {
244                    state.advance(1);
245                    WitSyntaxKind::Gt
246                }
247                ',' => {
248                    state.advance(1);
249                    WitSyntaxKind::Comma
250                }
251                ';' => {
252                    state.advance(1);
253                    WitSyntaxKind::Semicolon
254                }
255                ':' => {
256                    state.advance(1);
257                    WitSyntaxKind::Colon
258                }
259                '=' => {
260                    state.advance(1);
261                    WitSyntaxKind::Assign
262                }
263                '.' => {
264                    state.advance(1);
265                    WitSyntaxKind::Dot
266                }
267                '*' => {
268                    state.advance(1);
269                    WitSyntaxKind::Star
270                }
271                '/' => {
272                    state.advance(1);
273                    WitSyntaxKind::Slash
274                }
275                '@' => {
276                    state.advance(1);
277                    WitSyntaxKind::At
278                }
279                '-' => {
280                    state.advance(1);
281                    if state.peek() == Some('>') {
282                        state.advance(1);
283                        WitSyntaxKind::Arrow
284                    }
285                    else {
286                        WitSyntaxKind::Minus
287                    }
288                }
289                _ => return false,
290            };
291
292            state.add_token(token_kind, start_pos, state.get_position());
293            return true;
294        }
295
296        false
297    }
298
299    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
300        let start_pos = state.get_position();
301
302        if let Some(ch) = state.peek() {
303            state.advance(ch.len_utf8());
304            state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
305            return true;
306        }
307
308        false
309    }
310}