1use crate::{kind::WitSyntaxKind, language::WitLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, WitLanguage>;
10
11static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WIT_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
13static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct WitLexer<'config> {
17 config: &'config WitLanguage,
18}
19
20impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
21 fn lex_incremental(
22 &self,
23 source: impl Source,
24 changed: usize,
25 cache: IncrementalCache<WitLanguage>,
26 ) -> LexOutput<WitLanguage> {
27 let mut state = LexerState::new(source);
28 let _ = self.run(&mut state);
29 state.finish(Ok(()))
30 }
31}
32
33impl<'config> WitLexer<'config> {
34 pub fn new(config: &'config WitLanguage) -> Self {
35 Self { config }
36 }
37
38 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39 while state.not_at_end() {
40 if self.skip_whitespace(state) {
41 continue;
42 }
43
44 if self.skip_comment(state) {
45 continue;
46 }
47
48 if self.lex_string_literal(state) {
49 continue;
50 }
51
52 if self.lex_number_literal(state) {
53 continue;
54 }
55
56 if self.lex_identifier_or_keyword(state) {
57 continue;
58 }
59
60 if self.lex_punctuation(state) {
61 continue;
62 }
63
64 if self.lex_text(state) {
65 continue;
66 }
67
68 let start_pos = state.get_position();
70 if let Some(ch) = state.peek() {
71 state.advance(ch.len_utf8());
72 state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
73 }
74 }
75
76 let eof_pos = state.get_position();
78 state.add_token(WitSyntaxKind::Eof, eof_pos, eof_pos);
79 Ok(())
80 }
81
82 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
83 match WIT_WHITESPACE.scan(state.rest(), state.get_position(), WitSyntaxKind::Whitespace) {
84 Some(token) => {
85 state.advance_with(token);
86 return true;
87 }
88 None => {}
89 }
90 false
91 }
92
93 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
94 match WIT_COMMENT.scan(state.rest(), state.get_position(), WitSyntaxKind::Comment) {
95 Some(token) => {
96 state.advance_with(token);
97 return true;
98 }
99 None => {}
100 }
101 false
102 }
103
104 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
105 let start = state.get_position();
106 match WIT_STRING.scan(state.rest(), start, WitSyntaxKind::StringLiteral) {
107 Some(token) => {
108 state.advance_with(token);
109 return true;
110 }
111 None => {}
112 }
113 false
114 }
115
116 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
117 let start_pos = state.get_position();
118 let mut has_digits = false;
119
120 while let Some(ch) = state.peek() {
122 if ch.is_ascii_digit() {
123 state.advance(1);
124 has_digits = true;
125 }
126 else {
127 break;
128 }
129 }
130
131 if has_digits {
132 state.add_token(WitSyntaxKind::IntegerLiteral, start_pos, state.get_position());
133 return true;
134 }
135
136 false
137 }
138
139 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
140 let start_pos = state.get_position();
141
142 if let Some(ch) = state.peek() {
143 if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
144 state.advance(ch.len_utf8());
145
146 while let Some(ch) = state.peek() {
148 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
149 state.advance(ch.len_utf8());
150 }
151 else {
152 break;
153 }
154 }
155
156 let text = state.get_text_in((start_pos..state.get_position()).into());
157 let token_kind = match text {
158 "world" => WitSyntaxKind::WorldKw,
160 "interface" => WitSyntaxKind::InterfaceKw,
161 "package" => WitSyntaxKind::PackageKw,
162 "component" => WitSyntaxKind::ComponentKw,
163 "instance" => WitSyntaxKind::InstanceKw,
164 "module" => WitSyntaxKind::ModuleKw,
165 "core" => WitSyntaxKind::CoreKw,
166 "func" => WitSyntaxKind::FuncKw,
167 "type" => WitSyntaxKind::TypeKw,
168 "record" => WitSyntaxKind::RecordKw,
169 "variant" => WitSyntaxKind::VariantKw,
170 "enum" => WitSyntaxKind::EnumKw,
171 "flags" => WitSyntaxKind::FlagsKw,
172 "union" => WitSyntaxKind::UnionKw,
173 "tuple" => WitSyntaxKind::TupleKw,
174 "list" => WitSyntaxKind::ListKw,
175 "option" => WitSyntaxKind::OptionKw,
176 "result" => WitSyntaxKind::ResultKw,
177 "static" => WitSyntaxKind::StaticKw,
178 "constructor" => WitSyntaxKind::ConstructorKw,
179 "method" => WitSyntaxKind::MethodKw,
180 "import" => WitSyntaxKind::ImportKw,
181 "export" => WitSyntaxKind::ExportKw,
182 "use" => WitSyntaxKind::UseKw,
183 "include" => WitSyntaxKind::IncludeKw,
184 "with" => WitSyntaxKind::WithKw,
185 "resource" => WitSyntaxKind::ResourceKw,
186 "bool" => WitSyntaxKind::BoolKw,
187 "u8" => WitSyntaxKind::U8Kw,
188 "u16" => WitSyntaxKind::U16Kw,
189 "u32" => WitSyntaxKind::U32Kw,
190 "u64" => WitSyntaxKind::U64Kw,
191 "s8" => WitSyntaxKind::S8Kw,
192 "s16" => WitSyntaxKind::S16Kw,
193 "s32" => WitSyntaxKind::S32Kw,
194 "s64" => WitSyntaxKind::S64Kw,
195 "f32" => WitSyntaxKind::F32Kw,
196 "f64" => WitSyntaxKind::F64Kw,
197 "char" => WitSyntaxKind::CharKw,
198 "string" => WitSyntaxKind::StringKw,
199 _ => WitSyntaxKind::Identifier,
200 };
201
202 state.add_token(token_kind, start_pos, state.get_position());
203 return true;
204 }
205 }
206
207 false
208 }
209
210 fn lex_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
211 let start_pos = state.get_position();
212
213 if let Some(ch) = state.peek() {
214 let token_kind = match ch {
215 '(' => {
216 state.advance(1);
217 WitSyntaxKind::LeftParen
218 }
219 ')' => {
220 state.advance(1);
221 WitSyntaxKind::RightParen
222 }
223 '{' => {
224 state.advance(1);
225 WitSyntaxKind::LeftBrace
226 }
227 '}' => {
228 state.advance(1);
229 WitSyntaxKind::RightBrace
230 }
231 '[' => {
232 state.advance(1);
233 WitSyntaxKind::LeftBracket
234 }
235 ']' => {
236 state.advance(1);
237 WitSyntaxKind::RightBracket
238 }
239 '<' => {
240 state.advance(1);
241 WitSyntaxKind::Lt
242 }
243 '>' => {
244 state.advance(1);
245 WitSyntaxKind::Gt
246 }
247 ',' => {
248 state.advance(1);
249 WitSyntaxKind::Comma
250 }
251 ';' => {
252 state.advance(1);
253 WitSyntaxKind::Semicolon
254 }
255 ':' => {
256 state.advance(1);
257 WitSyntaxKind::Colon
258 }
259 '=' => {
260 state.advance(1);
261 WitSyntaxKind::Assign
262 }
263 '.' => {
264 state.advance(1);
265 WitSyntaxKind::Dot
266 }
267 '*' => {
268 state.advance(1);
269 WitSyntaxKind::Star
270 }
271 '/' => {
272 state.advance(1);
273 WitSyntaxKind::Slash
274 }
275 '@' => {
276 state.advance(1);
277 WitSyntaxKind::At
278 }
279 '-' => {
280 state.advance(1);
281 if state.peek() == Some('>') {
282 state.advance(1);
283 WitSyntaxKind::Arrow
284 }
285 else {
286 WitSyntaxKind::Minus
287 }
288 }
289 _ => return false,
290 };
291
292 state.add_token(token_kind, start_pos, state.get_position());
293 return true;
294 }
295
296 false
297 }
298
299 fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
300 let start_pos = state.get_position();
301
302 if let Some(ch) = state.peek() {
303 state.advance(ch.len_utf8());
304 state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
305 return true;
306 }
307
308 false
309 }
310}