1use crate::{kind::WitSyntaxKind, language::WitLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError, TextEdit,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the WIT language; `S` is the source type being scanned.
type State<'a, S> = LexerState<'a, S, WitLanguage>;

/// Whitespace scanner configuration, with the `unicode_whitespace` option switched on.
static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration: `//` line comments and `/* ... */` block comments, with `nested_blocks` switched on.
static WIT_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
/// String scanner configuration: `"` is the only quote character and `\` is the escape character.
static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer that turns WIT source text into `WitSyntaxKind` tokens.
///
/// Borrows the [`WitLanguage`] it was created with. The leading underscore on
/// the field reflects that none of the methods visible in this file read it.
#[derive(Clone)]
pub struct WitLexer<'config> {
    /// Language configuration supplied to [`WitLexer::new`].
    _config: &'config WitLanguage,
}
19
20impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
21 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WitLanguage>) -> LexOutput<WitLanguage> {
22 let mut state: State<'_, S> = LexerState::new(source);
23 let result = self.run(&mut state);
24 state.finish_with_cache(result, cache)
25 }
26}
27
28impl<'config> WitLexer<'config> {
    /// Creates a lexer that borrows `config` for the lifetime `'config`.
    pub fn new(config: &'config WitLanguage) -> Self {
        Self { _config: config }
    }
32
33 fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
34 while state.not_at_end() {
35 if self.skip_whitespace(state) {
36 continue;
37 }
38
39 if self.skip_comment(state) {
40 continue;
41 }
42
43 if self.lex_string_literal(state) {
44 continue;
45 }
46
47 if self.lex_number_literal(state) {
48 continue;
49 }
50
51 if self.lex_identifier_or_keyword(state) {
52 continue;
53 }
54
55 if self.lex_punctuation(state) {
56 continue;
57 }
58
59 if self.lex_text(state) {
60 continue;
61 }
62
63 let start_pos = state.get_position();
65 if let Some(ch) = state.peek() {
66 state.advance(ch.len_utf8());
67 state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
68 }
69 }
70
71 let eof_pos = state.get_position();
73 state.add_token(WitSyntaxKind::Eof, eof_pos, eof_pos);
74 Ok(())
75 }
76
    /// Runs the `WIT_WHITESPACE` scanner at the current position with the
    /// `Whitespace` token kind.
    ///
    /// Returns the scanner's result; `run` treats `true` as "input was
    /// consumed" and restarts its loop. NOTE(review): despite the `skip_`
    /// name, passing a token kind suggests a token is recorded — confirm
    /// against `WhitespaceConfig::scan`.
    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        WIT_WHITESPACE.scan(state, WitSyntaxKind::Whitespace)
    }
80
    /// Runs the `WIT_COMMENT` scanner at the current position.
    ///
    /// Both kind arguments are `Comment`, so whatever distinction the scanner
    /// makes between them (presumably line vs. block comments — confirm
    /// against `CommentConfig::scan`) is not visible in the token kind.
    /// Returns the scanner's result; `run` treats `true` as "input was
    /// consumed".
    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        WIT_COMMENT.scan(state, WitSyntaxKind::Comment, WitSyntaxKind::Comment)
    }
84
    /// Runs the `WIT_STRING` scanner at the current position with the
    /// `StringLiteral` token kind.
    ///
    /// Returns the scanner's result; `run` treats `true` as "input was
    /// consumed". How unterminated strings are handled is decided by
    /// `StringConfig::scan` and is not visible here.
    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        WIT_STRING.scan(state, WitSyntaxKind::StringLiteral)
    }
88
89 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
90 let start_pos = state.get_position();
91 let mut has_digits = false;
92
93 while let Some(ch) = state.peek() {
95 if ch.is_ascii_digit() {
96 state.advance(1);
97 has_digits = true;
98 }
99 else {
100 break;
101 }
102 }
103
104 if has_digits {
105 state.add_token(WitSyntaxKind::IntegerLiteral, start_pos, state.get_position());
106 return true;
107 }
108
109 false
110 }
111
112 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
113 let start_pos = state.get_position();
114
115 if let Some(ch) = state.peek() {
116 if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
117 state.advance(ch.len_utf8());
118
119 while let Some(ch) = state.peek() {
121 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
122 state.advance(ch.len_utf8());
123 }
124 else {
125 break;
126 }
127 }
128
129 let text = state.get_text_from(start_pos);
130 let token_kind = match text.as_ref() {
131 "world" => WitSyntaxKind::WorldKw,
133 "interface" => WitSyntaxKind::InterfaceKw,
134 "package" => WitSyntaxKind::PackageKw,
135 "component" => WitSyntaxKind::ComponentKw,
136 "instance" => WitSyntaxKind::InstanceKw,
137 "module" => WitSyntaxKind::ModuleKw,
138 "core" => WitSyntaxKind::CoreKw,
139 "func" => WitSyntaxKind::FuncKw,
140 "type" => WitSyntaxKind::TypeKw,
141 "record" => WitSyntaxKind::RecordKw,
142 "variant" => WitSyntaxKind::VariantKw,
143 "enum" => WitSyntaxKind::EnumKw,
144 "flags" => WitSyntaxKind::FlagsKw,
145 "union" => WitSyntaxKind::UnionKw,
146 "tuple" => WitSyntaxKind::TupleKw,
147 "list" => WitSyntaxKind::ListKw,
148 "option" => WitSyntaxKind::OptionKw,
149 "result" => WitSyntaxKind::ResultKw,
150 "static" => WitSyntaxKind::StaticKw,
151 "constructor" => WitSyntaxKind::ConstructorKw,
152 "method" => WitSyntaxKind::MethodKw,
153 "import" => WitSyntaxKind::ImportKw,
154 "export" => WitSyntaxKind::ExportKw,
155 "use" => WitSyntaxKind::UseKw,
156 "include" => WitSyntaxKind::IncludeKw,
157 "with" => WitSyntaxKind::WithKw,
158 "resource" => WitSyntaxKind::ResourceKw,
159 "bool" => WitSyntaxKind::BoolKw,
160 "u8" => WitSyntaxKind::U8Kw,
161 "u16" => WitSyntaxKind::U16Kw,
162 "u32" => WitSyntaxKind::U32Kw,
163 "u64" => WitSyntaxKind::U64Kw,
164 "s8" => WitSyntaxKind::S8Kw,
165 "s16" => WitSyntaxKind::S16Kw,
166 "s32" => WitSyntaxKind::S32Kw,
167 "s64" => WitSyntaxKind::S64Kw,
168 "f32" => WitSyntaxKind::F32Kw,
169 "f64" => WitSyntaxKind::F64Kw,
170 "char" => WitSyntaxKind::CharKw,
171 "string" => WitSyntaxKind::StringKw,
172 _ => WitSyntaxKind::Identifier,
173 };
174
175 state.add_token(token_kind, start_pos, state.get_position());
176 return true;
177 }
178 }
179
180 false
181 }
182
183 fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
184 let start_pos = state.get_position();
185
186 if let Some(ch) = state.peek() {
187 let token_kind = match ch {
188 '(' => {
189 state.advance(1);
190 WitSyntaxKind::LeftParen
191 }
192 ')' => {
193 state.advance(1);
194 WitSyntaxKind::RightParen
195 }
196 '{' => {
197 state.advance(1);
198 WitSyntaxKind::LeftBrace
199 }
200 '}' => {
201 state.advance(1);
202 WitSyntaxKind::RightBrace
203 }
204 '[' => {
205 state.advance(1);
206 WitSyntaxKind::LeftBracket
207 }
208 ']' => {
209 state.advance(1);
210 WitSyntaxKind::RightBracket
211 }
212 '<' => {
213 state.advance(1);
214 WitSyntaxKind::Lt
215 }
216 '>' => {
217 state.advance(1);
218 WitSyntaxKind::Gt
219 }
220 ',' => {
221 state.advance(1);
222 WitSyntaxKind::Comma
223 }
224 ';' => {
225 state.advance(1);
226 WitSyntaxKind::Semicolon
227 }
228 ':' => {
229 state.advance(1);
230 WitSyntaxKind::Colon
231 }
232 '=' => {
233 state.advance(1);
234 WitSyntaxKind::Assign
235 }
236 '.' => {
237 state.advance(1);
238 WitSyntaxKind::Dot
239 }
240 '*' => {
241 state.advance(1);
242 WitSyntaxKind::Star
243 }
244 '/' => {
245 state.advance(1);
246 WitSyntaxKind::Slash
247 }
248 '@' => {
249 state.advance(1);
250 WitSyntaxKind::At
251 }
252 '-' => {
253 state.advance(1);
254 if state.peek() == Some('>') {
255 state.advance(1);
256 WitSyntaxKind::Arrow
257 }
258 else {
259 WitSyntaxKind::Minus
260 }
261 }
262 _ => return false,
263 };
264
265 state.add_token(token_kind, start_pos, state.get_position());
266 return true;
267 }
268
269 false
270 }
271
272 fn lex_text<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
273 let start_pos = state.get_position();
274
275 if let Some(ch) = state.peek() {
276 state.advance(ch.len_utf8());
277 state.add_token(WitSyntaxKind::Error, start_pos, state.get_position());
278 return true;
279 }
280
281 false
282 }
283}