1#![doc = include_str!("readme.md")]
2pub mod token_type;
8
9use crate::{language::WitLanguage, lexer::token_type::WitTokenType};
10use oak_core::{
11 Lexer, LexerCache, LexerState, OakError, TextEdit,
12 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
13 source::Source,
14};
15use std::sync::LazyLock;
16
/// Shorthand for the lexer state threaded through every scanning helper below.
pub(crate) type State<'a, S> = LexerState<'a, S, WitLanguage>;
18
// Shared, lazily-initialized scanner configurations reused across all lexer instances.

/// Whitespace scanner; `unicode_whitespace: true` accepts any Unicode whitespace.
static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner: `//` line comments plus `/* ... */` block comments, with nesting allowed.
static WIT_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
/// String scanner: double-quoted literals with backslash escapes.
static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
22
/// Lexer for the WIT (WebAssembly Interface Type) text format.
///
/// Borrows its language configuration for `'config`, so the lexer itself is
/// cheap to clone.
#[derive(Clone)]
pub struct WitLexer<'config> {
    /// Language configuration supplied at construction time.
    config: &'config WitLanguage,
}
31
impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
    /// Tokenizes `source` in a single pass and finalizes the result through `cache`.
    ///
    /// NOTE: `_edits` is currently ignored — the entire source is re-lexed on
    /// every call rather than lexed incrementally.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WitLanguage>) -> LexOutput<WitLanguage> {
        let mut state: State<'_, S> = LexerState::new(source);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}
39
40impl<'config> WitLexer<'config> {
41 pub fn new(config: &'config WitLanguage) -> Self {
48 Self { config }
49 }
50
51 fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
52 while state.not_at_end() {
53 if self.skip_whitespace(state) {
54 continue;
55 }
56
57 if self.skip_comment(state) {
58 continue;
59 }
60
61 if self.lex_string_literal(state) {
62 continue;
63 }
64
65 if self.lex_number_literal(state) {
66 continue;
67 }
68
69 if self.lex_identifier_or_keyword(state) {
70 continue;
71 }
72
73 if self.lex_punctuation(state) {
74 continue;
75 }
76
77 if self.lex_text(state) {
78 continue;
79 }
80
81 let start_pos = state.get_position();
83 if let Some(ch) = state.peek() {
84 state.advance(ch.len_utf8());
85 state.add_token(WitTokenType::Error, start_pos, state.get_position());
86 }
87 }
88
89 let eof_pos = state.get_position();
91 state.add_token(WitTokenType::Eof, eof_pos, eof_pos);
92 Ok(())
93 }
94
95 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
96 WIT_WHITESPACE.scan(state, WitTokenType::Whitespace)
97 }
98
99 fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
100 WIT_COMMENT.scan(state, WitTokenType::Comment, WitTokenType::Comment)
101 }
102
103 fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
104 WIT_STRING.scan(state, WitTokenType::StringLiteral)
105 }
106
107 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
108 let start_pos = state.get_position();
109 let mut has_digits = false;
110
111 while let Some(ch) = state.peek() {
113 if ch.is_ascii_digit() {
114 state.advance(1);
115 has_digits = true;
116 }
117 else {
118 break;
119 }
120 }
121
122 if has_digits {
123 state.add_token(WitTokenType::IntegerLiteral, start_pos, state.get_position());
124 return true;
125 }
126
127 false
128 }
129
130 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
131 let start_pos = state.get_position();
132
133 if let Some(ch) = state.peek() {
134 if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
135 state.advance(ch.len_utf8());
136
137 while let Some(ch) = state.peek() {
139 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
140 state.advance(ch.len_utf8());
141 }
142 else {
143 break;
144 }
145 }
146
147 let text = state.get_text_from(start_pos);
148 let token_kind = match text.as_ref() {
149 "world" => WitTokenType::WorldKw,
151 "interface" => WitTokenType::InterfaceKw,
152 "package" => WitTokenType::PackageKw,
153 "component" => WitTokenType::ComponentKw,
154 "instance" => WitTokenType::InstanceKw,
155 "module" => WitTokenType::ModuleKw,
156 "core" => WitTokenType::CoreKw,
157 "func" => WitTokenType::FuncKw,
158 "type" => WitTokenType::TypeKw,
159 "record" => WitTokenType::RecordKw,
160 "variant" => WitTokenType::VariantKw,
161 "enum" => WitTokenType::EnumKw,
162 "flags" => WitTokenType::FlagsKw,
163 "union" => WitTokenType::UnionKw,
164 "tuple" => WitTokenType::TupleKw,
165 "list" => WitTokenType::ListKw,
166 "option" => WitTokenType::OptionKw,
167 "result" => WitTokenType::ResultKw,
168 "static" => WitTokenType::StaticKw,
169 "constructor" => WitTokenType::ConstructorKw,
170 "method" => WitTokenType::MethodKw,
171 "import" => WitTokenType::ImportKw,
172 "export" => WitTokenType::ExportKw,
173 "use" => WitTokenType::UseKw,
174 "include" => WitTokenType::IncludeKw,
175 "with" => WitTokenType::WithKw,
176 "resource" => WitTokenType::ResourceKw,
177 "bool" => WitTokenType::BoolKw,
178 "u8" => WitTokenType::U8Kw,
179 "u16" => WitTokenType::U16Kw,
180 "u32" => WitTokenType::U32Kw,
181 "u64" => WitTokenType::U64Kw,
182 "s8" => WitTokenType::S8Kw,
183 "s16" => WitTokenType::S16Kw,
184 "s32" => WitTokenType::S32Kw,
185 "s64" => WitTokenType::S64Kw,
186 "f32" => WitTokenType::F32Kw,
187 "f64" => WitTokenType::F64Kw,
188 "char" => WitTokenType::CharKw,
189 "string" => WitTokenType::StringKw,
190 _ => WitTokenType::Identifier,
191 };
192
193 state.add_token(token_kind, start_pos, state.get_position());
194 return true;
195 }
196 }
197
198 false
199 }
200
201 fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
202 let start_pos = state.get_position();
203
204 if let Some(ch) = state.peek() {
205 let token_kind = match ch {
206 '(' => {
207 state.advance(1);
208 WitTokenType::LeftParen
209 }
210 ')' => {
211 state.advance(1);
212 WitTokenType::RightParen
213 }
214 '{' => {
215 state.advance(1);
216 WitTokenType::LeftBrace
217 }
218 '}' => {
219 state.advance(1);
220 WitTokenType::RightBrace
221 }
222 '[' => {
223 state.advance(1);
224 WitTokenType::LeftBracket
225 }
226 ']' => {
227 state.advance(1);
228 WitTokenType::RightBracket
229 }
230 '<' => {
231 state.advance(1);
232 WitTokenType::Lt
233 }
234 '>' => {
235 state.advance(1);
236 WitTokenType::Gt
237 }
238 ',' => {
239 state.advance(1);
240 WitTokenType::Comma
241 }
242 ';' => {
243 state.advance(1);
244 WitTokenType::Semicolon
245 }
246 ':' => {
247 state.advance(1);
248 WitTokenType::Colon
249 }
250 '=' => {
251 state.advance(1);
252 WitTokenType::Assign
253 }
254 '.' => {
255 state.advance(1);
256 WitTokenType::Dot
257 }
258 '*' => {
259 state.advance(1);
260 WitTokenType::Star
261 }
262 '/' => {
263 state.advance(1);
264 WitTokenType::Slash
265 }
266 '@' => {
267 state.advance(1);
268 WitTokenType::At
269 }
270 '-' => {
271 state.advance(1);
272 if state.peek() == Some('>') {
273 state.advance(1);
274 WitTokenType::Arrow
275 }
276 else {
277 WitTokenType::Minus
278 }
279 }
280 _ => return false,
281 };
282
283 state.add_token(token_kind, start_pos, state.get_position());
284 return true;
285 }
286
287 false
288 }
289
290 fn lex_text<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
291 let start_pos = state.get_position();
292
293 if let Some(ch) = state.peek() {
294 state.advance(ch.len_utf8());
295 state.add_token(WitTokenType::Error, start_pos, state.get_position());
296 return true;
297 }
298
299 false
300 }
301}