1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::WitLanguage, lexer::token_type::WitTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError, TextEdit,
7 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8 source::Source,
9};
10use std::sync::LazyLock;
11
/// Shorthand for the framework lexer state specialized to the WIT language.
pub(crate) type State<'a, S> = LexerState<'a, S, WitLanguage>;

/// Whitespace scanner: accepts any Unicode whitespace between tokens.
static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner: `//` line comments and `/* ... */` block comments;
/// `nested_blocks` allows block comments to nest, matching the WIT grammar.
static WIT_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
/// String scanner: double-quoted literals with `\`-escape sequences.
static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
/// Tokenizer for WebAssembly Interface Type (WIT) source text.
///
/// Borrows the language configuration for its lifetime; construct via
/// [`WitLexer::new`].
#[derive(Clone)]
pub struct WitLexer<'config> {
    // NOTE(review): never read during scanning in this file — kept so future
    // lexing decisions can consult language-level configuration.
    config: &'config WitLanguage,
}
22
impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
    /// Tokenizes `source` from scratch and records the result in `cache`.
    ///
    /// Incremental relexing is not implemented here: `_edits` is ignored and
    /// the whole source is rescanned on every call.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WitLanguage>) -> LexOutput<WitLanguage> {
        let mut state: State<'_, S> = LexerState::new(source);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}
30
31impl<'config> WitLexer<'config> {
32 pub fn new(config: &'config WitLanguage) -> Self {
33 Self { config }
34 }
35
36 fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
37 while state.not_at_end() {
38 if self.skip_whitespace(state) {
39 continue;
40 }
41
42 if self.skip_comment(state) {
43 continue;
44 }
45
46 if self.lex_string_literal(state) {
47 continue;
48 }
49
50 if self.lex_number_literal(state) {
51 continue;
52 }
53
54 if self.lex_identifier_or_keyword(state) {
55 continue;
56 }
57
58 if self.lex_punctuation(state) {
59 continue;
60 }
61
62 if self.lex_text(state) {
63 continue;
64 }
65
66 let start_pos = state.get_position();
68 if let Some(ch) = state.peek() {
69 state.advance(ch.len_utf8());
70 state.add_token(WitTokenType::Error, start_pos, state.get_position());
71 }
72 }
73
74 let eof_pos = state.get_position();
76 state.add_token(WitTokenType::Eof, eof_pos, eof_pos);
77 Ok(())
78 }
79
80 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
81 WIT_WHITESPACE.scan(state, WitTokenType::Whitespace)
82 }
83
84 fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
85 WIT_COMMENT.scan(state, WitTokenType::Comment, WitTokenType::Comment)
86 }
87
88 fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
89 WIT_STRING.scan(state, WitTokenType::StringLiteral)
90 }
91
92 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
93 let start_pos = state.get_position();
94 let mut has_digits = false;
95
96 while let Some(ch) = state.peek() {
98 if ch.is_ascii_digit() {
99 state.advance(1);
100 has_digits = true;
101 }
102 else {
103 break;
104 }
105 }
106
107 if has_digits {
108 state.add_token(WitTokenType::IntegerLiteral, start_pos, state.get_position());
109 return true;
110 }
111
112 false
113 }
114
115 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
116 let start_pos = state.get_position();
117
118 if let Some(ch) = state.peek() {
119 if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
120 state.advance(ch.len_utf8());
121
122 while let Some(ch) = state.peek() {
124 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
125 state.advance(ch.len_utf8());
126 }
127 else {
128 break;
129 }
130 }
131
132 let text = state.get_text_from(start_pos);
133 let token_kind = match text.as_ref() {
134 "world" => WitTokenType::WorldKw,
136 "interface" => WitTokenType::InterfaceKw,
137 "package" => WitTokenType::PackageKw,
138 "component" => WitTokenType::ComponentKw,
139 "instance" => WitTokenType::InstanceKw,
140 "module" => WitTokenType::ModuleKw,
141 "core" => WitTokenType::CoreKw,
142 "func" => WitTokenType::FuncKw,
143 "type" => WitTokenType::TypeKw,
144 "record" => WitTokenType::RecordKw,
145 "variant" => WitTokenType::VariantKw,
146 "enum" => WitTokenType::EnumKw,
147 "flags" => WitTokenType::FlagsKw,
148 "union" => WitTokenType::UnionKw,
149 "tuple" => WitTokenType::TupleKw,
150 "list" => WitTokenType::ListKw,
151 "option" => WitTokenType::OptionKw,
152 "result" => WitTokenType::ResultKw,
153 "static" => WitTokenType::StaticKw,
154 "constructor" => WitTokenType::ConstructorKw,
155 "method" => WitTokenType::MethodKw,
156 "import" => WitTokenType::ImportKw,
157 "export" => WitTokenType::ExportKw,
158 "use" => WitTokenType::UseKw,
159 "include" => WitTokenType::IncludeKw,
160 "with" => WitTokenType::WithKw,
161 "resource" => WitTokenType::ResourceKw,
162 "bool" => WitTokenType::BoolKw,
163 "u8" => WitTokenType::U8Kw,
164 "u16" => WitTokenType::U16Kw,
165 "u32" => WitTokenType::U32Kw,
166 "u64" => WitTokenType::U64Kw,
167 "s8" => WitTokenType::S8Kw,
168 "s16" => WitTokenType::S16Kw,
169 "s32" => WitTokenType::S32Kw,
170 "s64" => WitTokenType::S64Kw,
171 "f32" => WitTokenType::F32Kw,
172 "f64" => WitTokenType::F64Kw,
173 "char" => WitTokenType::CharKw,
174 "string" => WitTokenType::StringKw,
175 _ => WitTokenType::Identifier,
176 };
177
178 state.add_token(token_kind, start_pos, state.get_position());
179 return true;
180 }
181 }
182
183 false
184 }
185
186 fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
187 let start_pos = state.get_position();
188
189 if let Some(ch) = state.peek() {
190 let token_kind = match ch {
191 '(' => {
192 state.advance(1);
193 WitTokenType::LeftParen
194 }
195 ')' => {
196 state.advance(1);
197 WitTokenType::RightParen
198 }
199 '{' => {
200 state.advance(1);
201 WitTokenType::LeftBrace
202 }
203 '}' => {
204 state.advance(1);
205 WitTokenType::RightBrace
206 }
207 '[' => {
208 state.advance(1);
209 WitTokenType::LeftBracket
210 }
211 ']' => {
212 state.advance(1);
213 WitTokenType::RightBracket
214 }
215 '<' => {
216 state.advance(1);
217 WitTokenType::Lt
218 }
219 '>' => {
220 state.advance(1);
221 WitTokenType::Gt
222 }
223 ',' => {
224 state.advance(1);
225 WitTokenType::Comma
226 }
227 ';' => {
228 state.advance(1);
229 WitTokenType::Semicolon
230 }
231 ':' => {
232 state.advance(1);
233 WitTokenType::Colon
234 }
235 '=' => {
236 state.advance(1);
237 WitTokenType::Assign
238 }
239 '.' => {
240 state.advance(1);
241 WitTokenType::Dot
242 }
243 '*' => {
244 state.advance(1);
245 WitTokenType::Star
246 }
247 '/' => {
248 state.advance(1);
249 WitTokenType::Slash
250 }
251 '@' => {
252 state.advance(1);
253 WitTokenType::At
254 }
255 '-' => {
256 state.advance(1);
257 if state.peek() == Some('>') {
258 state.advance(1);
259 WitTokenType::Arrow
260 }
261 else {
262 WitTokenType::Minus
263 }
264 }
265 _ => return false,
266 };
267
268 state.add_token(token_kind, start_pos, state.get_position());
269 return true;
270 }
271
272 false
273 }
274
275 fn lex_text<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
276 let start_pos = state.get_position();
277
278 if let Some(ch) = state.peek() {
279 state.advance(ch.len_utf8());
280 state.add_token(WitTokenType::Error, start_pos, state.get_position());
281 return true;
282 }
283
284 false
285 }
286}