Skip to main content

oak_wit_component/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions for WIT (WebAssembly Interface Types) lexical analysis.
3///
4/// This module provides [`WitTokenType`] which defines all token types
5/// recognized by the WIT lexer, including keywords, identifiers, literals,
6/// and punctuation.
7pub mod token_type;
8
9use crate::{language::WitLanguage, lexer::token_type::WitTokenType};
10use oak_core::{
11    Lexer, LexerCache, LexerState, OakError, TextEdit,
12    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
13    source::Source,
14};
15use std::sync::LazyLock;
16
/// Lexer state specialised to [`WitLanguage`]; used as the `state` parameter type of every scanner method in this module.
pub(crate) type State<'a, S> = LexerState<'a, S, WitLanguage>;
18
/// Whitespace scanning configuration, with `unicode_whitespace` enabled.
static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanning configuration: `//` line comments and `/* ... */` block comments, with `nested_blocks` enabled.
static WIT_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
/// String scanning configuration: `"` is the only quote character and `\` is the escape character.
static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
22
/// Lexer for WIT (WebAssembly Interface Types) files.
///
/// This lexer tokenizes WIT source code into a sequence of tokens
/// that can be processed by the parser.
#[derive(Clone)]
pub struct WitLexer<'config> {
    /// Language configuration borrowed for the `'config` lifetime; stored by [`WitLexer::new`].
    config: &'config WitLanguage,
}
31
32impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
33    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WitLanguage>) -> LexOutput<WitLanguage> {
34        let mut state: State<'_, S> = LexerState::new(source);
35        let result = self.run(&mut state);
36        state.finish_with_cache(result, cache)
37    }
38}
39
40impl<'config> WitLexer<'config> {
    /// Creates a new `WitLexer` instance with the specified language configuration.
    ///
    /// # Arguments
    ///
    /// * `config` - A reference to the `WitLanguage` configuration.
    pub fn new(config: &'config WitLanguage) -> Self {
        Self { config }
    }
50
51    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
52        while state.not_at_end() {
53            if self.skip_whitespace(state) {
54                continue;
55            }
56
57            if self.skip_comment(state) {
58                continue;
59            }
60
61            if self.lex_string_literal(state) {
62                continue;
63            }
64
65            if self.lex_number_literal(state) {
66                continue;
67            }
68
69            if self.lex_identifier_or_keyword(state) {
70                continue;
71            }
72
73            if self.lex_punctuation(state) {
74                continue;
75            }
76
77            if self.lex_text(state) {
78                continue;
79            }
80
81            // If no rules match, skip the current character and mark it as an error
82            let start_pos = state.get_position();
83            if let Some(ch) = state.peek() {
84                state.advance(ch.len_utf8());
85                state.add_token(WitTokenType::Error, start_pos, state.get_position());
86            }
87        }
88
89        // Add EOF token
90        let eof_pos = state.get_position();
91        state.add_token(WitTokenType::Eof, eof_pos, eof_pos);
92        Ok(())
93    }
94
    /// Delegates to the `WIT_WHITESPACE` scanner, tagging its output as `WitTokenType::Whitespace`.
    ///
    /// Returns the scanner's result; `run` treats `true` as "matched, restart the loop".
    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        WIT_WHITESPACE.scan(state, WitTokenType::Whitespace)
    }
98
    /// Delegates to the `WIT_COMMENT` scanner.
    ///
    /// `WitTokenType::Comment` is passed for both token-kind arguments
    /// (presumably the line-comment and block-comment kinds — confirm against `CommentConfig::scan`).
    /// Returns the scanner's result; `run` treats `true` as "matched, restart the loop".
    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        WIT_COMMENT.scan(state, WitTokenType::Comment, WitTokenType::Comment)
    }
102
    /// Delegates to the `WIT_STRING` scanner, tagging its output as `WitTokenType::StringLiteral`.
    ///
    /// Returns the scanner's result; `run` treats `true` as "matched, restart the loop".
    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        WIT_STRING.scan(state, WitTokenType::StringLiteral)
    }
106
107    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
108        let start_pos = state.get_position();
109        let mut has_digits = false;
110
111        // Handle digits
112        while let Some(ch) = state.peek() {
113            if ch.is_ascii_digit() {
114                state.advance(1);
115                has_digits = true;
116            }
117            else {
118                break;
119            }
120        }
121
122        if has_digits {
123            state.add_token(WitTokenType::IntegerLiteral, start_pos, state.get_position());
124            return true;
125        }
126
127        false
128    }
129
130    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
131        let start_pos = state.get_position();
132
133        if let Some(ch) = state.peek() {
134            if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
135                state.advance(ch.len_utf8());
136
137                // Continue reading identifier characters
138                while let Some(ch) = state.peek() {
139                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
140                        state.advance(ch.len_utf8());
141                    }
142                    else {
143                        break;
144                    }
145                }
146
147                let text = state.get_text_from(start_pos);
148                let token_kind = match text.as_ref() {
149                    // WIT keywords
150                    "world" => WitTokenType::WorldKw,
151                    "interface" => WitTokenType::InterfaceKw,
152                    "package" => WitTokenType::PackageKw,
153                    "component" => WitTokenType::ComponentKw,
154                    "instance" => WitTokenType::InstanceKw,
155                    "module" => WitTokenType::ModuleKw,
156                    "core" => WitTokenType::CoreKw,
157                    "func" => WitTokenType::FuncKw,
158                    "type" => WitTokenType::TypeKw,
159                    "record" => WitTokenType::RecordKw,
160                    "variant" => WitTokenType::VariantKw,
161                    "enum" => WitTokenType::EnumKw,
162                    "flags" => WitTokenType::FlagsKw,
163                    "union" => WitTokenType::UnionKw,
164                    "tuple" => WitTokenType::TupleKw,
165                    "list" => WitTokenType::ListKw,
166                    "option" => WitTokenType::OptionKw,
167                    "result" => WitTokenType::ResultKw,
168                    "static" => WitTokenType::StaticKw,
169                    "constructor" => WitTokenType::ConstructorKw,
170                    "method" => WitTokenType::MethodKw,
171                    "import" => WitTokenType::ImportKw,
172                    "export" => WitTokenType::ExportKw,
173                    "use" => WitTokenType::UseKw,
174                    "include" => WitTokenType::IncludeKw,
175                    "with" => WitTokenType::WithKw,
176                    "resource" => WitTokenType::ResourceKw,
177                    "bool" => WitTokenType::BoolKw,
178                    "u8" => WitTokenType::U8Kw,
179                    "u16" => WitTokenType::U16Kw,
180                    "u32" => WitTokenType::U32Kw,
181                    "u64" => WitTokenType::U64Kw,
182                    "s8" => WitTokenType::S8Kw,
183                    "s16" => WitTokenType::S16Kw,
184                    "s32" => WitTokenType::S32Kw,
185                    "s64" => WitTokenType::S64Kw,
186                    "f32" => WitTokenType::F32Kw,
187                    "f64" => WitTokenType::F64Kw,
188                    "char" => WitTokenType::CharKw,
189                    "string" => WitTokenType::StringKw,
190                    _ => WitTokenType::Identifier,
191                };
192
193                state.add_token(token_kind, start_pos, state.get_position());
194                return true;
195            }
196        }
197
198        false
199    }
200
201    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
202        let start_pos = state.get_position();
203
204        if let Some(ch) = state.peek() {
205            let token_kind = match ch {
206                '(' => {
207                    state.advance(1);
208                    WitTokenType::LeftParen
209                }
210                ')' => {
211                    state.advance(1);
212                    WitTokenType::RightParen
213                }
214                '{' => {
215                    state.advance(1);
216                    WitTokenType::LeftBrace
217                }
218                '}' => {
219                    state.advance(1);
220                    WitTokenType::RightBrace
221                }
222                '[' => {
223                    state.advance(1);
224                    WitTokenType::LeftBracket
225                }
226                ']' => {
227                    state.advance(1);
228                    WitTokenType::RightBracket
229                }
230                '<' => {
231                    state.advance(1);
232                    WitTokenType::Lt
233                }
234                '>' => {
235                    state.advance(1);
236                    WitTokenType::Gt
237                }
238                ',' => {
239                    state.advance(1);
240                    WitTokenType::Comma
241                }
242                ';' => {
243                    state.advance(1);
244                    WitTokenType::Semicolon
245                }
246                ':' => {
247                    state.advance(1);
248                    WitTokenType::Colon
249                }
250                '=' => {
251                    state.advance(1);
252                    WitTokenType::Assign
253                }
254                '.' => {
255                    state.advance(1);
256                    WitTokenType::Dot
257                }
258                '*' => {
259                    state.advance(1);
260                    WitTokenType::Star
261                }
262                '/' => {
263                    state.advance(1);
264                    WitTokenType::Slash
265                }
266                '@' => {
267                    state.advance(1);
268                    WitTokenType::At
269                }
270                '-' => {
271                    state.advance(1);
272                    if state.peek() == Some('>') {
273                        state.advance(1);
274                        WitTokenType::Arrow
275                    }
276                    else {
277                        WitTokenType::Minus
278                    }
279                }
280                _ => return false,
281            };
282
283            state.add_token(token_kind, start_pos, state.get_position());
284            return true;
285        }
286
287        false
288    }
289
290    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
291        let start_pos = state.get_position();
292
293        if let Some(ch) = state.peek() {
294            state.advance(ch.len_utf8());
295            state.add_token(WitTokenType::Error, start_pos, state.get_position());
296            return true;
297        }
298
299        false
300    }
301}