Skip to main content

oak_wit_component/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::WitLanguage, lexer::token_type::WitTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError, TextEdit,
7    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8    source::Source,
9};
10use std::sync::LazyLock;
11
12pub(crate) type State<'a, S> = LexerState<'a, S, WitLanguage>;
13
14static WIT_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static WIT_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
16static WIT_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
18#[derive(Clone)]
19pub struct WitLexer<'config> {
20    config: &'config WitLanguage,
21}
22
23impl<'config> Lexer<WitLanguage> for WitLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WitLanguage>) -> LexOutput<WitLanguage> {
25        let mut state: State<'_, S> = LexerState::new(source);
26        let result = self.run(&mut state);
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> WitLexer<'config> {
32    pub fn new(config: &'config WitLanguage) -> Self {
33        Self { config }
34    }
35
36    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            if self.skip_whitespace(state) {
39                continue;
40            }
41
42            if self.skip_comment(state) {
43                continue;
44            }
45
46            if self.lex_string_literal(state) {
47                continue;
48            }
49
50            if self.lex_number_literal(state) {
51                continue;
52            }
53
54            if self.lex_identifier_or_keyword(state) {
55                continue;
56            }
57
58            if self.lex_punctuation(state) {
59                continue;
60            }
61
62            if self.lex_text(state) {
63                continue;
64            }
65
66            // If no rules match, skip the current character and mark it as an error
67            let start_pos = state.get_position();
68            if let Some(ch) = state.peek() {
69                state.advance(ch.len_utf8());
70                state.add_token(WitTokenType::Error, start_pos, state.get_position());
71            }
72        }
73
74        // Add EOF token
75        let eof_pos = state.get_position();
76        state.add_token(WitTokenType::Eof, eof_pos, eof_pos);
77        Ok(())
78    }
79
80    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
81        WIT_WHITESPACE.scan(state, WitTokenType::Whitespace)
82    }
83
84    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
85        WIT_COMMENT.scan(state, WitTokenType::Comment, WitTokenType::Comment)
86    }
87
88    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
89        WIT_STRING.scan(state, WitTokenType::StringLiteral)
90    }
91
92    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
93        let start_pos = state.get_position();
94        let mut has_digits = false;
95
96        // Handle digits
97        while let Some(ch) = state.peek() {
98            if ch.is_ascii_digit() {
99                state.advance(1);
100                has_digits = true;
101            }
102            else {
103                break;
104            }
105        }
106
107        if has_digits {
108            state.add_token(WitTokenType::IntegerLiteral, start_pos, state.get_position());
109            return true;
110        }
111
112        false
113    }
114
115    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
116        let start_pos = state.get_position();
117
118        if let Some(ch) = state.peek() {
119            if ch.is_ascii_alphabetic() || ch == '_' || ch == '%' {
120                state.advance(ch.len_utf8());
121
122                // Continue reading identifier characters
123                while let Some(ch) = state.peek() {
124                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
125                        state.advance(ch.len_utf8());
126                    }
127                    else {
128                        break;
129                    }
130                }
131
132                let text = state.get_text_from(start_pos);
133                let token_kind = match text.as_ref() {
134                    // WIT keywords
135                    "world" => WitTokenType::WorldKw,
136                    "interface" => WitTokenType::InterfaceKw,
137                    "package" => WitTokenType::PackageKw,
138                    "component" => WitTokenType::ComponentKw,
139                    "instance" => WitTokenType::InstanceKw,
140                    "module" => WitTokenType::ModuleKw,
141                    "core" => WitTokenType::CoreKw,
142                    "func" => WitTokenType::FuncKw,
143                    "type" => WitTokenType::TypeKw,
144                    "record" => WitTokenType::RecordKw,
145                    "variant" => WitTokenType::VariantKw,
146                    "enum" => WitTokenType::EnumKw,
147                    "flags" => WitTokenType::FlagsKw,
148                    "union" => WitTokenType::UnionKw,
149                    "tuple" => WitTokenType::TupleKw,
150                    "list" => WitTokenType::ListKw,
151                    "option" => WitTokenType::OptionKw,
152                    "result" => WitTokenType::ResultKw,
153                    "static" => WitTokenType::StaticKw,
154                    "constructor" => WitTokenType::ConstructorKw,
155                    "method" => WitTokenType::MethodKw,
156                    "import" => WitTokenType::ImportKw,
157                    "export" => WitTokenType::ExportKw,
158                    "use" => WitTokenType::UseKw,
159                    "include" => WitTokenType::IncludeKw,
160                    "with" => WitTokenType::WithKw,
161                    "resource" => WitTokenType::ResourceKw,
162                    "bool" => WitTokenType::BoolKw,
163                    "u8" => WitTokenType::U8Kw,
164                    "u16" => WitTokenType::U16Kw,
165                    "u32" => WitTokenType::U32Kw,
166                    "u64" => WitTokenType::U64Kw,
167                    "s8" => WitTokenType::S8Kw,
168                    "s16" => WitTokenType::S16Kw,
169                    "s32" => WitTokenType::S32Kw,
170                    "s64" => WitTokenType::S64Kw,
171                    "f32" => WitTokenType::F32Kw,
172                    "f64" => WitTokenType::F64Kw,
173                    "char" => WitTokenType::CharKw,
174                    "string" => WitTokenType::StringKw,
175                    _ => WitTokenType::Identifier,
176                };
177
178                state.add_token(token_kind, start_pos, state.get_position());
179                return true;
180            }
181        }
182
183        false
184    }
185
186    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
187        let start_pos = state.get_position();
188
189        if let Some(ch) = state.peek() {
190            let token_kind = match ch {
191                '(' => {
192                    state.advance(1);
193                    WitTokenType::LeftParen
194                }
195                ')' => {
196                    state.advance(1);
197                    WitTokenType::RightParen
198                }
199                '{' => {
200                    state.advance(1);
201                    WitTokenType::LeftBrace
202                }
203                '}' => {
204                    state.advance(1);
205                    WitTokenType::RightBrace
206                }
207                '[' => {
208                    state.advance(1);
209                    WitTokenType::LeftBracket
210                }
211                ']' => {
212                    state.advance(1);
213                    WitTokenType::RightBracket
214                }
215                '<' => {
216                    state.advance(1);
217                    WitTokenType::Lt
218                }
219                '>' => {
220                    state.advance(1);
221                    WitTokenType::Gt
222                }
223                ',' => {
224                    state.advance(1);
225                    WitTokenType::Comma
226                }
227                ';' => {
228                    state.advance(1);
229                    WitTokenType::Semicolon
230                }
231                ':' => {
232                    state.advance(1);
233                    WitTokenType::Colon
234                }
235                '=' => {
236                    state.advance(1);
237                    WitTokenType::Assign
238                }
239                '.' => {
240                    state.advance(1);
241                    WitTokenType::Dot
242                }
243                '*' => {
244                    state.advance(1);
245                    WitTokenType::Star
246                }
247                '/' => {
248                    state.advance(1);
249                    WitTokenType::Slash
250                }
251                '@' => {
252                    state.advance(1);
253                    WitTokenType::At
254                }
255                '-' => {
256                    state.advance(1);
257                    if state.peek() == Some('>') {
258                        state.advance(1);
259                        WitTokenType::Arrow
260                    }
261                    else {
262                        WitTokenType::Minus
263                    }
264                }
265                _ => return false,
266            };
267
268            state.add_token(token_kind, start_pos, state.get_position());
269            return true;
270        }
271
272        false
273    }
274
275    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
276        let start_pos = state.get_position();
277
278        if let Some(ch) = state.peek() {
279            state.advance(ch.len_utf8());
280            state.add_token(WitTokenType::Error, start_pos, state.get_position());
281            return true;
282        }
283
284        false
285    }
286}