Skip to main content

oak_zig/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use crate::{language::ZigLanguage, lexer::token_type::ZigTokenType};
3pub mod token_type;
4use oak_core::{
5    Lexer, LexerCache, LexerState, OakError, Source,
6    lexer::{LexOutput, WhitespaceConfig},
7};
8use std::sync::LazyLock;
9
10pub(crate) type State<'a, S> = LexerState<'a, S, ZigLanguage>;
11
12static ZIG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
13
14/// Lexer for the Zig language.
15#[derive(Clone)]
16pub struct ZigLexer<'config> {
17    config: &'config ZigLanguage,
18}
19
20impl<'config> Lexer<ZigLanguage> for ZigLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ZigLanguage>) -> LexOutput<ZigLanguage> {
22        let mut state = State::new_with_cache(source, 0, cache);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof()
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> ZigLexer<'config> {
32    /// Creates a new Zig lexer with the given configuration.
33    pub fn new(config: &'config ZigLanguage) -> Self {
34        Self { config }
35    }
36
37    /// Main lexical analysis loop
38    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.skip_comment(state) {
47                continue;
48            }
49
50            if self.lex_string_literal(state) {
51                continue;
52            }
53
54            if self.lex_char_literal(state) {
55                continue;
56            }
57
58            if self.lex_number_literal(state) {
59                continue;
60            }
61
62            if self.lex_identifier_or_keyword(state) {
63                continue;
64            }
65
66            if self.lex_builtin(state) {
67                continue;
68            }
69
70            if self.lex_operators(state) {
71                continue;
72            }
73
74            if self.lex_single_char_tokens(state) {
75                continue;
76            }
77
78            // If no rules match, advance one character and mark as error
79            let start_pos = state.get_position();
80            if let Some(ch) = state.peek() {
81                state.advance(ch.len_utf8());
82                state.add_token(ZigTokenType::Error, start_pos, state.get_position())
83            }
84
85            state.advance_if_dead_lock(safe_point)
86        }
87
88        Ok(())
89    }
90
91    /// Skips whitespace characters
92    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
93        ZIG_WHITESPACE.scan(state, ZigTokenType::Whitespace)
94    }
95
96    /// Skips comments
97    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
98        let start = state.get_position();
99        let rest = state.rest();
100
101        // Line comment: // ... until newline
102        if rest.starts_with("//") {
103            state.advance(2);
104
105            // Check if it's a doc comment ///
106            let is_doc_comment = if state.peek() == Some('/') {
107                state.advance(1);
108                true
109            }
110            else {
111                false
112            };
113
114            while let Some(ch) = state.peek() {
115                if ch == '\n' || ch == '\r' {
116                    break;
117                }
118                state.advance(ch.len_utf8())
119            }
120
121            let kind = if is_doc_comment { ZigTokenType::DocComment } else { ZigTokenType::Comment };
122            state.add_token(kind, start, state.get_position());
123            return true;
124        }
125
126        false
127    }
128
129    /// Parses string literals
130    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
131        let start = state.get_position();
132
133        // Multiline string: \\...
134        if state.rest().starts_with("\\\\") {
135            state.advance(2);
136
137            // Skip to end of line
138            while let Some(ch) = state.peek() {
139                if ch == '\n' {
140                    state.advance(1);
141                    break;
142                }
143                state.advance(ch.len_utf8())
144            }
145
146            // Read multiline string content
147            while state.not_at_end() {
148                let _line_start = state.get_position();
149
150                // Check if it's a continuation line
151                if !state.rest().starts_with("\\\\") {
152                    break;
153                }
154
155                state.advance(2);
156
157                // Read to end of line
158                while let Some(ch) = state.peek() {
159                    if ch == '\n' {
160                        state.advance(1);
161                        break;
162                    }
163                    state.advance(ch.len_utf8())
164                }
165            }
166
167            state.add_token(ZigTokenType::StringLiteral, start, state.get_position());
168            return true;
169        }
170
171        // Normal string: "..."
172        if state.current() == Some('"') {
173            state.advance(1);
174            while let Some(ch) = state.peek() {
175                if ch == '"' {
176                    state.advance(1);
177                    break;
178                }
179                if ch == '\\' {
180                    state.advance(1);
181                    if let Some(next) = state.peek() {
182                        state.advance(next.len_utf8())
183                    }
184                    continue;
185                }
186                state.advance(ch.len_utf8())
187            }
188            state.add_token(ZigTokenType::StringLiteral, start, state.get_position());
189            return true;
190        }
191
192        false
193    }
194
195    /// Parses character literals
196    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
197        let start = state.get_position();
198        if state.current() == Some('\'') {
199            state.advance(1);
200            while let Some(ch) = state.peek() {
201                if ch == '\'' {
202                    state.advance(1);
203                    break;
204                }
205                if ch == '\\' {
206                    state.advance(1);
207                    if let Some(next) = state.peek() {
208                        state.advance(next.len_utf8())
209                    }
210                    continue;
211                }
212                state.advance(ch.len_utf8())
213            }
214            state.add_token(ZigTokenType::CharLiteral, start, state.get_position());
215            return true;
216        }
217        false
218    }
219
220    /// Parses number literals
221    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
222        let start = state.get_position();
223        let ch = state.current();
224        let mut is_float = false;
225
226        if let Some(ch) = ch {
227            if ch.is_ascii_digit() {
228                state.advance(1);
229                // Handle hexadecimal, binary, octal
230                if ch == '0' {
231                    if let Some(next) = state.peek() {
232                        match next {
233                            'x' | 'X' => {
234                                state.advance(1);
235                                state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
236                            }
237                            'b' | 'B' => {
238                                state.advance(1);
239                                state.take_while(|c| c == '0' || c == '1' || c == '_');
240                            }
241                            'o' | 'O' => {
242                                state.advance(1);
243                                state.take_while(|c| ('0'..='7').contains(&c) || c == '_');
244                            }
245                            _ => {
246                                state.take_while(|c| c.is_ascii_digit() || c == '_');
247                            }
248                        }
249                    }
250                }
251                else {
252                    state.take_while(|c| c.is_ascii_digit() || c == '_');
253                }
254
255                // Handle decimal point
256                if state.current() == Some('.') {
257                    if let Some(next) = state.peek() {
258                        if next.is_ascii_digit() {
259                            is_float = true;
260                            state.advance(1);
261                            state.take_while(|c| c.is_ascii_digit() || c == '_');
262                        }
263                    }
264                }
265
266                // Handle exponent
267                if let Some(c) = state.current() {
268                    if c == 'e' || c == 'E' || c == 'p' || c == 'P' {
269                        is_float = true;
270                        state.advance(1);
271                        if let Some(next) = state.peek() {
272                            if next == '+' || next == '-' {
273                                state.advance(1);
274                            }
275                        }
276                        state.take_while(|c| c.is_ascii_digit() || c == '_');
277                    }
278                }
279
280                let kind = if is_float { ZigTokenType::FloatLiteral } else { ZigTokenType::IntegerLiteral };
281                state.add_token(kind, start, state.get_position());
282                return true;
283            }
284        }
285        false
286    }
287
288    /// Parses identifiers or keywords
289    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
290        let start = state.get_position();
291        if let Some(ch) = state.current() {
292            if ch.is_ascii_alphabetic() || ch == '_' {
293                state.advance(ch.len_utf8());
294                state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
295
296                let end = state.get_position();
297                let text = state.get_text_in((start..end).into());
298                let kind = self.get_keyword_or_identifier(&text);
299                state.add_token(kind, start, state.get_position());
300                return true;
301            }
302        }
303        false
304    }
305
306    /// Gets keyword or identifier type
307    fn get_keyword_or_identifier(&self, text: &str) -> ZigTokenType {
308        match text {
309            // Basic structures
310            "const" => ZigTokenType::Const,
311            "var" => ZigTokenType::Var,
312            "fn" => ZigTokenType::Fn,
313            "struct" => ZigTokenType::Struct,
314            "union" => ZigTokenType::Union,
315            "enum" => ZigTokenType::Enum,
316            "opaque" => ZigTokenType::Opaque,
317            "type" => ZigTokenType::Type,
318            "comptime" => ZigTokenType::Comptime,
319            "inline" => ZigTokenType::Inline,
320            "noinline" => ZigTokenType::NoInline,
321            "pub" => ZigTokenType::Pub,
322            "export" => ZigTokenType::Export,
323            "extern" => ZigTokenType::Extern,
324            "packed" => ZigTokenType::Packed,
325            "align" => ZigTokenType::Align,
326            "callconv" => ZigTokenType::CallConv,
327            "linksection" => ZigTokenType::LinkSection,
328
329            // Control flow
330            "if" => ZigTokenType::If,
331            "else" => ZigTokenType::Else,
332            "switch" => ZigTokenType::Switch,
333            "while" => ZigTokenType::While,
334            "for" => ZigTokenType::For,
335            "break" => ZigTokenType::Break,
336            "continue" => ZigTokenType::Continue,
337            "return" => ZigTokenType::Return,
338            "defer" => ZigTokenType::Defer,
339            "errdefer" => ZigTokenType::ErrDefer,
340            "unreachable" => ZigTokenType::Unreachable,
341            "noreturn" => ZigTokenType::NoReturn,
342
343            // Error handling
344            "try" => ZigTokenType::TryKeyword,
345            "catch" => ZigTokenType::CatchKeyword,
346            "orelse" => ZigTokenType::OrElse,
347            "error" => ZigTokenType::ErrorKeyword,
348
349            // Test and async
350            "test" => ZigTokenType::Test,
351            "async" => ZigTokenType::Async,
352            "await" => ZigTokenType::AwaitKeyword,
353            "suspend" => ZigTokenType::Suspend,
354            "resume" => ZigTokenType::Resume,
355            "cancel" => ZigTokenType::Cancel,
356
357            // Memory management
358            "undefined" => ZigTokenType::Undefined,
359            "null" => ZigTokenType::Null,
360            "volatile" => ZigTokenType::Volatile,
361            "allowzero" => ZigTokenType::AllowZero,
362            "noalias" => ZigTokenType::NoAlias,
363
364            // Logical operations
365            "and" => ZigTokenType::And,
366            "or" => ZigTokenType::Or,
367
368            // Others
369            "anyframe" => ZigTokenType::AnyFrame,
370            "anytype" => ZigTokenType::AnyType,
371            "threadlocal" => ZigTokenType::ThreadLocal,
372
373            // Basic types
374            "bool" => ZigTokenType::Bool,
375            "i8" => ZigTokenType::I8,
376            "i16" => ZigTokenType::I16,
377            "i32" => ZigTokenType::I32,
378            "i64" => ZigTokenType::I64,
379            "i128" => ZigTokenType::I128,
380            "isize" => ZigTokenType::Isize,
381            "u8" => ZigTokenType::U8,
382            "u16" => ZigTokenType::U16,
383            "u32" => ZigTokenType::U32,
384            "u64" => ZigTokenType::U64,
385            "u128" => ZigTokenType::U128,
386            "usize" => ZigTokenType::Usize,
387            "f16" => ZigTokenType::F16,
388            "f32" => ZigTokenType::F32,
389            "f64" => ZigTokenType::F64,
390            "f80" => ZigTokenType::F80,
391            "f128" => ZigTokenType::F128,
392            "c_short" => ZigTokenType::CShort,
393            "c_ushort" => ZigTokenType::CUshort,
394            "c_int" => ZigTokenType::CInt,
395            "c_uint" => ZigTokenType::CUint,
396            "c_long" => ZigTokenType::CLong,
397            "c_ulong" => ZigTokenType::CUlong,
398            "c_longlong" => ZigTokenType::CLongLong,
399            "c_ulonglong" => ZigTokenType::CUlongLong,
400            "c_longdouble" => ZigTokenType::CLongDouble,
401            "c_void" => ZigTokenType::CVoid,
402            "void" => ZigTokenType::Void,
403            "comptime_int" => ZigTokenType::ComptimeInt,
404            "comptime_float" => ZigTokenType::ComptimeFloat,
405
406            // Boolean literals
407            "true" | "false" => ZigTokenType::BooleanLiteral,
408
409            _ => ZigTokenType::Identifier,
410        }
411    }
412
413    /// Parses builtin identifiers (e.g., @import)
414    fn lex_builtin<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
415        let start = state.get_position();
416        if state.current() == Some('@') {
417            state.advance(1);
418            if let Some(ch) = state.current() {
419                if ch.is_ascii_alphabetic() || ch == '_' {
420                    state.advance(ch.len_utf8());
421                    state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
422                    state.add_token(ZigTokenType::BuiltinIdentifier, start, state.get_position());
423                    return true;
424                }
425            }
426        }
427        false
428    }
429
430    /// Parses operators
431    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
432        let start = state.get_position();
433        let rest = state.rest();
434
435        // Try to match the longest operator
436        let ops = [
437            ("<<=", ZigTokenType::LessLessAssign),
438            (">>=", ZigTokenType::GreaterGreaterAssign),
439            ("...", ZigTokenType::DotDotDot),
440            ("==", ZigTokenType::Equal),
441            ("!=", ZigTokenType::NotEqual),
442            ("<=", ZigTokenType::LessEqual),
443            (">=", ZigTokenType::GreaterEqual),
444            ("&&", ZigTokenType::AndAnd),
445            ("||", ZigTokenType::OrOr),
446            ("+=", ZigTokenType::PlusAssign),
447            ("-=", ZigTokenType::MinusAssign),
448            ("*=", ZigTokenType::StarAssign),
449            ("/=", ZigTokenType::SlashAssign),
450            ("%=", ZigTokenType::PercentAssign),
451            ("&=", ZigTokenType::AmpersandAssign),
452            ("|=", ZigTokenType::PipeAssign),
453            ("^=", ZigTokenType::CaretAssign),
454            ("++", ZigTokenType::PlusPlus),
455            ("--", ZigTokenType::MinusMinus),
456            ("**", ZigTokenType::StarStar),
457            ("->", ZigTokenType::Arrow),
458            ("=>", ZigTokenType::FatArrow),
459            ("<<", ZigTokenType::LessLess),
460            (">>", ZigTokenType::GreaterGreater),
461            (".?", ZigTokenType::DotQuestion),
462            (".*", ZigTokenType::DotStar),
463        ];
464
465        for (op, kind) in ops {
466            if rest.starts_with(op) {
467                state.advance(op.len());
468                state.add_token(kind, start, state.get_position());
469                return true;
470            }
471        }
472
473        false
474    }
475
476    /// Parses single-character tokens
477    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
478        let start = state.get_position();
479        if let Some(ch) = state.current() {
480            let kind = match ch {
481                '(' => ZigTokenType::LeftParen,
482                ')' => ZigTokenType::RightParen,
483                '{' => ZigTokenType::LeftBrace,
484                '}' => ZigTokenType::RightBrace,
485                '[' => ZigTokenType::LeftBracket,
486                ']' => ZigTokenType::RightBracket,
487                ',' => ZigTokenType::Comma,
488                '.' => ZigTokenType::Dot,
489                ':' => ZigTokenType::Colon,
490                ';' => ZigTokenType::Semicolon,
491                '+' => ZigTokenType::Plus,
492                '-' => ZigTokenType::Minus,
493                '*' => ZigTokenType::Star,
494                '/' => ZigTokenType::Slash,
495                '%' => ZigTokenType::Percent,
496                '&' => ZigTokenType::Ampersand,
497                '|' => ZigTokenType::Pipe,
498                '^' => ZigTokenType::Caret,
499                '~' => ZigTokenType::Tilde,
500                '!' => ZigTokenType::Exclamation,
501                '?' => ZigTokenType::Question,
502                '<' => ZigTokenType::Less,
503                '>' => ZigTokenType::Greater,
504                '=' => ZigTokenType::Assign,
505                _ => return false,
506            };
507            state.advance(1);
508            state.add_token(kind, start, state.get_position());
509            return true;
510        }
511        false
512    }
513}