// oak_vala/lexer/mod.rs

1use crate::{kind::ValaSyntaxKind, language::ValaLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, ValaLanguage>;
10
11static VALA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static VALA_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
13static VALA_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14static VALA_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
15
16#[derive(Clone, Debug)]
17pub struct ValaLexer<'config> {
18    _config: &'config ValaLanguage,
19}
20
21impl<'config> Lexer<ValaLanguage> for ValaLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<ValaLanguage>) -> LexOutput<ValaLanguage> {
23        let mut state: State<'_, S> = LexerState::new(source);
24        let result = self.run(&mut state);
25        state.finish_with_cache(result, cache)
26    }
27}
28
29impl<'config> ValaLexer<'config> {
30    pub fn new(config: &'config ValaLanguage) -> Self {
31        Self { _config: config }
32    }
33
34    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
35        while state.not_at_end() {
36            let safe_point = state.get_position();
37
38            if self.skip_whitespace(state) {
39                continue;
40            }
41
42            if self.skip_comment(state) {
43                continue;
44            }
45
46            if self.lex_string_literal(state) {
47                continue;
48            }
49
50            if self.lex_char_literal(state) {
51                continue;
52            }
53
54            if self.lex_number_literal(state) {
55                continue;
56            }
57
58            if self.lex_identifier_or_keyword(state) {
59                continue;
60            }
61
62            if self.lex_operators(state) {
63                continue;
64            }
65
66            if self.lex_single_char_tokens(state) {
67                continue;
68            }
69
70            state.advance_if_dead_lock(safe_point);
71        }
72
73        // 添加 EOF token
74        state.add_eof();
75        Ok(())
76    }
77
78    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
79        VALA_WHITESPACE.scan(state, ValaSyntaxKind::Whitespace)
80    }
81
82    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
83        VALA_COMMENT.scan(state, ValaSyntaxKind::LineComment, ValaSyntaxKind::BlockComment)
84    }
85
86    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
87        VALA_STRING.scan(state, ValaSyntaxKind::StringLiteral)
88    }
89
90    fn lex_char_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
91        VALA_CHAR.scan(state, ValaSyntaxKind::CharLiteral)
92    }
93
94    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
95        let start = state.get_position();
96        let first = match state.peek() {
97            Some(c) => c,
98            None => return false,
99        };
100
101        if !first.is_ascii_digit() {
102            return false;
103        }
104
105        let mut is_float = false;
106
107        // 处理十六进制、八进制、二进制
108        if first == '0' {
109            match state.peek_next_n(1) {
110                Some('x') | Some('X') => {
111                    state.advance(2);
112                    while let Some(c) = state.peek() {
113                        if c.is_ascii_hexdigit() || c == '_' {
114                            state.advance(1);
115                        }
116                        else {
117                            break;
118                        }
119                    }
120                }
121                Some('b') | Some('B') => {
122                    state.advance(2);
123                    while let Some(c) = state.peek() {
124                        if c == '0' || c == '1' || c == '_' {
125                            state.advance(1);
126                        }
127                        else {
128                            break;
129                        }
130                    }
131                }
132                Some('o') | Some('O') => {
133                    state.advance(2);
134                    while let Some(c) = state.peek() {
135                        if ('0'..='7').contains(&c) || c == '_' {
136                            state.advance(1);
137                        }
138                        else {
139                            break;
140                        }
141                    }
142                }
143                _ => {
144                    state.advance(1);
145                    while let Some(c) = state.peek() {
146                        if c.is_ascii_digit() || c == '_' {
147                            state.advance(1);
148                        }
149                        else {
150                            break;
151                        }
152                    }
153                }
154            }
155        }
156        else {
157            state.advance(1);
158            while let Some(c) = state.peek() {
159                if c.is_ascii_digit() || c == '_' {
160                    state.advance(1);
161                }
162                else {
163                    break;
164                }
165            }
166        }
167
168        // 小数部分
169        if state.peek() == Some('.') {
170            let n1 = state.peek_next_n(1);
171            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
172                is_float = true;
173                state.advance(1); // consume '.'
174                while let Some(c) = state.peek() {
175                    if c.is_ascii_digit() || c == '_' {
176                        state.advance(1);
177                    }
178                    else {
179                        break;
180                    }
181                }
182            }
183        }
184
185        // 指数部分
186        if let Some(c) = state.peek() {
187            if c == 'e' || c == 'E' {
188                let n1 = state.peek_next_n(1);
189                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
190                    is_float = true;
191                    state.advance(1);
192                    if let Some(sign) = state.peek() {
193                        if sign == '+' || sign == '-' {
194                            state.advance(1);
195                        }
196                    }
197                    while let Some(d) = state.peek() {
198                        if d.is_ascii_digit() || d == '_' {
199                            state.advance(1);
200                        }
201                        else {
202                            break;
203                        }
204                    }
205                }
206            }
207        }
208
209        // 后缀字母 (e.g., f, d, l)
210        while let Some(c) = state.peek() {
211            if c.is_ascii_alphabetic() {
212                state.advance(1);
213            }
214            else {
215                break;
216            }
217        }
218
219        let end = state.get_position();
220        state.add_token(if is_float { ValaSyntaxKind::FloatLiteral } else { ValaSyntaxKind::IntegerLiteral }, start, end);
221        true
222    }
223
224    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
225        let start = state.get_position();
226        let ch = match state.peek() {
227            Some(c) => c,
228            None => return false,
229        };
230
231        if !(ch.is_ascii_alphabetic() || ch == '_') {
232            return false;
233        }
234
235        state.advance(ch.len_utf8());
236        while let Some(c) = state.peek() {
237            if c.is_ascii_alphanumeric() || c == '_' {
238                state.advance(c.len_utf8());
239            }
240            else {
241                break;
242            }
243        }
244
245        let end = state.get_position();
246        let text = state.get_text_in(oak_core::Range { start, end });
247        let kind = match text.as_ref() {
248            "abstract" => ValaSyntaxKind::AbstractKw,
249            "as" => ValaSyntaxKind::AsKw,
250            "base" => ValaSyntaxKind::BaseKw,
251            "break" => ValaSyntaxKind::BreakKw,
252            "case" => ValaSyntaxKind::CaseKw,
253            "catch" => ValaSyntaxKind::CatchKw,
254            "class" => ValaSyntaxKind::ClassKw,
255            "const" => ValaSyntaxKind::ConstKw,
256            "construct" => ValaSyntaxKind::ConstructKw,
257            "continue" => ValaSyntaxKind::ContinueKw,
258            "default" => ValaSyntaxKind::DefaultKw,
259            "delegate" => ValaSyntaxKind::DelegateKw,
260            "delete" => ValaSyntaxKind::DeleteKw,
261            "do" => ValaSyntaxKind::DoKw,
262            "else" => ValaSyntaxKind::ElseKw,
263            "enum" => ValaSyntaxKind::EnumKw,
264            "ensures" => ValaSyntaxKind::EnsuresKw,
265            "errordomain" => ValaSyntaxKind::ErrordomainKw,
266            "extern" => ValaSyntaxKind::ExternKw,
267            "false" => ValaSyntaxKind::FalseKw,
268            "finally" => ValaSyntaxKind::FinallyKw,
269            "for" => ValaSyntaxKind::ForKw,
270            "foreach" => ValaSyntaxKind::ForeachKw,
271            "get" => ValaSyntaxKind::GetKw,
272            "if" => ValaSyntaxKind::IfKw,
273            "in" => ValaSyntaxKind::InKw,
274            "inline" => ValaSyntaxKind::InlineKw,
275            "interface" => ValaSyntaxKind::InterfaceKw,
276            "internal" => ValaSyntaxKind::InternalKw,
277            "is" => ValaSyntaxKind::IsKw,
278            "lock" => ValaSyntaxKind::LockKw,
279            "namespace" => ValaSyntaxKind::NamespaceKw,
280            "new" => ValaSyntaxKind::NewKw,
281            "null" => ValaSyntaxKind::NullKw,
282            "out" => ValaSyntaxKind::OutKw,
283            "override" => ValaSyntaxKind::OverrideKw,
284            "owned" => ValaSyntaxKind::OwnedKw,
285            "private" => ValaSyntaxKind::PrivateKw,
286            "protected" => ValaSyntaxKind::ProtectedKw,
287            "public" => ValaSyntaxKind::PublicKw,
288            "ref" => ValaSyntaxKind::RefKw,
289            "requires" => ValaSyntaxKind::RequiresKw,
290            "return" => ValaSyntaxKind::ReturnKw,
291            "set" => ValaSyntaxKind::SetKw,
292            "sizeof" => ValaSyntaxKind::SizeofKw,
293            "static" => ValaSyntaxKind::StaticKw,
294            "struct" => ValaSyntaxKind::StructKw,
295            "switch" => ValaSyntaxKind::SwitchKw,
296            "this" => ValaSyntaxKind::ThisKw,
297            "throw" => ValaSyntaxKind::ThrowKw,
298            "throws" => ValaSyntaxKind::ThrowsKw,
299            "true" => ValaSyntaxKind::TrueKw,
300            "try" => ValaSyntaxKind::TryKw,
301            "typeof" => ValaSyntaxKind::TypeofKw,
302            "unowned" => ValaSyntaxKind::UnownedKw,
303            "using" => ValaSyntaxKind::UsingKw,
304            "var" => ValaSyntaxKind::VarKw,
305            "virtual" => ValaSyntaxKind::VirtualKw,
306            "void" => ValaSyntaxKind::VoidKw,
307            "volatile" => ValaSyntaxKind::VolatileKw,
308            "weak" => ValaSyntaxKind::WeakKw,
309            "while" => ValaSyntaxKind::WhileKw,
310            "yield" => ValaSyntaxKind::YieldKw,
311            // 基本类型
312            "bool" => ValaSyntaxKind::BoolKw,
313            "char" => ValaSyntaxKind::CharKw,
314            "uchar" => ValaSyntaxKind::UcharKw,
315            "int" => ValaSyntaxKind::IntKw,
316            "uint" => ValaSyntaxKind::UintKw,
317            "short" => ValaSyntaxKind::ShortKw,
318            "ushort" => ValaSyntaxKind::UshortKw,
319            "long" => ValaSyntaxKind::LongKw,
320            "ulong" => ValaSyntaxKind::UlongKw,
321            "int8" => ValaSyntaxKind::Int8Kw,
322            "uint8" => ValaSyntaxKind::Uint8Kw,
323            "int16" => ValaSyntaxKind::Int16Kw,
324            "uint16" => ValaSyntaxKind::Uint16Kw,
325            "int32" => ValaSyntaxKind::Int32Kw,
326            "uint32" => ValaSyntaxKind::Uint32Kw,
327            "int64" => ValaSyntaxKind::Int64Kw,
328            "uint64" => ValaSyntaxKind::Uint64Kw,
329            "float" => ValaSyntaxKind::FloatKw,
330            "double" => ValaSyntaxKind::DoubleKw,
331            "string" => ValaSyntaxKind::StringKw,
332            _ => ValaSyntaxKind::Identifier,
333        };
334
335        state.add_token(kind, start, state.get_position());
336        true
337    }
338
339    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
340        let start = state.get_position();
341
342        // 优先匹配较长的操作符
343        let patterns: &[(&str, ValaSyntaxKind)] = &[
344            ("<<", ValaSyntaxKind::LeftShift),
345            (">>", ValaSyntaxKind::RightShift),
346            ("==", ValaSyntaxKind::EqEq),
347            ("!=", ValaSyntaxKind::NotEq),
348            ("<=", ValaSyntaxKind::LessEq),
349            (">=", ValaSyntaxKind::GreaterEq),
350            ("&&", ValaSyntaxKind::AndAnd),
351            ("||", ValaSyntaxKind::OrOr),
352            ("++", ValaSyntaxKind::PlusPlus),
353            ("--", ValaSyntaxKind::MinusMinus),
354            ("+=", ValaSyntaxKind::PlusEq),
355            ("-=", ValaSyntaxKind::MinusEq),
356            ("*=", ValaSyntaxKind::StarEq),
357            ("/=", ValaSyntaxKind::SlashEq),
358            ("%=", ValaSyntaxKind::PercentEq),
359            ("->", ValaSyntaxKind::Arrow),
360        ];
361
362        for (pat, kind) in patterns {
363            if state.starts_with(pat) {
364                state.advance(pat.len());
365                state.add_token(*kind, start, state.get_position());
366                return true;
367            }
368        }
369
370        if let Some(ch) = state.current() {
371            let kind = match ch {
372                '+' => Some(ValaSyntaxKind::Plus),
373                '-' => Some(ValaSyntaxKind::Minus),
374                '*' => Some(ValaSyntaxKind::Star),
375                '/' => Some(ValaSyntaxKind::Slash),
376                '%' => Some(ValaSyntaxKind::Percent),
377                '^' => Some(ValaSyntaxKind::Caret),
378                '!' => Some(ValaSyntaxKind::Bang),
379                '&' => Some(ValaSyntaxKind::Ampersand),
380                '|' => Some(ValaSyntaxKind::Pipe),
381                '=' => Some(ValaSyntaxKind::Eq),
382                '>' => Some(ValaSyntaxKind::GreaterThan),
383                '<' => Some(ValaSyntaxKind::LessThan),
384                '.' => Some(ValaSyntaxKind::Dot),
385                ':' => Some(ValaSyntaxKind::Colon),
386                '?' => Some(ValaSyntaxKind::Question),
387                '~' => Some(ValaSyntaxKind::Tilde),
388                '\\' => Some(ValaSyntaxKind::Backslash),
389                '@' => Some(ValaSyntaxKind::At),
390                '#' => Some(ValaSyntaxKind::Hash),
391                '$' => Some(ValaSyntaxKind::Dollar),
392                _ => None,
393            };
394
395            if let Some(k) = kind {
396                state.advance(ch.len_utf8());
397                state.add_token(k, start, state.get_position());
398                return true;
399            }
400        }
401
402        false
403    }
404
405    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
406        let start = state.get_position();
407        if let Some(ch) = state.current() {
408            let kind = match ch {
409                '(' => Some(ValaSyntaxKind::LeftParen),
410                ')' => Some(ValaSyntaxKind::RightParen),
411                '{' => Some(ValaSyntaxKind::LeftBrace),
412                '}' => Some(ValaSyntaxKind::RightBrace),
413                '[' => Some(ValaSyntaxKind::LeftBracket),
414                ']' => Some(ValaSyntaxKind::RightBracket),
415                ',' => Some(ValaSyntaxKind::Comma),
416                ';' => Some(ValaSyntaxKind::Semicolon),
417                _ => None,
418            };
419
420            if let Some(k) = kind {
421                state.advance(ch.len_utf8());
422                state.add_token(k, start, state.get_position());
423                return true;
424            }
425        }
426        false
427    }
428}