Skip to main content

oak_vala/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use oak_core::Source;
3pub mod token_type;
4
5use crate::{language::ValaLanguage, lexer::token_type::ValaTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9};
10use std::sync::LazyLock;
11
12type State<'a, S> = LexerState<'a, S, ValaLanguage>;
13
14static VALA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static VALA_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
16static VALA_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17static VALA_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
18
19#[derive(Clone, Debug)]
20pub struct ValaLexer<'config> {
21    _config: &'config ValaLanguage,
22}
23
24impl<'config> Lexer<ValaLanguage> for ValaLexer<'config> {
25    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<ValaLanguage>) -> LexOutput<ValaLanguage> {
26        let mut state: State<'_, S> = LexerState::new(source);
27        let result = self.run(&mut state);
28        state.finish_with_cache(result, cache)
29    }
30}
31
32impl<'config> ValaLexer<'config> {
33    pub fn new(config: &'config ValaLanguage) -> Self {
34        Self { _config: config }
35    }
36
37    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_string_literal(state) {
50                continue;
51            }
52
53            if self.lex_char_literal(state) {
54                continue;
55            }
56
57            if self.lex_number_literal(state) {
58                continue;
59            }
60
61            if self.lex_identifier_or_keyword(state) {
62                continue;
63            }
64
65            if self.lex_operators(state) {
66                continue;
67            }
68
69            if self.lex_single_char_tokens(state) {
70                continue;
71            }
72
73            state.advance_if_dead_lock(safe_point);
74        }
75
76        // 添加 EOF token
77        state.add_eof();
78        Ok(())
79    }
80
81    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
82        VALA_WHITESPACE.scan(state, ValaTokenType::Whitespace)
83    }
84
85    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
86        VALA_COMMENT.scan(state, ValaTokenType::LineComment, ValaTokenType::BlockComment)
87    }
88
89    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
90        VALA_STRING.scan(state, ValaTokenType::StringLiteral)
91    }
92
93    fn lex_char_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
94        VALA_CHAR.scan(state, ValaTokenType::CharLiteral)
95    }
96
97    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
98        let start = state.get_position();
99        let first = match state.peek() {
100            Some(c) => c,
101            None => return false,
102        };
103
104        if !first.is_ascii_digit() {
105            return false;
106        }
107
108        let mut is_float = false;
109
110        // 处理十六进制、八进制、二进制
111        if first == '0' {
112            match state.peek_next_n(1) {
113                Some('x') | Some('X') => {
114                    state.advance(2);
115                    while let Some(c) = state.peek() {
116                        if c.is_ascii_hexdigit() || c == '_' {
117                            state.advance(1);
118                        }
119                        else {
120                            break;
121                        }
122                    }
123                }
124                Some('b') | Some('B') => {
125                    state.advance(2);
126                    while let Some(c) = state.peek() {
127                        if c == '0' || c == '1' || c == '_' {
128                            state.advance(1);
129                        }
130                        else {
131                            break;
132                        }
133                    }
134                }
135                Some('o') | Some('O') => {
136                    state.advance(2);
137                    while let Some(c) = state.peek() {
138                        if ('0'..='7').contains(&c) || c == '_' {
139                            state.advance(1);
140                        }
141                        else {
142                            break;
143                        }
144                    }
145                }
146                _ => {
147                    state.advance(1);
148                    while let Some(c) = state.peek() {
149                        if c.is_ascii_digit() || c == '_' {
150                            state.advance(1);
151                        }
152                        else {
153                            break;
154                        }
155                    }
156                }
157            }
158        }
159        else {
160            state.advance(1);
161            while let Some(c) = state.peek() {
162                if c.is_ascii_digit() || c == '_' {
163                    state.advance(1);
164                }
165                else {
166                    break;
167                }
168            }
169        }
170
171        // 小数部分
172        if state.peek() == Some('.') {
173            let n1 = state.peek_next_n(1);
174            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
175                is_float = true;
176                state.advance(1); // consume '.'
177                while let Some(c) = state.peek() {
178                    if c.is_ascii_digit() || c == '_' {
179                        state.advance(1);
180                    }
181                    else {
182                        break;
183                    }
184                }
185            }
186        }
187
188        // 指数部分
189        if let Some(c) = state.peek() {
190            if c == 'e' || c == 'E' {
191                let n1 = state.peek_next_n(1);
192                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
193                    is_float = true;
194                    state.advance(1);
195                    if let Some(sign) = state.peek() {
196                        if sign == '+' || sign == '-' {
197                            state.advance(1);
198                        }
199                    }
200                    while let Some(d) = state.peek() {
201                        if d.is_ascii_digit() || d == '_' {
202                            state.advance(1);
203                        }
204                        else {
205                            break;
206                        }
207                    }
208                }
209            }
210        }
211
212        // 后缀字母 (e.g., f, d, l)
213        while let Some(c) = state.peek() {
214            if c.is_ascii_alphabetic() {
215                state.advance(1);
216            }
217            else {
218                break;
219            }
220        }
221
222        let end = state.get_position();
223        state.add_token(if is_float { ValaTokenType::FloatLiteral } else { ValaTokenType::IntegerLiteral }, start, end);
224        true
225    }
226
227    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
228        let start = state.get_position();
229        let ch = match state.peek() {
230            Some(c) => c,
231            None => return false,
232        };
233
234        if !(ch.is_ascii_alphabetic() || ch == '_') {
235            return false;
236        }
237
238        state.advance(ch.len_utf8());
239        while let Some(c) = state.peek() {
240            if c.is_ascii_alphanumeric() || c == '_' {
241                state.advance(c.len_utf8());
242            }
243            else {
244                break;
245            }
246        }
247
248        let end = state.get_position();
249        let text = state.get_text_in(oak_core::Range { start, end });
250        let kind = match text.as_ref() {
251            "abstract" => ValaTokenType::AbstractKw,
252            "as" => ValaTokenType::AsKw,
253            "base" => ValaTokenType::BaseKw,
254            "break" => ValaTokenType::BreakKw,
255            "case" => ValaTokenType::CaseKw,
256            "catch" => ValaTokenType::CatchKw,
257            "class" => ValaTokenType::ClassKw,
258            "const" => ValaTokenType::ConstKw,
259            "construct" => ValaTokenType::ConstructKw,
260            "continue" => ValaTokenType::ContinueKw,
261            "default" => ValaTokenType::DefaultKw,
262            "delegate" => ValaTokenType::DelegateKw,
263            "delete" => ValaTokenType::DeleteKw,
264            "do" => ValaTokenType::DoKw,
265            "else" => ValaTokenType::ElseKw,
266            "enum" => ValaTokenType::EnumKw,
267            "ensures" => ValaTokenType::EnsuresKw,
268            "errordomain" => ValaTokenType::ErrordomainKw,
269            "extern" => ValaTokenType::ExternKw,
270            "false" => ValaTokenType::FalseKw,
271            "finally" => ValaTokenType::FinallyKw,
272            "for" => ValaTokenType::ForKw,
273            "foreach" => ValaTokenType::ForeachKw,
274            "get" => ValaTokenType::GetKw,
275            "if" => ValaTokenType::IfKw,
276            "in" => ValaTokenType::InKw,
277            "inline" => ValaTokenType::InlineKw,
278            "interface" => ValaTokenType::InterfaceKw,
279            "internal" => ValaTokenType::InternalKw,
280            "is" => ValaTokenType::IsKw,
281            "lock" => ValaTokenType::LockKw,
282            "namespace" => ValaTokenType::NamespaceKw,
283            "new" => ValaTokenType::NewKw,
284            "null" => ValaTokenType::NullKw,
285            "out" => ValaTokenType::OutKw,
286            "override" => ValaTokenType::OverrideKw,
287            "owned" => ValaTokenType::OwnedKw,
288            "private" => ValaTokenType::PrivateKw,
289            "protected" => ValaTokenType::ProtectedKw,
290            "public" => ValaTokenType::PublicKw,
291            "ref" => ValaTokenType::RefKw,
292            "requires" => ValaTokenType::RequiresKw,
293            "return" => ValaTokenType::ReturnKw,
294            "set" => ValaTokenType::SetKw,
295            "sizeof" => ValaTokenType::SizeofKw,
296            "static" => ValaTokenType::StaticKw,
297            "struct" => ValaTokenType::StructKw,
298            "switch" => ValaTokenType::SwitchKw,
299            "this" => ValaTokenType::ThisKw,
300            "throw" => ValaTokenType::ThrowKw,
301            "throws" => ValaTokenType::ThrowsKw,
302            "true" => ValaTokenType::TrueKw,
303            "try" => ValaTokenType::TryKw,
304            "typeof" => ValaTokenType::TypeofKw,
305            "unowned" => ValaTokenType::UnownedKw,
306            "using" => ValaTokenType::UsingKw,
307            "var" => ValaTokenType::VarKw,
308            "virtual" => ValaTokenType::VirtualKw,
309            "void" => ValaTokenType::VoidKw,
310            "volatile" => ValaTokenType::VolatileKw,
311            "weak" => ValaTokenType::WeakKw,
312            "while" => ValaTokenType::WhileKw,
313            "yield" => ValaTokenType::YieldKw,
314            // 基本类型
315            "bool" => ValaTokenType::BoolKw,
316            "char" => ValaTokenType::CharKw,
317            "uchar" => ValaTokenType::UcharKw,
318            "int" => ValaTokenType::IntKw,
319            "uint" => ValaTokenType::UintKw,
320            "short" => ValaTokenType::ShortKw,
321            "ushort" => ValaTokenType::UshortKw,
322            "long" => ValaTokenType::LongKw,
323            "ulong" => ValaTokenType::UlongKw,
324            "int8" => ValaTokenType::Int8Kw,
325            "uint8" => ValaTokenType::Uint8Kw,
326            "int16" => ValaTokenType::Int16Kw,
327            "uint16" => ValaTokenType::Uint16Kw,
328            "int32" => ValaTokenType::Int32Kw,
329            "uint32" => ValaTokenType::Uint32Kw,
330            "int64" => ValaTokenType::Int64Kw,
331            "uint64" => ValaTokenType::Uint64Kw,
332            "float" => ValaTokenType::FloatKw,
333            "double" => ValaTokenType::DoubleKw,
334            "string" => ValaTokenType::StringKw,
335            _ => ValaTokenType::Identifier,
336        };
337
338        state.add_token(kind, start, state.get_position());
339        true
340    }
341
342    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
343        let start = state.get_position();
344
345        // 优先匹配较长的操作符
346        let patterns: &[(&str, ValaTokenType)] = &[
347            ("<<", ValaTokenType::LeftShift),
348            (">>", ValaTokenType::RightShift),
349            ("==", ValaTokenType::EqEq),
350            ("!=", ValaTokenType::NotEq),
351            ("<=", ValaTokenType::LessEq),
352            (">=", ValaTokenType::GreaterEq),
353            ("&&", ValaTokenType::AndAnd),
354            ("||", ValaTokenType::OrOr),
355            ("++", ValaTokenType::PlusPlus),
356            ("--", ValaTokenType::MinusMinus),
357            ("+=", ValaTokenType::PlusEq),
358            ("-=", ValaTokenType::MinusEq),
359            ("*=", ValaTokenType::StarEq),
360            ("/=", ValaTokenType::SlashEq),
361            ("%=", ValaTokenType::PercentEq),
362            ("->", ValaTokenType::Arrow),
363        ];
364
365        for (pat, kind) in patterns {
366            if state.starts_with(pat) {
367                state.advance(pat.len());
368                state.add_token(*kind, start, state.get_position());
369                return true;
370            }
371        }
372
373        if let Some(ch) = state.current() {
374            let kind = match ch {
375                '+' => Some(ValaTokenType::Plus),
376                '-' => Some(ValaTokenType::Minus),
377                '*' => Some(ValaTokenType::Star),
378                '/' => Some(ValaTokenType::Slash),
379                '%' => Some(ValaTokenType::Percent),
380                '^' => Some(ValaTokenType::Caret),
381                '!' => Some(ValaTokenType::Bang),
382                '&' => Some(ValaTokenType::Ampersand),
383                '|' => Some(ValaTokenType::Pipe),
384                '=' => Some(ValaTokenType::Eq),
385                '>' => Some(ValaTokenType::GreaterThan),
386                '<' => Some(ValaTokenType::LessThan),
387                '.' => Some(ValaTokenType::Dot),
388                ':' => Some(ValaTokenType::Colon),
389                '?' => Some(ValaTokenType::Question),
390                '~' => Some(ValaTokenType::Tilde),
391                '\\' => Some(ValaTokenType::Backslash),
392                '@' => Some(ValaTokenType::At),
393                '#' => Some(ValaTokenType::Hash),
394                '$' => Some(ValaTokenType::Dollar),
395                _ => None,
396            };
397
398            if let Some(k) = kind {
399                state.advance(ch.len_utf8());
400                state.add_token(k, start, state.get_position());
401                return true;
402            }
403        }
404
405        false
406    }
407
408    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
409        let start = state.get_position();
410        if let Some(ch) = state.current() {
411            let kind = match ch {
412                '(' => Some(ValaTokenType::LeftParen),
413                ')' => Some(ValaTokenType::RightParen),
414                '{' => Some(ValaTokenType::LeftBrace),
415                '}' => Some(ValaTokenType::RightBrace),
416                '[' => Some(ValaTokenType::LeftBracket),
417                ']' => Some(ValaTokenType::RightBracket),
418                ',' => Some(ValaTokenType::Comma),
419                ';' => Some(ValaTokenType::Semicolon),
420                _ => None,
421            };
422
423            if let Some(k) = kind {
424                state.advance(ch.len_utf8());
425                state.add_token(k, start, state.get_position());
426                return true;
427            }
428        }
429        false
430    }
431}