oak_ada/lexer/
mod.rs

1use crate::{kind::AdaSyntaxKind, language::AdaLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`AdaLanguage`]; `S` is the source type being lexed.
type State<S> = LexerState<S, AdaLanguage>;

/// Whitespace scanner configuration, built lazily on first use, with
/// `unicode_whitespace` switched on. Used by `skip_whitespace`.
static ADA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment configuration whose only marker is `--`.
/// NOTE(review): not referenced by the methods visible in this file; `skip_comment` scans by hand.
static ADA_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["--"] });
/// String configuration: `"` as the quote character, no escape character.
/// NOTE(review): not referenced by the methods visible in this file; `lex_string_literal` scans by hand.
static ADA_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: None });
/// Character-literal configuration: `'` as the quote character, no escape character.
/// NOTE(review): not referenced by the methods visible in this file; `lex_char_literal` scans by hand.
static ADA_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
15
/// Lexer for Ada source text.
///
/// Borrows an [`AdaLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct AdaLexer<'config> {
    /// Language configuration the lexer was created with.
    /// NOTE(review): no method visible in this file reads this field.
    config: &'config AdaLanguage,
}
20
21impl<'config> Lexer<AdaLanguage> for AdaLexer<'config> {
22    fn lex_incremental(
23        &self,
24        source: impl Source,
25        changed: usize,
26        cache: IncrementalCache<AdaLanguage>,
27    ) -> LexOutput<AdaLanguage> {
28        let mut state = LexerState::new_with_cache(source, changed, cache);
29        let result = self.run(&mut state);
30        state.finish(result)
31    }
32}
33
34impl<'config> AdaLexer<'config> {
35    pub fn new(config: &'config AdaLanguage) -> Self {
36        Self { config }
37    }
38
39    /// 主要词法分析逻辑
40    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_char_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier_or_keyword(state) {
65                continue;
66            }
67
68            if self.lex_operators(state) {
69                continue;
70            }
71
72            if self.lex_single_char_tokens(state) {
73                continue;
74            }
75
76            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
77            if let Some(ch) = state.peek() {
78                state.advance(ch.len_utf8());
79                state.add_token(AdaSyntaxKind::Error, safe_point, state.get_position());
80            }
81        }
82
83        // 添加 EOF kind
84        let eof_pos = state.get_position();
85        state.add_token(AdaSyntaxKind::Eof, eof_pos, eof_pos);
86        Ok(())
87    }
88
89    /// 跳过空白字符
90    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
91        match ADA_WHITESPACE.scan(state.rest(), state.get_position(), AdaSyntaxKind::Whitespace) {
92            Some(token) => {
93                state.advance_with(token);
94                return true;
95            }
96            None => {}
97        }
98        false
99    }
100
101    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
102        let start = state.get_position();
103        let rest = state.rest();
104
105        // Ada line comment: -- ... until newline
106        if rest.starts_with("--") {
107            state.advance(2);
108            while let Some(ch) = state.peek() {
109                if ch == '\n' || ch == '\r' {
110                    break;
111                }
112                state.advance(ch.len_utf8());
113            }
114            state.add_token(AdaSyntaxKind::Comment, start, state.get_position());
115            return true;
116        }
117        false
118    }
119
120    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
121        let start = state.get_position();
122
123        // Ada string: "..."
124        if state.current() == Some('"') {
125            state.advance(1);
126            while let Some(ch) = state.peek() {
127                if ch == '"' {
128                    state.advance(1); // consume closing quote
129                    break;
130                }
131                state.advance(ch.len_utf8());
132                if ch == '\n' || ch == '\r' {
133                    break;
134                }
135            }
136            state.add_token(AdaSyntaxKind::StringLiteral, start, state.get_position());
137            return true;
138        }
139        false
140    }
141
142    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
143        let start = state.get_position();
144        if state.current() != Some('\'') {
145            return false;
146        }
147
148        // try parse 'x' etc.; if fails, revert
149        state.advance(1); // opening '
150        if let Some(c) = state.peek() {
151            state.advance(c.len_utf8());
152        }
153        else {
154            state.set_position(start);
155            return false;
156        }
157
158        if state.peek() == Some('\'') {
159            state.advance(1);
160            state.add_token(AdaSyntaxKind::CharacterLiteral, start, state.get_position());
161            return true;
162        }
163        state.set_position(start);
164        false
165    }
166
167    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
168        let start = state.get_position();
169
170        if let Some(ch) = state.current() {
171            if ch.is_ascii_digit() {
172                // consume digits
173                state.advance(ch.len_utf8());
174                while let Some(ch) = state.current() {
175                    if ch.is_ascii_digit() || ch == '_' {
176                        state.advance(ch.len_utf8());
177                    }
178                    else {
179                        break;
180                    }
181                }
182
183                // check for decimal point
184                if state.current() == Some('.') {
185                    state.advance(1);
186                    while let Some(ch) = state.current() {
187                        if ch.is_ascii_digit() || ch == '_' {
188                            state.advance(ch.len_utf8());
189                        }
190                        else {
191                            break;
192                        }
193                    }
194                }
195
196                // check for exponent
197                if let Some(ch) = state.current() {
198                    if ch == 'e' || ch == 'E' {
199                        state.advance(1);
200                        if let Some(sign) = state.current() {
201                            if sign == '+' || sign == '-' {
202                                state.advance(1);
203                            }
204                        }
205                        while let Some(ch) = state.current() {
206                            if ch.is_ascii_digit() {
207                                state.advance(ch.len_utf8());
208                            }
209                            else {
210                                break;
211                            }
212                        }
213                    }
214                }
215
216                state.add_token(AdaSyntaxKind::NumberLiteral, start, state.get_position());
217                return true;
218            }
219        }
220        false
221    }
222
223    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
224        let start = state.get_position();
225
226        if let Some(ch) = state.current() {
227            if ch.is_ascii_alphabetic() || ch == '_' {
228                state.advance(ch.len_utf8());
229
230                while let Some(ch) = state.current() {
231                    if ch.is_ascii_alphanumeric() || ch == '_' {
232                        state.advance(ch.len_utf8());
233                    }
234                    else {
235                        break;
236                    }
237                }
238
239                let text = state.get_text_in((start..state.get_position()).into());
240                let kind = match text.to_lowercase().as_str() {
241                    "abort" => AdaSyntaxKind::AbortKeyword,
242                    "abs" => AdaSyntaxKind::AbsKeyword,
243                    "abstract" => AdaSyntaxKind::AbstractKeyword,
244                    "accept" => AdaSyntaxKind::AcceptKeyword,
245                    "access" => AdaSyntaxKind::AccessKeyword,
246                    "aliased" => AdaSyntaxKind::AliasedKeyword,
247                    "all" => AdaSyntaxKind::AllKeyword,
248                    "and" => AdaSyntaxKind::AndKeyword,
249                    "array" => AdaSyntaxKind::ArrayKeyword,
250                    "at" => AdaSyntaxKind::AtKeyword,
251                    "begin" => AdaSyntaxKind::BeginKeyword,
252                    "body" => AdaSyntaxKind::BodyKeyword,
253                    "case" => AdaSyntaxKind::CaseKeyword,
254                    "constant" => AdaSyntaxKind::ConstantKeyword,
255                    "declare" => AdaSyntaxKind::DeclareKeyword,
256                    "delay" => AdaSyntaxKind::DelayKeyword,
257                    "delta" => AdaSyntaxKind::DeltaKeyword,
258                    "digits" => AdaSyntaxKind::DigitsKeyword,
259                    "do" => AdaSyntaxKind::DoKeyword,
260                    "else" => AdaSyntaxKind::ElseKeyword,
261                    "elsif" => AdaSyntaxKind::ElsifKeyword,
262                    "end" => AdaSyntaxKind::EndKeyword,
263                    "entry" => AdaSyntaxKind::EntryKeyword,
264                    "exception" => AdaSyntaxKind::ExceptionKeyword,
265                    "exit" => AdaSyntaxKind::ExitKeyword,
266                    "for" => AdaSyntaxKind::ForKeyword,
267                    "function" => AdaSyntaxKind::FunctionKeyword,
268                    "generic" => AdaSyntaxKind::GenericKeyword,
269                    "goto" => AdaSyntaxKind::GotoKeyword,
270                    "if" => AdaSyntaxKind::IfKeyword,
271                    "in" => AdaSyntaxKind::InKeyword,
272                    "interface" => AdaSyntaxKind::InterfaceKeyword,
273                    "is" => AdaSyntaxKind::IsKeyword,
274                    "limited" => AdaSyntaxKind::LimitedKeyword,
275                    "loop" => AdaSyntaxKind::LoopKeyword,
276                    "mod" => AdaSyntaxKind::ModKeyword,
277                    "new" => AdaSyntaxKind::NewKeyword,
278                    "not" => AdaSyntaxKind::NotKeyword,
279                    "null" => AdaSyntaxKind::NullKeyword,
280                    "of" => AdaSyntaxKind::OfKeyword,
281                    "or" => AdaSyntaxKind::OrKeyword,
282                    "others" => AdaSyntaxKind::OthersKeyword,
283                    "out" => AdaSyntaxKind::OutKeyword,
284                    "overriding" => AdaSyntaxKind::OverridingKeyword,
285                    "package" => AdaSyntaxKind::PackageKeyword,
286                    "pragma" => AdaSyntaxKind::PragmaKeyword,
287                    "private" => AdaSyntaxKind::PrivateKeyword,
288                    "procedure" => AdaSyntaxKind::ProcedureKeyword,
289                    "protected" => AdaSyntaxKind::ProtectedKeyword,
290                    "raise" => AdaSyntaxKind::RaiseKeyword,
291                    "range" => AdaSyntaxKind::RangeKeyword,
292                    "record" => AdaSyntaxKind::RecordKeyword,
293                    "rem" => AdaSyntaxKind::RemKeyword,
294                    "renames" => AdaSyntaxKind::RenamesKeyword,
295                    "requeue" => AdaSyntaxKind::RequeueKeyword,
296                    "return" => AdaSyntaxKind::ReturnKeyword,
297                    "reverse" => AdaSyntaxKind::ReverseKeyword,
298                    "select" => AdaSyntaxKind::SelectKeyword,
299                    "separate" => AdaSyntaxKind::SeparateKeyword,
300                    "subtype" => AdaSyntaxKind::SubtypeKeyword,
301                    "synchronized" => AdaSyntaxKind::SynchronizedKeyword,
302                    "tagged" => AdaSyntaxKind::TaggedKeyword,
303                    "task" => AdaSyntaxKind::TaskKeyword,
304                    "terminate" => AdaSyntaxKind::TerminateKeyword,
305                    "then" => AdaSyntaxKind::ThenKeyword,
306                    "type" => AdaSyntaxKind::TypeKeyword,
307                    "until" => AdaSyntaxKind::UntilKeyword,
308                    "use" => AdaSyntaxKind::UseKeyword,
309                    "when" => AdaSyntaxKind::WhenKeyword,
310                    "while" => AdaSyntaxKind::WhileKeyword,
311                    "with" => AdaSyntaxKind::WithKeyword,
312                    "xor" => AdaSyntaxKind::XorKeyword,
313                    _ => AdaSyntaxKind::Identifier,
314                };
315
316                state.add_token(kind, start, state.get_position());
317                return true;
318            }
319        }
320        false
321    }
322
323    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
324        let start = state.get_position();
325        let rest = state.rest();
326
327        // Multi-character operators first
328        if rest.starts_with("**") {
329            state.advance(2);
330            state.add_token(AdaSyntaxKind::DoubleStar, start, state.get_position());
331            return true;
332        }
333        if rest.starts_with("=>") {
334            state.advance(2);
335            state.add_token(AdaSyntaxKind::Arrow, start, state.get_position());
336            return true;
337        }
338        if rest.starts_with("<=") {
339            state.advance(2);
340            state.add_token(AdaSyntaxKind::LessEqual, start, state.get_position());
341            return true;
342        }
343        if rest.starts_with(">=") {
344            state.advance(2);
345            state.add_token(AdaSyntaxKind::GreaterEqual, start, state.get_position());
346            return true;
347        }
348        if rest.starts_with(":=") {
349            state.advance(2);
350            state.add_token(AdaSyntaxKind::ColonEqual, start, state.get_position());
351            return true;
352        }
353        if rest.starts_with("..") {
354            state.advance(2);
355            state.add_token(AdaSyntaxKind::DotDot, start, state.get_position());
356            return true;
357        }
358        if rest.starts_with("/=") {
359            state.advance(2);
360            state.add_token(AdaSyntaxKind::NotEqual, start, state.get_position());
361            return true;
362        }
363
364        // Single-character operators
365        if let Some(ch) = state.current() {
366            let kind = match ch {
367                '+' => AdaSyntaxKind::Plus,
368                '-' => AdaSyntaxKind::Minus,
369                '*' => AdaSyntaxKind::Star,
370                '/' => AdaSyntaxKind::Slash,
371                '=' => AdaSyntaxKind::Equal,
372                '<' => AdaSyntaxKind::Less,
373                '>' => AdaSyntaxKind::Greater,
374                '&' => AdaSyntaxKind::Ampersand,
375                '|' => AdaSyntaxKind::Pipe,
376                _ => return false,
377            };
378            state.advance(1);
379            state.add_token(kind, start, state.get_position());
380            return true;
381        }
382        false
383    }
384
385    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
386        let start = state.get_position();
387
388        if let Some(ch) = state.current() {
389            let kind = match ch {
390                '(' => AdaSyntaxKind::LeftParen,
391                ')' => AdaSyntaxKind::RightParen,
392                '[' => AdaSyntaxKind::LeftBracket,
393                ']' => AdaSyntaxKind::RightBracket,
394                '{' => AdaSyntaxKind::LeftBrace,
395                '}' => AdaSyntaxKind::RightBrace,
396                ',' => AdaSyntaxKind::Comma,
397                ';' => AdaSyntaxKind::Semicolon,
398                ':' => AdaSyntaxKind::Colon,
399                '.' => AdaSyntaxKind::Dot,
400                _ => return false,
401            };
402            state.advance(1);
403            state.add_token(kind, start, state.get_position());
404            return true;
405        }
406        false
407    }
408}