Skip to main content

oak_ada/lexer/
mod.rs

1pub mod token_type;
2
3pub use token_type::AdaTokenType;
4
5use crate::language::AdaLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{LexOutput, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
13type State<'a, S> = LexerState<'a, S, AdaLanguage>;
14
15static ADA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16
17#[derive(Clone, Debug)]
18pub struct AdaLexer<'config> {
19    config: &'config AdaLanguage,
20}
21
22impl<'config> Lexer<AdaLanguage> for AdaLexer<'config> {
23    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AdaLanguage>) -> LexOutput<AdaLanguage> {
24        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
25        let result = self.run(&mut state);
26        if result.is_ok() {
27            state.add_eof();
28        }
29        state.finish_with_cache(result, cache)
30    }
31}
32
33impl<'config> AdaLexer<'config> {
34    pub fn new(config: &'config AdaLanguage) -> Self {
35        Self { config }
36    }
37
38    /// 主要词法分析逻辑
39    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_char_literal(state) {
56                continue;
57            }
58
59            if self.lex_number_literal(state) {
60                continue;
61            }
62
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            if self.lex_operators(state) {
68                continue;
69            }
70
71            if self.lex_single_char_tokens(state) {
72                continue;
73            }
74
75            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
76            if let Some(ch) = state.peek() {
77                state.advance(ch.len_utf8());
78                state.add_token(AdaTokenType::Error, safe_point, state.get_position());
79            }
80        }
81
82        Ok(())
83    }
84
85    /// 跳过空白字符
86    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        ADA_WHITESPACE.scan(state, AdaTokenType::Whitespace)
88    }
89
90    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        let start = state.get_position();
92
93        // Ada line comment: -- ... until newline
94        if state.consume_if_starts_with("--") {
95            while let Some(ch) = state.peek() {
96                if ch == '\n' || ch == '\r' {
97                    break;
98                }
99                state.advance(ch.len_utf8());
100            }
101            state.add_token(AdaTokenType::Comment, start, state.get_position());
102            return true;
103        }
104        false
105    }
106
107    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108        let start = state.get_position();
109
110        // Ada string: "..."
111        if state.peek() == Some('"') {
112            state.advance(1);
113            while let Some(ch) = state.peek() {
114                if ch == '"' {
115                    state.advance(1); // consume closing quote
116                    if state.peek() == Some('"') {
117                        // Double quotes in Ada strings are escaped quotes
118                        state.advance(1);
119                        continue;
120                    }
121                    break;
122                }
123                state.advance(ch.len_utf8());
124                if ch == '\n' || ch == '\r' {
125                    break;
126                }
127            }
128            state.add_token(AdaTokenType::StringLiteral, start, state.get_position());
129            return true;
130        }
131        false
132    }
133
134    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
135        let start = state.get_position();
136        if state.peek() != Some('\'') {
137            return false;
138        }
139
140        // try parse 'x' etc.; if fails, revert
141        state.advance(1); // opening '
142        if let Some(c) = state.peek() {
143            state.advance(c.len_utf8());
144        }
145        else {
146            state.set_position(start);
147            return false;
148        }
149
150        if state.peek() == Some('\'') {
151            state.advance(1);
152            state.add_token(AdaTokenType::CharacterLiteral, start, state.get_position());
153            return true;
154        }
155        state.set_position(start);
156        false
157    }
158
159    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160        let start = state.get_position();
161
162        if let Some(ch) = state.peek() {
163            if ch.is_ascii_digit() {
164                // consume digits
165                state.advance(ch.len_utf8());
166                while let Some(ch) = state.peek() {
167                    if ch.is_ascii_digit() || ch == '_' {
168                        state.advance(ch.len_utf8());
169                    }
170                    else {
171                        break;
172                    }
173                }
174
175                // check for decimal point
176                if state.peek() == Some('.') {
177                    state.advance(1);
178                    while let Some(ch) = state.peek() {
179                        if ch.is_ascii_digit() || ch == '_' {
180                            state.advance(ch.len_utf8());
181                        }
182                        else {
183                            break;
184                        }
185                    }
186                }
187
188                // check for exponent
189                if let Some(ch) = state.peek() {
190                    if ch == 'e' || ch == 'E' {
191                        state.advance(1);
192                        if let Some(sign) = state.peek() {
193                            if sign == '+' || sign == '-' {
194                                state.advance(1);
195                            }
196                        }
197                        while let Some(ch) = state.peek() {
198                            if ch.is_ascii_digit() {
199                                state.advance(ch.len_utf8());
200                            }
201                            else {
202                                break;
203                            }
204                        }
205                    }
206                }
207
208                state.add_token(AdaTokenType::NumberLiteral, start, state.get_position());
209                return true;
210            }
211        }
212        false
213    }
214
215    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
216        let start = state.get_position();
217
218        if let Some(ch) = state.peek() {
219            if ch.is_ascii_alphabetic() || ch == '_' {
220                state.advance(ch.len_utf8());
221
222                while let Some(ch) = state.peek() {
223                    if ch.is_ascii_alphanumeric() || ch == '_' {
224                        state.advance(ch.len_utf8());
225                    }
226                    else {
227                        break;
228                    }
229                }
230
231                let end = state.get_position();
232                let text = state.get_text_in((start..end).into());
233                let kind = match text.to_lowercase().as_str() {
234                    "abort" => AdaTokenType::Abort,
235                    "abs" => AdaTokenType::Abs,
236                    "abstract" => AdaTokenType::Abstract,
237                    "accept" => AdaTokenType::Accept,
238                    "access" => AdaTokenType::Access,
239                    "aliased" => AdaTokenType::Aliased,
240                    "all" => AdaTokenType::All,
241                    "and" => AdaTokenType::And,
242                    "array" => AdaTokenType::Array,
243                    "at" => AdaTokenType::At,
244                    "begin" => AdaTokenType::Begin,
245                    "body" => AdaTokenType::Body,
246                    "case" => AdaTokenType::Case,
247                    "constant" => AdaTokenType::Constant,
248                    "declare" => AdaTokenType::Declare,
249                    "delay" => AdaTokenType::Delay,
250                    "delta" => AdaTokenType::Delta,
251                    "digits" => AdaTokenType::Digits,
252                    "do" => AdaTokenType::Do,
253                    "else" => AdaTokenType::Else,
254                    "elsif" => AdaTokenType::Elsif,
255                    "end" => AdaTokenType::End,
256                    "entry" => AdaTokenType::Entry,
257                    "exception" => AdaTokenType::Exception,
258                    "exit" => AdaTokenType::Exit,
259                    "for" => AdaTokenType::For,
260                    "function" => AdaTokenType::Function,
261                    "generic" => AdaTokenType::Generic,
262                    "goto" => AdaTokenType::Goto,
263                    "if" => AdaTokenType::If,
264                    "in" => AdaTokenType::In,
265                    "interface" => AdaTokenType::Interface,
266                    "is" => AdaTokenType::Is,
267                    "limited" => AdaTokenType::Limited,
268                    "loop" => AdaTokenType::Loop,
269                    "mod" => AdaTokenType::Mod,
270                    "new" => AdaTokenType::New,
271                    "not" => AdaTokenType::Not,
272                    "null" => AdaTokenType::Null,
273                    "of" => AdaTokenType::Of,
274                    "or" => AdaTokenType::Or,
275                    "others" => AdaTokenType::Others,
276                    "out" => AdaTokenType::Out,
277                    "overriding" => AdaTokenType::Overriding,
278                    "package" => AdaTokenType::Package,
279                    "pragma" => AdaTokenType::Pragma,
280                    "private" => AdaTokenType::Private,
281                    "procedure" => AdaTokenType::Procedure,
282                    "protected" => AdaTokenType::Protected,
283                    "raise" => AdaTokenType::Raise,
284                    "range" => AdaTokenType::Range,
285                    "record" => AdaTokenType::Record,
286                    "rem" => AdaTokenType::Rem,
287                    "renames" => AdaTokenType::Renames,
288                    "requeue" => AdaTokenType::Requeue,
289                    "return" => AdaTokenType::Return,
290                    "reverse" => AdaTokenType::Reverse,
291                    "select" => AdaTokenType::Select,
292                    "separate" => AdaTokenType::Separate,
293                    "some" => AdaTokenType::Some,
294                    "subtype" => AdaTokenType::Subtype,
295                    "synchronized" => AdaTokenType::Synchronized,
296                    "tagged" => AdaTokenType::Tagged,
297                    "task" => AdaTokenType::Task,
298                    "terminate" => AdaTokenType::Terminate,
299                    "then" => AdaTokenType::Then,
300                    "type" => AdaTokenType::Type,
301                    "until" => AdaTokenType::Until,
302                    "use" => AdaTokenType::Use,
303                    "when" => AdaTokenType::When,
304                    "while" => AdaTokenType::While,
305                    "with" => AdaTokenType::With,
306                    "xor" => AdaTokenType::Xor,
307                    _ => AdaTokenType::Identifier,
308                };
309
310                state.add_token(kind, start, end);
311                return true;
312            }
313        }
314        false
315    }
316
317    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
318        let start = state.get_position();
319
320        // Multi-character operators first
321        if state.consume_if_starts_with("**") {
322            state.add_token(AdaTokenType::StarStar, start, state.get_position());
323            return true;
324        }
325        if state.consume_if_starts_with("=>") {
326            state.add_token(AdaTokenType::Arrow, start, state.get_position());
327            return true;
328        }
329        if state.consume_if_starts_with("<=") {
330            state.add_token(AdaTokenType::Le, start, state.get_position());
331            return true;
332        }
333        if state.consume_if_starts_with(">=") {
334            state.add_token(AdaTokenType::Ge, start, state.get_position());
335            return true;
336        }
337        if state.consume_if_starts_with(":=") {
338            state.add_token(AdaTokenType::ColonEq, start, state.get_position());
339            return true;
340        }
341        if state.consume_if_starts_with("..") {
342            state.add_token(AdaTokenType::DotDot, start, state.get_position());
343            return true;
344        }
345        if state.consume_if_starts_with("/=") {
346            state.add_token(AdaTokenType::Ne, start, state.get_position());
347            return true;
348        }
349        if state.consume_if_starts_with("<<") {
350            state.add_token(AdaTokenType::LtLt, start, state.get_position());
351            return true;
352        }
353        if state.consume_if_starts_with(">>") {
354            state.add_token(AdaTokenType::GtGt, start, state.get_position());
355            return true;
356        }
357        if state.consume_if_starts_with("<>") {
358            state.add_token(AdaTokenType::Box, start, state.get_position());
359            return true;
360        }
361
362        // Single-character operators
363        if let Some(ch) = state.peek() {
364            let kind = match ch {
365                '+' => AdaTokenType::Plus,
366                '-' => AdaTokenType::Minus,
367                '*' => AdaTokenType::Star,
368                '/' => AdaTokenType::Slash,
369                '=' => AdaTokenType::Eq,
370                '<' => AdaTokenType::Lt,
371                '>' => AdaTokenType::Gt,
372                '&' => AdaTokenType::Ampersand,
373                '|' => AdaTokenType::Pipe,
374                _ => return false,
375            };
376            state.advance(1);
377            state.add_token(kind, start, state.get_position());
378            return true;
379        }
380        false
381    }
382
383    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
384        let start = state.get_position();
385
386        if let Some(ch) = state.peek() {
387            let kind = match ch {
388                '(' => AdaTokenType::LeftParen,
389                ')' => AdaTokenType::RightParen,
390                '[' => AdaTokenType::LeftBracket,
391                ']' => AdaTokenType::RightBracket,
392                '{' => AdaTokenType::LeftBrace,
393                '}' => AdaTokenType::RightBrace,
394                ',' => AdaTokenType::Comma,
395                ';' => AdaTokenType::Semicolon,
396                ':' => AdaTokenType::Colon,
397                '.' => AdaTokenType::Dot,
398                '\'' => AdaTokenType::Apostrophe,
399                _ => return false,
400            };
401            state.advance(1);
402            state.add_token(kind, start, state.get_position());
403            return true;
404        }
405        false
406    }
407}