oak_ada/lexer/
mod.rs

1pub mod token_type;
2
3pub use token_type::AdaTokenType;
4
5use crate::language::AdaLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{LexOutput, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
/// Lexer state specialised to [`AdaLanguage`]; `S` is the source type being scanned.
type State<'a, S> = LexerState<'a, S, AdaLanguage>;

/// Whitespace configuration shared by every [`AdaLexer`], constructed lazily on first access.
///
/// NOTE(review): `unicode_whitespace: true` presumably makes the scanner accept non-ASCII
/// whitespace as well — confirm against `WhitespaceConfig` in `oak_core`.
static ADA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16
/// Lexer for Ada source text.
///
/// The type carries no fields; all scanning state lives in the `LexerState` created per
/// `lex` call.
#[derive(Clone)]
pub struct AdaLexer;
19
20impl Lexer<AdaLanguage> for AdaLexer {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AdaLanguage>) -> LexOutput<AdaLanguage> {
22        let mut state: State<'_, S> = LexerState::new(source);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl AdaLexer {
    /// Creates a lexer.
    ///
    /// The language configuration is accepted but currently unused.
    pub fn new(_config: &AdaLanguage) -> Self {
        Self
    }
35
36    /// 主要词法分析逻辑
37    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_string_literal(state) {
50                continue;
51            }
52
53            if self.lex_char_literal(state) {
54                continue;
55            }
56
57            if self.lex_number_literal(state) {
58                continue;
59            }
60
61            if self.lex_identifier_or_keyword(state) {
62                continue;
63            }
64
65            if self.lex_operators(state) {
66                continue;
67            }
68
69            if self.lex_single_char_tokens(state) {
70                continue;
71            }
72
73            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
74            if let Some(ch) = state.peek() {
75                state.advance(ch.len_utf8());
76                state.add_token(AdaTokenType::Error, safe_point, state.get_position());
77            }
78        }
79
80        Ok(())
81    }
82
    /// Skips whitespace.
    ///
    /// Delegates to the shared `ADA_WHITESPACE` configuration, tagging what it scans as
    /// `Whitespace`, and returns the scanner's result unchanged.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        ADA_WHITESPACE.scan(state, AdaTokenType::Whitespace)
    }
87
88    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89        let start = state.get_position();
90
91        // Ada line comment: -- ... until newline
92        if state.consume_if_starts_with("--") {
93            while let Some(ch) = state.peek() {
94                if ch == '\n' || ch == '\r' {
95                    break;
96                }
97                state.advance(ch.len_utf8());
98            }
99            state.add_token(AdaTokenType::Comment, start, state.get_position());
100            return true;
101        }
102        false
103    }
104
105    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106        let start = state.get_position();
107
108        // Ada string: "..."
109        if state.peek() == Some('"') {
110            state.advance(1);
111            while let Some(ch) = state.peek() {
112                if ch == '"' {
113                    state.advance(1); // consume closing quote
114                    if state.peek() == Some('"') {
115                        // Double quotes in Ada strings are escaped quotes
116                        state.advance(1);
117                        continue;
118                    }
119                    break;
120                }
121                state.advance(ch.len_utf8());
122                if ch == '\n' || ch == '\r' {
123                    break;
124                }
125            }
126            state.add_token(AdaTokenType::StringLiteral, start, state.get_position());
127            return true;
128        }
129        false
130    }
131
132    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
133        let start = state.get_position();
134        if state.peek() != Some('\'') {
135            return false;
136        }
137
138        // try parse 'x' etc.; if fails, revert
139        state.advance(1); // opening '
140        if let Some(c) = state.peek() {
141            state.advance(c.len_utf8());
142        }
143        else {
144            state.set_position(start);
145            return false;
146        }
147
148        if state.peek() == Some('\'') {
149            state.advance(1);
150            state.add_token(AdaTokenType::CharacterLiteral, start, state.get_position());
151            return true;
152        }
153        state.set_position(start);
154        false
155    }
156
157    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
158        let start = state.get_position();
159
160        if let Some(ch) = state.peek() {
161            if ch.is_ascii_digit() {
162                // consume digits
163                state.advance(ch.len_utf8());
164                while let Some(ch) = state.peek() {
165                    if ch.is_ascii_digit() || ch == '_' {
166                        state.advance(ch.len_utf8());
167                    }
168                    else {
169                        break;
170                    }
171                }
172
173                // check for decimal point
174                if state.peek() == Some('.') {
175                    state.advance(1);
176                    while let Some(ch) = state.peek() {
177                        if ch.is_ascii_digit() || ch == '_' {
178                            state.advance(ch.len_utf8());
179                        }
180                        else {
181                            break;
182                        }
183                    }
184                }
185
186                // check for exponent
187                if let Some(ch) = state.peek() {
188                    if ch == 'e' || ch == 'E' {
189                        state.advance(1);
190                        if let Some(sign) = state.peek() {
191                            if sign == '+' || sign == '-' {
192                                state.advance(1);
193                            }
194                        }
195                        while let Some(ch) = state.peek() {
196                            if ch.is_ascii_digit() {
197                                state.advance(ch.len_utf8());
198                            }
199                            else {
200                                break;
201                            }
202                        }
203                    }
204                }
205
206                state.add_token(AdaTokenType::NumberLiteral, start, state.get_position());
207                return true;
208            }
209        }
210        false
211    }
212
213    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
214        let start = state.get_position();
215
216        if let Some(ch) = state.peek() {
217            if ch.is_ascii_alphabetic() || ch == '_' {
218                state.advance(ch.len_utf8());
219
220                while let Some(ch) = state.peek() {
221                    if ch.is_ascii_alphanumeric() || ch == '_' {
222                        state.advance(ch.len_utf8());
223                    }
224                    else {
225                        break;
226                    }
227                }
228
229                let end = state.get_position();
230                let text = state.get_text_in((start..end).into());
231                let kind = match text.to_lowercase().as_str() {
232                    "abort" => AdaTokenType::Abort,
233                    "abs" => AdaTokenType::Abs,
234                    "abstract" => AdaTokenType::Abstract,
235                    "accept" => AdaTokenType::Accept,
236                    "access" => AdaTokenType::Access,
237                    "aliased" => AdaTokenType::Aliased,
238                    "all" => AdaTokenType::All,
239                    "and" => AdaTokenType::And,
240                    "array" => AdaTokenType::Array,
241                    "at" => AdaTokenType::At,
242                    "begin" => AdaTokenType::Begin,
243                    "body" => AdaTokenType::Body,
244                    "case" => AdaTokenType::Case,
245                    "constant" => AdaTokenType::Constant,
246                    "declare" => AdaTokenType::Declare,
247                    "delay" => AdaTokenType::Delay,
248                    "delta" => AdaTokenType::Delta,
249                    "digits" => AdaTokenType::Digits,
250                    "do" => AdaTokenType::Do,
251                    "else" => AdaTokenType::Else,
252                    "elsif" => AdaTokenType::Elsif,
253                    "end" => AdaTokenType::End,
254                    "entry" => AdaTokenType::Entry,
255                    "exception" => AdaTokenType::Exception,
256                    "exit" => AdaTokenType::Exit,
257                    "for" => AdaTokenType::For,
258                    "function" => AdaTokenType::Function,
259                    "generic" => AdaTokenType::Generic,
260                    "goto" => AdaTokenType::Goto,
261                    "if" => AdaTokenType::If,
262                    "in" => AdaTokenType::In,
263                    "interface" => AdaTokenType::Interface,
264                    "is" => AdaTokenType::Is,
265                    "limited" => AdaTokenType::Limited,
266                    "loop" => AdaTokenType::Loop,
267                    "mod" => AdaTokenType::Mod,
268                    "new" => AdaTokenType::New,
269                    "not" => AdaTokenType::Not,
270                    "null" => AdaTokenType::Null,
271                    "of" => AdaTokenType::Of,
272                    "or" => AdaTokenType::Or,
273                    "others" => AdaTokenType::Others,
274                    "out" => AdaTokenType::Out,
275                    "overriding" => AdaTokenType::Overriding,
276                    "package" => AdaTokenType::Package,
277                    "pragma" => AdaTokenType::Pragma,
278                    "private" => AdaTokenType::Private,
279                    "procedure" => AdaTokenType::Procedure,
280                    "protected" => AdaTokenType::Protected,
281                    "raise" => AdaTokenType::Raise,
282                    "range" => AdaTokenType::Range,
283                    "record" => AdaTokenType::Record,
284                    "rem" => AdaTokenType::Rem,
285                    "renames" => AdaTokenType::Renames,
286                    "requeue" => AdaTokenType::Requeue,
287                    "return" => AdaTokenType::Return,
288                    "reverse" => AdaTokenType::Reverse,
289                    "select" => AdaTokenType::Select,
290                    "separate" => AdaTokenType::Separate,
291                    "some" => AdaTokenType::Some,
292                    "subtype" => AdaTokenType::Subtype,
293                    "synchronized" => AdaTokenType::Synchronized,
294                    "tagged" => AdaTokenType::Tagged,
295                    "task" => AdaTokenType::Task,
296                    "terminate" => AdaTokenType::Terminate,
297                    "then" => AdaTokenType::Then,
298                    "type" => AdaTokenType::Type,
299                    "until" => AdaTokenType::Until,
300                    "use" => AdaTokenType::Use,
301                    "when" => AdaTokenType::When,
302                    "while" => AdaTokenType::While,
303                    "with" => AdaTokenType::With,
304                    "xor" => AdaTokenType::Xor,
305                    _ => AdaTokenType::Identifier,
306                };
307
308                state.add_token(kind, start, end);
309                return true;
310            }
311        }
312        false
313    }
314
315    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
316        let start = state.get_position();
317
318        // Multi-character operators first
319        if state.consume_if_starts_with("**") {
320            state.add_token(AdaTokenType::StarStar, start, state.get_position());
321            return true;
322        }
323        if state.consume_if_starts_with("=>") {
324            state.add_token(AdaTokenType::Arrow, start, state.get_position());
325            return true;
326        }
327        if state.consume_if_starts_with("<=") {
328            state.add_token(AdaTokenType::Le, start, state.get_position());
329            return true;
330        }
331        if state.consume_if_starts_with(">=") {
332            state.add_token(AdaTokenType::Ge, start, state.get_position());
333            return true;
334        }
335        if state.consume_if_starts_with(":=") {
336            state.add_token(AdaTokenType::ColonEq, start, state.get_position());
337            return true;
338        }
339        if state.consume_if_starts_with("..") {
340            state.add_token(AdaTokenType::DotDot, start, state.get_position());
341            return true;
342        }
343        if state.consume_if_starts_with("/=") {
344            state.add_token(AdaTokenType::Ne, start, state.get_position());
345            return true;
346        }
347        if state.consume_if_starts_with("<<") {
348            state.add_token(AdaTokenType::LtLt, start, state.get_position());
349            return true;
350        }
351        if state.consume_if_starts_with(">>") {
352            state.add_token(AdaTokenType::GtGt, start, state.get_position());
353            return true;
354        }
355        if state.consume_if_starts_with("<>") {
356            state.add_token(AdaTokenType::Box, start, state.get_position());
357            return true;
358        }
359
360        // Single-character operators
361        if let Some(ch) = state.peek() {
362            let kind = match ch {
363                '+' => AdaTokenType::Plus,
364                '-' => AdaTokenType::Minus,
365                '*' => AdaTokenType::Star,
366                '/' => AdaTokenType::Slash,
367                '=' => AdaTokenType::Eq,
368                '<' => AdaTokenType::Lt,
369                '>' => AdaTokenType::Gt,
370                '&' => AdaTokenType::Ampersand,
371                '|' => AdaTokenType::Pipe,
372                _ => return false,
373            };
374            state.advance(1);
375            state.add_token(kind, start, state.get_position());
376            return true;
377        }
378        false
379    }
380
381    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
382        let start = state.get_position();
383
384        if let Some(ch) = state.peek() {
385            let kind = match ch {
386                '(' => AdaTokenType::LeftParen,
387                ')' => AdaTokenType::RightParen,
388                '[' => AdaTokenType::LeftBracket,
389                ']' => AdaTokenType::RightBracket,
390                '{' => AdaTokenType::LeftBrace,
391                '}' => AdaTokenType::RightBrace,
392                ',' => AdaTokenType::Comma,
393                ';' => AdaTokenType::Semicolon,
394                ':' => AdaTokenType::Colon,
395                '.' => AdaTokenType::Dot,
396                '\'' => AdaTokenType::Apostrophe,
397                _ => return false,
398            };
399            state.advance(1);
400            state.add_token(kind, start, state.get_position());
401            return true;
402        }
403        false
404    }
405}