// oak_ada/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::AdaTokenType;
5
6use crate::language::AdaLanguage;
7use oak_core::{
8    Lexer, LexerCache, LexerState, OakError,
9    lexer::{LexOutput, WhitespaceConfig},
10    source::Source,
11};
12use std::sync::LazyLock;
13
/// Shorthand for the lexer state specialised to the Ada language.
type State<'a, S> = LexerState<'a, S, AdaLanguage>;

/// Shared whitespace scanner configuration; accepts Unicode whitespace,
/// not just ASCII.
static ADA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
17
/// Hand-written lexer for Ada source text.
#[derive(Clone, Debug)]
pub struct AdaLexer<'config> {
    // Borrowed language configuration. NOTE(review): not consulted by the
    // scanning routines visible in this module — kept for future use/API
    // symmetry; confirm against other oak_* lexers.
    config: &'config AdaLanguage,
}
22
23impl<'config> Lexer<AdaLanguage> for AdaLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AdaLanguage>) -> LexOutput<AdaLanguage> {
25        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof()
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
34impl<'config> AdaLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config AdaLanguage) -> Self {
        Self { config }
    }
38
39    /// 主要词法分析逻辑
40    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_char_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier_or_keyword(state) {
65                continue;
66            }
67
68            if self.lex_operators(state) {
69                continue;
70            }
71
72            if self.lex_single_char_tokens(state) {
73                continue;
74            }
75
76            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
77            if let Some(ch) = state.peek() {
78                state.advance(ch.len_utf8());
79                state.add_token(AdaTokenType::Error, safe_point, state.get_position())
80            }
81        }
82
83        Ok(())
84    }
85
    /// Skips a run of whitespace by delegating to the shared
    /// `ADA_WHITESPACE` scanner; returns whether any input was consumed.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        ADA_WHITESPACE.scan(state, AdaTokenType::Whitespace)
    }
90
91    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
92        let start = state.get_position();
93
94        // Ada line comment: -- ... until newline
95        if state.consume_if_starts_with("--") {
96            while let Some(ch) = state.peek() {
97                if ch == '\n' || ch == '\r' {
98                    break;
99                }
100                state.advance(ch.len_utf8())
101            }
102            state.add_token(AdaTokenType::Comment, start, state.get_position());
103            return true;
104        }
105        false
106    }
107
108    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
109        let start = state.get_position();
110
111        // Ada string: "..."
112        if state.peek() == Some('"') {
113            state.advance(1);
114            while let Some(ch) = state.peek() {
115                if ch == '"' {
116                    state.advance(1); // consume closing quote
117                    if state.peek() == Some('"') {
118                        // Double quotes in Ada strings are escaped quotes
119                        state.advance(1);
120                        continue;
121                    }
122                    break;
123                }
124                state.advance(ch.len_utf8());
125                if ch == '\n' || ch == '\r' {
126                    break;
127                }
128            }
129            state.add_token(AdaTokenType::StringLiteral, start, state.get_position());
130            return true;
131        }
132        false
133    }
134
135    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
136        let start = state.get_position();
137        if state.peek() != Some('\'') {
138            return false;
139        }
140
141        // try parse 'x' etc.; if fails, revert
142        state.advance(1); // opening '
143        if let Some(c) = state.peek() {
144            state.advance(c.len_utf8())
145        }
146        else {
147            state.set_position(start);
148            return false;
149        }
150
151        if state.peek() == Some('\'') {
152            state.advance(1);
153            state.add_token(AdaTokenType::CharacterLiteral, start, state.get_position());
154            return true;
155        }
156        state.set_position(start);
157        false
158    }
159
160    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
161        let start = state.get_position();
162
163        if let Some(ch) = state.peek() {
164            if ch.is_ascii_digit() {
165                // consume digits
166                state.advance(ch.len_utf8());
167                while let Some(ch) = state.peek() {
168                    if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
169                }
170
171                // check for decimal point
172                if state.peek() == Some('.') {
173                    state.advance(1);
174                    while let Some(ch) = state.peek() {
175                        if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
176                    }
177                }
178
179                // check for exponent
180                if let Some(ch) = state.peek() {
181                    if ch == 'e' || ch == 'E' {
182                        state.advance(1);
183                        if let Some(sign) = state.peek() {
184                            if sign == '+' || sign == '-' {
185                                state.advance(1)
186                            }
187                        }
188                        while let Some(ch) = state.peek() {
189                            if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
190                        }
191                    }
192                }
193
194                state.add_token(AdaTokenType::NumberLiteral, start, state.get_position());
195                return true;
196            }
197        }
198        false
199    }
200
    /// Scans an identifier (`[A-Za-z_][A-Za-z0-9_]*`) and classifies it as
    /// a reserved word when it matches one case-insensitively (Ada
    /// keywords are case-insensitive, hence the `to_lowercase` below).
    ///
    /// NOTE(review): a leading `_` is accepted here although it is not a
    /// legal Ada identifier start — presumably deliberate leniency for
    /// error recovery; confirm.
    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' {
                state.advance(ch.len_utf8());

                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
                }

                let end = state.get_position();
                let text = state.get_text_in((start..end).into());
                // Reserved-word table (Ada 2012 set); anything else is a
                // plain identifier.
                let kind = match text.to_lowercase().as_str() {
                    "abort" => AdaTokenType::Abort,
                    "abs" => AdaTokenType::Abs,
                    "abstract" => AdaTokenType::Abstract,
                    "accept" => AdaTokenType::Accept,
                    "access" => AdaTokenType::Access,
                    "aliased" => AdaTokenType::Aliased,
                    "all" => AdaTokenType::All,
                    "and" => AdaTokenType::And,
                    "array" => AdaTokenType::Array,
                    "at" => AdaTokenType::At,
                    "begin" => AdaTokenType::Begin,
                    "body" => AdaTokenType::Body,
                    "case" => AdaTokenType::Case,
                    "constant" => AdaTokenType::Constant,
                    "declare" => AdaTokenType::Declare,
                    "delay" => AdaTokenType::Delay,
                    "delta" => AdaTokenType::Delta,
                    "digits" => AdaTokenType::Digits,
                    "do" => AdaTokenType::Do,
                    "else" => AdaTokenType::Else,
                    "elsif" => AdaTokenType::Elsif,
                    "end" => AdaTokenType::End,
                    "entry" => AdaTokenType::Entry,
                    "exception" => AdaTokenType::Exception,
                    "exit" => AdaTokenType::Exit,
                    "for" => AdaTokenType::For,
                    "function" => AdaTokenType::Function,
                    "generic" => AdaTokenType::Generic,
                    "goto" => AdaTokenType::Goto,
                    "if" => AdaTokenType::If,
                    "in" => AdaTokenType::In,
                    "interface" => AdaTokenType::Interface,
                    "is" => AdaTokenType::Is,
                    "limited" => AdaTokenType::Limited,
                    "loop" => AdaTokenType::Loop,
                    "mod" => AdaTokenType::Mod,
                    "new" => AdaTokenType::New,
                    "not" => AdaTokenType::Not,
                    "null" => AdaTokenType::Null,
                    "of" => AdaTokenType::Of,
                    "or" => AdaTokenType::Or,
                    "others" => AdaTokenType::Others,
                    "out" => AdaTokenType::Out,
                    "overriding" => AdaTokenType::Overriding,
                    "package" => AdaTokenType::Package,
                    "pragma" => AdaTokenType::Pragma,
                    "private" => AdaTokenType::Private,
                    "procedure" => AdaTokenType::Procedure,
                    "protected" => AdaTokenType::Protected,
                    "raise" => AdaTokenType::Raise,
                    "range" => AdaTokenType::Range,
                    "record" => AdaTokenType::Record,
                    "rem" => AdaTokenType::Rem,
                    "renames" => AdaTokenType::Renames,
                    "requeue" => AdaTokenType::Requeue,
                    "return" => AdaTokenType::Return,
                    "reverse" => AdaTokenType::Reverse,
                    "select" => AdaTokenType::Select,
                    "separate" => AdaTokenType::Separate,
                    "some" => AdaTokenType::Some,
                    "subtype" => AdaTokenType::Subtype,
                    "synchronized" => AdaTokenType::Synchronized,
                    "tagged" => AdaTokenType::Tagged,
                    "task" => AdaTokenType::Task,
                    "terminate" => AdaTokenType::Terminate,
                    "then" => AdaTokenType::Then,
                    "type" => AdaTokenType::Type,
                    "until" => AdaTokenType::Until,
                    "use" => AdaTokenType::Use,
                    "when" => AdaTokenType::When,
                    "while" => AdaTokenType::While,
                    "with" => AdaTokenType::With,
                    "xor" => AdaTokenType::Xor,
                    _ => AdaTokenType::Identifier,
                };

                state.add_token(kind, start, end);
                return true;
            }
        }
        false
    }
297
298    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
299        let start = state.get_position();
300
301        // Multi-character operators first
302        if state.consume_if_starts_with("**") {
303            state.add_token(AdaTokenType::StarStar, start, state.get_position());
304            return true;
305        }
306        if state.consume_if_starts_with("=>") {
307            state.add_token(AdaTokenType::Arrow, start, state.get_position());
308            return true;
309        }
310        if state.consume_if_starts_with("<=") {
311            state.add_token(AdaTokenType::Le, start, state.get_position());
312            return true;
313        }
314        if state.consume_if_starts_with(">=") {
315            state.add_token(AdaTokenType::Ge, start, state.get_position());
316            return true;
317        }
318        if state.consume_if_starts_with(":=") {
319            state.add_token(AdaTokenType::ColonEq, start, state.get_position());
320            return true;
321        }
322        if state.consume_if_starts_with("..") {
323            state.add_token(AdaTokenType::DotDot, start, state.get_position());
324            return true;
325        }
326        if state.consume_if_starts_with("/=") {
327            state.add_token(AdaTokenType::Ne, start, state.get_position());
328            return true;
329        }
330        if state.consume_if_starts_with("<<") {
331            state.add_token(AdaTokenType::LtLt, start, state.get_position());
332            return true;
333        }
334        if state.consume_if_starts_with(">>") {
335            state.add_token(AdaTokenType::GtGt, start, state.get_position());
336            return true;
337        }
338        if state.consume_if_starts_with("<>") {
339            state.add_token(AdaTokenType::Box, start, state.get_position());
340            return true;
341        }
342
343        // Single-character operators
344        if let Some(ch) = state.peek() {
345            let kind = match ch {
346                '+' => AdaTokenType::Plus,
347                '-' => AdaTokenType::Minus,
348                '*' => AdaTokenType::Star,
349                '/' => AdaTokenType::Slash,
350                '=' => AdaTokenType::Eq,
351                '<' => AdaTokenType::Lt,
352                '>' => AdaTokenType::Gt,
353                '&' => AdaTokenType::Ampersand,
354                '|' => AdaTokenType::Pipe,
355                _ => return false,
356            };
357            state.advance(1);
358            state.add_token(kind, start, state.get_position());
359            return true;
360        }
361        false
362    }
363
364    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
365        let start = state.get_position();
366
367        if let Some(ch) = state.peek() {
368            let kind = match ch {
369                '(' => AdaTokenType::LeftParen,
370                ')' => AdaTokenType::RightParen,
371                '[' => AdaTokenType::LeftBracket,
372                ']' => AdaTokenType::RightBracket,
373                '{' => AdaTokenType::LeftBrace,
374                '}' => AdaTokenType::RightBrace,
375                ',' => AdaTokenType::Comma,
376                ';' => AdaTokenType::Semicolon,
377                ':' => AdaTokenType::Colon,
378                '.' => AdaTokenType::Dot,
379                '\'' => AdaTokenType::Apostrophe,
380                _ => return false,
381            };
382            state.advance(1);
383            state.add_token(kind, start, state.get_position());
384            return true;
385        }
386        false
387    }
388}