// oak_ada/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2/// Ada 词法单元类型定义。
3pub mod token_type;
4
5pub use token_type::AdaTokenType;
6
7use crate::language::AdaLanguage;
8use oak_core::{
9    Lexer, LexerCache, LexerState, OakError,
10    lexer::{LexOutput, WhitespaceConfig},
11    source::Source,
12};
13use std::sync::LazyLock;
14
/// Shorthand for the generic lexer state specialized to the Ada language.
pub(crate) type State<'a, S> = LexerState<'a, S, AdaLanguage>;

/// Shared whitespace-scanner configuration, built once on first use.
/// `unicode_whitespace: true` means non-ASCII whitespace is also skipped.
static ADA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
18
/// Lexer for the Ada language.
#[derive(Clone, Debug)]
pub struct AdaLexer<'config> {
    // Borrowed language configuration. NOTE(review): none of the scanning
    // routines in this module read it yet — presumably reserved for future
    // configurable lexing; confirm before removing.
    config: &'config AdaLanguage,
}
24
impl<'config> Lexer<AdaLanguage> for AdaLexer<'config> {
    /// Tokenizes `source` from offset 0.
    ///
    /// `_edits` is accepted to satisfy the `Lexer` trait's incremental
    /// interface but is ignored here: every call re-lexes the whole input
    /// (any reuse happens inside `cache`, not in this method).
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<AdaLanguage>) -> LexOutput<AdaLanguage> {
        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Append the EOF token only on success; on error the partial token
        // stream is handed back as-is together with the error result.
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
35
36impl<'config> AdaLexer<'config> {
37    /// 使用给定的语言配置创建新的 `AdaLexer`。
38    pub fn new(config: &'config AdaLanguage) -> Self {
39        Self { config }
40    }
41
42    /// 主要词法分析逻辑
43    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
44        while state.not_at_end() {
45            let safe_point = state.get_position();
46
47            if self.skip_whitespace(state) {
48                continue;
49            }
50
51            if self.skip_comment(state) {
52                continue;
53            }
54
55            if self.lex_string_literal(state) {
56                continue;
57            }
58
59            if self.lex_char_literal(state) {
60                continue;
61            }
62
63            if self.lex_number_literal(state) {
64                continue;
65            }
66
67            if self.lex_identifier_or_keyword(state) {
68                continue;
69            }
70
71            if self.lex_operators(state) {
72                continue;
73            }
74
75            if self.lex_single_char_tokens(state) {
76                continue;
77            }
78
79            // 如果没有匹配任何模式,跳过当前字符并生成 Error token
80            if let Some(ch) = state.peek() {
81                state.advance(ch.len_utf8());
82                state.add_token(AdaTokenType::Error, safe_point, state.get_position())
83            }
84        }
85
86        Ok(())
87    }
88
    /// Skips a run of whitespace using the shared `ADA_WHITESPACE`
    /// configuration, emitting a `Whitespace` token. The returned flag comes
    /// straight from `WhitespaceConfig::scan`; `run` treats `true` as
    /// "a token was produced, restart the dispatch loop".
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        ADA_WHITESPACE.scan(state, AdaTokenType::Whitespace)
    }
93
94    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start = state.get_position();
96
97        // Ada line comment: -- ... until newline
98        if state.consume_if_starts_with("--") {
99            while let Some(ch) = state.peek() {
100                if ch == '\n' || ch == '\r' {
101                    break;
102                }
103                state.advance(ch.len_utf8())
104            }
105            state.add_token(AdaTokenType::Comment, start, state.get_position());
106            return true;
107        }
108        false
109    }
110
111    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        let start = state.get_position();
113
114        // Ada string: "..."
115        if state.peek() == Some('"') {
116            state.advance(1);
117            while let Some(ch) = state.peek() {
118                if ch == '"' {
119                    state.advance(1); // consume closing quote
120                    if state.peek() == Some('"') {
121                        // Double quotes in Ada strings are escaped quotes
122                        state.advance(1);
123                        continue;
124                    }
125                    break;
126                }
127                state.advance(ch.len_utf8());
128                if ch == '\n' || ch == '\r' {
129                    break;
130                }
131            }
132            state.add_token(AdaTokenType::StringLiteral, start, state.get_position());
133            return true;
134        }
135        false
136    }
137
    /// Attempts to lex an Ada character literal of the form `'x'`.
    ///
    /// The apostrophe is also Ada's attribute tick (e.g. `X'First`), so this
    /// scan is speculative: consume `'`, one arbitrary character, and require
    /// a closing `'`; on any failure the position is rewound so the tick can
    /// later be emitted as `Apostrophe` by `lex_single_char_tokens`.
    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        if state.peek() != Some('\'') {
            return false;
        }

        // try parse 'x' etc.; if fails, revert
        state.advance(1); // opening '
        if let Some(c) = state.peek() {
            // The quoted character may be any char, including `'` itself,
            // so `'''` lexes as the apostrophe character literal.
            state.advance(c.len_utf8())
        }
        else {
            // Lone `'` at end of input: not a character literal.
            state.set_position(start);
            return false;
        }

        if state.peek() == Some('\'') {
            state.advance(1);
            state.add_token(AdaTokenType::CharacterLiteral, start, state.get_position());
            return true;
        }
        // No closing quote — rewind and let later rules handle the tick.
        state.set_position(start);
        false
    }
162
163    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
164        let start = state.get_position();
165
166        if let Some(ch) = state.peek() {
167            if ch.is_ascii_digit() {
168                // consume digits
169                state.advance(ch.len_utf8());
170                while let Some(ch) = state.peek() {
171                    if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
172                }
173
174                // check for decimal point
175                if state.peek() == Some('.') {
176                    state.advance(1);
177                    while let Some(ch) = state.peek() {
178                        if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
179                    }
180                }
181
182                // check for exponent
183                if let Some(ch) = state.peek() {
184                    if ch == 'e' || ch == 'E' {
185                        state.advance(1);
186                        if let Some(sign) = state.peek() {
187                            if sign == '+' || sign == '-' {
188                                state.advance(1)
189                            }
190                        }
191                        while let Some(ch) = state.peek() {
192                            if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
193                        }
194                    }
195                }
196
197                state.add_token(AdaTokenType::NumberLiteral, start, state.get_position());
198                return true;
199            }
200        }
201        false
202    }
203
    /// Lexes an identifier and classifies it as a reserved word when it
    /// matches one of Ada's keywords.
    ///
    /// Ada identifiers are case-insensitive, hence the `to_lowercase` before
    /// the keyword match. Scanning is lenient: a leading `_` is accepted even
    /// though Ada forbids it. NOTE(review): presumably that malformed case is
    /// diagnosed by a later phase — confirm.
    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' {
                state.advance(ch.len_utf8());

                // Continuation characters: letters, digits, underscores.
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
                }

                let end = state.get_position();
                let text = state.get_text_in((start..end).into());
                // Case-insensitive keyword table; anything else is Identifier.
                let kind = match text.to_lowercase().as_str() {
                    "abort" => AdaTokenType::Abort,
                    "abs" => AdaTokenType::Abs,
                    "abstract" => AdaTokenType::Abstract,
                    "accept" => AdaTokenType::Accept,
                    "access" => AdaTokenType::Access,
                    "aliased" => AdaTokenType::Aliased,
                    "all" => AdaTokenType::All,
                    "and" => AdaTokenType::And,
                    "array" => AdaTokenType::Array,
                    "at" => AdaTokenType::At,
                    "begin" => AdaTokenType::Begin,
                    "body" => AdaTokenType::Body,
                    "case" => AdaTokenType::Case,
                    "constant" => AdaTokenType::Constant,
                    "declare" => AdaTokenType::Declare,
                    "delay" => AdaTokenType::Delay,
                    "delta" => AdaTokenType::Delta,
                    "digits" => AdaTokenType::Digits,
                    "do" => AdaTokenType::Do,
                    "else" => AdaTokenType::Else,
                    "elsif" => AdaTokenType::Elsif,
                    "end" => AdaTokenType::End,
                    "entry" => AdaTokenType::Entry,
                    "exception" => AdaTokenType::Exception,
                    "exit" => AdaTokenType::Exit,
                    "for" => AdaTokenType::For,
                    "function" => AdaTokenType::Function,
                    "generic" => AdaTokenType::Generic,
                    "goto" => AdaTokenType::Goto,
                    "if" => AdaTokenType::If,
                    "in" => AdaTokenType::In,
                    "interface" => AdaTokenType::Interface,
                    "is" => AdaTokenType::Is,
                    "limited" => AdaTokenType::Limited,
                    "loop" => AdaTokenType::Loop,
                    "mod" => AdaTokenType::Mod,
                    "new" => AdaTokenType::New,
                    "not" => AdaTokenType::Not,
                    "null" => AdaTokenType::Null,
                    "of" => AdaTokenType::Of,
                    "or" => AdaTokenType::Or,
                    "others" => AdaTokenType::Others,
                    "out" => AdaTokenType::Out,
                    "overriding" => AdaTokenType::Overriding,
                    "package" => AdaTokenType::Package,
                    "pragma" => AdaTokenType::Pragma,
                    "private" => AdaTokenType::Private,
                    "procedure" => AdaTokenType::Procedure,
                    "protected" => AdaTokenType::Protected,
                    "raise" => AdaTokenType::Raise,
                    "range" => AdaTokenType::Range,
                    "record" => AdaTokenType::Record,
                    "rem" => AdaTokenType::Rem,
                    "renames" => AdaTokenType::Renames,
                    "requeue" => AdaTokenType::Requeue,
                    "return" => AdaTokenType::Return,
                    "reverse" => AdaTokenType::Reverse,
                    "select" => AdaTokenType::Select,
                    "separate" => AdaTokenType::Separate,
                    "some" => AdaTokenType::Some,
                    "subtype" => AdaTokenType::Subtype,
                    "synchronized" => AdaTokenType::Synchronized,
                    "tagged" => AdaTokenType::Tagged,
                    "task" => AdaTokenType::Task,
                    "terminate" => AdaTokenType::Terminate,
                    "then" => AdaTokenType::Then,
                    "type" => AdaTokenType::Type,
                    "until" => AdaTokenType::Until,
                    "use" => AdaTokenType::Use,
                    "when" => AdaTokenType::When,
                    "while" => AdaTokenType::While,
                    "with" => AdaTokenType::With,
                    "xor" => AdaTokenType::Xor,
                    _ => AdaTokenType::Identifier,
                };

                state.add_token(kind, start, end);
                return true;
            }
        }
        false
    }
300
301    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
302        let start = state.get_position();
303
304        // Multi-character operators first
305        if state.consume_if_starts_with("**") {
306            state.add_token(AdaTokenType::StarStar, start, state.get_position());
307            return true;
308        }
309        if state.consume_if_starts_with("=>") {
310            state.add_token(AdaTokenType::Arrow, start, state.get_position());
311            return true;
312        }
313        if state.consume_if_starts_with("<=") {
314            state.add_token(AdaTokenType::Le, start, state.get_position());
315            return true;
316        }
317        if state.consume_if_starts_with(">=") {
318            state.add_token(AdaTokenType::Ge, start, state.get_position());
319            return true;
320        }
321        if state.consume_if_starts_with(":=") {
322            state.add_token(AdaTokenType::ColonEq, start, state.get_position());
323            return true;
324        }
325        if state.consume_if_starts_with("..") {
326            state.add_token(AdaTokenType::DotDot, start, state.get_position());
327            return true;
328        }
329        if state.consume_if_starts_with("/=") {
330            state.add_token(AdaTokenType::Ne, start, state.get_position());
331            return true;
332        }
333        if state.consume_if_starts_with("<<") {
334            state.add_token(AdaTokenType::LtLt, start, state.get_position());
335            return true;
336        }
337        if state.consume_if_starts_with(">>") {
338            state.add_token(AdaTokenType::GtGt, start, state.get_position());
339            return true;
340        }
341        if state.consume_if_starts_with("<>") {
342            state.add_token(AdaTokenType::Box, start, state.get_position());
343            return true;
344        }
345
346        // Single-character operators
347        if let Some(ch) = state.peek() {
348            let kind = match ch {
349                '+' => AdaTokenType::Plus,
350                '-' => AdaTokenType::Minus,
351                '*' => AdaTokenType::Star,
352                '/' => AdaTokenType::Slash,
353                '=' => AdaTokenType::Eq,
354                '<' => AdaTokenType::Lt,
355                '>' => AdaTokenType::Gt,
356                '&' => AdaTokenType::Ampersand,
357                '|' => AdaTokenType::Pipe,
358                _ => return false,
359            };
360            state.advance(1);
361            state.add_token(kind, start, state.get_position());
362            return true;
363        }
364        false
365    }
366
367    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
368        let start = state.get_position();
369
370        if let Some(ch) = state.peek() {
371            let kind = match ch {
372                '(' => AdaTokenType::LeftParen,
373                ')' => AdaTokenType::RightParen,
374                '[' => AdaTokenType::LeftBracket,
375                ']' => AdaTokenType::RightBracket,
376                '{' => AdaTokenType::LeftBrace,
377                '}' => AdaTokenType::RightBrace,
378                ',' => AdaTokenType::Comma,
379                ';' => AdaTokenType::Semicolon,
380                ':' => AdaTokenType::Colon,
381                '.' => AdaTokenType::Dot,
382                '\'' => AdaTokenType::Apostrophe,
383                _ => return false,
384            };
385            state.advance(1);
386            state.add_token(kind, start, state.get_position());
387            return true;
388        }
389        false
390    }
391}