oak_pascal/lexer/
mod.rs

1use crate::{kind::PascalSyntaxKind, language::PascalLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, PascalLanguage>;
10
11static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static PASCAL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "{", block_end: "}", nested_blocks: false });
13
/// Lexer for Pascal source text.
///
/// The type is a field-less unit struct, so cloning or default-constructing it is free.
/// `Debug` is derived so the lexer can be embedded in other `Debug` types and printed
/// in diagnostics.
#[derive(Clone, Debug, Default)]
pub struct PascalLexer;
16
17impl PascalLexer {
18    pub fn new(_config: &PascalLanguage) -> Self {
19        Self
20    }
21
22    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
23        PASCAL_WHITESPACE.scan(state, PascalSyntaxKind::Whitespace)
24    }
25
26    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
27        let start = state.get_position();
28
29        // Line comment starting with //
30        if state.rest().starts_with("//") {
31            return PASCAL_COMMENT.scan(state, PascalSyntaxKind::Comment, PascalSyntaxKind::Comment);
32        }
33
34        // Block comment: { ... }
35        if state.current() == Some('{') {
36            state.advance(1);
37            while let Some(ch) = state.peek() {
38                if ch == '}' {
39                    state.advance(1);
40                    break;
41                }
42                state.advance(ch.len_utf8());
43            }
44            state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
45            return true;
46        }
47
48        // Block comment: (* ... *)
49        if state.rest().starts_with("(*") {
50            state.advance(2);
51            while let Some(ch) = state.peek() {
52                if ch == '*' && state.peek_next_n(1) == Some(')') {
53                    state.advance(2);
54                    break;
55                }
56                state.advance(ch.len_utf8());
57            }
58            state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
59            return true;
60        }
61
62        false
63    }
64
65    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
66        let start = state.get_position();
67
68        // Pascal 字符串字面量:'...'
69        if state.current() == Some('\'') {
70            state.advance(1);
71            while let Some(ch) = state.peek() {
72                if ch == '\'' {
73                    // 检查是否是转义的单引号 ''
74                    if state.peek_next_n(1) == Some('\'') {
75                        state.advance(2); // 跳过 ''
76                        continue;
77                    }
78                    else {
79                        state.advance(1); // 结束引号
80                        break;
81                    }
82                }
83                state.advance(ch.len_utf8());
84            }
85            state.add_token(PascalSyntaxKind::StringLiteral, start, state.get_position());
86            return true;
87        }
88        false
89    }
90
91    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
92        if let Some(ch) = state.peek() {
93            if ch.is_alphabetic() || ch == '_' {
94                let start_pos = state.get_position();
95                let mut text = String::new();
96
97                // 读取标识符
98                while let Some(ch) = state.peek() {
99                    if ch.is_alphanumeric() || ch == '_' {
100                        text.push(ch);
101                        state.advance(ch.len_utf8());
102                    }
103                    else {
104                        break;
105                    }
106                }
107
108                // 检查是否是关键字
109                let kind = match text.to_lowercase().as_str() {
110                    "program" => PascalSyntaxKind::Program,
111                    "var" => PascalSyntaxKind::Var,
112                    "const" => PascalSyntaxKind::Const,
113                    "type" => PascalSyntaxKind::Type,
114                    "procedure" => PascalSyntaxKind::Procedure,
115                    "function" => PascalSyntaxKind::Function,
116                    "begin" => PascalSyntaxKind::Begin,
117                    "end" => PascalSyntaxKind::End,
118                    "if" => PascalSyntaxKind::If,
119                    "then" => PascalSyntaxKind::Then,
120                    "else" => PascalSyntaxKind::Else,
121                    "while" => PascalSyntaxKind::While,
122                    "do" => PascalSyntaxKind::Do,
123                    "for" => PascalSyntaxKind::For,
124                    "to" => PascalSyntaxKind::To,
125                    "downto" => PascalSyntaxKind::Downto,
126                    "repeat" => PascalSyntaxKind::Repeat,
127                    "until" => PascalSyntaxKind::Until,
128                    "case" => PascalSyntaxKind::Case,
129                    "of" => PascalSyntaxKind::Of,
130                    "with" => PascalSyntaxKind::With,
131                    "record" => PascalSyntaxKind::Record,
132                    "array" => PascalSyntaxKind::Array,
133                    "set" => PascalSyntaxKind::Set,
134                    "file" => PascalSyntaxKind::File,
135                    "packed" => PascalSyntaxKind::Packed,
136                    "nil" => PascalSyntaxKind::Nil,
137                    "true" => PascalSyntaxKind::True,
138                    "false" => PascalSyntaxKind::False,
139                    "and" => PascalSyntaxKind::And,
140                    "or" => PascalSyntaxKind::Or,
141                    "not" => PascalSyntaxKind::Not,
142                    "div" => PascalSyntaxKind::Div,
143                    "mod" => PascalSyntaxKind::Mod,
144                    "in" => PascalSyntaxKind::In,
145
146                    _ => PascalSyntaxKind::Identifier,
147                };
148
149                state.add_token(kind, start_pos, state.get_position());
150                true
151            }
152            else {
153                false
154            }
155        }
156        else {
157            false
158        }
159    }
160
161    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
162        if let Some(ch) = state.peek() {
163            if ch.is_ascii_digit() {
164                let start_pos = state.get_position();
165                let mut has_dot = false;
166
167                // 读取数字
168                while let Some(ch) = state.peek() {
169                    if ch.is_ascii_digit() {
170                        state.advance(1);
171                    }
172                    else if ch == '.' && !has_dot {
173                        has_dot = true;
174                        state.advance(1);
175                    }
176                    else {
177                        break;
178                    }
179                }
180
181                let kind = if has_dot { PascalSyntaxKind::RealLiteral } else { PascalSyntaxKind::IntegerLiteral };
182
183                state.add_token(kind, start_pos, state.get_position());
184                true
185            }
186            else {
187                false
188            }
189        }
190        else {
191            false
192        }
193    }
194
195    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
196        if let Some(ch) = state.peek() {
197            let start_pos = state.get_position();
198
199            let kind = match ch {
200                '+' => {
201                    state.advance(1);
202                    PascalSyntaxKind::Plus
203                }
204                '-' => {
205                    state.advance(1);
206                    PascalSyntaxKind::Minus
207                }
208                '*' => {
209                    state.advance(1);
210                    PascalSyntaxKind::Multiply
211                }
212                '/' => {
213                    state.advance(1);
214                    PascalSyntaxKind::Divide
215                }
216                '=' => {
217                    state.advance(1);
218                    PascalSyntaxKind::Equal
219                }
220                '<' => {
221                    state.advance(1);
222                    if let Some('=') = state.peek() {
223                        state.advance(1);
224                        PascalSyntaxKind::LessEqual
225                    }
226                    else if let Some('>') = state.peek() {
227                        state.advance(1);
228                        PascalSyntaxKind::NotEqual
229                    }
230                    else {
231                        PascalSyntaxKind::Less
232                    }
233                }
234                '>' => {
235                    state.advance(1);
236                    if let Some('=') = state.peek() {
237                        state.advance(1);
238                        PascalSyntaxKind::GreaterEqual
239                    }
240                    else {
241                        PascalSyntaxKind::Greater
242                    }
243                }
244                ':' => {
245                    state.advance(1);
246                    if let Some('=') = state.peek() {
247                        state.advance(1);
248                        PascalSyntaxKind::Assign
249                    }
250                    else {
251                        PascalSyntaxKind::Colon
252                    }
253                }
254                ';' => {
255                    state.advance(1);
256                    PascalSyntaxKind::Semicolon
257                }
258                ',' => {
259                    state.advance(1);
260                    PascalSyntaxKind::Comma
261                }
262                '.' => {
263                    state.advance(1);
264                    if let Some('.') = state.peek() {
265                        state.advance(1);
266                        PascalSyntaxKind::Range
267                    }
268                    else {
269                        PascalSyntaxKind::Dot
270                    }
271                }
272                '(' => {
273                    state.advance(1);
274                    PascalSyntaxKind::LeftParen
275                }
276                ')' => {
277                    state.advance(1);
278                    PascalSyntaxKind::RightParen
279                }
280                '[' => {
281                    state.advance(1);
282                    PascalSyntaxKind::LeftBracket
283                }
284                ']' => {
285                    state.advance(1);
286                    PascalSyntaxKind::RightBracket
287                }
288                '^' => {
289                    state.advance(1);
290                    PascalSyntaxKind::Caret
291                }
292                '\n' => {
293                    state.advance(1);
294                    PascalSyntaxKind::Newline
295                }
296                _ => {
297                    state.advance(ch.len_utf8());
298                    PascalSyntaxKind::Error
299                }
300            };
301
302            state.add_token(kind, start_pos, state.get_position());
303            true
304        }
305        else {
306            false
307        }
308    }
309}
310
311impl Lexer<PascalLanguage> for PascalLexer {
312    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PascalLanguage>) -> LexOutput<PascalLanguage> {
313        let mut state = State::new(source);
314        let result = self.run(&mut state);
315        if result.is_ok() {
316            state.add_eof();
317        }
318        state.finish_with_cache(result, cache)
319    }
320}
321
322impl PascalLexer {
323    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
324        let safe_point = state.get_position();
325        while state.not_at_end() {
326            // 跳过空白字符
327            if self.skip_whitespace(state) {
328                continue;
329            }
330
331            // 处理注释
332            if self.skip_comment(state) {
333                continue;
334            }
335
336            // 处理字符串
337            if self.lex_string(state) {
338                continue;
339            }
340
341            // 处理标识符和关键字
342            if self.lex_identifier_or_keyword(state) {
343                continue;
344            }
345
346            // 处理数字
347            if self.lex_number(state) {
348                continue;
349            }
350
351            // 处理操作符和标点符号
352            if self.lex_operators_and_punctuation(state) {
353                continue;
354            }
355
356            // 如果没有匹配任何模式,创建错误 token
357            let start_pos = state.get_position();
358            if let Some(ch) = state.peek() {
359                state.advance(ch.len_utf8());
360                state.add_token(PascalSyntaxKind::Error, start_pos, state.get_position());
361            }
362
363            state.advance_if_dead_lock(safe_point);
364        }
365
366        // 添加 EOF token
367        Ok(())
368    }
369}