Skip to main content

oak_pascal/lexer/
mod.rs

1use crate::{kind::PascalSyntaxKind, language::PascalLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`PascalLanguage`].
type State<'s, S> = LexerState<'s, S, PascalLanguage>;

/// Lazily-built whitespace scanner configuration with Unicode whitespace enabled.
static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Lazily-built comment scanner configuration: `//` line comments and non-nested `{ ... }` block comments.
static PASCAL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "{", block_end: "}", nested_blocks: false });
13
/// Lexer for Pascal source text.
///
/// Borrows a [`PascalLanguage`] configuration for its lifetime. None of the
/// lexing methods visible in this file read the configuration, hence the
/// leading underscore on the field.
#[derive(Clone, Debug)]
pub struct PascalLexer<'config> {
    /// Language configuration this lexer was created with.
    _config: &'config PascalLanguage,
}
18
19impl<'config> PascalLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config PascalLanguage) -> Self {
        Self { _config: config }
    }
23
    /// Scans whitespace at the current position with [`PASCAL_WHITESPACE`],
    /// tagging it as [`PascalSyntaxKind::Whitespace`].
    ///
    /// Returns the scanner's result unchanged (presumably `true` when
    /// whitespace was consumed — confirm against `WhitespaceConfig::scan`).
    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        PASCAL_WHITESPACE.scan(state, PascalSyntaxKind::Whitespace)
    }
27
28    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
29        let start = state.get_position();
30
31        // Line comment starting with //
32        if state.rest().starts_with("//") {
33            return PASCAL_COMMENT.scan(state, PascalSyntaxKind::Comment, PascalSyntaxKind::Comment);
34        }
35
36        // Block comment: { ... }
37        if state.current() == Some('{') {
38            state.advance(1);
39            while let Some(ch) = state.peek() {
40                if ch == '}' {
41                    state.advance(1);
42                    break;
43                }
44                state.advance(ch.len_utf8());
45            }
46            state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
47            return true;
48        }
49
50        // Block comment: (* ... *)
51        if state.rest().starts_with("(*") {
52            state.advance(2);
53            while let Some(ch) = state.peek() {
54                if ch == '*' && state.peek_next_n(1) == Some(')') {
55                    state.advance(2);
56                    break;
57                }
58                state.advance(ch.len_utf8());
59            }
60            state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
61            return true;
62        }
63
64        false
65    }
66
67    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
68        let start = state.get_position();
69
70        // Pascal 字符串字面量:'...'
71        if state.current() == Some('\'') {
72            state.advance(1);
73            while let Some(ch) = state.peek() {
74                if ch == '\'' {
75                    // 检查是否是转义的单引号 ''
76                    if state.peek_next_n(1) == Some('\'') {
77                        state.advance(2); // 跳过 ''
78                        continue;
79                    }
80                    else {
81                        state.advance(1); // 结束引号
82                        break;
83                    }
84                }
85                state.advance(ch.len_utf8());
86            }
87            state.add_token(PascalSyntaxKind::StringLiteral, start, state.get_position());
88            return true;
89        }
90        false
91    }
92
93    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
94        if let Some(ch) = state.peek() {
95            if ch.is_alphabetic() || ch == '_' {
96                let start_pos = state.get_position();
97                let mut text = String::new();
98
99                // 读取标识符
100                while let Some(ch) = state.peek() {
101                    if ch.is_alphanumeric() || ch == '_' {
102                        text.push(ch);
103                        state.advance(ch.len_utf8());
104                    }
105                    else {
106                        break;
107                    }
108                }
109
110                // 检查是否是关键字
111                let kind = match text.to_lowercase().as_str() {
112                    "program" => PascalSyntaxKind::Program,
113                    "var" => PascalSyntaxKind::Var,
114                    "const" => PascalSyntaxKind::Const,
115                    "type" => PascalSyntaxKind::Type,
116                    "procedure" => PascalSyntaxKind::Procedure,
117                    "function" => PascalSyntaxKind::Function,
118                    "begin" => PascalSyntaxKind::Begin,
119                    "end" => PascalSyntaxKind::End,
120                    "if" => PascalSyntaxKind::If,
121                    "then" => PascalSyntaxKind::Then,
122                    "else" => PascalSyntaxKind::Else,
123                    "while" => PascalSyntaxKind::While,
124                    "do" => PascalSyntaxKind::Do,
125                    "for" => PascalSyntaxKind::For,
126                    "to" => PascalSyntaxKind::To,
127                    "downto" => PascalSyntaxKind::Downto,
128                    "repeat" => PascalSyntaxKind::Repeat,
129                    "until" => PascalSyntaxKind::Until,
130                    "case" => PascalSyntaxKind::Case,
131                    "of" => PascalSyntaxKind::Of,
132                    "with" => PascalSyntaxKind::With,
133                    "record" => PascalSyntaxKind::Record,
134                    "array" => PascalSyntaxKind::Array,
135                    "set" => PascalSyntaxKind::Set,
136                    "file" => PascalSyntaxKind::File,
137                    "packed" => PascalSyntaxKind::Packed,
138                    "nil" => PascalSyntaxKind::Nil,
139                    "true" => PascalSyntaxKind::True,
140                    "false" => PascalSyntaxKind::False,
141                    "and" => PascalSyntaxKind::And,
142                    "or" => PascalSyntaxKind::Or,
143                    "not" => PascalSyntaxKind::Not,
144                    "div" => PascalSyntaxKind::Div,
145                    "mod" => PascalSyntaxKind::Mod,
146                    "in" => PascalSyntaxKind::In,
147
148                    _ => PascalSyntaxKind::Identifier,
149                };
150
151                state.add_token(kind, start_pos, state.get_position());
152                true
153            }
154            else {
155                false
156            }
157        }
158        else {
159            false
160        }
161    }
162
163    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
164        if let Some(ch) = state.peek() {
165            if ch.is_ascii_digit() {
166                let start_pos = state.get_position();
167                let mut has_dot = false;
168
169                // 读取数字
170                while let Some(ch) = state.peek() {
171                    if ch.is_ascii_digit() {
172                        state.advance(1);
173                    }
174                    else if ch == '.' && !has_dot {
175                        has_dot = true;
176                        state.advance(1);
177                    }
178                    else {
179                        break;
180                    }
181                }
182
183                let kind = if has_dot { PascalSyntaxKind::RealLiteral } else { PascalSyntaxKind::IntegerLiteral };
184
185                state.add_token(kind, start_pos, state.get_position());
186                true
187            }
188            else {
189                false
190            }
191        }
192        else {
193            false
194        }
195    }
196
197    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
198        if let Some(ch) = state.peek() {
199            let start_pos = state.get_position();
200
201            let kind = match ch {
202                '+' => {
203                    state.advance(1);
204                    PascalSyntaxKind::Plus
205                }
206                '-' => {
207                    state.advance(1);
208                    PascalSyntaxKind::Minus
209                }
210                '*' => {
211                    state.advance(1);
212                    PascalSyntaxKind::Multiply
213                }
214                '/' => {
215                    state.advance(1);
216                    PascalSyntaxKind::Divide
217                }
218                '=' => {
219                    state.advance(1);
220                    PascalSyntaxKind::Equal
221                }
222                '<' => {
223                    state.advance(1);
224                    if let Some('=') = state.peek() {
225                        state.advance(1);
226                        PascalSyntaxKind::LessEqual
227                    }
228                    else if let Some('>') = state.peek() {
229                        state.advance(1);
230                        PascalSyntaxKind::NotEqual
231                    }
232                    else {
233                        PascalSyntaxKind::Less
234                    }
235                }
236                '>' => {
237                    state.advance(1);
238                    if let Some('=') = state.peek() {
239                        state.advance(1);
240                        PascalSyntaxKind::GreaterEqual
241                    }
242                    else {
243                        PascalSyntaxKind::Greater
244                    }
245                }
246                ':' => {
247                    state.advance(1);
248                    if let Some('=') = state.peek() {
249                        state.advance(1);
250                        PascalSyntaxKind::Assign
251                    }
252                    else {
253                        PascalSyntaxKind::Colon
254                    }
255                }
256                ';' => {
257                    state.advance(1);
258                    PascalSyntaxKind::Semicolon
259                }
260                ',' => {
261                    state.advance(1);
262                    PascalSyntaxKind::Comma
263                }
264                '.' => {
265                    state.advance(1);
266                    if let Some('.') = state.peek() {
267                        state.advance(1);
268                        PascalSyntaxKind::Range
269                    }
270                    else {
271                        PascalSyntaxKind::Dot
272                    }
273                }
274                '(' => {
275                    state.advance(1);
276                    PascalSyntaxKind::LeftParen
277                }
278                ')' => {
279                    state.advance(1);
280                    PascalSyntaxKind::RightParen
281                }
282                '[' => {
283                    state.advance(1);
284                    PascalSyntaxKind::LeftBracket
285                }
286                ']' => {
287                    state.advance(1);
288                    PascalSyntaxKind::RightBracket
289                }
290                '^' => {
291                    state.advance(1);
292                    PascalSyntaxKind::Caret
293                }
294                '\n' => {
295                    state.advance(1);
296                    PascalSyntaxKind::Newline
297                }
298                _ => {
299                    state.advance(ch.len_utf8());
300                    PascalSyntaxKind::Error
301                }
302            };
303
304            state.add_token(kind, start_pos, state.get_position());
305            true
306        }
307        else {
308            false
309        }
310    }
311}
312
313impl Lexer<PascalLanguage> for PascalLexer<'_> {
314    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PascalLanguage>) -> LexOutput<PascalLanguage> {
315        let mut state = State::new(source);
316        let result = self.run(&mut state);
317        if result.is_ok() {
318            state.add_eof();
319        }
320        state.finish_with_cache(result, cache)
321    }
322}
323
324impl PascalLexer<'_> {
325    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
326        let safe_point = state.get_position();
327        while state.not_at_end() {
328            // 跳过空白字符
329            if self.skip_whitespace(state) {
330                continue;
331            }
332
333            // 处理注释
334            if self.skip_comment(state) {
335                continue;
336            }
337
338            // 处理字符串
339            if self.lex_string(state) {
340                continue;
341            }
342
343            // 处理标识符和关键字
344            if self.lex_identifier_or_keyword(state) {
345                continue;
346            }
347
348            // 处理数字
349            if self.lex_number(state) {
350                continue;
351            }
352
353            // 处理操作符和标点符号
354            if self.lex_operators_and_punctuation(state) {
355                continue;
356            }
357
358            // 如果没有匹配任何模式,创建错误 token
359            let start_pos = state.get_position();
360            if let Some(ch) = state.peek() {
361                state.advance(ch.len_utf8());
362                state.add_token(PascalSyntaxKind::Error, start_pos, state.get_position());
363            }
364
365            state.advance_if_dead_lock(safe_point);
366        }
367
368        // 添加 EOF token
369        Ok(())
370    }
371}