oak_pascal/lexer/
mod.rs

1use crate::{kind::PascalSyntaxKind, language::PascalLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, PascalLanguage>;
10
11static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static PASCAL_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
13static PASCAL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
14
15#[derive(Clone)]
16pub struct PascalLexer<'config> {
17    config: &'config PascalLanguage,
18}
19
20impl<'config> PascalLexer<'config> {
21    pub fn new(config: &'config PascalLanguage) -> Self {
22        Self { config }
23    }
24
25    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
26        match PASCAL_WHITESPACE.scan(state.rest(), state.get_position(), PascalSyntaxKind::Whitespace) {
27            Some(token) => {
28                state.advance_with(token);
29                true
30            }
31            None => false,
32        }
33    }
34
35    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
36        let start = state.get_position();
37        let rest = state.rest();
38
39        // Line comment starting with //
40        if rest.starts_with("//") {
41            match PASCAL_COMMENT.scan(rest, start, PascalSyntaxKind::Comment) {
42                Some(token) => {
43                    state.advance_with(token);
44                    return true;
45                }
46                None => return false,
47            }
48        }
49
50        // Block comment: { ... }
51        if state.current() == Some('{') {
52            state.advance(1);
53            while let Some(ch) = state.peek() {
54                if ch == '}' {
55                    state.advance(1);
56                    break;
57                }
58                state.advance(ch.len_utf8());
59            }
60            state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
61            return true;
62        }
63
64        // Block comment: (* ... *)
65        if rest.starts_with("(*") {
66            state.advance(2);
67            while let Some(ch) = state.peek() {
68                if ch == '*' && state.peek_next_n(1) == Some(')') {
69                    state.advance(2);
70                    break;
71                }
72                state.advance(ch.len_utf8());
73            }
74            state.add_token(PascalSyntaxKind::Comment, start, state.get_position());
75            return true;
76        }
77
78        false
79    }
80
81    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
82        let start = state.get_position();
83
84        // Pascal 字符串字面量:'...'
85        if state.current() == Some('\'') {
86            state.advance(1);
87            while let Some(ch) = state.peek() {
88                if ch == '\'' {
89                    // 检查是否是转义的单引号 ''
90                    if state.peek_next_n(1) == Some('\'') {
91                        state.advance(2); // 跳过 ''
92                        continue;
93                    }
94                    else {
95                        state.advance(1); // 结束引号
96                        break;
97                    }
98                }
99                state.advance(ch.len_utf8());
100            }
101            state.add_token(PascalSyntaxKind::StringLiteral, start, state.get_position());
102            return true;
103        }
104        false
105    }
106
107    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
108        if let Some(ch) = state.peek() {
109            if ch.is_alphabetic() || ch == '_' {
110                let start_pos = state.get_position();
111                let mut text = String::new();
112
113                // 读取标识符
114                while let Some(ch) = state.peek() {
115                    if ch.is_alphanumeric() || ch == '_' {
116                        text.push(ch);
117                        state.advance(ch.len_utf8());
118                    }
119                    else {
120                        break;
121                    }
122                }
123
124                // 检查是否是关键字
125                let kind = match text.to_lowercase().as_str() {
126                    "program" => PascalSyntaxKind::Program,
127                    "var" => PascalSyntaxKind::Var,
128                    "const" => PascalSyntaxKind::Const,
129                    "type" => PascalSyntaxKind::Type,
130                    "procedure" => PascalSyntaxKind::Procedure,
131                    "function" => PascalSyntaxKind::Function,
132                    "begin" => PascalSyntaxKind::Begin,
133                    "end" => PascalSyntaxKind::End,
134                    "if" => PascalSyntaxKind::If,
135                    "then" => PascalSyntaxKind::Then,
136                    "else" => PascalSyntaxKind::Else,
137                    "while" => PascalSyntaxKind::While,
138                    "do" => PascalSyntaxKind::Do,
139                    "for" => PascalSyntaxKind::For,
140                    "to" => PascalSyntaxKind::To,
141                    "downto" => PascalSyntaxKind::Downto,
142                    "repeat" => PascalSyntaxKind::Repeat,
143                    "until" => PascalSyntaxKind::Until,
144                    "case" => PascalSyntaxKind::Case,
145                    "of" => PascalSyntaxKind::Of,
146                    "with" => PascalSyntaxKind::With,
147                    "record" => PascalSyntaxKind::Record,
148                    "array" => PascalSyntaxKind::Array,
149                    "set" => PascalSyntaxKind::Set,
150                    "file" => PascalSyntaxKind::File,
151                    "packed" => PascalSyntaxKind::Packed,
152                    "nil" => PascalSyntaxKind::Nil,
153                    "true" => PascalSyntaxKind::True,
154                    "false" => PascalSyntaxKind::False,
155                    "and" => PascalSyntaxKind::And,
156                    "or" => PascalSyntaxKind::Or,
157                    "not" => PascalSyntaxKind::Not,
158                    "div" => PascalSyntaxKind::Div,
159                    "mod" => PascalSyntaxKind::Mod,
160                    "in" => PascalSyntaxKind::In,
161
162                    _ => PascalSyntaxKind::Identifier,
163                };
164
165                state.add_token(kind, start_pos, state.get_position());
166                true
167            }
168            else {
169                false
170            }
171        }
172        else {
173            false
174        }
175    }
176
177    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
178        if let Some(ch) = state.peek() {
179            if ch.is_ascii_digit() {
180                let start_pos = state.get_position();
181                let mut has_dot = false;
182
183                // 读取数字
184                while let Some(ch) = state.peek() {
185                    if ch.is_ascii_digit() {
186                        state.advance(1);
187                    }
188                    else if ch == '.' && !has_dot {
189                        has_dot = true;
190                        state.advance(1);
191                    }
192                    else {
193                        break;
194                    }
195                }
196
197                let kind = if has_dot { PascalSyntaxKind::RealLiteral } else { PascalSyntaxKind::IntegerLiteral };
198
199                state.add_token(kind, start_pos, state.get_position());
200                true
201            }
202            else {
203                false
204            }
205        }
206        else {
207            false
208        }
209    }
210
211    fn lex_operators_and_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
212        if let Some(ch) = state.peek() {
213            let start_pos = state.get_position();
214
215            let kind = match ch {
216                '+' => {
217                    state.advance(1);
218                    PascalSyntaxKind::Plus
219                }
220                '-' => {
221                    state.advance(1);
222                    PascalSyntaxKind::Minus
223                }
224                '*' => {
225                    state.advance(1);
226                    PascalSyntaxKind::Multiply
227                }
228                '/' => {
229                    state.advance(1);
230                    PascalSyntaxKind::Divide
231                }
232                '=' => {
233                    state.advance(1);
234                    PascalSyntaxKind::Equal
235                }
236                '<' => {
237                    state.advance(1);
238                    if let Some('=') = state.peek() {
239                        state.advance(1);
240                        PascalSyntaxKind::LessEqual
241                    }
242                    else if let Some('>') = state.peek() {
243                        state.advance(1);
244                        PascalSyntaxKind::NotEqual
245                    }
246                    else {
247                        PascalSyntaxKind::Less
248                    }
249                }
250                '>' => {
251                    state.advance(1);
252                    if let Some('=') = state.peek() {
253                        state.advance(1);
254                        PascalSyntaxKind::GreaterEqual
255                    }
256                    else {
257                        PascalSyntaxKind::Greater
258                    }
259                }
260                ':' => {
261                    state.advance(1);
262                    if let Some('=') = state.peek() {
263                        state.advance(1);
264                        PascalSyntaxKind::Assign
265                    }
266                    else {
267                        PascalSyntaxKind::Colon
268                    }
269                }
270                ';' => {
271                    state.advance(1);
272                    PascalSyntaxKind::Semicolon
273                }
274                ',' => {
275                    state.advance(1);
276                    PascalSyntaxKind::Comma
277                }
278                '.' => {
279                    state.advance(1);
280                    if let Some('.') = state.peek() {
281                        state.advance(1);
282                        PascalSyntaxKind::Range
283                    }
284                    else {
285                        PascalSyntaxKind::Dot
286                    }
287                }
288                '(' => {
289                    state.advance(1);
290                    PascalSyntaxKind::LeftParen
291                }
292                ')' => {
293                    state.advance(1);
294                    PascalSyntaxKind::RightParen
295                }
296                '[' => {
297                    state.advance(1);
298                    PascalSyntaxKind::LeftBracket
299                }
300                ']' => {
301                    state.advance(1);
302                    PascalSyntaxKind::RightBracket
303                }
304                '^' => {
305                    state.advance(1);
306                    PascalSyntaxKind::Caret
307                }
308                '\n' => {
309                    state.advance(1);
310                    PascalSyntaxKind::Newline
311                }
312                _ => {
313                    state.advance(ch.len_utf8());
314                    PascalSyntaxKind::Error
315                }
316            };
317
318            state.add_token(kind, start_pos, state.get_position());
319            true
320        }
321        else {
322            false
323        }
324    }
325}
326
327impl<'config> Lexer<PascalLanguage> for PascalLexer<'config> {
328    fn lex_incremental(
329        &self,
330        source: impl Source,
331        changed: usize,
332        cache: IncrementalCache<PascalLanguage>,
333    ) -> LexOutput<PascalLanguage> {
334        let mut state = LexerState::new_with_cache(source, changed, cache);
335        let result = self.run(&mut state);
336        state.finish(result)
337    }
338}
339
340impl<'config> PascalLexer<'config> {
341    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
342        while state.not_at_end() {
343            // 跳过空白字符
344            if self.skip_whitespace(state) {
345                continue;
346            }
347
348            // 处理注释
349            if self.skip_comment(state) {
350                continue;
351            }
352
353            // 处理字符串
354            if self.lex_string(state) {
355                continue;
356            }
357
358            // 处理标识符和关键字
359            if self.lex_identifier_or_keyword(state) {
360                continue;
361            }
362
363            // 处理数字
364            if self.lex_number(state) {
365                continue;
366            }
367
368            // 处理操作符和标点符号
369            if self.lex_operators_and_punctuation(state) {
370                continue;
371            }
372
373            // 如果没有匹配任何模式,创建错误 token
374            let start_pos = state.get_position();
375            if let Some(ch) = state.peek() {
376                state.advance(ch.len_utf8());
377                state.add_token(PascalSyntaxKind::Error, start_pos, state.get_position());
378            }
379            else {
380                break;
381            }
382        }
383
384        // 添加 EOF token
385        let eof_pos = state.get_position();
386        state.add_token(PascalSyntaxKind::Eof, eof_pos, eof_pos);
387        Ok(())
388    }
389}