Skip to main content

oak_pascal/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::PascalLanguage, lexer::token_type::PascalTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
8    source::Source,
9};
10use std::sync::LazyLock;
11
12type State<'s, S> = LexerState<'s, S, PascalLanguage>;
13
14static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static PASCAL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "{", block_end: "}", nested_blocks: false });
16
17#[derive(Clone, Debug)]
18pub struct PascalLexer<'config> {
19    _config: &'config PascalLanguage,
20}
21
22impl<'config> PascalLexer<'config> {
23    pub fn new(config: &'config PascalLanguage) -> Self {
24        Self { _config: config }
25    }
26
27    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
28        PASCAL_WHITESPACE.scan(state, PascalTokenType::Whitespace)
29    }
30
31    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
32        let start = state.get_position();
33
34        // Line comment starting with //
35        if state.rest().starts_with("//") {
36            return PASCAL_COMMENT.scan(state, PascalTokenType::Comment, PascalTokenType::Comment);
37        }
38
39        // Block comment: { ... }
40        if state.current() == Some('{') {
41            state.advance(1);
42            while let Some(ch) = state.peek() {
43                if ch == '}' {
44                    state.advance(1);
45                    break;
46                }
47                state.advance(ch.len_utf8());
48            }
49            state.add_token(PascalTokenType::Comment, start, state.get_position());
50            return true;
51        }
52
53        // Block comment: (* ... *)
54        if state.rest().starts_with("(*") {
55            state.advance(2);
56            while let Some(ch) = state.peek() {
57                if ch == '*' && state.peek_next_n(1) == Some(')') {
58                    state.advance(2);
59                    break;
60                }
61                state.advance(ch.len_utf8());
62            }
63            state.add_token(PascalTokenType::Comment, start, state.get_position());
64            return true;
65        }
66
67        false
68    }
69
70    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
71        let start = state.get_position();
72
73        // Pascal 字符串字面量:'...'
74        if state.current() == Some('\'') {
75            state.advance(1);
76            while let Some(ch) = state.peek() {
77                if ch == '\'' {
78                    // 检查是否是转义的单引号 ''
79                    if state.peek_next_n(1) == Some('\'') {
80                        state.advance(2); // 跳过 ''
81                        continue;
82                    }
83                    else {
84                        state.advance(1); // 结束引号
85                        break;
86                    }
87                }
88                state.advance(ch.len_utf8());
89            }
90            state.add_token(PascalTokenType::StringLiteral, start, state.get_position());
91            return true;
92        }
93        false
94    }
95
96    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
97        if let Some(ch) = state.peek() {
98            if ch.is_alphabetic() || ch == '_' {
99                let start_pos = state.get_position();
100                let mut text = String::new();
101
102                // 读取标识符
103                while let Some(ch) = state.peek() {
104                    if ch.is_alphanumeric() || ch == '_' {
105                        text.push(ch);
106                        state.advance(ch.len_utf8());
107                    }
108                    else {
109                        break;
110                    }
111                }
112
113                // 检查是否是关键字
114                let kind = match text.to_lowercase().as_str() {
115                    "program" => PascalTokenType::Program,
116                    "var" => PascalTokenType::Var,
117                    "const" => PascalTokenType::Const,
118                    "type" => PascalTokenType::Type,
119                    "procedure" => PascalTokenType::Procedure,
120                    "function" => PascalTokenType::Function,
121                    "begin" => PascalTokenType::Begin,
122                    "end" => PascalTokenType::End,
123                    "if" => PascalTokenType::If,
124                    "then" => PascalTokenType::Then,
125                    "else" => PascalTokenType::Else,
126                    "while" => PascalTokenType::While,
127                    "do" => PascalTokenType::Do,
128                    "for" => PascalTokenType::For,
129                    "to" => PascalTokenType::To,
130                    "downto" => PascalTokenType::Downto,
131                    "repeat" => PascalTokenType::Repeat,
132                    "until" => PascalTokenType::Until,
133                    "case" => PascalTokenType::Case,
134                    "of" => PascalTokenType::Of,
135                    "with" => PascalTokenType::With,
136                    "record" => PascalTokenType::Record,
137                    "array" => PascalTokenType::Array,
138                    "set" => PascalTokenType::Set,
139                    "file" => PascalTokenType::File,
140                    "packed" => PascalTokenType::Packed,
141                    "nil" => PascalTokenType::Nil,
142                    "true" => PascalTokenType::True,
143                    "false" => PascalTokenType::False,
144                    "and" => PascalTokenType::And,
145                    "or" => PascalTokenType::Or,
146                    "not" => PascalTokenType::Not,
147                    "div" => PascalTokenType::Div,
148                    "mod" => PascalTokenType::Mod,
149                    "in" => PascalTokenType::In,
150
151                    _ => PascalTokenType::Identifier,
152                };
153
154                state.add_token(kind, start_pos, state.get_position());
155                true
156            }
157            else {
158                false
159            }
160        }
161        else {
162            false
163        }
164    }
165
166    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
167        if let Some(ch) = state.peek() {
168            if ch.is_ascii_digit() {
169                let start_pos = state.get_position();
170                let mut has_dot = false;
171
172                // 读取数字
173                while let Some(ch) = state.peek() {
174                    if ch.is_ascii_digit() {
175                        state.advance(1);
176                    }
177                    else if ch == '.' && !has_dot {
178                        has_dot = true;
179                        state.advance(1);
180                    }
181                    else {
182                        break;
183                    }
184                }
185
186                let kind = if has_dot { PascalTokenType::RealLiteral } else { PascalTokenType::IntegerLiteral };
187
188                state.add_token(kind, start_pos, state.get_position());
189                true
190            }
191            else {
192                false
193            }
194        }
195        else {
196            false
197        }
198    }
199
200    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
201        if let Some(ch) = state.peek() {
202            let start_pos = state.get_position();
203
204            let kind = match ch {
205                '+' => {
206                    state.advance(1);
207                    PascalTokenType::Plus
208                }
209                '-' => {
210                    state.advance(1);
211                    PascalTokenType::Minus
212                }
213                '*' => {
214                    state.advance(1);
215                    PascalTokenType::Multiply
216                }
217                '/' => {
218                    state.advance(1);
219                    PascalTokenType::Divide
220                }
221                '=' => {
222                    state.advance(1);
223                    PascalTokenType::Equal
224                }
225                '<' => {
226                    state.advance(1);
227                    if let Some('=') = state.peek() {
228                        state.advance(1);
229                        PascalTokenType::LessEqual
230                    }
231                    else if let Some('>') = state.peek() {
232                        state.advance(1);
233                        PascalTokenType::NotEqual
234                    }
235                    else {
236                        PascalTokenType::Less
237                    }
238                }
239                '>' => {
240                    state.advance(1);
241                    if let Some('=') = state.peek() {
242                        state.advance(1);
243                        PascalTokenType::GreaterEqual
244                    }
245                    else {
246                        PascalTokenType::Greater
247                    }
248                }
249                ':' => {
250                    state.advance(1);
251                    if let Some('=') = state.peek() {
252                        state.advance(1);
253                        PascalTokenType::Assign
254                    }
255                    else {
256                        PascalTokenType::Colon
257                    }
258                }
259                ';' => {
260                    state.advance(1);
261                    PascalTokenType::Semicolon
262                }
263                ',' => {
264                    state.advance(1);
265                    PascalTokenType::Comma
266                }
267                '.' => {
268                    state.advance(1);
269                    if let Some('.') = state.peek() {
270                        state.advance(1);
271                        PascalTokenType::Range
272                    }
273                    else {
274                        PascalTokenType::Dot
275                    }
276                }
277                '(' => {
278                    state.advance(1);
279                    PascalTokenType::LeftParen
280                }
281                ')' => {
282                    state.advance(1);
283                    PascalTokenType::RightParen
284                }
285                '[' => {
286                    state.advance(1);
287                    PascalTokenType::LeftBracket
288                }
289                ']' => {
290                    state.advance(1);
291                    PascalTokenType::RightBracket
292                }
293                '^' => {
294                    state.advance(1);
295                    PascalTokenType::Caret
296                }
297                '\n' => {
298                    state.advance(1);
299                    PascalTokenType::Newline
300                }
301                _ => {
302                    state.advance(ch.len_utf8());
303                    PascalTokenType::Error
304                }
305            };
306
307            state.add_token(kind, start_pos, state.get_position());
308            true
309        }
310        else {
311            false
312        }
313    }
314}
315
316impl Lexer<PascalLanguage> for PascalLexer<'_> {
317    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PascalLanguage>) -> LexOutput<PascalLanguage> {
318        let mut state = State::new(source);
319        let result = self.run(&mut state);
320        if result.is_ok() {
321            state.add_eof();
322        }
323        state.finish_with_cache(result, cache)
324    }
325}
326
327impl PascalLexer<'_> {
328    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
329        let safe_point = state.get_position();
330        while state.not_at_end() {
331            // 跳过空白字符
332            if self.skip_whitespace(state) {
333                continue;
334            }
335
336            // 处理注释
337            if self.skip_comment(state) {
338                continue;
339            }
340
341            // 处理字符串
342            if self.lex_string(state) {
343                continue;
344            }
345
346            // 处理标识符和关键字
347            if self.lex_identifier_or_keyword(state) {
348                continue;
349            }
350
351            // 处理数字
352            if self.lex_number(state) {
353                continue;
354            }
355
356            // 处理操作符和标点符号
357            if self.lex_operators_and_punctuation(state) {
358                continue;
359            }
360
361            // 如果没有匹配任何模式,创建错误 token
362            let start_pos = state.get_position();
363            if let Some(ch) = state.peek() {
364                state.advance(ch.len_utf8());
365                state.add_token(PascalTokenType::Error, start_pos, state.get_position());
366            }
367
368            state.advance_if_dead_lock(safe_point);
369        }
370
371        // 添加 EOF token
372        Ok(())
373    }
374}