Skip to main content

oak_pascal/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for Pascal.
3pub mod token_type;
4
5use crate::{language::PascalLanguage, lexer::token_type::PascalTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
13type State<'s, S> = LexerState<'s, S, PascalLanguage>;
14
15static PASCAL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16static PASCAL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "{", block_end: "}", nested_blocks: false });
17
18/// A lexer for Pascal source files.
19#[derive(Clone, Debug)]
20pub struct PascalLexer<'config> {
21    config: &'config PascalLanguage,
22}
23
24impl<'config> PascalLexer<'config> {
25    /// Creates a new `PascalLexer` with the given language configuration.
26    pub fn new(config: &'config PascalLanguage) -> Self {
27        Self { config }
28    }
29
30    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
31        PASCAL_WHITESPACE.scan(state, PascalTokenType::Whitespace)
32    }
33
34    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
35        let start = state.get_position();
36
37        // Line comment starting with //
38        if state.rest().starts_with("//") {
39            return PASCAL_COMMENT.scan(state, PascalTokenType::Comment, PascalTokenType::Comment);
40        }
41
42        // Block comment: { ... }
43        if state.current() == Some('{') {
44            state.advance(1);
45            while let Some(ch) = state.peek() {
46                if ch == '}' {
47                    state.advance(1);
48                    break;
49                }
50                state.advance(ch.len_utf8());
51            }
52            state.add_token(PascalTokenType::Comment, start, state.get_position());
53            return true;
54        }
55
56        // Block comment: (* ... *)
57        if state.rest().starts_with("(*") {
58            state.advance(2);
59            while let Some(ch) = state.peek() {
60                if ch == '*' && state.peek_next_n(1) == Some(')') {
61                    state.advance(2);
62                    break;
63                }
64                state.advance(ch.len_utf8());
65            }
66            state.add_token(PascalTokenType::Comment, start, state.get_position());
67            return true;
68        }
69
70        false
71    }
72
73    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
74        let start = state.get_position();
75
76        // Pascal string literal: '...'
77        if state.current() == Some('\'') {
78            state.advance(1);
79            while let Some(ch) = state.peek() {
80                if ch == '\'' {
81                    // Check if it's an escaped single quote ''
82                    if state.peek_next_n(1) == Some('\'') {
83                        state.advance(2); // skip ''
84                        continue;
85                    }
86                    else {
87                        state.advance(1); // closing quote
88                        break;
89                    }
90                }
91                state.advance(ch.len_utf8());
92            }
93            state.add_token(PascalTokenType::StringLiteral, start, state.get_position());
94            return true;
95        }
96        false
97    }
98
99    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
100        if let Some(ch) = state.peek() {
101            if ch.is_alphabetic() || ch == '_' {
102                let start_pos = state.get_position();
103                let mut text = String::new();
104
105                // Read identifier
106                while let Some(ch) = state.peek() {
107                    if ch.is_alphanumeric() || ch == '_' {
108                        text.push(ch);
109                        state.advance(ch.len_utf8());
110                    }
111                    else {
112                        break;
113                    }
114                }
115
116                // Check if it's a keyword
117                let kind = match text.to_lowercase().as_str() {
118                    "program" => PascalTokenType::Program,
119                    "var" => PascalTokenType::Var,
120                    "const" => PascalTokenType::Const,
121                    "type" => PascalTokenType::Type,
122                    "procedure" => PascalTokenType::Procedure,
123                    "function" => PascalTokenType::Function,
124                    "begin" => PascalTokenType::Begin,
125                    "end" => PascalTokenType::End,
126                    "if" => PascalTokenType::If,
127                    "then" => PascalTokenType::Then,
128                    "else" => PascalTokenType::Else,
129                    "while" => PascalTokenType::While,
130                    "do" => PascalTokenType::Do,
131                    "for" => PascalTokenType::For,
132                    "to" => PascalTokenType::To,
133                    "downto" => PascalTokenType::Downto,
134                    "repeat" => PascalTokenType::Repeat,
135                    "until" => PascalTokenType::Until,
136                    "case" => PascalTokenType::Case,
137                    "of" => PascalTokenType::Of,
138                    "with" => PascalTokenType::With,
139                    "record" => PascalTokenType::Record,
140                    "array" => PascalTokenType::Array,
141                    "set" => PascalTokenType::Set,
142                    "file" => PascalTokenType::File,
143                    "packed" => PascalTokenType::Packed,
144                    "nil" => PascalTokenType::Nil,
145                    "true" => PascalTokenType::True,
146                    "false" => PascalTokenType::False,
147                    "and" => PascalTokenType::And,
148                    "or" => PascalTokenType::Or,
149                    "not" => PascalTokenType::Not,
150                    "div" => PascalTokenType::Div,
151                    "mod" => PascalTokenType::Mod,
152                    "in" => PascalTokenType::In,
153
154                    _ => PascalTokenType::Identifier,
155                };
156
157                state.add_token(kind, start_pos, state.get_position());
158                true
159            }
160            else {
161                false
162            }
163        }
164        else {
165            false
166        }
167    }
168
169    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
170        if let Some(ch) = state.peek() {
171            if ch.is_ascii_digit() {
172                let start_pos = state.get_position();
173                let mut has_dot = false;
174
175                // Read number
176                while let Some(ch) = state.peek() {
177                    if ch.is_ascii_digit() {
178                        state.advance(1);
179                    }
180                    else if ch == '.' && !has_dot {
181                        has_dot = true;
182                        state.advance(1);
183                    }
184                    else {
185                        break;
186                    }
187                }
188
189                let kind = if has_dot { PascalTokenType::RealLiteral } else { PascalTokenType::IntegerLiteral };
190
191                state.add_token(kind, start_pos, state.get_position());
192                true
193            }
194            else {
195                false
196            }
197        }
198        else {
199            false
200        }
201    }
202
203    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
204        if let Some(ch) = state.peek() {
205            let start_pos = state.get_position();
206
207            let kind = match ch {
208                '+' => {
209                    state.advance(1);
210                    PascalTokenType::Plus
211                }
212                '-' => {
213                    state.advance(1);
214                    PascalTokenType::Minus
215                }
216                '*' => {
217                    state.advance(1);
218                    PascalTokenType::Multiply
219                }
220                '/' => {
221                    state.advance(1);
222                    PascalTokenType::Divide
223                }
224                '=' => {
225                    state.advance(1);
226                    PascalTokenType::Equal
227                }
228                '<' => {
229                    state.advance(1);
230                    if let Some('=') = state.peek() {
231                        state.advance(1);
232                        PascalTokenType::LessEqual
233                    }
234                    else if let Some('>') = state.peek() {
235                        state.advance(1);
236                        PascalTokenType::NotEqual
237                    }
238                    else {
239                        PascalTokenType::Less
240                    }
241                }
242                '>' => {
243                    state.advance(1);
244                    if let Some('=') = state.peek() {
245                        state.advance(1);
246                        PascalTokenType::GreaterEqual
247                    }
248                    else {
249                        PascalTokenType::Greater
250                    }
251                }
252                ':' => {
253                    state.advance(1);
254                    if let Some('=') = state.peek() {
255                        state.advance(1);
256                        PascalTokenType::Assign
257                    }
258                    else {
259                        PascalTokenType::Colon
260                    }
261                }
262                ';' => {
263                    state.advance(1);
264                    PascalTokenType::Semicolon
265                }
266                ',' => {
267                    state.advance(1);
268                    PascalTokenType::Comma
269                }
270                '.' => {
271                    state.advance(1);
272                    if let Some('.') = state.peek() {
273                        state.advance(1);
274                        PascalTokenType::Range
275                    }
276                    else {
277                        PascalTokenType::Dot
278                    }
279                }
280                '(' => {
281                    state.advance(1);
282                    PascalTokenType::LeftParen
283                }
284                ')' => {
285                    state.advance(1);
286                    PascalTokenType::RightParen
287                }
288                '[' => {
289                    state.advance(1);
290                    PascalTokenType::LeftBracket
291                }
292                ']' => {
293                    state.advance(1);
294                    PascalTokenType::RightBracket
295                }
296                '^' => {
297                    state.advance(1);
298                    PascalTokenType::Caret
299                }
300                '\n' => {
301                    state.advance(1);
302                    PascalTokenType::Newline
303                }
304                _ => {
305                    state.advance(ch.len_utf8());
306                    PascalTokenType::Error
307                }
308            };
309
310            state.add_token(kind, start_pos, state.get_position());
311            true
312        }
313        else {
314            false
315        }
316    }
317}
318
319impl Lexer<PascalLanguage> for PascalLexer<'_> {
320    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PascalLanguage>) -> LexOutput<PascalLanguage> {
321        let mut state = State::new(source);
322        let result = self.run(&mut state);
323        if result.is_ok() {
324            state.add_eof();
325        }
326        state.finish_with_cache(result, cache)
327    }
328}
329
330impl PascalLexer<'_> {
331    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
332        let safe_point = state.get_position();
333        while state.not_at_end() {
334            // Skip whitespace
335            if self.skip_whitespace(state) {
336                continue;
337            }
338
339            // Handle comments
340            if self.skip_comment(state) {
341                continue;
342            }
343
344            // Handle strings
345            if self.lex_string(state) {
346                continue;
347            }
348
349            // Handle identifiers and keywords
350            if self.lex_identifier_or_keyword(state) {
351                continue;
352            }
353
354            // Handle numbers
355            if self.lex_number(state) {
356                continue;
357            }
358
359            // Handle operators and punctuation
360            if self.lex_operators_and_punctuation(state) {
361                continue;
362            }
363
364            // If no pattern matches, create an error token
365            let start_pos = state.get_position();
366            if let Some(ch) = state.peek() {
367                state.advance(ch.len_utf8());
368                state.add_token(PascalTokenType::Error, start_pos, state.get_position());
369            }
370
371            state.advance_if_dead_lock(safe_point);
372        }
373
374        // Add EOF token
375        Ok(())
376    }
377}