Skip to main content

oak_delphi/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::DelphiLanguage, lexer::token_type::DelphiTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
6
/// Lexer state specialised to [`DelphiLanguage`]; every scanning routine in this module takes it by `&mut`.
pub(crate) type State<'a, S> = LexerState<'a, S, DelphiLanguage>;
8
/// Lexer implementation for the Delphi programming language.
///
/// The lexer borrows its [`DelphiLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct DelphiLexer<'config> {
    /// Language configuration handed to [`DelphiLexer::new`].
    // NOTE(review): none of the scanning routines in this file read this field — confirm whether
    // it is reserved for future dialect switches.
    config: &'config DelphiLanguage,
}
14
15impl<'config> DelphiLexer<'config> {
    /// Creates a new `DelphiLexer` that borrows the given language configuration.
    pub fn new(config: &'config DelphiLanguage) -> Self {
        Self { config }
    }
20
21    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
22        while state.not_at_end() {
23            let safe_point = state.get_position();
24
25            if self.skip_whitespace(state) {
26                continue;
27            }
28
29            if self.skip_comment(state) {
30                continue;
31            }
32
33            if self.lex_string_literal(state) {
34                continue;
35            }
36
37            if self.lex_number_literal(state) {
38                continue;
39            }
40
41            if self.lex_identifier_or_keyword(state) {
42                continue;
43            }
44
45            if self.lex_operators(state) {
46                continue;
47            }
48
49            if self.lex_single_char_tokens(state) {
50                continue;
51            }
52
53            // If no rules match, add an error token and advance.
54            let start_pos = state.get_position();
55            if let Some(ch) = state.peek() {
56                state.advance(ch.len_utf8());
57                state.add_token(DelphiTokenType::Error, start_pos, state.get_position())
58            }
59
60            state.advance_if_dead_lock(safe_point)
61        }
62
63        Ok(())
64    }
65
66    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
67        let start_pos = state.get_position();
68        let mut consumed = false;
69        while let Some(ch) = state.peek() {
70            if ch.is_whitespace() {
71                consumed = true;
72                state.advance(ch.len_utf8())
73            }
74            else {
75                break;
76            }
77        }
78        if consumed {
79            state.add_token(DelphiTokenType::Whitespace, start_pos, state.get_position());
80            true
81        }
82        else {
83            false
84        }
85    }
86
87    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89
90        // Line comment: // ... until newline
91        if state.consume_if_starts_with("//") {
92            while let Some(ch) = state.peek() {
93                if ch == '\n' || ch == '\r' {
94                    break;
95                }
96                state.advance(ch.len_utf8())
97            }
98            state.add_token(DelphiTokenType::LineComment, start, state.get_position());
99            return true;
100        }
101
102        // Block comment: { ... }
103        if state.consume_if_starts_with("{") {
104            let mut depth = 1usize;
105            while let Some(ch) = state.peek() {
106                if ch == '{' {
107                    depth += 1
108                }
109                else if ch == '}' {
110                    depth -= 1;
111                    if depth == 0 {
112                        state.advance(1);
113                        break;
114                    }
115                }
116                state.advance(ch.len_utf8())
117            }
118            state.add_token(DelphiTokenType::BlockComment, start, state.get_position());
119            return true;
120        }
121
122        // Block comment: (* ... *)
123        if state.consume_if_starts_with("(*") {
124            while let Some(ch) = state.peek() {
125                if state.consume_if_starts_with("*)") {
126                    break;
127                }
128                state.advance(ch.len_utf8())
129            }
130            state.add_token(DelphiTokenType::BlockComment, start, state.get_position());
131            return true;
132        }
133
134        false
135    }
136
137    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
138        let start = state.get_position();
139        if let Some('\'') = state.peek() {
140            state.advance(1);
141            while let Some(ch) = state.peek() {
142                if ch == '\'' {
143                    state.advance(1);
144                    if state.peek() == Some('\'') {
145                        // Double single quote is an escaped single quote
146                        state.advance(1);
147                        continue;
148                    }
149                    break;
150                }
151                state.advance(ch.len_utf8())
152            }
153            state.add_token(DelphiTokenType::String, start, state.get_position());
154            return true;
155        }
156        false
157    }
158
159    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160        let start = state.get_position();
161        let first = match state.peek() {
162            Some(c) => c,
163            None => return false,
164        };
165
166        if !first.is_ascii_digit() && first != '$' {
167            return false;
168        }
169
170        let mut is_float = false;
171
172        // Hexadecimal number
173        if first == '$' {
174            state.advance(1);
175            while let Some(c) = state.peek() {
176                if c.is_ascii_hexdigit() { state.advance(1) } else { break }
177            }
178        }
179        else {
180            // Decimal number
181            state.advance(1);
182            while let Some(c) = state.peek() {
183                if c.is_ascii_digit() { state.advance(1) } else { break }
184            }
185
186            // Fractional part
187            if state.peek() == Some('.') {
188                let next = state.peek_next_n(1);
189                if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
190                    is_float = true;
191                    state.advance(1); // consume '.'
192                    while let Some(c) = state.peek() {
193                        if c.is_ascii_digit() { state.advance(1) } else { break }
194                    }
195                }
196            }
197
198            // Exponent part
199            if let Some(c) = state.peek() {
200                if c == 'e' || c == 'E' {
201                    let next = state.peek_next_n(1);
202                    if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
203                        is_float = true;
204                        state.advance(1);
205                        if let Some(sign) = state.peek() {
206                            if sign == '+' || sign == '-' {
207                                state.advance(1)
208                            }
209                        }
210                        while let Some(d) = state.peek() {
211                            if d.is_ascii_digit() { state.advance(1) } else { break }
212                        }
213                    }
214                }
215            }
216        }
217
218        state.add_token(if is_float { DelphiTokenType::Float } else { DelphiTokenType::Number }, start, state.get_position());
219        true
220    }
221
222    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
223        let start = state.get_position();
224        let ch = match state.peek() {
225            Some(c) => c,
226            None => return false,
227        };
228
229        if !(ch.is_ascii_alphabetic() || ch == '_') {
230            return false;
231        }
232
233        state.advance(ch.len_utf8());
234        while let Some(c) = state.peek() {
235            if c.is_ascii_alphanumeric() || c == '_' { state.advance(c.len_utf8()) } else { break }
236        }
237
238        let end = state.get_position();
239        let text = state.get_text_in((start..end).into());
240        let kind = match text.to_lowercase().as_str() {
241            "and" => DelphiTokenType::And_,
242            "array" => DelphiTokenType::Array,
243            "as" => DelphiTokenType::As_,
244            "begin" => DelphiTokenType::Begin,
245            "case" => DelphiTokenType::Case,
246            "class" => DelphiTokenType::Class,
247            "const" => DelphiTokenType::Const,
248            "div" => DelphiTokenType::Div,
249            "do" => DelphiTokenType::Do,
250            "downto" => DelphiTokenType::Downto,
251            "else" => DelphiTokenType::Else,
252            "end" => DelphiTokenType::End,
253            "except" => DelphiTokenType::Except,
254            "false" => DelphiTokenType::False_,
255            "finally" => DelphiTokenType::Finally,
256            "for" => DelphiTokenType::For,
257            "function" => DelphiTokenType::Function,
258            "if" => DelphiTokenType::If,
259            "implementation" => DelphiTokenType::Implementation,
260            "in" => DelphiTokenType::In_,
261            "interface" => DelphiTokenType::Interface,
262            "is" => DelphiTokenType::Is_,
263            "mod" => DelphiTokenType::Mod,
264            "nil" => DelphiTokenType::Nil,
265            "not" => DelphiTokenType::Not_,
266            "object" => DelphiTokenType::Object,
267            "of" => DelphiTokenType::Of,
268            "or" => DelphiTokenType::Or_,
269            "procedure" => DelphiTokenType::Procedure,
270            "program" => DelphiTokenType::Program,
271            "record" => DelphiTokenType::Record,
272            "repeat" => DelphiTokenType::Repeat,
273            "set" => DelphiTokenType::Set,
274            "then" => DelphiTokenType::Then,
275            "to" => DelphiTokenType::To,
276            "true" => DelphiTokenType::True_,
277            "try" => DelphiTokenType::Try,
278            "type" => DelphiTokenType::Type,
279            "unit" => DelphiTokenType::Unit,
280            "until" => DelphiTokenType::Until,
281            "uses" => DelphiTokenType::Uses,
282            "var" => DelphiTokenType::Var,
283            "while" => DelphiTokenType::While,
284            "with" => DelphiTokenType::With,
285            _ => DelphiTokenType::Identifier,
286        };
287
288        state.add_token(kind, start, state.get_position());
289        true
290    }
291
292    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
293        let start = state.get_position();
294
295        // Multi-character operators (longest first)
296        let patterns: &[(&str, DelphiTokenType)] = &[(":=", DelphiTokenType::Assign), ("<=", DelphiTokenType::LessEqual), (">=", DelphiTokenType::GreaterEqual), ("<>", DelphiTokenType::NotEqual), ("..", DelphiTokenType::DotDot)];
297
298        for (pat, kind) in patterns {
299            if state.consume_if_starts_with(pat) {
300                state.add_token(*kind, start, state.get_position());
301                return true;
302            }
303        }
304
305        // Single-character operators
306        if let Some(ch) = state.peek() {
307            let kind = match ch {
308                '+' => Some(DelphiTokenType::Plus),
309                '-' => Some(DelphiTokenType::Minus),
310                '*' => Some(DelphiTokenType::Star),
311                '/' => Some(DelphiTokenType::Slash),
312                '=' => Some(DelphiTokenType::Equal),
313                '<' => Some(DelphiTokenType::Less),
314                '>' => Some(DelphiTokenType::Greater),
315                '.' => Some(DelphiTokenType::Dot),
316                ':' => Some(DelphiTokenType::Colon),
317                '^' => Some(DelphiTokenType::Caret),
318                '@' => Some(DelphiTokenType::At),
319                _ => None,
320            };
321
322            if let Some(k) = kind {
323                state.advance(ch.len_utf8());
324                state.add_token(k, start, state.get_position());
325                return true;
326            }
327        }
328
329        false
330    }
331
332    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
333        let start = state.get_position();
334
335        if let Some(ch) = state.peek() {
336            let kind = match ch {
337                '(' => DelphiTokenType::LeftParen,
338                ')' => DelphiTokenType::RightParen,
339                '[' => DelphiTokenType::LeftBracket,
340                ']' => DelphiTokenType::RightBracket,
341                ',' => DelphiTokenType::Comma,
342                ';' => DelphiTokenType::Semicolon,
343                _ => return false,
344            };
345
346            state.advance(ch.len_utf8());
347            state.add_token(kind, start, state.get_position());
348            true
349        }
350        else {
351            false
352        }
353    }
354}
355
356impl<'config> Lexer<DelphiLanguage> for DelphiLexer<'config> {
357    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<DelphiLanguage>) -> LexOutput<DelphiLanguage> {
358        let mut state = State::new_with_cache(source, 0, cache);
359        let result = self.run(&mut state);
360        if result.is_ok() {
361            state.add_eof()
362        }
363        state.finish_with_cache(result, cache)
364    }
365}