oak_delphi/lexer/
mod.rs

1use crate::{kind::DelphiSyntaxKind, language::DelphiLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, DelphiLanguage>;
10
11static DELPHI_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static DELPHI_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
13static DELPHI_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
14
15/// Lexer implementation for Delphi programming language
16#[derive(Clone)]
17pub struct DelphiLexer<'config> {
18    config: &'config DelphiLanguage,
19}
20
21impl<'config> Lexer<DelphiLanguage> for DelphiLexer<'config> {
22    fn lex_incremental(
23        &self,
24        source: impl Source,
25        changed: usize,
26        cache: IncrementalCache<DelphiLanguage>,
27    ) -> LexOutput<DelphiLanguage> {
28        let mut state = LexerState::new_with_cache(source, changed, cache);
29        let result = self.run(&mut state);
30        state.finish(result)
31    }
32}
33
34impl<'config> DelphiLexer<'config> {
35    pub fn new(config: &'config DelphiLanguage) -> Self {
36        Self { config }
37    }
38
39    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operators(state) {
64                continue;
65            }
66
67            if self.lex_single_char_tokens(state) {
68                continue;
69            }
70
71            state.safe_check(safe_point);
72        }
73
74        // Add EOF token
75        let eof_pos = state.get_position();
76        state.add_token(DelphiSyntaxKind::Eof, eof_pos, eof_pos);
77        Ok(())
78    }
79
80    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
81        match DELPHI_WHITESPACE.scan(state.rest(), state.get_position(), DelphiSyntaxKind::Whitespace) {
82            Some(token) => {
83                state.advance_with(token);
84                true
85            }
86            None => false,
87        }
88    }
89
90    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
91        let start = state.get_position();
92        let rest = state.rest();
93
94        // Line comment: // ... until newline
95        if rest.starts_with("//") {
96            state.advance(2);
97            while let Some(ch) = state.peek() {
98                if ch == '\n' || ch == '\r' {
99                    break;
100                }
101                state.advance(ch.len_utf8());
102            }
103            state.add_token(DelphiSyntaxKind::Comment, start, state.get_position());
104            return true;
105        }
106
107        // Block comment: { ... } or (* ... *)
108        if rest.starts_with("{") {
109            state.advance(1);
110            while let Some(ch) = state.peek() {
111                if ch == '}' {
112                    state.advance(1);
113                    break;
114                }
115                state.advance(ch.len_utf8());
116            }
117            state.add_token(DelphiSyntaxKind::Comment, start, state.get_position());
118            return true;
119        }
120
121        if rest.starts_with("(*") {
122            state.advance(2);
123            while let Some(ch) = state.peek() {
124                if ch == '*' && state.peek_next_n(1) == Some(')') {
125                    state.advance(2);
126                    break;
127                }
128                state.advance(ch.len_utf8());
129            }
130            state.add_token(DelphiSyntaxKind::Comment, start, state.get_position());
131            return true;
132        }
133
134        false
135    }
136
137    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
138        let start = state.get_position();
139
140        if state.current() != Some('\'') {
141            return false;
142        }
143
144        state.advance(1); // consume opening quote
145        while let Some(ch) = state.peek() {
146            if ch == '\'' {
147                // Check for escaped quote (double quote)
148                if state.peek_next_n(1) == Some('\'') {
149                    state.advance(2); // consume both quotes
150                    continue;
151                }
152                else {
153                    state.advance(1); // consume closing quote
154                    break;
155                }
156            }
157            if ch == '\n' || ch == '\r' {
158                break; // unterminated string
159            }
160            state.advance(ch.len_utf8());
161        }
162
163        state.add_token(DelphiSyntaxKind::String, start, state.get_position());
164        true
165    }
166
167    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
168        let start = state.get_position();
169        let first = match state.current() {
170            Some(c) => c,
171            None => return false,
172        };
173
174        if !first.is_ascii_digit() && first != '$' {
175            return false;
176        }
177
178        let mut is_float = false;
179
180        // Hexadecimal number
181        if first == '$' {
182            state.advance(1);
183            while let Some(c) = state.peek() {
184                if c.is_ascii_hexdigit() {
185                    state.advance(1);
186                }
187                else {
188                    break;
189                }
190            }
191        }
192        else {
193            // Decimal number
194            state.advance(1);
195            while let Some(c) = state.peek() {
196                if c.is_ascii_digit() {
197                    state.advance(1);
198                }
199                else {
200                    break;
201                }
202            }
203
204            // Fractional part
205            if state.peek() == Some('.') {
206                let next = state.peek_next_n(1);
207                if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
208                    is_float = true;
209                    state.advance(1); // consume '.'
210                    while let Some(c) = state.peek() {
211                        if c.is_ascii_digit() {
212                            state.advance(1);
213                        }
214                        else {
215                            break;
216                        }
217                    }
218                }
219            }
220
221            // Exponent part
222            if let Some(c) = state.peek() {
223                if c == 'e' || c == 'E' {
224                    let next = state.peek_next_n(1);
225                    if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
226                        is_float = true;
227                        state.advance(1);
228                        if let Some(sign) = state.peek() {
229                            if sign == '+' || sign == '-' {
230                                state.advance(1);
231                            }
232                        }
233                        while let Some(d) = state.peek() {
234                            if d.is_ascii_digit() {
235                                state.advance(1);
236                            }
237                            else {
238                                break;
239                            }
240                        }
241                    }
242                }
243            }
244        }
245
246        state.add_token(DelphiSyntaxKind::Number, start, state.get_position());
247        true
248    }
249
250    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
251        let start = state.get_position();
252        let ch = match state.current() {
253            Some(c) => c,
254            None => return false,
255        };
256
257        if !(ch.is_ascii_alphabetic() || ch == '_') {
258            return false;
259        }
260
261        state.advance(1);
262        while let Some(c) = state.current() {
263            if c.is_ascii_alphanumeric() || c == '_' {
264                state.advance(1);
265            }
266            else {
267                break;
268            }
269        }
270
271        let end = state.get_position();
272        let text = state.get_text_in((start..end).into());
273        let kind = match text.to_lowercase().as_str() {
274            "and" => DelphiSyntaxKind::And_,
275            "array" => DelphiSyntaxKind::Array,
276            "as" => DelphiSyntaxKind::As_,
277            "begin" => DelphiSyntaxKind::Begin,
278            "case" => DelphiSyntaxKind::Case,
279            "class" => DelphiSyntaxKind::Class,
280            "const" => DelphiSyntaxKind::Const,
281            "div" => DelphiSyntaxKind::Div,
282            "do" => DelphiSyntaxKind::Do,
283            "downto" => DelphiSyntaxKind::Downto,
284            "else" => DelphiSyntaxKind::Else,
285            "end" => DelphiSyntaxKind::End,
286            "except" => DelphiSyntaxKind::Except,
287            "false" => DelphiSyntaxKind::False_,
288            "finally" => DelphiSyntaxKind::Finally,
289            "for" => DelphiSyntaxKind::For,
290            "function" => DelphiSyntaxKind::Function,
291            "if" => DelphiSyntaxKind::If,
292            "implementation" => DelphiSyntaxKind::Implementation,
293            "in" => DelphiSyntaxKind::In_,
294            "interface" => DelphiSyntaxKind::Interface,
295            "is" => DelphiSyntaxKind::Is_,
296            "mod" => DelphiSyntaxKind::Mod,
297            "nil" => DelphiSyntaxKind::Nil,
298            "not" => DelphiSyntaxKind::Not_,
299            "object" => DelphiSyntaxKind::Object,
300            "of" => DelphiSyntaxKind::Of,
301            "or" => DelphiSyntaxKind::Or_,
302            "procedure" => DelphiSyntaxKind::Procedure,
303            "program" => DelphiSyntaxKind::Program,
304            "record" => DelphiSyntaxKind::Record,
305            "repeat" => DelphiSyntaxKind::Repeat,
306            "set" => DelphiSyntaxKind::Set,
307            "then" => DelphiSyntaxKind::Then,
308            "to" => DelphiSyntaxKind::To,
309            "true" => DelphiSyntaxKind::True_,
310            "try" => DelphiSyntaxKind::Try,
311            "type" => DelphiSyntaxKind::Type,
312            "unit" => DelphiSyntaxKind::Unit,
313            "until" => DelphiSyntaxKind::Until,
314            "uses" => DelphiSyntaxKind::Uses,
315            "var" => DelphiSyntaxKind::Var,
316            "while" => DelphiSyntaxKind::While,
317            "with" => DelphiSyntaxKind::With,
318            _ => DelphiSyntaxKind::Identifier,
319        };
320
321        state.add_token(kind, start, state.get_position());
322        true
323    }
324
325    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
326        let start = state.get_position();
327        let rest = state.rest();
328
329        // Multi-character operators (longest first)
330        let patterns: &[(&str, DelphiSyntaxKind)] = &[
331            (":=", DelphiSyntaxKind::Assign),
332            ("<=", DelphiSyntaxKind::LessEqual),
333            (">=", DelphiSyntaxKind::GreaterEqual),
334            ("<>", DelphiSyntaxKind::NotEqual),
335            ("..", DelphiSyntaxKind::DotDot),
336        ];
337
338        for (pat, kind) in patterns {
339            if rest.starts_with(pat) {
340                state.advance(pat.len());
341                state.add_token(*kind, start, state.get_position());
342                return true;
343            }
344        }
345
346        // Single-character operators
347        if let Some(ch) = state.current() {
348            let kind = match ch {
349                '+' => Some(DelphiSyntaxKind::Plus),
350                '-' => Some(DelphiSyntaxKind::Minus),
351                '*' => Some(DelphiSyntaxKind::Star),
352                '/' => Some(DelphiSyntaxKind::Slash),
353                '=' => Some(DelphiSyntaxKind::Equal),
354                '<' => Some(DelphiSyntaxKind::Less),
355                '>' => Some(DelphiSyntaxKind::Greater),
356                '.' => Some(DelphiSyntaxKind::Dot),
357                ':' => Some(DelphiSyntaxKind::Colon),
358                '^' => Some(DelphiSyntaxKind::Caret),
359                '@' => Some(DelphiSyntaxKind::At),
360                _ => None,
361            };
362
363            if let Some(k) = kind {
364                state.advance(ch.len_utf8());
365                state.add_token(k, start, state.get_position());
366                return true;
367            }
368        }
369
370        false
371    }
372
373    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
374        let start = state.get_position();
375
376        if let Some(ch) = state.current() {
377            let kind = match ch {
378                '(' => DelphiSyntaxKind::LeftParen,
379                ')' => DelphiSyntaxKind::RightParen,
380                '[' => DelphiSyntaxKind::LeftBracket,
381                ']' => DelphiSyntaxKind::RightBracket,
382                ',' => DelphiSyntaxKind::Comma,
383                ';' => DelphiSyntaxKind::Semicolon,
384                _ => return false,
385            };
386
387            state.advance(ch.len_utf8());
388            state.add_token(kind, start, state.get_position());
389            true
390        }
391        else {
392            false
393        }
394    }
395}