Skip to main content

oak_von/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2
3use oak_core::{
4    Lexer, LexerState, Source, TextEdit,
5    lexer::{LexOutput, LexerCache},
6};
7
8/// Token types for the VON language.
9pub mod token_type;
10use crate::language::VonLanguage;
11pub use token_type::{VonToken, VonTokenType};
12
13pub(crate) type State<'a, S> = LexerState<'a, S, VonLanguage>;
14
15/// A lexer for the VON language.
16#[derive(Clone, Debug)]
17pub struct VonLexer<'config> {
18    config: &'config VonLanguage,
19}
20
21impl<'config> VonLexer<'config> {
22    /// Creates a new `VonLexer` with the given configuration.
23    pub fn new(config: &'config VonLanguage) -> Self {
24        Self { config }
25    }
26
27    /// Skips whitespace characters.
28    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
29        let start_pos = state.get_position();
30
31        while let Some(ch) = state.peek() {
32            if ch == ' ' || ch == '\t' {
33                state.advance(ch.len_utf8());
34            }
35            else {
36                break;
37            }
38        }
39
40        if state.get_position() > start_pos {
41            state.add_token(VonTokenType::Whitespace, start_pos, state.get_position());
42            true
43        }
44        else {
45            false
46        }
47    }
48
49    /// Lexes a newline.
50    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
51        let start_pos = state.get_position();
52
53        if let Some('\n') = state.peek() {
54            state.advance(1);
55            state.add_token(VonTokenType::Newline, start_pos, state.get_position());
56            true
57        }
58        else if let Some('\r') = state.peek() {
59            state.advance(1);
60            if let Some('\n') = state.peek() {
61                state.advance(1);
62            }
63            state.add_token(VonTokenType::Newline, start_pos, state.get_position());
64            true
65        }
66        else {
67            false
68        }
69    }
70
71    /// Lexes a comment.
72    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
73        let start_pos = state.get_position();
74
75        // Single-line comment #
76        if let Some('#') = state.peek() {
77            state.advance(1);
78
79            // Read until end of line
80            while let Some(ch) = state.peek() {
81                if ch == '\n' || ch == '\r' {
82                    break;
83                }
84                state.advance(ch.len_utf8());
85            }
86
87            state.add_token(VonTokenType::Comment, start_pos, state.get_position());
88            return true;
89        }
90        false
91    }
92
93    /// Lexes a string literal or raw string.
94    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start = state.get_position();
96
97        // Check for raw string raw"..."
98        let mut is_raw = false;
99        if let Some('r') = state.peek() {
100            if let Some('a') = state.peek_next_n(1) {
101                if let Some('w') = state.peek_next_n(2) {
102                    if let Some(c) = state.peek_next_n(3) {
103                        if c == '"' || c == '\'' {
104                            is_raw = true;
105                            // Note: don't advance directly here, let the subsequent logic handle quotes
106                        }
107                    }
108                }
109            }
110        }
111
112        let quote = if is_raw {
113            state.peek_next_n(3).unwrap()
114        }
115        else {
116            match state.peek() {
117                Some(c) if c == '"' || c == '\'' => c,
118                _ => return false,
119            }
120        };
121
122        if is_raw {
123            state.advance(3);
124        }
125
126        let mut quote_count = 0;
127        while let Some(c) = state.peek() {
128            if c == quote {
129                quote_count += 1;
130                state.advance(c.len_utf8());
131            }
132            else {
133                break;
134            }
135        }
136
137        // "" or '' are empty strings
138        if quote_count == 2 {
139            state.add_token(VonTokenType::StringLiteral, start, state.get_position());
140            return true;
141        }
142
143        if quote_count == 0 {
144            state.set_position(start);
145            return false;
146        }
147
148        let mut current_consecutive = 0;
149        let mut escaped = false;
150
151        while let Some(c) = state.peek() {
152            if !is_raw && escaped {
153                escaped = false;
154                state.advance(c.len_utf8());
155                current_consecutive = 0;
156                continue;
157            }
158
159            if !is_raw && c == '\\' && quote_count == 1 {
160                escaped = true;
161                state.advance(1);
162                current_consecutive = 0;
163                continue;
164            }
165
166            if c == quote {
167                current_consecutive += 1;
168                state.advance(c.len_utf8());
169                if current_consecutive == quote_count {
170                    state.add_token(VonTokenType::StringLiteral, start, state.get_position());
171                    return true;
172                }
173            }
174            else {
175                current_consecutive = 0;
176                state.advance(c.len_utf8());
177            }
178        }
179
180        // Unclosed string, mark as error but still treat as string for syntax highlighting
181        state.add_token(VonTokenType::Error, start, state.get_position());
182        true
183    }
184
185    /// Handles number literals.
186    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
187        let start_pos = state.get_position();
188
189        if let Some(ch) = state.peek() {
190            // Number must start with a digit, negative sign, or dot (followed by digit)
191            let is_number_start = ch.is_ascii_digit() || (ch == '-' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()));
192
193            if !is_number_start {
194                return false;
195            }
196
197            if ch == '-' {
198                state.advance(1);
199            }
200
201            // Integer part
202            if let Some(first) = state.peek() {
203                if first.is_ascii_digit() {
204                    while let Some(digit) = state.peek() {
205                        if digit.is_ascii_digit() || digit == '_' {
206                            state.advance(1);
207                        }
208                        else {
209                            break;
210                        }
211                    }
212                }
213            }
214
215            // Check for dot
216            if let Some('.') = state.peek() {
217                let mut lookahead = 1;
218                while let Some(c) = state.peek_next_n(lookahead) {
219                    if c == '_' {
220                        lookahead += 1;
221                    }
222                    else {
223                        break;
224                    }
225                }
226                if let Some(next_ch) = state.peek_next_n(lookahead) {
227                    if next_ch.is_ascii_digit() {
228                        state.advance(1); // Skip dot
229                        while let Some(digit) = state.peek() {
230                            if digit.is_ascii_digit() || digit == '_' {
231                                state.advance(1);
232                            }
233                            else {
234                                break;
235                            }
236                        }
237                    }
238                }
239            }
240
241            // Check for exponent
242            if let Some(e) = state.peek() {
243                if e == 'e' || e == 'E' {
244                    // Ensure exponent is followed by digits (or sign + digits)
245                    let mut lookahead = 1;
246                    if let Some(sign) = state.peek_next_n(lookahead) {
247                        if sign == '+' || sign == '-' {
248                            lookahead += 1;
249                        }
250                    }
251
252                    let has_digits = state.peek_next_n(lookahead).map_or(false, |c| c.is_ascii_digit() || (c == '_' && state.peek_next_n(lookahead + 1).map_or(false, |n| n.is_ascii_digit())));
253
254                    if has_digits {
255                        state.advance(1); // Skip e/E
256
257                        // Optional sign
258                        if let Some(sign) = state.peek() {
259                            if sign == '+' || sign == '-' {
260                                state.advance(1);
261                            }
262                        }
263
264                        // Exponent digits
265                        while let Some(digit) = state.peek() {
266                            if digit.is_ascii_digit() || digit == '_' {
267                                state.advance(1);
268                            }
269                            else {
270                                break;
271                            }
272                        }
273                    }
274                }
275            }
276
277            // Only considered a number if at least one digit or negative sign followed by digit is consumed
278            // Also check that it's not immediately followed by a letter, which might be an identifier (e.g. version)
279            if state.get_position() > start_pos {
280                if let Some(next) = state.peek() {
281                    if next.is_ascii_alphabetic() || next == '_' {
282                        state.set_position(start_pos);
283                        return false;
284                    }
285                }
286                state.add_token(VonTokenType::NumberLiteral, start_pos, state.get_position());
287                return true;
288            }
289            false
290        }
291        else {
292            false
293        }
294    }
295
296    /// Handles identifiers and keywords.
297    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
298        let start_pos = state.get_position();
299
300        if let Some(ch) = state.peek() {
301            if ch.is_ascii_alphabetic() || ch == '_' {
302                // If it's 'r', it might be 'raw', need to check if it's the start of a raw string
303                if ch == 'r' {
304                    if let Some('a') = state.peek_next_n(1) {
305                        if let Some('w') = state.peek_next_n(2) {
306                            if let Some(c) = state.peek_next_n(3) {
307                                if c == '"' || c == '\'' {
308                                    // This is a raw string, handled by lex_string
309                                    return false;
310                                }
311                            }
312                        }
313                    }
314                }
315
316                while let Some(ch) = state.peek() {
317                    if ch.is_ascii_alphanumeric() || ch == '_' {
318                        state.advance(ch.len_utf8());
319                    }
320                    else {
321                        break;
322                    }
323                }
324
325                let text = state.get_text_in((start_pos..state.get_position()).into());
326                let token_kind = match text.as_ref() {
327                    "true" | "false" => VonTokenType::BoolLiteral,
328                    "null" => VonTokenType::NullLiteral,
329                    _ => VonTokenType::Identifier,
330                };
331
332                state.add_token(token_kind, start_pos, state.get_position());
333                return true;
334            }
335        }
336        false
337    }
338
339    /// Handles operators and punctuation.
340    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
341        let start_pos = state.get_position();
342
343        if let Some(ch) = state.peek() {
344            let token_kind = match ch {
345                '[' => {
346                    state.advance(1);
347                    VonTokenType::LeftBracket
348                }
349                ']' => {
350                    state.advance(1);
351                    VonTokenType::RightBracket
352                }
353                '{' => {
354                    state.advance(1);
355                    VonTokenType::LeftBrace
356                }
357                '}' => {
358                    state.advance(1);
359                    VonTokenType::RightBrace
360                }
361                ',' => {
362                    state.advance(1);
363                    VonTokenType::Comma
364                }
365                ':' => {
366                    state.advance(1);
367                    VonTokenType::Colon
368                }
369                '=' => {
370                    state.advance(1);
371                    VonTokenType::Eq
372                }
373                _ => return false,
374            };
375            state.add_token(token_kind, start_pos, state.get_position());
376            true
377        }
378        else {
379            false
380        }
381    }
382}
383
384impl<'config> Lexer<VonLanguage> for VonLexer<'config> {
385    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<VonLanguage>) -> LexOutput<VonLanguage> {
386        let mut state = State::new(source);
387        while state.not_at_end() {
388            if self.skip_whitespace(&mut state) {
389                continue;
390            }
391            if self.lex_newline(&mut state) {
392                continue;
393            }
394            if self.lex_comment(&mut state) {
395                continue;
396            }
397            if self.lex_identifier_or_keyword(&mut state) {
398                continue;
399            }
400            if self.lex_number(&mut state) {
401                continue;
402            }
403            if self.lex_string(&mut state) {
404                continue;
405            }
406            if self.lex_operator(&mut state) {
407                continue;
408            }
409
410            // If no match, treat as error and skip one character
411            let start_pos = state.get_position();
412            if let Some(ch) = state.peek() {
413                state.advance(ch.len_utf8());
414                state.add_token(VonTokenType::Error, start_pos, state.get_position());
415            }
416            else {
417                break;
418            }
419        }
420
421        state.finish(Ok(()))
422    }
423}