//! Lexer for schema language.

use crate::error::{Result, SchemaError};
use crate::span::Span;
use crate::token::{Token, TokenKind};
/// Lexer for tokenizing schema source code.
///
/// Produces tokens one at a time via `next_token`. Spaces, tabs, carriage
/// returns, and comments are skipped; newlines are significant and emitted
/// as their own tokens.
pub struct Lexer<'a> {
    /// Source text being lexed.
    source: &'a str,
    /// Current byte position in source. Always a `char` boundary: it is only
    /// ever advanced by the `len_utf8()` of a consumed character.
    pos: usize,
    /// Characters remaining (for efficient peeking).
    chars: std::str::Chars<'a>,
    /// Peeked character cache. `Some` means one char has been pulled from
    /// `chars` but not yet consumed, so `pos` has not been advanced for it.
    peeked: Option<char>,
}

19impl<'a> Lexer<'a> {
20    /// Create a new lexer for the given source.
21    pub fn new(source: &'a str) -> Self {
22        Self {
23            source,
24            pos: 0,
25            chars: source.chars(),
26            peeked: None,
27        }
28    }
29
30    /// Get the next token.
31    pub fn next_token(&mut self) -> Result<Token> {
32        self.skip_whitespace();
33
34        while self.peek() == Some('/')
35            && (self.peek_n(1) == Some('/') || self.peek_n(1) == Some('*'))
36        {
37            self.skip_comment()?;
38            self.skip_whitespace();
39        }
40
41        let start = self.pos;
42
43        if self.is_at_end() {
44            return Ok(Token::new(TokenKind::Eof, Span::new(start, start)));
45        }
46
47        let ch = self.peek().unwrap();
48
49        if ch == '\n' {
50            self.advance();
51            return Ok(Token::new(TokenKind::Newline, Span::new(start, self.pos)));
52        }
53
54        if ch == '"' || ch == '\'' {
55            return self.lex_string(start, ch);
56        }
57
58        if ch.is_ascii_digit() {
59            return self.lex_number(start);
60        }
61
62        if ch.is_alphabetic() || ch == '_' {
63            return self.lex_identifier_or_keyword(start);
64        }
65
66        if ch == '@' {
67            self.advance();
68            if self.peek() == Some('@') {
69                self.advance();
70                return Ok(Token::new(TokenKind::AtAt, Span::new(start, self.pos)));
71            }
72            return Ok(Token::new(TokenKind::At, Span::new(start, self.pos)));
73        }
74
75        let kind = match ch {
76            '{' => {
77                self.advance();
78                TokenKind::LBrace
79            }
80            '}' => {
81                self.advance();
82                TokenKind::RBrace
83            }
84            '[' => {
85                self.advance();
86                TokenKind::LBracket
87            }
88            ']' => {
89                self.advance();
90                TokenKind::RBracket
91            }
92            '(' => {
93                self.advance();
94                TokenKind::LParen
95            }
96            ')' => {
97                self.advance();
98                TokenKind::RParen
99            }
100            ',' => {
101                self.advance();
102                TokenKind::Comma
103            }
104            ':' => {
105                self.advance();
106                TokenKind::Colon
107            }
108            '=' => {
109                self.advance();
110                TokenKind::Equal
111            }
112            '?' => {
113                self.advance();
114                TokenKind::Question
115            }
116            '!' => {
117                self.advance();
118                if self.peek() == Some('=') {
119                    self.advance();
120                    TokenKind::BangEqual
121                } else {
122                    TokenKind::Bang
123                }
124            }
125            '.' => {
126                self.advance();
127                TokenKind::Dot
128            }
129            '*' => {
130                self.advance();
131                TokenKind::Star
132            }
133            '+' => {
134                self.advance();
135                TokenKind::Plus
136            }
137            '-' => {
138                self.advance();
139                TokenKind::Minus
140            }
141            '<' => {
142                self.advance();
143                if self.peek() == Some('=') {
144                    self.advance();
145                    TokenKind::LessEqual
146                } else {
147                    TokenKind::LAngle
148                }
149            }
150            '>' => {
151                self.advance();
152                if self.peek() == Some('=') {
153                    self.advance();
154                    TokenKind::GreaterEqual
155                } else {
156                    TokenKind::RAngle
157                }
158            }
159            '%' => {
160                self.advance();
161                TokenKind::Percent
162            }
163            '|' => {
164                self.advance();
165                if self.peek() == Some('|') {
166                    self.advance();
167                    TokenKind::DoublePipe
168                } else {
169                    TokenKind::Pipe
170                }
171            }
172            _ => {
173                self.advance();
174                return Err(SchemaError::UnexpectedCharacter(ch, Span::single(start)));
175            }
176        };
177
178        Ok(Token::new(kind, Span::new(start, self.pos)))
179    }
180
181    /// Lex an identifier or keyword.
182    fn lex_identifier_or_keyword(&mut self, start: usize) -> Result<Token> {
183        while let Some(ch) = self.peek() {
184            if ch.is_alphanumeric() || ch == '_' {
185                self.advance();
186            } else {
187                break;
188            }
189        }
190
191        let text = &self.source[start..self.pos];
192        let kind = TokenKind::from_ident(text);
193        Ok(Token::new(kind, Span::new(start, self.pos)))
194    }
195
196    /// Lex a string literal.
197    fn lex_string(&mut self, start: usize, quote: char) -> Result<Token> {
198        self.advance();
199
200        let mut value = String::new();
201
202        loop {
203            match self.peek() {
204                None | Some('\n') => {
205                    return Err(SchemaError::UnterminatedString(Span::new(start, self.pos)));
206                }
207                Some(ch) if ch == quote => {
208                    if quote == '\'' && self.peek_n(1) == Some('\'') {
209                        value.push('\'');
210                        self.advance();
211                        self.advance();
212                        continue;
213                    }
214                    self.advance();
215                    break;
216                }
217                Some('\\') => {
218                    self.advance();
219                    match self.peek() {
220                        Some(ch) => {
221                            let escaped = match ch {
222                                'n' => '\n',
223                                't' => '\t',
224                                'r' => '\r',
225                                '\\' => '\\',
226                                '"' if quote == '"' => '"',
227                                '\'' if quote == '\'' => '\'',
228                                _ => ch,
229                            };
230                            value.push(escaped);
231                            self.advance();
232                        }
233                        None => {
234                            return Err(SchemaError::UnterminatedString(Span::new(
235                                start, self.pos,
236                            )));
237                        }
238                    }
239                }
240                Some(ch) => {
241                    value.push(ch);
242                    self.advance();
243                }
244            }
245        }
246
247        Ok(Token::new(
248            TokenKind::String(value),
249            Span::new(start, self.pos),
250        ))
251    }
252
253    /// Lex a number literal.
254    fn lex_number(&mut self, start: usize) -> Result<Token> {
255        while let Some(ch) = self.peek() {
256            if ch.is_ascii_digit() {
257                self.advance();
258            } else {
259                break;
260            }
261        }
262
263        if self.peek() == Some('.') && self.peek_n(1).is_some_and(|ch| ch.is_ascii_digit()) {
264            self.advance(); // consume '.'
265
266            while let Some(ch) = self.peek() {
267                if ch.is_ascii_digit() {
268                    self.advance();
269                } else {
270                    break;
271                }
272            }
273        }
274
275        let text = &self.source[start..self.pos];
276
277        if text.parse::<f64>().is_err() {
278            return Err(SchemaError::InvalidNumber(
279                text.to_string(),
280                Span::new(start, self.pos),
281            ));
282        }
283
284        Ok(Token::new(
285            TokenKind::Number(text.to_string()),
286            Span::new(start, self.pos),
287        ))
288    }
289
290    /// Skip whitespace characters (but not newlines).
291    fn skip_whitespace(&mut self) {
292        while let Some(ch) = self.peek() {
293            if ch == ' ' || ch == '\t' || ch == '\r' {
294                self.advance();
295            } else {
296                break;
297            }
298        }
299    }
300
301    /// Skip comments (single-line or block).
302    fn skip_comment(&mut self) -> Result<()> {
303        if self.peek() != Some('/') {
304            return Ok(());
305        }
306
307        let start = self.pos;
308        self.advance(); // consume first '/'
309
310        match self.peek() {
311            Some('/') => {
312                self.advance(); // consume second '/'
313                while let Some(ch) = self.peek() {
314                    if ch == '\n' {
315                        break;
316                    }
317                    self.advance();
318                }
319            }
320            Some('*') => {
321                self.advance(); // consume '*'
322
323                loop {
324                    match self.peek() {
325                        None => {
326                            return Err(SchemaError::Lexer(
327                                "Unterminated block comment".to_string(),
328                                Span::new(start, self.pos),
329                            ));
330                        }
331                        Some('*') => {
332                            self.advance();
333                            if self.peek() == Some('/') {
334                                self.advance();
335                                break;
336                            }
337                        }
338                        Some(_) => {
339                            self.advance();
340                        }
341                    }
342                }
343            }
344            _ => {}
345        }
346
347        Ok(())
348    }
349
350    /// Peek at the current character without consuming it.
351    fn peek(&mut self) -> Option<char> {
352        if self.peeked.is_none() {
353            self.peeked = self.chars.next();
354        }
355        self.peeked
356    }
357
358    /// Peek at the nth character ahead without consuming.
359    fn peek_n(&self, n: usize) -> Option<char> {
360        self.source[self.pos..].chars().nth(n)
361    }
362
363    /// Advance to the next character.
364    fn advance(&mut self) -> Option<char> {
365        let ch = self.peek()?;
366        self.pos += ch.len_utf8();
367        self.peeked = None;
368        Some(ch)
369    }
370
371    /// Check if we've reached the end of the source.
372    fn is_at_end(&mut self) -> bool {
373        self.peek().is_none()
374    }
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Drain `source` into a list of token kinds, stopping at EOF.
    fn tokenize(source: &str) -> Result<Vec<TokenKind>> {
        let mut lexer = Lexer::new(source);
        let mut kinds = Vec::new();

        loop {
            let token = lexer.next_token()?;
            if token.kind == TokenKind::Eof {
                return Ok(kinds);
            }
            kinds.push(token.kind);
        }
    }

    // Shorthand constructors for the owned-string token kinds.
    fn ident(name: &str) -> TokenKind {
        TokenKind::Ident(name.to_string())
    }

    fn string(value: &str) -> TokenKind {
        TokenKind::String(value.to_string())
    }

    fn number(value: &str) -> TokenKind {
        TokenKind::Number(value.to_string())
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            tokenize("datasource generator model enum").unwrap(),
            vec![
                TokenKind::Datasource,
                TokenKind::Generator,
                TokenKind::Model,
                TokenKind::Enum,
            ]
        );
    }

    #[test]
    fn test_identifiers() {
        assert_eq!(
            tokenize("User email_address _private").unwrap(),
            vec![ident("User"), ident("email_address"), ident("_private")]
        );
    }

    #[test]
    fn test_string_literals() {
        assert_eq!(
            tokenize(r#""hello" "world""#).unwrap(),
            vec![string("hello"), string("world")]
        );
        assert_eq!(
            tokenize("'hello' 'world'").unwrap(),
            vec![string("hello"), string("world")]
        );
    }

    #[test]
    fn test_string_escapes() {
        assert_eq!(
            tokenize(r#""hello \"world\"""#).unwrap(),
            vec![string("hello \"world\"")]
        );
        assert_eq!(
            tokenize(r#""line1\nline2""#).unwrap(),
            vec![string("line1\nline2")]
        );
        // Doubled single quote inside a single-quoted string.
        assert_eq!(tokenize("'O''Reilly'").unwrap(), vec![string("O'Reilly")]);
    }

    #[test]
    fn test_numbers() {
        assert_eq!(
            tokenize("42 3.14 100").unwrap(),
            vec![number("42"), number("3.14"), number("100")]
        );
    }

    #[test]
    fn test_punctuation() {
        assert_eq!(
            tokenize("{ } [ ] ( ) , : = ? .").unwrap(),
            vec![
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LBracket,
                TokenKind::RBracket,
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::Comma,
                TokenKind::Colon,
                TokenKind::Equal,
                TokenKind::Question,
                TokenKind::Dot,
            ]
        );
    }

    #[test]
    fn test_attributes() {
        assert_eq!(
            tokenize("@ @@ @id @@map").unwrap(),
            vec![
                TokenKind::At,
                TokenKind::AtAt,
                TokenKind::At,
                ident("id"),
                TokenKind::AtAt,
                ident("map"),
            ]
        );
    }

    #[test]
    fn test_single_line_comment() {
        // The comment is skipped but the terminating newline is still a token.
        assert_eq!(
            tokenize("model // this is a comment\nUser").unwrap(),
            vec![TokenKind::Model, TokenKind::Newline, ident("User")]
        );
    }

    #[test]
    fn test_block_comment() {
        assert_eq!(
            tokenize("model /* comment */ User").unwrap(),
            vec![TokenKind::Model, ident("User")]
        );
    }

    #[test]
    fn test_multiline_block_comment() {
        assert_eq!(
            tokenize("model /* line 1\nline 2\nline 3 */ User").unwrap(),
            vec![TokenKind::Model, ident("User")]
        );
    }

    #[test]
    fn test_unterminated_string() {
        let err = tokenize(r#""hello"#).unwrap_err();
        assert!(
            matches!(err, SchemaError::UnterminatedString(_)),
            "Expected UnterminatedString error"
        );
    }

    #[test]
    fn test_unexpected_character() {
        let err = tokenize("model #").unwrap_err();
        assert!(
            matches!(err, SchemaError::UnexpectedCharacter('#', _)),
            "Expected UnexpectedCharacter error"
        );
    }

    #[test]
    fn test_newlines() {
        assert_eq!(
            tokenize("model\nUser\n").unwrap(),
            vec![
                TokenKind::Model,
                TokenKind::Newline,
                ident("User"),
                TokenKind::Newline,
            ]
        );
    }

    #[test]
    fn test_schema_snippet() {
        let source = r#"
model User {
  id    Int    @id
  email String @unique
}
"#;
        let tokens = tokenize(source).unwrap();
        for expected in [
            TokenKind::Model,
            ident("User"),
            TokenKind::LBrace,
            TokenKind::At,
        ] {
            assert!(tokens.contains(&expected));
        }
    }
}