// nautilus_schema/lexer.rs
//! Lexer for schema language.

use crate::error::{Result, SchemaError};
use crate::span::Span;
use crate::token::{Token, TokenKind};

/// Lexer for tokenizing schema source code.
///
/// Invariant: `pos` is always a UTF-8 char boundary into `source`, and the
/// `chars` iterator plus the one-char `peeked` cache is positioned exactly
/// at `pos`; `advance` keeps all three in sync.
pub struct Lexer<'a> {
    /// Source text being lexed.
    source: &'a str,
    /// Current byte position in source.
    pos: usize,
    /// Characters remaining (for efficient peeking).
    chars: std::str::Chars<'a>,
    /// Peeked character cache.
    peeked: Option<char>,
}

impl<'a> Lexer<'a> {
20    /// Create a new lexer for the given source.
21    pub fn new(source: &'a str) -> Self {
22        Self {
23            source,
24            pos: 0,
25            chars: source.chars(),
26            peeked: None,
27        }
28    }
29
30    /// Get the next token.
31    pub fn next_token(&mut self) -> Result<Token> {
32        self.skip_whitespace();
33
34        while self.peek() == Some('/')
35            && (self.peek_n(1) == Some('/') || self.peek_n(1) == Some('*'))
36        {
37            self.skip_comment()?;
38            self.skip_whitespace();
39        }
40
41        let start = self.pos;
42
43        if self.is_at_end() {
44            return Ok(Token::new(TokenKind::Eof, Span::new(start, start)));
45        }
46
47        let ch = self.peek().unwrap();
48
49        if ch == '\n' {
50            self.advance();
51            return Ok(Token::new(TokenKind::Newline, Span::new(start, self.pos)));
52        }
53
54        if ch == '"' {
55            return self.lex_string(start);
56        }
57
58        if ch.is_ascii_digit() {
59            return self.lex_number(start);
60        }
61
62        if ch.is_alphabetic() || ch == '_' {
63            return self.lex_identifier_or_keyword(start);
64        }
65
66        if ch == '@' {
67            self.advance();
68            if self.peek() == Some('@') {
69                self.advance();
70                return Ok(Token::new(TokenKind::AtAt, Span::new(start, self.pos)));
71            }
72            return Ok(Token::new(TokenKind::At, Span::new(start, self.pos)));
73        }
74
75        let kind = match ch {
76            '{' => {
77                self.advance();
78                TokenKind::LBrace
79            }
80            '}' => {
81                self.advance();
82                TokenKind::RBrace
83            }
84            '[' => {
85                self.advance();
86                TokenKind::LBracket
87            }
88            ']' => {
89                self.advance();
90                TokenKind::RBracket
91            }
92            '(' => {
93                self.advance();
94                TokenKind::LParen
95            }
96            ')' => {
97                self.advance();
98                TokenKind::RParen
99            }
100            ',' => {
101                self.advance();
102                TokenKind::Comma
103            }
104            ':' => {
105                self.advance();
106                TokenKind::Colon
107            }
108            '=' => {
109                self.advance();
110                TokenKind::Equal
111            }
112            '?' => {
113                self.advance();
114                TokenKind::Question
115            }
116            '!' => {
117                self.advance();
118                if self.peek() == Some('=') {
119                    self.advance();
120                    TokenKind::BangEqual
121                } else {
122                    TokenKind::Bang
123                }
124            }
125            '.' => {
126                self.advance();
127                TokenKind::Dot
128            }
129            '*' => {
130                self.advance();
131                TokenKind::Star
132            }
133            '+' => {
134                self.advance();
135                TokenKind::Plus
136            }
137            '-' => {
138                self.advance();
139                TokenKind::Minus
140            }
141            '<' => {
142                self.advance();
143                if self.peek() == Some('=') {
144                    self.advance();
145                    TokenKind::LessEqual
146                } else {
147                    TokenKind::LAngle
148                }
149            }
150            '>' => {
151                self.advance();
152                if self.peek() == Some('=') {
153                    self.advance();
154                    TokenKind::GreaterEqual
155                } else {
156                    TokenKind::RAngle
157                }
158            }
159            '%' => {
160                self.advance();
161                TokenKind::Percent
162            }
163            '|' => {
164                self.advance();
165                if self.peek() == Some('|') {
166                    self.advance();
167                    TokenKind::DoublePipe
168                } else {
169                    TokenKind::Pipe
170                }
171            }
172            _ => {
173                self.advance();
174                return Err(SchemaError::UnexpectedCharacter(ch, Span::single(start)));
175            }
176        };
177
178        Ok(Token::new(kind, Span::new(start, self.pos)))
179    }
180
181    /// Lex an identifier or keyword.
182    fn lex_identifier_or_keyword(&mut self, start: usize) -> Result<Token> {
183        while let Some(ch) = self.peek() {
184            if ch.is_alphanumeric() || ch == '_' {
185                self.advance();
186            } else {
187                break;
188            }
189        }
190
191        let text = &self.source[start..self.pos];
192        let kind = TokenKind::from_ident(text);
193        Ok(Token::new(kind, Span::new(start, self.pos)))
194    }
195
196    /// Lex a string literal.
197    fn lex_string(&mut self, start: usize) -> Result<Token> {
198        self.advance();
199
200        let mut value = String::new();
201
202        loop {
203            match self.peek() {
204                None | Some('\n') => {
205                    return Err(SchemaError::UnterminatedString(Span::new(start, self.pos)));
206                }
207                Some('"') => {
208                    self.advance();
209                    break;
210                }
211                Some('\\') => {
212                    self.advance();
213                    match self.peek() {
214                        Some(ch) => {
215                            let escaped = match ch {
216                                'n' => '\n',
217                                't' => '\t',
218                                'r' => '\r',
219                                '\\' => '\\',
220                                '"' => '"',
221                                _ => ch,
222                            };
223                            value.push(escaped);
224                            self.advance();
225                        }
226                        None => {
227                            return Err(SchemaError::UnterminatedString(Span::new(
228                                start, self.pos,
229                            )));
230                        }
231                    }
232                }
233                Some(ch) => {
234                    value.push(ch);
235                    self.advance();
236                }
237            }
238        }
239
240        Ok(Token::new(
241            TokenKind::String(value),
242            Span::new(start, self.pos),
243        ))
244    }
245
246    /// Lex a number literal.
247    fn lex_number(&mut self, start: usize) -> Result<Token> {
248        while let Some(ch) = self.peek() {
249            if ch.is_ascii_digit() {
250                self.advance();
251            } else {
252                break;
253            }
254        }
255
256        if self.peek() == Some('.') && self.peek_n(1).is_some_and(|ch| ch.is_ascii_digit()) {
257            self.advance(); // consume '.'
258
259            while let Some(ch) = self.peek() {
260                if ch.is_ascii_digit() {
261                    self.advance();
262                } else {
263                    break;
264                }
265            }
266        }
267
268        let text = &self.source[start..self.pos];
269
270        if text.parse::<f64>().is_err() {
271            return Err(SchemaError::InvalidNumber(
272                text.to_string(),
273                Span::new(start, self.pos),
274            ));
275        }
276
277        Ok(Token::new(
278            TokenKind::Number(text.to_string()),
279            Span::new(start, self.pos),
280        ))
281    }
282
283    /// Skip whitespace characters (but not newlines).
284    fn skip_whitespace(&mut self) {
285        while let Some(ch) = self.peek() {
286            if ch == ' ' || ch == '\t' || ch == '\r' {
287                self.advance();
288            } else {
289                break;
290            }
291        }
292    }
293
294    /// Skip comments (single-line or block).
295    fn skip_comment(&mut self) -> Result<()> {
296        if self.peek() != Some('/') {
297            return Ok(());
298        }
299
300        let start = self.pos;
301        self.advance(); // consume first '/'
302
303        match self.peek() {
304            Some('/') => {
305                self.advance(); // consume second '/'
306                while let Some(ch) = self.peek() {
307                    if ch == '\n' {
308                        break;
309                    }
310                    self.advance();
311                }
312            }
313            Some('*') => {
314                self.advance(); // consume '*'
315
316                loop {
317                    match self.peek() {
318                        None => {
319                            return Err(SchemaError::Lexer(
320                                "Unterminated block comment".to_string(),
321                                Span::new(start, self.pos),
322                            ));
323                        }
324                        Some('*') => {
325                            self.advance();
326                            if self.peek() == Some('/') {
327                                self.advance();
328                                break;
329                            }
330                        }
331                        Some(_) => {
332                            self.advance();
333                        }
334                    }
335                }
336            }
337            _ => {}
338        }
339
340        Ok(())
341    }
342
343    /// Peek at the current character without consuming it.
344    fn peek(&mut self) -> Option<char> {
345        if self.peeked.is_none() {
346            self.peeked = self.chars.next();
347        }
348        self.peeked
349    }
350
351    /// Peek at the nth character ahead without consuming.
352    fn peek_n(&self, n: usize) -> Option<char> {
353        self.source[self.pos..].chars().nth(n)
354    }
355
356    /// Advance to the next character.
357    fn advance(&mut self) -> Option<char> {
358        let ch = self.peek()?;
359        self.pos += ch.len_utf8();
360        self.peeked = None;
361        Some(ch)
362    }
363
364    /// Check if we've reached the end of the source.
365    fn is_at_end(&mut self) -> bool {
366        self.peek().is_none()
367    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `source` to completion, collecting every token kind before EOF.
    /// Propagates the first lexing error.
    fn tokenize(source: &str) -> Result<Vec<TokenKind>> {
        let mut lexer = Lexer::new(source);
        let mut tokens = Vec::new();

        loop {
            let token = lexer.next_token()?;
            if token.kind == TokenKind::Eof {
                break;
            }
            tokens.push(token.kind);
        }

        Ok(tokens)
    }

    #[test]
    fn test_keywords() {
        let tokens = tokenize("datasource generator model enum").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Datasource,
                TokenKind::Generator,
                TokenKind::Model,
                TokenKind::Enum
            ]
        );
    }

    #[test]
    fn test_identifiers() {
        let tokens = tokenize("User email_address _private").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("User".to_string()),
                TokenKind::Ident("email_address".to_string()),
                TokenKind::Ident("_private".to_string()),
            ]
        );
    }

    #[test]
    fn test_string_literals() {
        let tokens = tokenize(r#""hello" "world""#).unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::String("hello".to_string()),
                TokenKind::String("world".to_string()),
            ]
        );
    }

    #[test]
    fn test_string_escapes() {
        // `\"` yields a literal quote inside the lexed value.
        let tokens = tokenize(r#""hello \"world\"""#).unwrap();
        assert_eq!(
            tokens,
            vec![TokenKind::String("hello \"world\"".to_string())]
        );

        // `\n` in the source becomes an actual newline in the value.
        let tokens = tokenize(r#""line1\nline2""#).unwrap();
        assert_eq!(tokens, vec![TokenKind::String("line1\nline2".to_string())]);
    }

    #[test]
    fn test_numbers() {
        // Numbers keep their source text; integers and decimals both lex.
        let tokens = tokenize("42 3.14 100").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Number("42".to_string()),
                TokenKind::Number("3.14".to_string()),
                TokenKind::Number("100".to_string()),
            ]
        );
    }

    #[test]
    fn test_punctuation() {
        let tokens = tokenize("{ } [ ] ( ) , : = ? .").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LBracket,
                TokenKind::RBracket,
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::Comma,
                TokenKind::Colon,
                TokenKind::Equal,
                TokenKind::Question,
                TokenKind::Dot,
            ]
        );
    }

    #[test]
    fn test_attributes() {
        // `@@` lexes as a single AtAt token, not two At tokens.
        let tokens = tokenize("@ @@ @id @@map").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::At,
                TokenKind::AtAt,
                TokenKind::At,
                TokenKind::Ident("id".to_string()),
                TokenKind::AtAt,
                TokenKind::Ident("map".to_string()),
            ]
        );
    }

    #[test]
    fn test_single_line_comment() {
        // The comment is skipped but its terminating newline is still a token.
        let tokens = tokenize("model // this is a comment\nUser").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Model,
                TokenKind::Newline,
                TokenKind::Ident("User".to_string()),
            ]
        );
    }

    #[test]
    fn test_block_comment() {
        let tokens = tokenize("model /* comment */ User").unwrap();
        assert_eq!(
            tokens,
            vec![TokenKind::Model, TokenKind::Ident("User".to_string()),]
        );
    }

    #[test]
    fn test_multiline_block_comment() {
        // Newlines inside a block comment do NOT produce Newline tokens.
        let tokens = tokenize("model /* line 1\nline 2\nline 3 */ User").unwrap();
        assert_eq!(
            tokens,
            vec![TokenKind::Model, TokenKind::Ident("User".to_string()),]
        );
    }

    #[test]
    fn test_unterminated_string() {
        let result = tokenize(r#""hello"#);
        assert!(result.is_err());
        match result.unwrap_err() {
            SchemaError::UnterminatedString(_) => {}
            _ => panic!("Expected UnterminatedString error"),
        }
    }

    #[test]
    fn test_unexpected_character() {
        let result = tokenize("model #");
        assert!(result.is_err());
        match result.unwrap_err() {
            SchemaError::UnexpectedCharacter('#', _) => {}
            _ => panic!("Expected UnexpectedCharacter error"),
        }
    }

    #[test]
    fn test_newlines() {
        let tokens = tokenize("model\nUser\n").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Model,
                TokenKind::Newline,
                TokenKind::Ident("User".to_string()),
                TokenKind::Newline,
            ]
        );
    }

    #[test]
    fn test_schema_snippet() {
        // Smoke test over a realistic snippet: only membership is checked.
        let source = r#"
model User {
  id    Int    @id
  email String @unique
}
"#;
        let tokens = tokenize(source).unwrap();
        assert!(tokens.contains(&TokenKind::Model));
        assert!(tokens.contains(&TokenKind::Ident("User".to_string())));
        assert!(tokens.contains(&TokenKind::LBrace));
        assert!(tokens.contains(&TokenKind::At));
    }
}