use crate::error::{Result, SchemaError};
use crate::span::Span;
use crate::token::{Token, TokenKind};

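/// A hand-written lexer that turns schema source text into a stream of tokens.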
pub struct Lexer<'a> {
    /// The full source text being tokenized.
    source: &'a str,
    /// Byte offset of the next unconsumed character.
    pos: usize,
    /// Underlying character iterator over `source`.
    chars: std::str::Chars<'a>,
    /// One-character lookahead buffer filled lazily by `peek`.
    peeked: Option<char>,
}

impl<'a> Lexer<'a> {
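    /// Creates a lexer positioned at the start of `source`.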
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            pos: 0,
            chars: source.chars(),
            peeked: None,
        }
    }

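    /// Returns the next token from the source, skipping whitespace and comments.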
    pub fn next_token(&mut self) -> Result<Token> {
        self.skip_whitespace();

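        // Comments may be followed by more whitespace or further comments,
        // so keep skipping until something lexable is reached.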
        while self.peek() == Some('/')
            && (self.peek_n(1) == Some('/') || self.peek_n(1) == Some('*'))
        {
            self.skip_comment()?;
            self.skip_whitespace();
        }

        let start = self.pos;

        if self.is_at_end() {
            return Ok(Token::new(TokenKind::Eof, Span::new(start, start)));
        }

        let ch = self.peek().unwrap();

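        // Newlines are not treated as whitespace; each one becomes its own token.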
        if ch == '\n' {
            self.advance();
            return Ok(Token::new(TokenKind::Newline, Span::new(start, self.pos)));
        }

        if ch == '"' || ch == '\'' {
            return self.lex_string(start, ch);
        }

        if ch.is_ascii_digit() {
            return self.lex_number(start);
        }

        if ch.is_alphabetic() || ch == '_' {
            return self.lex_identifier_or_keyword(start);
        }

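        // `@` needs one character of lookahead to distinguish `@@` from a single `@`.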
        if ch == '@' {
            self.advance();
            if self.peek() == Some('@') {
                self.advance();
                return Ok(Token::new(TokenKind::AtAt, Span::new(start, self.pos)));
            }
            return Ok(Token::new(TokenKind::At, Span::new(start, self.pos)));
        }

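        // Everything else is one or two characters of punctuation.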
        let kind = match ch {
            '{' => {
                self.advance();
                TokenKind::LBrace
            }
            '}' => {
                self.advance();
                TokenKind::RBrace
            }
            '[' => {
                self.advance();
                TokenKind::LBracket
            }
            ']' => {
                self.advance();
                TokenKind::RBracket
            }
            '(' => {
                self.advance();
                TokenKind::LParen
            }
            ')' => {
                self.advance();
                TokenKind::RParen
            }
            ',' => {
                self.advance();
                TokenKind::Comma
            }
            ':' => {
                self.advance();
                TokenKind::Colon
            }
            '=' => {
                self.advance();
                TokenKind::Equal
            }
            '?' => {
                self.advance();
                TokenKind::Question
            }
            '!' => {
                self.advance();
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::BangEqual
                } else {
                    TokenKind::Bang
                }
            }
            '.' => {
                self.advance();
                TokenKind::Dot
            }
            '*' => {
                self.advance();
                TokenKind::Star
            }
            '+' => {
                self.advance();
                TokenKind::Plus
            }
            '-' => {
                self.advance();
                TokenKind::Minus
            }
            '<' => {
                self.advance();
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::LessEqual
                } else {
                    TokenKind::LAngle
                }
            }
            '>' => {
                self.advance();
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::RAngle
                }
            }
            '%' => {
                self.advance();
                TokenKind::Percent
            }
            '|' => {
                self.advance();
                if self.peek() == Some('|') {
                    self.advance();
                    TokenKind::DoublePipe
                } else {
                    TokenKind::Pipe
                }
            }
            _ => {
                self.advance();
                return Err(SchemaError::UnexpectedCharacter(ch, Span::single(start)));
            }
        };

        Ok(Token::new(kind, Span::new(start, self.pos)))
    }

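    /// Lexes an identifier and lets `TokenKind::from_ident` decide whether it is a keyword.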
    fn lex_identifier_or_keyword(&mut self, start: usize) -> Result<Token> {
        while let Some(ch) = self.peek() {
            if ch.is_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.source[start..self.pos];
        let kind = TokenKind::from_ident(text);
        Ok(Token::new(kind, Span::new(start, self.pos)))
    }

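    /// Lexes a single- or double-quoted string literal, handling backslash escapes.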
    fn lex_string(&mut self, start: usize, quote: char) -> Result<Token> {
        self.advance();

        let mut value = String::new();

        loop {
            match self.peek() {
                None | Some('\n') => {
                    return Err(SchemaError::UnterminatedString(Span::new(start, self.pos)));
                }
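                // A closing quote ends the string, except that `''` inside a
                // single-quoted string is an escaped quote.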
                Some(ch) if ch == quote => {
                    if quote == '\'' && self.peek_n(1) == Some('\'') {
                        value.push('\'');
                        self.advance();
                        self.advance();
                        continue;
                    }
                    self.advance();
                    break;
                }
                Some('\\') => {
                    self.advance();
                    match self.peek() {
                        Some(ch) => {
                            let escaped = match ch {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '\\' => '\\',
                                '"' if quote == '"' => '"',
                                '\'' if quote == '\'' => '\'',
                                _ => ch,
                            };
                            value.push(escaped);
                            self.advance();
                        }
                        None => {
                            return Err(SchemaError::UnterminatedString(Span::new(
                                start, self.pos,
                            )));
                        }
                    }
                }
                Some(ch) => {
                    value.push(ch);
                    self.advance();
                }
            }
        }

        Ok(Token::new(
            TokenKind::String(value),
            Span::new(start, self.pos),
        ))
    }

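    /// Lexes an integer or decimal number and validates that it parses as `f64`.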
    fn lex_number(&mut self, start: usize) -> Result<Token> {
        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                self.advance();
            } else {
                break;
            }
        }

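        // A '.' only starts a fractional part when a digit follows it; otherwise
        // the dot is left for the caller to lex as its own token.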
        if self.peek() == Some('.') && self.peek_n(1).is_some_and(|ch| ch.is_ascii_digit()) {
            self.advance();
            while let Some(ch) = self.peek() {
                if ch.is_ascii_digit() {
                    self.advance();
                } else {
                    break;
                }
            }
        }

        let text = &self.source[start..self.pos];

        if text.parse::<f64>().is_err() {
            return Err(SchemaError::InvalidNumber(
                text.to_string(),
                Span::new(start, self.pos),
            ));
        }

        Ok(Token::new(
            TokenKind::Number(text.to_string()),
            Span::new(start, self.pos),
        ))
    }

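    /// Skips spaces, tabs, and carriage returns, but not newlines.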
    fn skip_whitespace(&mut self) {
        while let Some(ch) = self.peek() {
            if ch == ' ' || ch == '\t' || ch == '\r' {
                self.advance();
            } else {
                break;
            }
        }
    }

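    /// Skips a `//` line comment or a `/* ... */` block comment starting at the cursor.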
    fn skip_comment(&mut self) -> Result<()> {
        if self.peek() != Some('/') {
            return Ok(());
        }

        let start = self.pos;
        self.advance();

        match self.peek() {
            Some('/') => {
                self.advance();
                while let Some(ch) = self.peek() {
                    if ch == '\n' {
                        break;
                    }
                    self.advance();
                }
            }
            Some('*') => {
                self.advance();
                loop {
                    match self.peek() {
                        None => {
                            return Err(SchemaError::Lexer(
                                "Unterminated block comment".to_string(),
                                Span::new(start, self.pos),
                            ));
                        }
                        Some('*') => {
                            self.advance();
                            if self.peek() == Some('/') {
                                self.advance();
                                break;
                            }
                        }
                        Some(_) => {
                            self.advance();
                        }
                    }
                }
            }
            _ => {}
        }

        Ok(())
    }

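    /// Returns the next unconsumed character without advancing, caching it in `peeked`.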
    fn peek(&mut self) -> Option<char> {
        if self.peeked.is_none() {
            self.peeked = self.chars.next();
        }
        self.peeked
    }

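    /// Looks `n` characters ahead of the current position without consuming anything.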
    fn peek_n(&self, n: usize) -> Option<char> {
        self.source[self.pos..].chars().nth(n)
    }

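    /// Consumes the peeked character and advances `pos` by its UTF-8 length.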
    fn advance(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.pos += ch.len_utf8();
        self.peeked = None;
        Some(ch)
    }

    fn is_at_end(&mut self) -> bool {
        self.peek().is_none()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

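    /// Drives the lexer to EOF and collects the token kinds for easy assertions.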
    fn tokenize(source: &str) -> Result<Vec<TokenKind>> {
        let mut lexer = Lexer::new(source);
        let mut tokens = Vec::new();

        loop {
            let token = lexer.next_token()?;
            if token.kind == TokenKind::Eof {
                break;
            }
            tokens.push(token.kind);
        }

        Ok(tokens)
    }

    #[test]
    fn test_keywords() {
        let tokens = tokenize("datasource generator model enum").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Datasource,
                TokenKind::Generator,
                TokenKind::Model,
                TokenKind::Enum
            ]
        );
    }

    #[test]
    fn test_identifiers() {
        let tokens = tokenize("User email_address _private").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("User".to_string()),
                TokenKind::Ident("email_address".to_string()),
                TokenKind::Ident("_private".to_string()),
            ]
        );
    }

    #[test]
    fn test_string_literals() {
        let tokens = tokenize(r#""hello" "world""#).unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::String("hello".to_string()),
                TokenKind::String("world".to_string()),
            ]
        );

        let tokens = tokenize("'hello' 'world'").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::String("hello".to_string()),
                TokenKind::String("world".to_string()),
            ]
        );
    }

    #[test]
    fn test_string_escapes() {
        let tokens = tokenize(r#""hello \"world\"""#).unwrap();
        assert_eq!(
            tokens,
            vec![TokenKind::String("hello \"world\"".to_string())]
        );

        let tokens = tokenize(r#""line1\nline2""#).unwrap();
        assert_eq!(tokens, vec![TokenKind::String("line1\nline2".to_string())]);

        let tokens = tokenize("'O''Reilly'").unwrap();
        assert_eq!(tokens, vec![TokenKind::String("O'Reilly".to_string())]);
    }

    #[test]
    fn test_numbers() {
        let tokens = tokenize("42 3.14 100").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Number("42".to_string()),
                TokenKind::Number("3.14".to_string()),
                TokenKind::Number("100".to_string()),
            ]
        );
    }

    #[test]
    fn test_punctuation() {
        let tokens = tokenize("{ } [ ] ( ) , : = ? .").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LBracket,
                TokenKind::RBracket,
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::Comma,
                TokenKind::Colon,
                TokenKind::Equal,
                TokenKind::Question,
                TokenKind::Dot,
            ]
        );
    }

    #[test]
    fn test_attributes() {
        let tokens = tokenize("@ @@ @id @@map").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::At,
                TokenKind::AtAt,
                TokenKind::At,
                TokenKind::Ident("id".to_string()),
                TokenKind::AtAt,
                TokenKind::Ident("map".to_string()),
            ]
        );
    }

    #[test]
    fn test_single_line_comment() {
        let tokens = tokenize("model // this is a comment\nUser").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Model,
                TokenKind::Newline,
                TokenKind::Ident("User".to_string()),
            ]
        );
    }

    #[test]
    fn test_block_comment() {
        let tokens = tokenize("model /* comment */ User").unwrap();
        assert_eq!(
            tokens,
            vec![TokenKind::Model, TokenKind::Ident("User".to_string())]
        );
    }

    #[test]
    fn test_multiline_block_comment() {
        let tokens = tokenize("model /* line 1\nline 2\nline 3 */ User").unwrap();
        assert_eq!(
            tokens,
            vec![TokenKind::Model, TokenKind::Ident("User".to_string())]
        );
    }

    #[test]
    fn test_unterminated_string() {
        let result = tokenize(r#""hello"#);
        assert!(result.is_err());
        match result.unwrap_err() {
            SchemaError::UnterminatedString(_) => {}
            _ => panic!("Expected UnterminatedString error"),
        }
    }

    #[test]
    fn test_unexpected_character() {
        let result = tokenize("model #");
        assert!(result.is_err());
        match result.unwrap_err() {
            SchemaError::UnexpectedCharacter('#', _) => {}
            _ => panic!("Expected UnexpectedCharacter error"),
        }
    }

    #[test]
    fn test_newlines() {
        let tokens = tokenize("model\nUser\n").unwrap();
        assert_eq!(
            tokens,
            vec![
                TokenKind::Model,
                TokenKind::Newline,
                TokenKind::Ident("User".to_string()),
                TokenKind::Newline,
            ]
        );
    }

    #[test]
    fn test_schema_snippet() {
        let source = r#"
model User {
  id Int @id
  email String @unique
}
"#;
        let tokens = tokenize(source).unwrap();
        assert!(tokens.contains(&TokenKind::Model));
        assert!(tokens.contains(&TokenKind::Ident("User".to_string())));
        assert!(tokens.contains(&TokenKind::LBrace));
        assert!(tokens.contains(&TokenKind::At));
    }
}