1use crate::error::{LexError, Position};
7
/// A single lexical token: its kind plus where it begins in the source.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The token's category, including any literal payload.
    pub kind: TokenKind,
    /// Location (line/column/byte offset) of the token's first character.
    pub position: Position,
}
16
17impl Token {
18 pub fn new(kind: TokenKind, position: Position) -> Self {
20 Self { kind, position }
21 }
22}
23
/// Every category of token the lexer can produce.
///
/// Literal-carrying variants hold the decoded value (string escapes are
/// already processed for `String`); all other variants are pure markers.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals.
    Integer(i64),
    Float(f64),
    String(String),

    // Identifiers and keywords.
    Identifier(String),
    Let,
    If,
    Else,
    For,
    Fn,
    Return,
    True,
    False,

    // Operators.
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    Assign,       // `=`
    Equal,        // `==`
    NotEqual,     // `!=`
    LessThan,
    LessEqual,
    GreaterThan,
    GreaterEqual,
    Bang,         // `!`
    And,          // `&&`
    Or,           // `||`

    // Punctuation and delimiters.
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Semicolon,
    Colon,
    Dot,

    // End-of-input marker appended by `tokenize`.
    Eof,
}
113
114impl std::fmt::Display for TokenKind {
115 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
116 match self {
117 TokenKind::Integer(n) => write!(f, "{}", n),
118 TokenKind::Float(fl) => write!(f, "{}", fl),
119 TokenKind::String(s) => write!(f, "\"{}\"", s),
120 TokenKind::Identifier(s) => write!(f, "{}", s),
121 TokenKind::Let => write!(f, "let"),
122 TokenKind::If => write!(f, "if"),
123 TokenKind::Else => write!(f, "else"),
124 TokenKind::For => write!(f, "for"),
125 TokenKind::Fn => write!(f, "fn"),
126 TokenKind::Return => write!(f, "return"),
127 TokenKind::True => write!(f, "true"),
128 TokenKind::False => write!(f, "false"),
129 TokenKind::Plus => write!(f, "+"),
130 TokenKind::Minus => write!(f, "-"),
131 TokenKind::Star => write!(f, "*"),
132 TokenKind::Slash => write!(f, "/"),
133 TokenKind::Percent => write!(f, "%"),
134 TokenKind::Assign => write!(f, "="),
135 TokenKind::Equal => write!(f, "=="),
136 TokenKind::NotEqual => write!(f, "!="),
137 TokenKind::LessThan => write!(f, "<"),
138 TokenKind::LessEqual => write!(f, "<="),
139 TokenKind::GreaterThan => write!(f, ">"),
140 TokenKind::GreaterEqual => write!(f, ">="),
141 TokenKind::Bang => write!(f, "!"),
142 TokenKind::And => write!(f, "&&"),
143 TokenKind::Or => write!(f, "||"),
144 TokenKind::LeftParen => write!(f, "("),
145 TokenKind::RightParen => write!(f, ")"),
146 TokenKind::LeftBrace => write!(f, "{{"),
147 TokenKind::RightBrace => write!(f, "}}"),
148 TokenKind::LeftBracket => write!(f, "["),
149 TokenKind::RightBracket => write!(f, "]"),
150 TokenKind::Comma => write!(f, ","),
151 TokenKind::Semicolon => write!(f, ";"),
152 TokenKind::Colon => write!(f, ":"),
153 TokenKind::Dot => write!(f, "."),
154 TokenKind::Eof => write!(f, "EOF"),
155 }
156 }
157}
158
/// Hand-written scanner that turns a source string into [`Token`]s.
pub struct Lexer<'a> {
    /// Full input text; lexemes are sliced out of it via byte offsets.
    source: &'a str,
    /// Cursor over `(byte_index, char)` pairs of `source`.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    /// Current line, 1-based; incremented on `\n`.
    line: usize,
    /// Current column, 1-based; reset to 1 after `\n`.
    column: usize,
    /// Byte offset just past the most recently consumed character.
    offset: usize,
    /// Set once `next_token` runs out of input.
    /// NOTE(review): written but never read anywhere in this file — looks
    /// like a candidate for removal; confirm no external user before deleting.
    at_eof: bool,
}
174
impl<'a> Lexer<'a> {
    /// Creates a lexer over `source`, positioned at line 1, column 1.
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            chars: source.char_indices().peekable(),
            line: 1,
            column: 1,
            offset: 0,
            at_eof: false,
        }
    }

    /// Snapshot of the current line/column/byte-offset as a [`Position`].
    fn position(&self) -> Position {
        Position::new(self.line, self.column, self.offset)
    }

    /// Consumes one character, updating `offset`, `line`, and `column`.
    /// Returns `None` at end of input.
    fn advance(&mut self) -> Option<char> {
        if let Some((idx, ch)) = self.chars.next() {
            // `offset` always points just past the consumed character, so
            // `source[start..self.offset]` slices a lexeme exactly, even for
            // multi-byte UTF-8 characters.
            self.offset = idx + ch.len_utf8();
            if ch == '\n' {
                self.line += 1;
                self.column = 1;
            } else {
                // Columns count characters, not bytes (tabs count as 1).
                self.column += 1;
            }
            Some(ch)
        } else {
            None
        }
    }

    /// Looks at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().map(|(_, ch)| *ch)
    }

    /// Looks two characters ahead without consuming anything.
    /// Clones the underlying iterator, which is just a small cursor.
    fn peek_next(&self) -> Option<char> {
        let mut iter = self.chars.clone();
        iter.next();
        iter.peek().map(|(_, ch)| *ch)
    }

    /// Skips whitespace and `//` line comments. Loops so that alternating
    /// runs of whitespace and comments are all consumed.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Consume a run of whitespace.
            while let Some(ch) = self.peek() {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // `//` starts a line comment: skip up to (but not past) the
            // newline, then loop again for any following whitespace/comment.
            if self.peek() == Some('/') && self.peek_next() == Some('/') {
                self.advance();
                self.advance();
                while let Some(ch) = self.peek() {
                    if ch == '\n' {
                        break;
                    }
                    self.advance();
                }
                continue;
            }

            break;
        }
    }

    /// Scans an identifier or keyword whose first character was already
    /// consumed by the caller. Keywords are recognized by exact lexeme match.
    fn scan_identifier(&mut self, first_char: char, start_pos: Position) -> Token {
        // `offset` already sits past `first_char`; back up so the slice
        // below covers the whole lexeme.
        let start_offset = self.offset - first_char.len_utf8();

        // Continuation characters: alphanumeric or `_`.
        while let Some(ch) = self.peek() {
            if ch.is_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.source[start_offset..self.offset];
        let kind = match text {
            "let" => TokenKind::Let,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "for" => TokenKind::For,
            "fn" => TokenKind::Fn,
            "return" => TokenKind::Return,
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            _ => TokenKind::Identifier(text.to_string()),
        };

        Token::new(kind, start_pos)
    }

    /// Scans an integer or float literal whose first digit was already
    /// consumed. A `.` is treated as a decimal point only when a digit
    /// follows, so `1.foo` lexes as `1` `.` `foo`.
    ///
    /// # Errors
    /// Returns [`LexError::InvalidNumber`] when the lexeme fails to parse
    /// (e.g. an integer literal that overflows `i64`).
    fn scan_number(&mut self, first_char: char, start_pos: Position) -> Result<Token, LexError> {
        let start_offset = self.offset - first_char.len_utf8();
        let mut has_decimal = false;

        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                self.advance();
            } else if ch == '.' && !has_decimal {
                // Consume the dot only when a digit follows; otherwise leave
                // it for `next_token` to emit as a `Dot` token.
                if let Some(next_ch) = self.peek_next() {
                    if next_ch.is_ascii_digit() {
                        has_decimal = true;
                        self.advance();
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            } else {
                break;
            }
        }

        let text = &self.source[start_offset..self.offset];

        if has_decimal {
            match text.parse::<f64>() {
                Ok(value) => Ok(Token::new(TokenKind::Float(value), start_pos)),
                Err(_) => Err(LexError::InvalidNumber(start_pos)),
            }
        } else {
            match text.parse::<i64>() {
                Ok(value) => Ok(Token::new(TokenKind::Integer(value), start_pos)),
                Err(_) => Err(LexError::InvalidNumber(start_pos)),
            }
        }
    }

    /// Scans a string literal; the opening `"` was already consumed.
    /// Supported escapes: `\n`, `\t`, `\r`, `\\`, `\"`.
    ///
    /// # Errors
    /// [`LexError::InvalidEscape`] for an unknown escape character, and
    /// [`LexError::UnterminatedString`] when input ends before the closing
    /// `"` (reported at the string's start position).
    fn scan_string(&mut self, start_pos: Position) -> Result<Token, LexError> {
        let mut value = String::new();

        loop {
            match self.advance() {
                Some('"') => break,
                Some('\\') => {
                    match self.advance() {
                        Some('n') => value.push('\n'),
                        Some('t') => value.push('\t'),
                        Some('r') => value.push('\r'),
                        Some('\\') => value.push('\\'),
                        Some('"') => value.push('"'),
                        Some(ch) => return Err(LexError::InvalidEscape(ch, self.position())),
                        None => return Err(LexError::UnterminatedString(start_pos)),
                    }
                }
                Some(ch) => value.push(ch),
                None => return Err(LexError::UnterminatedString(start_pos)),
            }
        }

        Ok(Token::new(TokenKind::String(value), start_pos))
    }

    /// Produces the next token, skipping leading whitespace and comments.
    /// At end of input it returns an `Eof` token; subsequent calls keep
    /// returning `Eof` because `advance` keeps yielding `None`.
    ///
    /// # Errors
    /// Propagates errors from the scan helpers, plus
    /// [`LexError::UnexpectedCharacter`] for characters outside the language
    /// (including a lone `&` or `|`).
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.skip_whitespace_and_comments();

        // Capture the position *before* consuming the first character, so
        // the token is reported at its own start.
        let pos = self.position();

        let Some(ch) = self.advance() else {
            self.at_eof = true;
            return Ok(Token::new(TokenKind::Eof, pos));
        };

        let kind = match ch {
            // Single-character tokens.
            '+' => TokenKind::Plus,
            '-' => TokenKind::Minus,
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            '%' => TokenKind::Percent,
            '(' => TokenKind::LeftParen,
            ')' => TokenKind::RightParen,
            '{' => TokenKind::LeftBrace,
            '}' => TokenKind::RightBrace,
            '[' => TokenKind::LeftBracket,
            ']' => TokenKind::RightBracket,
            ',' => TokenKind::Comma,
            ';' => TokenKind::Semicolon,
            ':' => TokenKind::Colon,
            '.' => TokenKind::Dot,

            // One- or two-character operators: a trailing `=` upgrades them.
            '=' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::Equal
                } else {
                    TokenKind::Assign
                }
            }
            '!' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::NotEqual
                } else {
                    TokenKind::Bang
                }
            }
            '<' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::LessEqual
                } else {
                    TokenKind::LessThan
                }
            }
            '>' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::GreaterThan
                }
            }
            // `&&` and `||` only exist doubled; a lone `&`/`|` is an error.
            '&' => {
                if self.peek() == Some('&') {
                    self.advance();
                    TokenKind::And
                } else {
                    return Err(LexError::UnexpectedCharacter(ch, pos));
                }
            }
            '|' => {
                if self.peek() == Some('|') {
                    self.advance();
                    TokenKind::Or
                } else {
                    return Err(LexError::UnexpectedCharacter(ch, pos));
                }
            }

            '"' => return self.scan_string(pos),

            ch if ch.is_ascii_digit() => return self.scan_number(ch, pos),

            // Identifiers may start with any alphabetic character or `_`.
            ch if ch.is_alphabetic() || ch == '_' => {
                return Ok(self.scan_identifier(ch, pos));
            }

            _ => return Err(LexError::UnexpectedCharacter(ch, pos)),
        };

        Ok(Token::new(kind, pos))
    }

    /// Lexes the entire input into a vector ending with exactly one `Eof`.
    ///
    /// # Errors
    /// Stops at and returns the first lexing error encountered.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }
}
462
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` to completion, panicking on any lexer error.
    fn lex(src: &str) -> Vec<Token> {
        Lexer::new(src).tokenize().expect("lexing should succeed")
    }

    /// Convenience wrapper: just the token kinds, `Eof` included.
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_empty_source() {
        assert_eq!(Lexer::new("").next_token().unwrap().kind, TokenKind::Eof);
    }

    #[test]
    fn test_single_tokens() {
        use TokenKind::*;
        assert_eq!(
            kinds("+ - * / % ( ) { } , ;"),
            vec![
                Plus, Minus, Star, Slash, Percent, LeftParen, RightParen,
                LeftBrace, RightBrace, Comma, Semicolon, Eof,
            ]
        );
    }

    #[test]
    fn test_comparison_operators() {
        use TokenKind::*;
        assert_eq!(
            kinds("= == != < <= > >="),
            vec![
                Assign, Equal, NotEqual, LessThan, LessEqual, GreaterThan,
                GreaterEqual, Eof,
            ]
        );
    }

    #[test]
    fn test_logical_operators() {
        use TokenKind::*;
        assert_eq!(kinds("! && ||"), vec![Bang, And, Or, Eof]);
    }

    #[test]
    fn test_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("let if else for fn return true false"),
            vec![Let, If, Else, For, Fn, Return, True, False, Eof]
        );
    }

    #[test]
    fn test_identifier() {
        let expected: Vec<TokenKind> = ["foo", "bar_123", "_test"]
            .iter()
            .map(|s| TokenKind::Identifier(s.to_string()))
            .chain(std::iter::once(TokenKind::Eof))
            .collect();
        assert_eq!(kinds("foo bar_123 _test"), expected);
    }

    #[test]
    fn test_integer() {
        use TokenKind::*;
        assert_eq!(
            kinds("42 0 12345"),
            vec![Integer(42), Integer(0), Integer(12345), Eof]
        );
    }

    #[test]
    fn test_string() {
        // Escape sequences inside literals are decoded by the lexer.
        let tokens = lex(r#""hello" "world" "with\nescapes""#);
        let expected = ["hello", "world", "with\nescapes"];
        for (token, text) in tokens.iter().zip(expected) {
            assert_eq!(token.kind, TokenKind::String(text.to_string()));
        }
    }

    #[test]
    fn test_comments() {
        let tokens = lex("let x = 10; // this is a comment\nlet y = 20;");
        // The comment vanishes: 5 tokens per statement plus the final Eof.
        assert_eq!(tokens.len(), 11);
        assert_eq!(tokens[0].kind, TokenKind::Let);
        assert_eq!(tokens[5].kind, TokenKind::Let);
    }

    #[test]
    fn test_position_tracking() {
        let tokens = lex("let x\ny");
        let positions: Vec<(usize, usize)> = tokens[..3]
            .iter()
            .map(|t| (t.position.line, t.position.column))
            .collect();
        assert_eq!(positions, vec![(1, 1), (1, 5), (2, 1)]);
    }

    #[test]
    fn test_unterminated_string() {
        let result = Lexer::new(r#""hello"#).next_token();
        assert!(matches!(result, Err(LexError::UnterminatedString(_))));
    }

    #[test]
    fn test_unexpected_character() {
        let result = Lexer::new("@").next_token();
        assert!(matches!(result, Err(LexError::UnexpectedCharacter('@', _))));
    }

    #[test]
    fn test_dot_token() {
        assert_eq!(kinds("."), vec![TokenKind::Dot, TokenKind::Eof]);
    }

    #[test]
    fn test_method_call_tokens() {
        use TokenKind::*;
        assert_eq!(
            kinds("foo.bar()"),
            vec![
                Identifier("foo".to_string()),
                Dot,
                Identifier("bar".to_string()),
                LeftParen,
                RightParen,
                Eof,
            ]
        );
    }
}