1use crate::compiler::tokens::{Span, Token, TokenKind};
4use thiserror::Error;
5
/// Errors produced while tokenizing source text.
///
/// All positions are reported in absolute coordinates, i.e. already shifted
/// by the lexer's `base_line` so they point into the complete file.
#[derive(Debug, Error)]
pub enum LexError {
    #[error("unexpected character '{ch}' at line {line}, col {col}")]
    UnexpectedChar { ch: char, line: usize, col: usize },
    #[error("unterminated string at line {line}, col {col}")]
    UnterminatedString { line: usize, col: usize },
    #[error("inconsistent indentation at line {line}")]
    InconsistentIndent { line: usize },
    #[error("invalid number at line {line}, col {col}")]
    InvalidNumber { line: usize, col: usize },
    #[error("invalid bytes literal at line {line}, col {col}")]
    InvalidBytesLiteral { line: usize, col: usize },
    #[error("invalid unicode escape at line {line}, col {col}")]
    InvalidUnicodeEscape { line: usize, col: usize },
}
21
/// Hand-written lexer producing a token stream with Python-style
/// `Indent`/`Dedent` tokens derived from leading whitespace.
pub struct Lexer {
    source: Vec<char>,        // source text as chars, for O(1) lookahead
    pos: usize,               // index of the next unread char in `source`
    line: usize,              // 1-based line within this chunk
    col: usize,               // 1-based column within the current line
    byte_offset: usize,       // UTF-8 byte offset within this chunk
    base_line: usize,         // line the chunk starts at in the full file
    base_offset: usize,       // byte offset of the chunk in the full file
    indent_stack: Vec<usize>, // open indentation levels; level 0 stays at the bottom
    pending: Vec<Token>,      // Indent/Dedent tokens queued by indentation handling
    at_line_start: bool,      // true right after a newline, before indentation is read
}
34
35impl Lexer {
    /// Creates a lexer over `source`.
    ///
    /// `base_line` and `base_offset` locate this chunk inside the complete
    /// file so that emitted spans and error positions are absolute.
    pub fn new(source: &str, base_line: usize, base_offset: usize) -> Self {
        Self {
            source: source.chars().collect(),
            pos: 0,
            line: 1,
            col: 1,
            byte_offset: 0,
            base_line,
            base_offset,
            // Level 0 is always present and is never popped by dedents.
            indent_stack: vec![0],
            pending: Vec::new(),
            at_line_start: true,
        }
    }
50
51 fn current(&self) -> Option<char> {
52 self.source.get(self.pos).copied()
53 }
54 fn peek(&self) -> Option<char> {
55 self.source.get(self.pos + 1).copied()
56 }
57 fn peek2(&self) -> Option<char> {
58 self.source.get(self.pos + 2).copied()
59 }
60
61 fn looks_like_interpolation_start(&self) -> bool {
62 let mut i = self.pos + 1; while let Some(ch) = self.source.get(i).copied() {
64 if ch.is_whitespace() {
65 i += 1;
66 continue;
67 }
68 return matches!(
69 ch,
70 'a'..='z' | 'A'..='Z' | '_' | '(' | '[' | '-' | '0'..='9'
71 );
72 }
73 false
74 }
75
76 fn advance(&mut self) -> Option<char> {
77 let ch = self.source.get(self.pos).copied()?;
78 self.pos += 1;
79 self.byte_offset += ch.len_utf8();
80 if ch == '\n' {
81 self.line += 1;
82 self.col = 1;
83 self.at_line_start = true;
84 } else {
85 self.col += 1;
86 }
87 Some(ch)
88 }
89
90 fn span_here(&self) -> Span {
91 Span::new(
92 self.base_offset + self.byte_offset,
93 self.base_offset + self.byte_offset,
94 self.base_line + self.line - 1,
95 self.col,
96 )
97 }
98
99 fn span_from(&self, so: usize, sl: usize, sc: usize) -> Span {
100 Span::new(
101 self.base_offset + so,
102 self.base_offset + self.byte_offset,
103 self.base_line + sl - 1,
104 sc,
105 )
106 }
107
    /// Measures the leading whitespace of the current line and queues
    /// `Indent`/`Dedent` tokens on `self.pending`.
    ///
    /// A space counts as one indentation unit and a tab as two. Blank and
    /// comment-only lines do not affect the indent stack. Returns
    /// `LexError::InconsistentIndent` when a dedent does not land exactly on
    /// a previously opened level.
    fn handle_indentation(&mut self) -> Result<(), LexError> {
        let mut indent = 0;
        while let Some(ch) = self.current() {
            match ch {
                ' ' => {
                    indent += 1;
                    self.advance();
                }
                '\t' => {
                    // A tab is treated as two spaces for indentation purposes.
                    indent += 2;
                    self.advance();
                }
                _ => break,
            }
        }
        // Blank lines and `#` comment lines leave indentation untouched; at
        // end of input, close every remaining level with Dedent tokens.
        if matches!(self.current(), None | Some('\n') | Some('#')) {
            if self.current().is_none() {
                while self.indent_stack.len() > 1 {
                    self.indent_stack.pop();
                    self.pending
                        .push(Token::new(TokenKind::Dedent, self.span_here()));
                }
            }
            return Ok(());
        }
        let cur = *self.indent_stack.last().unwrap();
        if indent > cur {
            self.indent_stack.push(indent);
            self.pending
                .push(Token::new(TokenKind::Indent, self.span_here()));
        } else if indent < cur {
            // Pop levels (emitting a Dedent each) until we reach one that is
            // at or below the new indentation.
            while let Some(&top) = self.indent_stack.last() {
                if top > indent {
                    self.indent_stack.pop();
                    self.pending
                        .push(Token::new(TokenKind::Dedent, self.span_here()));
                } else {
                    break;
                }
            }
            // The new indent must match an enclosing level exactly.
            if *self.indent_stack.last().unwrap() != indent {
                return Err(LexError::InconsistentIndent {
                    line: self.base_line + self.line - 1,
                });
            }
        }
        Ok(())
    }
156
157 fn read_unicode_escape(&mut self, sl: usize, sc: usize) -> Result<char, LexError> {
159 if self.current() != Some('{') {
161 return Err(LexError::InvalidUnicodeEscape {
162 line: self.base_line + sl - 1,
163 col: sc,
164 });
165 }
166 self.advance(); let mut hex = String::new();
168 while let Some(c) = self.current() {
169 if c == '}' {
170 break;
171 }
172 hex.push(c);
173 self.advance();
174 }
175 if self.current() != Some('}') {
176 return Err(LexError::InvalidUnicodeEscape {
177 line: self.base_line + sl - 1,
178 col: sc,
179 });
180 }
181 self.advance(); u32::from_str_radix(&hex, 16)
183 .ok()
184 .and_then(char::from_u32)
185 .ok_or(LexError::InvalidUnicodeEscape {
186 line: self.base_line + sl - 1,
187 col: sc,
188 })
189 }
190
191 fn process_escape(&mut self, buf: &mut String, sl: usize, sc: usize) -> Result<(), LexError> {
193 match self.current() {
194 Some('n') => {
195 buf.push('\n');
196 self.advance();
197 }
198 Some('t') => {
199 buf.push('\t');
200 self.advance();
201 }
202 Some('r') => {
203 buf.push('\r');
204 self.advance();
205 }
206 Some('\\') => {
207 buf.push('\\');
208 self.advance();
209 }
210 Some('"') => {
211 buf.push('"');
212 self.advance();
213 }
214 Some('{') => {
215 buf.push('{');
216 self.advance();
217 }
218 Some('0') => {
219 buf.push('\0');
220 self.advance();
221 }
222 Some('u') => {
223 self.advance(); let ch = self.read_unicode_escape(sl, sc)?;
225 buf.push(ch);
226 }
227 Some('x') => {
228 self.advance(); let mut hex = String::new();
230 for _ in 0..2 {
231 match self.current() {
232 Some(c) if c.is_ascii_hexdigit() => {
233 hex.push(c);
234 self.advance();
235 }
236 _ => {
237 return Err(LexError::InvalidUnicodeEscape {
238 line: self.base_line + sl - 1,
239 col: sc,
240 })
241 }
242 }
243 }
244 let byte =
245 u8::from_str_radix(&hex, 16).map_err(|_| LexError::InvalidUnicodeEscape {
246 line: self.base_line + sl - 1,
247 col: sc,
248 })?;
249 buf.push(byte as char);
250 }
251 Some(c) => {
252 buf.push('\\');
253 buf.push(c);
254 self.advance();
255 }
256 None => {
257 return Err(LexError::UnterminatedString {
258 line: self.base_line + sl - 1,
259 col: sc,
260 })
261 }
262 }
263 Ok(())
264 }
265
    /// Reads a `"""..."""` string, the cursor sitting on the first quote.
    ///
    /// Supports escape sequences and `{expr}` interpolation. Returns a
    /// `StringLit`, or — when at least one interpolation was found — a
    /// `StringInterpLit` whose segments alternate literal text
    /// (`(false, text)`) and raw expression source (`(true, expr)`).
    /// Content is dedented relative to the least-indented non-blank line.
    fn read_triple_quoted_string(&mut self) -> Result<Token, LexError> {
        let (so, sl, sc) = (self.byte_offset, self.line, self.col);
        // Consume the opening `"""`.
        self.advance();
        self.advance();
        self.advance();

        let mut segments = Vec::new();
        let mut cur_segment = String::new();
        let mut is_interp = false;

        loop {
            match self.current() {
                None => {
                    return Err(LexError::UnterminatedString {
                        line: self.base_line + sl - 1,
                        col: sc,
                    })
                }
                // Closing `"""` terminates the literal.
                Some('"') if self.peek() == Some('"') && self.peek2() == Some('"') => {
                    self.advance();
                    self.advance();
                    self.advance();
                    break;
                }
                Some('\\') => {
                    self.advance();
                    self.process_escape(&mut cur_segment, sl, sc)?;
                }
                // `{` begins an interpolation only when followed by
                // something that can start an expression.
                Some('{') if self.looks_like_interpolation_start() => {
                    is_interp = true;
                    if !cur_segment.is_empty() {
                        segments.push((false, cur_segment.clone()));
                        cur_segment.clear();
                    }
                    self.advance();
                    // Capture the raw expression text up to the matching `}`,
                    // tracking nested braces and skipping nested string
                    // literals so a `}` inside one does not close us.
                    let mut expr_str = String::new();
                    let mut brace_balance = 1;
                    while let Some(c) = self.current() {
                        if c == '}' {
                            brace_balance -= 1;
                            if brace_balance == 0 {
                                break;
                            }
                            expr_str.push(c);
                            self.advance();
                        } else if c == '{' {
                            brace_balance += 1;
                            expr_str.push(c);
                            self.advance();
                        } else if c == '"' {
                            expr_str.push(c);
                            self.advance();
                            // Copy the nested string verbatim, honoring
                            // escaped quotes.
                            while let Some(ic) = self.current() {
                                expr_str.push(ic);
                                self.advance();
                                if ic == '"' && !expr_str.ends_with("\\\"") {
                                    break;
                                }
                            }
                        } else {
                            expr_str.push(c);
                            self.advance();
                        }
                    }
                    if brace_balance != 0 {
                        return Err(LexError::UnterminatedString {
                            line: self.base_line + sl - 1,
                            col: sc,
                        });
                    }
                    self.advance(); // consume the closing `}`
                    segments.push((true, expr_str.trim().to_string()));
                }
                // A `{` that does not start an interpolation is literal text.
                Some('{') => {
                    cur_segment.push('{');
                    self.advance();
                }
                Some(c) => {
                    cur_segment.push(c);
                    self.advance();
                }
            }
        }

        let raw_content = if is_interp {
            if !cur_segment.is_empty() {
                segments.push((false, cur_segment));
            }
            self.dedent_interp_segments(&mut segments);
            let span = self.span_from(so, sl, sc);
            return Ok(Token::new(TokenKind::StringInterpLit(segments), span));
        } else {
            cur_segment
        };

        let dedented = self.dedent_string(&raw_content);
        let span = self.span_from(so, sl, sc);
        Ok(Token::new(TokenKind::StringLit(dedented), span))
    }
368
369 fn dedent_string(&self, s: &str) -> String {
370 let lines: Vec<&str> = s.split('\n').collect();
371 if lines.len() <= 1 {
372 return s.to_string();
373 }
374 let min_indent = lines
376 .iter()
377 .skip(1)
378 .filter(|l| !l.trim().is_empty())
379 .map(|l| l.len() - l.trim_start().len())
380 .min()
381 .unwrap_or(0);
382
383 let mut result = Vec::new();
384 for (i, line) in lines.iter().enumerate() {
385 if i == 0 {
386 result.push(*line);
387 } else if line.len() >= min_indent {
388 result.push(&line[min_indent..]);
389 } else {
390 result.push(line.trim());
391 }
392 }
393 let joined = result.join("\n");
395 let trimmed = joined.trim_start_matches('\n');
396 let trimmed = trimmed.trim_end_matches('\n');
397 trimmed.to_string()
398 }
399
400 fn dedent_interp_segments(&self, segments: &mut [(bool, String)]) {
401 for seg in segments.iter_mut() {
403 if !seg.0 {
404 seg.1 = self.dedent_string(&seg.1);
405 }
406 }
407 }
408
    /// Reads a raw string (`r"..."` or `r"""..."""`), the cursor sitting on
    /// the `r`. No escape processing or interpolation is performed; the
    /// triple-quoted form is dedented like a regular triple-quoted string.
    fn read_raw_string(&mut self) -> Result<Token, LexError> {
        let (so, sl, sc) = (self.byte_offset, self.line, self.col);
        self.advance(); // consume the 'r'
        // Triple-quoted raw string?
        if self.current() == Some('"') && self.peek() == Some('"') && self.peek2() == Some('"') {
            // Consume the opening `"""`.
            self.advance();
            self.advance();
            self.advance();
            let mut content = String::new();
            loop {
                match self.current() {
                    None => {
                        return Err(LexError::UnterminatedString {
                            line: self.base_line + sl - 1,
                            col: sc,
                        })
                    }
                    Some('"') if self.peek() == Some('"') && self.peek2() == Some('"') => {
                        self.advance();
                        self.advance();
                        self.advance();
                        break;
                    }
                    Some(c) => {
                        content.push(c);
                        self.advance();
                    }
                }
            }
            let dedented = self.dedent_string(&content);
            let span = self.span_from(so, sl, sc);
            return Ok(Token::new(TokenKind::RawStringLit(dedented), span));
        }
        // Single-quoted form must start with `"` right after the `r`.
        if self.current() != Some('"') {
            return Err(LexError::UnexpectedChar {
                ch: self.current().unwrap_or(' '),
                line: self.base_line + sl - 1,
                col: sc,
            });
        }
        self.advance(); // consume the opening quote
        let mut content = String::new();
        loop {
            match self.current() {
                // Single-quoted raw strings may not span lines.
                None | Some('\n') => {
                    return Err(LexError::UnterminatedString {
                        line: self.base_line + sl - 1,
                        col: sc,
                    })
                }
                Some('"') => {
                    self.advance();
                    break;
                }
                Some(c) => {
                    content.push(c);
                    self.advance();
                }
            }
        }
        let span = self.span_from(so, sl, sc);
        Ok(Token::new(TokenKind::RawStringLit(content), span))
    }
477
478 fn read_bytes_literal(&mut self) -> Result<Token, LexError> {
479 let (so, sl, sc) = (self.byte_offset, self.line, self.col);
480 self.advance(); if self.current() != Some('"') {
482 return Err(LexError::InvalidBytesLiteral {
483 line: self.base_line + sl - 1,
484 col: sc,
485 });
486 }
487 self.advance(); let mut bytes = Vec::new();
489 loop {
490 match self.current() {
491 None | Some('\n') => {
492 return Err(LexError::UnterminatedString {
493 line: self.base_line + sl - 1,
494 col: sc,
495 })
496 }
497 Some('"') => {
498 self.advance();
499 break;
500 }
501 Some(c) if c.is_ascii_hexdigit() => {
502 let hi = c;
503 self.advance();
504 match self.current() {
505 Some(lo) if lo.is_ascii_hexdigit() => {
506 self.advance();
507 let byte =
508 u8::from_str_radix(&format!("{}{}", hi, lo), 16).map_err(|_| {
509 LexError::InvalidBytesLiteral {
510 line: self.base_line + sl - 1,
511 col: sc,
512 }
513 })?;
514 bytes.push(byte);
515 }
516 _ => {
517 return Err(LexError::InvalidBytesLiteral {
518 line: self.base_line + sl - 1,
519 col: sc,
520 })
521 }
522 }
523 }
524 _ => {
525 return Err(LexError::InvalidBytesLiteral {
526 line: self.base_line + sl - 1,
527 col: sc,
528 })
529 }
530 }
531 }
532 let span = self.span_from(so, sl, sc);
533 Ok(Token::new(TokenKind::BytesLit(bytes), span))
534 }
535
    /// Reads a double-quoted string, the cursor sitting on the opening `"`.
    ///
    /// Dispatches to `read_triple_quoted_string` when the quote is actually
    /// a `"""`. Handles escape sequences and `{expr}` interpolation; a raw
    /// newline before the closing quote is an error.
    fn read_string(&mut self) -> Result<Token, LexError> {
        if self.peek() == Some('"') && self.peek2() == Some('"') {
            return self.read_triple_quoted_string();
        }

        let (so, sl, sc) = (self.byte_offset, self.line, self.col);
        self.advance(); // consume the opening quote
        let mut segments = Vec::new();
        let mut cur_segment = String::new();
        let mut is_interp = false;

        loop {
            match self.current() {
                None | Some('\n') => {
                    return Err(LexError::UnterminatedString {
                        line: self.base_line + sl - 1,
                        col: sc,
                    })
                }
                Some('\\') => {
                    self.advance();
                    self.process_escape(&mut cur_segment, sl, sc)?;
                }
                // `{` begins an interpolation only when followed by
                // something that can start an expression.
                Some('{') if self.looks_like_interpolation_start() => {
                    is_interp = true;
                    if !cur_segment.is_empty() {
                        segments.push((false, cur_segment.clone()));
                        cur_segment.clear();
                    }
                    self.advance();
                    // Capture the raw expression text up to the matching `}`,
                    // tracking nested braces and skipping nested strings.
                    let mut expr_str = String::new();
                    let mut brace_balance = 1;
                    while let Some(c) = self.current() {
                        if c == '}' {
                            brace_balance -= 1;
                            if brace_balance == 0 {
                                break;
                            }
                            expr_str.push(c);
                            self.advance();
                        } else if c == '{' {
                            brace_balance += 1;
                            expr_str.push(c);
                            self.advance();
                        } else if c == '"' {
                            expr_str.push(c);
                            self.advance();
                            // Copy the nested string verbatim, honoring
                            // escaped quotes.
                            while let Some(ic) = self.current() {
                                expr_str.push(ic);
                                self.advance();
                                if ic == '"' && !expr_str.ends_with("\\\"") {
                                    break;
                                }
                            }
                        } else {
                            expr_str.push(c);
                            self.advance();
                        }
                    }
                    if brace_balance != 0 {
                        return Err(LexError::UnterminatedString {
                            line: self.base_line + sl - 1,
                            col: sc,
                        });
                    }
                    self.advance(); // consume the closing `}`
                    segments.push((true, expr_str.trim().to_string()));
                }
                // A `{` that does not start an interpolation is literal text.
                Some('{') => {
                    cur_segment.push('{');
                    self.advance();
                }
                Some('"') => {
                    self.advance();
                    break;
                }
                Some(c) => {
                    cur_segment.push(c);
                    self.advance();
                }
            }
        }

        let span = self.span_from(so, sl, sc);
        if is_interp {
            if !cur_segment.is_empty() {
                segments.push((false, cur_segment));
            }
            Ok(Token::new(TokenKind::StringInterpLit(segments), span))
        } else {
            Ok(Token::new(TokenKind::StringLit(cur_segment), span))
        }
    }
633
    /// Reads an integer or float literal, the cursor on the first digit.
    ///
    /// Dispatches on `0x`/`0b`/`0o` prefixes; otherwise accepts decimal
    /// digits, `_` separators, one `.` fraction (only when followed by a
    /// digit, and never consuming a `..` range operator), and one `e`/`E`
    /// exponent with optional sign.
    fn read_number(&mut self) -> Result<Token, LexError> {
        let (so, sl, sc) = (self.byte_offset, self.line, self.col);

        if self.current() == Some('0') {
            match self.peek() {
                Some('x') | Some('X') => return self.read_hex_number(so, sl, sc),
                // `0b` only when a binary digit follows, so `0b` alone lexes
                // as the integer 0 followed by an identifier.
                Some('b') if matches!(self.peek2(), Some('0') | Some('1')) => {
                    return self.read_bin_number(so, sl, sc)
                }
                Some('o') => return self.read_oct_number(so, sl, sc),
                _ => {}
            }
        }

        let mut ns = String::new();
        let mut is_float = false;
        while let Some(ch) = self.current() {
            if ch.is_ascii_digit() {
                ns.push(ch);
                self.advance();
            } else if ch == '.' && !is_float {
                // `1..2` is a range expression, not a float.
                if self.peek() == Some('.') {
                    break;
                }
                // A `.` only counts when a digit follows, so `1.foo` lexes
                // as field access on the integer 1.
                if matches!(self.peek(), Some(d) if d.is_ascii_digit()) {
                    is_float = true;
                    ns.push(ch);
                    self.advance();
                } else {
                    break;
                }
            } else if ch == '_' {
                // `_` is a readability separator and is skipped.
                self.advance();
            } else if (ch == 'e' || ch == 'E') && !is_float {
                // Scientific notation always makes the literal a float.
                is_float = true;
                ns.push(ch);
                self.advance();
                if matches!(self.current(), Some('+') | Some('-')) {
                    ns.push(self.current().unwrap());
                    self.advance();
                }
            } else {
                break;
            }
        }
        let span = self.span_from(so, sl, sc);
        if is_float {
            ns.parse::<f64>()
                .map(|f| Token::new(TokenKind::FloatLit(f), span))
                .map_err(|_| LexError::InvalidNumber {
                    line: self.base_line + sl - 1,
                    col: sc,
                })
        } else {
            ns.parse::<i64>()
                .map(|n| Token::new(TokenKind::IntLit(n), span))
                .map_err(|_| LexError::InvalidNumber {
                    line: self.base_line + sl - 1,
                    col: sc,
                })
        }
    }
700
701 fn read_hex_number(&mut self, so: usize, sl: usize, sc: usize) -> Result<Token, LexError> {
702 self.advance(); self.advance(); let mut hex = String::new();
705 while let Some(ch) = self.current() {
706 if ch.is_ascii_hexdigit() {
707 hex.push(ch);
708 self.advance();
709 } else if ch == '_' {
710 self.advance();
711 } else {
712 break;
713 }
714 }
715 if hex.is_empty() {
716 return Err(LexError::InvalidNumber {
717 line: self.base_line + sl - 1,
718 col: sc,
719 });
720 }
721 let span = self.span_from(so, sl, sc);
722 i64::from_str_radix(&hex, 16)
723 .map(|n| Token::new(TokenKind::IntLit(n), span))
724 .map_err(|_| LexError::InvalidNumber {
725 line: self.base_line + sl - 1,
726 col: sc,
727 })
728 }
729
730 fn read_bin_number(&mut self, so: usize, sl: usize, sc: usize) -> Result<Token, LexError> {
731 self.advance(); self.advance(); let mut bin = String::new();
734 while let Some(ch) = self.current() {
735 if ch == '0' || ch == '1' {
736 bin.push(ch);
737 self.advance();
738 } else if ch == '_' {
739 self.advance();
740 } else {
741 break;
742 }
743 }
744 if bin.is_empty() {
745 return Err(LexError::InvalidNumber {
746 line: self.base_line + sl - 1,
747 col: sc,
748 });
749 }
750 let span = self.span_from(so, sl, sc);
751 i64::from_str_radix(&bin, 2)
752 .map(|n| Token::new(TokenKind::IntLit(n), span))
753 .map_err(|_| LexError::InvalidNumber {
754 line: self.base_line + sl - 1,
755 col: sc,
756 })
757 }
758
759 fn read_oct_number(&mut self, so: usize, sl: usize, sc: usize) -> Result<Token, LexError> {
760 self.advance(); self.advance(); let mut oct = String::new();
763 while let Some(ch) = self.current() {
764 if ('0'..='7').contains(&ch) {
765 oct.push(ch);
766 self.advance();
767 } else if ch == '_' {
768 self.advance();
769 } else {
770 break;
771 }
772 }
773 if oct.is_empty() {
774 return Err(LexError::InvalidNumber {
775 line: self.base_line + sl - 1,
776 col: sc,
777 });
778 }
779 let span = self.span_from(so, sl, sc);
780 i64::from_str_radix(&oct, 8)
781 .map(|n| Token::new(TokenKind::IntLit(n), span))
782 .map_err(|_| LexError::InvalidNumber {
783 line: self.base_line + sl - 1,
784 col: sc,
785 })
786 }
787
    /// Reads an identifier or keyword, the cursor on its first character.
    ///
    /// Consumes alphanumerics and `_`, then classifies the word against the
    /// keyword table; anything not in the table becomes `Ident`.
    fn read_ident(&mut self) -> Token {
        let (so, sl, sc) = (self.byte_offset, self.line, self.col);
        let mut id = String::new();
        while let Some(ch) = self.current() {
            if ch.is_alphanumeric() || ch == '_' {
                id.push(ch);
                self.advance();
            } else {
                break;
            }
        }
        let span = self.span_from(so, sl, sc);
        // Keyword table; note `null` (the literal) vs `Null` (the type).
        let kind = match id.as_str() {
            "record" => TokenKind::Record,
            "enum" => TokenKind::Enum,
            "cell" => TokenKind::Cell,
            "let" => TokenKind::Let,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "for" => TokenKind::For,
            "in" => TokenKind::In,
            "match" => TokenKind::Match,
            "return" => TokenKind::Return,
            "halt" => TokenKind::Halt,
            "end" => TokenKind::End,
            "use" => TokenKind::Use,
            "tool" => TokenKind::Tool,
            "as" => TokenKind::As,
            "grant" => TokenKind::Grant,
            "expect" => TokenKind::Expect,
            "schema" => TokenKind::Schema,
            "role" => TokenKind::Role,
            "where" => TokenKind::Where,
            "and" => TokenKind::And,
            "or" => TokenKind::Or,
            "not" => TokenKind::Not,
            "null" => TokenKind::NullLit,
            "Null" => TokenKind::Null,
            "result" => TokenKind::Result,
            "ok" => TokenKind::Ok_,
            "err" => TokenKind::Err_,
            "list" => TokenKind::List,
            "map" => TokenKind::Map,
            "true" => TokenKind::BoolLit(true),
            "false" => TokenKind::BoolLit(false),
            "while" => TokenKind::While,
            "loop" => TokenKind::Loop,
            "break" => TokenKind::Break,
            "continue" => TokenKind::Continue,
            "mut" => TokenKind::Mut,
            "const" => TokenKind::Const,
            "pub" => TokenKind::Pub,
            "import" => TokenKind::Import,
            "from" => TokenKind::From,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "parallel" => TokenKind::Parallel,
            "fn" => TokenKind::Fn,
            "trait" => TokenKind::Trait,
            "impl" => TokenKind::Impl,
            "type" => TokenKind::Type,
            "set" => TokenKind::Set,
            "tuple" => TokenKind::Tuple,
            "emit" => TokenKind::Emit,
            "yield" => TokenKind::Yield,
            "mod" => TokenKind::Mod,
            "self" => TokenKind::SelfKw,
            "with" => TokenKind::With,
            "try" => TokenKind::Try,
            "union" => TokenKind::Union,
            "step" => TokenKind::Step,
            "comptime" => TokenKind::Comptime,
            "macro" => TokenKind::Macro,
            "extern" => TokenKind::Extern,
            "then" => TokenKind::Then,
            "when" => TokenKind::When,
            "is" => TokenKind::Is,
            "defer" => TokenKind::Defer,
            "bool" => TokenKind::Bool,
            "int" => TokenKind::Int_,
            "float" => TokenKind::Float_,
            "string" => TokenKind::String_,
            "bytes" => TokenKind::Bytes,
            "json" => TokenKind::Json,
            _ => TokenKind::Ident(id),
        };
        Token::new(kind, span)
    }
878
879 fn single(&mut self, kind: TokenKind) -> Token {
880 let span = self.span_here();
881 self.advance();
882 Token::new(kind, span)
883 }
884
    /// Tokenizes the whole source, returning the token stream terminated by
    /// `Eof`.
    ///
    /// Indentation yields `Indent`/`Dedent` tokens; consecutive newlines
    /// collapse to one `Newline`; `#` starts a line comment; a `\` directly
    /// before a newline continues the logical line. Any open indentation
    /// levels are closed with `Dedent`s before the final `Eof`.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
        let mut tokens = Vec::new();
        while self.pos < self.source.len() {
            // At the start of each line, emit queued Indent/Dedent tokens
            // before lexing anything else.
            if self.at_line_start {
                self.at_line_start = false;
                self.handle_indentation()?;
                tokens.append(&mut self.pending);
            }
            let ch = match self.current() {
                Some(c) => c,
                None => break,
            };
            match ch {
                '\n' => {
                    let span = self.span_here();
                    self.advance();
                    // Suppress redundant Newlines (after another Newline,
                    // after an Indent, or at the very start of the stream).
                    if !matches!(
                        tokens.last().map(|t| &t.kind),
                        Some(TokenKind::Newline) | Some(TokenKind::Indent) | None
                    ) {
                        tokens.push(Token::new(TokenKind::Newline, span));
                    }
                }
                // Mid-line whitespace is skipped in one run.
                ' ' | '\t' | '\r' => {
                    while matches!(self.current(), Some(' ' | '\t' | '\r')) {
                        self.advance();
                    }
                }
                // Line comment: skip to end of line.
                '#' => {
                    while matches!(self.current(), Some(c) if c != '\n') {
                        self.advance();
                    }
                }
                '"' => tokens.push(self.read_string()?),
                '0'..='9' => tokens.push(self.read_number()?),
                // `r"` / `b"` prefixes take priority over identifiers.
                'r' if self.peek() == Some('"') => tokens.push(self.read_raw_string()?),
                'b' if self.peek() == Some('"') => tokens.push(self.read_bytes_literal()?),
                'a'..='z' | 'A'..='Z' | '_' => tokens.push(self.read_ident()),
                // Operators: each arm records the start position, then
                // greedily matches the longest operator.
                '+' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::PlusAssign,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        Some('+') => {
                            self.advance();
                            tokens
                                .push(Token::new(TokenKind::PlusPlus, self.span_from(so, sl, sc)));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Plus, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '-' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('>') => {
                            self.advance();
                            tokens.push(Token::new(TokenKind::Arrow, self.span_from(so, sl, sc)));
                        }
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::MinusAssign,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Minus, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '*' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('*') => {
                            self.advance();
                            if self.current() == Some('=') {
                                self.advance();
                                tokens.push(Token::new(
                                    TokenKind::StarStarAssign,
                                    self.span_from(so, sl, sc),
                                ));
                            } else {
                                tokens.push(Token::new(
                                    TokenKind::StarStar,
                                    self.span_from(so, sl, sc),
                                ));
                            }
                        }
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::StarAssign,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Star, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '/' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('/') => {
                            self.advance();
                            if self.current() == Some('=') {
                                self.advance();
                                tokens.push(Token::new(
                                    TokenKind::FloorDivAssign,
                                    self.span_from(so, sl, sc),
                                ));
                            } else {
                                tokens.push(Token::new(
                                    TokenKind::FloorDiv,
                                    self.span_from(so, sl, sc),
                                ));
                            }
                        }
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::SlashAssign,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Slash, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '%' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    if self.current() == Some('=') {
                        self.advance();
                        tokens.push(Token::new(
                            TokenKind::PercentAssign,
                            self.span_from(so, sl, sc),
                        ));
                    } else {
                        tokens.push(Token::new(TokenKind::Percent, self.span_from(so, sl, sc)));
                    }
                }
                '=' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(TokenKind::Eq, self.span_from(so, sl, sc)));
                        }
                        Some('>') => {
                            self.advance();
                            tokens
                                .push(Token::new(TokenKind::FatArrow, self.span_from(so, sl, sc)));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Assign, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '!' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(TokenKind::NotEq, self.span_from(so, sl, sc)));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Bang, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '?' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('?') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::QuestionQuestion,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        Some('.') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::QuestionDot,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        Some('[') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::QuestionBracket,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens
                                .push(Token::new(TokenKind::Question, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '<' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(TokenKind::LtEq, self.span_from(so, sl, sc)));
                        }
                        Some('<') => {
                            self.advance();
                            tokens
                                .push(Token::new(TokenKind::LeftShift, self.span_from(so, sl, sc)));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Lt, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '>' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(TokenKind::GtEq, self.span_from(so, sl, sc)));
                        }
                        Some('>') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::RightShift,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Gt, self.span_from(so, sl, sc)));
                        }
                    }
                }
                // `.`, `..`, `..=`, `...` — longest match wins.
                '.' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    if self.current() == Some('.') {
                        self.advance();
                        if self.current() == Some('.') {
                            self.advance();
                            tokens
                                .push(Token::new(TokenKind::DotDotDot, self.span_from(so, sl, sc)));
                        } else if self.current() == Some('=') {
                            self.advance();
                            tokens
                                .push(Token::new(TokenKind::DotDotEq, self.span_from(so, sl, sc)));
                        } else {
                            tokens.push(Token::new(TokenKind::DotDot, self.span_from(so, sl, sc)));
                        }
                    } else {
                        tokens.push(Token::new(TokenKind::Dot, self.span_from(so, sl, sc)));
                    }
                }
                ',' => tokens.push(self.single(TokenKind::Comma)),
                ':' => tokens.push(self.single(TokenKind::Colon)),
                ';' => tokens.push(self.single(TokenKind::Semicolon)),
                '|' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    match self.current() {
                        Some('>') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::PipeForward,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        Some('=') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::PipeAssign,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Pipe, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '@' => tokens.push(self.single(TokenKind::At)),
                '&' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    if self.current() == Some('=') {
                        self.advance();
                        tokens.push(Token::new(TokenKind::AmpAssign, self.span_from(so, sl, sc)));
                    } else {
                        tokens.push(Token::new(TokenKind::Ampersand, self.span_from(so, sl, sc)));
                    }
                }
                '~' => {
                    let so = self.byte_offset;
                    let sl = self.line;
                    let sc = self.col;
                    self.advance();
                    match self.current() {
                        Some('>') => {
                            self.advance();
                            tokens.push(Token::new(
                                TokenKind::TildeArrow,
                                self.span_from(so, sl, sc),
                            ));
                        }
                        _ => {
                            tokens.push(Token::new(TokenKind::Tilde, self.span_from(so, sl, sc)));
                        }
                    }
                }
                '^' => {
                    let (so, sl, sc) = (self.byte_offset, self.line, self.col);
                    self.advance();
                    if self.current() == Some('=') {
                        self.advance();
                        tokens.push(Token::new(
                            TokenKind::CaretAssign,
                            self.span_from(so, sl, sc),
                        ));
                    } else {
                        tokens.push(Token::new(TokenKind::Caret, self.span_from(so, sl, sc)));
                    }
                }
                '(' => tokens.push(self.single(TokenKind::LParen)),
                ')' => tokens.push(self.single(TokenKind::RParen)),
                '[' => tokens.push(self.single(TokenKind::LBracket)),
                ']' => tokens.push(self.single(TokenKind::RBracket)),
                '{' => tokens.push(self.single(TokenKind::LBrace)),
                '}' => tokens.push(self.single(TokenKind::RBrace)),
                // Line continuation: swallow `\` + newline and keep lexing
                // as if still on the same logical line (no indentation
                // handling, leading whitespace skipped).
                '\\' if self.peek() == Some('\n') => {
                    self.advance();
                    self.advance();
                    self.at_line_start = false;
                    while matches!(self.current(), Some(' ' | '\t')) {
                        self.advance();
                    }
                }
                // Anything unrecognized is passed through as a Symbol token.
                c => tokens.push(self.single(TokenKind::Symbol(c))),
            }
        }
        // Close any indentation levels still open at end of input.
        while self.indent_stack.len() > 1 {
            self.indent_stack.pop();
            tokens.push(Token::new(TokenKind::Dedent, self.span_here()));
        }
        tokens.push(Token::new(TokenKind::Eof, self.span_here()));
        Ok(tokens)
    }
1254}
1255
1256#[cfg(test)]
1257mod tests {
1258 use super::*;
1259
    // Keyword and identifier recognition on a small cell definition.
    #[test]
    fn test_lex_cell() {
        let src = "cell main() -> Int\n return 42\nend";
        let mut lexer = Lexer::new(src, 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Cell));
        assert!(matches!(&tokens[1].kind, TokenKind::Ident(s) if s == "main"));
    }

    // Basic single- and double-character operators.
    #[test]
    fn test_lex_operators() {
        let src = "a + b == c";
        let mut lexer = Lexer::new(src, 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[1].kind, TokenKind::Plus));
        assert!(matches!(&tokens[3].kind, TokenKind::Eq));
    }

    // Plain double-quoted string literal.
    #[test]
    fn test_lex_string() {
        let mut lexer = Lexer::new(r#""hello""#, 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::StringLit(s) if s == "hello"));
    }

    // An indented block produces matched Indent/Dedent tokens.
    #[test]
    fn test_lex_indent() {
        let mut lexer = Lexer::new("if x\n return 1\nend", 1, 0);
        let tokens = lexer.tokenize().unwrap();
        let kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert!(kinds.contains(&&TokenKind::Indent));
        assert!(kinds.contains(&&TokenKind::Dedent));
    }

    // `0x` prefix lexes as hexadecimal.
    #[test]
    fn test_lex_hex_number() {
        let mut lexer = Lexer::new("0xFF", 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::IntLit(255)));
    }

    // `0b` prefix lexes as binary.
    #[test]
    fn test_lex_bin_number() {
        let mut lexer = Lexer::new("0b1010", 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::IntLit(10)));
    }

    // `0o` prefix lexes as octal.
    #[test]
    fn test_lex_oct_number() {
        let mut lexer = Lexer::new("0o777", 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::IntLit(511)));
    }

    // Scientific notation yields a float literal.
    #[test]
    fn test_lex_scientific() {
        let mut lexer = Lexer::new("1e10", 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::FloatLit(f) if *f == 1e10));
    }

    // Compound assignment operators.
    #[test]
    fn test_lex_compound_assign() {
        let mut lexer = Lexer::new("+= -= *= /=", 1, 0);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::PlusAssign));
        assert!(matches!(&tokens[1].kind, TokenKind::MinusAssign));
        assert!(matches!(&tokens[2].kind, TokenKind::StarAssign));
        assert!(matches!(&tokens[3].kind, TokenKind::SlashAssign));
    }
1331
1332 #[test]
1333 fn test_lex_new_operators() {
1334 let mut lexer = Lexer::new("** .. ..= |> >> ?? ?. ! ? ... => ++ & ~ ^", 1, 0);
1335 let tokens = lexer.tokenize().unwrap();
1336 assert!(matches!(&tokens[0].kind, TokenKind::StarStar));
1337 assert!(matches!(&tokens[1].kind, TokenKind::DotDot));
1338 assert!(matches!(&tokens[2].kind, TokenKind::DotDotEq));
1339 assert!(matches!(&tokens[3].kind, TokenKind::PipeForward));
1340 assert!(matches!(&tokens[4].kind, TokenKind::RightShift));
1341 assert!(matches!(&tokens[5].kind, TokenKind::QuestionQuestion));
1342 assert!(matches!(&tokens[6].kind, TokenKind::QuestionDot));
1343 assert!(matches!(&tokens[7].kind, TokenKind::Bang));
1344 assert!(matches!(&tokens[8].kind, TokenKind::Question));
1345 assert!(matches!(&tokens[9].kind, TokenKind::DotDotDot));
1346 assert!(matches!(&tokens[10].kind, TokenKind::FatArrow));
1347 assert!(matches!(&tokens[11].kind, TokenKind::PlusPlus));
1348 assert!(matches!(&tokens[12].kind, TokenKind::Ampersand));
1349 assert!(matches!(&tokens[13].kind, TokenKind::Tilde));
1350 assert!(matches!(&tokens[14].kind, TokenKind::Caret));
1351 }
1352
1353 #[test]
1354 fn test_lex_new_keywords() {
1355 let mut lexer = Lexer::new("while loop break continue mut const pub import from async await parallel fn trait impl type set tuple emit yield mod self with try union step comptime macro extern then when", 1, 0);
1356 let tokens = lexer.tokenize().unwrap();
1357 assert!(matches!(&tokens[0].kind, TokenKind::While));
1358 assert!(matches!(&tokens[1].kind, TokenKind::Loop));
1359 assert!(matches!(&tokens[2].kind, TokenKind::Break));
1360 assert!(matches!(&tokens[3].kind, TokenKind::Continue));
1361 assert!(matches!(&tokens[4].kind, TokenKind::Mut));
1362 assert!(matches!(&tokens[5].kind, TokenKind::Const));
1363 assert!(matches!(&tokens[6].kind, TokenKind::Pub));
1364 assert!(matches!(&tokens[7].kind, TokenKind::Import));
1365 assert!(matches!(&tokens[8].kind, TokenKind::From));
1366 assert!(matches!(&tokens[9].kind, TokenKind::Async));
1367 assert!(matches!(&tokens[10].kind, TokenKind::Await));
1368 assert!(matches!(&tokens[11].kind, TokenKind::Parallel));
1369 assert!(matches!(&tokens[12].kind, TokenKind::Fn));
1370 assert!(matches!(&tokens[13].kind, TokenKind::Trait));
1371 assert!(matches!(&tokens[14].kind, TokenKind::Impl));
1372 assert!(matches!(&tokens[15].kind, TokenKind::Type));
1373 assert!(matches!(&tokens[16].kind, TokenKind::Set));
1374 assert!(matches!(&tokens[17].kind, TokenKind::Tuple));
1375 assert!(matches!(&tokens[18].kind, TokenKind::Emit));
1376 assert!(matches!(&tokens[19].kind, TokenKind::Yield));
1377 assert!(matches!(&tokens[20].kind, TokenKind::Mod));
1378 assert!(matches!(&tokens[21].kind, TokenKind::SelfKw));
1379 assert!(matches!(&tokens[22].kind, TokenKind::With));
1380 assert!(matches!(&tokens[23].kind, TokenKind::Try));
1381 assert!(matches!(&tokens[24].kind, TokenKind::Union));
1382 assert!(matches!(&tokens[25].kind, TokenKind::Step));
1383 assert!(matches!(&tokens[26].kind, TokenKind::Comptime));
1384 assert!(matches!(&tokens[27].kind, TokenKind::Macro));
1385 assert!(matches!(&tokens[28].kind, TokenKind::Extern));
1386 assert!(matches!(&tokens[29].kind, TokenKind::Then));
1387 assert!(matches!(&tokens[30].kind, TokenKind::When));
1388 }
1389
1390 #[test]
1391 fn test_lex_raw_string() {
1392 let mut lexer = Lexer::new(r#"r"no \n here""#, 1, 0);
1393 let tokens = lexer.tokenize().unwrap();
1394 assert!(matches!(&tokens[0].kind, TokenKind::RawStringLit(s) if s == r"no \n here"));
1395 }
1396
1397 #[test]
1398 fn test_lex_bytes_literal() {
1399 let mut lexer = Lexer::new(r#"b"48656C6C6F""#, 1, 0);
1400 let tokens = lexer.tokenize().unwrap();
1401 assert!(
1402 matches!(&tokens[0].kind, TokenKind::BytesLit(b) if b == &[0x48, 0x65, 0x6C, 0x6C, 0x6F])
1403 );
1404 }
1405
1406 #[test]
1407 fn test_lex_fat_arrow() {
1408 let mut lexer = Lexer::new("=>", 1, 0);
1409 let tokens = lexer.tokenize().unwrap();
1410 assert!(matches!(&tokens[0].kind, TokenKind::FatArrow));
1411 }
1412
1413 #[test]
1414 fn test_lex_line_continuation() {
1415 let mut lexer = Lexer::new("a +\\\n b", 1, 0);
1416 let tokens = lexer.tokenize().unwrap();
1417 let kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
1418 assert!(!kinds.contains(&&TokenKind::Newline));
1420 assert!(matches!(&tokens[0].kind, TokenKind::Ident(s) if s == "a"));
1421 assert!(matches!(&tokens[1].kind, TokenKind::Plus));
1422 assert!(matches!(&tokens[2].kind, TokenKind::Ident(s) if s == "b"));
1423 }
1424
1425 #[test]
1426 fn test_lex_null_literal() {
1427 let mut lexer = Lexer::new("null", 1, 0);
1428 let tokens = lexer.tokenize().unwrap();
1429 assert!(matches!(&tokens[0].kind, TokenKind::NullLit));
1430 }
1431
1432 #[test]
1433 fn test_lex_unicode_escape() {
1434 let mut lexer = Lexer::new(r#""\u{0041}""#, 1, 0);
1435 let tokens = lexer.tokenize().unwrap();
1436 assert!(matches!(&tokens[0].kind, TokenKind::StringLit(s) if s == "A"));
1437 }
1438
1439 #[test]
1440 fn test_lex_hex_byte_escape() {
1441 let mut lexer = Lexer::new(r#""\x41""#, 1, 0);
1442 let tokens = lexer.tokenize().unwrap();
1443 assert!(matches!(&tokens[0].kind, TokenKind::StringLit(s) if s == "A"));
1444 }
1445}