1#![forbid(unsafe_code)]
2use std::iter::Peekable;
27use std::str::CharIndices;
28
29mod error;
30pub use error::{LexError, LexErrorKind};
32mod span;
33pub use span::LineMap;
35mod iter;
36pub use iter::{tokenize_iter, Tokens};
38
/// Zero-copy token kind: the `Ident` / `Number` / `String` payloads borrow
/// slices of the source text instead of allocating.
///
/// Note that `String` holds the raw slice between the quotes — escape
/// sequences are validated during lexing but NOT decoded (see
/// `Lexer::next_token_borrowed`). Use [`TokenKind`] for decoded, owned
/// payloads.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BorrowedTokenKind<'a> {
    Ident(&'a str),
    Number(&'a str),
    String(&'a str),
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
    Dot,
    DoubleDot,
    At,
}
75
/// A token whose payload borrows from the source (`Serialize`-only: a
/// borrowed token cannot be deserialized without an owner for the slice).
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BorrowedToken<'a> {
    pub kind: BorrowedTokenKind<'a>,
    pub span: Span,
}
83
/// Owned token kind; `String` payloads have escape sequences already decoded.
///
/// With the `serde` feature, variants serialize adjacently tagged as
/// `{"type": ..., "value": ...}` per the `tag`/`content` attributes below.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    Ident(String),
    Number(String),
    String(String),
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
    Dot,
    DoubleDot,
    At,
}
120
/// Half-open byte range `start..end` into the source string (byte offsets,
/// not char indices — multi-byte chars advance `end` by `len_utf8`).
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}
128
/// An owned token together with its source location.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
136
/// Single-pass lexer over `src` with one character of lookahead.
///
/// Invariant: `cur` always holds the next unconsumed `(byte_offset, char)`
/// (or `None` at end of input), and `it` yields the characters after it.
/// `peek`/`bump` operate on this buffer.
#[derive(Debug)]
pub struct Lexer<'a> {
    src: &'a str,
    it: Peekable<CharIndices<'a>>,
    cur: Option<(usize, char)>,
}
144
145impl<'a> Lexer<'a> {
146 pub fn new(src: &'a str) -> Self {
147 let mut it = src.char_indices().peekable();
148 let cur = it.next();
149 Self { src, it, cur }
150 }
151
152 fn bump(&mut self) -> Option<(usize, char)> {
153 let out = self.cur;
154 self.cur = self.it.next();
155 out
156 }
157
    /// Returns the current `(byte_offset, char)` without consuming it.
    fn peek(&self) -> Option<(usize, char)> {
        self.cur
    }
161
162 fn skip_ws_and_comments(&mut self) {
163 loop {
164 let mut progressed = false;
165 while let Some((_, c)) = self.peek() {
166 if c.is_whitespace() {
167 self.bump();
168 progressed = true;
169 } else {
170 break;
171 }
172 }
173 if let Some((_, '/')) = self.peek() {
174 let mut clone = self.it.clone();
175 if let Some((_, '/')) = clone.next() {
176 self.bump();
177 self.bump();
178 while let Some((_, c)) = self.peek() {
179 if c == '\n' {
180 break;
181 }
182 self.bump();
183 }
184 continue;
185 }
186 }
187 if !progressed {
188 break;
189 }
190 }
191 }
192
193 fn kw_or_ident(s: &str) -> TokenKind {
194 match s {
195 "true" => TokenKind::True,
196 "false" => TokenKind::False,
197 "if" => TokenKind::If,
198 "then" => TokenKind::Then,
199 "else" => TokenKind::Else,
200 "let" => TokenKind::Let,
201 "rule" => TokenKind::Rule,
202 "and" => TokenKind::And,
203 "or" => TokenKind::Or,
204 _ => TokenKind::Ident(s.to_string()),
205 }
206 }
207
    /// Lexes a number starting at byte offset `start`; the caller has already
    /// verified the current char is an ASCII digit. Accepts an optional
    /// fractional part and at most one exponent (`e`/`E`, optional sign,
    /// mandatory digits). Returns the raw source text as `TokenKind::Number`.
    ///
    /// Range interplay: a `.` is consumed as a decimal point only when a digit
    /// directly follows it, so `1..2` lexes as `1`, `..`, `2` and `0.` as
    /// `0`, `.` rather than erroring.
    ///
    /// Errors with `LexErrorKind::InvalidNumber` on a second fractional dot
    /// (`1.2.3`) or on an exponent without digits (`1e+`).
    ///
    /// NOTE(review): kept in sync by hand with `lex_number_borrowed`; any
    /// change here must be mirrored there.
    fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
        let mut seen_dot = false;
        let mut seen_exp = false;
        // True right after a decimal point was consumed.
        let mut last_was_dot = false;
        // Consume the leading digit the caller peeked.
        self.bump();
        while let Some((idx, ch)) = self.peek() {
            if ch.is_ascii_digit() {
                self.bump();
                last_was_dot = false;
            } else if ch == '.' {
                if seen_dot {
                    // Defensive: a consumed dot is always followed by a digit
                    // (see the lookahead below), so this branch should be
                    // unreachable in practice.
                    if last_was_dot {
                        break;
                    }
                    return Err(LexError::new(
                        LexErrorKind::InvalidNumber,
                        Span {
                            start,
                            end: idx + ch.len_utf8(),
                        },
                    ));
                }
                // Peek one char past the dot: only treat it as a decimal
                // point when a digit follows; otherwise stop and leave the
                // dot for the caller (it may be `.` or `..`).
                let mut clone = self.it.clone();
                if let Some((_, next)) = clone.next() {
                    if next == '.' {
                        break;
                    }
                    if !next.is_ascii_digit() {
                        break;
                    }
                } else {
                    break;
                }
                seen_dot = true;
                last_was_dot = true;
                self.bump();
            } else if (ch == 'e' || ch == 'E') && !seen_exp {
                seen_exp = true;
                last_was_dot = false;
                self.bump();
                // Optional sign, then at least one digit is required.
                if let Some((_, sign)) = self.peek() {
                    if sign == '+' || sign == '-' {
                        self.bump();
                    }
                }
                match self.peek() {
                    Some((_, d)) if d.is_ascii_digit() => {}
                    _ => {
                        return Err(LexError::new(
                            LexErrorKind::InvalidNumber,
                            Span {
                                start,
                                end: idx + ch.len_utf8(),
                            },
                        ));
                    }
                }
            } else {
                break;
            }
        }

        // End is the offset of the first unconsumed char, or the end of the
        // source when input ran out.
        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
        Ok(Token {
            kind: TokenKind::Number(self.src[start..end].to_string()),
            span: Span { start, end },
        })
    }
280
    /// Borrowed-token twin of [`Self::lex_number`]: identical scanning logic,
    /// but the payload is a slice of the source instead of an owned `String`.
    ///
    /// NOTE(review): hand-duplicated from `lex_number`; keep the two in sync.
    fn lex_number_borrowed(&mut self, start: usize) -> Result<BorrowedToken<'a>, LexError> {
        let mut seen_dot = false;
        let mut seen_exp = false;
        // True right after a decimal point was consumed.
        let mut last_was_dot = false;
        // Consume the leading digit the caller peeked.
        self.bump();
        while let Some((idx, ch)) = self.peek() {
            if ch.is_ascii_digit() {
                self.bump();
                last_was_dot = false;
            } else if ch == '.' {
                if seen_dot {
                    // Defensive: should be unreachable, as in `lex_number`.
                    if last_was_dot {
                        break;
                    }
                    return Err(LexError::new(
                        LexErrorKind::InvalidNumber,
                        Span {
                            start,
                            end: idx + ch.len_utf8(),
                        },
                    ));
                }
                // Consume the dot only when a digit follows (range support).
                let mut clone = self.it.clone();
                if let Some((_, next)) = clone.next() {
                    if next == '.' {
                        break;
                    }
                    if !next.is_ascii_digit() {
                        break;
                    }
                } else {
                    break;
                }
                seen_dot = true;
                last_was_dot = true;
                self.bump();
            } else if (ch == 'e' || ch == 'E') && !seen_exp {
                seen_exp = true;
                last_was_dot = false;
                self.bump();
                // Optional sign, then at least one digit is required.
                if let Some((_, sign)) = self.peek() {
                    if sign == '+' || sign == '-' {
                        self.bump();
                    }
                }
                match self.peek() {
                    Some((_, d)) if d.is_ascii_digit() => {}
                    _ => {
                        return Err(LexError::new(
                            LexErrorKind::InvalidNumber,
                            Span {
                                start,
                                end: idx + ch.len_utf8(),
                            },
                        ))
                    }
                }
            } else {
                break;
            }
        }

        // End is the offset of the first unconsumed char, or end of source.
        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
        Ok(BorrowedToken {
            kind: BorrowedTokenKind::Number(&self.src[start..end]),
            span: Span { start, end },
        })
    }
350
    /// Produces the next zero-copy token, or `None` at end of input.
    ///
    /// Dispatch order: string, `.`/`..`, number, identifier/keyword, `-`/`->`,
    /// then single-character punctuation. String escapes are validated
    /// (rejecting anything outside `\n \t \r \" \\`) but NOT decoded: the
    /// returned `String` payload is the raw slice between the quotes.
    ///
    /// NOTE(review): mirrors `next_token` by hand; keep the two in sync.
    fn next_token_borrowed(&mut self) -> Option<Result<BorrowedToken<'a>, LexError>> {
        self.skip_ws_and_comments();
        let (i, c) = self.peek()?;

        // String literal: scan to the closing quote, validating escapes.
        if c == '"' {
            let start = i;
            self.bump();
            let content_start = start + 1; // first byte after the opening quote
            loop {
                let Some((j, ch)) = self.bump() else {
                    // Input ended before a closing quote.
                    return Some(Err(LexError::new(
                        LexErrorKind::UnterminatedString,
                        Span {
                            start,
                            end: self.src.len(),
                        },
                    )));
                };
                match ch {
                    '\\' => {
                        let Some((k, esc)) = self.bump() else {
                            // Backslash was the last char of the input.
                            return Some(Err(LexError::new(
                                LexErrorKind::UnterminatedEscape,
                                Span {
                                    start: j,
                                    end: j + 1,
                                },
                            )));
                        };
                        match esc {
                            // Recognized escapes are accepted but left raw.
                            'n' | 't' | 'r' | '"' | '\\' => {
                                let _ = k;
                            }
                            _ => {
                                // Span covers backslash + offending char.
                                let escape_end = k + esc.len_utf8();
                                return Some(Err(LexError::new(
                                    LexErrorKind::InvalidEscape,
                                    Span {
                                        start: j,
                                        end: escape_end,
                                    },
                                )));
                            }
                        }
                    }
                    '"' => {
                        let end = j + 1;
                        return Some(Ok(BorrowedToken {
                            kind: BorrowedTokenKind::String(&self.src[content_start..j]),
                            span: Span { start, end },
                        }));
                    }
                    // Any other char (including raw newlines) stays in the slice.
                    _ => {}
                }
            }
        }

        // `.` or `..`.
        if c == '.' {
            let start = i;
            self.bump();
            if let Some((j, '.')) = self.peek() {
                self.bump();
                return Some(Ok(BorrowedToken {
                    kind: BorrowedTokenKind::DoubleDot,
                    span: Span { start, end: j + 1 },
                }));
            } else {
                return Some(Ok(BorrowedToken {
                    kind: BorrowedTokenKind::Dot,
                    span: Span {
                        start,
                        end: start + 1,
                    },
                }));
            }
        }

        if c.is_ascii_digit() {
            match self.lex_number_borrowed(i) {
                Ok(tok) => return Some(Ok(tok)),
                Err(e) => return Some(Err(e)),
            }
        }

        // Identifier or keyword: `[A-Za-z_][A-Za-z0-9_]*`.
        if c.is_ascii_alphabetic() || c == '_' {
            let start = i;
            self.bump();
            while let Some((_, p)) = self.peek() {
                if p.is_ascii_alphanumeric() || p == '_' {
                    self.bump();
                } else {
                    break;
                }
            }
            let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
            // Inline keyword table (the owned path uses `kw_or_ident`).
            let kind = match &self.src[start..end] {
                "true" => BorrowedTokenKind::True,
                "false" => BorrowedTokenKind::False,
                "if" => BorrowedTokenKind::If,
                "then" => BorrowedTokenKind::Then,
                "else" => BorrowedTokenKind::Else,
                "let" => BorrowedTokenKind::Let,
                "rule" => BorrowedTokenKind::Rule,
                "and" => BorrowedTokenKind::And,
                "or" => BorrowedTokenKind::Or,
                s => BorrowedTokenKind::Ident(s),
            };
            return Some(Ok(BorrowedToken {
                kind,
                span: Span { start, end },
            }));
        }

        // `-` or `->`.
        if c == '-' {
            let start = i;
            self.bump();
            if let Some((j, '>')) = self.peek() {
                self.bump();
                return Some(Ok(BorrowedToken {
                    kind: BorrowedTokenKind::Arrow,
                    span: Span { start, end: j + 1 },
                }));
            } else {
                return Some(Ok(BorrowedToken {
                    kind: BorrowedTokenKind::Minus,
                    span: Span {
                        start,
                        end: start + 1,
                    },
                }));
            }
        }

        // Single-character punctuation; anything unrecognized is an error.
        let start = i;
        self.bump();
        let tk = match c {
            '(' => BorrowedTokenKind::LParen,
            ')' => BorrowedTokenKind::RParen,
            '{' => BorrowedTokenKind::LBrace,
            '}' => BorrowedTokenKind::RBrace,
            '[' => BorrowedTokenKind::LBracket,
            ']' => BorrowedTokenKind::RBracket,
            ',' => BorrowedTokenKind::Comma,
            ':' => BorrowedTokenKind::Colon,
            ';' => BorrowedTokenKind::Semicolon,
            '=' => BorrowedTokenKind::Eq,
            '+' => BorrowedTokenKind::Plus,
            '*' => BorrowedTokenKind::Star,
            '/' => BorrowedTokenKind::Slash,
            '@' => BorrowedTokenKind::At,
            other => {
                return Some(Err(LexError::new(
                    LexErrorKind::UnexpectedChar,
                    Span {
                        start,
                        end: start + other.len_utf8(),
                    },
                )));
            }
        };
        Some(Ok(BorrowedToken {
            kind: tk,
            span: Span {
                start,
                end: start + 1,
            },
        }))
    }
527
    /// Produces the next owned [`Token`], or `None` at end of input.
    ///
    /// Dispatch order: string, `.`/`..`, number, identifier/keyword, `-`/`->`,
    /// then single-character punctuation. String escape sequences
    /// (`\n \t \r \" \\`) are decoded into the returned `String`; raw
    /// newlines inside a string are allowed and kept as-is.
    ///
    /// NOTE(review): mirrors `next_token_borrowed` by hand; keep in sync.
    #[inline]
    pub(crate) fn next_token(&mut self) -> Option<Result<Token, LexError>> {
        self.skip_ws_and_comments();
        let (i, c) = self.peek()?;

        // String literal: decode escapes into an owned buffer.
        if c == '"' {
            let start = i;
            self.bump();
            let mut s = String::new();
            loop {
                let Some((j, ch)) = self.bump() else {
                    // Input ended before a closing quote.
                    return Some(Err(LexError::new(
                        LexErrorKind::UnterminatedString,
                        Span {
                            start,
                            end: self.src.len(),
                        },
                    )));
                };
                match ch {
                    '\\' => {
                        let Some((k, esc)) = self.bump() else {
                            // Backslash was the last char of the input.
                            return Some(Err(LexError::new(
                                LexErrorKind::UnterminatedEscape,
                                Span {
                                    start: j,
                                    end: j + 1,
                                },
                            )));
                        };
                        let ch = match esc {
                            'n' => '\n',
                            't' => '\t',
                            'r' => '\r',
                            '"' => '"',
                            '\\' => '\\',
                            _ => {
                                // Span covers backslash + offending char.
                                let escape_end = k + esc.len_utf8();
                                return Some(Err(LexError::new(
                                    LexErrorKind::InvalidEscape,
                                    Span {
                                        start: j,
                                        end: escape_end,
                                    },
                                )));
                            }
                        };
                        s.push(ch);
                    }
                    '"' => {
                        return Some(Ok(Token {
                            kind: TokenKind::String(s),
                            span: Span { start, end: j + 1 },
                        }));
                    }
                    _ => s.push(ch),
                }
            }
        }

        // `.` or `..`.
        if c == '.' {
            let start = i;
            self.bump();
            if let Some((j, '.')) = self.peek() {
                self.bump();
                return Some(Ok(Token {
                    kind: TokenKind::DoubleDot,
                    span: Span { start, end: j + 1 },
                }));
            } else {
                return Some(Ok(Token {
                    kind: TokenKind::Dot,
                    span: Span {
                        start,
                        end: start + 1,
                    },
                }));
            }
        }

        if c.is_ascii_digit() {
            match self.lex_number(i) {
                Ok(tok) => return Some(Ok(tok)),
                Err(e) => return Some(Err(e)),
            }
        }

        // Identifier or keyword: `[A-Za-z_][A-Za-z0-9_]*`.
        if c.is_ascii_alphabetic() || c == '_' {
            let start = i;
            self.bump();
            while let Some((_, p)) = self.peek() {
                if p.is_ascii_alphanumeric() || p == '_' {
                    self.bump();
                } else {
                    break;
                }
            }
            let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
            let kind = Self::kw_or_ident(&self.src[start..end]);
            return Some(Ok(Token {
                kind,
                span: Span { start, end },
            }));
        }

        // `-` or `->`.
        if c == '-' {
            let start = i;
            self.bump();
            if let Some((j, '>')) = self.peek() {
                self.bump();
                return Some(Ok(Token {
                    kind: TokenKind::Arrow,
                    span: Span { start, end: j + 1 },
                }));
            } else {
                return Some(Ok(Token {
                    kind: TokenKind::Minus,
                    span: Span {
                        start,
                        end: start + 1,
                    },
                }));
            }
        }

        // Single-character punctuation; anything unrecognized is an error.
        let start = i;
        self.bump();
        let tk = match c {
            '(' => TokenKind::LParen,
            ')' => TokenKind::RParen,
            '{' => TokenKind::LBrace,
            '}' => TokenKind::RBrace,
            '[' => TokenKind::LBracket,
            ']' => TokenKind::RBracket,
            ',' => TokenKind::Comma,
            ':' => TokenKind::Colon,
            ';' => TokenKind::Semicolon,
            '=' => TokenKind::Eq,
            '+' => TokenKind::Plus,
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            '@' => TokenKind::At,
            other => {
                return Some(Err(LexError::new(
                    LexErrorKind::UnexpectedChar,
                    Span {
                        start,
                        end: start + other.len_utf8(),
                    },
                )));
            }
        };
        Some(Ok(Token {
            kind: tk,
            span: Span {
                start,
                end: start + 1,
            },
        }))
    }
697
698 pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
699 let mut out = Vec::new();
700 while let Some(res) = self.next_token() {
701 match res {
702 Ok(tok) => out.push(tok),
703 Err(e) => return Err(e),
704 }
705 }
706 Ok(out)
707 }
708}
709
/// Convenience entry point: lexes `src` into owned [`Token`]s, stopping at
/// the first error.
pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
    Lexer::new(src).tokenize()
}
715
716pub fn tokenize_borrowed(src: &str) -> Result<Vec<BorrowedToken<'_>>, LexError> {
719 let mut lx = Lexer::new(src);
720 let mut out = Vec::new();
721 while let Some(res) = lx.next_token_borrowed() {
722 match res {
723 Ok(t) => out.push(t),
724 Err(e) => return Err(e),
725 }
726 }
727 Ok(out)
728}
729
#[cfg(test)]
mod tests {
    use super::*;

    // `as_str()` and `Display` render distinct, stable messages; the two
    // strings intentionally differ for UnexpectedChar and InvalidEscape.
    #[test]
    fn error_kind_as_str_and_display_messages() {
        use super::{LexError, LexErrorKind, Span};
        let span = Span { start: 1, end: 3 };
        let cases: &[(LexErrorKind, &str, &str)] = &[
            (
                LexErrorKind::UnexpectedChar,
                "unexpected character",
                "unexpected char",
            ),
            (
                LexErrorKind::UnterminatedString,
                "unterminated string",
                "unterminated string",
            ),
            (
                LexErrorKind::UnterminatedEscape,
                "unterminated escape",
                "unterminated escape",
            ),
            (
                LexErrorKind::InvalidNumber,
                "invalid number",
                "invalid number",
            ),
            (
                LexErrorKind::InvalidEscape,
                "invalid escape sequence",
                "invalid escape",
            ),
        ];

        for (kind, as_str_msg, display_msg) in cases.iter().cloned() {
            assert_eq!(kind.as_str(), as_str_msg);
            let err = LexError::new(kind, span);
            let rendered = format!("{}", err);
            assert_eq!(
                rendered,
                format!("{} at {}..{}", display_msg, span.start, span.end)
            );
            // Also check the Error impl and Clone/Debug derive.
            let _e: &dyn std::error::Error = &err;
            let _dbg = format!("{:?}", err.clone());
            assert!(!_dbg.is_empty());
        }
    }
    #[test]
    fn numbers_second_dot_invalid_unless_range() {
        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let toks = tokenize("1..2").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Number(ref s) if s == "1"));
        assert!(matches!(toks[1].kind, TokenKind::DoubleDot));
        assert!(matches!(toks[2].kind, TokenKind::Number(ref s) if s == "2"));
    }

    #[test]
    fn numbers_exponent_rules() {
        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));

        // An exponent marker with no digits is rejected.
        let err = tokenize("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("2E-").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }
    #[test]
    fn basic() {
        let code = r#"
        // sample
        let rule greet(name) = "hi, " + name
        if true and false then x = 1 else x = 2;
        "#;
        let toks = tokenize(code).unwrap();
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
    }

    #[test]
    fn new_token_types_owned() {
        use TokenKind as K;
        let toks = tokenize("a.b ..c @d").unwrap();
        assert!(matches!(toks[1].kind, K::Dot));
        assert!(matches!(toks[3].kind, K::DoubleDot));
        assert!(matches!(toks[5].kind, K::At));
    }

    #[test]
    fn numbers_and_ranges() {
        let toks = tokenize("1 1.0 1.2e-3").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));

        let toks = tokenize("1..2").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Number(ref s) if s == "1"));
        assert!(matches!(toks[1].kind, TokenKind::DoubleDot));
        assert!(matches!(toks[2].kind, TokenKind::Number(ref s) if s == "2"));
    }

    #[test]
    fn string_escapes() {
        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(_)));

        let err = tokenize("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
    }

    // NOTE(review): name is stale — `0.` is NOT an error; it lexes as a
    // number followed by a Dot token (range support), as asserted below.
    #[test]
    fn numbers_trailing_dot_is_error() {
        let toks = tokenize("0.").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Number(ref s) if s == "0"));
        assert!(matches!(toks[1].kind, TokenKind::Dot));
    }

    #[test]
    fn dot_tokens_work() {
        let toks = tokenize("a.b ..c d.e").unwrap();
        use TokenKind as K;
        assert!(matches!(toks[1].kind, K::Dot));
        assert!(matches!(toks[3].kind, K::DoubleDot));
        assert!(matches!(toks[6].kind, K::Dot));
    }

    #[test]
    fn strings_empty_and_raw_newline_and_escapes() {
        let toks = tokenize("\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));

        // Raw newlines inside a string are allowed and preserved.
        let toks = tokenize("\"a\nb\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));

        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
    }

    #[test]
    fn streaming_iterator_matches_tokenize_and_propagates_error() {
        let src = "let x = 1 + 2\nrule r() = \"ok\"";
        let vec_tokens = tokenize(src).unwrap();
        let iter_tokens: Result<Vec<_>, _> = tokenize_iter(src).collect();
        let iter_tokens = iter_tokens.unwrap();
        assert_eq!(vec_tokens, iter_tokens);

        // After yielding an Err, the iterator must fuse (yield None).
        let src_err = "\"abc\\x\" rest";
        let mut it = tokenize_iter(src_err);
        match it.next() {
            Some(Err(e)) => assert!(matches!(e.kind, LexErrorKind::InvalidEscape)),
            other => panic!("expected first item to be Err, got {:?}", other),
        }
        assert!(it.next().is_none(), "iterator should end after error");
    }

    // The InvalidEscape span covers exactly the backslash and the bad char.
    #[test]
    fn invalid_escape_span_is_precise() {
        let src = "\"abc\\x\"";
        let err = tokenize(src).unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
        assert_eq!(err.span, Span { start: 4, end: 6 });
    }

    #[test]
    fn strings_unterminated_and_unterminated_escape() {
        let err = tokenize("\"abc").expect_err("unterminated string");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));

        let err = tokenize("\"abc\\").expect_err("unterminated escape");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }

    #[test]
    fn idents_and_keywords() {
        // Keyword matching is whole-word: "letx" stays an identifier.
        let toks = tokenize("let letx _x1").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Let));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
    }

    #[test]
    fn comments_do_not_leak() {
        let toks = tokenize("foo // comment\nbar").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
        assert_eq!(toks.len(), 2);
    }

    #[test]
    fn unknown_char_errors_with_span() {
        let err = tokenize("a # b").expect_err("unknown char '#'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
        assert!(err.span.start < err.span.end);
    }

    // Golden-style test: exact token sequence for a small fixed input.
    #[test]
    fn golden_small_input() {
        let src = "let rule f(x) = \"hi\" + x";
        let toks = tokenize(src).unwrap();
        use TokenKind::*;
        let kinds: Vec<&'static str> = toks
            .iter()
            .map(|t| match &t.kind {
                Let => "Let",
                Rule => "Rule",
                Ident(s) if s == "f" => "Ident(f)",
                LParen => "LParen",
                Ident(s) if s == "x" => "Ident(x)",
                RParen => "RParen",
                Eq => "Eq",
                String(s) if s == "hi" => "String(hi)",
                Plus => "Plus",
                Ident(s) if s == "x" => "Ident(x)",
                other => panic!("unexpected token in golden: {:?}", other),
            })
            .collect();
        assert_eq!(
            kinds,
            vec![
                "Let",
                "Rule",
                "Ident(f)",
                "LParen",
                "Ident(x)",
                "RParen",
                "Eq",
                "String(hi)",
                "Plus",
                "Ident(x)"
            ]
        );
    }

    #[cfg(feature = "serde")]
    #[test]
    fn serde_round_trip_token() {
        let toks = tokenize("let x = 1").unwrap();
        let json = serde_json::to_string(&toks).unwrap();
        let back: Vec<Token> = serde_json::from_str(&json).unwrap();
        assert_eq!(toks, back);
    }

    #[test]
    fn borrowed_basic_no_escapes() {
        let toks = tokenize_borrowed("let x = \"hi\" 123").unwrap();
        use BorrowedTokenKind as K;
        assert!(matches!(toks[0].kind, K::Let));
        assert!(matches!(toks[1].kind, K::Ident("x")));
        assert!(matches!(toks[3].kind, K::String("hi")));
        assert!(matches!(toks[4].kind, K::Number("123")));
    }

    // The borrowed path must NOT decode escapes: payload is the raw slice.
    #[test]
    fn borrowed_string_keeps_escapes() {
        let toks = tokenize_borrowed("\"a\\n\"").unwrap();
        use BorrowedTokenKind as K;
        assert!(matches!(toks[0].kind, K::String("a\\n")));
    }

    #[test]
    fn borrowed_operators_and_singles() {
        use BorrowedTokenKind as K;
        let src = "()->{}[],:;=+ - * / ->";
        let toks = tokenize_borrowed(src).unwrap();
        let kinds: Vec<&'static str> = toks
            .iter()
            .map(|t| match t.kind {
                K::LParen => "LParen",
                K::RParen => "RParen",
                K::Arrow => "Arrow",
                K::LBrace => "LBrace",
                K::RBrace => "RBrace",
                K::LBracket => "LBracket",
                K::RBracket => "RBracket",
                K::Comma => "Comma",
                K::Colon => "Colon",
                K::Semicolon => "Semicolon",
                K::Eq => "Eq",
                K::Plus => "Plus",
                K::Minus => "Minus",
                K::Star => "Star",
                K::Slash => "Slash",
                _ => "Other",
            })
            .collect();
        assert_eq!(
            kinds,
            vec![
                "LParen",
                "RParen",
                "Arrow",
                "LBrace",
                "RBrace",
                "LBracket",
                "RBracket",
                "Comma",
                "Colon",
                "Semicolon",
                "Eq",
                "Plus",
                "Minus",
                "Star",
                "Slash",
                "Arrow"
            ]
        );
    }

    #[test]
    fn borrowed_keywords_and_idents() {
        use BorrowedTokenKind as K;
        let toks =
            tokenize_borrowed("true false if then else let rule and or foo _bar a1").unwrap();
        assert!(matches!(toks[0].kind, K::True));
        assert!(matches!(toks[1].kind, K::False));
        assert!(matches!(toks[2].kind, K::If));
        assert!(matches!(toks[3].kind, K::Then));
        assert!(matches!(toks[4].kind, K::Else));
        assert!(matches!(toks[5].kind, K::Let));
        assert!(matches!(toks[6].kind, K::Rule));
        assert!(matches!(toks[7].kind, K::And));
        assert!(matches!(toks[8].kind, K::Or));
        assert!(matches!(toks[9].kind, K::Ident("foo")));
        assert!(matches!(toks[10].kind, K::Ident("_bar")));
        assert!(matches!(toks[11].kind, K::Ident("a1")));
    }

    #[test]
    fn borrowed_comments_skipped() {
        use BorrowedTokenKind as K;
        let toks = tokenize_borrowed("foo // comment\nbar").unwrap();
        assert!(matches!(toks[0].kind, K::Ident("foo")));
        assert!(matches!(toks[1].kind, K::Ident("bar")));
        assert_eq!(toks.len(), 2);
    }

    #[test]
    fn borrowed_numbers_errors_and_valid() {
        use BorrowedTokenKind as K;
        let toks = tokenize_borrowed("1 1.0 1.2e-3").unwrap();
        assert!(matches!(toks[0].kind, K::Number("1")));
        assert!(matches!(toks[1].kind, K::Number("1.0")));
        assert!(matches!(toks[2].kind, K::Number("1.2e-3")));
        let err = tokenize_borrowed("123.45.6").expect_err("second dot invalid");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
        let err = tokenize_borrowed("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }

    #[test]
    fn borrowed_string_errors() {
        let err = tokenize_borrowed("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
        let err = tokenize_borrowed("\"abc").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        let err = tokenize_borrowed("\"abc\\").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }

    #[test]
    fn borrowed_unexpected_char_error() {
        let err = tokenize_borrowed("a # b").expect_err("unexpected '#'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
        assert!(err.span.start < err.span.end);
    }

    #[test]
    fn new_token_types() {
        use BorrowedTokenKind as K;
        let toks = tokenize_borrowed("a.b ..c @d").unwrap();
        assert!(matches!(toks[1].kind, K::Dot));
        assert!(matches!(toks[3].kind, K::DoubleDot));
        assert!(matches!(toks[5].kind, K::At));
    }

    #[test]
    fn dot_vs_double_dot() {
        use BorrowedTokenKind as K;
        let toks = tokenize_borrowed("a.b ..c d.e").unwrap();
        assert!(matches!(toks[1].kind, K::Dot));
        assert!(matches!(toks[3].kind, K::DoubleDot));
        assert!(matches!(toks[6].kind, K::Dot));
    }
}