1use std::iter::Peekable;
26use std::str::CharIndices;
27
28mod error;
29pub use error::{LexError, LexErrorKind};
31mod span;
32pub use span::LineMap;
34mod iter;
35pub use iter::{tokenize_iter, Tokens};
37
/// The kind of a lexical token produced by [`Lexer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// Identifier: `[A-Za-z_][A-Za-z0-9_]*` that is not a keyword below.
    Ident(String),
    /// Numeric literal, stored as the exact source spelling (e.g. `"1.2e-3"`).
    Number(String),
    /// String literal with escape sequences already decoded.
    String(String),
    // Keywords.
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    // Delimiters.
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    /// The two-character arrow `->`.
    Arrow,
    // Operators.
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
}
69
/// Half-open byte range `start..end` into the source text a token or error
/// was lexed from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}
76
/// A single lexed token: its kind plus the source span it came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
83
/// Hand-written lexer with one character of lookahead.
pub struct Lexer<'a> {
    // Full source text; kept so lexeme spellings can be sliced out by span.
    src: &'a str,
    // Underlying iterator, positioned one character PAST `cur`.
    it: Peekable<CharIndices<'a>>,
    // The current, not-yet-consumed character and its byte offset.
    cur: Option<(usize, char)>,
}
90
91impl<'a> Lexer<'a> {
92 pub fn new(src: &'a str) -> Self {
93 let mut it = src.char_indices().peekable();
94 let cur = it.next();
95 Self { src, it, cur }
96 }
97
98 fn bump(&mut self) -> Option<(usize, char)> {
99 let out = self.cur;
100 self.cur = self.it.next();
101 out
102 }
103
104 fn peek(&self) -> Option<(usize, char)> {
105 self.cur
106 }
107
108 fn skip_ws_and_comments(&mut self) {
109 loop {
110 let mut progressed = false;
111 while let Some((_, c)) = self.peek() {
112 if c.is_whitespace() {
113 self.bump();
114 progressed = true;
115 } else {
116 break;
117 }
118 }
119 if let Some((_, '/')) = self.peek() {
120 let mut clone = self.it.clone();
121 if let Some((_, '/')) = clone.next() {
122 self.bump();
123 self.bump();
124 while let Some((_, c)) = self.peek() {
125 if c == '\n' {
126 break;
127 }
128 self.bump();
129 }
130 continue;
131 }
132 }
133 if !progressed {
134 break;
135 }
136 }
137 }
138
139 fn kw_or_ident(s: &str) -> TokenKind {
140 match s {
141 "true" => TokenKind::True,
142 "false" => TokenKind::False,
143 "if" => TokenKind::If,
144 "then" => TokenKind::Then,
145 "else" => TokenKind::Else,
146 "let" => TokenKind::Let,
147 "rule" => TokenKind::Rule,
148 "and" => TokenKind::And,
149 "or" => TokenKind::Or,
150 _ => TokenKind::Ident(s.to_string()),
151 }
152 }
153
154 fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
155 let mut seen_dot = false;
156 let mut seen_exp = false;
157 let mut last_was_dot = false;
158 self.bump(); while let Some((idx, ch)) = self.peek() {
161 if ch.is_ascii_digit() {
162 self.bump();
163 last_was_dot = false;
164 } else if ch == '.' {
165 if seen_dot {
166 if last_was_dot {
168 break;
169 }
170 return Err(LexError::new(
172 LexErrorKind::InvalidNumber,
173 Span {
174 start,
175 end: idx + ch.len_utf8(),
176 },
177 ));
178 }
179 let mut clone = self.it.clone();
181 if let Some((_, next)) = clone.next() {
182 if next == '.' {
183 break;
184 }
185 if !next.is_ascii_digit() {
186 break;
187 }
188 } else {
189 break;
190 }
191 seen_dot = true;
192 last_was_dot = true;
193 self.bump();
194 } else if (ch == 'e' || ch == 'E') && !seen_exp {
195 seen_exp = true;
196 last_was_dot = false;
197 self.bump();
198 if let Some((_, sign)) = self.peek() {
199 if sign == '+' || sign == '-' {
200 self.bump();
201 }
202 }
203 match self.peek() {
204 Some((_, d)) if d.is_ascii_digit() => {}
205 _ => {
206 return Err(LexError::new(
207 LexErrorKind::InvalidNumber,
208 Span {
209 start,
210 end: idx + ch.len_utf8(),
211 },
212 ));
213 }
214 }
215 } else {
216 break;
217 }
218 }
219
220 let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
221 Ok(Token {
222 kind: TokenKind::Number(self.src[start..end].to_string()),
223 span: Span { start, end },
224 })
225 }
226
227 pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
228 let mut out = Vec::new();
229 loop {
230 self.skip_ws_and_comments();
231 let Some((i, c)) = self.peek() else {
232 break;
233 };
234
235 if c == '"' {
236 let start = i;
237 self.bump();
238 let mut s = String::new();
239 loop {
240 let Some((j, ch)) = self.bump() else {
241 return Err(LexError::new(
242 LexErrorKind::UnterminatedString,
243 Span {
244 start,
245 end: self.src.len(),
246 },
247 ));
248 };
249 match ch {
250 '\\' => {
251 let Some((_, esc)) = self.bump() else {
252 return Err(LexError::new(
253 LexErrorKind::UnterminatedEscape,
254 Span { start, end: j + 1 },
255 ));
256 };
257 let ch = match esc {
258 'n' => '\n',
259 't' => '\t',
260 'r' => '\r',
261 '"' => '"',
262 '\\' => '\\',
263 _ => {
264 return Err(LexError::new(
266 LexErrorKind::InvalidEscape,
267 Span {
268 start,
269 end: self.src.len(),
270 },
271 ));
272 }
273 };
274 s.push(ch);
275 }
276 '"' => {
277 out.push(Token {
278 kind: TokenKind::String(s),
279 span: Span { start, end: j + 1 },
280 });
281 break;
282 }
283 _ => s.push(ch),
284 }
285 }
286 continue;
287 }
288
289 if c.is_ascii_digit() {
290 match self.lex_number(i) {
291 Ok(tok) => out.push(tok),
292 Err(e) => return Err(e),
293 }
294 continue;
295 }
296
297 if c.is_ascii_alphabetic() || c == '_' {
298 let start = i;
299 self.bump();
300 while let Some((_, p)) = self.peek() {
301 if p.is_ascii_alphanumeric() || p == '_' {
302 self.bump();
303 } else {
304 break;
305 }
306 }
307 let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
308 let kind = Self::kw_or_ident(&self.src[start..end]);
309 out.push(Token {
310 kind,
311 span: Span { start, end },
312 });
313 continue;
314 }
315
316 if c == '-' {
317 let start = i;
318 self.bump();
319 if let Some((j, '>')) = self.peek() {
320 self.bump();
321 out.push(Token {
322 kind: TokenKind::Arrow,
323 span: Span { start, end: j + 1 },
324 });
325 } else {
326 out.push(Token {
327 kind: TokenKind::Minus,
328 span: Span {
329 start,
330 end: start + 1,
331 },
332 });
333 }
334 continue;
335 }
336
337 let start = i;
338 self.bump();
339 let tk = match c {
340 '(' => TokenKind::LParen,
341 ')' => TokenKind::RParen,
342 '{' => TokenKind::LBrace,
343 '}' => TokenKind::RBrace,
344 '[' => TokenKind::LBracket,
345 ']' => TokenKind::RBracket,
346 ',' => TokenKind::Comma,
347 ':' => TokenKind::Colon,
348 ';' => TokenKind::Semicolon,
349 '=' => TokenKind::Eq,
350 '+' => TokenKind::Plus,
351 '*' => TokenKind::Star,
352 '/' => TokenKind::Slash,
353 other => {
354 return Err(LexError::new(
355 LexErrorKind::UnexpectedChar,
356 Span {
357 start,
358 end: start + other.len_utf8(),
359 },
360 ))
361 }
362 };
363 out.push(Token {
364 kind: tk,
365 span: Span {
366 start,
367 end: start + 1,
368 },
369 });
370 }
371 Ok(out)
372 }
373}
374
375pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
378 Lexer::new(src).tokenize()
379}
380
#[cfg(test)]
mod tests {
    use super::*;

    // Every LexErrorKind has a stable `as_str` message and a Display of the
    // form "<msg> at <start>..<end>"; LexError is Clone + Debug + Error.
    #[test]
    fn error_kind_as_str_and_display_messages() {
        use super::{LexError, LexErrorKind, Span};
        let span = Span { start: 1, end: 3 };
        // (kind, expected as_str, expected Display message).
        let cases: &[(LexErrorKind, &str, &str)] = &[
            (
                LexErrorKind::UnexpectedChar,
                "unexpected character",
                "unexpected char",
            ),
            (
                LexErrorKind::UnterminatedString,
                "unterminated string",
                "unterminated string",
            ),
            (
                LexErrorKind::UnterminatedEscape,
                "unterminated escape",
                "unterminated escape",
            ),
            (
                LexErrorKind::InvalidNumber,
                "invalid number",
                "invalid number",
            ),
            (
                LexErrorKind::InvalidEscape,
                "invalid escape sequence",
                "invalid escape",
            ),
        ];

        for (kind, as_str_msg, display_msg) in cases.iter().cloned() {
            assert_eq!(kind.as_str(), as_str_msg);
            let err = LexError::new(kind, span);
            let rendered = format!("{}", err);
            assert_eq!(
                rendered,
                format!("{} at {}..{}", display_msg, span.start, span.end)
            );
            // Must be usable as a trait object and cloneable/debuggable.
            let _e: &dyn std::error::Error = &err;
            let _dbg = format!("{:?}", err.clone());
            assert!(!_dbg.is_empty());
        }
    }

    // A second dot inside one number errors; `1..2` stops the number at the
    // first dot, so the leftover '.' surfaces as UnexpectedChar.
    #[test]
    fn numbers_second_dot_invalid_unless_range() {
        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("1..2").expect_err("range dot should not be consumed by number");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    // Exponents with and without sign are accepted; missing exponent digits
    // are InvalidNumber.
    #[test]
    fn numbers_exponent_rules() {
        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));

        let err = tokenize("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("2E-").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }

    // Smoke test: keywords, strings, and comments over a small program.
    #[test]
    fn basic() {
        let code = r#"
        // sample
        let rule greet(name) = "hi, " + name
        if true and false then x = 1 else x = 2;
        "#;
        let toks = tokenize(code).unwrap();
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
    }

    // Number spellings are preserved verbatim; `..` is never part of a number.
    #[test]
    fn numbers_and_ranges() {
        let toks = tokenize("1 1.0 1.2e-3").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));

        let err = tokenize("1..2").expect_err("should error on unexpected '.'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    // All five supported escapes decode; an unknown escape (`\x`) errors.
    #[test]
    fn string_escapes() {
        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(_)));

        let err = tokenize("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
    }

    // `0.` lexes the number "0"; the leftover dot is an UnexpectedChar.
    #[test]
    fn numbers_trailing_dot_is_error() {
        let err = tokenize("0.").expect_err("trailing dot should error");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    // Empty strings, raw (unescaped) newlines inside strings, and mixed
    // escape/literal content all round-trip.
    #[test]
    fn strings_empty_and_raw_newline_and_escapes() {
        let toks = tokenize("\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));

        let toks = tokenize("\"a\nb\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));

        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
    }

    // EOF inside a string vs EOF right after a backslash are distinct errors.
    #[test]
    fn strings_unterminated_and_unterminated_escape() {
        let err = tokenize("\"abc").expect_err("unterminated string");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));

        let err = tokenize("\"abc\\").expect_err("unterminated escape");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }

    // Keyword matching is whole-word: `letx` and `_x1` stay identifiers.
    #[test]
    fn idents_and_keywords() {
        let toks = tokenize("let letx _x1").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Let));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
    }

    // Line comments produce no tokens and end at the newline.
    #[test]
    fn comments_do_not_leak() {
        let toks = tokenize("foo // comment\nbar").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
        assert_eq!(toks.len(), 2);
    }

    // An unknown character yields UnexpectedChar with a non-empty span.
    #[test]
    fn unknown_char_errors_with_span() {
        let err = tokenize("a @ b").expect_err("unknown char '@'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
        assert!(err.span.start < err.span.end);
    }

    // Golden test: exact token sequence for a representative input.
    #[test]
    fn golden_small_input() {
        let src = "let rule f(x) = \"hi\" + x";
        let toks = tokenize(src).unwrap();
        use TokenKind::*;
        let kinds: Vec<&'static str> = toks
            .iter()
            .map(|t| match &t.kind {
                Let => "Let",
                Rule => "Rule",
                Ident(s) if s == "f" => "Ident(f)",
                LParen => "LParen",
                Ident(s) if s == "x" => "Ident(x)",
                RParen => "RParen",
                Eq => "Eq",
                String(s) if s == "hi" => "String(hi)",
                Plus => "Plus",
                // NOTE(review): this arm duplicates the Ident(x) arm above and
                // can never be reached (first matching arm wins) — harmless.
                Ident(s) if s == "x" => "Ident(x)",
                other => panic!("unexpected token in golden: {:?}", other),
            })
            .collect();
        assert_eq!(
            kinds,
            vec![
                "Let",
                "Rule",
                "Ident(f)",
                "LParen",
                "Ident(x)",
                "RParen",
                "Eq",
                "String(hi)",
                "Plus",
                "Ident(x)"
            ]
        );
    }
}