//! A small hand-written lexer: `tokenize` turns source text into `Token`s
//! carrying byte-offset `Span`s.

use std::str::CharIndices;

mod error;
pub use error::{LexError, LexErrorKind};

mod span;
pub use span::LineMap;

mod iter;
pub use iter::{tokenize_iter, Tokens};

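/// Every kind of token the lexer can produce: identifiers, literals,
/// keywords, delimiters, and operators.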
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    Ident(String),
    Number(String),
    String(String),
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
}

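/// A half-open byte range `start..end` into the source text.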
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

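/// A lexed token: its kind plus the source span it was read from.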
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

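/// A single-pass lexer over `src` that keeps one character of lookahead.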
pub struct Lexer<'a> {
    src: &'a str,
    it: CharIndices<'a>,
    /// One `(byte index, char)` pair of lookahead; `None` at end of input.
    cur: Option<(usize, char)>,
}

impl<'a> Lexer<'a> {
    pub fn new(src: &'a str) -> Self {
        let mut it = src.char_indices();
        let cur = it.next();
        Self { src, it, cur }
    }

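    /// Consumes and returns the current character, advancing the lookahead.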
    fn bump(&mut self) -> Option<(usize, char)> {
        let out = self.cur;
        self.cur = self.it.next();
        out
    }

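    /// Returns the current character without consuming it.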
    fn peek(&self) -> Option<(usize, char)> {
        self.cur
    }

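    /// Skips whitespace and `//` line comments until the next significant
    /// character (or end of input).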
    fn skip_ws_and_comments(&mut self) {
        loop {
            let mut progressed = false;
            while let Some((_, c)) = self.peek() {
                if c.is_whitespace() {
                    self.bump();
                    progressed = true;
                } else {
                    break;
                }
            }
            if let Some((_, '/')) = self.peek() {
                // Clone the iterator to look one character past the `/`
                // without committing to consuming it.
                let mut clone = self.it.clone();
                if let Some((_, '/')) = clone.next() {
                    self.bump();
                    self.bump();
                    while let Some((_, c)) = self.peek() {
                        if c == '\n' {
                            break;
                        }
                        self.bump();
                    }
                    continue;
                }
            }
            if !progressed {
                break;
            }
        }
    }

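    /// Resolves a lexed word to a keyword token, falling back to `Ident`.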
    fn kw_or_ident(s: &str) -> TokenKind {
        match s {
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "if" => TokenKind::If,
            "then" => TokenKind::Then,
            "else" => TokenKind::Else,
            "let" => TokenKind::Let,
            "rule" => TokenKind::Rule,
            "and" => TokenKind::And,
            "or" => TokenKind::Or,
            _ => TokenKind::Ident(s.to_string()),
        }
    }

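    /// Lexes a number starting at byte offset `start`: digits, an optional
    /// fractional part, and an optional exponent with an optional sign.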
    fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
        let mut seen_dot = false;
        let mut seen_exp = false;
        let mut last_was_dot = false;
        // Consume the leading digit the caller peeked at.
        self.bump();
        while let Some((idx, ch)) = self.peek() {
            if ch.is_ascii_digit() {
                self.bump();
                last_was_dot = false;
            } else if ch == '.' {
                if seen_dot {
                    if last_was_dot {
                        break;
                    }
                    return Err(LexError::new(
                        LexErrorKind::InvalidNumber,
                        Span {
                            start,
                            end: idx + ch.len_utf8(),
                        },
                    ));
                }
                // Consume the dot only if a digit follows it; `1..2` and a
                // trailing `1.` end the number here instead.
                let mut clone = self.it.clone();
                if let Some((_, next)) = clone.next() {
                    if next == '.' {
                        break;
                    }
                    if !next.is_ascii_digit() {
                        break;
                    }
                } else {
                    break;
                }
                seen_dot = true;
                last_was_dot = true;
                self.bump();
            } else if (ch == 'e' || ch == 'E') && !seen_exp {
                seen_exp = true;
                last_was_dot = false;
                self.bump();
                // An optional sign may follow the exponent marker...
                if let Some((_, sign)) = self.peek() {
                    if sign == '+' || sign == '-' {
                        self.bump();
                    }
                }
                // ...but at least one digit is required after it.
                match self.peek() {
                    Some((_, d)) if d.is_ascii_digit() => {}
                    _ => {
                        return Err(LexError::new(
                            LexErrorKind::InvalidNumber,
                            Span {
                                start,
                                end: idx + ch.len_utf8(),
                            },
                        ));
                    }
                }
            } else {
                break;
            }
        }

        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
        Ok(Token {
            kind: TokenKind::Number(self.src[start..end].to_string()),
            span: Span { start, end },
        })
    }

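    /// Consumes the lexer and returns all tokens, or the first lexing error.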
    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
        let mut out = Vec::new();
        loop {
            self.skip_ws_and_comments();
            let Some((i, c)) = self.peek() else {
                break;
            };

            if c == '"' {
                let start = i;
                self.bump();
                let mut s = String::new();
                loop {
                    let Some((j, ch)) = self.bump() else {
                        return Err(LexError::new(
                            LexErrorKind::UnterminatedString,
                            Span {
                                start,
                                end: self.src.len(),
                            },
                        ));
                    };
                    match ch {
                        '\\' => {
                            let Some((k, esc)) = self.bump() else {
                                return Err(LexError::new(
                                    LexErrorKind::UnterminatedEscape,
                                    Span { start, end: j + 1 },
                                ));
                            };
                            let ch = match esc {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '"' => '"',
                                '\\' => '\\',
                                _ => {
                                    return Err(LexError::new(
                                        LexErrorKind::InvalidEscape,
                                        // Span the string up to and including the
                                        // bad escape, not the rest of the source.
                                        Span {
                                            start,
                                            end: k + esc.len_utf8(),
                                        },
                                    ));
                                }
                            };
                            s.push(ch);
                        }
                        '"' => {
                            out.push(Token {
                                kind: TokenKind::String(s),
                                span: Span { start, end: j + 1 },
                            });
                            break;
                        }
                        _ => s.push(ch),
                    }
                }
                continue;
            }

            if c.is_ascii_digit() {
                out.push(self.lex_number(i)?);
                continue;
            }

            if c.is_ascii_alphabetic() || c == '_' {
                let start = i;
                self.bump();
                while let Some((_, p)) = self.peek() {
                    if p.is_ascii_alphanumeric() || p == '_' {
                        self.bump();
                    } else {
                        break;
                    }
                }
                let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
                let kind = Self::kw_or_ident(&self.src[start..end]);
                out.push(Token {
                    kind,
                    span: Span { start, end },
                });
                continue;
            }

            if c == '-' {
                let start = i;
                self.bump();
                if let Some((j, '>')) = self.peek() {
                    self.bump();
                    out.push(Token {
                        kind: TokenKind::Arrow,
                        span: Span { start, end: j + 1 },
                    });
                } else {
                    out.push(Token {
                        kind: TokenKind::Minus,
                        span: Span {
                            start,
                            end: start + 1,
                        },
                    });
                }
                continue;
            }

            let start = i;
            self.bump();
            let tk = match c {
                '(' => TokenKind::LParen,
                ')' => TokenKind::RParen,
                '{' => TokenKind::LBrace,
                '}' => TokenKind::RBrace,
                '[' => TokenKind::LBracket,
                ']' => TokenKind::RBracket,
                ',' => TokenKind::Comma,
                ':' => TokenKind::Colon,
                ';' => TokenKind::Semicolon,
                '=' => TokenKind::Eq,
                '+' => TokenKind::Plus,
                '*' => TokenKind::Star,
                '/' => TokenKind::Slash,
                other => {
                    return Err(LexError::new(
                        LexErrorKind::UnexpectedChar,
                        Span {
                            start,
                            end: start + other.len_utf8(),
                        },
                    ))
                }
            };
            out.push(Token {
                kind: tk,
                span: Span {
                    start,
                    end: start + 1,
                },
            });
        }
        Ok(out)
    }
}

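/// Convenience entry point: lexes `src` in one pass.
///
/// ```ignore
/// // Sketch: assumes `tokenize` and `TokenKind` are in scope.
/// let tokens = tokenize("let x = 1").unwrap();
/// assert!(matches!(tokens[0].kind, TokenKind::Let));
/// ```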
pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
    Lexer::new(src).tokenize()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn numbers_second_dot_invalid_unless_range() {
        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("1..2").expect_err("range dot should not be consumed by number");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn numbers_exponent_rules() {
        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));

        let err = tokenize("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("2E-").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }

    #[test]
    fn basic() {
        let code = r#"
            // sample
            let rule greet(name) = "hi, " + name
            if true and false then x = 1 else x = 2;
        "#;
        let toks = tokenize(code).unwrap();
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
    }

    #[test]
    fn numbers_and_ranges() {
        let toks = tokenize("1 1.0 1.2e-3").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));

        let err = tokenize("1..2").expect_err("should error on unexpected '.'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn string_escapes() {
        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(_)));

        let err = tokenize("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
    }

    #[test]
    fn numbers_trailing_dot_is_error() {
        let err = tokenize("0.").expect_err("trailing dot should error");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn strings_empty_and_raw_newline_and_escapes() {
        let toks = tokenize("\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));

        let toks = tokenize("\"a\nb\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));

        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
    }

    #[test]
    fn strings_unterminated_and_unterminated_escape() {
        let err = tokenize("\"abc").expect_err("unterminated string");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));

        let err = tokenize("\"abc\\").expect_err("unterminated escape");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }

    #[test]
    fn idents_and_keywords() {
        let toks = tokenize("let letx _x1").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Let));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
    }

    #[test]
    fn comments_do_not_leak() {
        let toks = tokenize("foo // comment\nbar").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
        assert_eq!(toks.len(), 2);
    }

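    // Spans are half-open byte ranges into the source, so slicing the source
    // with a token's span recovers the exact lexeme text.
    #[test]
    fn spans_index_source_bytes() {
        let src = "ab + c";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks[0].span, Span { start: 0, end: 2 });
        assert_eq!(&src[toks[1].span.start..toks[1].span.end], "+");
        assert_eq!(toks[2].span, Span { start: 5, end: 6 });
    }
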
    #[test]
    fn unknown_char_errors_with_span() {
        let err = tokenize("a @ b").expect_err("unknown char '@'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
        assert!(err.span.start < err.span.end);
    }

    #[test]
    fn golden_small_input() {
        let src = "let rule f(x) = \"hi\" + x";
        let toks = tokenize(src).unwrap();
        use TokenKind::*;
        let kinds: Vec<&'static str> = toks
            .iter()
            .map(|t| match &t.kind {
                Let => "Let",
                Rule => "Rule",
                Ident(s) if s == "f" => "Ident(f)",
                LParen => "LParen",
                // This arm also covers the second `x` after `+`; a duplicate
                // arm below it would be unreachable.
                Ident(s) if s == "x" => "Ident(x)",
                RParen => "RParen",
                Eq => "Eq",
                String(s) if s == "hi" => "String(hi)",
                Plus => "Plus",
                other => panic!("unexpected token in golden: {:?}", other),
            })
            .collect();
        assert_eq!(
            kinds,
            vec![
                "Let",
                "Rule",
                "Ident(f)",
                "LParen",
                "Ident(x)",
                "RParen",
                "Eq",
                "String(hi)",
                "Plus",
                "Ident(x)"
            ]
        );
    }
}