1use compact_str::CompactString;
2use serde::{Deserialize, Serialize};
3use thiserror::Error;
4
/// Half-open byte range `start..end` locating a token inside the lexed source.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Span {
 /// Byte offset of the token's first character.
 pub start: usize,
 /// Byte offset one past the token's last byte (equal to `start` for `Eof`).
 pub end: usize,
}
10
/// A single lexical unit: what was recognized plus where it sits in the source.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
 /// Classification of the token, including any payload (identifier text,
 /// decoded string contents, numeric value).
 pub kind: TokenKind,
 /// Byte range of the token's text within the source.
 pub span: Span,
}
16
/// Every kind of token the lexer can produce.
#[derive(Clone, Debug, PartialEq)]
pub enum TokenKind {
 /// Identifier text that is not one of the reserved words below.
 Ident(CompactString),
 /// Contents of a string literal: escapes already translated for quoted
 /// strings, taken verbatim for `r`/`R`-prefixed raw strings.
 String(CompactString),
 /// Numeric literal parsed as `f64`.
 Number(f64),
 /// `{`
 LBrace,
 /// `}`
 RBrace,
 /// `(`
 LParen,
 /// `)`
 RParen,
 /// `[`
 LBracket,
 /// `]`
 RBracket,
 /// `,`
 Comma,
 /// `:`
 Colon,
 /// `@`
 At,
 /// `?`
 Question,
 /// `.`
 Dot,
 /// `!` (when not followed by `=`)
 Bang,
 /// `=` (when not followed by `=`)
 Equal,
 /// `==`
 DoubleEqual,
 /// `!=`
 BangEqual,
 /// `&&` (a lone `&` is a lex error)
 AndAnd,
 /// `||`
 OrOr,
 /// `|` (when not followed by `|`)
 Pipe,
 /// `<` (when not followed by `=`)
 Less,
 /// `<=`
 LessEqual,
 /// `>` (when not followed by `=`)
 Greater,
 /// `>=`
 GreaterEqual,
 /// `+`
 Plus,
 /// `-`
 Minus,
 /// `*`
 Star,
 /// `/` (when not starting a `//` comment)
 Slash,
 /// `%`
 Percent,
 /// Reserved word `if`.
 If,
 /// Reserved word `else`.
 Else,
 /// Reserved word `for`.
 For,
 /// Reserved word `in`.
 In,
 /// Reserved word `await`.
 Await,
 /// Reserved word `cancel`.
 Cancel,
 /// Reserved word `submit`.
 Submit,
 /// Reserved word `print`.
 Print,
 /// Reserved word `call`.
 Call,
 /// Reserved word `and`.
 And,
 /// Reserved word `or`.
 Or,
 /// Reserved word `not`.
 Not,
 /// Reserved word `true`.
 True,
 /// Reserved word `false`.
 False,
 /// Reserved word `null`.
 Null,
 /// End-of-input marker; the lexer appends exactly one as the final token.
 Eof,
}
66
/// Failures the lexer can report; every variant records a byte offset into
/// the source.
#[derive(Debug, Error, PartialEq)]
pub enum LexError {
 /// A character that cannot begin any token (also a lone `&`), at `offset`.
 #[error("unexpected `{ch}`")]
 UnexpectedChar { ch: char, offset: usize },
 /// A string literal whose closing delimiter was never found; `offset` is
 /// where the literal starts.
 #[error("unterminated string")]
 UnterminatedString { offset: usize },
 /// A numeric lexeme that `f64` parsing rejected; `offset` is where the
 /// lexeme starts.
 #[error("invalid number `{lexeme}`")]
 InvalidNumber { lexeme: String, offset: usize },
}
76
77impl LexError {
78 pub fn offset(&self) -> usize {
79 match self {
80 Self::UnexpectedChar { offset, .. }
81 | Self::UnterminatedString { offset }
82 | Self::InvalidNumber { offset, .. } => *offset,
83 }
84 }
85}
86
87pub fn lex(source: &str) -> Result<Vec<Token>, LexError> {
88 let mut lexer = Lexer {
89 source,
90 chars: source.char_indices().peekable(),
91 };
92 lexer.lex_all()
93}
94
/// Scanner state: the full input plus a peekable cursor over its characters.
struct Lexer<'a> {
 /// Complete source text; sliced by byte offset to extract lexemes.
 source: &'a str,
 /// Cursor yielding `(byte_offset, char)` pairs with one item of lookahead.
 chars: std::iter::Peekable<std::str::CharIndices<'a>>,
}
99
100impl<'a> Lexer<'a> {
101 fn lex_all(&mut self) -> Result<Vec<Token>, LexError> {
102 let mut tokens = Vec::with_capacity((self.source.len() / 4).max(8));
103 while let Some((offset, ch)) = self.peek() {
104 if ch.is_whitespace() || ch == ';' {
105 self.bump();
106 continue;
107 }
108 if ch == '#' {
109 self.skip_comment();
110 continue;
111 }
112 if ch == '/' && self.peek_second() == Some('/') {
113 self.bump();
114 self.bump();
115 self.skip_comment();
116 continue;
117 }
118
119 let token = match ch {
120 '{' => self.single(TokenKind::LBrace),
121 '}' => self.single(TokenKind::RBrace),
122 '(' => self.single(TokenKind::LParen),
123 ')' => self.single(TokenKind::RParen),
124 '[' => self.single(TokenKind::LBracket),
125 ']' => self.single(TokenKind::RBracket),
126 ',' => self.single(TokenKind::Comma),
127 ':' => self.single(TokenKind::Colon),
128 '@' => self.single(TokenKind::At),
129 '?' => self.single(TokenKind::Question),
130 '.' => self.single(TokenKind::Dot),
131 '+' => self.single(TokenKind::Plus),
132 '-' => self.single(TokenKind::Minus),
133 '*' => self.single(TokenKind::Star),
134 '/' => self.single(TokenKind::Slash),
135 '%' => self.single(TokenKind::Percent),
136 '=' => self.double_or_single('=', TokenKind::DoubleEqual, TokenKind::Equal),
137 '!' => self.double_or_single('=', TokenKind::BangEqual, TokenKind::Bang),
138 '&' => self.required_double('&', TokenKind::AndAnd)?,
139 '|' => self.double_or_single('|', TokenKind::OrOr, TokenKind::Pipe),
140 '<' => self.double_or_single('=', TokenKind::LessEqual, TokenKind::Less),
141 '>' => self.double_or_single('=', TokenKind::GreaterEqual, TokenKind::Greater),
142 '"' | '\'' => self.quoted_string(ch)?,
143 'r' | 'R' if self.raw_string_delimiter(offset).is_some() => self.raw_string()?,
144 c if is_ident_start(c) => self.ident_or_keyword(),
145 c if c.is_ascii_digit() => self.number()?,
146 _ => return Err(LexError::UnexpectedChar { ch, offset }),
147 };
148 tokens.push(token);
149 }
150
151 let end = self.source.len();
152 tokens.push(Token {
153 kind: TokenKind::Eof,
154 span: Span { start: end, end },
155 });
156 Ok(tokens)
157 }
158
159 fn single(&mut self, kind: TokenKind) -> Token {
160 let (start, ch) = self.bump().expect("single token requires input");
161 Token {
162 kind,
163 span: Span {
164 start,
165 end: start + ch.len_utf8(),
166 },
167 }
168 }
169
170 fn double_or_single(
171 &mut self,
172 second: char,
173 double_kind: TokenKind,
174 single_kind: TokenKind,
175 ) -> Token {
176 let (start, ch) = self.bump().expect("double token requires input");
177 let end = if self.consume_if(second) {
178 start + ch.len_utf8() + second.len_utf8()
179 } else {
180 start + ch.len_utf8()
181 };
182 Token {
183 kind: if end > start + ch.len_utf8() {
184 double_kind
185 } else {
186 single_kind
187 },
188 span: Span { start, end },
189 }
190 }
191
192 fn quoted_string(&mut self, quote: char) -> Result<Token, LexError> {
193 let (start, _) = self.peek().expect("string requires quote");
194 let delimiter = string_delimiter(quote, self.starts_with_triple_quote_at(start, quote));
195 let content_start = start + delimiter.len();
196 self.consume_until_byte(content_start);
197
198 let mut value = String::new();
199 while let Some((offset, ch)) = self.bump() {
200 if delimiter.len() == 1 {
201 if ch == quote {
202 return Ok(Token {
203 kind: TokenKind::String(value.into()),
204 span: Span {
205 start,
206 end: offset + quote.len_utf8(),
207 },
208 });
209 }
210 } else if self.starts_with_at(offset, &delimiter) {
211 self.consume_until_byte(offset + delimiter.len());
212 return Ok(Token {
213 kind: TokenKind::String(value.into()),
214 span: Span {
215 start,
216 end: offset + delimiter.len(),
217 },
218 });
219 }
220
221 if ch == '\\' {
222 let Some((_, escaped)) = self.bump() else {
223 return Err(LexError::UnterminatedString { offset: start });
224 };
225 value.push(translate_escape(escaped, quote));
226 } else {
227 value.push(ch);
228 }
229 }
230 Err(LexError::UnterminatedString { offset: start })
231 }
232
233 fn raw_string_delimiter(&self, offset: usize) -> Option<String> {
234 let rest = self.source.get(offset..)?;
235 let mut chars = rest.chars();
236 match chars.next()? {
237 'r' | 'R' => {}
238 _ => return None,
239 }
240 let quote = chars.next()?;
241 if quote != '"' && quote != '\'' {
242 return None;
243 }
244 let after_prefix = offset + 1;
245 Some(string_delimiter(
246 quote,
247 self.starts_with_triple_quote_at(after_prefix, quote),
248 ))
249 }
250
251 fn raw_string(&mut self) -> Result<Token, LexError> {
252 let (start, _) = self.peek().expect("raw string requires input");
253 let delimiter = self
254 .raw_string_delimiter(start)
255 .expect("raw string branch requires valid opener");
256 let content_start = start + 1 + delimiter.len();
257
258 let Some(relative_end) = self.source[content_start..].find(&delimiter) else {
259 return Err(LexError::UnterminatedString { offset: start });
260 };
261 let content_end = content_start + relative_end;
262 let end = content_end + delimiter.len();
263 let value = CompactString::from(&self.source[content_start..content_end]);
264 self.consume_until_byte(end);
265 Ok(Token {
266 kind: TokenKind::String(value),
267 span: Span { start, end },
268 })
269 }
270
271 fn required_double(&mut self, expected: char, kind: TokenKind) -> Result<Token, LexError> {
272 let (start, ch) = self.bump().expect("double token requires input");
273 if !self.consume_if(expected) {
274 return Err(LexError::UnexpectedChar { ch, offset: start });
275 }
276 Ok(Token {
277 kind,
278 span: Span {
279 start,
280 end: start + ch.len_utf8() + expected.len_utf8(),
281 },
282 })
283 }
284
285 fn ident_or_keyword(&mut self) -> Token {
286 let (start, _) = self.peek().expect("identifier requires input");
287 let mut end = start;
288 while let Some((offset, ch)) = self.peek() {
289 if !is_ident_continue(ch) {
290 break;
291 }
292 end = offset + ch.len_utf8();
293 self.bump();
294 }
295 let text = &self.source[start..end];
296 let kind = match text {
297 "if" => TokenKind::If,
298 "else" => TokenKind::Else,
299 "for" => TokenKind::For,
300 "in" => TokenKind::In,
301 "await" => TokenKind::Await,
302 "cancel" => TokenKind::Cancel,
303 "submit" => TokenKind::Submit,
304 "print" => TokenKind::Print,
305 "call" => TokenKind::Call,
306 "and" => TokenKind::And,
307 "or" => TokenKind::Or,
308 "not" => TokenKind::Not,
309 "true" => TokenKind::True,
310 "false" => TokenKind::False,
311 "null" => TokenKind::Null,
312 _ => TokenKind::Ident(text.into()),
313 };
314 Token {
315 kind,
316 span: Span { start, end },
317 }
318 }
319
320 fn number(&mut self) -> Result<Token, LexError> {
321 let (start, _) = self.peek().expect("number requires input");
322 let mut end = start;
323 let mut seen_dot = false;
324 while let Some((offset, ch)) = self.peek() {
325 if ch == '.' && !seen_dot {
326 seen_dot = true;
327 end = offset + 1;
328 self.bump();
329 continue;
330 }
331 if !ch.is_ascii_digit() {
332 break;
333 }
334 end = offset + ch.len_utf8();
335 self.bump();
336 }
337 let lexeme = &self.source[start..end];
338 let value = lexeme.parse::<f64>().map_err(|_| LexError::InvalidNumber {
339 lexeme: lexeme.to_string(),
340 offset: start,
341 })?;
342 Ok(Token {
343 kind: TokenKind::Number(value),
344 span: Span { start, end },
345 })
346 }
347
348 fn skip_comment(&mut self) {
349 while let Some((_, ch)) = self.bump() {
350 if ch == '\n' {
351 break;
352 }
353 }
354 }
355
356 fn bump(&mut self) -> Option<(usize, char)> {
357 self.chars.next()
358 }
359
360 fn peek(&mut self) -> Option<(usize, char)> {
361 self.chars.peek().copied()
362 }
363
364 fn peek_second(&self) -> Option<char> {
365 let mut chars = self.chars.clone();
366 chars.next()?;
367 chars.next().map(|(_, ch)| ch)
368 }
369
370 fn starts_with_at(&self, offset: usize, needle: &str) -> bool {
371 self.source[offset..].starts_with(needle)
372 }
373
374 fn starts_with_triple_quote_at(&self, offset: usize, quote: char) -> bool {
375 let mut delimiter = String::with_capacity(3);
376 delimiter.push(quote);
377 delimiter.push(quote);
378 delimiter.push(quote);
379 self.starts_with_at(offset, &delimiter)
380 }
381
382 fn consume_until_byte(&mut self, end: usize) {
383 while let Some((offset, _)) = self.peek() {
384 if offset >= end {
385 break;
386 }
387 self.bump();
388 }
389 }
390
391 fn consume_if(&mut self, expected: char) -> bool {
392 match self.peek() {
393 Some((_, ch)) if ch == expected => {
394 self.bump();
395 true
396 }
397 _ => false,
398 }
399 }
400}
401
/// Returns `true` when `ch` may begin an identifier: `_` or an ASCII letter.
fn is_ident_start(ch: char) -> bool {
    matches!(ch, '_' | 'a'..='z' | 'A'..='Z')
}
405
406fn is_ident_continue(ch: char) -> bool {
407 is_ident_start(ch) || ch.is_ascii_digit()
408}
409
/// Builds a string delimiter: `quote` repeated three times when `triple` is
/// set, otherwise a single `quote`.
fn string_delimiter(quote: char, triple: bool) -> String {
    let mut delimiter = String::new();
    delimiter.push(quote);
    if triple {
        delimiter.push(quote);
        delimiter.push(quote);
    }
    delimiter
}
414
/// Translates the character following a backslash inside a quoted string.
///
/// `n`, `r` and `t` become newline, carriage return and tab. Every other
/// character, including a backslash and the active `quote`, stands for itself.
fn translate_escape(escaped: char, quote: char) -> char {
    let control = match escaped {
        'n' => Some('\n'),
        'r' => Some('\r'),
        't' => Some('\t'),
        _ => None,
    };
    // Named escapes win over the quote comparison, as in the arm order of a
    // plain `match`.
    control.unwrap_or(if escaped == quote { quote } else { escaped })
}
425
// Unit tests for the lexer. The raw-string fixtures are whitespace-sensitive
// wherever they contain multi-line string literals, so they are kept verbatim.
#[cfg(test)]
mod tests {
 use super::*;

 // One fixture exercising every keyword, operator and punctuation token,
 // plus both comment styles, identifiers, an escaped string and numbers.
 #[test]
 fn lexes_all_token_classes_and_comments() {
 let tokens = lex(r#"
 # comment
 // comment
 if else for in await cancel submit print call and or not true false null start
 name _x a1 "hi\n\t\"\\\r\q" 12 3.5 { } ( ) [ ] , : @ ? . ! = == != && || | < <= > >= + - * / %
 "#)
 .expect("lexing should succeed");

 let kinds: Vec<_> = tokens.into_iter().map(|token| token.kind).collect();
 assert_eq!(
 kinds,
 vec![
 TokenKind::If,
 TokenKind::Else,
 TokenKind::For,
 TokenKind::In,
 TokenKind::Await,
 TokenKind::Cancel,
 TokenKind::Submit,
 TokenKind::Print,
 TokenKind::Call,
 TokenKind::And,
 TokenKind::Or,
 TokenKind::Not,
 TokenKind::True,
 TokenKind::False,
 TokenKind::Null,
 TokenKind::Ident("start".into()),
 TokenKind::Ident("name".into()),
 TokenKind::Ident("_x".into()),
 TokenKind::Ident("a1".into()),
 TokenKind::String("hi\n\t\"\\\rq".into()),
 TokenKind::Number(12.0),
 TokenKind::Number(3.5),
 TokenKind::LBrace,
 TokenKind::RBrace,
 TokenKind::LParen,
 TokenKind::RParen,
 TokenKind::LBracket,
 TokenKind::RBracket,
 TokenKind::Comma,
 TokenKind::Colon,
 TokenKind::At,
 TokenKind::Question,
 TokenKind::Dot,
 TokenKind::Bang,
 TokenKind::Equal,
 TokenKind::DoubleEqual,
 TokenKind::BangEqual,
 TokenKind::AndAnd,
 TokenKind::OrOr,
 TokenKind::Pipe,
 TokenKind::Less,
 TokenKind::LessEqual,
 TokenKind::Greater,
 TokenKind::GreaterEqual,
 TokenKind::Plus,
 TokenKind::Minus,
 TokenKind::Star,
 TokenKind::Slash,
 TokenKind::Percent,
 TokenKind::Eof,
 ]
 );
 }

 // A character that starts no token is reported with its offset.
 #[test]
 fn rejects_unexpected_characters() {
 let err = lex("`").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnexpectedChar { ch: '`', offset: 0 });
 }

 // Single-, triple- and raw-quoted literals with both quote characters;
 // escapes are translated in quoted strings and left alone in raw ones.
 #[test]
 fn lexes_python_shaped_string_literals() {
 let tokens = lex(r####"
 double = "hi\n\t\"\\\r\q"
 single = 'it\'s ok\n'
 triple_double = """line1\n"quoted"
line2"""
 triple_single = '''line1\n'quoted'
line2'''
 raw_double = r"*** Begin Patch
@@
\n { untouched }
*** End Patch"
 raw_single = r'path\to\file'
 "####)
 .expect("lexing should succeed");

 let strings: Vec<_> = tokens
 .into_iter()
 .filter_map(|token| match token.kind {
 TokenKind::String(value) => Some(value),
 _ => None,
 })
 .collect();
 assert_eq!(
 strings,
 vec![
 CompactString::from("hi\n\t\"\\\rq"),
 CompactString::from("it's ok\n"),
 CompactString::from("line1\n\"quoted\"\nline2"),
 CompactString::from("line1\n'quoted'\nline2"),
 CompactString::from("*** Begin Patch\n@@\n\\n { untouched }\n*** End Patch"),
 CompactString::from("path\\to\\file"),
 ]
 );
 }

 // String bodies containing operator, comment and annotation characters
 // must come back as plain string contents.
 #[test]
 fn lexes_shell_and_formatter_shaped_string_literals() {
 let tokens = lex(r####"
 date = "date '+%Y-%m-%d %H:%M:%S %Z (%z)'"
 printf = 'printf "%s\\n" "$value"'
 json = "{\"cmd\":\"echo 'ok'\"}"
 shell = "${HOME:-/tmp} && echo %done"
 comment_text = "// not a comment # also not a comment"
 label_text = "@label(title: \"plain\")"
 "####)
 .expect("lexing should succeed");

 let strings: Vec<_> = tokens
 .into_iter()
 .filter_map(|token| match token.kind {
 TokenKind::String(value) => Some(value),
 _ => None,
 })
 .collect();
 assert_eq!(
 strings,
 vec![
 CompactString::from("date '+%Y-%m-%d %H:%M:%S %Z (%z)'"),
 CompactString::from("printf \"%s\\n\" \"$value\""),
 CompactString::from("{\"cmd\":\"echo 'ok'\"}"),
 CompactString::from("${HOME:-/tmp} && echo %done"),
 CompactString::from("// not a comment # also not a comment"),
 CompactString::from("@label(title: \"plain\")"),
 ]
 );
 }

 // Raw triple-quoted strings (`r'''` and `R"""`) end only at the matching
 // triple delimiter and keep their body verbatim.
 #[test]
 fn lexes_raw_triple_strings() {
 let tokens = lex(r#####"
 script = r'''python3 - <<'PY'
print("""double quotes are preserved""")
\n { braces stay raw }
PY'''
 markdown = R"""This body can mention " and ' without escaping.
It ends only at three double quotes."""
 "#####)
 .expect("lexing should succeed");

 let strings: Vec<_> = tokens
 .into_iter()
 .filter_map(|token| match token.kind {
 TokenKind::String(value) => Some(value),
 _ => None,
 })
 .collect();
 assert_eq!(
 strings,
 vec![
 CompactString::from(
 "python3 - <<'PY'\nprint(\"\"\"double quotes are preserved\"\"\")\n\\n { braces stay raw }\nPY",
 ),
 CompactString::from(
 "This body can mention \" and ' without escaping.\nIt ends only at three double quotes.",
 ),
 ]
 );
 }

 // `@label(...)` text inside any string flavor must never yield an `At` token.
 #[test]
 fn lexes_label_annotation_text_inside_strings_as_strings() {
 let tokens = lex(r####"
 regular = "@label(title: \"plain\")"
 multiline = """@label(title: "plain")
finish null"""
 raw = r"""@label(title: "plain")
@label(title: "still plain") finish null"""
 "####)
 .expect("lexing should succeed");

 assert!(
 tokens
 .iter()
 .all(|token| !matches!(token.kind, TokenKind::At)),
 "`@` inside strings must not lex as annotation syntax"
 );
 let strings: Vec<_> = tokens
 .into_iter()
 .filter_map(|token| match token.kind {
 TokenKind::String(value) => Some(value),
 _ => None,
 })
 .collect();
 assert_eq!(
 strings,
 vec![
 CompactString::from("@label(title: \"plain\")"),
 CompactString::from("@label(title: \"plain\")\nfinish null"),
 CompactString::from(
 "@label(title: \"plain\")\n@label(title: \"still plain\") finish null"
 ),
 ]
 );
 }

 // A lone `/` is division; `//` starts a comment that runs to end of line.
 #[test]
 fn lexes_double_slash_comments_without_breaking_division() {
 let tokens = lex(r#"
 value = 6 / 2
 // trailing comment
 submit value
 "#)
 .expect("lexing should succeed");

 let kinds: Vec<_> = tokens.into_iter().map(|token| token.kind).collect();
 assert_eq!(
 kinds,
 vec![
 TokenKind::Ident("value".into()),
 TokenKind::Equal,
 TokenKind::Number(6.0),
 TokenKind::Slash,
 TokenKind::Number(2.0),
 TokenKind::Submit,
 TokenKind::Ident("value".into()),
 TokenKind::Eof,
 ]
 );
 }

 // Every string flavor reports `UnterminatedString` at the literal's start
 // when the closing delimiter is missing, including after a dangling `\`.
 #[test]
 fn rejects_unterminated_strings() {
 let err = lex("\"abc").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });

 let err = lex("\"abc\\").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });

 let err = lex("\"\"\"abc").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });

 let err = lex("'abc").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });

 let err = lex("'''abc").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });

 let err = lex("r\"abc").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });

 let err = lex("r'''abc").expect_err("lexing should fail");
 assert_eq!(err, LexError::UnterminatedString { offset: 0 });
 }

 // `r#"..."#` is not raw-string syntax here: `r` lexes as an identifier and
 // the `#` begins a comment that swallows the rest of the line.
 #[test]
 fn rust_style_raw_strings_are_not_recognized() {
 let tokens = lex("submit r#\"abc\"#").expect("lexing treats hash as comment");
 let kinds: Vec<_> = tokens.into_iter().map(|token| token.kind).collect();
 assert_eq!(
 kinds,
 vec![
 TokenKind::Submit,
 TokenKind::Ident("r".into()),
 TokenKind::Eof,
 ]
 );
 }

 // Calls `number` directly on a lone "." to exercise the `InvalidNumber`
 // path; `lex_all` only dispatches to `number` when a digit comes first.
 #[test]
 fn internal_number_error_path_is_covered() {
 let mut lexer = Lexer {
 source: ".",
 chars: ".".char_indices().peekable(),
 };
 let err = lexer.number().expect_err("number parsing should fail");
 assert_eq!(
 err,
 LexError::InvalidNumber {
 lexeme: ".".to_string(),
 offset: 0
 }
 );
 }

 // Accepting and rejecting cases for both identifier character predicates.
 #[test]
 fn identifier_helpers_cover_true_and_false_cases() {
 assert!(is_ident_start('_'));
 assert!(is_ident_start('a'));
 assert!(!is_ident_start('1'));

 assert!(is_ident_continue('9'));
 assert!(is_ident_continue('_'));
 assert!(!is_ident_continue('-'));
 }
}