1use compact_str::CompactString;
2use serde::{Deserialize, Serialize};
3use thiserror::Error;
4
5#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
6pub struct Span {
7 pub start: usize,
8 pub end: usize,
9}
10
11#[derive(Clone, Debug, PartialEq)]
12pub struct Token {
13 pub kind: TokenKind,
14 pub span: Span,
15}
16
17#[derive(Clone, Debug, PartialEq)]
18pub enum TokenKind {
19 Ident(CompactString),
20 String(CompactString),
21 Number(f64),
22 LBrace,
23 RBrace,
24 LParen,
25 RParen,
26 LBracket,
27 RBracket,
28 Comma,
29 Colon,
30 At,
31 Question,
32 Dot,
33 Bang,
34 Equal,
35 DoubleEqual,
36 BangEqual,
37 AndAnd,
38 OrOr,
39 Pipe,
40 Less,
41 LessEqual,
42 Greater,
43 GreaterEqual,
44 Plus,
45 Minus,
46 Star,
47 Slash,
48 Percent,
49 If,
50 Else,
51 For,
52 In,
53 Await,
54 Cancel,
55 Submit,
56 Print,
57 Call,
58 And,
59 Or,
60 Not,
61 True,
62 False,
63 Null,
64 Eof,
65}
66
67#[derive(Debug, Error, PartialEq)]
68pub enum LexError {
69 #[error("unexpected `{ch}`")]
70 UnexpectedChar { ch: char, offset: usize },
71 #[error("unterminated string")]
72 UnterminatedString { offset: usize },
73 #[error("invalid number `{lexeme}`")]
74 InvalidNumber { lexeme: String, offset: usize },
75}
76
77impl LexError {
78 pub fn offset(&self) -> usize {
79 match self {
80 Self::UnexpectedChar { offset, .. }
81 | Self::UnterminatedString { offset }
82 | Self::InvalidNumber { offset, .. } => *offset,
83 }
84 }
85}
86
87pub fn lex(source: &str) -> Result<Vec<Token>, LexError> {
88 let mut lexer = Lexer {
89 source,
90 chars: source.char_indices().peekable(),
91 };
92 lexer.lex_all()
93}
94
/// Cursor over the source text used while lexing.
struct Lexer<'src> {
    /// The complete input; sliced by byte offset to recover lexemes.
    source: &'src str,
    /// Remaining `(byte offset, char)` pairs of `source`, with one item of lookahead.
    chars: std::iter::Peekable<std::str::CharIndices<'src>>,
}
99
100impl<'a> Lexer<'a> {
101 fn lex_all(&mut self) -> Result<Vec<Token>, LexError> {
102 let mut tokens = Vec::with_capacity((self.source.len() / 4).max(8));
103 while let Some((offset, ch)) = self.peek() {
104 if ch.is_whitespace() || ch == ';' {
105 self.bump();
106 continue;
107 }
108 if ch == '#' {
109 self.skip_comment();
110 continue;
111 }
112 if ch == '/' && self.peek_second() == Some('/') {
113 self.bump();
114 self.bump();
115 self.skip_comment();
116 continue;
117 }
118
119 let token = match ch {
120 '{' => self.single(TokenKind::LBrace),
121 '}' => self.single(TokenKind::RBrace),
122 '(' => self.single(TokenKind::LParen),
123 ')' => self.single(TokenKind::RParen),
124 '[' => self.single(TokenKind::LBracket),
125 ']' => self.single(TokenKind::RBracket),
126 ',' => self.single(TokenKind::Comma),
127 ':' => self.single(TokenKind::Colon),
128 '@' => self.single(TokenKind::At),
129 '?' => self.single(TokenKind::Question),
130 '.' => self.single(TokenKind::Dot),
131 '+' => self.single(TokenKind::Plus),
132 '-' => self.single(TokenKind::Minus),
133 '*' => self.single(TokenKind::Star),
134 '/' => self.single(TokenKind::Slash),
135 '%' => self.single(TokenKind::Percent),
136 '=' => self.double_or_single('=', TokenKind::DoubleEqual, TokenKind::Equal),
137 '!' => self.double_or_single('=', TokenKind::BangEqual, TokenKind::Bang),
138 '&' => self.required_double('&', TokenKind::AndAnd)?,
139 '|' => self.double_or_single('|', TokenKind::OrOr, TokenKind::Pipe),
140 '<' => self.double_or_single('=', TokenKind::LessEqual, TokenKind::Less),
141 '>' => self.double_or_single('=', TokenKind::GreaterEqual, TokenKind::Greater),
142 '"' if self.starts_with_at(offset, "\"\"\"") => {
143 self.triple_string(false, "\"\"\"")?
144 }
145 '"' => self.string()?,
146 'r' if self.starts_with_at(offset, "r\"\"\"") => {
147 self.triple_string(true, "\"\"\"")?
148 }
149 'r' if self.starts_with_at(offset, "r'''") => self.triple_string(true, "'''")?,
150 c if is_ident_start(c) => self.ident_or_keyword(),
151 c if c.is_ascii_digit() => self.number()?,
152 _ => return Err(LexError::UnexpectedChar { ch, offset }),
153 };
154 tokens.push(token);
155 }
156
157 let end = self.source.len();
158 tokens.push(Token {
159 kind: TokenKind::Eof,
160 span: Span { start: end, end },
161 });
162 Ok(tokens)
163 }
164
165 fn single(&mut self, kind: TokenKind) -> Token {
166 let (start, ch) = self.bump().expect("single token requires input");
167 Token {
168 kind,
169 span: Span {
170 start,
171 end: start + ch.len_utf8(),
172 },
173 }
174 }
175
176 fn double_or_single(
177 &mut self,
178 second: char,
179 double_kind: TokenKind,
180 single_kind: TokenKind,
181 ) -> Token {
182 let (start, ch) = self.bump().expect("double token requires input");
183 let end = if self.consume_if(second) {
184 start + ch.len_utf8() + second.len_utf8()
185 } else {
186 start + ch.len_utf8()
187 };
188 Token {
189 kind: if end > start + ch.len_utf8() {
190 double_kind
191 } else {
192 single_kind
193 },
194 span: Span { start, end },
195 }
196 }
197
198 fn string(&mut self) -> Result<Token, LexError> {
199 let (start, _) = self.bump().expect("string requires quote");
200 let content_start = start + 1;
201 let mut value: Option<String> = None;
202 while let Some((offset, ch)) = self.bump() {
203 match ch {
204 '"' => {
205 let value = match value {
206 Some(value) => CompactString::from(value),
207 None => CompactString::from(&self.source[content_start..offset]),
208 };
209 return Ok(Token {
210 kind: TokenKind::String(value),
211 span: Span {
212 start,
213 end: offset + 1,
214 },
215 });
216 }
217 '\\' => {
218 let value =
219 value.get_or_insert_with(|| self.source[content_start..offset].to_string());
220 let Some((_, escaped)) = self.bump() else {
221 return Err(LexError::UnterminatedString { offset: start });
222 };
223 let translated = match escaped {
224 '"' => '"',
225 '\\' => '\\',
226 'n' => '\n',
227 'r' => '\r',
228 't' => '\t',
229 other => other,
230 };
231 value.push(translated);
232 }
233 other => {
234 if let Some(value) = &mut value {
235 value.push(other);
236 }
237 }
238 }
239 }
240 Err(LexError::UnterminatedString { offset: start })
241 }
242
243 fn triple_string(&mut self, raw: bool, delimiter: &str) -> Result<Token, LexError> {
244 let (start, _) = self.peek().expect("triple string requires input");
245 let delimiter_start = start + usize::from(raw);
246 let content_start = delimiter_start + delimiter.len();
247 self.consume_until_byte(content_start);
248
249 if raw {
250 let Some(relative_end) = self.source[content_start..].find(delimiter) else {
251 return Err(LexError::UnterminatedString { offset: start });
252 };
253 let content_end = content_start + relative_end;
254 let end = content_end + delimiter.len();
255 let value = CompactString::from(&self.source[content_start..content_end]);
256 self.consume_until_byte(end);
257 return Ok(Token {
258 kind: TokenKind::String(value),
259 span: Span { start, end },
260 });
261 }
262
263 let mut value = String::new();
264 while let Some((offset, ch)) = self.bump() {
265 if self.starts_with_at(offset, delimiter) {
266 self.consume_until_byte(offset + delimiter.len());
267 return Ok(Token {
268 kind: TokenKind::String(value.into()),
269 span: Span {
270 start,
271 end: offset + delimiter.len(),
272 },
273 });
274 }
275 if ch == '\\' {
276 let Some((_, escaped)) = self.bump() else {
277 return Err(LexError::UnterminatedString { offset: start });
278 };
279 let translated = match escaped {
280 '"' => '"',
281 '\\' => '\\',
282 'n' => '\n',
283 'r' => '\r',
284 't' => '\t',
285 other => other,
286 };
287 value.push(translated);
288 } else {
289 value.push(ch);
290 }
291 }
292
293 Err(LexError::UnterminatedString { offset: start })
294 }
295
296 fn required_double(&mut self, expected: char, kind: TokenKind) -> Result<Token, LexError> {
297 let (start, ch) = self.bump().expect("double token requires input");
298 if !self.consume_if(expected) {
299 return Err(LexError::UnexpectedChar { ch, offset: start });
300 }
301 Ok(Token {
302 kind,
303 span: Span {
304 start,
305 end: start + ch.len_utf8() + expected.len_utf8(),
306 },
307 })
308 }
309
310 fn ident_or_keyword(&mut self) -> Token {
311 let (start, _) = self.peek().expect("identifier requires input");
312 let mut end = start;
313 while let Some((offset, ch)) = self.peek() {
314 if !is_ident_continue(ch) {
315 break;
316 }
317 end = offset + ch.len_utf8();
318 self.bump();
319 }
320 let text = &self.source[start..end];
321 let kind = match text {
322 "if" => TokenKind::If,
323 "else" => TokenKind::Else,
324 "for" => TokenKind::For,
325 "in" => TokenKind::In,
326 "await" => TokenKind::Await,
327 "cancel" => TokenKind::Cancel,
328 "submit" => TokenKind::Submit,
329 "print" => TokenKind::Print,
330 "call" => TokenKind::Call,
331 "and" => TokenKind::And,
332 "or" => TokenKind::Or,
333 "not" => TokenKind::Not,
334 "true" => TokenKind::True,
335 "false" => TokenKind::False,
336 "null" => TokenKind::Null,
337 _ => TokenKind::Ident(text.into()),
338 };
339 Token {
340 kind,
341 span: Span { start, end },
342 }
343 }
344
345 fn number(&mut self) -> Result<Token, LexError> {
346 let (start, _) = self.peek().expect("number requires input");
347 let mut end = start;
348 let mut seen_dot = false;
349 while let Some((offset, ch)) = self.peek() {
350 if ch == '.' && !seen_dot {
351 seen_dot = true;
352 end = offset + 1;
353 self.bump();
354 continue;
355 }
356 if !ch.is_ascii_digit() {
357 break;
358 }
359 end = offset + ch.len_utf8();
360 self.bump();
361 }
362 let lexeme = &self.source[start..end];
363 let value = lexeme.parse::<f64>().map_err(|_| LexError::InvalidNumber {
364 lexeme: lexeme.to_string(),
365 offset: start,
366 })?;
367 Ok(Token {
368 kind: TokenKind::Number(value),
369 span: Span { start, end },
370 })
371 }
372
373 fn skip_comment(&mut self) {
374 while let Some((_, ch)) = self.bump() {
375 if ch == '\n' {
376 break;
377 }
378 }
379 }
380
381 fn bump(&mut self) -> Option<(usize, char)> {
382 self.chars.next()
383 }
384
385 fn peek(&mut self) -> Option<(usize, char)> {
386 self.chars.peek().copied()
387 }
388
389 fn peek_second(&self) -> Option<char> {
390 let mut chars = self.chars.clone();
391 chars.next()?;
392 chars.next().map(|(_, ch)| ch)
393 }
394
395 fn starts_with_at(&self, offset: usize, needle: &str) -> bool {
396 self.source[offset..].starts_with(needle)
397 }
398
399 fn consume_until_byte(&mut self, end: usize) {
400 while let Some((offset, _)) = self.peek() {
401 if offset >= end {
402 break;
403 }
404 self.bump();
405 }
406 }
407
408 fn consume_if(&mut self, expected: char) -> bool {
409 match self.peek() {
410 Some((_, ch)) if ch == expected => {
411 self.bump();
412 true
413 }
414 _ => false,
415 }
416 }
417}
418
/// Reports whether `ch` may begin an identifier: `_` or an ASCII letter.
fn is_ident_start(ch: char) -> bool {
    matches!(ch, '_' | 'a'..='z' | 'A'..='Z')
}
422
423fn is_ident_continue(ch: char) -> bool {
424 is_ident_start(ch) || ch.is_ascii_digit()
425}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Drops the spans, keeping only each token's kind in source order.
    fn kinds(tokens: Vec<Token>) -> Vec<TokenKind> {
        tokens.into_iter().map(|token| token.kind).collect()
    }

    /// Collects the payload of every string token in source order.
    fn string_values(tokens: Vec<Token>) -> Vec<CompactString> {
        tokens
            .into_iter()
            .filter_map(|token| match token.kind {
                TokenKind::String(value) => Some(value),
                _ => None,
            })
            .collect()
    }

    // One pass over every keyword, literal form, punctuation mark and comment style.
    #[test]
    fn lexes_all_token_classes_and_comments() {
        let tokens = lex(r#"
 # comment
 // comment
 if else for in await cancel submit print call and or not true false null start
 name _x a1 "hi\n\t\"\\\r\q" 12 3.5 { } ( ) [ ] , : @ ? . ! = == != && || | < <= > >= + - * / %
 "#)
        .expect("lexing should succeed");

        let expected = vec![
            TokenKind::If,
            TokenKind::Else,
            TokenKind::For,
            TokenKind::In,
            TokenKind::Await,
            TokenKind::Cancel,
            TokenKind::Submit,
            TokenKind::Print,
            TokenKind::Call,
            TokenKind::And,
            TokenKind::Or,
            TokenKind::Not,
            TokenKind::True,
            TokenKind::False,
            TokenKind::Null,
            TokenKind::Ident("start".into()),
            TokenKind::Ident("name".into()),
            TokenKind::Ident("_x".into()),
            TokenKind::Ident("a1".into()),
            TokenKind::String("hi\n\t\"\\\rq".into()),
            TokenKind::Number(12.0),
            TokenKind::Number(3.5),
            TokenKind::LBrace,
            TokenKind::RBrace,
            TokenKind::LParen,
            TokenKind::RParen,
            TokenKind::LBracket,
            TokenKind::RBracket,
            TokenKind::Comma,
            TokenKind::Colon,
            TokenKind::At,
            TokenKind::Question,
            TokenKind::Dot,
            TokenKind::Bang,
            TokenKind::Equal,
            TokenKind::DoubleEqual,
            TokenKind::BangEqual,
            TokenKind::AndAnd,
            TokenKind::OrOr,
            TokenKind::Pipe,
            TokenKind::Less,
            TokenKind::LessEqual,
            TokenKind::Greater,
            TokenKind::GreaterEqual,
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Eof,
        ];
        assert_eq!(kinds(tokens), expected);
    }

    #[test]
    fn rejects_unexpected_characters() {
        let err = lex("`").expect_err("lexing should fail");
        assert_eq!(err, LexError::UnexpectedChar { ch: '`', offset: 0 });
    }

    // Non-raw triple strings translate escapes; raw ones keep backslashes verbatim.
    #[test]
    fn lexes_multiline_and_raw_multiline_strings() {
        let tokens = lex(r####"
 normal = """line1\n"quoted"
line2"""
 raw = r"""*** Begin Patch
@@
\n { untouched }
*** End Patch"""
 "####)
        .expect("lexing should succeed");

        let expected: Vec<CompactString> = vec![
            "line1\n\"quoted\"\nline2".into(),
            "*** Begin Patch\n@@\n\\n { untouched }\n*** End Patch".into(),
        ];
        assert_eq!(string_values(tokens), expected);
    }

    #[test]
    fn lexes_raw_triple_single_quoted_strings() {
        let tokens = lex(r####"
 script = r'''python3 - <<'PY'
print("""double quotes are preserved""")
\n { braces stay raw }
PY'''
 "####)
        .expect("lexing should succeed");

        let expected: Vec<CompactString> = vec![
            "python3 - <<'PY'\nprint(\"\"\"double quotes are preserved\"\"\")\n\\n { braces stay raw }\nPY"
                .into(),
        ];
        assert_eq!(string_values(tokens), expected);
    }

    #[test]
    fn lexes_label_annotation_text_inside_strings_as_strings() {
        let tokens = lex(r####"
 regular = "@label(title: \"plain\")"
 multiline = """@label(title: "plain")
finish null"""
 raw = r'''@label(title: "plain")
@label(title: "still plain") finish null'''
 "####)
        .expect("lexing should succeed");

        let has_at_token = tokens.iter().any(|token| token.kind == TokenKind::At);
        assert!(
            !has_at_token,
            "`@` inside strings must not lex as annotation syntax"
        );

        let expected: Vec<CompactString> = vec![
            "@label(title: \"plain\")".into(),
            "@label(title: \"plain\")\nfinish null".into(),
            "@label(title: \"plain\")\n@label(title: \"still plain\") finish null".into(),
        ];
        assert_eq!(string_values(tokens), expected);
    }

    // A single `/` is division; only `//` opens a comment.
    #[test]
    fn lexes_double_slash_comments_without_breaking_division() {
        let tokens = lex(r#"
 value = 6 / 2
 // trailing comment
 submit value
 "#)
        .expect("lexing should succeed");

        let expected = vec![
            TokenKind::Ident("value".into()),
            TokenKind::Equal,
            TokenKind::Number(6.0),
            TokenKind::Slash,
            TokenKind::Number(2.0),
            TokenKind::Submit,
            TokenKind::Ident("value".into()),
            TokenKind::Eof,
        ];
        assert_eq!(kinds(tokens), expected);
    }

    // Every literal form reports the offset where the literal started.
    #[test]
    fn rejects_unterminated_strings() {
        for source in ["\"abc", "\"abc\\", "\"\"\"abc", "r\"\"\"abc", "r'''abc"] {
            let err = lex(source).expect_err("lexing should fail");
            assert_eq!(err, LexError::UnterminatedString { offset: 0 });
        }
    }

    // `lex` never calls `number` on a non-digit, so drive the method directly.
    #[test]
    fn internal_number_error_path_is_covered() {
        let source = ".";
        let mut lexer = Lexer {
            source,
            chars: source.char_indices().peekable(),
        };
        let err = lexer.number().expect_err("number parsing should fail");
        let expected = LexError::InvalidNumber {
            lexeme: ".".to_string(),
            offset: 0,
        };
        assert_eq!(err, expected);
    }

    #[test]
    fn identifier_helpers_cover_true_and_false_cases() {
        assert!(is_ident_start('_'));
        assert!(is_ident_start('a'));
        assert!(!is_ident_start('1'));

        assert!(is_ident_continue('9'));
        assert!(is_ident_continue('_'));
        assert!(!is_ident_continue('-'));
    }
}