1use std::borrow::Cow;
9
10use styx_tokenizer::{Span, Token, TokenKind, Tokenizer};
11
12use crate::events::ScalarKind;
13
#[derive(Debug, Clone, PartialEq)]
pub enum Lexeme<'src> {
    /// A scalar value: bare word, quoted string, raw string, or heredoc.
    Scalar {
        span: Span,
        /// Decoded text. Borrowed from the source where possible; owned
        /// only when escape decoding or heredoc dedenting had to allocate.
        value: Cow<'src, str>,
        kind: ScalarKind,
    },

    /// The unit value, written as a lone `@`.
    Unit { span: Span },

    /// A tag such as `@name` (chained form: `@a/@b`).
    Tag {
        span: Span,
        /// Tag name without the leading `@`; chained tags keep their
        /// interior `/@` separators (e.g. `a/@b`).
        name: &'src str,
        /// True when a payload token (`{`, `(`, quoted/raw scalar, `@`, or
        /// another tag) immediately follows the tag, or the name is chained.
        has_payload: bool,
    },

    /// `{` opening an object.
    ObjectStart { span: Span },

    /// `}` closing an object.
    ObjectEnd { span: Span },

    /// `(` opening a sequence.
    SeqStart { span: Span },

    /// `)` closing a sequence.
    SeqEnd { span: Span },

    /// An attribute key of the form `key>` with no surrounding whitespace.
    AttrKey {
        /// Span covering the key together with the trailing `>`.
        span: Span,
        /// Span of the key text alone.
        key_span: Span,
        key: &'src str,
    },

    /// `,` separator.
    Comma { span: Span },

    /// A line break, preserved so the parser can use it as a separator.
    Newline { span: Span },

    /// A `//`-style line comment (text as produced by the tokenizer).
    Comment { span: Span, text: &'src str },

    /// A doc comment.
    DocComment { span: Span, text: &'src str },

    /// End of input.
    Eof,

    /// A lexing error with a static diagnostic message.
    Error { span: Span, message: &'static str },
}
76
77impl Lexeme<'_> {
78 pub fn span(&self) -> Span {
81 match self {
82 Lexeme::Scalar { span, .. }
83 | Lexeme::Unit { span }
84 | Lexeme::Tag { span, .. }
85 | Lexeme::ObjectStart { span }
86 | Lexeme::ObjectEnd { span }
87 | Lexeme::SeqStart { span }
88 | Lexeme::SeqEnd { span }
89 | Lexeme::AttrKey { span, .. }
90 | Lexeme::Comma { span }
91 | Lexeme::Newline { span }
92 | Lexeme::Comment { span, .. }
93 | Lexeme::DocComment { span, .. }
94 | Lexeme::Error { span, .. } => *span,
95 Lexeme::Eof => Span::new(0, 0),
96 }
97 }
98}
99
/// Adapts the raw token stream from a [`Tokenizer`] into [`Lexeme`]s:
/// skips whitespace and assembles multi-token constructs (attribute keys,
/// tag payload detection, heredocs).
#[derive(Clone)]
pub struct Lexer<'src> {
    tokenizer: Tokenizer<'src>,
    // One-token lookahead buffer filled by `peek_token`, drained by
    // `next_token`.
    peeked: Option<Token<'src>>,
}
107
impl<'src> Lexer<'src> {
    /// Creates a lexer over `source`.
    pub fn new(source: &'src str) -> Self {
        Self {
            tokenizer: Tokenizer::new(source),
            peeked: None,
        }
    }

    /// Returns a reference to the next token without consuming it,
    /// buffering it in `self.peeked` on first call.
    fn peek_token(&mut self) -> &Token<'src> {
        if self.peeked.is_none() {
            self.peeked = Some(self.tokenizer.next_token());
        }
        self.peeked.as_ref().unwrap()
    }

    /// Consumes and returns the next token, draining the peek buffer first.
    fn next_token(&mut self) -> Token<'src> {
        self.peeked
            .take()
            .unwrap_or_else(|| self.tokenizer.next_token())
    }

    /// Produces the next lexeme.
    ///
    /// Whitespace tokens are skipped up front; newline tokens are NOT
    /// skipped and come through as `Lexeme::Newline`. Several constructs
    /// span more than one token and are resolved here using one-token
    /// lookahead plus byte-adjacency checks on spans.
    pub fn next_lexeme(&mut self) -> Lexeme<'src> {
        // Discard leading whitespace tokens.
        loop {
            let tok = self.peek_token();
            if tok.kind == TokenKind::Whitespace {
                self.next_token();
            } else {
                break;
            }
        }

        let tok = self.next_token();

        match tok.kind {
            TokenKind::Eof => Lexeme::Eof,

            TokenKind::LBrace => Lexeme::ObjectStart { span: tok.span },
            TokenKind::RBrace => Lexeme::ObjectEnd { span: tok.span },
            TokenKind::LParen => Lexeme::SeqStart { span: tok.span },
            TokenKind::RParen => Lexeme::SeqEnd { span: tok.span },
            TokenKind::Comma => Lexeme::Comma { span: tok.span },
            TokenKind::Gt => {
                // A `>` glued to a preceding bare key is consumed inside
                // the BareScalar arm below; one that reaches this arm was
                // free-standing (e.g. `x > y`) and is an error.
                Lexeme::Error {
                    span: tok.span,
                    message: "unexpected `>` (attribute syntax requires no spaces: key>value)",
                }
            }
            TokenKind::Newline => Lexeme::Newline { span: tok.span },

            TokenKind::LineComment => Lexeme::Comment {
                span: tok.span,
                text: tok.text,
            },
            TokenKind::DocComment => Lexeme::DocComment {
                span: tok.span,
                text: tok.text,
            },

            TokenKind::At => {
                // `@` immediately followed (no gap) by a bare scalar —
                // e.g. `@123` — is a malformed tag name; a lone `@` is
                // the unit value.
                let next = self.peek_token();
                if next.span.start == tok.span.end && next.kind == TokenKind::BareScalar {
                    let bad_tok = self.next_token();
                    return Lexeme::Error {
                        span: Span::new(tok.span.start, bad_tok.span.end),
                        message: "invalid tag name",
                    };
                }
                Lexeme::Unit { span: tok.span }
            }

            TokenKind::Tag => {
                // Drop the leading `@`; chained tags (`@a/@b`) arrive as
                // one token whose name contains `/@`.
                let name = &tok.text[1..];
                let is_chained = name.contains("/@");

                // A payload counts only when it starts at the exact byte
                // where the tag ends (no whitespace in between).
                let payload_tok = self.peek_token();
                let is_adjacent = payload_tok.span.start == tok.span.end;
                let is_valid_payload = matches!(
                    payload_tok.kind,
                    TokenKind::LBrace
                        | TokenKind::LParen
                        | TokenKind::QuotedScalar
                        | TokenKind::RawScalar
                        | TokenKind::At
                        | TokenKind::Tag
                );

                // A bare scalar glued straight onto the tag means the
                // tokenizer had to split an invalid tag name.
                if is_adjacent && !is_valid_payload && payload_tok.kind == TokenKind::BareScalar {
                    let bad_tok = self.next_token();
                    return Lexeme::Error {
                        span: Span::new(tok.span.start, bad_tok.span.end),
                        message: "invalid tag name",
                    };
                }

                Lexeme::Tag {
                    span: tok.span,
                    name,
                    // Chained tags always report a payload (the chain
                    // itself); otherwise require an adjacent payload token.
                    has_payload: is_chained || (is_adjacent && is_valid_payload),
                }
            }

            TokenKind::BareScalar => {
                // `key>` (with `>` glued to the key) starts an attribute.
                let next = self.peek_token();
                let is_attr = next.kind == TokenKind::Gt && next.span.start == tok.span.end;
                let gt_end = next.span.end;
                if is_attr {
                    // Consume the `>`, then look at what follows it.
                    self.next_token();
                    let value_tok = self.peek_token();
                    // Span of the `>` itself (`>` is a single byte).
                    let gt_span = Span::new(gt_end - 1, gt_end);
                    if value_tok.kind == TokenKind::Newline || value_tok.kind == TokenKind::Eof {
                        return Lexeme::Error {
                            span: gt_span,
                            message: "expected a value",
                        };
                    }
                    if value_tok.kind == TokenKind::Whitespace {
                        return Lexeme::Error {
                            span: gt_span,
                            message: "whitespace after `>` in attribute (use key>value with no spaces)",
                        };
                    }

                    return Lexeme::AttrKey {
                        span: Span::new(tok.span.start, gt_end),
                        key_span: tok.span,
                        key: tok.text,
                    };
                }

                Lexeme::Scalar {
                    span: tok.span,
                    value: Cow::Borrowed(tok.text),
                    kind: ScalarKind::Bare,
                }
            }

            TokenKind::QuotedScalar => {
                // Strip the surrounding quotes, then decode escapes; a bad
                // escape turns the whole scalar into an error lexeme.
                let inner = &tok.text[1..tok.text.len() - 1];
                match process_escapes(inner) {
                    Ok(value) => Lexeme::Scalar {
                        span: tok.span,
                        value,
                        kind: ScalarKind::Quoted,
                    },
                    Err(msg) => Lexeme::Error {
                        span: tok.span,
                        message: msg,
                    },
                }
            }

            TokenKind::RawScalar => {
                // Token shape is r#…#"content"#…# — count the `#`s after
                // the leading `r`, then slice between the delimiters.
                let text = tok.text;
                let hash_count = text[1..].chars().take_while(|&c| c == '#').count();
                let start = 1 + hash_count + 1; // skip `r`, hashes, opening `"`
                let end = text.len() - hash_count - 1; // drop closing `"` and hashes
                let content = &text[start..end];

                Lexeme::Scalar {
                    span: tok.span,
                    value: Cow::Borrowed(content),
                    kind: ScalarKind::Raw,
                }
            }

            TokenKind::HeredocStart => {
                let start_span = tok.span;
                let mut content = String::new();
                let end_span;
                let mut closing_indent = 0usize;

                // Accumulate content tokens until the terminator; the
                // tokenizer exposes the closing delimiter's indentation,
                // which we re-query each round and apply afterwards.
                loop {
                    if let Some(indent) = self.tokenizer.heredoc_closing_indent() {
                        closing_indent = indent;
                    }

                    let next = self.next_token();
                    match next.kind {
                        TokenKind::HeredocContent => {
                            content.push_str(next.text);
                        }
                        TokenKind::HeredocEnd => {
                            end_span = next.span;
                            break;
                        }
                        TokenKind::Eof => {
                            return Lexeme::Error {
                                span: start_span,
                                message: "unterminated heredoc",
                            };
                        }
                        _ => {
                            return Lexeme::Error {
                                span: next.span,
                                message: "unexpected token in heredoc",
                            };
                        }
                    }
                }

                // Strip the closing delimiter's indentation from every line.
                if closing_indent > 0 {
                    content = dedent_heredoc(&content, closing_indent);
                }

                Lexeme::Scalar {
                    span: Span::new(start_span.start, end_span.end),
                    value: Cow::Owned(content),
                    kind: ScalarKind::Heredoc,
                }
            }

            TokenKind::HeredocContent | TokenKind::HeredocEnd => {
                // These are only valid inside the HeredocStart loop above.
                Lexeme::Error {
                    span: tok.span,
                    message: "unexpected heredoc token",
                }
            }

            TokenKind::Whitespace => {
                // The skip loop at the top of this function consumed all
                // whitespace before `tok` was taken.
                unreachable!("whitespace should be skipped")
            }

            TokenKind::Error => Lexeme::Error {
                span: tok.span,
                message: "tokenizer error",
            },
        }
    }
}
369
370impl<'src> Iterator for Lexer<'src> {
371 type Item = Lexeme<'src>;
372
373 fn next(&mut self) -> Option<Self::Item> {
374 let lexeme = self.next_lexeme();
375 if matches!(lexeme, Lexeme::Eof) {
376 None
377 } else {
378 Some(lexeme)
379 }
380 }
381}
382
/// Strips up to `indent_len` leading blanks (spaces or tabs) from every
/// line of `content`, mirroring the indentation of the heredoc's closing
/// delimiter. Stripping stops early at the first non-blank character, so
/// lines indented less than `indent_len` lose only what they have.
fn dedent_heredoc(content: &str, indent_len: usize) -> String {
    let mut result = String::with_capacity(content.len());
    for (i, line) in content.split('\n').enumerate() {
        if i > 0 {
            result.push('\n');
        }
        // Both ' ' and '\t' are single bytes, so byte-wise prefix
        // stripping matches a per-character count exactly.
        let mut rest = line;
        for _ in 0..indent_len {
            match rest.strip_prefix(' ').or_else(|| rest.strip_prefix('\t')) {
                Some(trimmed) => rest = trimmed,
                None => break,
            }
        }
        result.push_str(rest);
    }
    result
}
412
/// Decodes backslash escapes in the body of a quoted scalar.
///
/// Returns a borrowed `Cow` when the input contains no backslash (fast
/// path, no allocation) and an owned string otherwise. Recognized escapes:
/// `\\`, `\"`, `\n`, `\r`, `\t`, `\u{HEX…}`, and `\uHHHH`.
fn process_escapes(s: &str) -> Result<Cow<'_, str>, &'static str> {
    if !s.contains('\\') {
        // Nothing to decode — hand the input back as-is.
        return Ok(Cow::Borrowed(s));
    }

    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }

        let decoded = match chars.next() {
            None => return Err("trailing backslash"),
            Some('\\') => '\\',
            Some('"') => '"',
            Some('n') => '\n',
            Some('r') => '\r',
            Some('t') => '\t',
            Some('u') => decode_unicode_escape(&mut chars)?,
            Some(_) => return Err("invalid escape sequence"),
        };
        out.push(decoded);
    }

    Ok(Cow::Owned(out))
}

/// Decodes the remainder of a `\u` escape after the `u` has been consumed:
/// either `{HEX…}` with any run of hex digits, or exactly four hex digits.
fn decode_unicode_escape(
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
) -> Result<char, &'static str> {
    let mut hex = String::with_capacity(6);
    match chars.peek() {
        Some('{') => {
            chars.next(); // consume the `{`
            loop {
                match chars.next() {
                    Some('}') => break,
                    Some(c) if c.is_ascii_hexdigit() => hex.push(c),
                    _ => return Err("invalid unicode escape"),
                }
            }
        }
        Some(_) => {
            for _ in 0..4 {
                match chars.next() {
                    Some(c) if c.is_ascii_hexdigit() => hex.push(c),
                    _ => return Err("invalid unicode escape"),
                }
            }
        }
        None => return Err("invalid unicode escape"),
    }
    // An empty `\u{}` falls through here and fails the radix parse.
    let code = u32::from_str_radix(&hex, 16).map_err(|_| "invalid unicode escape")?;
    char::from_u32(code).ok_or("invalid unicode code point")
}
477
#[cfg(test)]
mod tests {
    use super::*;

    // `process_escapes` is pure, so it is exercised directly; everything
    // else goes through the `lex` helper below.
    #[test]
    fn test_process_escapes_double_backslash() {
        let result = process_escapes(r"path\\to\\file").unwrap();
        assert_eq!(result, r"path\to\file");
    }

    // Runs the lexer to completion and collects every lexeme (`Eof` is
    // excluded because the Iterator impl stops there).
    fn lex(source: &str) -> Vec<Lexeme<'_>> {
        Lexer::new(source).collect()
    }

    #[test]
    fn test_unit() {
        let lexemes = lex("@");
        assert!(matches!(&lexemes[0], Lexeme::Unit { .. }));
    }

    #[test]
    fn test_tag_no_payload() {
        let lexemes = lex("@foo");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "foo",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_tag_with_object_payload() {
        let lexemes = lex("@tag{}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
        assert!(matches!(&lexemes[2], Lexeme::ObjectEnd { .. }));
    }

    // Payload detection requires byte adjacency: a space breaks it.
    #[test]
    fn test_tag_with_space_before_object() {
        let lexemes = lex("@tag {}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_bare_scalar() {
        let lexemes = lex("hello");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Scalar {
                kind: ScalarKind::Bare,
                ..
            }
        ));
    }

    #[test]
    fn test_quoted_scalar() {
        let lexemes = lex(r#""hello\nworld""#);
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => {
                assert_eq!(value.as_ref(), "hello\nworld");
            }
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_scalar() {
        let lexemes = lex(r##"r#"hello"#"##);
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => {
                assert_eq!(value.as_ref(), "hello");
            }
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_quoted_payload() {
        let lexemes = lex(r#"@env"staging""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "env",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => {
                assert_eq!(value.as_ref(), "staging");
            }
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_sequence_payload() {
        let lexemes = lex("@rgb(255 128 0)");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "rgb",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::SeqStart { .. }));
    }

    // Chained tags (`a/@b`) always report has_payload, even without a
    // trailing payload token.
    #[test]
    fn test_chained_tag_without_trailing_payload() {
        let lexemes = lex("@must_emit/@discover_end");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "must_emit/@discover_end",
                has_payload: true,
                ..
            }
        ));
    }

    #[test]
    fn test_chained_tag_with_object_payload() {
        let lexemes = lex("@must_emit/@discover_start{executor default}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "must_emit/@discover_start",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
    }

    #[test]
    fn test_three_segment_chained_tag_lexeme() {
        let lexemes = lex("@a/@b/@c");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b/@c",
                has_payload: true,
                ..
            }
        ));
    }

    #[test]
    fn test_chained_tag_with_quoted_leaf_payload() {
        let lexemes = lex(r#"@a/@b"foo""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => assert_eq!(value.as_ref(), "foo"),
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_chained_tag_with_raw_leaf_payload() {
        let lexemes = lex(r##"@a/@br#"foo"#"##);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => assert_eq!(value.as_ref(), "foo"),
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_chained_tag_with_heredoc_leaf_payload() {
        let lexemes = lex("@a/@b<<EOF\nhello\nEOF");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Heredoc,
                ..
            } => assert_eq!(value.as_ref(), "hello\n"),
            other => panic!("expected heredoc scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_unit_payload() {
        let lexemes = lex("@tag@");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::Unit { .. }));
    }

    #[test]
    fn test_tag_with_raw_payload() {
        let lexemes = lex(r##"@tagr#"x"#"##);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => {
                assert_eq!(value.as_ref(), "x");
            }
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_space_before_sequence() {
        let lexemes = lex("@tag (a b)");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_tag_with_space_before_quoted() {
        let lexemes = lex(r#"@tag "value""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    // `@` glued to a bare scalar is rejected as a malformed tag name.
    #[test]
    fn test_at_followed_by_digit() {
        let lexemes = lex("@123");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Error {
                message: "invalid tag name",
                ..
            }
        ));
    }

    #[test]
    fn test_structural() {
        let lexemes = lex("{x 1}");
        assert!(matches!(&lexemes[0], Lexeme::ObjectStart { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[3], Lexeme::ObjectEnd { .. }));
    }

    #[test]
    fn test_sequence() {
        let lexemes = lex("(a b)");
        assert!(matches!(&lexemes[0], Lexeme::SeqStart { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[3], Lexeme::SeqEnd { .. }));
    }

    // Newlines are significant and must survive lexing.
    #[test]
    fn test_newlines_preserved() {
        let lexemes = lex("a\nb");
        assert!(matches!(&lexemes[0], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Newline { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
    }

    #[test]
    fn test_unicode_escape_braces() {
        let lexemes = lex(r#""\u{1F600}""#);
        match &lexemes[0] {
            Lexeme::Scalar { value, .. } => {
                assert_eq!(value.as_ref(), "😀");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_unicode_escape_4digit() {
        let lexemes = lex(r#""\u0041""#);
        match &lexemes[0] {
            Lexeme::Scalar { value, .. } => {
                assert_eq!(value.as_ref(), "A");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_dotted_value_is_scalar() {
        let lexemes = lex("a.b.c");
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Bare,
                ..
            } => {
                assert_eq!(value.as_ref(), "a.b.c");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_attr_key() {
        let lexemes = lex("name>value");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "name", .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
    }

    #[test]
    fn test_attr_key_with_object() {
        let lexemes = lex("opts>{x 1}");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "opts", .. }));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
    }

    #[test]
    fn test_attr_key_with_sequence() {
        let lexemes = lex("tags>(a b)");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "tags", .. }));
        assert!(matches!(&lexemes[1], Lexeme::SeqStart { .. }));
    }

    // A `>` not glued to a key is a standalone error.
    #[test]
    fn test_standalone_gt_error() {
        let lexemes = lex("x > y");
        assert!(matches!(&lexemes[0], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Error { .. }));
    }

    #[test]
    fn test_attr_whitespace_after_gt_error() {
        let lexemes = lex("name> value");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Error {
                message: "whitespace after `>` in attribute (use key>value with no spaces)",
                ..
            }
        ));
    }
}