1use std::borrow::Cow;
9
10use styx_tokenizer::{Span, Token, TokenKind, Tokenizer};
11
12use crate::events::ScalarKind;
13
/// A parser-level token produced by [`Lexer`].
///
/// Lexemes sit one level above raw tokenizer tokens: whitespace is skipped,
/// quoted-scalar escapes are decoded, heredocs are assembled into a single
/// scalar, and `key>` attribute keys are fused from two adjacent tokens.
#[derive(Debug, Clone, PartialEq)]
pub enum Lexeme<'src> {
    /// A scalar value (bare word, quoted string, raw string, or heredoc).
    Scalar {
        span: Span,
        /// Decoded text; owned only when escape processing or heredoc
        /// assembly had to allocate, otherwise borrowed from the source.
        value: Cow<'src, str>,
        kind: ScalarKind,
    },

    /// The unit value, written as a lone `@`.
    Unit { span: Span },

    /// A tag such as `@name`, optionally followed directly by a payload.
    Tag {
        span: Span,
        /// Tag name without the leading `@`.
        name: &'src str,
        /// True when a payload token (`{`, `(`, quoted/raw scalar, `@`,
        /// or another tag) follows immediately with no whitespace.
        has_payload: bool,
    },

    /// `{` opening an object.
    ObjectStart { span: Span },

    /// `}` closing an object.
    ObjectEnd { span: Span },

    /// `(` opening a sequence.
    SeqStart { span: Span },

    /// `)` closing a sequence.
    SeqEnd { span: Span },

    /// An attribute key, i.e. `key>` with no surrounding whitespace.
    AttrKey {
        /// Span covering both the key and the trailing `>`.
        span: Span,
        /// Span of just the key text.
        key_span: Span,
        key: &'src str,
    },

    /// `,` separator.
    Comma { span: Span },

    /// A line break; newlines are significant and passed through.
    Newline { span: Span },

    /// A line comment; `text` is the raw token text.
    Comment { span: Span, text: &'src str },

    /// A documentation comment; `text` is the raw token text.
    DocComment { span: Span, text: &'src str },

    /// End of input.
    Eof,

    /// A lexical error with a static diagnostic message.
    Error { span: Span, message: &'static str },
}
76
impl Lexeme<'_> {
    /// Returns the source span of this lexeme.
    ///
    /// `Eof` carries no position of its own, so it reports an empty span
    /// at offset 0.
    pub fn span(&self) -> Span {
        match self {
            Lexeme::Scalar { span, .. }
            | Lexeme::Unit { span }
            | Lexeme::Tag { span, .. }
            | Lexeme::ObjectStart { span }
            | Lexeme::ObjectEnd { span }
            | Lexeme::SeqStart { span }
            | Lexeme::SeqEnd { span }
            | Lexeme::AttrKey { span, .. }
            | Lexeme::Comma { span }
            | Lexeme::Newline { span }
            | Lexeme::Comment { span, .. }
            | Lexeme::DocComment { span, .. }
            | Lexeme::Error { span, .. } => *span,
            Lexeme::Eof => Span::new(0, 0),
        }
    }
}
99
/// Converts the raw token stream from [`Tokenizer`] into [`Lexeme`]s.
///
/// Maintains a one-token lookahead buffer so adjacency between tokens
/// (tag payloads, `key>` attributes) can be checked without consuming.
#[derive(Clone)]
pub struct Lexer<'src> {
    tokenizer: Tokenizer<'src>,
    /// Lookahead buffer filled by `peek_token`, drained by `next_token`.
    peeked: Option<Token<'src>>,
}
107
impl<'src> Lexer<'src> {
    /// Creates a lexer over `source`.
    pub fn new(source: &'src str) -> Self {
        Self {
            tokenizer: Tokenizer::new(source),
            peeked: None,
        }
    }

    /// Returns a reference to the next raw token without consuming it,
    /// filling the lookahead buffer on demand.
    fn peek_token(&mut self) -> &Token<'src> {
        if self.peeked.is_none() {
            self.peeked = Some(self.tokenizer.next_token());
        }
        self.peeked.as_ref().unwrap()
    }

    /// Consumes and returns the next raw token, preferring the buffered
    /// lookahead token if one is present.
    fn next_token(&mut self) -> Token<'src> {
        self.peeked
            .take()
            .unwrap_or_else(|| self.tokenizer.next_token())
    }

    /// Produces the next lexeme from the token stream.
    ///
    /// Whitespace tokens are skipped (newlines are distinct tokens and are
    /// preserved). Malformed input yields `Lexeme::Error` rather than
    /// panicking; end of input yields `Lexeme::Eof`.
    pub fn next_lexeme(&mut self) -> Lexeme<'src> {
        // Skip whitespace tokens; they only separate lexemes.
        loop {
            let tok = self.peek_token();
            if tok.kind == TokenKind::Whitespace {
                self.next_token();
            } else {
                break;
            }
        }

        let tok = self.next_token();

        match tok.kind {
            TokenKind::Eof => Lexeme::Eof,

            TokenKind::LBrace => Lexeme::ObjectStart { span: tok.span },
            TokenKind::RBrace => Lexeme::ObjectEnd { span: tok.span },
            TokenKind::LParen => Lexeme::SeqStart { span: tok.span },
            TokenKind::RParen => Lexeme::SeqEnd { span: tok.span },
            TokenKind::Comma => Lexeme::Comma { span: tok.span },
            // A `>` reaching this arm was not directly attached to a
            // preceding bare scalar (that case is fused into `AttrKey`
            // in the `BareScalar` arm below), so it is an error here.
            TokenKind::Gt => {
                Lexeme::Error {
                    span: tok.span,
                    message: "unexpected `>` (attribute syntax requires no spaces: key>value)",
                }
            }
            TokenKind::Newline => Lexeme::Newline { span: tok.span },

            TokenKind::LineComment => Lexeme::Comment {
                span: tok.span,
                text: tok.text,
            },
            TokenKind::DocComment => Lexeme::DocComment {
                span: tok.span,
                text: tok.text,
            },

            // A lone `@` is the unit value — unless a bare scalar is glued
            // directly onto it (e.g. `@123`), which is an invalid tag name.
            TokenKind::At => {
                let next = self.peek_token();
                if next.span.start == tok.span.end && next.kind == TokenKind::BareScalar {
                    let bad_tok = self.next_token();
                    return Lexeme::Error {
                        span: Span::new(tok.span.start, bad_tok.span.end),
                        message: "invalid tag name",
                    };
                }
                Lexeme::Unit { span: tok.span }
            }

            TokenKind::Tag => {
                // Drop the leading `@` from the token text.
                let name = &tok.text[1..];

                // A payload counts only when it is directly adjacent (no
                // whitespace) and is an object, sequence, quoted or raw
                // scalar, unit, or another tag.
                let payload_tok = self.peek_token();
                let is_adjacent = payload_tok.span.start == tok.span.end;
                let is_valid_payload = matches!(
                    payload_tok.kind,
                    TokenKind::LBrace
                        | TokenKind::LParen
                        | TokenKind::QuotedScalar
                        | TokenKind::RawScalar
                        | TokenKind::At
                        | TokenKind::Tag
                );

                // A bare scalar glued onto the tag token is treated as a
                // malformed tag name spanning both tokens.
                if is_adjacent && !is_valid_payload && payload_tok.kind == TokenKind::BareScalar {
                    let bad_tok = self.next_token();
                    return Lexeme::Error {
                        span: Span::new(tok.span.start, bad_tok.span.end),
                        message: "invalid tag name",
                    };
                }

                Lexeme::Tag {
                    span: tok.span,
                    name,
                    has_payload: is_adjacent && is_valid_payload,
                }
            }

            TokenKind::BareScalar => {
                // `key>` with the `>` directly attached is an attribute key.
                let next = self.peek_token();
                let is_attr = next.kind == TokenKind::Gt && next.span.start == tok.span.end;
                let gt_end = next.span.end;
                if is_attr {
                    // Consume the `>` and inspect what follows it.
                    self.next_token();
                    let value_tok = self.peek_token();
                    let gt_span = Span::new(gt_end - 1, gt_end);
                    if value_tok.kind == TokenKind::Newline || value_tok.kind == TokenKind::Eof {
                        return Lexeme::Error {
                            span: gt_span,
                            message: "expected a value",
                        };
                    }
                    if value_tok.kind == TokenKind::Whitespace {
                        return Lexeme::Error {
                            span: gt_span,
                            message: "whitespace after `>` in attribute (use key>value with no spaces)",
                        };
                    }

                    return Lexeme::AttrKey {
                        span: Span::new(tok.span.start, gt_end),
                        key_span: tok.span,
                        key: tok.text,
                    };
                }

                Lexeme::Scalar {
                    span: tok.span,
                    value: Cow::Borrowed(tok.text),
                    kind: ScalarKind::Bare,
                }
            }

            TokenKind::QuotedScalar => {
                // Strip the surrounding quotes, then decode escapes; a bad
                // escape turns the whole scalar into an error lexeme.
                let inner = &tok.text[1..tok.text.len() - 1];
                match process_escapes(inner) {
                    Ok(value) => Lexeme::Scalar {
                        span: tok.span,
                        value,
                        kind: ScalarKind::Quoted,
                    },
                    Err(msg) => Lexeme::Error {
                        span: tok.span,
                        message: msg,
                    },
                }
            }

            TokenKind::RawScalar => {
                // Raw scalars look like `r#"..."#` with a matching number of
                // `#`s on each side; slice out the inner content.
                let text = tok.text;
                let hash_count = text[1..].chars().take_while(|&c| c == '#').count();
                let start = 1 + hash_count + 1; // skip `r`, hashes, opening `"`
                let end = text.len() - hash_count - 1; // drop closing `"` and hashes
                let content = &text[start..end];

                Lexeme::Scalar {
                    span: tok.span,
                    value: Cow::Borrowed(content),
                    kind: ScalarKind::Raw,
                }
            }

            TokenKind::HeredocStart => {
                // Accumulate content tokens until the end marker, then
                // dedent by the closing marker's indentation.
                let start_span = tok.span;
                let mut content = String::new();
                let end_span;
                let mut closing_indent = 0usize;

                loop {
                    // The tokenizer exposes the closing marker's indent once
                    // it has seen it; keep the most recent value.
                    if let Some(indent) = self.tokenizer.heredoc_closing_indent() {
                        closing_indent = indent;
                    }

                    let next = self.next_token();
                    match next.kind {
                        TokenKind::HeredocContent => {
                            content.push_str(next.text);
                        }
                        TokenKind::HeredocEnd => {
                            end_span = next.span;
                            break;
                        }
                        TokenKind::Eof => {
                            return Lexeme::Error {
                                span: start_span,
                                message: "unterminated heredoc",
                            };
                        }
                        _ => {
                            return Lexeme::Error {
                                span: next.span,
                                message: "unexpected token in heredoc",
                            };
                        }
                    }
                }

                if closing_indent > 0 {
                    content = dedent_heredoc(&content, closing_indent);
                }

                // The heredoc spans from the opening marker through the
                // closing marker.
                Lexeme::Scalar {
                    span: Span::new(start_span.start, end_span.end),
                    value: Cow::Owned(content),
                    kind: ScalarKind::Heredoc,
                }
            }

            // These kinds are consumed inside the `HeredocStart` arm above;
            // seeing one here means the stream is out of sync.
            TokenKind::HeredocContent | TokenKind::HeredocEnd => {
                Lexeme::Error {
                    span: tok.span,
                    message: "unexpected heredoc token",
                }
            }

            TokenKind::Whitespace => {
                unreachable!("whitespace should be skipped")
            }

            TokenKind::Error => Lexeme::Error {
                span: tok.span,
                message: "tokenizer error",
            },
        }
    }
}
368
369impl<'src> Iterator for Lexer<'src> {
370 type Item = Lexeme<'src>;
371
372 fn next(&mut self) -> Option<Self::Item> {
373 let lexeme = self.next_lexeme();
374 if matches!(lexeme, Lexeme::Eof) {
375 None
376 } else {
377 Some(lexeme)
378 }
379 }
380}
381
/// Removes up to `indent_len` leading spaces/tabs from every line of
/// `content`, mirroring the indentation of the heredoc's closing marker.
///
/// Stripping stops early on the first non-blank character, so lines
/// indented less than `indent_len` lose only what they have. Counting is
/// per character; spaces and tabs are single bytes, so the count doubles
/// as a byte offset.
fn dedent_heredoc(content: &str, indent_len: usize) -> String {
    let dedented: Vec<&str> = content
        .split('\n')
        .map(|line| {
            // Number of leading space/tab chars to drop, capped at indent_len.
            let strip = line
                .char_indices()
                .take(indent_len)
                .take_while(|&(_, ch)| ch == ' ' || ch == '\t')
                .count();
            &line[strip..]
        })
        .collect();
    dedented.join("\n")
}
411
/// Decodes backslash escapes in a quoted-scalar body.
///
/// Returns `Cow::Borrowed` when the input contains no backslash at all,
/// and an owned string otherwise. Recognized escapes: `\\`, `\"`, `\n`,
/// `\r`, `\t`, `\u{HEX…}` (braced, any digit count) and `\uHHHH`
/// (exactly four hex digits). Anything else is an error.
fn process_escapes(s: &str) -> Result<Cow<'_, str>, &'static str> {
    // Fast path: no escapes, hand back a borrow.
    if !s.contains('\\') {
        return Ok(Cow::Borrowed(s));
    }

    /// Parses the part after `\u`: either `{HEX…}` or four hex digits.
    fn parse_unicode(
        chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
    ) -> Result<char, &'static str> {
        let hex = match chars.peek() {
            Some('{') => {
                chars.next(); // consume `{`
                let mut digits = String::new();
                loop {
                    match chars.next() {
                        Some('}') => break,
                        Some(d) if d.is_ascii_hexdigit() => digits.push(d),
                        _ => return Err("invalid unicode escape"),
                    }
                }
                digits
            }
            Some(_) => {
                let mut digits = String::with_capacity(4);
                for _ in 0..4 {
                    match chars.next() {
                        Some(d) if d.is_ascii_hexdigit() => digits.push(d),
                        _ => return Err("invalid unicode escape"),
                    }
                }
                digits
            }
            None => return Err("invalid unicode escape"),
        };
        let code = u32::from_str_radix(&hex, 16).map_err(|_| "invalid unicode escape")?;
        char::from_u32(code).ok_or("invalid unicode code point")
    }

    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }
        let decoded = match chars.next() {
            Some('\\') => '\\',
            Some('"') => '"',
            Some('n') => '\n',
            Some('r') => '\r',
            Some('t') => '\t',
            Some('u') => parse_unicode(&mut chars)?,
            Some(_) => return Err("invalid escape sequence"),
            None => return Err("trailing backslash"),
        };
        out.push(decoded);
    }

    Ok(Cow::Owned(out))
}
476
#[cfg(test)]
mod tests {
    //! Unit tests covering escape processing and the lexeme stream for
    //! each construct: units, tags (with/without payloads), scalars,
    //! structural tokens, newlines, and attribute keys.
    use super::*;

    #[test]
    fn test_process_escapes_double_backslash() {
        let result = process_escapes(r"path\\to\\file").unwrap();
        assert_eq!(result, r"path\to\file");
    }

    /// Collects every lexeme from `source` (stops before `Eof`).
    fn lex(source: &str) -> Vec<Lexeme<'_>> {
        Lexer::new(source).collect()
    }

    #[test]
    fn test_unit() {
        let lexemes = lex("@");
        assert!(matches!(&lexemes[0], Lexeme::Unit { .. }));
    }

    #[test]
    fn test_tag_no_payload() {
        let lexemes = lex("@foo");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "foo",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_tag_with_object_payload() {
        let lexemes = lex("@tag{}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
        assert!(matches!(&lexemes[2], Lexeme::ObjectEnd { .. }));
    }

    // Whitespace between tag and payload breaks adjacency.
    #[test]
    fn test_tag_with_space_before_object() {
        let lexemes = lex("@tag {}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_bare_scalar() {
        let lexemes = lex("hello");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Scalar {
                kind: ScalarKind::Bare,
                ..
            }
        ));
    }

    #[test]
    fn test_quoted_scalar() {
        let lexemes = lex(r#""hello\nworld""#);
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => {
                assert_eq!(value.as_ref(), "hello\nworld");
            }
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_scalar() {
        let lexemes = lex(r##"r#"hello"#"##);
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => {
                assert_eq!(value.as_ref(), "hello");
            }
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_quoted_payload() {
        let lexemes = lex(r#"@env"staging""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "env",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => {
                assert_eq!(value.as_ref(), "staging");
            }
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_sequence_payload() {
        let lexemes = lex("@rgb(255 128 0)");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "rgb",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::SeqStart { .. }));
    }

    #[test]
    fn test_tag_with_unit_payload() {
        let lexemes = lex("@tag@");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::Unit { .. }));
    }

    #[test]
    fn test_tag_with_raw_payload() {
        let lexemes = lex(r##"@tagr#"x"#"##);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => {
                assert_eq!(value.as_ref(), "x");
            }
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_space_before_sequence() {
        let lexemes = lex("@tag (a b)");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_tag_with_space_before_quoted() {
        let lexemes = lex(r#"@tag "value""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    // `@` glued to a bare scalar is rejected as an invalid tag name.
    #[test]
    fn test_at_followed_by_digit() {
        let lexemes = lex("@123");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Error {
                message: "invalid tag name",
                ..
            }
        ));
    }

    #[test]
    fn test_structural() {
        let lexemes = lex("{x 1}");
        assert!(matches!(&lexemes[0], Lexeme::ObjectStart { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[3], Lexeme::ObjectEnd { .. }));
    }

    #[test]
    fn test_sequence() {
        let lexemes = lex("(a b)");
        assert!(matches!(&lexemes[0], Lexeme::SeqStart { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[3], Lexeme::SeqEnd { .. }));
    }

    #[test]
    fn test_newlines_preserved() {
        let lexemes = lex("a\nb");
        assert!(matches!(&lexemes[0], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Newline { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
    }

    #[test]
    fn test_unicode_escape_braces() {
        let lexemes = lex(r#""\u{1F600}""#);
        match &lexemes[0] {
            Lexeme::Scalar { value, .. } => {
                assert_eq!(value.as_ref(), "😀");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_unicode_escape_4digit() {
        let lexemes = lex(r#""\u0041""#);
        match &lexemes[0] {
            Lexeme::Scalar { value, .. } => {
                assert_eq!(value.as_ref(), "A");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    // Dots do not split a bare scalar.
    #[test]
    fn test_dotted_value_is_scalar() {
        let lexemes = lex("a.b.c");
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Bare,
                ..
            } => {
                assert_eq!(value.as_ref(), "a.b.c");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_attr_key() {
        let lexemes = lex("name>value");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "name", .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
    }

    #[test]
    fn test_attr_key_with_object() {
        let lexemes = lex("opts>{x 1}");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "opts", .. }));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
    }

    #[test]
    fn test_attr_key_with_sequence() {
        let lexemes = lex("tags>(a b)");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "tags", .. }));
        assert!(matches!(&lexemes[1], Lexeme::SeqStart { .. }));
    }

    // A `>` detached from its key by whitespace is an error.
    #[test]
    fn test_standalone_gt_error() {
        let lexemes = lex("x > y");
        assert!(matches!(&lexemes[0], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Error { .. }));
    }

    #[test]
    fn test_attr_whitespace_after_gt_error() {
        let lexemes = lex("name> value");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Error {
                message: "whitespace after `>` in attribute (use key>value with no spaces)",
                ..
            }
        ));
    }
}