// styx_parse/lexer.rs
1//! Lexer for Styx - produces lexemes from tokens.
2//!
3//! The Lexer sits between the Tokenizer and Parser:
4//! - Tokenizer → Token (raw: At, BareScalar, LBrace, etc.)
5//! - Lexer → Lexeme (atoms: Scalar, Tag, Unit, structural markers)
6//! - Parser → Events (structure: entries, objects, sequences)
7
8use std::borrow::Cow;
9
10use styx_tokenizer::{Span, Token, TokenKind, Tokenizer};
11
12use crate::events::ScalarKind;
13
/// A lexeme produced by the Lexer from raw tokens.
///
/// Lexemes are the atoms the parser consumes: scalars with their escape
/// processing already done, tags split from their payloads, and structural
/// markers. Spans always refer to byte offsets in the original source.
#[derive(Debug, Clone, PartialEq)]
pub enum Lexeme<'src> {
    /// A scalar value (bare, quoted, raw, or heredoc)
    Scalar {
        span: Span,
        /// Borrowed from the source when no processing was needed
        /// (bare/raw scalars, quoted scalars without escapes);
        /// owned when escapes were decoded or heredoc lines were joined.
        value: Cow<'src, str>,
        kind: ScalarKind,
    },

    /// Unit value: standalone `@`
    Unit { span: Span },

    /// A tag: `@name`
    /// The payload (if any) comes as the next lexeme
    Tag {
        span: Span,
        /// Tag name without the leading `@` (may contain `/@` for chained tags).
        name: &'src str,
        /// True if an immediate payload follows (no whitespace): `@tag{}`, `@tag()`, `@tag"x"`, `@tag@`
        has_payload: bool,
    },

    /// Start of object `{`
    ObjectStart { span: Span },

    /// End of object `}`
    ObjectEnd { span: Span },

    /// Start of sequence `(`
    SeqStart { span: Span },

    /// End of sequence `)`
    SeqEnd { span: Span },

    /// An attribute key `key>` - value follows as next lexeme(s)
    AttrKey {
        /// Span of the full `key>` including the `>`
        span: Span,
        /// Span of just the key (excluding `>`)
        key_span: Span,
        /// The key text
        key: &'src str,
    },

    /// Comma separator
    Comma { span: Span },

    /// Newline (significant for separator detection)
    Newline { span: Span },

    /// Line comment `// ...`
    Comment { span: Span, text: &'src str },

    /// Doc comment `/// ...`
    DocComment { span: Span, text: &'src str },

    /// End of input
    Eof,

    /// Tokenizer error
    Error { span: Span, message: &'static str },
}
76
77impl Lexeme<'_> {
78    /// Get the span of this lexeme.
79    /// Returns a zero-length span at position 0 for Eof.
80    pub fn span(&self) -> Span {
81        match self {
82            Lexeme::Scalar { span, .. }
83            | Lexeme::Unit { span }
84            | Lexeme::Tag { span, .. }
85            | Lexeme::ObjectStart { span }
86            | Lexeme::ObjectEnd { span }
87            | Lexeme::SeqStart { span }
88            | Lexeme::SeqEnd { span }
89            | Lexeme::AttrKey { span, .. }
90            | Lexeme::Comma { span }
91            | Lexeme::Newline { span }
92            | Lexeme::Comment { span, .. }
93            | Lexeme::DocComment { span, .. }
94            | Lexeme::Error { span, .. } => *span,
95            Lexeme::Eof => Span::new(0, 0),
96        }
97    }
98}
99
/// Lexer that produces lexemes from tokens.
///
/// Wraps a `Tokenizer` with single-token lookahead, which is enough to
/// detect adjacency-sensitive constructs (`key>value`, `@tag{payload}`).
#[derive(Clone)]
pub struct Lexer<'src> {
    tokenizer: Tokenizer<'src>,
    /// Peeked token (if any) — filled by `peek_token`, drained by `next_token`.
    peeked: Option<Token<'src>>,
}
107
impl<'src> Lexer<'src> {
    /// Create a new lexer for the given source.
    pub fn new(source: &'src str) -> Self {
        Self {
            tokenizer: Tokenizer::new(source),
            peeked: None,
        }
    }

    /// Peek at the next token without consuming it.
    ///
    /// Pulls one token from the tokenizer on first use and caches it;
    /// repeated peeks return the same cached token.
    fn peek_token(&mut self) -> &Token<'src> {
        if self.peeked.is_none() {
            self.peeked = Some(self.tokenizer.next_token());
        }
        self.peeked.as_ref().unwrap()
    }

    /// Consume and return the next token (the cached peek, if present).
    fn next_token(&mut self) -> Token<'src> {
        self.peeked
            .take()
            .unwrap_or_else(|| self.tokenizer.next_token())
    }

    /// Get the next lexeme.
    ///
    /// Skips horizontal whitespace, then maps the next token (plus at most
    /// one token of lookahead for adjacency checks) onto a `Lexeme`.
    /// Adjacency is decided by comparing byte spans: two tokens touch when
    /// `next.span.start == tok.span.end`.
    pub fn next_lexeme(&mut self) -> Lexeme<'src> {
        // Skip whitespace (but not newlines - those are significant)
        loop {
            let tok = self.peek_token();
            if tok.kind == TokenKind::Whitespace {
                self.next_token();
            } else {
                break;
            }
        }

        let tok = self.next_token();

        match tok.kind {
            TokenKind::Eof => Lexeme::Eof,

            TokenKind::LBrace => Lexeme::ObjectStart { span: tok.span },
            TokenKind::RBrace => Lexeme::ObjectEnd { span: tok.span },
            TokenKind::LParen => Lexeme::SeqStart { span: tok.span },
            TokenKind::RParen => Lexeme::SeqEnd { span: tok.span },
            TokenKind::Comma => Lexeme::Comma { span: tok.span },
            TokenKind::Gt => {
                // Standalone `>` (with whitespace before it) - not valid in Styx
                // Attribute syntax requires no space: `key>value`
                // (the attached form is handled in the BareScalar arm below).
                Lexeme::Error {
                    span: tok.span,
                    message: "unexpected `>` (attribute syntax requires no spaces: key>value)",
                }
            }
            TokenKind::Newline => Lexeme::Newline { span: tok.span },

            TokenKind::LineComment => Lexeme::Comment {
                span: tok.span,
                text: tok.text,
            },
            TokenKind::DocComment => Lexeme::DocComment {
                span: tok.span,
                text: tok.text,
            },

            TokenKind::At => {
                // Check if followed immediately by a bare scalar (invalid tag like @123)
                let next = self.peek_token();
                if next.span.start == tok.span.end && next.kind == TokenKind::BareScalar {
                    // Consume the adjacent token to include it in the error span
                    let bad_tok = self.next_token();
                    return Lexeme::Error {
                        span: Span::new(tok.span.start, bad_tok.span.end),
                        message: "invalid tag name",
                    };
                }
                // Standalone @ = unit
                Lexeme::Unit { span: tok.span }
            }

            TokenKind::Tag => {
                // Tag token includes the @ and name, e.g. "@foo"
                // Extract the name (skip the @)
                let name = &tok.text[1..];
                // Chained tags like `@a/@b` always imply a payload: the
                // trailing segment is itself the payload of the head.
                let is_chained = name.contains("/@");

                // Check if payload follows immediately (no whitespace)
                // Payload can be: { ( " r#" @ or Tag
                let payload_tok = self.peek_token();
                let is_adjacent = payload_tok.span.start == tok.span.end;
                let is_valid_payload = matches!(
                    payload_tok.kind,
                    TokenKind::LBrace
                        | TokenKind::LParen
                        | TokenKind::QuotedScalar
                        | TokenKind::RawScalar
                        | TokenKind::At
                        | TokenKind::Tag
                );

                // If a bare scalar is adjacent (no whitespace), it's an invalid tag name
                // e.g., @org/package where /package is adjacent
                // But structural tokens like ) } , or newlines are fine - they end the tag
                if is_adjacent && !is_valid_payload && payload_tok.kind == TokenKind::BareScalar {
                    // Consume the adjacent token to include it in the error span
                    let bad_tok = self.next_token();
                    return Lexeme::Error {
                        span: Span::new(tok.span.start, bad_tok.span.end),
                        message: "invalid tag name",
                    };
                }

                Lexeme::Tag {
                    span: tok.span,
                    name,
                    has_payload: is_chained || (is_adjacent && is_valid_payload),
                }
            }

            TokenKind::BareScalar => {
                // Check if followed by `>` (attribute syntax)
                let next = self.peek_token();
                let is_attr = next.kind == TokenKind::Gt && next.span.start == tok.span.end;
                // Captured before consuming the peek; only meaningful when is_attr.
                let gt_end = next.span.end;
                if is_attr {
                    // Attribute: key>
                    self.next_token(); // consume `>`

                    // Check that value follows immediately (no whitespace after `>`)
                    let value_tok = self.peek_token();
                    let gt_span = Span::new(gt_end - 1, gt_end);
                    if value_tok.kind == TokenKind::Newline || value_tok.kind == TokenKind::Eof {
                        return Lexeme::Error {
                            span: gt_span,
                            message: "expected a value",
                        };
                    }
                    if value_tok.kind == TokenKind::Whitespace {
                        return Lexeme::Error {
                            span: gt_span,
                            message: "whitespace after `>` in attribute (use key>value with no spaces)",
                        };
                    }

                    return Lexeme::AttrKey {
                        span: Span::new(tok.span.start, gt_end),
                        key_span: tok.span,
                        key: tok.text,
                    };
                }

                Lexeme::Scalar {
                    span: tok.span,
                    value: Cow::Borrowed(tok.text),
                    kind: ScalarKind::Bare,
                }
            }

            TokenKind::QuotedScalar => {
                // Process escape sequences
                let inner = &tok.text[1..tok.text.len() - 1]; // strip quotes
                match process_escapes(inner) {
                    Ok(value) => Lexeme::Scalar {
                        span: tok.span,
                        value,
                        kind: ScalarKind::Quoted,
                    },
                    Err(msg) => Lexeme::Error {
                        span: tok.span,
                        message: msg,
                    },
                }
            }

            TokenKind::RawScalar => {
                // r#"..."# - extract content between quotes; no escape processing.
                let text = tok.text;
                // Count leading #s after 'r'
                let hash_count = text[1..].chars().take_while(|&c| c == '#').count();
                // Content is between r##" and "##
                let start = 1 + hash_count + 1; // r + hashes + quote
                let end = text.len() - hash_count - 1; // quote + hashes
                let content = &text[start..end];

                Lexeme::Scalar {
                    span: tok.span,
                    value: Cow::Borrowed(content),
                    kind: ScalarKind::Raw,
                }
            }

            TokenKind::HeredocStart => {
                // Collect heredoc content by draining Content tokens until End.
                let start_span = tok.span;
                let mut content = String::new();
                let end_span;
                let mut closing_indent = 0usize;

                loop {
                    // Check for closing indent before consuming content token
                    // (it's set after HeredocContent is produced, before HeredocEnd)
                    if let Some(indent) = self.tokenizer.heredoc_closing_indent() {
                        closing_indent = indent;
                    }

                    let next = self.next_token();
                    match next.kind {
                        TokenKind::HeredocContent => {
                            content.push_str(next.text);
                        }
                        TokenKind::HeredocEnd => {
                            end_span = next.span;
                            break;
                        }
                        TokenKind::Eof => {
                            return Lexeme::Error {
                                span: start_span,
                                message: "unterminated heredoc",
                            };
                        }
                        _ => {
                            return Lexeme::Error {
                                span: next.span,
                                message: "unexpected token in heredoc",
                            };
                        }
                    }
                }

                // Apply dedent if closing delimiter was indented
                if closing_indent > 0 {
                    content = dedent_heredoc(&content, closing_indent);
                }

                // Span covers the whole heredoc, from `<<TAG` to the closing delimiter.
                Lexeme::Scalar {
                    span: Span::new(start_span.start, end_span.end),
                    value: Cow::Owned(content),
                    kind: ScalarKind::Heredoc,
                }
            }

            TokenKind::HeredocContent | TokenKind::HeredocEnd => {
                // Should not see these outside heredoc context
                Lexeme::Error {
                    span: tok.span,
                    message: "unexpected heredoc token",
                }
            }

            TokenKind::Whitespace => {
                // Should have been skipped above
                unreachable!("whitespace should be skipped")
            }

            TokenKind::Error => Lexeme::Error {
                span: tok.span,
                message: "tokenizer error",
            },
        }
    }
}
369
370impl<'src> Iterator for Lexer<'src> {
371    type Item = Lexeme<'src>;
372
373    fn next(&mut self) -> Option<Self::Item> {
374        let lexeme = self.next_lexeme();
375        if matches!(lexeme, Lexeme::Eof) {
376            None
377        } else {
378            Some(lexeme)
379        }
380    }
381}
382
/// Strip up to `indent_len` whitespace characters from the start of each line.
///
/// Only spaces and tabs count as indentation; stripping stops early at the
/// first non-whitespace character. Line structure is preserved exactly.
fn dedent_heredoc(content: &str, indent_len: usize) -> String {
    let dedented: Vec<&str> = content
        .split('\n')
        .map(|line| {
            // Peel off at most `indent_len` leading spaces/tabs.
            let mut rest = line;
            let mut removed = 0;
            while removed < indent_len {
                match rest.strip_prefix(' ').or_else(|| rest.strip_prefix('\t')) {
                    Some(tail) => {
                        rest = tail;
                        removed += 1;
                    }
                    None => break,
                }
            }
            rest
        })
        .collect();
    dedented.join("\n")
}
412
/// Decode the hex digits of a `\u` escape into a `char`.
///
/// Errors with the same static messages the lexer reports for malformed
/// escapes, so callers can propagate them directly into `Lexeme::Error`.
fn hex_to_char(hex: &str) -> Result<char, &'static str> {
    let code = u32::from_str_radix(hex, 16).map_err(|_| "invalid unicode escape")?;
    char::from_u32(code).ok_or("invalid unicode code point")
}

/// Process escape sequences in a quoted string.
///
/// Supported escapes: `\\`, `\"`, `\n`, `\r`, `\t`, `\uXXXX` (exactly four
/// hex digits) and `\u{X...}` (hex digits in braces). Returns
/// `Cow::Borrowed` when the input contains no backslash (fast path),
/// otherwise an owned string with all escapes decoded.
fn process_escapes(s: &str) -> Result<Cow<'_, str>, &'static str> {
    // Fast path: no escapes
    if !s.contains('\\') {
        return Ok(Cow::Borrowed(s));
    }

    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }

        match chars.next() {
            Some('\\') => result.push('\\'),
            Some('"') => result.push('"'),
            Some('n') => result.push('\n'),
            Some('r') => result.push('\r'),
            Some('t') => result.push('\t'),
            Some('u') => {
                // Unicode escape: \uXXXX or \u{X...}
                // Both forms collect hex digits, then share hex_to_char.
                let mut hex = String::new();
                match chars.peek() {
                    Some('{') => {
                        chars.next(); // consume '{'
                        // Collect digits until the closing '}'; anything else
                        // (including EOF or a non-hex char) is an error.
                        loop {
                            match chars.next() {
                                Some('}') => break,
                                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
                                _ => return Err("invalid unicode escape"),
                            }
                        }
                    }
                    Some(_) => {
                        // \uXXXX - exactly 4 hex digits
                        for _ in 0..4 {
                            match chars.next() {
                                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
                                _ => return Err("invalid unicode escape"),
                            }
                        }
                    }
                    None => return Err("invalid unicode escape"),
                }
                result.push(hex_to_char(&hex)?);
            }
            Some(_) => return Err("invalid escape sequence"),
            None => return Err("trailing backslash"),
        }
    }

    Ok(Cow::Owned(result))
}
477
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_process_escapes_double_backslash() {
        // Input: path\\to\\file (two backslash pairs)
        // Expected: path\to\file (two literal backslashes)
        let result = process_escapes(r"path\\to\\file").unwrap();
        assert_eq!(result, r"path\to\file");
    }

    /// Helper: lex a source string to completion (Eof is dropped by the iterator).
    fn lex(source: &str) -> Vec<Lexeme<'_>> {
        Lexer::new(source).collect()
    }

    #[test]
    fn test_unit() {
        let lexemes = lex("@");
        assert!(matches!(&lexemes[0], Lexeme::Unit { .. }));
    }

    #[test]
    fn test_tag_no_payload() {
        let lexemes = lex("@foo");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "foo",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_tag_with_object_payload() {
        let lexemes = lex("@tag{}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
        assert!(matches!(&lexemes[2], Lexeme::ObjectEnd { .. }));
    }

    #[test]
    fn test_tag_with_space_before_object() {
        // @tag {} - space means NOT a payload
        let lexemes = lex("@tag {}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_bare_scalar() {
        let lexemes = lex("hello");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Scalar {
                kind: ScalarKind::Bare,
                ..
            }
        ));
    }

    #[test]
    fn test_quoted_scalar() {
        let lexemes = lex(r#""hello\nworld""#);
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => {
                assert_eq!(value.as_ref(), "hello\nworld");
            }
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_raw_scalar() {
        let lexemes = lex(r##"r#"hello"#"##);
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => {
                assert_eq!(value.as_ref(), "hello");
            }
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_quoted_payload() {
        let lexemes = lex(r#"@env"staging""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "env",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => {
                assert_eq!(value.as_ref(), "staging");
            }
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_sequence_payload() {
        let lexemes = lex("@rgb(255 128 0)");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "rgb",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::SeqStart { .. }));
    }

    #[test]
    fn test_chained_tag_without_trailing_payload() {
        // Chained tags (`/@`) imply has_payload even with nothing adjacent.
        let lexemes = lex("@must_emit/@discover_end");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "must_emit/@discover_end",
                has_payload: true,
                ..
            }
        ));
    }

    #[test]
    fn test_chained_tag_with_object_payload() {
        let lexemes = lex("@must_emit/@discover_start{executor default}");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "must_emit/@discover_start",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
    }

    #[test]
    fn test_three_segment_chained_tag_lexeme() {
        let lexemes = lex("@a/@b/@c");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b/@c",
                has_payload: true,
                ..
            }
        ));
    }

    #[test]
    fn test_chained_tag_with_quoted_leaf_payload() {
        let lexemes = lex(r#"@a/@b"foo""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Quoted,
                ..
            } => assert_eq!(value.as_ref(), "foo"),
            other => panic!("expected quoted scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_chained_tag_with_raw_leaf_payload() {
        let lexemes = lex(r##"@a/@br#"foo"#"##);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => assert_eq!(value.as_ref(), "foo"),
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_chained_tag_with_heredoc_leaf_payload() {
        let lexemes = lex("@a/@b<<EOF\nhello\nEOF");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "a/@b",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Heredoc,
                ..
            } => assert_eq!(value.as_ref(), "hello\n"),
            other => panic!("expected heredoc scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_unit_payload() {
        // @tag@ - tag with explicit unit payload
        let lexemes = lex("@tag@");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        assert!(matches!(&lexemes[1], Lexeme::Unit { .. }));
    }

    #[test]
    fn test_tag_with_raw_payload() {
        // @tagr#"x"# - tag "tag" with raw string payload
        let lexemes = lex(r##"@tagr#"x"#"##);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: true,
                ..
            }
        ));
        match &lexemes[1] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Raw,
                ..
            } => {
                assert_eq!(value.as_ref(), "x");
            }
            other => panic!("expected raw scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_tag_with_space_before_sequence() {
        let lexemes = lex("@tag (a b)");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    #[test]
    fn test_tag_with_space_before_quoted() {
        let lexemes = lex(r#"@tag "value""#);
        assert!(matches!(
            &lexemes[0],
            Lexeme::Tag {
                name: "tag",
                has_payload: false,
                ..
            }
        ));
    }

    // NOTE(review): an earlier note here claimed `@tag@` required tokenizer
    // changes (tokenized as `At` + `BareScalar("tag@")`), but
    // test_tag_with_unit_payload above already exercises `@tag@` and expects
    // Tag + Unit — the note appears stale; confirm against the tokenizer.

    #[test]
    fn test_at_followed_by_digit() {
        // @123 is an invalid tag name - the error span includes both @ and 123
        let lexemes = lex("@123");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Error {
                message: "invalid tag name",
                ..
            }
        ));
    }

    #[test]
    fn test_structural() {
        let lexemes = lex("{x 1}");
        assert!(matches!(&lexemes[0], Lexeme::ObjectStart { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[3], Lexeme::ObjectEnd { .. }));
    }

    #[test]
    fn test_sequence() {
        let lexemes = lex("(a b)");
        assert!(matches!(&lexemes[0], Lexeme::SeqStart { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[3], Lexeme::SeqEnd { .. }));
    }

    #[test]
    fn test_newlines_preserved() {
        let lexemes = lex("a\nb");
        assert!(matches!(&lexemes[0], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Newline { .. }));
        assert!(matches!(&lexemes[2], Lexeme::Scalar { .. }));
    }

    #[test]
    fn test_unicode_escape_braces() {
        let lexemes = lex(r#""\u{1F600}""#);
        match &lexemes[0] {
            Lexeme::Scalar { value, .. } => {
                assert_eq!(value.as_ref(), "😀");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_unicode_escape_4digit() {
        let lexemes = lex(r#""\u0041""#);
        match &lexemes[0] {
            Lexeme::Scalar { value, .. } => {
                assert_eq!(value.as_ref(), "A");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_dotted_value_is_scalar() {
        // Dots in bare scalars are just part of the value
        // Parser handles dot-splitting for keys
        let lexemes = lex("a.b.c");
        match &lexemes[0] {
            Lexeme::Scalar {
                value,
                kind: ScalarKind::Bare,
                ..
            } => {
                assert_eq!(value.as_ref(), "a.b.c");
            }
            other => panic!("expected scalar, got {:?}", other),
        }
    }

    #[test]
    fn test_attr_key() {
        let lexemes = lex("name>value");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "name", .. }));
        assert!(matches!(&lexemes[1], Lexeme::Scalar { .. }));
    }

    #[test]
    fn test_attr_key_with_object() {
        let lexemes = lex("opts>{x 1}");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "opts", .. }));
        assert!(matches!(&lexemes[1], Lexeme::ObjectStart { .. }));
    }

    #[test]
    fn test_attr_key_with_sequence() {
        let lexemes = lex("tags>(a b)");
        assert!(matches!(&lexemes[0], Lexeme::AttrKey { key: "tags", .. }));
        assert!(matches!(&lexemes[1], Lexeme::SeqStart { .. }));
    }

    #[test]
    fn test_standalone_gt_error() {
        // `x > y` with spaces - the `>` is not attribute syntax
        let lexemes = lex("x > y");
        assert!(matches!(&lexemes[0], Lexeme::Scalar { .. }));
        assert!(matches!(&lexemes[1], Lexeme::Error { .. }));
    }

    #[test]
    fn test_attr_whitespace_after_gt_error() {
        // `name> value` with space after `>` is an error
        let lexemes = lex("name> value");
        assert!(matches!(
            &lexemes[0],
            Lexeme::Error {
                message: "whitespace after `>` in attribute (use key>value with no spaces)",
                ..
            }
        ));
    }
}