styx_cst/
parser.rs

1//! CST parser for Styx using rowan's GreenNodeBuilder.
2//!
3//! This parser produces a lossless concrete syntax tree that preserves
4//! all whitespace, comments, and exact source representation.
5
6use rowan::GreenNode;
7use styx_parse::{Lexer, Token, TokenKind};
8
9use crate::syntax_kind::{SyntaxKind, SyntaxNode};
10
11/// A parsed Styx document.
12#[derive(Debug, Clone)]
13pub struct Parse {
14    green: GreenNode,
15    errors: Vec<ParseError>,
16}
17
18impl Parse {
19    /// Get the root syntax node.
20    pub fn syntax(&self) -> SyntaxNode {
21        SyntaxNode::new_root(self.green.clone())
22    }
23
24    /// Get parse errors.
25    pub fn errors(&self) -> &[ParseError] {
26        &self.errors
27    }
28
29    /// Check if parsing succeeded without errors.
30    pub fn is_ok(&self) -> bool {
31        self.errors.is_empty()
32    }
33
34    /// Convert to Result, returning errors if any.
35    pub fn ok(self) -> Result<SyntaxNode, Vec<ParseError>> {
36        if self.errors.is_empty() {
37            Ok(self.syntax())
38        } else {
39            Err(self.errors)
40        }
41    }
42
43    /// Get the green node (for testing/debugging).
44    pub fn green(&self) -> &GreenNode {
45        &self.green
46    }
47}
48
/// An error recorded while parsing, tagged with the place it was noticed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseError {
    /// Byte offset where the error occurred.
    pub offset: u32,
    /// Error message.
    pub message: String,
}

impl ParseError {
    /// Create an error at `offset`; `message` may be anything that converts
    /// into a `String`.
    fn new(offset: u32, message: impl Into<String>) -> Self {
        let message = message.into();
        Self { offset, message }
    }
}
66
67/// Parse Styx source into a CST.
68pub fn parse(source: &str) -> Parse {
69    let parser = CstParser::new(source);
70    parser.parse()
71}
72
73/// CST parser that builds a green tree using rowan.
74struct CstParser<'src> {
75    #[allow(dead_code)]
76    source: &'src str,
77    lexer: std::iter::Peekable<TokenIter<'src>>,
78    builder: rowan::GreenNodeBuilder<'static>,
79    errors: Vec<ParseError>,
80}
81
82/// Iterator adapter for the lexer that includes EOF.
83struct TokenIter<'src> {
84    lexer: Lexer<'src>,
85    done: bool,
86}
87
88impl<'src> Iterator for TokenIter<'src> {
89    type Item = Token<'src>;
90
91    fn next(&mut self) -> Option<Self::Item> {
92        if self.done {
93            return None;
94        }
95        let token = self.lexer.next_token();
96        if token.kind == TokenKind::Eof {
97            self.done = true;
98        }
99        Some(token)
100    }
101}
102
103impl<'src> CstParser<'src> {
104    fn new(source: &'src str) -> Self {
105        let lexer = Lexer::new(source);
106        Self {
107            source,
108            lexer: TokenIter { lexer, done: false }.peekable(),
109            builder: rowan::GreenNodeBuilder::new(),
110            errors: Vec::new(),
111        }
112    }
113
114    fn parse(mut self) -> Parse {
115        self.builder.start_node(SyntaxKind::DOCUMENT.into());
116        self.parse_entries(None);
117        self.builder.finish_node();
118
119        Parse {
120            green: self.builder.finish(),
121            errors: self.errors,
122        }
123    }
124
125    /// Peek at the current token kind.
126    fn peek(&mut self) -> TokenKind {
127        self.lexer.peek().map(|t| t.kind).unwrap_or(TokenKind::Eof)
128    }
129
130    /// Peek at the current token.
131    fn peek_token(&mut self) -> Option<&Token<'src>> {
132        self.lexer.peek()
133    }
134
135    /// Get the current token's start position, if any.
136    fn current_pos(&mut self) -> u32 {
137        self.lexer.peek().map(|t| t.span.start).unwrap_or(0)
138    }
139
140    /// Consume and add the current token to the tree.
141    fn bump(&mut self) {
142        if let Some(token) = self.lexer.next() {
143            self.builder
144                .token(SyntaxKind::from(token.kind).into(), token.text);
145        }
146    }
147
148    /// Skip trivia (whitespace and line comments), adding them to the tree.
149    fn skip_trivia(&mut self) {
150        while matches!(
151            self.peek(),
152            TokenKind::Whitespace | TokenKind::Newline | TokenKind::LineComment
153        ) {
154            self.bump();
155        }
156    }
157
158    /// Skip horizontal whitespace only.
159    fn skip_whitespace(&mut self) {
160        while self.peek() == TokenKind::Whitespace {
161            self.bump();
162        }
163    }
164
165    /// Check if we're at EOF.
166    fn at_eof(&mut self) -> bool {
167        self.peek() == TokenKind::Eof
168    }
169
170    /// Check if we're at a token that ends an entry.
171    fn at_entry_end(&mut self, closing: Option<TokenKind>) -> bool {
172        let kind = self.peek();
173        kind == TokenKind::Eof
174            || kind == TokenKind::Newline
175            || kind == TokenKind::LineComment
176            || kind == TokenKind::Comma
177            || closing.is_some_and(|c| kind == c)
178    }
179
180    /// Check if the current position starts an attribute (bare_scalar followed by =).
181    fn at_attribute(&mut self) -> bool {
182        if self.peek() != TokenKind::BareScalar {
183            return false;
184        }
185        // Check if there's a > sign after this bare scalar (possibly with whitespace)
186        let token = match self.peek_token() {
187            Some(t) => t,
188            None => return false,
189        };
190        let after_scalar = token.span.end as usize;
191
192        // Look for > in the source after the scalar, skipping whitespace
193        let rest = &self.source[after_scalar..];
194        for ch in rest.chars() {
195            match ch {
196                ' ' | '\t' => continue,
197                '>' => return true,
198                _ => return false,
199            }
200        }
201        false
202    }
203
204    /// Parse entries (at document level or inside an object).
205    fn parse_entries(&mut self, closing: Option<TokenKind>) {
206        loop {
207            self.skip_trivia();
208
209            // Check for doc comments - they attach to the next entry
210            while self.peek() == TokenKind::DocComment {
211                self.bump();
212                // Skip whitespace/newlines after doc comment
213                while matches!(self.peek(), TokenKind::Whitespace | TokenKind::Newline) {
214                    self.bump();
215                }
216            }
217
218            // Check for closing or EOF
219            if self.at_eof() {
220                break;
221            }
222            if closing.is_some_and(|close| self.peek() == close) {
223                break;
224            }
225
226            // Parse an entry
227            self.parse_entry(closing);
228
229            // Handle separator
230            self.skip_whitespace();
231            if matches!(self.peek(), TokenKind::Comma | TokenKind::Newline) {
232                self.bump();
233            }
234        }
235    }
236
237    /// Parse a single entry.
238    ///
239    /// An entry can be:
240    /// - A sequence of attributes: `key1=value1 key2=value2`
241    /// - A key with zero or more values: `key` or `key value1 value2`
242    /// - A key followed by attributes and then more values: `div id=main { ... }`
243    fn parse_entry(&mut self, closing: Option<TokenKind>) {
244        self.builder.start_node(SyntaxKind::ENTRY.into());
245
246        // Check if this starts with attributes (entry is just attributes)
247        if self.at_attribute() {
248            self.parse_attributes(closing);
249        } else {
250            // Parse first atom as the key
251            if !self.at_entry_end(closing) {
252                self.builder.start_node(SyntaxKind::KEY.into());
253                self.parse_atom();
254                self.builder.finish_node();
255            }
256
257            // Skip horizontal whitespace
258            self.skip_whitespace();
259
260            // Parse remaining atoms/attributes as values
261            let mut value_count = 0;
262            while !self.at_entry_end(closing) {
263                // Check if we have attributes next
264                if self.at_attribute() {
265                    self.builder.start_node(SyntaxKind::VALUE.into());
266                    self.parse_attributes(closing);
267                    self.builder.finish_node();
268                    value_count += 1;
269                } else {
270                    // If we're in an inline object (closing is RBrace) and we already have a value,
271                    // seeing another bare scalar likely means a missing comma
272                    if closing == Some(TokenKind::RBrace)
273                        && value_count > 0
274                        && self.peek() == TokenKind::BareScalar
275                    {
276                        let pos = self.current_pos();
277                        self.errors.push(ParseError::new(
278                            pos,
279                            "unexpected atom after value (missing comma between entries?)"
280                                .to_string(),
281                        ));
282                    }
283                    self.builder.start_node(SyntaxKind::VALUE.into());
284                    self.parse_atom();
285                    self.builder.finish_node();
286                    value_count += 1;
287                }
288                self.skip_whitespace();
289            }
290        }
291
292        self.builder.finish_node();
293    }
294
295    /// Parse a sequence of attributes: `key1=value1 key2=value2 ...`
296    fn parse_attributes(&mut self, closing: Option<TokenKind>) {
297        self.builder.start_node(SyntaxKind::ATTRIBUTES.into());
298
299        while self.at_attribute() {
300            self.parse_attribute();
301            self.skip_whitespace();
302
303            // Stop if we hit entry end
304            if self.at_entry_end(closing) {
305                break;
306            }
307        }
308
309        self.builder.finish_node();
310    }
311
312    /// Parse a single attribute: `key>value`
313    ///
314    /// Per spec r[attr.syntax]: "The `>` has no spaces around it."
315    fn parse_attribute(&mut self) {
316        self.builder.start_node(SyntaxKind::ATTRIBUTE.into());
317
318        // Key (bare scalar)
319        self.bump();
320
321        // > sign (no whitespace allowed before or after)
322        if self.peek() == TokenKind::Gt {
323            self.bump();
324        } else {
325            let pos = self.current_pos();
326            self.errors.push(ParseError::new(
327                pos,
328                "expected `>` immediately after attribute key".to_string(),
329            ));
330        }
331
332        // Value (no whitespace allowed between > and value)
333        if self.peek() == TokenKind::Whitespace || self.peek() == TokenKind::Newline {
334            let pos = self.current_pos();
335            self.errors.push(ParseError::new(
336                pos,
337                "no whitespace allowed after `>` in attribute".to_string(),
338            ));
339            // Skip the whitespace so we can continue parsing
340            self.skip_whitespace();
341        }
342
343        self.parse_atom();
344
345        self.builder.finish_node();
346    }
347
348    /// Parse a single atom (scalar, object, sequence, tag, or unit).
349    fn parse_atom(&mut self) {
350        let kind = self.peek();
351        match kind {
352            TokenKind::LBrace => self.parse_object(),
353            TokenKind::LParen => self.parse_sequence(),
354            TokenKind::At => self.parse_tag_or_unit(),
355            TokenKind::BareScalar | TokenKind::QuotedScalar | TokenKind::RawScalar => {
356                self.builder.start_node(SyntaxKind::SCALAR.into());
357                self.bump();
358                self.builder.finish_node();
359            }
360            TokenKind::HeredocStart => self.parse_heredoc(),
361            _ => {
362                // Error: unexpected token
363                let pos = self.current_pos();
364                self.errors.push(ParseError::new(
365                    pos,
366                    format!("unexpected token: {:?}", kind),
367                ));
368                // Consume the error token
369                self.bump();
370            }
371        }
372    }
373
374    /// Parse an object `{ ... }`.
375    fn parse_object(&mut self) {
376        self.builder.start_node(SyntaxKind::OBJECT.into());
377
378        // Consume `{`
379        self.bump();
380
381        // Parse entries until `}`
382        self.parse_entries(Some(TokenKind::RBrace));
383
384        // Consume `}` or error
385        self.skip_trivia();
386        if self.peek() == TokenKind::RBrace {
387            self.bump();
388        } else {
389            let pos = self.current_pos();
390            self.errors
391                .push(ParseError::new(pos, "unclosed object, expected `}`"));
392        }
393
394        self.builder.finish_node();
395    }
396
397    /// Parse a sequence `( ... )`.
398    fn parse_sequence(&mut self) {
399        self.builder.start_node(SyntaxKind::SEQUENCE.into());
400
401        // Consume `(`
402        self.bump();
403
404        // Parse elements until `)`
405        loop {
406            self.skip_trivia();
407
408            if self.at_eof() {
409                let pos = self.current_pos();
410                self.errors
411                    .push(ParseError::new(pos, "unclosed sequence, expected `)`"));
412                break;
413            }
414            if self.peek() == TokenKind::RParen {
415                break;
416            }
417
418            // In sequences, each element is wrapped in an ENTRY with just a KEY
419            self.builder.start_node(SyntaxKind::ENTRY.into());
420            self.builder.start_node(SyntaxKind::KEY.into());
421            self.parse_atom();
422            self.builder.finish_node();
423            self.builder.finish_node();
424
425            // Skip whitespace between elements
426            self.skip_whitespace();
427        }
428
429        // Consume `)` if present
430        if self.peek() == TokenKind::RParen {
431            self.bump();
432        }
433
434        self.builder.finish_node();
435    }
436
437    /// Parse `@` (unit) or `@name` (tag).
438    fn parse_tag_or_unit(&mut self) {
439        // Consume @
440        let at_token = self.lexer.next();
441        let at_end = at_token.as_ref().map(|t| t.span.end).unwrap_or(0);
442
443        // Check what follows the @
444        let (is_unit, next_start) = match self.lexer.peek() {
445            None => (true, 0),
446            Some(t) => {
447                // It's a unit if followed by whitespace, newline, or structural token
448                // or if the bare scalar doesn't start immediately after @
449                let is_unit = t.kind != TokenKind::BareScalar || t.span.start != at_end;
450                (is_unit, t.span.start)
451            }
452        };
453        let _ = next_start; // Silence unused warning
454
455        if is_unit {
456            // Just @
457            self.builder.start_node(SyntaxKind::UNIT.into());
458            if let Some(token) = at_token {
459                self.builder.token(SyntaxKind::AT.into(), token.text);
460            }
461            self.builder.finish_node();
462        } else {
463            // @name with optional payload
464            self.builder.start_node(SyntaxKind::TAG.into());
465
466            // Add @
467            if let Some(token) = at_token {
468                self.builder.token(SyntaxKind::AT.into(), token.text);
469            }
470
471            // Add tag name and track its end position
472            self.builder.start_node(SyntaxKind::TAG_NAME.into());
473            let name_end = self.lexer.peek().map(|t| t.span.end).unwrap_or(0);
474            self.bump(); // The bare scalar
475            self.builder.finish_node();
476
477            // Check for payload - must IMMEDIATELY follow tag name (no whitespace)
478            // Per grammar: Tag ::= '@' TagName TagPayload?
479            // TagPayload ::= Object | Sequence | QuotedScalar | RawScalar | HeredocScalar | '@'
480            let has_immediate_payload = self.lexer.peek().is_some_and(|t| {
481                t.span.start == name_end
482                    && matches!(
483                        t.kind,
484                        TokenKind::LBrace
485                            | TokenKind::LParen
486                            | TokenKind::QuotedScalar
487                            | TokenKind::RawScalar
488                            | TokenKind::HeredocStart
489                            | TokenKind::At
490                    )
491            });
492
493            if has_immediate_payload {
494                self.builder.start_node(SyntaxKind::TAG_PAYLOAD.into());
495                self.parse_atom();
496                self.builder.finish_node();
497            }
498
499            self.builder.finish_node();
500        }
501    }
502
503    /// Parse a heredoc `<<DELIM...DELIM`.
504    fn parse_heredoc(&mut self) {
505        self.builder.start_node(SyntaxKind::HEREDOC.into());
506
507        // Consume heredoc start
508        self.bump();
509
510        // Consume content if present
511        if self.peek() == TokenKind::HeredocContent {
512            self.bump();
513        }
514
515        // Consume end marker
516        if self.peek() == TokenKind::HeredocEnd {
517            self.bump();
518        } else {
519            let pos = self.current_pos();
520            self.errors
521                .push(ParseError::new(pos, "unterminated heredoc"));
522        }
523
524        self.builder.finish_node();
525    }
526}
527
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `source`, fail the test on any parse error, and return the root.
    fn parse_ok(source: &str) -> SyntaxNode {
        let parsed = parse(source);
        assert!(parsed.is_ok(), "parse errors: {:?}", parsed.errors());
        parsed.syntax()
    }

    /// Pretty-printed debug dump of a subtree; kept around for ad-hoc use.
    #[allow(dead_code)]
    fn debug_tree(node: &SyntaxNode) -> String {
        format!("{:#?}", node)
    }

    #[test]
    fn test_empty_document() {
        let root = parse_ok("");
        assert_eq!(root.kind(), SyntaxKind::DOCUMENT);
    }

    #[test]
    fn test_simple_entry() {
        let root = parse_ok("host localhost");
        assert_eq!(root.kind(), SyntaxKind::DOCUMENT);

        // The first child node of the document is the entry.
        let first = root.children().next().unwrap();
        assert_eq!(first.kind(), SyntaxKind::ENTRY);
    }

    #[test]
    fn test_object() {
        let root = parse_ok("{ host localhost }");
        let entry = root.children().next().unwrap();
        assert_eq!(entry.kind(), SyntaxKind::ENTRY);

        // An object in entry position is parsed as the entry's key.
        let key = entry.children().next().unwrap();
        assert_eq!(key.kind(), SyntaxKind::KEY);

        let object = key.children().next().unwrap();
        assert_eq!(object.kind(), SyntaxKind::OBJECT);
    }

    #[test]
    fn test_sequence() {
        let root = parse_ok("items (a b c)");
        let entry = root.children().next().unwrap();

        // Child 0 is the key, child 1 the value holding the sequence.
        let value = entry.children().nth(1).unwrap();
        assert_eq!(value.kind(), SyntaxKind::VALUE);

        let sequence = value.children().next().unwrap();
        assert_eq!(sequence.kind(), SyntaxKind::SEQUENCE);
    }

    #[test]
    fn test_roundtrip() {
        let sources = [
            "host localhost",
            "{ a b, c d }",
            "items (1 2 3)",
            "name \"hello world\"",
            "@unit",
            "@tag payload",
            "// comment\nkey value",
        ];

        for source in sources {
            let reconstructed = parse(source).syntax().to_string();
            assert_eq!(source, reconstructed, "roundtrip failed for: {}", source);
        }
    }

    #[test]
    fn test_preserves_whitespace() {
        let source = "  host   localhost  ";
        let printed = parse(source).syntax().to_string();
        assert_eq!(source, printed);
    }

    #[test]
    fn test_preserves_comments() {
        let source = "// header comment\nhost localhost // trailing";
        let printed = parse(source).syntax().to_string();
        assert_eq!(source, printed);
    }

    #[test]
    fn test_unit() {
        let root = parse_ok("empty @");
        let entry = root.children().next().unwrap();
        let value = entry.children().nth(1).unwrap();
        let unit = value.children().next().unwrap();
        assert_eq!(unit.kind(), SyntaxKind::UNIT);
    }

    #[test]
    fn test_tag_with_payload() {
        let root = parse_ok("@Some value");
        let entry = root.children().next().unwrap();
        let key = entry.children().next().unwrap();
        let tag = key.children().next().unwrap();
        assert_eq!(tag.kind(), SyntaxKind::TAG);
    }

    #[test]
    fn test_heredoc() {
        let source = "content <<EOF\nhello\nworld\nEOF";
        let parsed = parse(source);
        assert!(parsed.is_ok(), "errors: {:?}", parsed.errors());
        assert_eq!(source, parsed.syntax().to_string());
    }

    #[test]
    fn test_attributes() {
        let source = "id>main class>\"container\"";
        let parsed = parse(source);
        assert!(parsed.is_ok(), "errors: {:?}", parsed.errors());

        let root = parsed.syntax();
        assert_eq!(source, root.to_string());

        // An attribute-only entry holds a single ATTRIBUTES node.
        let entry = root.children().next().unwrap();
        let attributes = entry.children().next().unwrap();
        assert_eq!(attributes.kind(), SyntaxKind::ATTRIBUTES);
    }

    #[test]
    fn test_multiple_values() {
        let source = "key value1 value2 value3";
        let parsed = parse(source);
        assert!(parsed.is_ok(), "errors: {:?}", parsed.errors());

        // Should have KEY + 3 VALUEs
        let entry = parsed.syntax().children().next().unwrap();
        let kinds: Vec<_> = entry.children().map(|child| child.kind()).collect();
        assert_eq!(
            kinds,
            [
                SyntaxKind::KEY,
                SyntaxKind::VALUE,
                SyntaxKind::VALUE,
                SyntaxKind::VALUE,
            ]
        );
    }

    #[test]
    fn test_showcase_file() {
        let source = include_str!("../../../examples/showcase.styx");
        let parsed = parse(source);

        // Should parse without errors
        assert!(parsed.is_ok(), "parse errors: {:?}", parsed.errors());

        // Should roundtrip perfectly
        assert_eq!(source, parsed.syntax().to_string(), "roundtrip failed");
    }
}