yamp/
parser.rs

1use crate::lexer::{Lexer, Token, TokenKind};
2use std::borrow::Cow;
3use std::collections::BTreeMap;
4
/// How trailing newlines of a multiline (block) string are treated, selected by
/// the optional chomping indicator that follows `|` or `>`.
#[derive(Debug, Clone, Copy)]
enum ChompMode {
    /// `-` indicator: remove trailing newlines.
    Strip,
    /// No indicator (the default): single newline.
    Clip,
    /// `+` indicator: keep all trailing newlines.
    Keep,
}
11
/// The payload of a [`YamlNode`].
///
/// Scalars are always stored as text; no numeric, boolean or null
/// interpretation is applied.
#[derive(Debug, Clone, PartialEq)]
pub enum YamlValue<'a> {
    /// A scalar, borrowed from the source where possible.
    String(Cow<'a, str>),
    /// A sequence of nodes in source order.
    Array(Vec<YamlNode<'a>>),
    /// A mapping from key to node, ordered by key.
    Object(BTreeMap<Cow<'a, str>, YamlNode<'a>>),
}

/// A value together with the comments attached to it.
#[derive(Debug, Clone, PartialEq)]
pub struct YamlNode<'a> {
    /// The value held by this node.
    pub value: YamlValue<'a>,
    /// Comment found on its own line before the value, without the `#`.
    pub leading_comment: Option<Cow<'a, str>>,
    /// Comment found after the value on the same line, without the `#`.
    pub inline_comment: Option<Cow<'a, str>>,
}

impl<'a> YamlNode<'a> {
    /// Creates a node without comments (crate-internal constructor).
    pub(crate) fn new(value: YamlValue<'a>) -> Self {
        Self::with_comments(value, None, None)
    }

    /// Creates a node carrying the given leading and inline comments.
    pub(crate) fn with_comments(
        value: YamlValue<'a>,
        leading: Option<Cow<'a, str>>,
        inline: Option<Cow<'a, str>>,
    ) -> Self {
        YamlNode {
            value,
            leading_comment: leading,
            inline_comment: inline,
        }
    }

    /// Creates a node without comments (public constructor for external use).
    pub fn from_value(value: YamlValue<'a>) -> Self {
        Self::new(value)
    }

    // Helper methods for ergonomic value access

    /// Returns the string value if this node contains a string
    pub fn as_str(&self) -> Option<&str> {
        match &self.value {
            YamlValue::String(s) => Some(s.as_ref()),
            _ => None,
        }
    }

    /// Returns the object map if this node contains an object
    pub fn as_object(&self) -> Option<&BTreeMap<Cow<'a, str>, YamlNode<'a>>> {
        match &self.value {
            YamlValue::Object(map) => Some(map),
            _ => None,
        }
    }

    /// Returns the array items if this node contains an array
    pub fn as_array(&self) -> Option<&[YamlNode<'a>]> {
        match &self.value {
            YamlValue::Array(items) => Some(items),
            _ => None,
        }
    }

    /// Gets a child node by key if this node is an object
    ///
    /// Returns `None` when the node is not an object or the key is absent.
    pub fn get(&self, key: &str) -> Option<&YamlNode<'a>> {
        // `Cow<str>` borrows as `str`, so the map's ordered lookup can be used
        // directly instead of scanning every entry.
        self.as_object()?.get(key)
    }

    /// Returns true if this node is a string
    pub fn is_string(&self) -> bool {
        matches!(&self.value, YamlValue::String(_))
    }

    /// Returns true if this node is an object
    pub fn is_object(&self) -> bool {
        matches!(&self.value, YamlValue::Object(_))
    }

    /// Returns true if this node is an array
    pub fn is_array(&self) -> bool {
        matches!(&self.value, YamlValue::Array(_))
    }
}
113
/// Parser that walks a pre-tokenized source and builds a [`YamlNode`] tree.
pub(crate) struct Parser<'g> {
    /// Every token the lexer produced for the source, in order.
    tokens: Vec<Token<'g>>,
    /// Index into `tokens` of the token that will be examined/consumed next.
    current: usize,
}
118
119impl<'g> Parser<'g> {
120    pub(crate) fn new(source: &'g str) -> Self {
121        let mut lexer = Lexer::new(source);
122        let tokens = lexer.tokenize();
123        Parser { tokens, current: 0 }
124    }
125
126    pub(crate) fn parse(&mut self) -> Result<YamlNode<'g>, String> {
127        self.skip_whitespace_and_newlines();
128        let result = self.parse_value(0)?;
129        Ok(result)
130    }
131
132    fn current_token(&self) -> Option<&Token<'g>> {
133        self.tokens.get(self.current)
134    }
135
136    fn advance(&mut self) -> Option<&Token<'g>> {
137        if self.current < self.tokens.len() {
138            let token = &self.tokens[self.current];
139            self.current += 1;
140            Some(token)
141        } else {
142            None
143        }
144    }
145
146    fn skip_whitespace(&mut self) {
147        while let Some(token) = self.current_token() {
148            if token.kind != TokenKind::Whitespace {
149                break;
150            }
151            self.advance();
152        }
153    }
154
155    fn skip_whitespace_and_newlines(&mut self) {
156        while let Some(token) = self.current_token() {
157            match token.kind {
158                TokenKind::Whitespace
159                | TokenKind::NewLine
160                | TokenKind::Indent
161                | TokenKind::Dedent => {
162                    self.advance();
163                }
164                TokenKind::Identifier
165                | TokenKind::Colon
166                | TokenKind::String
167                | TokenKind::Hyphen
168                | TokenKind::Comment
169                | TokenKind::Pipe
170                | TokenKind::GreaterThan => break,
171            }
172        }
173    }
174
175    fn collect_comment(&mut self) -> Option<Cow<'g, str>> {
176        self.skip_whitespace();
177        if let Some(token) = self.current_token()
178            && token.kind == TokenKind::Comment
179        {
180            let comment = token.text.trim_start_matches('#').trim();
181            self.advance();
182            return Some(Cow::Borrowed(comment));
183        }
184        None
185    }
186
187    fn parse_value(&mut self, min_indent: usize) -> Result<YamlNode<'g>, String> {
188        self.skip_whitespace();
189
190        // Collect leading comment if it's on a line by itself
191        let mut leading_comment: Option<Cow<'g, str>> = None;
192        if let Some(token) = self.current_token()
193            && token.kind == TokenKind::Comment
194        {
195            leading_comment = Some(Cow::Borrowed(token.text.trim_start_matches('#').trim()));
196            self.advance();
197            self.skip_whitespace_and_newlines();
198        }
199
200        let token = self
201            .current_token()
202            .ok_or_else(|| "Unexpected end of input".to_string())?;
203
204        let node = match token.kind {
205            TokenKind::Hyphen => {
206                let value = self.parse_array(min_indent)?;
207                YamlNode::new(value)
208            }
209            TokenKind::Identifier => {
210                let text = token.text;
211                self.advance();
212
213                self.skip_whitespace();
214                if let Some(next) = self.current_token()
215                    && next.kind == TokenKind::Colon
216                {
217                    self.current -= 1; // Back up
218                    return self.parse_object(min_indent);
219                }
220
221                // It's a scalar value - always treat as string
222                YamlNode::new(YamlValue::String(Cow::Borrowed(text)))
223            }
224            TokenKind::String => {
225                let text = token.text;
226                let content = if text.starts_with('"') || text.starts_with('\'') {
227                    &text[1..text.len() - 1]
228                } else {
229                    text
230                };
231                self.advance();
232                YamlNode::new(YamlValue::String(Cow::Borrowed(content)))
233            }
234            TokenKind::Whitespace
235            | TokenKind::NewLine
236            | TokenKind::Colon
237            | TokenKind::Comment
238            | TokenKind::Indent
239            | TokenKind::Dedent
240            | TokenKind::Pipe
241            | TokenKind::GreaterThan => {
242                return Err(format!("Unexpected token: {:?}", token.kind));
243            }
244        };
245
246        let inline_comment = self.collect_comment();
247
248        Ok(YamlNode::with_comments(
249            node.value,
250            leading_comment,
251            inline_comment,
252        ))
253    }
254
255    fn parse_inline_value(&mut self) -> Result<YamlNode<'g>, String> {
256        // Collect tokens until we hit a newline or comment
257        let start_token = self
258            .current_token()
259            .ok_or_else(|| "Expected value".to_string())?;
260
261        // Check for special single-token values first
262        match start_token.kind {
263            TokenKind::String => {
264                let text = start_token.text;
265                let content = if text.starts_with('"') || text.starts_with('\'') {
266                    &text[1..text.len() - 1]
267                } else {
268                    text
269                };
270                self.advance();
271                let inline_comment = self.collect_comment();
272                return Ok(YamlNode::with_comments(
273                    YamlValue::String(Cow::Borrowed(content)),
274                    None,
275                    inline_comment,
276                ));
277            }
278            TokenKind::Identifier
279            | TokenKind::Colon
280            | TokenKind::Whitespace
281            | TokenKind::NewLine
282            | TokenKind::Hyphen
283            | TokenKind::Comment
284            | TokenKind::Indent
285            | TokenKind::Dedent
286            | TokenKind::Pipe
287            | TokenKind::GreaterThan => {}
288        }
289
290        // Otherwise collect all tokens until newline or comment
291        let mut value_parts = Vec::with_capacity(4); // Most values are 1-4 tokens
292        let mut single_token_text: Option<&'g str> = None;
293
294        while let Some(token) = self.current_token() {
295            match token.kind {
296                TokenKind::NewLine | TokenKind::Comment => break,
297                TokenKind::Whitespace => {
298                    value_parts.push(" ");
299                    self.advance();
300                }
301                TokenKind::Identifier
302                | TokenKind::Colon
303                | TokenKind::String
304                | TokenKind::Hyphen
305                | TokenKind::Indent
306                | TokenKind::Dedent
307                | TokenKind::Pipe
308                | TokenKind::GreaterThan => {
309                    if value_parts.is_empty() && single_token_text.is_none() {
310                        single_token_text = Some(token.text);
311                    }
312                    value_parts.push(token.text);
313                    self.advance();
314                }
315            }
316        }
317
318        // Trim trailing whitespace from value_parts
319        while value_parts.last() == Some(&" ") {
320            value_parts.pop();
321        }
322
323        // Everything is a string now
324        let value = if let Some(text) = single_token_text.filter(|_| value_parts.len() == 1) {
325            YamlValue::String(Cow::Borrowed(text))
326        } else {
327            // For multi-token values, join them
328            let value_str = value_parts.join("");
329            YamlValue::String(Cow::Owned(value_str))
330        };
331
332        let inline_comment = self.collect_comment();
333
334        Ok(YamlNode::with_comments(value, None, inline_comment))
335    }
336
337    fn parse_array(&mut self, min_indent: usize) -> Result<YamlValue<'g>, String> {
338        let mut items = Vec::new();
339
340        while let Some(token) = self.current_token() {
341            if token.kind == TokenKind::Hyphen {
342                self.advance(); // consume hyphen
343                self.skip_whitespace();
344
345                let item = self.parse_value(min_indent)?;
346                items.push(item);
347
348                self.skip_whitespace();
349                if let Some(token) = self.current_token() {
350                    if token.kind == TokenKind::NewLine {
351                        self.advance();
352                        self.skip_whitespace_and_newlines();
353                    } else if token.kind != TokenKind::Hyphen {
354                        break;
355                    }
356                }
357            } else {
358                break;
359            }
360        }
361
362        Ok(YamlValue::Array(items))
363    }
364
365    fn parse_multiline_string(
366        &mut self,
367        base_indent: usize,
368        is_literal: bool,
369    ) -> Result<YamlNode<'g>, String> {
370        // Skip any remaining whitespace and comments on the same line
371        self.skip_whitespace();
372
373        // Handle optional chomping indicator (-, +, or none)
374        let mut chomp_mode = ChompMode::Clip; // default
375        if let Some(token) = self.current_token() {
376            match token.text {
377                "-" => {
378                    chomp_mode = ChompMode::Strip;
379                    self.advance();
380                }
381                "+" => {
382                    chomp_mode = ChompMode::Keep;
383                    self.advance();
384                }
385                _ => {}
386            }
387        }
388
389        // Skip to next line
390        while let Some(token) = self.current_token() {
391            if token.kind == TokenKind::NewLine {
392                self.advance();
393                break;
394            }
395            // Skip any other tokens (comments, etc.)
396            self.advance();
397        }
398
399        let mut lines = Vec::new();
400        let mut content_indent = None;
401
402        // Collect all lines that are more indented than base_indent
403        while let Some(token) = self.current_token() {
404            // Check if we've dedented back to or past the base level
405            if token.kind == TokenKind::Dedent {
406                // Check the next non-whitespace token's column
407                let mut peek_index = self.current + 1;
408                while peek_index < self.tokens.len() {
409                    let peek_token = &self.tokens[peek_index];
410                    if peek_token.kind != TokenKind::Whitespace
411                        && peek_token.kind != TokenKind::Indent
412                        && peek_token.kind != TokenKind::Dedent
413                    {
414                        if peek_token.column <= base_indent {
415                            break;
416                        }
417                        break;
418                    }
419                    peek_index += 1;
420                }
421                if peek_index < self.tokens.len() && self.tokens[peek_index].column <= base_indent {
422                    break;
423                }
424            }
425
426            // Skip whitespace but track indentation
427            if token.kind == TokenKind::Whitespace || token.kind == TokenKind::Indent {
428                self.advance();
429                continue;
430            }
431
432            // If it's a newline, add an empty line
433            if token.kind == TokenKind::NewLine {
434                lines.push("");
435                self.advance();
436                continue;
437            }
438
439            // Check indentation
440            if token.column <= base_indent {
441                break;
442            }
443
444            // Set content indent from first content line
445            if content_indent.is_none() {
446                content_indent = Some(token.column);
447            }
448
449            // Collect the line
450            let _line_start = self.current;
451            let mut line_text = String::new();
452
453            while let Some(token) = self.current_token() {
454                if token.kind == TokenKind::NewLine {
455                    break;
456                }
457
458                // For literal mode, preserve everything
459                // For folded mode, we'll process later
460                line_text.push_str(token.text);
461                self.advance();
462            }
463
464            lines.push(line_text.leak()); // Convert to &'static str for simplicity
465
466            if let Some(token) = self.current_token()
467                && token.kind == TokenKind::NewLine
468            {
469                self.advance();
470            }
471        }
472
473        // Process the lines based on mode
474        let result = if is_literal {
475            // Literal mode: preserve line breaks
476            let mut result = lines.join("\n");
477
478            // Apply chomping
479            match chomp_mode {
480                ChompMode::Strip => {
481                    // Remove all trailing newlines
482                    while result.ends_with('\n') {
483                        result.pop();
484                    }
485                }
486                ChompMode::Clip => {
487                    // Keep single trailing newline (default)
488                    while result.ends_with("\n\n") {
489                        result.pop();
490                    }
491                    if !result.ends_with('\n') && !result.is_empty() {
492                        result.push('\n');
493                    }
494                }
495                ChompMode::Keep => {
496                    // Keep all trailing newlines
497                    result.push('\n');
498                }
499            }
500
501            result
502        } else {
503            // Folded mode: fold lines together
504            let mut result = String::new();
505            let mut prev_empty = false;
506
507            for (i, line) in lines.iter().enumerate() {
508                if line.is_empty() {
509                    if !prev_empty && i > 0 {
510                        result.push('\n');
511                    }
512                    prev_empty = true;
513                } else {
514                    if i > 0 && !prev_empty {
515                        result.push(' ');
516                    }
517                    result.push_str(line.trim_start());
518                    prev_empty = false;
519                }
520            }
521
522            // Apply chomping
523            match chomp_mode {
524                ChompMode::Strip => {
525                    while result.ends_with('\n') || result.ends_with(' ') {
526                        result.pop();
527                    }
528                }
529                ChompMode::Clip => {
530                    while result.ends_with('\n') || result.ends_with(' ') {
531                        result.pop();
532                    }
533                    // Add single trailing newline for Clip mode
534                    if !result.is_empty() {
535                        result.push('\n');
536                    }
537                }
538                ChompMode::Keep => {
539                    // Keep trailing whitespace
540                    if !result.is_empty() && !result.ends_with('\n') {
541                        result.push('\n');
542                    }
543                }
544            }
545
546            result
547        };
548
549        Ok(YamlNode::new(YamlValue::String(Cow::Owned(result))))
550    }
551
552    fn parse_object(&mut self, min_indent: usize) -> Result<YamlNode<'g>, String> {
553        let mut map = BTreeMap::new();
554
555        while let Some(token) = self.current_token() {
556            if token.kind != TokenKind::Identifier {
557                break;
558            }
559
560            // Check if this key is at the right indentation level
561            // If we're in a nested object, keys should be more indented than min_indent
562            if min_indent > 0 && token.column <= min_indent {
563                break;
564            }
565
566            let key_column = token.column;
567            let key = Cow::Borrowed(token.text);
568            self.advance();
569
570            self.skip_whitespace();
571
572            // Early return if no colon found
573            let Some(token) = self.current_token() else {
574                return Err("Expected colon after key".to_string());
575            };
576            if token.kind != TokenKind::Colon {
577                return Err(format!("Expected colon after key, got {:?}", token.kind));
578            }
579            self.advance();
580
581            self.skip_whitespace();
582
583            // Skip whitespace after colon
584            self.skip_whitespace();
585
586            // Collect the value - could be multiple tokens on the same line
587            let Some(token) = self.current_token() else {
588                return Err("Expected value after colon".to_string());
589            };
590
591            let value = if token.kind == TokenKind::Pipe || token.kind == TokenKind::GreaterThan {
592                // Multiline string indicator
593                let is_literal = token.kind == TokenKind::Pipe;
594                self.advance(); // consume | or >
595                self.parse_multiline_string(key_column, is_literal)?
596            } else if token.kind == TokenKind::NewLine || token.kind == TokenKind::Indent {
597                // Value is on next line
598                self.skip_whitespace_and_newlines();
599                // Use key_column as the new min_indent for nested values
600                self.parse_value(key_column)?
601            } else {
602                // Value is on same line - collect until newline
603                self.parse_inline_value()?
604            };
605
606            map.insert(key, value);
607
608            self.skip_whitespace();
609            if let Some(token) = self.current_token()
610                && token.kind == TokenKind::NewLine
611            {
612                self.advance();
613                self.skip_whitespace_and_newlines();
614            }
615
616            // Check if we've dedented or reached end
617            if let Some(token) = self.current_token()
618                && token.kind == TokenKind::Dedent
619            {
620                self.advance();
621                break;
622            }
623        }
624
625        Ok(YamlNode::new(YamlValue::Object(map)))
626    }
627}
628
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `yaml` and returns the root node, panicking on a parse error.
    fn parse_root(yaml: &str) -> YamlNode<'_> {
        Parser::new(yaml).parse().unwrap()
    }

    /// Shorthand for the borrowed string value the parser is expected to yield.
    fn text(s: &str) -> YamlValue<'_> {
        YamlValue::String(Cow::Borrowed(s))
    }

    #[test]
    fn test_parse_simple_object() {
        let root = parse_root("name: John\nage: 30");
        let YamlValue::Object(map) = &root.value else {
            panic!("Expected object");
        };

        assert_eq!(map.len(), 2);
        assert_eq!(map.get("name").unwrap().value, text("John"));
        assert_eq!(map.get("age").unwrap().value, text("30"));
    }

    #[test]
    fn test_parse_array() {
        let root = parse_root("- apple\n- banana\n- cherry");
        let YamlValue::Array(items) = &root.value else {
            panic!("Expected array");
        };

        assert_eq!(items.len(), 3);
        for (item, expected) in items.iter().zip(["apple", "banana", "cherry"]) {
            assert_eq!(item.value, text(expected));
        }
    }

    #[test]
    fn test_parse_with_comments() {
        let root = parse_root("name: John # inline comment\nage: 30");
        let YamlValue::Object(map) = &root.value else {
            panic!("Expected object");
        };

        let name_node = map.get("name").unwrap();
        assert_eq!(
            name_node.inline_comment,
            Some(Cow::Borrowed("inline comment"))
        );
    }

    #[test]
    fn test_parse_mixed_types() {
        // Scalars that look like booleans, numbers or null stay plain strings.
        let root = parse_root("enabled: true\ncount: 42\nratio: 2.5\nempty: null");
        let YamlValue::Object(map) = &root.value else {
            panic!("Expected object");
        };

        assert_eq!(map.get("enabled").unwrap().value, text("true"));
        assert_eq!(map.get("count").unwrap().value, text("42"));
        assert_eq!(map.get("ratio").unwrap().value, text("2.5"));
        assert_eq!(map.get("empty").unwrap().value, text("null"));
    }
}