Skip to main content

toon_format_rs/
parser.rs

1use crate::error::{Error, Result};
2use crate::lexer::{Lexer, Token};
3use crate::value::Value;
4use indexmap::IndexMap;
5
6/// Parser for TOON format
7pub struct Parser<'a> {
8    tokens: Vec<Token>,
9    pos: usize,
10    #[allow(dead_code)]
11    input: &'a str,
12}
13
14impl<'a> Parser<'a> {
15    /// Creates a new parser from input string
16    pub fn new(input: &'a str) -> Result<Self> {
17        let mut lexer = Lexer::new(input);
18        let tokens = lexer.lex()?;
19        Ok(Parser {
20            tokens,
21            pos: 0,
22            input,
23        })
24    }
25
26    /// Parses the input and returns a Value
27    pub fn parse(&mut self) -> Result<Value> {
28        self.skip_empty_lines();
29
30        if self.is_eof() {
31            return Ok(Value::Object(IndexMap::new()));
32        }
33
34        // Check if document starts with an expanded array
35        if let Some(Token::Hyphen) = self.peek() {
36            return self.parse_expanded_array();
37        }
38
39        // Root is always an object
40        self.parse_object_at_depth(0)
41    }
42
43    /// Returns the current token without consuming it
44    fn peek(&self) -> Option<&Token> {
45        self.tokens.get(self.pos)
46    }
47
48    /// Consumes and returns the next token
49    fn next_token(&mut self) -> Option<&Token> {
50        let token = self.tokens.get(self.pos);
51        if token.is_some() {
52            self.pos += 1;
53        }
54        token
55    }
56
57    /// Checks if we're at EOF
58    fn is_eof(&self) -> bool {
59        matches!(self.peek(), Some(Token::Eof) | None)
60    }
61
62    /// Expects a specific token and consumes it
63    fn expect(&mut self, expected: &Token) -> Result<()> {
64        match self.next_token() {
65            Some(token) if token == expected => Ok(()),
66            Some(token) => Err(Error::Message(format!(
67                "expected {:?}, found {:?}",
68                expected, token
69            ))),
70            None => Err(Error::UnexpectedEof),
71        }
72    }
73
74    /// Skips newlines and indents (treats them as whitespace)
75    fn skip_whitespace(&mut self) {
76        while matches!(self.peek(), Some(Token::Newline) | Some(Token::Indent(_))) {
77            self.pos += 1;
78        }
79    }
80
81    /// Skips empty lines (newlines without content)
82    fn skip_empty_lines(&mut self) {
83        while matches!(self.peek(), Some(Token::Newline)) {
84            self.pos += 1;
85        }
86    }
87
88    /// Gets the indent level at current position (0 if not at indent)
89    #[allow(dead_code)]
90    fn get_indent(&self) -> usize {
91        match self.peek() {
92            Some(Token::Indent(n)) => *n,
93            _ => 0,
94        }
95    }
96
97    /// Consumes indent token if present
98    fn consume_indent(&mut self) -> usize {
99        match self.peek() {
100            Some(Token::Indent(n)) => {
101                let level = *n;
102                self.pos += 1;
103                level
104            }
105            _ => 0,
106        }
107    }
108
109    /// Parses an object at a given depth level
110    fn parse_object_at_depth(&mut self, expected_depth: usize) -> Result<Value> {
111        let mut object = IndexMap::new();
112
113        loop {
114            self.skip_empty_lines();
115
116            if self.is_eof() {
117                break;
118            }
119
120            // Check indentation level
121            let current_depth = self.consume_indent();
122
123            // If we're at EOF after consuming indent, break
124            if self.is_eof() {
125                break;
126            }
127
128            // If depth is less than expected, we've exited this object
129            if current_depth < expected_depth {
130                // Put back the indent token if we consumed one
131                if current_depth > 0 {
132                    self.pos -= 1;
133                }
134                break;
135            }
136
137            // Parse the key
138            let key = match self.next_token() {
139                Some(Token::Ident(s)) => s.clone(),
140                Some(Token::String(s)) => s.clone(),
141                Some(token) => {
142                    return Err(Error::Message(format!(
143                        "expected key, found {:?}",
144                        token
145                    )))
146                }
147                None => return Err(Error::UnexpectedEof),
148            };
149
150            // Check if this is an array header (key[N]...)
151            if let Some(Token::OpenBracket) = self.peek() {
152                // Array header
153                let arr = self.parse_array_header_rest()?;
154                object.insert(key, arr);
155            } else {
156                // Regular key-value
157                self.expect(&Token::Colon)?;
158                let value = self.parse_value_after_colon(current_depth)?;
159                object.insert(key, value);
160            }
161        }
162
163        Ok(Value::Object(object))
164    }
165
166    /// Parses the rest of an array header after the key (already consumed)
167    fn parse_array_header_rest(&mut self) -> Result<Value> {
168        // Parse [length]
169        self.expect(&Token::OpenBracket)?;
170        let length = match self.next_token() {
171            Some(Token::Number(n)) => n.parse::<usize>().map_err(|_| Error::InvalidNumber {
172                text: n.clone(),
173                pos: 0,
174            })?,
175            Some(token) => {
176                return Err(Error::Message(format!(
177                    "expected number in array length, found {:?}",
178                    token
179                )))
180            }
181            None => return Err(Error::UnexpectedEof),
182        };
183        self.expect(&Token::CloseBracket)?;
184
185        // Parse optional {fields}
186        let fields = if let Some(Token::OpenBrace) = self.peek() {
187            self.pos += 1;
188            let mut field_names = Vec::new();
189            loop {
190                match self.next_token() {
191                    Some(Token::Ident(s)) => field_names.push(s.clone()),
192                    Some(Token::String(s)) => field_names.push(s.clone()),
193                    Some(token) => {
194                        return Err(Error::Message(format!(
195                            "expected field name, found {:?}",
196                            token
197                        )))
198                    }
199                    None => return Err(Error::UnexpectedEof),
200                }
201
202                match self.peek() {
203                    Some(Token::Comma) => self.pos += 1,
204                    Some(Token::CloseBrace) => {
205                        self.pos += 1;
206                        break;
207                    }
208                    Some(token) => {
209                        return Err(Error::Message(format!(
210                            "expected comma or }}, found {:?}",
211                            token
212                        )))
213                    }
214                    None => return Err(Error::UnexpectedEof),
215                }
216            }
217            Some(field_names)
218        } else {
219            None
220        };
221
222        // Expect colon
223        self.expect(&Token::Colon)?;
224
225        // Parse array content
226        if let Some(fields) = fields {
227            // Tabular array
228            self.parse_tabular_rows(&fields, length)
229        } else {
230            // Inline primitive array
231            self.parse_inline_primitives(length)
232        }
233    }
234
235    /// Parses value after a colon (at a given parent depth)
236    fn parse_value_after_colon(&mut self, parent_depth: usize) -> Result<Value> {
237        // Skip inline indents only, NOT newlines
238        while matches!(self.peek(), Some(Token::Indent(_))) {
239            self.pos += 1;
240        }
241
242        match self.peek() {
243            Some(Token::True) => {
244                self.pos += 1;
245                Ok(Value::Bool(true))
246            }
247            Some(Token::False) => {
248                self.pos += 1;
249                Ok(Value::Bool(false))
250            }
251            Some(Token::Null) => {
252                self.pos += 1;
253                Ok(Value::Null)
254            }
255            Some(Token::Number(n)) => {
256                let val = n.clone();
257                self.pos += 1;
258                Ok(Value::Number(val))
259            }
260            Some(Token::String(s)) => {
261                let val = s.clone();
262                self.pos += 1;
263                Ok(Value::String(val))
264            }
265            Some(Token::Ident(s)) => {
266                let mut val = s.clone();
267                self.pos += 1;
268                // Collect consecutive idents/numbers as a multi-word string
269                while let Some(Token::Ident(ref w)) | Some(Token::Number(ref w)) = self.peek() {
270                    if val == "true" || val == "false" || val == "null" {
271                        break;
272                    }
273                    val.push(' ');
274                    val.push_str(w);
275                    self.pos += 1;
276                }
277                Ok(Value::String(val))
278            }
279            Some(Token::OpenBracket) => self.parse_bracket_array(),
280            Some(Token::Hyphen) => self.parse_expanded_array(),
281            Some(Token::Newline) => {
282                // Value is on the next line - parse nested block
283                self.pos += 1;
284                self.parse_nested_block(parent_depth + 1)
285            }
286            Some(Token::Eof) => Ok(Value::Null),
287            Some(token) => Err(Error::Message(format!(
288                "unexpected token after colon: {:?}",
289                token
290            ))),
291            None => Err(Error::UnexpectedEof),
292        }
293    }
294
295    /// Parses a nested block at a given depth
296    fn parse_nested_block(&mut self, expected_depth: usize) -> Result<Value> {
297        self.skip_empty_lines();
298
299        if self.is_eof() {
300            return Ok(Value::Null);
301        }
302
303        // Check what's at this depth
304        match self.peek() {
305            Some(Token::Indent(n)) if *n >= expected_depth => {
306                self.parse_object_at_depth(expected_depth)
307            }
308            Some(Token::Hyphen) => {
309                // Could be expanded array starting at this depth
310                self.parse_expanded_array()
311            }
312            _ => {
313                // Single value
314                self.parse_value()
315            }
316        }
317    }
318
319    /// Parses an expanded array with hyphens
320    fn parse_expanded_array(&mut self) -> Result<Value> {
321        let mut items = Vec::new();
322
323        while let Some(Token::Hyphen) = self.peek() {
324            self.pos += 1; // consume hyphen
325            let value = self.parse_value()?;
326            items.push(value);
327            self.skip_whitespace();
328        }
329
330        Ok(Value::Array(items))
331    }
332
333    /// Parses an inline JSON-like array [1, 2, 3]
334    fn parse_bracket_array(&mut self) -> Result<Value> {
335        self.expect(&Token::OpenBracket)?;
336        let mut values = Vec::new();
337
338        loop {
339            self.skip_whitespace();
340            match self.peek() {
341                Some(Token::CloseBracket) => {
342                    self.pos += 1;
343                    break;
344                }
345                _ => {
346                    let value = self.parse_value()?;
347                    values.push(value);
348
349                    self.skip_whitespace();
350                    match self.peek() {
351                        Some(Token::Comma) => {
352                            self.pos += 1;
353                        }
354                        Some(Token::CloseBracket) => {
355                            self.pos += 1;
356                            break;
357                        }
358                        _ => break,
359                    }
360                }
361            }
362        }
363
364        Ok(Value::Array(values))
365    }
366
367    /// Parses inline primitive values separated by delimiters
368    fn parse_inline_primitives(&mut self, expected_count: usize) -> Result<Value> {
369        let mut values = Vec::new();
370        let mut count = 0;
371
372        while count < expected_count {
373            self.skip_whitespace();
374
375            if self.is_eof() {
376                break;
377            }
378
379            let value = match self.peek() {
380                Some(Token::Number(n)) => {
381                    let val = n.clone();
382                    self.pos += 1;
383                    Value::Number(val)
384                }
385                Some(Token::String(s)) => {
386                    let val = s.clone();
387                    self.pos += 1;
388                    Value::String(val)
389                }
390                Some(Token::Ident(s)) => {
391                    let mut val = s.clone();
392                    self.pos += 1;
393                    // Collect consecutive idents/numbers as a multi-word string
394                    loop {
395                        if let Some(Token::Ident(ref w)) = self.peek() {
396                            if val == "true" || val == "false" || val == "null" {
397                                break;
398                            }
399                            val.push(' ');
400                            val.push_str(w);
401                            self.pos += 1;
402                        } else if let Some(Token::Number(ref w)) = self.peek() {
403                            if val == "true" || val == "false" || val == "null" {
404                                break;
405                            }
406                            val.push(' ');
407                            val.push_str(w);
408                            self.pos += 1;
409                        } else {
410                            break;
411                        }
412                    }
413                    if val == "true" {
414                        Value::Bool(true)
415                    } else if val == "false" {
416                        Value::Bool(false)
417                    } else if val == "null" {
418                        Value::Null
419                    } else {
420                        Value::String(val)
421                    }
422                }
423                Some(Token::True) => {
424                    self.pos += 1;
425                    Value::Bool(true)
426                }
427                Some(Token::False) => {
428                    self.pos += 1;
429                    Value::Bool(false)
430                }
431                Some(Token::Null) => {
432                    self.pos += 1;
433                    Value::Null
434                }
435                _ => break,
436            };
437
438            values.push(value);
439            count += 1;
440
441            // Check for delimiter
442            match self.peek() {
443                Some(Token::Comma) | Some(Token::Pipe) => {
444                    self.pos += 1;
445                }
446                _ => break,
447            }
448        }
449
450        if count != expected_count {
451            return Err(Error::ArrayLengthMismatch {
452                declared: expected_count,
453                found: count,
454                pos: 0,
455            });
456        }
457
458        Ok(Value::Array(values))
459    }
460
461    /// Parses tabular array rows
462    fn parse_tabular_rows(&mut self, fields: &[String], expected_count: usize) -> Result<Value> {
463        let mut rows = Vec::new();
464        let mut count = 0;
465
466        while count < expected_count {
467            self.skip_empty_lines();
468
469            if self.is_eof() {
470                break;
471            }
472
473            // Check if next line is a row (starts with indent)
474            match self.peek() {
475                Some(Token::Indent(_)) => {
476                    self.consume_indent();
477                }
478                Some(Token::Ident(_)) | Some(Token::String(_)) | Some(Token::Hyphen) => {
479                    // Could be a new key at parent level - stop
480                    break;
481                }
482                _ => {
483                    break;
484                }
485            }
486
487            let mut row = IndexMap::new();
488            let mut field_idx = 0;
489
490            while field_idx < fields.len() {
491                let value = match self.peek() {
492                    Some(Token::Number(n)) => {
493                        let val = n.clone();
494                        self.pos += 1;
495                        Value::Number(val)
496                    }
497                    Some(Token::String(s)) => {
498                        let val = s.clone();
499                        self.pos += 1;
500                        Value::String(val)
501                    }
502                    Some(Token::Ident(s)) => {
503                        let mut val = s.clone();
504                        self.pos += 1;
505                        // Collect consecutive idents/numbers as a multi-word string
506                        loop {
507                            if let Some(Token::Ident(ref w)) = self.peek() {
508                                if val == "true" || val == "false" || val == "null" {
509                                    break;
510                                }
511                                val.push(' ');
512                                val.push_str(w);
513                                self.pos += 1;
514                            } else if let Some(Token::Number(ref w)) = self.peek() {
515                                if val == "true" || val == "false" || val == "null" {
516                                    break;
517                                }
518                                val.push(' ');
519                                val.push_str(w);
520                                self.pos += 1;
521                            } else {
522                                break;
523                            }
524                        }
525                        if val == "true" {
526                            Value::Bool(true)
527                        } else if val == "false" {
528                            Value::Bool(false)
529                        } else if val == "null" {
530                            Value::Null
531                        } else {
532                            Value::String(val)
533                        }
534                    }
535                    Some(Token::True) => {
536                        self.pos += 1;
537                        Value::Bool(true)
538                    }
539                    Some(Token::False) => {
540                        self.pos += 1;
541                        Value::Bool(false)
542                    }
543                    Some(Token::Null) => {
544                        self.pos += 1;
545                        Value::Null
546                    }
547                    _ => break,
548                };
549
550                row.insert(fields[field_idx].clone(), value);
551                field_idx += 1;
552
553                // Check for delimiter
554                match self.peek() {
555                    Some(Token::Comma) | Some(Token::Pipe) => {
556                        self.pos += 1;
557                    }
558                    _ => break,
559                }
560            }
561
562            rows.push(Value::Object(row));
563            count += 1;
564        }
565
566        if count != expected_count {
567            return Err(Error::ArrayLengthMismatch {
568                declared: expected_count,
569                found: count,
570                pos: 0,
571            });
572        }
573
574        Ok(Value::Array(rows))
575    }
576
577    /// Parses a single value
578    fn parse_value(&mut self) -> Result<Value> {
579        self.skip_whitespace();
580
581        match self.peek() {
582            Some(Token::True) => {
583                self.pos += 1;
584                Ok(Value::Bool(true))
585            }
586            Some(Token::False) => {
587                self.pos += 1;
588                Ok(Value::Bool(false))
589            }
590            Some(Token::Null) => {
591                self.pos += 1;
592                Ok(Value::Null)
593            }
594            Some(Token::Number(n)) => {
595                let val = n.clone();
596                self.pos += 1;
597                Ok(Value::Number(val))
598            }
599            Some(Token::String(s)) => {
600                let val = s.clone();
601                self.pos += 1;
602                Ok(Value::String(val))
603            }
604            Some(Token::Ident(s)) => {
605                let val = s.clone();
606                self.pos += 1;
607                Ok(Value::String(val))
608            }
609            Some(Token::OpenBracket) => self.parse_bracket_array(),
610            Some(Token::Hyphen) => self.parse_expanded_array(),
611            Some(token) => Err(Error::Message(format!(
612                "unexpected token in value: {:?}",
613                token
614            ))),
615            None => Err(Error::UnexpectedEof),
616        }
617    }
618}
619
620/// Convenience function to parse TOON string into Value
621pub fn parse(input: &str) -> Result<Value> {
622    let mut parser = Parser::new(input)?;
623    parser.parse()
624}
625
#[cfg(test)]
mod tests {
    use super::*;

    /// Flat `key: value` lines decode into an object with typed scalars.
    #[test]
    fn test_parse_simple_object() {
        let doc = parse("id: 123\nname: Alice\nactive: true").unwrap();

        assert!(doc.is_object());
        assert_eq!(doc.get("id").unwrap().as_i64(), Some(123));
        assert_eq!(doc.get("name").unwrap().as_str(), Some("Alice"));
        assert_eq!(doc.get("active").unwrap().as_bool(), Some(true));
    }

    /// Indented lines under `key:` decode into a nested object.
    #[test]
    fn test_parse_nested_object() {
        let doc = parse("user:\n  id: 1\n  name: Alice").unwrap();
        assert!(doc.is_object());

        let user = doc.get("user").unwrap();
        assert!(user.is_object());
        assert_eq!(user.get("id").unwrap().as_i64(), Some(1));
        assert_eq!(user.get("name").unwrap().as_str(), Some("Alice"));
    }

    /// `key[N]: a,b,c` decodes into an array of N primitives.
    #[test]
    fn test_parse_array_inline() {
        let doc = parse("tags[3]: foo,bar,baz").unwrap();
        assert!(doc.is_object());

        let tags = doc.get("tags").unwrap();
        assert!(tags.is_array());

        let items = tags.as_array().unwrap();
        assert_eq!(items.len(), 3);
        assert_eq!(items[0].as_str(), Some("foo"));
        assert_eq!(items[1].as_str(), Some("bar"));
        assert_eq!(items[2].as_str(), Some("baz"));
    }

    /// `key[N]{fields}:` followed by indented rows decodes into an array of
    /// objects keyed by the declared fields.
    #[test]
    fn test_parse_tabular_array() {
        let doc = parse("users[2]{id,name,role}:\n  1,Alice,admin\n  2,Bob,user").unwrap();
        assert!(doc.is_object());

        let users = doc.get("users").unwrap();
        assert!(users.is_array());

        let rows = users.as_array().unwrap();
        assert_eq!(rows.len(), 2);

        let expected = [(1, "Alice", "admin"), (2, "Bob", "user")];
        for (row, (id, name, role)) in rows.iter().zip(expected.iter()) {
            let row = row.as_object().unwrap();
            assert_eq!(row.get("id").unwrap().as_i64(), Some(*id));
            assert_eq!(row.get("name").unwrap().as_str(), Some(*name));
            assert_eq!(row.get("role").unwrap().as_str(), Some(*role));
        }
    }

    /// A document made of `- item` lines decodes into a root array.
    #[test]
    fn test_parse_expanded_array() {
        let doc = parse("- first\n- second\n- third").unwrap();
        assert!(doc.is_array());

        let items = doc.as_array().unwrap();
        assert_eq!(items.len(), 3);
        assert_eq!(items[0].as_str(), Some("first"));
        assert_eq!(items[1].as_str(), Some("second"));
        assert_eq!(items[2].as_str(), Some("third"));
    }

    /// A nested object, an inline array and a tabular array can share one
    /// document.
    #[test]
    fn test_parse_complex_document() {
        let input = r#"context:
  task: Our favorite hikes together
  location: Boulder
  season: spring_2025
friends[3]: ana,luis,sam
users[2]{id,name,role}:
  1,Alice,admin
  2,Bob,user"#;

        let doc = parse(input).unwrap();
        assert!(doc.is_object());

        let context = doc.get("context").unwrap();
        assert!(context.is_object());
        assert_eq!(
            context.get("task").unwrap().as_str(),
            Some("Our favorite hikes together")
        );

        let friends = doc.get("friends").unwrap();
        assert!(friends.is_array());
        assert_eq!(friends.as_array().unwrap().len(), 3);

        let users = doc.get("users").unwrap();
        assert!(users.is_array());
        let rows = users.as_array().unwrap();
        assert_eq!(rows.len(), 2);

        let first = rows[0].as_object().unwrap();
        assert_eq!(first.get("name").unwrap().as_str(), Some("Alice"));
        assert_eq!(first.get("role").unwrap().as_str(), Some("admin"));
    }
}
735}