Skip to main content

tealeaf/
parser.rs

1//! Parser for TeaLeaf text format
2
3use std::path::Path;
4use indexmap::IndexMap;
5use crate::{Error, Result, Value, Schema, Field, FieldType, Union, Variant};
6use crate::types::ObjectMap;
7use crate::lexer::{Token, TokenKind, Lexer};
8
/// Upper bound on how deeply `parse_value` may nest (arrays, objects, maps,
/// tuples, tags) before parsing is aborted with an error.
/// Kept equal to the binary reader's MAX_DECODE_DEPTH so the text and binary
/// formats accept the same nesting.
const MAX_PARSE_DEPTH: usize = 256;
12
13pub struct Parser {
14    tokens: Vec<Token>,
15    pos: usize,
16    schemas: IndexMap<String, Schema>,
17    unions: IndexMap<String, Union>,
18    base_path: Option<std::path::PathBuf>,
19    /// Tracks included file paths for cycle detection
20    include_stack: Vec<std::path::PathBuf>,
21    /// Indicates the source was a root-level JSON array (set by @root-array directive)
22    is_root_array: bool,
23}
24
25impl Parser {
26    pub fn new(tokens: Vec<Token>) -> Self {
27        Self {
28            tokens,
29            pos: 0,
30            schemas: IndexMap::new(),
31            unions: IndexMap::new(),
32            base_path: None,
33            include_stack: Vec::new(),
34            is_root_array: false,
35        }
36    }
37
38    pub fn with_base_path(mut self, path: &Path) -> Self {
39        self.base_path = path.parent().map(|p| p.to_path_buf());
40        self
41    }
42
43    pub fn parse(&mut self) -> Result<IndexMap<String, Value>> {
44        let mut result = IndexMap::new();
45
46        while !self.at_end() {
47            match self.current_kind() {
48                TokenKind::Directive(d) => {
49                    let directive = d.clone();
50                    self.advance();
51                    match directive.as_str() {
52                        "struct" => self.parse_struct_def()?,
53                        "union" => self.parse_union_def()?,
54                        "include" => {
55                            let included = self.parse_include()?;
56                            for (k, v) in included {
57                                result.insert(k, v);
58                            }
59                        }
60                        "root-array" => {
61                            // Marks this document as representing a root-level JSON array
62                            self.is_root_array = true;
63                        }
64                        _ => {
65                            // Unknown top-level directive: silently ignored (spec §1.18).
66                            // Consume same-line argument for forward compatibility —
67                            // a future directive like @custom foo should not leave
68                            // "foo" to be misparsed as a key.
69                            let directive_line = self.tokens[self.pos - 1].line;
70                            if !self.at_end()
71                                && self.current().line == directive_line
72                                && self.can_start_value()
73                            {
74                                let _ = self.parse_value(0)?;
75                            }
76                        }
77                    }
78                }
79                TokenKind::Word(_) | TokenKind::String(_) => {
80                    let (key, value) = self.parse_pair(0)?;
81                    result.insert(key, value);
82                }
83                TokenKind::Ref(r) => {
84                    let ref_name = r.clone();
85                    self.advance();
86                    self.expect(TokenKind::Colon)?;
87                    let value = self.parse_value(0)?;
88                    result.insert(format!("!{}", ref_name), value);
89                }
90                TokenKind::Eof => break,
91                _ => { self.advance(); }
92            }
93        }
94
95        Ok(result)
96    }
97
98    pub fn into_schemas(self) -> IndexMap<String, Schema> {
99        self.schemas
100    }
101
102    pub fn into_unions(self) -> IndexMap<String, Union> {
103        self.unions
104    }
105
106    /// Consume the parser and return both schemas and unions.
107    pub fn into_schemas_and_unions(self) -> (IndexMap<String, Schema>, IndexMap<String, Union>) {
108        (self.schemas, self.unions)
109    }
110
111    /// Check if the @root-array directive was present
112    pub fn is_root_array(&self) -> bool {
113        self.is_root_array
114    }
115
116    // =========================================================================
117    // Struct Definition
118    // =========================================================================
119
120    fn parse_struct_def(&mut self) -> Result<()> {
121        let name = self.expect_word()?;
122        self.expect(TokenKind::LParen)?;
123
124        let mut schema = Schema::new(&name);
125
126        while !self.check(TokenKind::RParen) {
127            // Field names can be unquoted words or quoted strings (for names
128            // that contain special characters like @type, $ref, etc.)
129            let field_name = match self.current_kind() {
130                TokenKind::Word(w) => {
131                    let w = w.clone();
132                    self.advance();
133                    w
134                }
135                TokenKind::String(s) => {
136                    let s = s.clone();
137                    self.advance();
138                    s
139                }
140                _ => return Err(Error::UnexpectedToken {
141                    expected: "field name".to_string(),
142                    got: format!("{:?}", self.current_kind()),
143                }),
144            };
145
146            let field_type = if self.check(TokenKind::Colon) {
147                self.advance();
148                self.parse_field_type()?
149            } else {
150                FieldType::new("string")
151            };
152
153            schema.add_field(field_name, field_type);
154
155            if self.check(TokenKind::Comma) {
156                self.advance();
157            }
158        }
159
160        self.expect(TokenKind::RParen)?;
161        self.schemas.insert(name, schema);
162        Ok(())
163    }
164
165    // =========================================================================
166    // Union Definition
167    // =========================================================================
168
169    fn parse_union_def(&mut self) -> Result<()> {
170        let name = self.expect_word()?;
171        self.expect(TokenKind::LBrace)?;
172
173        let mut union_type = Union::new(&name);
174
175        while !self.check(TokenKind::RBrace) {
176            let variant_name = self.expect_word()?;
177            self.expect(TokenKind::LParen)?;
178
179            let mut variant = Variant::new(&variant_name);
180
181            while !self.check(TokenKind::RParen) {
182                let field_name = self.expect_word()?;
183
184                let field_type = if self.check(TokenKind::Colon) {
185                    self.advance();
186                    self.parse_field_type()?
187                } else {
188                    FieldType::new("string")
189                };
190
191                variant.fields.push(Field::new(field_name, field_type));
192
193                if self.check(TokenKind::Comma) {
194                    self.advance();
195                }
196            }
197
198            self.expect(TokenKind::RParen)?;
199            union_type.add_variant(variant);
200
201            if self.check(TokenKind::Comma) {
202                self.advance();
203            }
204        }
205
206        self.expect(TokenKind::RBrace)?;
207        self.unions.insert(name, union_type);
208        Ok(())
209    }
210
211    // =========================================================================
212    // Include Directive
213    // =========================================================================
214
215    fn parse_include(&mut self) -> Result<IndexMap<String, Value>> {
216        let path_str = match self.current_kind() {
217            TokenKind::String(s) => s.clone(),
218            TokenKind::Word(w) => w.clone(),
219            _ => return Err(Error::UnexpectedToken {
220                expected: "file path".to_string(),
221                got: format!("{:?}", self.current_kind()),
222            }),
223        };
224        self.advance();
225
226        // Resolve path relative to current file
227        let include_path = if let Some(ref base) = self.base_path {
228            base.join(&path_str)
229        } else {
230            std::path::PathBuf::from(&path_str)
231        };
232
233        // Cycle detection and depth limit
234        let canonical = include_path.canonicalize()
235            .unwrap_or_else(|_| include_path.clone());
236        if self.include_stack.contains(&canonical) {
237            return Err(Error::ParseError(format!(
238                "Circular include detected: {}", canonical.display()
239            )));
240        }
241        if self.include_stack.len() >= 32 {
242            return Err(Error::ParseError(
243                "Include depth exceeds limit of 32".into()
244            ));
245        }
246
247        // Read and parse the included file
248        let content = std::fs::read_to_string(&include_path)
249            .map_err(|e| Error::ParseError(format!("Failed to include {}: {}", path_str, e)))?;
250
251        let tokens = Lexer::new(&content).tokenize()?;
252        let mut parser = Parser::new(tokens);
253        if let Some(parent) = include_path.parent() {
254            parser.base_path = Some(parent.to_path_buf());
255        }
256        // Propagate include stack and accumulated schemas/unions to child parser
257        // so that schemas from earlier includes are available in later includes.
258        parser.include_stack = self.include_stack.clone();
259        parser.include_stack.push(canonical);
260        parser.schemas = self.schemas.clone();
261        parser.unions = self.unions.clone();
262
263        let data = parser.parse()?;
264
265        // Merge schemas and unions
266        for (name, schema) in parser.schemas {
267            self.schemas.insert(name, schema);
268        }
269        for (name, union_type) in parser.unions {
270            self.unions.insert(name, union_type);
271        }
272
273        Ok(data)
274    }
275
276    fn parse_field_type(&mut self) -> Result<FieldType> {
277        let mut type_str = String::new();
278
279        // Handle array prefix
280        if self.check(TokenKind::LBracket) {
281            self.advance();
282            self.expect(TokenKind::RBracket)?;
283            type_str.push_str("[]");
284        }
285
286        // Base type
287        let base = self.expect_word()?;
288
289        // Reject value-only types that cannot be schema field types (spec §2.1)
290        match base.as_str() {
291            "object" | "map" | "tuple" | "ref" | "tagged" => {
292                return Err(Error::ParseError(
293                    format!("'{}' is a value type and cannot be used as a schema field type", base)
294                ));
295            }
296            _ => {}
297        }
298
299        type_str.push_str(&base);
300
301        // Nullable suffix
302        if self.check(TokenKind::Question) {
303            self.advance();
304            type_str.push('?');
305        }
306
307        Ok(FieldType::parse(&type_str))
308    }
309
310    // =========================================================================
311    // Key-Value Pairs
312    // =========================================================================
313
314    fn parse_pair(&mut self, depth: usize) -> Result<(String, Value)> {
315        let key = match self.current_kind() {
316            TokenKind::Word(w) => w.clone(),
317            TokenKind::String(s) => s.clone(),
318            _ => return Err(Error::UnexpectedToken {
319                expected: "key".to_string(),
320                got: format!("{:?}", self.current_kind()),
321            }),
322        };
323        self.advance();
324        self.expect(TokenKind::Colon)?;
325        let value = self.parse_value(depth)?;
326        Ok((key, value))
327    }
328
329    // =========================================================================
330    // Values
331    // =========================================================================
332
333    fn parse_value(&mut self, depth: usize) -> Result<Value> {
334        if depth > MAX_PARSE_DEPTH {
335            return Err(Error::ParseError("maximum parse nesting depth exceeded".into()));
336        }
337        match self.current_kind() {
338            TokenKind::Null => { self.advance(); Ok(Value::Null) }
339            TokenKind::Bool(b) => { let b = *b; self.advance(); Ok(Value::Bool(b)) }
340            TokenKind::Int(i) => { let i = *i; self.advance(); Ok(Value::Int(i)) }
341            TokenKind::UInt(u) => { let u = *u; self.advance(); Ok(Value::UInt(u)) }
342            TokenKind::JsonNumber(s) => { let s = s.clone(); self.advance(); Ok(Value::JsonNumber(s)) }
343            TokenKind::Float(f) => { let f = *f; self.advance(); Ok(Value::Float(f)) }
344            TokenKind::String(s) => { let s = s.clone(); self.advance(); Ok(Value::String(s)) }
345            TokenKind::Bytes(b) => { let b = b.clone(); self.advance(); Ok(Value::Bytes(b)) }
346            TokenKind::Word(w) => { let w = w.clone(); self.advance(); Ok(Value::String(w)) }
347            TokenKind::Ref(r) => { let r = r.clone(); self.advance(); Ok(Value::Ref(r)) }
348            TokenKind::Timestamp(ts, tz) => { let ts = *ts; let tz = *tz; self.advance(); Ok(Value::Timestamp(ts, tz)) }
349            TokenKind::Colon => {
350                self.advance(); // consume ':'
351                match self.current_kind() {
352                    TokenKind::Word(w) => {
353                        let tag = w.clone();
354                        self.advance(); // consume tag name
355                        let inner = self.parse_value(depth + 1)?;
356                        Ok(Value::Tagged(tag, Box::new(inner)))
357                    }
358                    _ => Err(Error::UnexpectedToken {
359                        expected: "tag name after ':'".to_string(),
360                        got: format!("{:?}", self.current_kind()),
361                    })
362                }
363            }
364            TokenKind::Directive(d) => {
365                let directive = d.clone();
366                self.advance();
367                self.parse_directive_value(&directive, depth)
368            }
369            TokenKind::LBrace => self.parse_object(depth + 1),
370            TokenKind::LBracket => self.parse_array(depth + 1),
371            TokenKind::LParen => self.parse_tuple(depth + 1),
372            _ => Err(Error::UnexpectedToken {
373                expected: "value".to_string(),
374                got: format!("{:?}", self.current_kind()),
375            }),
376        }
377    }
378
379    fn parse_directive_value(&mut self, directive: &str, depth: usize) -> Result<Value> {
380        match directive {
381            "table" => self.parse_table(depth),
382            "map" => self.parse_map(depth),
383            _ => {
384                // Unknown directive in value position: consume argument, return null (spec §1.18)
385                if self.can_start_value() {
386                    let _ = self.parse_value(depth)?;
387                }
388                Ok(Value::Null)
389            }
390        }
391    }
392
393    /// Returns true if the current token can begin a value expression.
394    fn can_start_value(&self) -> bool {
395        matches!(
396            self.current_kind(),
397            TokenKind::Null
398                | TokenKind::Bool(_)
399                | TokenKind::Int(_)
400                | TokenKind::UInt(_)
401                | TokenKind::Float(_)
402                | TokenKind::String(_)
403                | TokenKind::Bytes(_)
404                | TokenKind::Word(_)
405                | TokenKind::Ref(_)
406                | TokenKind::Timestamp(_, _)
407                | TokenKind::JsonNumber(_)
408                | TokenKind::Colon
409                | TokenKind::Directive(_)
410                | TokenKind::LBrace
411                | TokenKind::LBracket
412                | TokenKind::LParen
413        )
414    }
415
416    fn parse_map(&mut self, depth: usize) -> Result<Value> {
417        self.expect(TokenKind::LBrace)?;
418        let mut pairs = Vec::new();
419
420        while !self.check(TokenKind::RBrace) {
421            // Parse key (string, name, or integer per spec grammar:
422            // map_key = string | name | integer)
423            let key = match self.current_kind() {
424                TokenKind::String(s) => { let s = s.clone(); self.advance(); Value::String(s) }
425                TokenKind::Word(w) => { let w = w.clone(); self.advance(); Value::String(w) }
426                TokenKind::Int(i) => { let i = *i; self.advance(); Value::Int(i) }
427                TokenKind::UInt(u) => { let u = *u; self.advance(); Value::UInt(u) }
428                _ => return Err(Error::UnexpectedToken {
429                    expected: "map key".to_string(),
430                    got: format!("{:?}", self.current_kind()),
431                }),
432            };
433
434            self.expect(TokenKind::Colon)?;
435            let value = self.parse_value(depth + 1)?;
436            pairs.push((key, value));
437
438            if self.check(TokenKind::Comma) {
439                self.advance();
440            }
441        }
442
443        self.expect(TokenKind::RBrace)?;
444        Ok(Value::Map(pairs))
445    }
446
447    fn parse_table(&mut self, depth: usize) -> Result<Value> {
448        let struct_name = self.expect_word()?;
449        let schema = self.schemas
450            .get(&struct_name)
451            .ok_or_else(|| Error::UnknownStruct(struct_name.clone()))?
452            .clone();
453
454        self.expect(TokenKind::LBracket)?;
455
456        let mut rows = Vec::new();
457        while !self.check(TokenKind::RBracket) {
458            let row = self.parse_tuple_with_schema(&schema, depth + 1)?;
459            rows.push(row);
460            if self.check(TokenKind::Comma) {
461                self.advance();
462            }
463        }
464
465        self.expect(TokenKind::RBracket)?;
466        Ok(Value::Array(rows))
467    }
468
469    fn parse_tuple_with_schema(&mut self, schema: &Schema, depth: usize) -> Result<Value> {
470        self.expect(TokenKind::LParen)?;
471
472        let mut obj = ObjectMap::new();
473        for field in &schema.fields {
474            let value = self.parse_value_for_field(&field.field_type, depth)?;
475            obj.insert(field.name.clone(), value);
476            if self.check(TokenKind::Comma) {
477                self.advance();
478            }
479        }
480
481        self.expect(TokenKind::RParen)?;
482        Ok(Value::Object(obj))
483    }
484
485    fn parse_value_for_field(&mut self, field_type: &FieldType, depth: usize) -> Result<Value> {
486        // Handle null
487        if self.check(TokenKind::Null) {
488            self.advance();
489            return Ok(Value::Null);
490        }
491
492        // Handle nested struct — schema names shadow built-in types, so check
493        // by name rather than relying on is_struct() which lacks schema context.
494        // The LParen guard disambiguates: struct tuples always start with `(`,
495        // while primitive values (int, bool, string, etc.) never do.
496        if !field_type.is_array && self.check(TokenKind::LParen) {
497            if let Some(schema) = self.schemas.get(&field_type.base).cloned() {
498                return self.parse_tuple_with_schema(&schema, depth + 1);
499            }
500        }
501
502        // Handle array
503        if field_type.is_array {
504            self.expect(TokenKind::LBracket)?;
505            let mut arr = Vec::new();
506            let inner_type = FieldType::new(&field_type.base);
507            while !self.check(TokenKind::RBracket) {
508                arr.push(self.parse_value_for_field(&inner_type, depth + 1)?);
509                if self.check(TokenKind::Comma) {
510                    self.advance();
511                }
512            }
513            self.expect(TokenKind::RBracket)?;
514            return Ok(Value::Array(arr));
515        }
516
517        // Regular value
518        self.parse_value(depth)
519    }
520
521    fn parse_object(&mut self, depth: usize) -> Result<Value> {
522        self.expect(TokenKind::LBrace)?;
523        let mut obj = ObjectMap::new();
524
525        while !self.check(TokenKind::RBrace) {
526            if let TokenKind::Ref(r) = self.current_kind() {
527                let key = format!("!{}", r);
528                self.advance();
529                self.expect(TokenKind::Colon)?;
530                let value = self.parse_value(depth)?;
531                obj.insert(key, value);
532            } else {
533                let (key, value) = self.parse_pair(depth)?;
534                obj.insert(key, value);
535            }
536            if self.check(TokenKind::Comma) {
537                self.advance();
538            }
539        }
540
541        self.expect(TokenKind::RBrace)?;
542        Ok(Value::Object(obj))
543    }
544
545    fn parse_array(&mut self, depth: usize) -> Result<Value> {
546        self.expect(TokenKind::LBracket)?;
547        let mut arr = Vec::new();
548
549        while !self.check(TokenKind::RBracket) {
550            arr.push(self.parse_value(depth)?);
551            if self.check(TokenKind::Comma) {
552                self.advance();
553            }
554        }
555
556        self.expect(TokenKind::RBracket)?;
557        Ok(Value::Array(arr))
558    }
559
560    fn parse_tuple(&mut self, depth: usize) -> Result<Value> {
561        self.expect(TokenKind::LParen)?;
562        let mut arr = Vec::new();
563
564        while !self.check(TokenKind::RParen) {
565            arr.push(self.parse_value(depth)?);
566            if self.check(TokenKind::Comma) {
567                self.advance();
568            }
569        }
570
571        self.expect(TokenKind::RParen)?;
572        Ok(Value::Array(arr))
573    }
574
575    // =========================================================================
576    // Helpers
577    // =========================================================================
578
579    fn current(&self) -> &Token {
580        self.tokens.get(self.pos).unwrap_or(&Token {
581            kind: TokenKind::Eof,
582            line: 0,
583            col: 0,
584        })
585    }
586
587    fn current_kind(&self) -> &TokenKind {
588        &self.current().kind
589    }
590
591    fn advance(&mut self) {
592        if self.pos < self.tokens.len() {
593            self.pos += 1;
594        }
595    }
596
597    fn check(&self, expected: TokenKind) -> bool {
598        std::mem::discriminant(self.current_kind()) == std::mem::discriminant(&expected)
599    }
600
601    fn expect(&mut self, expected: TokenKind) -> Result<()> {
602        if self.check(expected.clone()) {
603            self.advance();
604            Ok(())
605        } else {
606            Err(Error::UnexpectedToken {
607                expected: format!("{:?}", expected),
608                got: format!("{:?}", self.current_kind()),
609            })
610        }
611    }
612
613    fn expect_word(&mut self) -> Result<String> {
614        match self.current_kind() {
615            TokenKind::Word(w) => {
616                let w = w.clone();
617                self.advance();
618                Ok(w)
619            }
620            _ => Err(Error::UnexpectedToken {
621                expected: "word".to_string(),
622                got: format!("{:?}", self.current_kind()),
623            }),
624        }
625    }
626
627    fn at_end(&self) -> bool {
628        matches!(self.current_kind(), TokenKind::Eof)
629    }
630}
631
632#[cfg(test)]
633mod tests {
634    use super::*;
635    use crate::lexer::Lexer;
636
637    fn parse(input: &str) -> Result<IndexMap<String, Value>> {
638        let tokens = Lexer::new(input).tokenize()?;
639        Parser::new(tokens).parse()
640    }
641
// Scalars on one line: int, bare word, bool, and `~` for null.
#[test]
fn test_simple_values() {
    let parsed = parse("a: 1, b: hello, c: true, d: ~").unwrap();

    let a = parsed.get("a").unwrap();
    let b = parsed.get("b").unwrap();
    let c = parsed.get("c").unwrap();
    let d = parsed.get("d").unwrap();

    assert_eq!(a.as_int(), Some(1));
    assert_eq!(b.as_str(), Some("hello"));
    assert_eq!(c.as_bool(), Some(true));
    assert!(d.is_null());
}
650
// A brace-delimited object exposes its fields by key.
#[test]
fn test_object() {
    let parsed = parse("obj: {x: 1, y: 2}").unwrap();
    let fields = parsed.get("obj").unwrap().as_object().unwrap();

    let x = fields.get("x").unwrap();
    let y = fields.get("y").unwrap();
    assert_eq!(x.as_int(), Some(1));
    assert_eq!(y.as_int(), Some(2));
}
658
// A bracketed list parses to an array of its elements.
#[test]
fn test_array() {
    let parsed = parse("arr: [1, 2, 3]").unwrap();
    let elements = parsed.get("arr").unwrap().as_array().unwrap();

    assert_eq!(elements.len(), 3);
    assert_eq!(elements[0].as_int(), Some(1));
}
666
// `@table` rows are matched positionally against the `@struct` fields.
#[test]
fn test_struct_and_table() {
    let input = r#"
            @struct point (x: int, y: int)
            points: @table point [
                (1, 2),
                (3, 4),
            ]
        "#;
    let parsed = parse(input).unwrap();

    let rows = parsed.get("points").unwrap().as_array().unwrap();
    assert_eq!(rows.len(), 2);

    let first = rows[0].as_object().unwrap();
    let x = first.get("x").unwrap();
    let y = first.get("y").unwrap();
    assert_eq!(x.as_int(), Some(1));
    assert_eq!(y.as_int(), Some(2));
}
687
688    // -------------------------------------------------------------------------
689    // Union parsing
690    // -------------------------------------------------------------------------
691
// `@union` records each variant with its name and field list, in order.
#[test]
fn test_union_def() {
    let input = r#"
            @union Shape {
                Circle(radius: float),
                Rectangle(width: float, height: float),
                Point(),
            }
        "#;
    let mut parser = Parser::new(Lexer::new(input).tokenize().unwrap());
    parser.parse().unwrap();

    let unions = parser.into_unions();
    let shape = unions.get("Shape").unwrap();
    assert_eq!(shape.variants.len(), 3);

    let circle = &shape.variants[0];
    assert_eq!(circle.name, "Circle");
    assert_eq!(circle.fields.len(), 1);

    let rectangle = &shape.variants[1];
    assert_eq!(rectangle.name, "Rectangle");
    assert_eq!(rectangle.fields.len(), 2);

    let point = &shape.variants[2];
    assert_eq!(point.name, "Point");
    assert_eq!(point.fields.len(), 0);
}
714
715    // -------------------------------------------------------------------------
716    // Map parsing
717    // -------------------------------------------------------------------------
718
// `@map` keeps integer keys as ints and preserves pair order.
#[test]
fn test_map_value() {
    let parsed = parse("m: @map {1: one, 2: two}").unwrap();
    let pairs = parsed.get("m").unwrap().as_map().unwrap();
    assert_eq!(pairs.len(), 2);

    let (first_key, first_value) = &pairs[0];
    assert_eq!(first_key.as_int(), Some(1));
    assert_eq!(first_value.as_str(), Some("one"));

    let (second_key, second_value) = &pairs[1];
    assert_eq!(second_key.as_int(), Some(2));
    assert_eq!(second_value.as_str(), Some("two"));
}
729
// Quoted strings are accepted as `@map` keys.
#[test]
fn test_map_with_string_keys() {
    let parsed = parse(r#"m: @map {"key1": 10, "key2": 20}"#).unwrap();
    let pairs = parsed.get("m").unwrap().as_map().unwrap();
    assert_eq!(pairs.len(), 2);
}
736
// An empty `@map {}` yields a map with no pairs.
#[test]
fn test_map_empty() {
    let parsed = parse("m: @map {}").unwrap();
    let pairs = parsed.get("m").unwrap().as_map().unwrap();
    assert_eq!(pairs.len(), 0);
}
743
744    // -------------------------------------------------------------------------
745    // Ref and Tagged values
746    // -------------------------------------------------------------------------
747
// `!name` in value position is a reference to `name`.
#[test]
fn test_ref_value() {
    let parsed = parse("config: !base_config").unwrap();
    let config = parsed.get("config").unwrap();
    assert_eq!(config.as_ref_name(), Some("base_config"));
}
753
// `:tag value` wraps the following value in a tag.
#[test]
fn test_tagged_value() {
    let parsed = parse("status: :ok 200").unwrap();
    let status = parsed.get("status").unwrap();

    let (tag, inner) = status.as_tagged().unwrap();
    assert_eq!(tag, "ok");
    assert_eq!(inner.as_int(), Some(200));
}
761
// A tag may wrap a null value.
#[test]
fn test_tagged_null() {
    let parsed = parse("status: :none ~").unwrap();
    let status = parsed.get("status").unwrap();

    let (tag, inner) = status.as_tagged().unwrap();
    assert_eq!(tag, "none");
    assert!(inner.is_null());
}
769
// `key::tag` with no spaces still works: the lexer emits Colon Colon Word.
#[test]
fn test_tagged_value_no_space_after_colon() {
    let parsed = parse("status::ok 200").unwrap();
    let status = parsed.get("status").unwrap();

    let (tag, inner) = status.as_tagged().unwrap();
    assert_eq!(tag, "ok");
    assert_eq!(inner.as_int(), Some(200));
}
778
// `key:value` with no space still works: the lexer emits Word Colon Word.
#[test]
fn test_key_value_no_space_after_colon() {
    let parsed = parse("name:alice\nage:30").unwrap();

    let name = parsed.get("name").unwrap();
    let age = parsed.get("age").unwrap();
    assert_eq!(name.as_str(), Some("alice"));
    assert_eq!(age.as_int(), Some(30));
}
786
787    // -------------------------------------------------------------------------
788    // Tuple and nested structures
789    // -------------------------------------------------------------------------
790
// A parenthesized tuple is exposed as an array of its elements.
#[test]
fn test_tuple_value() {
    let parsed = parse("point: (1, 2, 3)").unwrap();
    let coords = parsed.get("point").unwrap().as_array().unwrap();

    assert_eq!(coords.len(), 3);
    let (x, y, z) = (&coords[0], &coords[1], &coords[2]);
    assert_eq!(x.as_int(), Some(1));
    assert_eq!(y.as_int(), Some(2));
    assert_eq!(z.as_int(), Some(3));
}
800
// Objects nest inside objects.
#[test]
fn test_nested_object() {
    let parsed = parse("outer: {inner: {x: 1}}").unwrap();

    let outer = parsed.get("outer").unwrap().as_object().unwrap();
    let inner = outer.get("inner").unwrap().as_object().unwrap();
    let x = inner.get("x").unwrap();
    assert_eq!(x.as_int(), Some(1));
}
808
// Arrays nest inside arrays.
#[test]
fn test_nested_arrays() {
    let parsed = parse("matrix: [[1, 2], [3, 4]]").unwrap();
    let rows = parsed.get("matrix").unwrap().as_array().unwrap();
    assert_eq!(rows.len(), 2);

    let first_row = rows[0].as_array().unwrap();
    assert_eq!(first_row[0].as_int(), Some(1));
}
817
818    // -------------------------------------------------------------------------
819    // Struct fields with types
820    // -------------------------------------------------------------------------
821
// A `?` suffix marks the field type nullable, and `~` is accepted in a row.
#[test]
fn test_struct_with_nullable_field() {
    let input = r#"
            @struct user (name: string, email: string?)
            users: @table user [
                (alice, "a@test.com"),
                (bob, ~),
            ]
        "#;
    let mut parser = Parser::new(Lexer::new(input).tokenize().unwrap());
    let parsed = parser.parse().unwrap();
    let schemas = parser.into_schemas();

    let user_schema = schemas.get("user").unwrap();
    let email_field = &user_schema.fields[1];
    assert!(email_field.field_type.nullable);

    let rows = parsed.get("users").unwrap().as_array().unwrap();
    assert_eq!(rows.len(), 2);

    let bob = rows[1].as_object().unwrap();
    assert!(bob.get("email").unwrap().is_null());
}
843
844    #[test]
845    fn test_struct_with_array_field() {
846        let input = r#"
847            @struct item (name: string, tags: []string)
848            items: @table item [
849                (widget, [cool, useful]),
850            ]
851        "#;
852        let tokens = Lexer::new(input).tokenize().unwrap();
853        let mut parser = Parser::new(tokens);
854        let data = parser.parse().unwrap();
855
856        let items = data.get("items").unwrap().as_array().unwrap();
857        let tags = items[0].as_object().unwrap().get("tags").unwrap().as_array().unwrap();
858        assert_eq!(tags.len(), 2);
859    }
860
861    // -------------------------------------------------------------------------
862    // Root-array directive
863    // -------------------------------------------------------------------------
864
865    #[test]
866    fn test_root_array_directive() {
867        let input = "@root-array\nroot: [1, 2, 3]";
868        let tokens = Lexer::new(input).tokenize().unwrap();
869        let mut parser = Parser::new(tokens);
870        parser.parse().unwrap();
871        assert!(parser.is_root_array());
872    }
873
874    // -------------------------------------------------------------------------
875    // Ref key at top level
876    // -------------------------------------------------------------------------
877
878    #[test]
879    fn test_ref_key_at_top_level() {
880        let input = "!defaults: {theme: dark}";
881        let data = parse(input).unwrap();
882        assert!(data.contains_key("!defaults"));
883        let obj = data.get("!defaults").unwrap().as_object().unwrap();
884        assert_eq!(obj.get("theme").unwrap().as_str(), Some("dark"));
885    }
886
887    // -------------------------------------------------------------------------
888    // String keys
889    // -------------------------------------------------------------------------
890
891    #[test]
892    fn test_string_key() {
893        let data = parse(r#""my key": 42"#).unwrap();
894        assert_eq!(data.get("my key").unwrap().as_int(), Some(42));
895    }
896
897    // -------------------------------------------------------------------------
898    // Error cases
899    // -------------------------------------------------------------------------
900
901    #[test]
902    fn test_unexpected_token_error() {
903        let result = parse("] invalid");
904        // The parser may skip unexpected tokens or error
905        // Just ensure it doesn't panic
906        let _ = result;
907    }
908
909    #[test]
910    fn test_missing_colon_error() {
911        // A word followed by a value without colon
912        let input = "key value";
913        let result = parse(input);
914        assert!(result.is_err());
915    }
916
917    #[test]
918    fn test_unknown_struct_in_table() {
919        let input = "data: @table nonexistent [(1, 2)]";
920        let result = parse(input);
921        assert!(result.is_err());
922    }
923
924    // -------------------------------------------------------------------------
925    // Struct field type defaults
926    // -------------------------------------------------------------------------
927
928    #[test]
929    fn test_struct_field_without_type() {
930        let input = r#"
931            @struct simple (name, value)
932            items: @table simple [
933                (hello, world),
934            ]
935        "#;
936        let tokens = Lexer::new(input).tokenize().unwrap();
937        let mut parser = Parser::new(tokens);
938        let data = parser.parse().unwrap();
939        let schemas = parser.into_schemas();
940
941        // Fields without explicit type default to "string"
942        let schema = schemas.get("simple").unwrap();
943        assert_eq!(schema.fields[0].field_type.base, "string");
944        assert_eq!(schema.fields[1].field_type.base, "string");
945
946        let items = data.get("items").unwrap().as_array().unwrap();
947        assert_eq!(items[0].as_object().unwrap().get("name").unwrap().as_str(), Some("hello"));
948    }
949
950    // -------------------------------------------------------------------------
951    // Unknown directive
952    // -------------------------------------------------------------------------
953
954    #[test]
955    fn test_unknown_directive_ignored() {
956        // Directive on its own line — next line is a key-value, not an argument
957        let data = parse("@custom_directive\nkey: value").unwrap();
958        assert_eq!(data.get("key").unwrap().as_str(), Some("value"));
959    }
960
961    #[test]
962    fn test_unknown_directive_consumes_same_line_argument() {
963        // Same-line word argument: consumed, not misparsed as a key
964        let data = parse("@custom foo\nkey: value").unwrap();
965        assert!(data.get("foo").is_none(), "foo should be consumed as directive arg, not a key");
966        assert_eq!(data.get("key").unwrap().as_str(), Some("value"));
967
968        // Same-line array argument
969        let data = parse("@custom [1, 2, 3]\nkey: value").unwrap();
970        assert_eq!(data.get("key").unwrap().as_str(), Some("value"));
971
972        // Same-line object argument
973        let data = parse("@custom {a: 1}\nkey: value").unwrap();
974        assert_eq!(data.get("key").unwrap().as_str(), Some("value"));
975
976        // No argument (directive alone on line)
977        let data = parse("@custom\nkey: value").unwrap();
978        assert_eq!(data.get("key").unwrap().as_str(), Some("value"));
979
980        // No argument (directive at end of file)
981        let data = parse("key: value\n@custom").unwrap();
982        assert_eq!(data.get("key").unwrap().as_str(), Some("value"));
983
984        // Argument on next line: NOT consumed
985        let data = parse("@custom\nfoo: bar").unwrap();
986        assert_eq!(data.get("foo").unwrap().as_str(), Some("bar"));
987    }
988
989    #[test]
990    fn test_unknown_directive_value_consumes_argument() {
991        // Spec §1.18: unknown directive in value position consumes argument, returns null
992        let data = parse("key: @unknown [1, 2, 3]\nother: 42").unwrap();
993        assert!(data.get("key").unwrap().is_null(), "unknown directive value should be null");
994        assert_eq!(data.get("other").unwrap().as_int(), Some(42), "next key should parse normally");
995
996        // With object argument
997        let data = parse("key: @unknown {a: 1}\nother: ok").unwrap();
998        assert!(data.get("key").unwrap().is_null());
999        assert_eq!(data.get("other").unwrap().as_str(), Some("ok"));
1000
1001        // With simple argument
1002        let data = parse("key: @unknown 42\nother: ok").unwrap();
1003        assert!(data.get("key").unwrap().is_null());
1004        assert_eq!(data.get("other").unwrap().as_str(), Some("ok"));
1005
1006        // Without argument (just the directive)
1007        let data = parse("arr: [@unknown, 1, 2]").unwrap();
1008        let arr = data.get("arr").unwrap().as_array().unwrap();
1009        assert!(arr[0].is_null());
1010        assert_eq!(arr[1].as_int(), Some(1));
1011    }
1012
1013    // -------------------------------------------------------------------------
1014    // Object with ref keys
1015    // -------------------------------------------------------------------------
1016
1017    #[test]
1018    fn test_object_with_ref_key() {
1019        let data = parse("obj: {!base: 1, key: 2}").unwrap();
1020        let obj = data.get("obj").unwrap().as_object().unwrap();
1021        assert!(obj.contains_key("!base"));
1022        assert_eq!(obj.get("!base").unwrap().as_int(), Some(1));
1023        assert_eq!(obj.get("key").unwrap().as_int(), Some(2));
1024    }
1025
1026    // -------------------------------------------------------------------------
1027    // Nested struct in table
1028    // -------------------------------------------------------------------------
1029
1030    #[test]
1031    fn test_nested_struct_in_table() {
1032        let input = r#"
1033            @struct addr (city: string, zip: string)
1034            @struct person (name: string, home: addr)
1035            people: @table person [
1036                (alice, (Boston, "02101")),
1037                (bob, (NYC, "10001")),
1038            ]
1039        "#;
1040        let tokens = Lexer::new(input).tokenize().unwrap();
1041        let mut parser = Parser::new(tokens);
1042        let data = parser.parse().unwrap();
1043
1044        let people = data.get("people").unwrap().as_array().unwrap();
1045        let alice_home = people[0].as_object().unwrap().get("home").unwrap().as_object().unwrap();
1046        assert_eq!(alice_home.get("city").unwrap().as_str(), Some("Boston"));
1047    }
1048
1049    #[test]
1050    fn test_include_cycle_detection() {
1051        // Create a file that includes itself
1052        let dir = std::env::temp_dir();
1053        let file_path = dir.join("test_cycle_self.tl");
1054        std::fs::write(&file_path, "@include \"test_cycle_self.tl\"\nval: 1").unwrap();
1055
1056        let content = std::fs::read_to_string(&file_path).unwrap();
1057        let tokens = Lexer::new(&content).tokenize().unwrap();
1058        let mut parser = Parser::new(tokens).with_base_path(&file_path);
1059        let result = parser.parse();
1060        assert!(result.is_err(), "Should detect self-referencing include");
1061        let err_msg = result.unwrap_err().to_string();
1062        assert!(err_msg.contains("Circular include"), "Error should mention circular include: {}", err_msg);
1063
1064        std::fs::remove_file(&file_path).ok();
1065    }
1066
1067    #[test]
1068    fn test_include_mutual_cycle_detection() {
1069        // Create two files that include each other: A -> B -> A
1070        let dir = std::env::temp_dir();
1071        let file_a = dir.join("test_cycle_a.tl");
1072        let file_b = dir.join("test_cycle_b.tl");
1073        std::fs::write(&file_a, "@include \"test_cycle_b.tl\"\na_val: 1").unwrap();
1074        std::fs::write(&file_b, "@include \"test_cycle_a.tl\"\nb_val: 2").unwrap();
1075
1076        let content = std::fs::read_to_string(&file_a).unwrap();
1077        let tokens = Lexer::new(&content).tokenize().unwrap();
1078        let mut parser = Parser::new(tokens).with_base_path(&file_a);
1079        let result = parser.parse();
1080        assert!(result.is_err(), "Should detect mutual cycle between A and B");
1081        let err_msg = result.unwrap_err().to_string();
1082        assert!(err_msg.contains("Circular include"), "Error should mention circular include: {}", err_msg);
1083
1084        std::fs::remove_file(&file_a).ok();
1085        std::fs::remove_file(&file_b).ok();
1086    }
1087
1088    #[test]
1089    fn test_include_stack_propagated_to_child() {
1090        // Verify that the include_stack starts empty
1091        let parser = Parser::new(vec![]);
1092        assert!(parser.include_stack.is_empty(), "New parser should have empty include stack");
1093    }
1094
1095    // -------------------------------------------------------------------------
1096    // Bytes literal parsing
1097    // -------------------------------------------------------------------------
1098
1099    #[test]
1100    fn test_bytes_literal_value() {
1101        let data = parse(r#"payload: b"cafef00d""#).unwrap();
1102        let val = data.get("payload").unwrap();
1103        assert_eq!(val.as_bytes(), Some(&[0xca, 0xfe, 0xf0, 0x0d][..]));
1104    }
1105
1106    #[test]
1107    fn test_bytes_literal_empty_value() {
1108        let data = parse(r#"empty: b"""#).unwrap();
1109        let val = data.get("empty").unwrap();
1110        assert_eq!(val.as_bytes(), Some(&[][..]));
1111    }
1112
1113    #[test]
1114    fn test_bytes_literal_in_array() {
1115        let data = parse(r#"arr: [b"cafe", b"babe"]"#).unwrap();
1116        let arr = data.get("arr").unwrap().as_array().unwrap();
1117        assert_eq!(arr[0].as_bytes(), Some(&[0xca, 0xfe][..]));
1118        assert_eq!(arr[1].as_bytes(), Some(&[0xba, 0xbe][..]));
1119    }
1120
1121    #[test]
1122    fn test_bytes_literal_in_object() {
1123        let data = parse(r#"obj: {data: b"ff00"}"#).unwrap();
1124        let obj = data.get("obj").unwrap().as_object().unwrap();
1125        assert_eq!(obj.get("data").unwrap().as_bytes(), Some(&[0xff, 0x00][..]));
1126    }
1127
1128    // -------------------------------------------------------------------------
1129    // Fuzz regression tests (full TeaLeaf::parse path)
1130    // -------------------------------------------------------------------------
1131
1132    #[test]
1133    fn test_fuzz_deeply_nested_arrays_no_stack_overflow() {
1134        // Crafted input with 500 nested arrays — exceeds MAX_PARSE_DEPTH (256)
1135        let depth = 500;
1136        let input = format!("key: {}{}", "[".repeat(depth), "]".repeat(depth));
1137        let result = crate::TeaLeaf::parse(&input);
1138        match result {
1139            Err(e) => {
1140                let err = format!("{}", e);
1141                assert!(err.contains("nesting depth"), "Error should mention nesting depth: {}", err);
1142            }
1143            Ok(_) => panic!("Should fail with depth exceeded, not succeed"),
1144        }
1145    }
1146
1147    #[test]
1148    fn test_fuzz_deeply_nested_objects_no_stack_overflow() {
1149        // Crafted input with 500 nested objects
1150        let depth = 500;
1151        let mut input = String::from("key: ");
1152        for i in 0..depth {
1153            input.push_str(&format!("{{k{}: ", i));
1154        }
1155        input.push_str("1");
1156        for _ in 0..depth {
1157            input.push('}');
1158        }
1159        let result = crate::TeaLeaf::parse(&input);
1160        assert!(result.is_err(), "Should fail with depth exceeded, not stack overflow");
1161    }
1162
1163    #[test]
1164    fn test_fuzz_deeply_nested_tags_no_stack_overflow() {
1165        // Crafted input with 500 nested tags: :a :b :c ... value
1166        let depth = 500;
1167        let mut input = String::from("key: ");
1168        for i in 0..depth {
1169            input.push_str(&format!(":t{} ", i));
1170        }
1171        input.push_str("42");
1172        let result = crate::TeaLeaf::parse(&input);
1173        assert!(result.is_err(), "Should fail with depth exceeded, not stack overflow");
1174    }
1175
1176    #[test]
1177    fn test_parse_depth_256_succeeds() {
1178        // 200 levels of nesting should succeed (within MAX_PARSE_DEPTH=256)
1179        let depth = 200;
1180        let input = format!("key: {}1{}", "[".repeat(depth), "]".repeat(depth));
1181        let result = crate::TeaLeaf::parse(&input);
1182        if let Err(e) = &result {
1183            panic!("200 levels of nesting should be fine: {}", e);
1184        }
1185    }
1186
1187    #[test]
1188    fn test_fuzz_crash_e42e_full_parse_no_panic() {
1189        // Regression: fuzz_parse crash-e42e7ae2f5127519e7e60e87d1cbfbc2a5bf878d
1190        // Must not panic through TeaLeaf::parse (the actual fuzz path)
1191        let input = "\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{3}#\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{07FE}";
1192        let _ = crate::TeaLeaf::parse(input);
1193    }
1194
1195    #[test]
1196    fn test_fuzz_crash_d038_full_parse_no_panic() {
1197        // Regression: fuzz_parse crash-d0387cbd639a8db9789ab68057f3c58c7bebbfa5
1198        // Large input with repeated date-like patterns. Must not panic.
1199        let input = "z\" \"-\"\t; \"\"\")\"\"\" 8] ] 02)3313312)313-333-333-3332)33-133-3-33331333302)33";
1200        let _ = crate::TeaLeaf::parse(input);
1201    }
1202
1203    #[test]
1204    fn test_reject_value_only_schema_field_types() {
1205        // Spec §2.1: object, map, tuple, ref, tagged are value types, not schema field types
1206        for bad_type in &["object", "map", "tuple", "ref", "tagged"] {
1207            let input = format!("@struct Bad (field: {})\n", bad_type);
1208            let result = crate::TeaLeaf::parse(&input);
1209            assert!(result.is_err(), "should reject '{}' as schema field type", bad_type);
1210            let err = format!("{}", result.err().unwrap());
1211            assert!(err.contains("value type"), "error for '{}' should mention 'value type': {}", bad_type, err);
1212        }
1213        // Array of value-only type should also be rejected
1214        let result = crate::TeaLeaf::parse("@struct Bad (field: []object)\n");
1215        assert!(result.is_err(), "should reject '[]object' as schema field type");
1216
1217        // Valid types should still work
1218        for good_type in &["string", "int", "int8", "float", "bool", "bytes", "timestamp", "MyStruct"] {
1219            let input = format!("@struct Good (field: {})\n", good_type);
1220            assert!(crate::TeaLeaf::parse(&input).is_ok(), "'{}' should be accepted", good_type);
1221        }
1222    }
1223
1224    #[test]
1225    fn test_parse_struct_with_quoted_fields() {
1226        // Quoted field names in @struct definitions (e.g. JSON-LD @type)
1227        let input = "@struct foo(\"@type\":string, name:string)\ndata:@table foo[(A,x),(B,y)]\n";
1228        let doc = crate::TeaLeaf::parse(input).unwrap();
1229        let arr = doc.get("data").unwrap().as_array().unwrap();
1230        assert_eq!(arr.len(), 2);
1231
1232        let first = arr[0].as_object().unwrap();
1233        assert_eq!(first.get("@type").unwrap().as_str(), Some("A"));
1234        assert_eq!(first.get("name").unwrap().as_str(), Some("x"));
1235
1236        let second = arr[1].as_object().unwrap();
1237        assert_eq!(second.get("@type").unwrap().as_str(), Some("B"));
1238        assert_eq!(second.get("name").unwrap().as_str(), Some("y"));
1239    }
1240}