sif_parser/
parse.rs

1// SIF Core v1 — Document parser.
2//
3// Built from the Structured Interchange Format specification,
4// following the collected ABNF in Appendix A.
5//
6// This module provides `parse()` for full-document parsing and
7// individual parsing functions for types, schemas, and values.
8
9use crate::error::{err, ErrorKind, Result};
10use crate::types::*;
11use std::collections::HashMap;
12
13// ── Public API ──────────────────────────────────────────────────────
14
15/// Parse a complete SIF document from a string.
16pub fn parse(input: &str) -> Result<Document> {
17    let mut parser = DocumentParser::new(input);
18    parser.parse()
19}
20
21/// Parse a `#schema` field-definition list (without the `#schema` prefix).
22pub fn parse_schema(input: &str) -> Result<Schema> {
23    parse_schema_str(input, 0)
24}
25
26/// Parse a SIF type string (e.g., `"str[]?"`, `"enum(a,b,c)"`).
27pub fn parse_type_str(input: &str) -> Result<Type> {
28    let (ty, rest) = parse_type(input)?;
29    if !rest.is_empty() {
30        return Err(err(
31            ErrorKind::InvalidType,
32            0,
33            format!("trailing characters after type: {:?}", rest),
34        ));
35    }
36    Ok(ty)
37}
38
39/// Parse a single field value according to a declared type.
40pub fn parse_typed_value(input: &str, ty: &Type) -> Result<Value> {
41    parse_value_typed(input, ty, 0)
42}
43
44/// Parse a single field value using untyped disambiguation (§9.2).
45pub fn parse_untyped_value(input: &str) -> Value {
46    parse_value_untyped(input)
47}
48
49/// Parse inline semantic annotations in a string (§16.3).
50pub fn parse_inline_annotations(input: &str) -> Vec<Span> {
51    parse_spans(input)
52}
53
54/// Parse an inline SIF expression (§21): `sif::#schema ...::record::record`.
55pub fn parse_inline_sif(input: &str) -> Result<Document> {
56    if !input.starts_with("sif::") {
57        return Err(err(ErrorKind::InvalidHeader, 0, "inline SIF must start with 'sif::'"));
58    }
59    let rest = &input[5..];
60    let parts: Vec<&str> = rest.split("::").collect();
61    // Reconstruct as a normal SIF document with newlines.
62    let mut doc_str = String::from("#!sif v1\n");
63    for part in parts {
64        doc_str.push_str(part);
65        doc_str.push('\n');
66    }
67    parse(&doc_str)
68}
69
70// ── Type Parsing (§7, Appendix A) ───────────────────────────────────
71
72/// Parse a type from the start of `input`, returning (Type, remaining).
73///
74/// ABNF:
75///   type       = base-type *type-suffix
76///   base-type  = scalar-type / "map"
77///   type-suffix = "[]" / "?"
78fn parse_type(input: &str) -> Result<(Type, &str)> {
79    let (base, mut rest) = parse_base_type(input)?;
80
81    // Apply suffixes left-to-right: str[]? → Nullable(Array(Str))
82    let mut ty = base;
83    loop {
84        if rest.starts_with("[]") {
85            ty = Type::Array(Box::new(ty));
86            rest = &rest[2..];
87        } else if rest.starts_with('?') {
88            ty = Type::Nullable(Box::new(ty));
89            rest = &rest[1..];
90        } else {
91            break;
92        }
93    }
94    Ok((ty, rest))
95}
96
97fn parse_base_type(input: &str) -> Result<(Type, &str)> {
98    // Try each keyword, longest match first for datetime vs date.
99    let keywords: &[(&str, Type)] = &[
100        ("datetime", Type::DateTime),
101        ("duration", Type::Duration),
102        ("float", Type::Float),
103        ("bool", Type::Bool),
104        ("uint", Type::Uint),
105        ("int", Type::Int),
106        ("str", Type::Str),
107        ("date", Type::Date),
108        ("bytes", Type::Bytes),
109        ("null", Type::Null),
110        ("any", Type::Any),
111        ("map", Type::Map),
112    ];
113
114    for (kw, ty) in keywords {
115        if input.starts_with(kw) {
116            let after = &input[kw.len()..];
117            // Make sure it's a full token (not e.g. "integer")
118            if after.is_empty()
119                || after.starts_with("[]")
120                || after.starts_with('?')
121                || after.starts_with(':')
122                || after.starts_with('|')
123                || after.starts_with(' ')
124                || after.starts_with(',')
125                || after.starts_with(')')
126            {
127                return Ok((ty.clone(), after));
128            }
129        }
130    }
131
132    // Try enum(a,b,c)
133    if input.starts_with("enum(") {
134        let after_paren = &input[5..];
135        let close = after_paren
136            .find(')')
137            .ok_or_else(|| err(ErrorKind::InvalidType, 0, "unterminated enum type"))?;
138        let variants_str = &after_paren[..close];
139        let variants: Vec<String> = variants_str.split(',').map(|s| s.trim().to_string()).collect();
140        if variants.iter().any(|v| v.is_empty()) {
141            return Err(err(ErrorKind::InvalidType, 0, "empty enum variant"));
142        }
143        // Validate variant characters: ALPHA / DIGIT / "_" / "-"
144        for v in &variants {
145            if !v.chars().all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-') {
146                return Err(err(
147                    ErrorKind::InvalidType,
148                    0,
149                    format!("invalid enum variant: {:?}", v),
150                ));
151            }
152        }
153        return Ok((Type::Enum(variants), &after_paren[close + 1..]));
154    }
155
156    Err(err(
157        ErrorKind::InvalidType,
158        0,
159        format!("unrecognized type at: {:?}", &input[..input.len().min(20)]),
160    ))
161}
162
163// ── Schema Parsing (§8) ─────────────────────────────────────────────
164
165/// Parse the field definitions from a schema directive body.
166///
167/// `line_num` is used for error reporting.
168pub(crate) fn parse_schema_str(input: &str, line_num: usize) -> Result<Schema> {
169    let input = input.trim();
170    if input.is_empty() {
171        return Err(err(ErrorKind::InvalidSchema, line_num, "empty schema"));
172    }
173
174    let mut fields = Vec::new();
175    let mut seen_names: HashMap<&str, usize> = HashMap::new();
176
177    // Split on spaces, but we need to handle enum(...) which contains no spaces
178    // so a simple split works here per the ABNF.
179    let tokens = split_schema_fields(input);
180
181    for token in &tokens {
182        let field = parse_field_def(token, line_num)?;
183        if let Some(prev) = seen_names.get(field.name.as_str()) {
184            return Err(err(
185                ErrorKind::DuplicateField,
186                line_num,
187                format!(
188                    "duplicate field name {:?} (first at position {})",
189                    field.name,
190                    prev + 1
191                ),
192            ));
193        }
194        seen_names.insert(unsafe {
195            // SAFETY: field.name is allocated and lives as long as `fields`
196            // We only use this for duplicate detection within this function.
197            std::mem::transmute::<&str, &str>(field.name.as_str())
198        }, fields.len());
199        fields.push(field);
200    }
201
202    Ok(Schema { fields })
203}
204
/// Splits schema field definitions on spaces, leaving spaces inside
/// `enum(...)` parentheses and inside double-quoted modifier values alone.
///
/// Inside a quoted value a backslash escapes the next character, so an
/// escaped quote (`\"`) does not end the quoted region. Each token is
/// trimmed and empty tokens are dropped, so runs of spaces and
/// leading/trailing spaces produce no entries. An unmatched `)` is ignored
/// rather than driving the nesting depth below zero.
fn split_schema_fields(input: &str) -> Vec<&str> {
    let mut fields = Vec::new();
    let mut start = 0;
    let mut paren_depth = 0usize;
    let mut in_quotes = false;
    let mut escaped = false;

    for (i, c) in input.char_indices() {
        if escaped {
            // The character after a backslash inside quotes is literal.
            escaped = false;
            continue;
        }
        match c {
            '\\' if in_quotes => escaped = true,
            '"' => in_quotes = !in_quotes,
            '(' if !in_quotes => paren_depth += 1,
            ')' if !in_quotes => paren_depth = paren_depth.saturating_sub(1),
            ' ' if paren_depth == 0 && !in_quotes => {
                let token = input[start..i].trim();
                if !token.is_empty() {
                    fields.push(token);
                }
                start = i + 1;
            }
            _ => {}
        }
    }

    let last = input[start..].trim();
    if !last.is_empty() {
        fields.push(last);
    }
    fields
}
238
239/// Parse a single field definition per §8 / §8.5 / §8.6:
240///
241///   field-def = [ "∅" ] field-name ":" type [ ":" semantic ]
242///               [ "|" modifier *( "," modifier ) ]
243fn parse_field_def(input: &str, line_num: usize) -> Result<FieldDef> {
244    let mut s = input;
245
246    // §8.5: Check for deprecated prefix ∅ (U+2205)
247    let deprecated = if s.starts_with('∅') {
248        s = &s['∅'.len_utf8()..];
249        true
250    } else {
251        false
252    };
253
254    // §8.6: Split on | for modifiers (at most one split, outside parens)
255    let (left, modifiers_str) = split_on_pipe(s);
256
257    // Parse modifiers
258    let modifiers = if let Some(mods) = modifiers_str {
259        parse_modifiers(mods, line_num)?
260    } else {
261        Vec::new()
262    };
263
264    // Split left side on : for name:type[:semantic]
265    // But we must be careful with enum(a,b,c) which doesn't contain ':'
266    // at the outer level, and with type suffixes like str[]?
267    let parts = split_field_parts(left);
268
269    if parts.len() < 2 {
270        return Err(err(
271            ErrorKind::InvalidSchema,
272            line_num,
273            format!("field definition must have at least name:type, got {:?}", input),
274        ));
275    }
276
277    let name = parts[0].to_string();
278    // Validate field name: 1-63 chars of ALPHA / DIGIT / "_" / "."
279    if name.is_empty() || name.len() > 63 {
280        return Err(err(
281            ErrorKind::InvalidSchema,
282            line_num,
283            format!("field name must be 1-63 characters, got {:?}", name),
284        ));
285    }
286    if !name
287        .chars()
288        .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.')
289    {
290        return Err(err(
291            ErrorKind::InvalidSchema,
292            line_num,
293            format!("invalid field name characters: {:?}", name),
294        ));
295    }
296
297    let (field_type, _) = parse_type(parts[1])
298        .map_err(|e| err(ErrorKind::InvalidType, line_num, e.message))?;
299
300    let semantic = if parts.len() >= 3 && !parts[2].is_empty() {
301        Some(parts[2].to_string())
302    } else {
303        None
304    };
305
306    Ok(FieldDef {
307        name,
308        field_type,
309        semantic,
310        deprecated,
311        modifiers,
312    })
313}
314
/// Splits `input` at the first `|` that is neither inside parentheses nor
/// inside double quotes.
///
/// Returns `(before, Some(after))` when such a `|` exists (the `|` itself is
/// dropped), otherwise `(input, None)`. An unmatched `)` is ignored.
fn split_on_pipe(input: &str) -> (&str, Option<&str>) {
    let mut depth = 0usize;
    let mut quoted = false;

    let pipe_at = input.char_indices().find_map(|(idx, ch)| {
        match ch {
            '"' => quoted = !quoted,
            '(' if !quoted => depth += 1,
            ')' if !quoted => depth = depth.saturating_sub(1),
            '|' if !quoted && depth == 0 => return Some(idx),
            _ => {}
        }
        None
    });

    match pipe_at {
        Some(idx) => (&input[..idx], Some(&input[idx + 1..])),
        None => (input, None),
    }
}
336
/// Splits the `name:type[:semantic]` portion of a field definition.
///
/// The first `:` outside parentheses ends the name; the next `:` outside
/// parentheses (if any) ends the type, and everything after it — further
/// colons included — is the semantic part. Colons inside `enum(...)` are
/// therefore never treated as separators.
///
/// Returns one element when there is no top-level `:`, two when there is
/// one, and three otherwise.
fn split_field_parts(input: &str) -> Vec<&str> {
    /// Byte index of the first `:` at parenthesis depth zero, if any.
    fn top_level_colon(text: &str) -> Option<usize> {
        let mut depth = 0usize;
        for (idx, ch) in text.char_indices() {
            match ch {
                '(' => depth += 1,
                ')' => depth = depth.saturating_sub(1),
                ':' if depth == 0 => return Some(idx),
                _ => {}
            }
        }
        None
    }

    match top_level_colon(input) {
        None => vec![input],
        Some(name_end) => {
            let name = &input[..name_end];
            let remainder = &input[name_end + 1..];
            match top_level_colon(remainder) {
                Some(type_end) => vec![
                    name,
                    &remainder[..type_end],
                    &remainder[type_end + 1..],
                ],
                None => vec![name, remainder],
            }
        }
    }
}
402
403/// Parse comma-separated modifiers: `mod1,mod2=val,mod3="quoted"`.
404fn parse_modifiers(input: &str, line_num: usize) -> Result<Vec<Modifier>> {
405    let mut modifiers = Vec::new();
406    if input.is_empty() {
407        return Ok(modifiers);
408    }
409
410    // Split on commas, respecting quotes
411    let tokens = split_on_comma_unquoted(input);
412
413    for token in tokens {
414        let token = token.trim();
415        if token.is_empty() {
416            continue;
417        }
418        if let Some(eq_pos) = token.find('=') {
419            let name = token[..eq_pos].to_string();
420            let raw_value = &token[eq_pos + 1..];
421            let value = if raw_value.starts_with('"') {
422                // Unquote
423                parse_quoted_string_value(raw_value)
424                    .map_err(|e| err(ErrorKind::InvalidSchema, line_num, e.message))?
425            } else {
426                raw_value.to_string()
427            };
428            modifiers.push(Modifier {
429                name,
430                value: Some(value),
431            });
432        } else {
433            modifiers.push(Modifier {
434                name: token.to_string(),
435                value: None,
436            });
437        }
438    }
439    Ok(modifiers)
440}
441
/// Splits `input` on commas that are not inside a double-quoted region.
///
/// Inside quotes a backslash escapes the next character, so an escaped
/// quote (`\"`) does not close the region. Pieces are returned verbatim
/// (not trimmed, empty pieces kept); the result always has at least one
/// element.
fn split_on_comma_unquoted(input: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    let mut start = 0;
    let mut in_quotes = false;
    let mut escaped = false;

    for (i, c) in input.char_indices() {
        if escaped {
            // The character after a backslash inside quotes is literal.
            escaped = false;
            continue;
        }
        match c {
            '\\' if in_quotes => escaped = true,
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                parts.push(&input[start..i]);
                start = i + 1;
            }
            _ => {}
        }
    }
    parts.push(&input[start..]);
    parts
}
460
461// ── Value Parsing (§9–§15) ──────────────────────────────────────────
462
463/// Parse a value according to a known type (§9.1).
464fn parse_value_typed(input: &str, ty: &Type, line_num: usize) -> Result<Value> {
465    // §10: Null literal.
466    // Spec says `_` is only valid for nullable types, but real-world SIF
467    // files commonly use `_` in non-nullable fields to mean "no value."
468    // We accept it leniently for all types.
469    if input == "_" {
470        return Ok(Value::Null);
471    }
472
473    // §9.3: Empty field
474    if input.is_empty() {
475        return match ty {
476            Type::Str | Type::Any => Ok(Value::Str(String::new())),
477            Type::Nullable(_) => Ok(Value::Null),
478            _ => Err(err(
479                ErrorKind::TypeMismatch,
480                line_num,
481                format!("empty field for non-nullable type {}", ty),
482            )),
483        };
484    }
485
486    match ty {
487        Type::Bool => parse_bool(input, line_num),
488        Type::Int => parse_int(input, line_num),
489        Type::Uint => parse_uint(input, line_num),
490        Type::Float => parse_float(input, line_num),
491        Type::Str => Ok(Value::Str(parse_string_value(input)?)),
492        Type::Date => {
493            let s = parse_string_value(input)?;
494            validate_date(&s, line_num)?;
495            Ok(Value::Date(s))
496        }
497        Type::DateTime => {
498            let s = parse_string_value(input)?;
499            validate_datetime(&s, line_num)?;
500            Ok(Value::DateTime(s))
501        }
502        Type::Duration => {
503            let s = parse_string_value(input)?;
504            validate_duration(&s, line_num)?;
505            Ok(Value::Duration(s))
506        }
507        Type::Bytes => {
508            let s = parse_string_value(input)?;
509            let bytes = base64_decode(&s)
510                .map_err(|e| err(ErrorKind::TypeMismatch, line_num, e))?;
511            Ok(Value::Bytes(bytes))
512        }
513        Type::Enum(variants) => {
514            let s = parse_string_value(input)?;
515            if !variants.contains(&s) {
516                return Err(err(
517                    ErrorKind::TypeMismatch,
518                    line_num,
519                    format!("value {:?} not in enum({})", s, variants.join(",")),
520                ));
521            }
522            Ok(Value::Enum(s))
523        }
524        Type::Null => {
525            // Only `_` is valid for the null type, handled above.
526            Err(err(
527                ErrorKind::TypeMismatch,
528                line_num,
529                format!("expected null ('_'), got {:?}", input),
530            ))
531        }
532        Type::Any => Ok(parse_value_untyped(input)),
533        Type::Map => parse_map_value(input, line_num),
534        Type::Array(elem_ty) => parse_array_value(input, elem_ty, line_num),
535        Type::Nullable(inner) => parse_value_typed(input, inner, line_num),
536    }
537}
538
539/// Parse a value using untyped disambiguation rules (§9.2).
540fn parse_value_untyped(input: &str) -> Value {
541    // 1. null
542    if input == "_" {
543        return Value::Null;
544    }
545    // 2. bool (only T/F in untyped context)
546    if input == "T" {
547        return Value::Bool(true);
548    }
549    if input == "F" {
550        return Value::Bool(false);
551    }
552    // 3. array
553    if input.starts_with('[') && input.ends_with(']') {
554        if let Ok(v) = parse_array_value(input, &Type::Any, 0) {
555            return v;
556        }
557    }
558    // 4. map
559    if input.starts_with('{') && input.ends_with('}') {
560        if let Ok(v) = parse_map_value(input, 0) {
561            return v;
562        }
563    }
564    // 5. signed integer
565    if is_int_literal(input) {
566        if let Ok(n) = input.parse::<i64>() {
567            return Value::Int(n);
568        }
569    }
570    // 6. float (contains '.')
571    if input.contains('.') {
572        if let Ok(n) = input.parse::<f64>() {
573            if n.is_finite() {
574                return Value::Float(n);
575            }
576        }
577    }
578    // 7. string
579    if input.starts_with('"') {
580        if let Ok(s) = parse_quoted_string_value(input) {
581            return Value::Str(s);
582        }
583    }
584    Value::Str(input.to_string())
585}
586
/// Returns `true` when `s` is one or more ASCII digits, optionally preceded
/// by a single `-`. A leading `+`, an empty string and a lone `-` are all
/// rejected.
fn is_int_literal(s: &str) -> bool {
    let digits = match s.strip_prefix('-') {
        Some(unsigned) => unsigned,
        None => s,
    };
    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
}
591
592fn parse_bool(input: &str, line_num: usize) -> Result<Value> {
593    match input {
594        "T" | "true" | "TRUE" | "True" | "1" => Ok(Value::Bool(true)),
595        "F" | "false" | "FALSE" | "False" | "0" => Ok(Value::Bool(false)),
596        _ => Err(err(
597            ErrorKind::TypeMismatch,
598            line_num,
599            format!("invalid bool: {:?}", input),
600        )),
601    }
602}
603
604fn parse_int(input: &str, line_num: usize) -> Result<Value> {
605    input
606        .parse::<i64>()
607        .map(Value::Int)
608        .map_err(|_| err(ErrorKind::TypeMismatch, line_num, format!("invalid int: {:?}", input)))
609}
610
611fn parse_uint(input: &str, line_num: usize) -> Result<Value> {
612    if input.starts_with('-') {
613        return Err(err(
614            ErrorKind::TypeMismatch,
615            line_num,
616            format!("negative value for uint: {:?}", input),
617        ));
618    }
619    input
620        .parse::<u64>()
621        .map(Value::Uint)
622        .map_err(|_| err(ErrorKind::TypeMismatch, line_num, format!("invalid uint: {:?}", input)))
623}
624
625fn parse_float(input: &str, line_num: usize) -> Result<Value> {
626    let n: f64 = input
627        .parse()
628        .map_err(|_| err(ErrorKind::TypeMismatch, line_num, format!("invalid float: {:?}", input)))?;
629    if !n.is_finite() {
630        return Err(err(
631            ErrorKind::TypeMismatch,
632            line_num,
633            "NaN/Inf not allowed in SIF floats",
634        ));
635    }
636    Ok(Value::Float(n))
637}
638
639// ── String Parsing (§11) ────────────────────────────────────────────
640
641/// Parse a string value — unquoted passthrough or quoted with escaping.
642fn parse_string_value(input: &str) -> Result<String> {
643    if input.starts_with('"') {
644        parse_quoted_string_value(input)
645    } else {
646        Ok(input.to_string())
647    }
648}
649
650/// Parse a quoted string: `"content with \"escapes\""`.
651fn parse_quoted_string_value(input: &str) -> Result<String> {
652    if !input.starts_with('"') {
653        return Err(err(ErrorKind::InvalidString, 0, "expected opening quote"));
654    }
655
656    let inner = &input[1..];
657    let mut result = String::new();
658    let mut chars = inner.chars();
659    loop {
660        match chars.next() {
661            None => {
662                return Err(err(ErrorKind::InvalidString, 0, "unterminated quoted string"));
663            }
664            Some('"') => {
665                // End of string — verify nothing follows except whitespace
666                // (in record context, the rest of the field ends at the tab)
667                break;
668            }
669            Some('\\') => {
670                match chars.next() {
671                    Some('n') => result.push('\n'),
672                    Some('t') => result.push('\t'),
673                    Some('\\') => result.push('\\'),
674                    Some('"') => result.push('"'),
675                    Some(c) => {
676                        return Err(err(
677                            ErrorKind::InvalidString,
678                            0,
679                            format!("invalid escape sequence: \\{}", c),
680                        ));
681                    }
682                    None => {
683                        return Err(err(
684                            ErrorKind::InvalidString,
685                            0,
686                            "unterminated escape sequence",
687                        ));
688                    }
689                }
690            }
691            Some(c) => result.push(c),
692        }
693    }
694    Ok(result)
695}
696
697// ── Array Parsing (§14) ─────────────────────────────────────────────
698
699fn parse_array_value(input: &str, elem_ty: &Type, line_num: usize) -> Result<Value> {
700    if !input.starts_with('[') || !input.ends_with(']') {
701        return Err(err(
702            ErrorKind::InvalidArray,
703            line_num,
704            format!("expected array literal, got {:?}", input),
705        ));
706    }
707    let inner = &input[1..input.len() - 1];
708    if inner.is_empty() {
709        return Ok(Value::Array(Vec::new()));
710    }
711
712    let elements = split_array_elements(inner);
713    let mut values = Vec::with_capacity(elements.len());
714    for elem in &elements {
715        let v = parse_value_typed(elem, elem_ty, line_num)?;
716        values.push(v);
717    }
718    Ok(Value::Array(values))
719}
720
/// Splits array (or map) contents on top-level commas, i.e. commas that are
/// not inside `[...]`, `{...}` or a double-quoted string.
///
/// Inside quotes a backslash escapes the next character, so an escaped
/// quote (`\"`) does not close the string. An unmatched `]` or `}` is
/// ignored rather than driving the nesting depth negative, which would
/// otherwise suppress every later split. Pieces are returned verbatim (not
/// trimmed, empty pieces kept); the result always has at least one element.
fn split_array_elements(input: &str) -> Vec<&str> {
    let mut elements = Vec::new();
    let mut start = 0;
    let mut bracket_depth = 0usize;
    let mut brace_depth = 0usize;
    let mut in_quotes = false;
    let mut escaped = false;

    for (i, c) in input.char_indices() {
        if escaped {
            // The character after a backslash inside quotes is literal.
            escaped = false;
            continue;
        }
        match c {
            '\\' if in_quotes => escaped = true,
            '"' => in_quotes = !in_quotes,
            '[' if !in_quotes => bracket_depth += 1,
            ']' if !in_quotes => bracket_depth = bracket_depth.saturating_sub(1),
            '{' if !in_quotes => brace_depth += 1,
            '}' if !in_quotes => brace_depth = brace_depth.saturating_sub(1),
            ',' if !in_quotes && bracket_depth == 0 && brace_depth == 0 => {
                elements.push(&input[start..i]);
                start = i + 1;
            }
            _ => {}
        }
    }
    elements.push(&input[start..]);
    elements
}
746
747// ── Map Parsing (§15) ───────────────────────────────────────────────
748
749fn parse_map_value(input: &str, line_num: usize) -> Result<Value> {
750    if !input.starts_with('{') || !input.ends_with('}') {
751        return Err(err(
752            ErrorKind::InvalidMap,
753            line_num,
754            format!("expected map literal, got {:?}", input),
755        ));
756    }
757    let inner = &input[1..input.len() - 1];
758    if inner.is_empty() {
759        return Ok(Value::Map(Vec::new()));
760    }
761
762    let entries_str = split_array_elements(inner); // reuse comma splitter
763    let mut entries = Vec::with_capacity(entries_str.len());
764    for entry in &entries_str {
765        let colon_pos = entry.find(':').ok_or_else(|| {
766            err(
767                ErrorKind::InvalidMap,
768                line_num,
769                format!("map entry missing ':' separator: {:?}", entry),
770            )
771        })?;
772        let key = &entry[..colon_pos];
773        // Validate key: 1*(ALPHA / DIGIT / "_")
774        if key.is_empty() || !key.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
775            return Err(err(
776                ErrorKind::InvalidMap,
777                line_num,
778                format!("invalid map key: {:?}", key),
779            ));
780        }
781        let val_str = &entry[colon_pos + 1..];
782        let val = parse_value_untyped(val_str);
783        entries.push((key.to_string(), val));
784    }
785    Ok(Value::Map(entries))
786}
787
788// ── Date/Time Validation (§7.3) ─────────────────────────────────────
789
790fn validate_date(s: &str, line_num: usize) -> Result<()> {
791    // YYYY-MM-DD
792    if s.len() < 10 {
793        return Err(err(ErrorKind::TypeMismatch, line_num, format!("invalid date: {:?}", s)));
794    }
795    let bytes = s.as_bytes();
796    if bytes[4] != b'-' || bytes[7] != b'-' {
797        return Err(err(ErrorKind::TypeMismatch, line_num, format!("invalid date format: {:?}", s)));
798    }
799    // Basic digit check
800    for &i in &[0, 1, 2, 3, 5, 6, 8, 9] {
801        if !bytes[i].is_ascii_digit() {
802            return Err(err(ErrorKind::TypeMismatch, line_num, format!("invalid date: {:?}", s)));
803        }
804    }
805    Ok(())
806}
807
808fn validate_datetime(s: &str, line_num: usize) -> Result<()> {
809    // Must start with a valid date
810    if s.len() < 10 {
811        return Err(err(ErrorKind::TypeMismatch, line_num, format!("invalid datetime: {:?}", s)));
812    }
813    validate_date(&s[..10], line_num)?;
814    // After the date there should be 'T' followed by time
815    if s.len() > 10 && s.as_bytes()[10] != b'T' {
816        return Err(err(
817            ErrorKind::TypeMismatch,
818            line_num,
819            format!("invalid datetime separator (expected 'T'): {:?}", s),
820        ));
821    }
822    Ok(())
823}
824
825fn validate_duration(s: &str, line_num: usize) -> Result<()> {
826    if !s.starts_with('P') {
827        return Err(err(
828            ErrorKind::TypeMismatch,
829            line_num,
830            format!("duration must start with 'P': {:?}", s),
831        ));
832    }
833    Ok(())
834}
835
836// ── Base64 Decoding ─────────────────────────────────────────────────
837
/// Decodes standard-alphabet base64 (`A-Z`, `a-z`, `0-9`, `+`, `/`).
///
/// Trailing `=` padding is stripped before decoding, so both padded and
/// unpadded input are accepted. Symbols are consumed four at a time; a final
/// group of two or three symbols yields one or two bytes respectively.
///
/// # Errors
///
/// Returns a message when the input contains a character outside the
/// alphabet, or when the symbol count (after removing padding) leaves a
/// single dangling symbol — six bits cannot encode a byte, so such input is
/// rejected instead of producing a bogus trailing byte.
fn base64_decode(input: &str) -> std::result::Result<Vec<u8>, String> {
    /// Maps one base64 symbol to its 6-bit value.
    fn char_val(c: u8) -> std::result::Result<u8, String> {
        match c {
            b'A'..=b'Z' => Ok(c - b'A'),
            b'a'..=b'z' => Ok(c - b'a' + 26),
            b'0'..=b'9' => Ok(c - b'0' + 52),
            b'+' => Ok(62),
            b'/' => Ok(63),
            _ => Err(format!("invalid base64 character: {:?}", c as char)),
        }
    }

    let symbols = input.trim_end_matches('=').as_bytes();
    if symbols.len() % 4 == 1 {
        return Err(format!(
            "invalid base64 length: {} symbols leave an incomplete byte",
            symbols.len()
        ));
    }

    let mut out = Vec::with_capacity(symbols.len() * 3 / 4);
    for chunk in symbols.chunks(4) {
        // Pack up to four 6-bit values into the low 24 bits, first symbol highest.
        let mut packed: u32 = 0;
        for (slot, &symbol) in chunk.iter().enumerate() {
            packed |= u32::from(char_val(symbol)?) << (18 - 6 * slot);
        }
        // Big-endian bytes are [0, b0, b1, b2]; a chunk of k symbols (k is
        // 2, 3 or 4 here) carries k - 1 complete bytes.
        out.extend_from_slice(&packed.to_be_bytes()[1..chunk.len()]);
    }
    Ok(out)
}
872
873// ── Header Parsing (§6) ─────────────────────────────────────────────
874
875fn parse_header(line: &str, line_num: usize) -> Result<Header> {
876    let s = line.trim_end();
877    if !s.starts_with("#!sif ") {
878        return Err(err(ErrorKind::InvalidHeader, line_num, "header must start with '#!sif '"));
879    }
880    let rest = &s[6..]; // after "#!sif "
881
882    // Tokenize respecting quoted attribute values
883    let tokens = tokenize_header(rest);
884    if tokens.is_empty() {
885        return Err(err(ErrorKind::InvalidHeader, line_num, "missing version"));
886    }
887
888    // First token must be vN
889    let version_str = &tokens[0];
890    if !version_str.starts_with('v') {
891        return Err(err(
892            ErrorKind::InvalidHeader,
893            line_num,
894            format!("expected version like 'v1', got {:?}", version_str),
895        ));
896    }
897    let version: u32 = version_str[1..]
898        .parse()
899        .map_err(|_| err(ErrorKind::InvalidHeader, line_num, "invalid version number"))?;
900
901    if version != 1 {
902        return Err(err(
903            ErrorKind::UnsupportedVersion,
904            line_num,
905            format!("unsupported SIF version {}, only v1 is supported", version),
906        ));
907    }
908
909    // Remaining tokens are key=value attributes
910    let mut attributes = HashMap::new();
911    for token in &tokens[1..] {
912        if let Some(eq_pos) = token.find('=') {
913            let key = &token[..eq_pos];
914            let raw_val = &token[eq_pos + 1..];
915            let val = if raw_val.starts_with('"') {
916                parse_quoted_string_value(raw_val)
917                    .map_err(|e| err(ErrorKind::InvalidHeader, line_num, e.message))?
918            } else {
919                raw_val.to_string()
920            };
921            attributes.insert(key.to_string(), val);
922        } else {
923            return Err(err(
924                ErrorKind::InvalidHeader,
925                line_num,
926                format!("invalid header attribute (missing '='): {:?}", token),
927            ));
928        }
929    }
930
931    Ok(Header { version, attributes })
932}
933
/// Tokenize header content on spaces, respecting quoted values.
///
/// Tokens are separated by one or more ASCII spaces; empty tokens are
/// dropped. A space inside a double-quoted region does not split. Inside
/// quotes a backslash escapes the following byte, so an escaped quote (`\"`)
/// does not end the quoted region. The returned slices borrow from `input`
/// and still contain their quotes and escapes; decoding is left to the caller.
fn tokenize_header(input: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    let mut start = 0;
    let mut in_quotes = false;
    // Set when the previous byte was a backslash inside quotes.
    let mut escaped = false;

    for (i, &byte) in input.as_bytes().iter().enumerate() {
        if escaped {
            // This byte is escaped: it can neither toggle quoting nor split.
            escaped = false;
            continue;
        }
        match byte {
            b'\\' if in_quotes => escaped = true,
            b'"' => in_quotes = !in_quotes,
            b' ' if !in_quotes => {
                // Splits only happen at ASCII spaces, so slicing stays on
                // UTF-8 character boundaries.
                if start < i {
                    tokens.push(&input[start..i]);
                }
                start = i + 1;
            }
            _ => {}
        }
    }

    if start < input.len() {
        tokens.push(&input[start..]);
    }
    tokens
}
962
963// ── Directive Parsing (§18) ─────────────────────────────────────────
964
965fn parse_directive(line: &str, line_num: usize) -> Result<Option<Directive>> {
966    let s = line.trim_end();
967
968    if s == "#recall schema" {
969        return Ok(Some(Directive::Recall));
970    }
971
972    if let Some(rest) = s.strip_prefix("#context ") {
973        return Ok(Some(Directive::Context(rest.to_string())));
974    }
975    if let Some(rest) = s.strip_prefix("#source ") {
976        return Ok(Some(Directive::Source(rest.to_string())));
977    }
978    if let Some(rest) = s.strip_prefix("#license ") {
979        return Ok(Some(Directive::License(rest.to_string())));
980    }
981    if let Some(rest) = s.strip_prefix("#error ") {
982        return Ok(Some(Directive::Error(rest.to_string())));
983    }
984    if let Some(rest) = s.strip_prefix("#filter ") {
985        return Ok(Some(Directive::Filter(rest.to_string())));
986    }
987    if let Some(rest) = s.strip_prefix("#sort ") {
988        let parts: Vec<&str> = rest.splitn(2, ' ').collect();
989        let field = parts[0].to_string();
990        let direction = if parts.len() > 1 {
991            match parts[1] {
992                "asc" => SortDirection::Asc,
993                "desc" => SortDirection::Desc,
994                _ => {
995                    return Err(err(
996                        ErrorKind::InvalidDirective,
997                        line_num,
998                        format!("invalid sort direction: {:?}", parts[1]),
999                    ));
1000                }
1001            }
1002        } else {
1003            SortDirection::Asc
1004        };
1005        return Ok(Some(Directive::Sort { field, direction }));
1006    }
1007    if let Some(rest) = s.strip_prefix("#limit ") {
1008        let n: u64 = rest
1009            .trim()
1010            .parse()
1011            .map_err(|_| err(ErrorKind::InvalidDirective, line_num, "invalid limit value"))?;
1012        return Ok(Some(Directive::Limit(n)));
1013    }
1014    if let Some(rest) = s.strip_prefix("#truncated") {
1015        let attrs_str = rest.trim();
1016        let attrs = parse_header_attrs(attrs_str, line_num)?;
1017        return Ok(Some(Directive::Truncated(attrs)));
1018    }
1019    if let Some(rest) = s.strip_prefix("#relation ") {
1020        return parse_relation(rest, line_num).map(Some);
1021    }
1022
1023    // §18.8: Unknown directives — skip without error.
1024    if s.starts_with('#') {
1025        let name_end = s[1..]
1026            .find(' ')
1027            .map(|i| i + 1)
1028            .unwrap_or(s.len());
1029        let name = &s[1..name_end];
1030        let content = if name_end < s.len() {
1031            s[name_end + 1..].to_string()
1032        } else {
1033            String::new()
1034        };
1035        return Ok(Some(Directive::Unknown {
1036            name: name.to_string(),
1037            content,
1038        }));
1039    }
1040
1041    Ok(None)
1042}
1043
1044fn parse_header_attrs(input: &str, line_num: usize) -> Result<Vec<(String, String)>> {
1045    let mut attrs = Vec::new();
1046    if input.is_empty() {
1047        return Ok(attrs);
1048    }
1049    let tokens = tokenize_header(input);
1050    for token in tokens {
1051        if let Some(eq_pos) = token.find('=') {
1052            let key = token[..eq_pos].to_string();
1053            let raw_val = &token[eq_pos + 1..];
1054            let val = if raw_val.starts_with('"') {
1055                parse_quoted_string_value(raw_val)
1056                    .map_err(|e| err(ErrorKind::InvalidDirective, line_num, e.message))?
1057            } else {
1058                raw_val.to_string()
1059            };
1060            attrs.push((key, val));
1061        }
1062    }
1063    Ok(attrs)
1064}
1065
1066fn parse_relation(input: &str, line_num: usize) -> Result<Directive> {
1067    // field_ref SP "->" SP field_ref
1068    let parts: Vec<&str> = input.splitn(3, " -> ").collect();
1069    if parts.len() != 2 {
1070        // Try splitting on " -> " with different whitespace
1071        let arrow = input.find("->").ok_or_else(|| {
1072            err(
1073                ErrorKind::InvalidDirective,
1074                line_num,
1075                "relation directive missing '->'",
1076            )
1077        })?;
1078        let from_str = input[..arrow].trim();
1079        let to_str = input[arrow + 2..].trim();
1080        return Ok(Directive::Relation {
1081            from: parse_field_ref(from_str),
1082            to: parse_field_ref(to_str),
1083        });
1084    }
1085    Ok(Directive::Relation {
1086        from: parse_field_ref(parts[0].trim()),
1087        to: parse_field_ref(parts[1].trim()),
1088    })
1089}
1090
1091fn parse_field_ref(input: &str) -> FieldRef {
1092    if input.starts_with('§') {
1093        let rest = &input['§'.len_utf8()..];
1094        if let Some(dot_pos) = rest.find('.') {
1095            return FieldRef {
1096                section: Some(rest[..dot_pos].to_string()),
1097                field: rest[dot_pos + 1..].to_string(),
1098            };
1099        }
1100    }
1101    FieldRef {
1102        section: None,
1103        field: input.to_string(),
1104    }
1105}
1106
1107// ── Block Parsing (§19) ─────────────────────────────────────────────
1108
1109fn parse_block_start(line: &str, line_num: usize) -> Result<(BlockType, Vec<(String, String)>)> {
1110    let s = line.trim_end();
1111    let rest = s
1112        .strip_prefix("#block ")
1113        .ok_or_else(|| err(ErrorKind::InvalidBlock, line_num, "expected '#block <type>'"))?;
1114
1115    let tokens = tokenize_header(rest);
1116    if tokens.is_empty() {
1117        return Err(err(ErrorKind::InvalidBlock, line_num, "missing block type"));
1118    }
1119
1120    let block_type = match tokens[0] {
1121        "code" => BlockType::Code,
1122        "text" => BlockType::Text,
1123        "diff" => BlockType::Diff,
1124        "raw" => BlockType::Raw,
1125        "template" => BlockType::Template,
1126        other => {
1127            return Err(err(
1128                ErrorKind::InvalidBlock,
1129                line_num,
1130                format!("unknown block type: {:?}", other),
1131            ));
1132        }
1133    };
1134
1135    let mut attrs = Vec::new();
1136    for token in &tokens[1..] {
1137        if let Some(eq_pos) = token.find('=') {
1138            let key = token[..eq_pos].to_string();
1139            let raw_val = &token[eq_pos + 1..];
1140            let val = if raw_val.starts_with('"') {
1141                parse_quoted_string_value(raw_val)
1142                    .map_err(|e| err(ErrorKind::InvalidBlock, line_num, e.message))?
1143            } else {
1144                raw_val.to_string()
1145            };
1146            attrs.push((key, val));
1147        }
1148    }
1149
1150    Ok((block_type, attrs))
1151}
1152
1153// ── Inline Annotations (§16.3) ──────────────────────────────────────
1154
1155fn parse_spans(input: &str) -> Vec<Span> {
1156    let mut spans = Vec::new();
1157    let mut pos = 0;
1158    let bytes = input.as_bytes();
1159    let len = bytes.len();
1160
1161    while pos < len {
1162        // Look for @semantic{ pattern
1163        if bytes[pos] == b'@' && pos + 1 < len {
1164            // Find the semantic name (up to '{')
1165            let sem_start = pos + 1;
1166            let mut brace_pos = None;
1167            let mut j = sem_start;
1168            while j < len {
1169                if bytes[j] == b'{' {
1170                    brace_pos = Some(j);
1171                    break;
1172                }
1173                if !bytes[j].is_ascii_alphanumeric() && bytes[j] != b'_' {
1174                    break;
1175                }
1176                j += 1;
1177            }
1178
1179            if let Some(bp) = brace_pos {
1180                if bp > sem_start {
1181                    // Flush preceding text
1182                    if pos > spans_text_end(&spans, input) {
1183                        let text_start = spans_text_end(&spans, input);
1184                        if text_start < pos {
1185                            // Already handled
1186                        }
1187                    }
1188                    // Parse the annotation
1189                    let semantic = &input[sem_start..bp];
1190                    let content_start = bp + 1;
1191                    // Find matching closing brace, respecting nesting
1192                    if let Some(content_end) = find_matching_brace(input, content_start) {
1193                        let content = &input[content_start..content_end];
1194                        let children = parse_spans(content);
1195                        spans.push(Span::Annotated {
1196                            semantic: semantic.to_string(),
1197                            children,
1198                        });
1199                        pos = content_end + 1;
1200                        continue;
1201                    }
1202                }
1203            }
1204        }
1205
1206        // Accumulate plain text
1207        let text_start = pos;
1208        while pos < len {
1209            if bytes[pos] == b'@' && pos + 1 < len && bytes[pos + 1].is_ascii_alphabetic() {
1210                // Check if this is actually an annotation
1211                let mut k = pos + 1;
1212                while k < len && (bytes[k].is_ascii_alphanumeric() || bytes[k] == b'_') {
1213                    k += 1;
1214                }
1215                if k < len && bytes[k] == b'{' {
1216                    break;
1217                }
1218            }
1219            pos += 1;
1220        }
1221        if pos > text_start {
1222            spans.push(Span::Text(input[text_start..pos].to_string()));
1223        }
1224    }
1225
1226    spans
1227}
1228
1229fn spans_text_end(_spans: &[Span], _input: &str) -> usize {
1230    0
1231}
1232
/// Find the byte index of the `}` that closes an already-open brace.
///
/// Scanning begins at byte offset `start` with a nesting depth of one: each
/// `{` increases the depth, each `}` decreases it, and the index of the `}`
/// that brings the depth to zero is returned. Returns `None` when the input
/// ends first (including when `start` is past the end).
fn find_matching_brace(input: &str, start: usize) -> Option<usize> {
    let mut depth = 1;
    input
        .bytes()
        .enumerate()
        .skip(start)
        .find(|&(_, byte)| {
            match byte {
                b'{' => depth += 1,
                b'}' => depth -= 1,
                _ => {}
            }
            // Depth starts at one and only a `}` can lower it, so reaching
            // zero means this byte is the matching close brace.
            depth == 0
        })
        .map(|(idx, _)| idx)
}
1252
1253// ── Record Parsing ──────────────────────────────────────────────────
1254
1255fn parse_record_line(
1256    line: &str,
1257    schema: &Schema,
1258    line_num: usize,
1259) -> Result<Record> {
1260    let s = line.trim_end_matches('\n').trim_end_matches('\r');
1261
1262    // Check for CDC prefix (§ from SIF Streaming spec)
1263    let (cdc_op, data) = if s.starts_with('Δ') {
1264        (CdcOp::Update, &s['Δ'.len_utf8()..])
1265    } else if s.starts_with('∅') {
1266        (CdcOp::Delete, &s['∅'.len_utf8()..])
1267    } else {
1268        (CdcOp::Insert, s)
1269    };
1270
1271    // Split on tabs
1272    let raw_fields: Vec<&str> = data.split('\t').collect();
1273    let field_count = schema.field_count();
1274
1275    let mut values = Vec::with_capacity(field_count);
1276
1277    // CDC delete tombstones contain only :id field values.
1278    // Non-id fields are implicitly null.
1279    if cdc_op == CdcOp::Delete {
1280        let id_fields: Vec<usize> = schema
1281            .fields
1282            .iter()
1283            .enumerate()
1284            .filter(|(_, f)| f.semantic.as_deref() == Some("id"))
1285            .map(|(i, _)| i)
1286            .collect();
1287
1288        let mut id_idx = 0;
1289        for (i, field_def) in schema.fields.iter().enumerate() {
1290            if id_fields.contains(&i) {
1291                let raw = if id_idx < raw_fields.len() {
1292                    raw_fields[id_idx]
1293                } else {
1294                    ""
1295                };
1296                id_idx += 1;
1297                let value = parse_value_typed(raw, &field_def.field_type, line_num)?;
1298                values.push(value);
1299            } else {
1300                values.push(Value::Null);
1301            }
1302        }
1303    } else {
1304        for (i, field_def) in schema.fields.iter().enumerate() {
1305            let raw = if i < raw_fields.len() {
1306                raw_fields[i]
1307            } else {
1308                // §9: Too few fields — treat missing trailing fields as null
1309                ""
1310            };
1311
1312            let value = parse_value_typed(raw, &field_def.field_type, line_num)?;
1313            values.push(value);
1314        }
1315    }
1316
1317    // §9: Too many fields — we parse them but could warn.
1318    // For now, silently accept extra fields (lenient mode).
1319
1320    Ok(Record { values, cdc_op })
1321}
1322
1323// ── Document Parser ─────────────────────────────────────────────────
1324
/// Line-oriented cursor used to parse a whole SIF document.
struct DocumentParser<'a> {
    /// The input split into lines, borrowed from the original string.
    lines: Vec<&'a str>,
    /// Zero-based index of the next line to be read.
    pos: usize,
}
1329
1330impl<'a> DocumentParser<'a> {
1331    fn new(input: &'a str) -> Self {
1332        let lines: Vec<&str> = input.lines().collect();
1333        Self { lines, pos: 0 }
1334    }
1335
    /// One-based number of the line at the current position.
    fn line_num(&self) -> usize {
        self.pos + 1
    }
1339
    /// Return the line at the current position without consuming it, or
    /// `None` once every line has been read.
    fn peek(&self) -> Option<&'a str> {
        self.lines.get(self.pos).copied()
    }
1343
1344    fn advance(&mut self) -> Option<&'a str> {
1345        let line = self.lines.get(self.pos).copied();
1346        if line.is_some() {
1347            self.pos += 1;
1348        }
1349        line
1350    }
1351
1352    fn parse(&mut self) -> Result<Document> {
1353        // §4: Handle BOM
1354        if let Some(first) = self.peek() {
1355            if first.starts_with('\u{FEFF}') {
1356                // Strip BOM, continue with warning (non-fatal).
1357                // We modify the line in place conceptually.
1358                let stripped = &first[3..]; // UTF-8 BOM is 3 bytes
1359                self.lines[0] = stripped;
1360            }
1361        }
1362
1363        // §6: Parse header
1364        let header_line = self
1365            .advance()
1366            .ok_or_else(|| err(ErrorKind::UnexpectedEof, 1, "empty document"))?;
1367        let header = parse_header(header_line, 1)?;
1368
1369        // Parse sections
1370        let mut sections = Vec::new();
1371        let mut current = self.new_section();
1372
1373        while let Some(line) = self.peek() {
1374            let trimmed = line.trim_end();
1375
1376            // Skip empty lines (§5.1)
1377            if trimmed.is_empty() {
1378                self.advance();
1379                continue;
1380            }
1381
1382            // §5.3: Lines beginning with #! after header — silently ignore
1383            if trimmed.starts_with("#!") {
1384                self.advance();
1385                continue;
1386            }
1387
1388            // Section break (§17)
1389            if trimmed == "---" {
1390                self.advance();
1391                sections.push(current);
1392                current = self.new_section();
1393                continue;
1394            }
1395
1396            // Section identifier (§17.3)
1397            if trimmed.starts_with('§') {
1398                self.advance();
1399                let id = &trimmed['§'.len_utf8()..];
1400                current.id = Some(id.to_string());
1401                continue;
1402            }
1403
1404            // Block (§19)
1405            if trimmed.starts_with("#block ") {
1406                let block = self.parse_block()?;
1407                current.blocks.push(block);
1408                continue;
1409            }
1410
1411            // Template (§22)
1412            if trimmed.starts_with("#template ") {
1413                let template = self.parse_template()?;
1414                current.templates.push(template);
1415                continue;
1416            }
1417
1418            // Block end outside block — error
1419            if trimmed == "#/block" {
1420                return Err(err(
1421                    ErrorKind::InvalidBlock,
1422                    self.line_num(),
1423                    "unexpected #/block outside block",
1424                ));
1425            }
1426
1427            // Schema (§8)
1428            if trimmed.starts_with("#schema ") {
1429                self.advance();
1430                let schema_body = &trimmed[8..];
1431                let schema = parse_schema_str(schema_body, self.line_num() - 1)?;
1432                current.schema = Some(schema);
1433                continue;
1434            }
1435
1436            // Recall (§18.10) — no-op
1437            if trimmed == "#recall schema" {
1438                self.advance();
1439                current.directives.push(Directive::Recall);
1440                continue;
1441            }
1442
1443            // Other directives (§18)
1444            if trimmed.starts_with('#') {
1445                self.advance();
1446                if let Some(directive) = parse_directive(trimmed, self.line_num() - 1)? {
1447                    current.directives.push(directive);
1448                }
1449                continue;
1450            }
1451
1452            // Record (§9)
1453            if let Some(ref schema) = current.schema {
1454                self.advance();
1455                let schema_clone = schema.clone();
1456                let record = parse_record_line(trimmed, &schema_clone, self.line_num() - 1)?;
1457                current.records.push(record);
1458            } else {
1459                // §3: A section without a schema MUST NOT contain records.
1460                return Err(err(
1461                    ErrorKind::RecordWithoutSchema,
1462                    self.line_num(),
1463                    "record found before any #schema directive in this section",
1464                ));
1465            }
1466        }
1467
1468        sections.push(current);
1469        Ok(Document { header, sections })
1470    }
1471
    /// Build an empty section: no id, no schema, and empty directive, record,
    /// block and template lists.
    fn new_section(&self) -> Section {
        Section {
            id: None,
            directives: Vec::new(),
            schema: None,
            records: Vec::new(),
            blocks: Vec::new(),
            templates: Vec::new(),
        }
    }
1482
1483    fn parse_block(&mut self) -> Result<Block> {
1484        let start_line = self.line_num();
1485        let start = self
1486            .advance()
1487            .ok_or_else(|| err(ErrorKind::UnexpectedEof, start_line, "expected block start"))?;
1488        let (block_type, attributes) = parse_block_start(start, start_line)?;
1489
1490        let mut content = String::new();
1491        loop {
1492            let line = self
1493                .advance()
1494                .ok_or_else(|| err(ErrorKind::InvalidBlock, start_line, "unterminated block"))?;
1495            if line.trim_end() == "#/block" {
1496                break;
1497            }
1498            if !content.is_empty() {
1499                content.push('\n');
1500            }
1501            content.push_str(line);
1502        }
1503
1504        Ok(Block {
1505            block_type,
1506            attributes,
1507            content,
1508        })
1509    }
1510
1511    fn parse_template(&mut self) -> Result<Template> {
1512        let start_line = self.line_num();
1513        let start = self
1514            .advance()
1515            .ok_or_else(|| err(ErrorKind::UnexpectedEof, start_line, "expected template start"))?;
1516
1517        let name = start
1518            .trim_end()
1519            .strip_prefix("#template ")
1520            .ok_or_else(|| err(ErrorKind::InvalidTemplate, start_line, "expected '#template <name>'"))?
1521            .trim()
1522            .to_string();
1523
1524        if name.is_empty() {
1525            return Err(err(
1526                ErrorKind::InvalidTemplate,
1527                start_line,
1528                "template name cannot be empty",
1529            ));
1530        }
1531
1532        let mut body = String::new();
1533        loop {
1534            let line = self.advance().ok_or_else(|| {
1535                err(
1536                    ErrorKind::InvalidTemplate,
1537                    start_line,
1538                    "unterminated template",
1539                )
1540            })?;
1541            if line.trim_end() == "#/template" {
1542                break;
1543            }
1544            if !body.is_empty() {
1545                body.push('\n');
1546            }
1547            body.push_str(line);
1548        }
1549
1550        Ok(Template { name, body })
1551    }
1552}
1553
1554// ── Functions exposed to sibling modules (reader.rs) ────────────────
1555
/// Crate-visible entry point that forwards to `parse_header` unchanged.
pub(crate) fn parse_header_public(line: &str, line_num: usize) -> Result<Header> {
    parse_header(line, line_num)
}
1559
/// Crate-visible entry point that forwards to `parse_directive` unchanged.
pub(crate) fn parse_directive_public(
    line: &str,
    line_num: usize,
) -> Result<Option<Directive>> {
    parse_directive(line, line_num)
}
1566
/// Crate-visible entry point that forwards to `parse_record_line` unchanged.
pub(crate) fn parse_record_public(
    line: &str,
    schema: &Schema,
    line_num: usize,
) -> Result<Record> {
    parse_record_line(line, schema, line_num)
}
1574
1575// ── Tests ───────────────────────────────────────────────────────────
1576
1577#[cfg(test)]
1578mod tests {
1579    use super::*;
1580
1581    // -- Header tests --
1582
1583    #[test]
1584    fn test_minimal_header() {
1585        let doc = parse("#!sif v1\n").unwrap();
1586        assert_eq!(doc.header.version, 1);
1587        assert!(doc.header.attributes.is_empty());
1588    }
1589
1590    #[test]
1591    fn test_header_with_attrs() {
1592        let doc = parse("#!sif v1 origin=sif-cli/1.0.0 created=2026-03-08T06:50:51Z\n").unwrap();
1593        assert_eq!(doc.header.attributes["origin"], "sif-cli/1.0.0");
1594        assert_eq!(doc.header.attributes["created"], "2026-03-08T06:50:51Z");
1595    }
1596
1597    #[test]
1598    fn test_header_quoted_attr() {
1599        let doc = parse("#!sif v1 context=\"GitHub issue export for serde-rs/serde\"\n").unwrap();
1600        assert_eq!(
1601            doc.header.attributes["context"],
1602            "GitHub issue export for serde-rs/serde"
1603        );
1604    }
1605
1606    #[test]
1607    fn test_unsupported_version() {
1608        assert!(parse("#!sif v2\n").is_err());
1609    }
1610
1611    // -- Type tests --
1612
1613    #[test]
1614    fn test_scalar_types() {
1615        assert_eq!(parse_type_str("bool").unwrap(), Type::Bool);
1616        assert_eq!(parse_type_str("int").unwrap(), Type::Int);
1617        assert_eq!(parse_type_str("uint").unwrap(), Type::Uint);
1618        assert_eq!(parse_type_str("float").unwrap(), Type::Float);
1619        assert_eq!(parse_type_str("str").unwrap(), Type::Str);
1620        assert_eq!(parse_type_str("date").unwrap(), Type::Date);
1621        assert_eq!(parse_type_str("datetime").unwrap(), Type::DateTime);
1622        assert_eq!(parse_type_str("duration").unwrap(), Type::Duration);
1623        assert_eq!(parse_type_str("bytes").unwrap(), Type::Bytes);
1624        assert_eq!(parse_type_str("null").unwrap(), Type::Null);
1625        assert_eq!(parse_type_str("any").unwrap(), Type::Any);
1626        assert_eq!(parse_type_str("map").unwrap(), Type::Map);
1627    }
1628
1629    #[test]
1630    fn test_compound_types() {
1631        assert_eq!(
1632            parse_type_str("str[]").unwrap(),
1633            Type::Array(Box::new(Type::Str))
1634        );
1635        assert_eq!(
1636            parse_type_str("int?").unwrap(),
1637            Type::Nullable(Box::new(Type::Int))
1638        );
1639        // str[]? → Nullable(Array(Str))
1640        assert_eq!(
1641            parse_type_str("str[]?").unwrap(),
1642            Type::Nullable(Box::new(Type::Array(Box::new(Type::Str))))
1643        );
1644        // int?[] → Array(Nullable(Int))
1645        assert_eq!(
1646            parse_type_str("int?[]").unwrap(),
1647            Type::Array(Box::new(Type::Nullable(Box::new(Type::Int))))
1648        );
1649    }
1650
1651    #[test]
1652    fn test_enum_type() {
1653        assert_eq!(
1654            parse_type_str("enum(open,closed,merged)").unwrap(),
1655            Type::Enum(vec![
1656                "open".to_string(),
1657                "closed".to_string(),
1658                "merged".to_string()
1659            ])
1660        );
1661    }
1662
1663    // -- Schema tests --
1664
1665    #[test]
1666    fn test_basic_schema() {
1667        let s = parse_schema("id:uint:id title:str status:enum(open,closed)").unwrap();
1668        assert_eq!(s.fields.len(), 3);
1669        assert_eq!(s.fields[0].name, "id");
1670        assert_eq!(s.fields[0].field_type, Type::Uint);
1671        assert_eq!(s.fields[0].semantic.as_deref(), Some("id"));
1672        assert_eq!(s.fields[1].name, "title");
1673        assert_eq!(s.fields[1].field_type, Type::Str);
1674        assert!(s.fields[1].semantic.is_none());
1675        assert_eq!(s.fields[2].name, "status");
1676        assert_eq!(
1677            s.fields[2].field_type,
1678            Type::Enum(vec!["open".to_string(), "closed".to_string()])
1679        );
1680    }
1681
1682    #[test]
1683    fn test_deprecated_field() {
1684        let s = parse_schema("id:uint:id ∅old_email:str email:str:email").unwrap();
1685        assert!(!s.fields[0].deprecated);
1686        assert!(s.fields[1].deprecated);
1687        assert_eq!(s.fields[1].name, "old_email");
1688        assert!(!s.fields[2].deprecated);
1689    }
1690
1691    #[test]
1692    fn test_field_modifiers() {
1693        let s =
1694            parse_schema("stock:uint:metric|agg=sum,align=right price:float|unit=usd,fmt=currency")
1695                .unwrap();
1696        assert_eq!(s.fields[0].modifiers.len(), 2);
1697        assert_eq!(s.fields[0].modifiers[0].name, "agg");
1698        assert_eq!(s.fields[0].modifiers[0].value.as_deref(), Some("sum"));
1699        assert_eq!(s.fields[0].modifiers[1].name, "align");
1700        assert_eq!(s.fields[0].modifiers[1].value.as_deref(), Some("right"));
1701    }
1702
1703    // -- Value tests --
1704
1705    #[test]
1706    fn test_untyped_disambiguation() {
1707        assert_eq!(parse_untyped_value("_"), Value::Null);
1708        assert_eq!(parse_untyped_value("T"), Value::Bool(true));
1709        assert_eq!(parse_untyped_value("F"), Value::Bool(false));
1710        assert_eq!(parse_untyped_value("42"), Value::Int(42));
1711        assert_eq!(parse_untyped_value("-7"), Value::Int(-7));
1712        assert_eq!(parse_untyped_value("3.14"), Value::Float(3.14));
1713        assert_eq!(
1714            parse_untyped_value("hello"),
1715            Value::Str("hello".to_string())
1716        );
1717        // "true" is NOT bool in untyped context — only T/F
1718        assert_eq!(
1719            parse_untyped_value("true"),
1720            Value::Str("true".to_string())
1721        );
1722    }
1723
1724    #[test]
1725    fn test_typed_values() {
1726        assert_eq!(
1727            parse_typed_value("T", &Type::Bool).unwrap(),
1728            Value::Bool(true)
1729        );
1730        assert_eq!(
1731            parse_typed_value("true", &Type::Bool).unwrap(),
1732            Value::Bool(true)
1733        );
1734        assert_eq!(
1735            parse_typed_value("42", &Type::Int).unwrap(),
1736            Value::Int(42)
1737        );
1738        assert_eq!(
1739            parse_typed_value("42", &Type::Uint).unwrap(),
1740            Value::Uint(42)
1741        );
1742        assert!(parse_typed_value("-1", &Type::Uint).is_err());
1743        assert_eq!(
1744            parse_typed_value("3.14", &Type::Float).unwrap(),
1745            Value::Float(3.14)
1746        );
1747    }
1748
1749    #[test]
1750    fn test_nullable() {
1751        assert_eq!(
1752            parse_typed_value("_", &Type::Nullable(Box::new(Type::Int))).unwrap(),
1753            Value::Null
1754        );
1755        assert_eq!(
1756            parse_typed_value("42", &Type::Nullable(Box::new(Type::Int))).unwrap(),
1757            Value::Int(42)
1758        );
1759        // Lenient: _ accepted for non-nullable types too (real-world compat)
1760        assert_eq!(parse_typed_value("_", &Type::Int).unwrap(), Value::Null);
1761    }
1762
1763    #[test]
1764    fn test_enum_validation() {
1765        let ty = Type::Enum(vec!["open".to_string(), "closed".to_string()]);
1766        assert_eq!(
1767            parse_typed_value("open", &ty).unwrap(),
1768            Value::Enum("open".to_string())
1769        );
1770        assert!(parse_typed_value("invalid", &ty).is_err());
1771    }
1772
1773    #[test]
1774    fn test_array_value() {
1775        assert_eq!(
1776            parse_typed_value("[1,2,3]", &Type::Array(Box::new(Type::Int))).unwrap(),
1777            Value::Array(vec![Value::Int(1), Value::Int(2), Value::Int(3)])
1778        );
1779        assert_eq!(
1780            parse_typed_value("[]", &Type::Array(Box::new(Type::Str))).unwrap(),
1781            Value::Array(Vec::new())
1782        );
1783    }
1784
1785    #[test]
1786    fn test_map_value() {
1787        assert_eq!(
1788            parse_typed_value("{name:alice,age:30}", &Type::Map).unwrap(),
1789            Value::Map(vec![
1790                ("name".to_string(), Value::Str("alice".to_string())),
1791                ("age".to_string(), Value::Int(30)),
1792            ])
1793        );
1794    }
1795
1796    #[test]
1797    fn test_string_escaping() {
1798        assert_eq!(
1799            parse_typed_value(r#""has a \t tab""#, &Type::Str).unwrap(),
1800            Value::Str("has a \t tab".to_string())
1801        );
1802        assert_eq!(
1803            parse_typed_value(r#""line\nbreak""#, &Type::Str).unwrap(),
1804            Value::Str("line\nbreak".to_string())
1805        );
1806        assert_eq!(
1807            parse_typed_value(r#""she said \"hello\"""#, &Type::Str).unwrap(),
1808            Value::Str("she said \"hello\"".to_string())
1809        );
1810    }
1811
1812    // -- Full document tests --
1813
1814    #[test]
1815    fn test_typical_document() {
1816        let input = "\
1817#!sif v1
1818#context Repository issues
1819#schema id:uint:id title:str status:enum(open,closed) created:datetime
18201\tFix flatten in tagged enums\topen\t2026-01-15T10:30:00Z
18212\tCow borrows owned\tclosed\t2026-01-16T08:00:00Z
1822";
1823        let doc = parse(input).unwrap();
1824        assert_eq!(doc.sections.len(), 1);
1825        let sec = &doc.sections[0];
1826        assert_eq!(sec.records.len(), 2);
1827        assert_eq!(sec.records[0].values[0], Value::Uint(1));
1828        assert_eq!(
1829            sec.records[0].values[1],
1830            Value::Str("Fix flatten in tagged enums".to_string())
1831        );
1832        assert_eq!(
1833            sec.records[0].values[2],
1834            Value::Enum("open".to_string())
1835        );
1836    }
1837
1838    #[test]
1839    fn test_multi_section() {
1840        let input = "\
1841#!sif v1
1842#context Repos
1843#schema name:str stars:uint
1844serde\t8947
1845---
1846#context Issues
1847#schema id:uint:id title:str
18481\tFlatten bug
1849";
1850        let doc = parse(input).unwrap();
1851        assert_eq!(doc.sections.len(), 2);
1852        assert_eq!(doc.sections[0].records.len(), 1);
1853        assert_eq!(doc.sections[1].records.len(), 1);
1854    }
1855
1856    #[test]
1857    fn test_section_identifiers() {
1858        let input = "\
1859#!sif v1
1860§repos
1861#schema name:str stars:uint
1862serde\t8947
1863---
1864§issues
1865#schema id:uint:id title:str
18661\tFlatten bug
1867";
1868        let doc = parse(input).unwrap();
1869        assert_eq!(doc.sections[0].id.as_deref(), Some("repos"));
1870        assert_eq!(doc.sections[1].id.as_deref(), Some("issues"));
1871        assert!(doc.section_by_id("repos").is_some());
1872    }
1873
1874    #[test]
1875    fn test_blocks() {
1876        let input = "\
1877#!sif v1
1878#block code language=rust
1879fn main() {
1880    println!(\"hello\");
1881}
1882#/block
1883";
1884        let doc = parse(input).unwrap();
1885        assert_eq!(doc.sections[0].blocks.len(), 1);
1886        let block = &doc.sections[0].blocks[0];
1887        assert_eq!(block.block_type, BlockType::Code);
1888        assert_eq!(block.attributes, vec![("language".to_string(), "rust".to_string())]);
1889        assert!(block.content.contains("fn main()"));
1890    }
1891
1892    #[test]
1893    fn test_templates() {
1894        let input = "\
1895#!sif v1
1896#schema id:uint name:str
1897#template greeting
1898Hello, @{name}! Your ID is @{id}.
1899#/template
19001\talice
1901";
1902        let doc = parse(input).unwrap();
1903        assert_eq!(doc.sections[0].templates.len(), 1);
1904        let tmpl = &doc.sections[0].templates[0];
1905        assert_eq!(tmpl.name, "greeting");
1906
1907        let schema = doc.sections[0].schema.as_ref().unwrap();
1908        let record = &doc.sections[0].records[0];
1909        let rendered = tmpl.render(record, schema);
1910        assert_eq!(rendered, "Hello, alice! Your ID is 1.");
1911    }
1912
1913    #[test]
1914    fn test_recall_is_noop() {
1915        let input = "\
1916#!sif v1
1917#schema id:uint name:str
19181\talice
1919#recall schema
19202\tbob
1921";
1922        let doc = parse(input).unwrap();
1923        assert_eq!(doc.sections[0].records.len(), 2);
1924    }
1925
1926    #[test]
1927    fn test_inline_annotations() {
1928        let spans = parse_inline_annotations(
1929            "Error in @path{src/main.rs} at @line_number{42}: @error{expected usize}",
1930        );
1931        // Should produce: Text, Annotated(path), Text, Annotated(line_number), Text, Annotated(error)
1932        assert_eq!(spans.len(), 6);
1933        match &spans[1] {
1934            Span::Annotated { semantic, children } => {
1935                assert_eq!(semantic, "path");
1936                assert_eq!(children.len(), 1);
1937            }
1938            _ => panic!("expected annotated span"),
1939        }
1940    }
1941
1942    #[test]
1943    fn test_inline_sif() {
1944        let doc =
1945            parse_inline_sif("sif::#schema id:uint name:str::1\talice::2\tbob").unwrap();
1946        assert_eq!(doc.sections[0].records.len(), 2);
1947    }
1948
1949    #[test]
1950    fn test_cdc_prefixes() {
1951        let schema = parse_schema("id:uint:id name:str").unwrap();
1952        let rec = parse_record_line("Δ1\tupdated", &schema, 1).unwrap();
1953        assert_eq!(rec.cdc_op, CdcOp::Update);
1954        assert_eq!(rec.values[0], Value::Uint(1));
1955
1956        let rec = parse_record_line("∅2\tdeleted", &schema, 1).unwrap();
1957        assert_eq!(rec.cdc_op, CdcOp::Delete);
1958    }
1959
1960    #[test]
1961    fn test_relation_directive() {
1962        let input = "\
1963#!sif v1
1964#schema id:uint:id parent_id:uint?:ref name:str
1965#relation parent_id -> id
19661\t_\tRoot
19672\t1\tChild
1968";
1969        let doc = parse(input).unwrap();
1970        let directives = &doc.sections[0].directives;
1971        assert!(directives.iter().any(|d| matches!(d, Directive::Relation { .. })));
1972    }
1973
1974    #[test]
1975    fn test_cross_section_reference() {
1976        let input = "\
1977#!sif v1
1978§auth
1979#schema name:str:id type:str
1980jwt\tbearer
1981---
1982§endpoints
1983#schema path:str:path auth:str?
1984/users\t§auth.jwt
1985";
1986        let doc = parse(input).unwrap();
1987        let rec = &doc.sections[1].records[0];
1988        // The reference is parsed as a plain string value
1989        assert_eq!(rec.values[1], Value::Str("§auth.jwt".to_string()));
1990    }
1991
1992    #[test]
1993    fn test_date_types() {
1994        let schema = parse_schema("d:date dt:datetime dur:duration").unwrap();
1995        let rec = parse_record_line("2026-03-14\t2026-03-14T10:30:00Z\tPT2H30M", &schema, 1).unwrap();
1996        assert_eq!(rec.values[0], Value::Date("2026-03-14".to_string()));
1997        assert_eq!(rec.values[1], Value::DateTime("2026-03-14T10:30:00Z".to_string()));
1998        assert_eq!(rec.values[2], Value::Duration("PT2H30M".to_string()));
1999    }
2000
2001    #[test]
2002    fn test_empty_document() {
2003        let doc = parse("#!sif v1\n").unwrap();
2004        assert_eq!(doc.sections.len(), 1);
2005        assert!(doc.sections[0].records.is_empty());
2006    }
2007
2008    #[test]
2009    fn test_nullable_array() {
2010        let schema = parse_schema("tags:str[]?").unwrap();
2011        assert_eq!(
2012            schema.fields[0].field_type,
2013            Type::Nullable(Box::new(Type::Array(Box::new(Type::Str))))
2014        );
2015        let rec = parse_record_line("_", &schema, 1).unwrap();
2016        assert_eq!(rec.values[0], Value::Null);
2017        let rec = parse_record_line("[a,b]", &schema, 1).unwrap();
2018        assert_eq!(
2019            rec.values[0],
2020            Value::Array(vec![
2021                Value::Str("a".to_string()),
2022                Value::Str("b".to_string()),
2023            ])
2024        );
2025    }
2026
2027    #[test]
2028    fn test_missing_trailing_fields() {
2029        let schema = parse_schema("a:str b:str? c:str?").unwrap();
2030        let rec = parse_record_line("hello", &schema, 1).unwrap();
2031        assert_eq!(rec.values.len(), 3);
2032        assert_eq!(rec.values[0], Value::Str("hello".to_string()));
2033        assert_eq!(rec.values[1], Value::Null);
2034        assert_eq!(rec.values[2], Value::Null);
2035    }
2036
2037    #[test]
2038    fn test_unknown_directive_ignored() {
2039        let input = "\
2040#!sif v1
2041#custom_directive some value here
2042#schema id:uint
20431
2044";
2045        let doc = parse(input).unwrap();
2046        assert!(doc.sections[0]
2047            .directives
2048            .iter()
2049            .any(|d| matches!(d, Directive::Unknown { name, .. } if name == "custom_directive")));
2050        assert_eq!(doc.sections[0].records.len(), 1);
2051    }
2052}
sif_parser/parse.rs

sif_parser/
parse.rs