Skip to main content

crous_core/
text.rs

1//! Human-readable Crous text format: parser and pretty-printer.
2//!
3//! # Crous Text Syntax (summary)
4//!
5//! The Crous textual notation is a unique, deterministic syntax that maps
6//! 1:1 to the binary format. It is NOT a JSON clone — it has its own rules:
7//!
8//! - Objects use `{ key: value; key2: value2; }` with `;` as mandatory terminator.
9//! - Arrays use `[ value, value, value ]` with `,` separator.
10//! - Strings are double-quoted: `"hello world"`.
//! - Binary data uses a `b64#<base64>` marker; a `;` that follows it inside an
//!   object is the field terminator, not part of the value.
12//! - Integers: unsigned are bare digits, signed use `+` or `-` prefix.
13//! - Floats use decimal point: `3.14`, `-2.718`.
14//! - Null is `null`, booleans are `true`/`false`.
15//! - Optional type annotations: `42::u32`, `"hello"::str`.
16//! - Comments: `// line comment` and `/* block comment */`.
17//!
18//! # ABNF Grammar
19//!
20//! ```abnf
21//! document     = value
22//! value        = null / boolean / integer / float / string / bytes
23//!              / array / object
24//! null         = "null"
25//! boolean      = "true" / "false"
26//! integer      = [sign] 1*DIGIT [type-ann]
27//! float        = [sign] 1*DIGIT "." 1*DIGIT [exponent] [type-ann]
28//! exponent     = ("e" / "E") [sign] 1*DIGIT
29//! sign         = "+" / "-"
30//! string       = DQUOTE *char DQUOTE [type-ann]
//! bytes        = "b64#" base64-data   ; a following ";" is the field terminator
32//! base64-data  = *( ALPHA / DIGIT / "+" / "/" / "=" )
33//! array        = "[" [value *("," value)] "]"
34//! object       = "{" *field "}"
35//! field        = key ":" value ";"
36//! key          = identifier / string
37//! identifier   = (ALPHA / "_") *(ALPHA / DIGIT / "_")
38//! type-ann     = "::" type-name
39//! type-name    = identifier
40//! comment      = line-comment / block-comment
41//! line-comment = "//" *(%x20-7E) LF
42//! block-comment= "/*" *(comment-char) "*/"
43//! ```
44
45use crate::error::{CrousError, Result};
46use crate::value::Value;
47use base64::Engine;
48
49// ---------------------------------------------------------------------------
50// Parser
51// ---------------------------------------------------------------------------
52
53/// Parse a Crous text document into a `Value`.
54///
55/// ```
56/// use crous_core::text::parse;
57/// use crous_core::Value;
58///
59/// let v = parse(r#"{ name: "Alice"; age: 30; }"#).unwrap();
60/// assert_eq!(
61///     v,
62///     Value::Object(vec![
63///         ("name".into(), Value::Str("Alice".into())),
64///         ("age".into(), Value::UInt(30)),
65///     ])
66/// );
67/// ```
68pub fn parse(input: &str) -> Result<Value> {
69    let mut parser = Parser::new(input);
70    let value = parser.parse_value()?;
71    parser.skip_whitespace_and_comments();
72    Ok(value)
73}
74
75struct Parser<'a> {
76    input: &'a str,
77    pos: usize,
78    line: usize,
79    col: usize,
80}
81
82impl<'a> Parser<'a> {
83    fn new(input: &'a str) -> Self {
84        Self {
85            input,
86            pos: 0,
87            line: 1,
88            col: 1,
89        }
90    }
91
92    fn peek(&self) -> Option<char> {
93        self.input[self.pos..].chars().next()
94    }
95
96    fn advance(&mut self) -> Option<char> {
97        let ch = self.peek()?;
98        self.pos += ch.len_utf8();
99        if ch == '\n' {
100            self.line += 1;
101            self.col = 1;
102        } else {
103            self.col += 1;
104        }
105        Some(ch)
106    }
107
108    fn remaining(&self) -> &'a str {
109        &self.input[self.pos..]
110    }
111
112    fn error(&self, msg: impl Into<String>) -> CrousError {
113        CrousError::ParseError {
114            line: self.line,
115            col: self.col,
116            message: msg.into(),
117        }
118    }
119
120    fn skip_whitespace_and_comments(&mut self) {
121        loop {
122            // Skip whitespace.
123            while let Some(ch) = self.peek() {
124                if ch.is_whitespace() {
125                    self.advance();
126                } else {
127                    break;
128                }
129            }
130            // Skip line comments.
131            if self.remaining().starts_with("//") {
132                while let Some(ch) = self.advance() {
133                    if ch == '\n' {
134                        break;
135                    }
136                }
137                continue;
138            }
139            // Skip block comments.
140            if self.remaining().starts_with("/*") {
141                self.advance(); // '/'
142                self.advance(); // '*'
143                let mut depth = 1;
144                while depth > 0 {
145                    match self.advance() {
146                        Some('*') if self.peek() == Some('/') => {
147                            self.advance();
148                            depth -= 1;
149                        }
150                        Some('/') if self.peek() == Some('*') => {
151                            self.advance();
152                            depth += 1;
153                        }
154                        Some(_) => {}
155                        None => break,
156                    }
157                }
158                continue;
159            }
160            break;
161        }
162    }
163
164    fn expect_char(&mut self, expected: char) -> Result<()> {
165        self.skip_whitespace_and_comments();
166        match self.advance() {
167            Some(ch) if ch == expected => Ok(()),
168            Some(ch) => Err(self.error(format!("expected '{expected}', got '{ch}'"))),
169            None => Err(self.error(format!("expected '{expected}', got EOF"))),
170        }
171    }
172
173    fn parse_value(&mut self) -> Result<Value> {
174        self.skip_whitespace_and_comments();
175
176        match self.peek() {
177            None => Err(self.error("unexpected end of input")),
178            Some('{') => self.parse_object(),
179            Some('[') => self.parse_array(),
180            Some('"') => self.parse_string_value(),
181            Some('b') if self.remaining().starts_with("b64#") => self.parse_bytes(),
182            Some('t') if self.remaining().starts_with("true") => self.parse_true(),
183            Some('f') if self.remaining().starts_with("false") => self.parse_false(),
184            Some('n') if self.remaining().starts_with("null") => self.parse_null(),
185            Some('i') if self.remaining().starts_with("inf") => self.parse_inf(false),
186            Some('N') if self.remaining().starts_with("NaN") => self.parse_nan(),
187            Some(ch) if ch == '-' || ch == '+' || ch.is_ascii_digit() => {
188                // Check for "-inf"
189                if ch == '-' && self.remaining().starts_with("-inf") {
190                    self.parse_inf(true)
191                } else {
192                    self.parse_number()
193                }
194            }
195            Some(ch) => Err(self.error(format!("unexpected character: '{ch}'"))),
196        }
197    }
198
199    fn parse_null(&mut self) -> Result<Value> {
200        for _ in 0..4 {
201            self.advance();
202        }
203        self.skip_type_annotation();
204        Ok(Value::Null)
205    }
206
207    fn parse_inf(&mut self, negative: bool) -> Result<Value> {
208        if negative {
209            // skip "-inf"
210            for _ in 0..4 {
211                self.advance();
212            }
213            self.skip_type_annotation();
214            Ok(Value::Float(f64::NEG_INFINITY))
215        } else {
216            // skip "inf"
217            for _ in 0..3 {
218                self.advance();
219            }
220            self.skip_type_annotation();
221            Ok(Value::Float(f64::INFINITY))
222        }
223    }
224
225    fn parse_nan(&mut self) -> Result<Value> {
226        for _ in 0..3 {
227            self.advance();
228        }
229        self.skip_type_annotation();
230        Ok(Value::Float(f64::NAN))
231    }
232
233    fn parse_true(&mut self) -> Result<Value> {
234        for _ in 0..4 {
235            self.advance();
236        }
237        self.skip_type_annotation();
238        Ok(Value::Bool(true))
239    }
240
241    fn parse_false(&mut self) -> Result<Value> {
242        for _ in 0..5 {
243            self.advance();
244        }
245        self.skip_type_annotation();
246        Ok(Value::Bool(false))
247    }
248
249    fn parse_number(&mut self) -> Result<Value> {
250        let start = self.pos;
251        let mut is_negative = false;
252        let mut is_float = false;
253
254        if self.peek() == Some('-') {
255            is_negative = true;
256            self.advance();
257        } else if self.peek() == Some('+') {
258            self.advance();
259        }
260
261        while let Some(ch) = self.peek() {
262            if ch.is_ascii_digit() {
263                self.advance();
264            } else if ch == '.' {
265                is_float = true;
266                self.advance();
267            } else if ch == 'e' || ch == 'E' {
268                is_float = true;
269                self.advance();
270                if self.peek() == Some('+') || self.peek() == Some('-') {
271                    self.advance();
272                }
273            } else {
274                break;
275            }
276        }
277
278        let num_str = &self.input[start..self.pos];
279        self.skip_type_annotation();
280
281        if is_float {
282            let f: f64 = num_str
283                .parse()
284                .map_err(|_| self.error(format!("invalid float: {num_str}")))?;
285            Ok(Value::Float(f))
286        } else if is_negative {
287            let i: i64 = num_str
288                .parse()
289                .map_err(|_| self.error(format!("invalid integer: {num_str}")))?;
290            Ok(Value::Int(i))
291        } else {
292            let u: u64 = num_str
293                .parse()
294                .map_err(|_| self.error(format!("invalid integer: {num_str}")))?;
295            Ok(Value::UInt(u))
296        }
297    }
298
299    fn parse_string_value(&mut self) -> Result<Value> {
300        let s = self.parse_quoted_string()?;
301        self.skip_type_annotation();
302        Ok(Value::Str(s))
303    }
304
305    fn parse_quoted_string(&mut self) -> Result<String> {
306        self.expect_char('"')?;
307        let mut s = String::new();
308        loop {
309            match self.advance() {
310                Some('"') => break,
311                Some('\\') => match self.advance() {
312                    Some('n') => s.push('\n'),
313                    Some('t') => s.push('\t'),
314                    Some('r') => s.push('\r'),
315                    Some('\\') => s.push('\\'),
316                    Some('"') => s.push('"'),
317                    Some(ch) => {
318                        s.push('\\');
319                        s.push(ch);
320                    }
321                    None => return Err(self.error("unterminated string escape")),
322                },
323                Some(ch) => s.push(ch),
324                None => return Err(self.error("unterminated string")),
325            }
326        }
327        Ok(s)
328    }
329
330    fn parse_bytes(&mut self) -> Result<Value> {
331        // Consume "b64#"
332        for _ in 0..4 {
333            self.advance();
334        }
335        let start = self.pos;
336        // Read until ';' but do NOT consume the ';' — it's the statement terminator
337        // and will be consumed by the object/array parser.
338        while let Some(ch) = self.peek() {
339            if ch == ';' {
340                break;
341            }
342            self.advance();
343        }
344        let b64_str = &self.input[start..self.pos];
345        // Do NOT advance past ';' — let the caller handle it.
346
347        let bytes = base64::engine::general_purpose::STANDARD
348            .decode(b64_str.trim())
349            .map_err(|e| self.error(format!("invalid base64: {e}")))?;
350        Ok(Value::Bytes(bytes))
351    }
352
353    fn parse_array(&mut self) -> Result<Value> {
354        self.expect_char('[')?;
355        let mut items = Vec::new();
356
357        loop {
358            self.skip_whitespace_and_comments();
359            if self.peek() == Some(']') {
360                self.advance();
361                break;
362            }
363            items.push(self.parse_value()?);
364            self.skip_whitespace_and_comments();
365            if self.peek() == Some(',') {
366                self.advance();
367            }
368        }
369
370        Ok(Value::Array(items))
371    }
372
373    fn parse_object(&mut self) -> Result<Value> {
374        self.expect_char('{')?;
375        let mut entries = Vec::new();
376
377        loop {
378            self.skip_whitespace_and_comments();
379            if self.peek() == Some('}') {
380                self.advance();
381                break;
382            }
383
384            // Parse key: either an identifier or a quoted string.
385            let key = self.parse_key()?;
386            self.expect_char(':')?;
387            let value = self.parse_value()?;
388            self.expect_char(';')?;
389
390            entries.push((key, value));
391        }
392
393        Ok(Value::Object(entries))
394    }
395
396    fn parse_key(&mut self) -> Result<String> {
397        self.skip_whitespace_and_comments();
398        if self.peek() == Some('"') {
399            self.parse_quoted_string()
400        } else {
401            self.parse_identifier()
402        }
403    }
404
405    fn parse_identifier(&mut self) -> Result<String> {
406        let start = self.pos;
407        match self.peek() {
408            Some(ch) if ch.is_alphabetic() || ch == '_' => {
409                self.advance();
410            }
411            _ => return Err(self.error("expected identifier")),
412        }
413        while let Some(ch) = self.peek() {
414            if ch.is_alphanumeric() || ch == '_' {
415                self.advance();
416            } else {
417                break;
418            }
419        }
420        Ok(self.input[start..self.pos].to_string())
421    }
422
423    /// Skip optional type annotations like `::u32`, `::str`.
424    fn skip_type_annotation(&mut self) {
425        if self.remaining().starts_with("::") {
426            self.advance(); // ':'
427            self.advance(); // ':'
428            while let Some(ch) = self.peek() {
429                if ch.is_alphanumeric() || ch == '_' {
430                    self.advance();
431                } else {
432                    break;
433                }
434            }
435        }
436    }
437}
438
439// ---------------------------------------------------------------------------
440// Pretty-printer
441// ---------------------------------------------------------------------------
442
443/// Pretty-print a `Value` in canonical Crous text notation.
444///
445/// The output is deterministic: the same `Value` always produces the same text.
446///
447/// ```
448/// use crous_core::text::pretty_print;
449/// use crous_core::Value;
450///
451/// let v = Value::Object(vec![
452///     ("name".into(), Value::Str("Alice".into())),
453///     ("age".into(), Value::UInt(30)),
454/// ]);
455/// let text = pretty_print(&v, 0);
456/// assert!(text.contains("name: \"Alice\";"));
457/// assert!(text.contains("age: 30;"));
458/// ```
459pub fn pretty_print(value: &Value, indent: usize) -> String {
460    let mut out = String::new();
461    write_value(&mut out, value, indent, 0);
462    out
463}
464
465fn write_value(out: &mut String, value: &Value, indent_size: usize, depth: usize) {
466    let indent = " ".repeat(indent_size * depth);
467    let inner_indent = " ".repeat(indent_size * (depth + 1));
468
469    match value {
470        Value::Null => out.push_str("null"),
471        Value::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
472        Value::UInt(n) => out.push_str(&n.to_string()),
473        Value::Int(n) => {
474            // Int(0) pretty-prints as "0" which on reparse becomes UInt(0).
475            // Emit a negative-zero form to preserve the Int type for roundtrip.
476            if *n == 0 {
477                out.push_str("-0");
478            } else {
479                out.push_str(&n.to_string());
480            }
481        }
482        Value::Float(f) => {
483            if f.is_nan() {
484                out.push_str("NaN");
485            } else if f.is_infinite() {
486                if f.is_sign_negative() {
487                    out.push_str("-inf");
488                } else {
489                    out.push_str("inf");
490                }
491            } else {
492                // Ensure float always has a decimal point for deterministic output.
493                let s = format!("{f}");
494                if s.contains('.') || s.contains('e') || s.contains('E') {
495                    out.push_str(&s);
496                } else {
497                    out.push_str(&format!("{f}.0"));
498                }
499            }
500        }
501        Value::Str(s) => {
502            out.push('"');
503            for ch in s.chars() {
504                match ch {
505                    '"' => out.push_str("\\\""),
506                    '\\' => out.push_str("\\\\"),
507                    '\n' => out.push_str("\\n"),
508                    '\r' => out.push_str("\\r"),
509                    '\t' => out.push_str("\\t"),
510                    c => out.push(c),
511                }
512            }
513            out.push('"');
514        }
515        Value::Bytes(b) => {
516            out.push_str("b64#");
517            out.push_str(&base64::engine::general_purpose::STANDARD.encode(b));
518        }
519        Value::Array(items) => {
520            if items.is_empty() {
521                out.push_str("[]");
522            } else if is_simple_array(items) {
523                // Inline for simple arrays.
524                out.push('[');
525                for (i, item) in items.iter().enumerate() {
526                    if i > 0 {
527                        out.push_str(", ");
528                    }
529                    write_value(out, item, indent_size, depth);
530                }
531                out.push(']');
532            } else {
533                out.push_str("[\n");
534                for (i, item) in items.iter().enumerate() {
535                    out.push_str(&inner_indent);
536                    write_value(out, item, indent_size, depth + 1);
537                    if i < items.len() - 1 {
538                        out.push(',');
539                    }
540                    out.push('\n');
541                }
542                out.push_str(&indent);
543                out.push(']');
544            }
545        }
546        Value::Object(entries) => {
547            if entries.is_empty() {
548                out.push_str("{}");
549            } else {
550                out.push_str("{\n");
551                for (key, val) in entries {
552                    out.push_str(&inner_indent);
553                    if is_valid_identifier(key) {
554                        out.push_str(key);
555                    } else {
556                        out.push('"');
557                        out.push_str(key);
558                        out.push('"');
559                    }
560                    out.push_str(": ");
561                    write_value(out, val, indent_size, depth + 1);
562                    out.push_str(";\n");
563                }
564                out.push_str(&indent);
565                out.push('}');
566            }
567        }
568    }
569}
570
571/// Check if an array contains only simple scalar values (no nesting).
572fn is_simple_array(items: &[Value]) -> bool {
573    items.len() <= 8
574        && items.iter().all(|v| {
575            matches!(
576                v,
577                Value::Null
578                    | Value::Bool(_)
579                    | Value::UInt(_)
580                    | Value::Int(_)
581                    | Value::Float(_)
582                    | Value::Str(_)
583            )
584        })
585}
586
/// Report whether `s` can be written as a bare (unquoted) identifier.
///
/// The first character must be alphabetic or `_`; every following character
/// must be alphanumeric or `_`. The empty string is not an identifier.
fn is_valid_identifier(s: &str) -> bool {
    let head_ok = s
        .chars()
        .next()
        .map_or(false, |first| first == '_' || first.is_alphabetic());
    head_ok
        && s.chars()
            .skip(1)
            .all(|rest| rest == '_' || rest.is_alphanumeric())
}
596
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a `Value::Str`.
    fn text(s: &str) -> Value {
        Value::Str(s.into())
    }

    /// Build a `Value::Object` from `(key, value)` pairs, preserving order.
    fn object(fields: Vec<(&str, Value)>) -> Value {
        Value::Object(fields.into_iter().map(|(k, v)| (k.into(), v)).collect())
    }

    /// Build a `Value::Array` of unsigned integers.
    fn uints(numbers: &[u64]) -> Value {
        Value::Array(numbers.iter().copied().map(Value::UInt).collect())
    }

    /// The `{ name: "Alice"; age: 30; }` fixture shared by several tests.
    fn alice() -> Value {
        object(vec![("name", text("Alice")), ("age", Value::UInt(30))])
    }

    #[test]
    fn parse_null() {
        let parsed = parse("null").unwrap();
        assert_eq!(parsed, Value::Null);
    }

    #[test]
    fn parse_bool() {
        for (source, expected) in [("true", true), ("false", false)] {
            assert_eq!(parse(source).unwrap(), Value::Bool(expected));
        }
    }

    #[test]
    fn parse_uint() {
        for (source, expected) in [("42", 42u64), ("0", 0)] {
            assert_eq!(parse(source).unwrap(), Value::UInt(expected));
        }
    }

    #[test]
    fn parse_int() {
        for (source, expected) in [("-1", -1i64), ("-42", -42)] {
            assert_eq!(parse(source).unwrap(), Value::Int(expected));
        }
    }

    #[test]
    fn parse_float() {
        for (source, expected) in [("3.125", 3.125f64), ("-2.5", -2.5)] {
            assert_eq!(parse(source).unwrap(), Value::Float(expected));
        }
    }

    #[test]
    fn parse_string() {
        assert_eq!(parse(r#""hello""#).unwrap(), text("hello"));
        // Escaped quotes inside the literal are unescaped by the parser.
        assert_eq!(
            parse(r#""with \"quotes\"""#).unwrap(),
            text("with \"quotes\"")
        );
    }

    #[test]
    fn parse_bytes() {
        let parsed = parse("b64#AQID;").unwrap();
        assert_eq!(parsed, Value::Bytes(vec![1, 2, 3]));
    }

    #[test]
    fn parse_array() {
        let parsed = parse("[1, 2, 3]").unwrap();
        assert_eq!(parsed, uints(&[1, 2, 3]));
    }

    #[test]
    fn parse_object() {
        let parsed = parse(r#"{ name: "Alice"; age: 30; }"#).unwrap();
        assert_eq!(parsed, alice());
    }

    #[test]
    fn parse_nested() {
        let input = r#"{
            users: [
                { name: "Bob"; scores: [100, 95, 87]; }
            ];
            count: 1;
        }"#;
        let bob = object(vec![
            ("name", text("Bob")),
            ("scores", uints(&[100, 95, 87])),
        ]);
        let expected = object(vec![
            ("users", Value::Array(vec![bob])),
            ("count", Value::UInt(1)),
        ]);
        assert_eq!(parse(input).unwrap(), expected);
    }

    #[test]
    fn parse_comments() {
        let input = r#"{
            // This is a comment
            name: "Alice"; /* inline comment */
            age: 30;
        }"#;
        assert_eq!(parse(input).unwrap(), alice());
    }

    #[test]
    fn parse_type_annotation() {
        // The annotation is accepted and discarded.
        assert_eq!(parse("42::u32").unwrap(), Value::UInt(42));
    }

    #[test]
    fn pretty_print_roundtrip() {
        let original = object(vec![
            ("name", text("Alice")),
            ("age", Value::UInt(30)),
            ("tags", Value::Array(vec![text("admin"), text("user")])),
        ]);
        let rendered = pretty_print(&original, 4);
        assert_eq!(parse(&rendered).unwrap(), original);
    }

    #[test]
    fn pretty_print_bytes() {
        let original = Value::Bytes(vec![0xDE, 0xAD, 0xBE, 0xEF]);
        let rendered = pretty_print(&original, 0);
        assert!(rendered.starts_with("b64#"));
        // The printed value carries no trailing ';' of its own; inside an
        // object the terminator comes from the enclosing field.
        assert_eq!(parse(&rendered).unwrap(), original);
    }

    #[test]
    fn text_binary_text_roundtrip() {
        // text -> value -> binary -> value -> text -> value
        let from_text = parse(r#"{ name: "Alice"; age: 30; active: true; }"#).unwrap();

        let mut encoder = crate::encoder::Encoder::new();
        encoder.encode_value(&from_text).unwrap();
        let binary = encoder.finish().unwrap();

        let mut decoder = crate::decoder::Decoder::new(&binary);
        let from_binary = decoder.decode_next().unwrap().to_owned_value();
        assert_eq!(from_text, from_binary);

        let reprinted = pretty_print(&from_binary, 4);
        let reparsed = parse(&reprinted).unwrap();
        assert_eq!(from_text, reparsed);
    }
}