Skip to main content

toon_core/
encoder.rs

//! TOON v3.0 Encoder — converts JSON into Token-Oriented Object Notation.
//!
//! TOON is a compact, human-readable format designed to minimize token usage when
//! feeding structured data to LLMs. The encoder implements the full TOON v3.0 spec
//! (2025-11-24), including:
//!
//! - **Indentation-based nesting**: nested objects expressed via indentation, no braces/brackets
//! - **Inline arrays**: primitive arrays as `key[N]: v1,v2,v3`
//! - **Tabular arrays**: uniform object arrays as `key[N]{f1,f2}:\n  v1,v2\n  v3,v4`
//! - **Expanded lists**: mixed/complex arrays as `key[N]:\n  - item1\n  - item2`
//! - **Context-dependent quoting**: strings only quoted when ambiguous (per delimiter scope)
//! - **Number normalization**: no exponents, no trailing zeros, -0 → 0
//!
//! # Example
//! ```
//! use toon_core::encode;
//! let json = r#"{"name":"Alice","age":30,"tags":["rust","wasm"]}"#;
//! let toon = encode(json).unwrap();
//! // name: Alice
//! // age: 30
//! // tags[2]: rust,wasm
//! ```
23
24use crate::error::Result;
25use serde_json::Value;
26
27/// Encode a JSON string into TOON v3.0 format.
28///
29/// Parses the input as JSON, then walks the value tree to produce a compact TOON
30/// representation. Returns an error if the input is not valid JSON.
31pub fn encode(json: &str) -> Result<String> {
32    let value: Value = serde_json::from_str(json)?;
33    let mut out = String::new();
34    encode_root(&value, &mut out);
35    Ok(out)
36}
37
38/// Top-level dispatch: objects emit fields, arrays emit root array syntax,
39/// primitives emit a bare value.
40fn encode_root(value: &Value, out: &mut String) {
41    match value {
42        Value::Object(map) => {
43            encode_object_fields(map, 0, out);
44        }
45        Value::Array(arr) => {
46            encode_root_array(arr, out);
47        }
48        _ => {
49            encode_primitive_value(value, QuoteContext::Document, out);
50        }
51    }
52}
53
54/// Encode a root-level array. Primitive arrays use inline syntax `[N]: v1,v2`;
55/// mixed/complex arrays use expanded list syntax `[N]:\n  - item`.
56fn encode_root_array(arr: &[Value], out: &mut String) {
57    let len = arr.len();
58    if all_primitives(arr) {
59        out.push_str(&format!("[{}]: ", len));
60        encode_inline_values(arr, out);
61    } else {
62        out.push_str(&format!("[{}]:", len));
63        encode_list_items(arr, 0, out);
64    }
65}
66
67/// Emit all key-value pairs of an object at the given indentation depth.
68/// Each field appears on its own line; values are dispatched by type.
69///
70/// Relies on `serde_json::Map` with `preserve_order` feature to maintain
71/// the original JSON insertion order (IndexMap, not BTreeMap).
72fn encode_object_fields(map: &serde_json::Map<String, Value>, depth: usize, out: &mut String) {
73    let indent = make_indent(depth);
74    let mut first = true;
75    for (key, value) in map {
76        if !first {
77            out.push('\n');
78        }
79        first = false;
80        out.push_str(&indent);
81        out.push_str(&encode_key(key));
82        encode_field_value(key, value, depth, out);
83    }
84}
85
86/// Dispatch a field's value to the appropriate TOON encoding:
87/// - Empty objects → `key:`
88/// - Non-empty objects → `key:\n  child_key: child_val`
89/// - Arrays → delegated to `encode_array_field` (inline/tabular/expanded)
90/// - Primitives → `key: value`
91fn encode_field_value(_key: &str, value: &Value, depth: usize, out: &mut String) {
92    match value {
93        Value::Object(map) if map.is_empty() => {
94            out.push(':');
95        }
96        Value::Object(map) => {
97            out.push(':');
98            out.push('\n');
99            encode_object_fields(map, depth + 1, out);
100        }
101        Value::Array(arr) => {
102            encode_array_field(arr, depth, out);
103        }
104        _ => {
105            out.push_str(": ");
106            encode_primitive_value(value, QuoteContext::Document, out);
107        }
108    }
109}
110
111/// Encode an array field value, selecting the most compact TOON representation:
112///
113/// 1. **Empty**: `key[0]:`
114/// 2. **Tabular**: all elements are objects with identical primitive-only keys →
115///    `key[N]{f1,f2}:\n  v1,v2\n  v3,v4`
116/// 3. **Inline**: all elements are primitives → `key[N]: v1,v2,v3`
117/// 4. **Expanded list**: mixed content → `key[N]:\n  - item1\n  - item2`
118fn encode_array_field(arr: &[Value], depth: usize, out: &mut String) {
119    let len = arr.len();
120
121    if arr.is_empty() {
122        out.push_str(&format!("[{}]:", len));
123        return;
124    }
125
126    // Tabular: uniform object arrays (greatest compression for repetitive data)
127    if let Some(fields) = detect_tabular(arr) {
128        out.push_str(&format!("[{}]{{{}}}:", len, fields.join(",")));
129        encode_tabular_rows(arr, &fields, depth, out);
130        return;
131    }
132
133    // Inline: all-primitive arrays on a single line
134    if all_primitives(arr) {
135        out.push_str(&format!("[{}]: ", len));
136        encode_inline_values(arr, out);
137        return;
138    }
139
140    // Expanded: complex/mixed arrays with "- " list markers
141    out.push_str(&format!("[{}]:", len));
142    encode_list_items(arr, depth, out);
143}
144
145/// Emit comma-separated primitive values on a single line: `v1,v2,v3`
146/// Quoting uses `InlineArray` context (comma is the active delimiter, not colon).
147fn encode_inline_values(arr: &[Value], out: &mut String) {
148    for (i, val) in arr.iter().enumerate() {
149        if i > 0 {
150            out.push(',');
151        }
152        encode_primitive_value(val, QuoteContext::InlineArray, out);
153    }
154}
155
156/// Emit tabular rows: each object's values as a comma-separated line, no keys repeated.
157/// Quoting uses `TabularCell` context (comma triggers quoting, not colon).
158fn encode_tabular_rows(arr: &[Value], fields: &[String], depth: usize, out: &mut String) {
159    let row_indent = make_indent(depth + 1);
160    for obj_val in arr {
161        out.push('\n');
162        out.push_str(&row_indent);
163        if let Value::Object(map) = obj_val {
164            for (i, field) in fields.iter().enumerate() {
165                if i > 0 {
166                    out.push(',');
167                }
168                if let Some(val) = map.get(field) {
169                    encode_primitive_value(val, QuoteContext::TabularCell, out);
170                }
171            }
172        }
173    }
174}
175
176/// Emit expanded list items with "- " markers. Each item can be:
177/// - A primitive value: `- hello`
178/// - An object: `- key1: val1\n    key2: val2` (first field on hyphen line)
179/// - A nested array: `- [N]: v1,v2`
180fn encode_list_items(arr: &[Value], depth: usize, out: &mut String) {
181    let item_indent = make_indent(depth + 1);
182    for item in arr {
183        out.push('\n');
184        out.push_str(&item_indent);
185        out.push_str("- ");
186        match item {
187            Value::Object(map) => {
188                // First field on the hyphen line
189                let mut first = true;
190                for (key, value) in map {
191                    if first {
192                        first = false;
193                        out.push_str(&encode_key(key));
194                        encode_list_item_field_value(value, depth + 1, out);
195                    } else {
196                        out.push('\n');
197                        // Sibling fields at same depth as "- " content
198                        out.push_str(&make_indent(depth + 1));
199                        out.push_str("  ");
200                        out.push_str(&encode_key(key));
201                        encode_list_item_field_value(value, depth + 1, out);
202                    }
203                }
204            }
205            Value::Array(inner_arr) => {
206                // Nested array as list item
207                let len = inner_arr.len();
208                if all_primitives(inner_arr) {
209                    out.push_str(&format!("[{}]: ", len));
210                    encode_inline_values(inner_arr, out);
211                } else {
212                    out.push_str(&format!("[{}]:", len));
213                    encode_list_items(inner_arr, depth + 1, out);
214                }
215            }
216            _ => {
217                encode_primitive_value(item, QuoteContext::Document, out);
218            }
219        }
220    }
221}
222
223/// Encode a field value within a list item object. Differs from `encode_field_value`
224/// because nested objects inside list items use an extra indent level to account
225/// for the "- " prefix offset.
226fn encode_list_item_field_value(value: &Value, depth: usize, out: &mut String) {
227    match value {
228        Value::Object(map) if map.is_empty() => {
229            out.push(':');
230        }
231        Value::Object(map) => {
232            out.push(':');
233            out.push('\n');
234            // Nested object inside a list item: depth + 1 extra for the "- " offset
235            let nested_indent = make_indent(depth + 2);
236            let mut first = true;
237            for (key, val) in map {
238                if !first {
239                    out.push('\n');
240                }
241                first = false;
242                out.push_str(&nested_indent);
243                out.push_str(&encode_key(key));
244                encode_field_value(key, val, depth + 2, out);
245            }
246        }
247        Value::Array(arr) => {
248            encode_array_field(arr, depth, out);
249        }
250        _ => {
251            out.push_str(": ");
252            encode_primitive_value(value, QuoteContext::Document, out);
253        }
254    }
255}
256
/// Position in which a string value is being written. `needs_quoting` uses it to
/// decide which delimiter character forces the string to be quoted.
#[derive(Copy, Clone, PartialEq)]
enum QuoteContext {
    /// Field values, list items, and a bare root primitive: a colon forces quoting.
    Document,
    /// Element of an inline primitive array: a comma forces quoting.
    InlineArray,
    /// Cell of a tabular row: a comma forces quoting; a colon does not.
    TabularCell,
}
267
268/// Emit a primitive JSON value (null, bool, number, string) in TOON format.
269/// String quoting depends on the `QuoteContext` — different delimiters are
270/// "active" in different positions (see TOON v3.0 spec, delimiter scoping).
271fn encode_primitive_value(value: &Value, ctx: QuoteContext, out: &mut String) {
272    match value {
273        Value::Null => out.push_str("null"),
274        Value::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
275        Value::Number(n) => out.push_str(&format_number(n)),
276        Value::String(s) => encode_string_value(s, ctx, out),
277        _ => out.push_str("null"), // arrays/objects in primitive context
278    }
279}
280
281/// Format a JSON number per TOON v3.0 rules:
282/// - No scientific notation (exponents)
283/// - No leading zeros (except 0.x)
284/// - No trailing fractional zeros (3.10 → 3.1)
285/// - Negative zero normalizes to 0
286fn format_number(n: &serde_json::Number) -> String {
287    if let Some(i) = n.as_i64() {
288        return i.to_string();
289    }
290    if let Some(u) = n.as_u64() {
291        return u.to_string();
292    }
293    if let Some(f) = n.as_f64() {
294        if f.is_nan() || f.is_infinite() {
295            return "null".to_string();
296        }
297        // Normalize -0 to 0
298        let f = if f == 0.0 { 0.0 } else { f };
299        // Check if it's a whole number
300        if f.fract() == 0.0 && f.abs() < (i64::MAX as f64) {
301            return (f as i64).to_string();
302        }
303        // Format without trailing zeros
304        let s = format!("{}", f);
305        // Remove trailing zeros after decimal point
306        if s.contains('.') {
307            let trimmed = s.trim_end_matches('0');
308            let trimmed = trimmed.trim_end_matches('.');
309            trimmed.to_string()
310        } else {
311            s
312        }
313    } else {
314        "null".to_string()
315    }
316}
317
318/// Emit a string value, quoting and escaping only when necessary.
319/// Unquoted strings save 2 tokens (the quotes) per value — significant at scale.
320fn encode_string_value(s: &str, ctx: QuoteContext, out: &mut String) {
321    if needs_quoting(s, ctx) {
322        out.push('"');
323        for ch in s.chars() {
324            match ch {
325                '\\' => out.push_str("\\\\"),
326                '"' => out.push_str("\\\""),
327                '\n' => out.push_str("\\n"),
328                '\r' => out.push_str("\\r"),
329                '\t' => out.push_str("\\t"),
330                _ => out.push(ch),
331            }
332        }
333        out.push('"');
334    } else {
335        out.push_str(s);
336    }
337}
338
339/// Determine if a string value must be quoted to preserve TOON roundtrip fidelity.
340///
341/// A string MUST be quoted if it:
342/// - Is empty
343/// - Has leading/trailing whitespace
344/// - Looks like a boolean (`true`/`false`) or `null`
345/// - Looks numeric (would be decoded as a number instead of string)
346/// - Contains backslash, double quote, brackets, braces, or control chars
347/// - Starts with `-` (ambiguous with list item marker)
348/// - Contains the ACTIVE delimiter for the current context:
349///   - Document context: colon (`:`)
350///   - InlineArray/TabularCell context: comma (`,`)
351fn needs_quoting(s: &str, ctx: QuoteContext) -> bool {
352    // Empty string
353    if s.is_empty() {
354        return true;
355    }
356    // Leading or trailing whitespace
357    if s != s.trim() {
358        return true;
359    }
360    // Looks like bool or null
361    if s == "true" || s == "false" || s == "null" {
362        return true;
363    }
364    // Looks like a number (including leading-zero forms like "05")
365    if looks_numeric(s) {
366        return true;
367    }
368    // Contains backslash or double quote
369    if s.contains('\\') || s.contains('"') {
370        return true;
371    }
372    // Contains brackets or braces
373    if s.contains('[') || s.contains(']') || s.contains('{') || s.contains('}') {
374        return true;
375    }
376    // Contains control characters
377    if s.contains('\n') || s.contains('\r') || s.contains('\t') {
378        return true;
379    }
380    // Starts with hyphen (could be confused with list item marker "- ")
381    if s.starts_with('-') {
382        return true;
383    }
384    // Context-dependent delimiter quoting
385    match ctx {
386        QuoteContext::Document => {
387            // Colon triggers quoting in document context
388            if s.contains(':') {
389                return true;
390            }
391        }
392        QuoteContext::InlineArray | QuoteContext::TabularCell => {
393            // Active delimiter (comma by default) triggers quoting
394            if s.contains(',') {
395                return true;
396            }
397        }
398    }
399    false
400}
401
/// Report whether a string could be mistaken for a number and so must be quoted.
///
/// After an optional single leading `-`, the remainder is treated as numeric when:
/// - it is longer than one byte, starts with `0`, and its second byte is not `.`
///   (covers leading-zero forms such as `05` or `0001`); or
/// - it consists only of ASCII digits, at most one `.` (before any exponent
///   marker), at most one `e`/`E` (not in first position), and `+`/`-` signs
///   appearing after the exponent marker — and contains at least one digit.
///
/// The check is deliberately permissive: it errs on the side of quoting.
fn looks_numeric(s: &str) -> bool {
    let body = s.strip_prefix('-').unwrap_or(s);
    if body.is_empty() {
        return false;
    }

    let raw = body.as_bytes();
    if raw.len() > 1 && raw[0] == b'0' && raw[1] != b'.' {
        return true;
    }

    let mut seen_dot = false;
    let mut seen_exp = false;
    let mut seen_digit = false;
    for (idx, byte) in raw.iter().copied().enumerate() {
        match byte {
            b'0'..=b'9' => seen_digit = true,
            b'.' => {
                if seen_dot || seen_exp {
                    return false;
                }
                seen_dot = true;
            }
            b'e' | b'E' => {
                if seen_exp || idx == 0 {
                    return false;
                }
                seen_exp = true;
            }
            b'+' | b'-' => {
                // Signs are only tolerated once an exponent marker has been seen.
                if !seen_exp {
                    return false;
                }
            }
            _ => return false,
        }
    }
    seen_digit
}
438
439/// Encode an object key. Keys matching `^[A-Za-z_][A-Za-z0-9_.]*$` are emitted
440/// unquoted; all others are quoted with escape sequences.
441fn encode_key(key: &str) -> String {
442    if is_valid_unquoted_key(key) {
443        key.to_string()
444    } else {
445        let mut out = String::with_capacity(key.len() + 2);
446        out.push('"');
447        for ch in key.chars() {
448            match ch {
449                '\\' => out.push_str("\\\\"),
450                '"' => out.push_str("\\\""),
451                '\n' => out.push_str("\\n"),
452                '\r' => out.push_str("\\r"),
453                '\t' => out.push_str("\\t"),
454                _ => out.push(ch),
455            }
456        }
457        out.push('"');
458        out
459    }
460}
461
/// Report whether a key may be written without quotes, i.e. whether it matches
/// `^[A-Za-z_][A-Za-z0-9_.]*$`.
///
/// The empty key and any key containing a non-ASCII character are rejected.
fn is_valid_unquoted_key(key: &str) -> bool {
    // Working on bytes is safe here: every byte of a multi-byte UTF-8 character
    // is non-ASCII and therefore fails both character-class checks.
    match key.as_bytes().split_first() {
        None => false,
        Some((head, tail)) => {
            let head_ok = head.is_ascii_alphabetic() || *head == b'_';
            head_ok
                && tail
                    .iter()
                    .all(|b| b.is_ascii_alphanumeric() || *b == b'_' || *b == b'.')
        }
    }
}
475
476/// Detect if an array is tabular: all elements are objects with identical key sets,
477/// all values are primitives (no nested arrays/objects).
478fn detect_tabular(arr: &[Value]) -> Option<Vec<String>> {
479    if arr.is_empty() {
480        return None;
481    }
482    // All must be objects
483    let first = arr[0].as_object()?;
484    let fields: Vec<String> = first.keys().cloned().collect();
485    if fields.is_empty() {
486        return None;
487    }
488    // All values in first object must be primitive
489    for val in first.values() {
490        if val.is_object() || val.is_array() {
491            return None;
492        }
493    }
494    // All subsequent objects must have the same keys with primitive values
495    for item in &arr[1..] {
496        let obj = item.as_object()?;
497        if obj.len() != fields.len() {
498            return None;
499        }
500        for field in &fields {
501            let val = obj.get(field)?;
502            if val.is_object() || val.is_array() {
503                return None;
504            }
505        }
506    }
507    Some(fields)
508}
509
510/// Check if all array elements are primitives (not objects or arrays).
511fn all_primitives(arr: &[Value]) -> bool {
512    arr.iter().all(|v| !v.is_object() && !v.is_array())
513}
514
/// Build the leading whitespace for a nesting level: two spaces per level,
/// so depth 0 yields the empty string.
fn make_indent(depth: usize) -> String {
    std::iter::repeat("  ").take(depth).collect()
}