Skip to main content

_etoon/
toon.rs

1//! TOON encoder core (sonic-rs backend).
2//!
3//! Input: JSON bytes (from orjson.dumps on Python side).
4//! Output: TOON string, byte-identical to `toons.dumps()` + TOON spec v1.5.
5
6use sonic_rs::{Array, JsonContainerTrait, JsonType, JsonValueTrait, Object, Value};
7use std::collections::HashSet;
8use std::fmt::Write as _;
9
/// Encoder configuration matching TOON spec v1.5 options.
///
/// The type is `Copy`, so it can be passed around freely. `Debug`, `PartialEq`
/// and `Eq` are derived so a configuration can be logged and compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Config {
    /// Delimiter between array/tabular values. Must be `,`, `\t`, or `|`.
    pub delimiter: u8,
    /// If true, fold single-key object chains into dot-notation keys (safe mode).
    pub key_folding: bool,
    /// Max fold depth (segments). None = unlimited. 0 disables folding.
    pub flatten_depth: Option<usize>,
}
20
21impl Default for Config {
22    fn default() -> Self {
23        Self {
24            delimiter: b',',
25            key_folding: false,
26            flatten_depth: None,
27        }
28    }
29}
30
31pub fn encode(json_bytes: &[u8]) -> Result<String, String> {
32    encode_with(json_bytes, &Config::default())
33}
34
35pub fn encode_with(json_bytes: &[u8], cfg: &Config) -> Result<String, String> {
36    let value: Value =
37        sonic_rs::from_slice(json_bytes).map_err(|e| format!("JSON parse error: {}", e))?;
38    let mut out = String::with_capacity(json_bytes.len());
39    write_root(&value, cfg, &mut out);
40    Ok(out)
41}
42
43fn write_root(v: &Value, cfg: &Config, out: &mut String) {
44    match v.get_type() {
45        JsonType::Object => {
46            let m = v.as_object().unwrap();
47            if !m.is_empty() {
48                // Key folding applies only at the top-level object, per TOON spec v1.5.
49                write_object_body(m, 0, cfg, cfg.key_folding, out);
50            }
51        }
52        JsonType::Array => write_array_suffix(v.as_array().unwrap(), 0, cfg, out),
53        _ => write_scalar(v, cfg, out),
54    }
55}
56
57fn write_object_body(m: &Object, indent: usize, cfg: &Config, allow_fold: bool, out: &mut String) {
58    // Collect literal sibling keys only when folding at this level.
59    let siblings: Option<HashSet<&str>> = if allow_fold {
60        Some(m.iter().map(|(k, _)| k).collect())
61    } else {
62        None
63    };
64
65    let mut first = true;
66    for (k, v) in m.iter() {
67        if !first {
68            out.push('\n');
69        }
70        first = false;
71        write_indent(indent, out);
72
73        if let Some(ref sibs) = siblings {
74            if let Some((path, final_v)) = try_fold(k, v, cfg, sibs) {
75                for (i, seg) in path.iter().enumerate() {
76                    if i > 0 {
77                        out.push('.');
78                    }
79                    out.push_str(seg);
80                }
81                write_value_after_key(final_v, indent, cfg, out);
82                continue;
83            }
84        }
85
86        write_key(k, cfg, out);
87        write_value_after_key(v, indent, cfg, out);
88    }
89}
90
91/// Attempt to build a folded chain starting at (k, v).
92/// Returns `Some((path, final_value))` if fold is viable (chain length ≥ 2,
93/// all segments don't need quoting, within flatten_depth, no collision).
94fn try_fold<'a>(
95    k: &'a str,
96    v: &'a Value,
97    cfg: &Config,
98    siblings: &HashSet<&str>,
99) -> Option<(Vec<&'a str>, &'a Value)> {
100    // flatten_depth of 0 or 1 means no practical folding (need ≥2 segments).
101    let max_depth = cfg.flatten_depth.unwrap_or(usize::MAX);
102    if max_depth < 2 {
103        return None;
104    }
105
106    // Key segment must not need quoting (safe mode).
107    if needs_quoting(k, true, cfg.delimiter) {
108        return None;
109    }
110
111    // First value must be a non-empty single-key object for chain to start.
112    let mut cur_v = v;
113    let mut path: Vec<&'a str> = vec![k];
114
115    loop {
116        if path.len() >= max_depth {
117            break;
118        }
119        let obj = match cur_v.get_type() {
120            JsonType::Object => cur_v.as_object().unwrap(),
121            _ => break,
122        };
123        if obj.len() != 1 {
124            break;
125        }
126        let (nk, nv) = obj.iter().next().unwrap();
127        if needs_quoting(nk, true, cfg.delimiter) {
128            // Cannot fold past a segment requiring quotes (safe mode).
129            break;
130        }
131        path.push(nk);
132        cur_v = nv;
133    }
134
135    if path.len() < 2 {
136        return None;
137    }
138
139    // Collision check: verify folded path doesn't match any literal sibling key.
140    let joined: String = path.join(".");
141    for &s in siblings {
142        if s != k && s == joined.as_str() {
143            return None;
144        }
145    }
146
147    Some((path, cur_v))
148}
149
150/// Write the ": value" or ":\n<body>" tail after a key at `key_indent`.
151fn write_value_after_key(v: &Value, key_indent: usize, cfg: &Config, out: &mut String) {
152    match v.get_type() {
153        JsonType::Object => {
154            let child = v.as_object().unwrap();
155            if child.is_empty() {
156                out.push(':');
157            } else {
158                out.push_str(":\n");
159                // Nested object bodies never re-apply key folding (TOON spec: top-level only).
160                write_object_body(child, key_indent + 1, cfg, false, out);
161            }
162        }
163        JsonType::Array => write_array_suffix(v.as_array().unwrap(), key_indent, cfg, out),
164        _ => {
165            out.push_str(": ");
166            write_scalar(v, cfg, out);
167        }
168    }
169}
170
171fn write_array_suffix(arr: &Array, indent: usize, cfg: &Config, out: &mut String) {
172    write!(out, "[{}", arr.len()).unwrap();
173    if cfg.delimiter != b',' {
174        out.push(cfg.delimiter as char);
175    }
176    out.push(']');
177
178    if arr.is_empty() {
179        out.push(':');
180        return;
181    }
182
183    if arr.iter().all(is_scalar) {
184        out.push_str(": ");
185        let mut first = true;
186        for v in arr.iter() {
187            if !first {
188                out.push(cfg.delimiter as char);
189            }
190            first = false;
191            write_scalar(v, cfg, out);
192        }
193        return;
194    }
195
196    if let Some((keys, uniform_order)) = table_keys(arr) {
197        out.push('{');
198        for (i, k) in keys.iter().enumerate() {
199            if i > 0 {
200                out.push(cfg.delimiter as char);
201            }
202            write_key(k, cfg, out);
203        }
204        out.push_str("}:");
205        if uniform_order {
206            for item in arr.iter() {
207                let m = item.as_object().unwrap();
208                out.push('\n');
209                write_indent(indent + 1, out);
210                let mut first = true;
211                for (_, v) in m.iter() {
212                    if !first {
213                        out.push(cfg.delimiter as char);
214                    }
215                    first = false;
216                    write_scalar(v, cfg, out);
217                }
218            }
219        } else {
220            for item in arr.iter() {
221                let m = item.as_object().unwrap();
222                out.push('\n');
223                write_indent(indent + 1, out);
224                let mut first = true;
225                for k in &keys {
226                    if !first {
227                        out.push(cfg.delimiter as char);
228                    }
229                    first = false;
230                    write_scalar(m.get(k).unwrap(), cfg, out);
231                }
232            }
233        }
234        return;
235    }
236
237    out.push(':');
238    for item in arr.iter() {
239        out.push('\n');
240        write_indent(indent + 1, out);
241        out.push('-');
242        write_list_item(item, indent + 1, cfg, out);
243    }
244}
245
246fn write_list_item(v: &Value, l: usize, cfg: &Config, out: &mut String) {
247    match v.get_type() {
248        JsonType::Object => {
249            let m = v.as_object().unwrap();
250            if !m.is_empty() {
251                out.push(' ');
252                write_list_item_object(m, l, cfg, out);
253            }
254        }
255        JsonType::Array => {
256            out.push(' ');
257            write_array_suffix(v.as_array().unwrap(), l, cfg, out);
258        }
259        _ => {
260            out.push(' ');
261            write_scalar(v, cfg, out);
262        }
263    }
264}
265
266fn write_list_item_object(m: &Object, l: usize, cfg: &Config, out: &mut String) {
267    let mut first = true;
268    for (k, v) in m.iter() {
269        if !first {
270            out.push('\n');
271            write_indent(l + 1, out);
272        }
273        first = false;
274        write_key(k, cfg, out);
275        write_value_after_key(v, l + 1, cfg, out);
276    }
277}
278
279// ==================== Helpers ====================
280
// Cached indentation strings for levels 0 through 8 (two spaces per level).
const INDENTS: [&str; 9] = [
    "",
    "  ",
    "    ",
    "      ",
    "        ",
    "          ",
    "            ",
    "              ",
    "                ",
];

/// Append the indentation for `level` (two spaces per level) to `out`.
///
/// Levels covered by `INDENTS` are copied from the table; deeper levels are
/// generated one space at a time.
#[inline]
fn write_indent(level: usize, out: &mut String) {
    match INDENTS.get(level) {
        Some(cached) => out.push_str(cached),
        None => out.extend(std::iter::repeat(' ').take(level * 2)),
    }
}
304
305fn is_scalar(v: &Value) -> bool {
306    !matches!(v.get_type(), JsonType::Object | JsonType::Array)
307}
308
309/// Return ordered keys + order-uniformity flag if array is tabular-eligible.
310fn table_keys<'a>(arr: &'a Array) -> Option<(Vec<&'a str>, bool)> {
311    let first_v = arr.iter().next()?;
312    let first = first_v.as_object()?;
313    if first.is_empty() {
314        return None;
315    }
316    if !first.iter().all(|(_, v)| is_scalar(v)) {
317        return None;
318    }
319    let keys: Vec<&'a str> = first.iter().map(|(k, _)| k).collect();
320    let mut uniform_order = true;
321
322    for item in arr.iter().skip(1) {
323        let m = item.as_object()?;
324        if m.len() != keys.len() {
325            return None;
326        }
327        let mut row_iter = m.iter();
328        for k in &keys {
329            let (ik, iv) = row_iter.next()?;
330            if !is_scalar(iv) {
331                return None;
332            }
333            if ik != *k {
334                uniform_order = false;
335            }
336        }
337        // Order mismatch: re-verify via lookup that every header key exists in this row.
338        if !uniform_order {
339            for k in &keys {
340                match m.get(k) {
341                    Some(v) if is_scalar(v) => {}
342                    _ => return None,
343                }
344            }
345        }
346    }
347    Some((keys, uniform_order))
348}
349
350// ==================== Scalar ====================
351
352fn write_scalar(v: &Value, cfg: &Config, out: &mut String) {
353    match v.get_type() {
354        JsonType::Null => out.push_str("null"),
355        JsonType::Boolean => out.push_str(if v.as_bool().unwrap() {
356            "true"
357        } else {
358            "false"
359        }),
360        JsonType::Number => write_number(v, out),
361        JsonType::String => write_string_value(v.as_str().unwrap(), cfg, out),
362        _ => unreachable!("write_scalar on non-scalar"),
363    }
364}
365
366fn write_number(v: &Value, out: &mut String) {
367    if let Some(i) = v.as_i64() {
368        let mut buf = itoa::Buffer::new();
369        out.push_str(buf.format(i));
370        return;
371    }
372    if let Some(u) = v.as_u64() {
373        let mut buf = itoa::Buffer::new();
374        out.push_str(buf.format(u));
375        return;
376    }
377    let raw = v.to_string();
378    if !raw.contains('.') && !raw.contains('e') && !raw.contains('E') {
379        out.push_str(&raw);
380        return;
381    }
382    if let Some(f) = v.as_f64() {
383        write_float(f, out);
384    } else {
385        out.push_str("null");
386    }
387}
388
389fn write_float(f: f64, out: &mut String) {
390    if !f.is_finite() {
391        out.push_str("null");
392        return;
393    }
394    if f == 0.0 {
395        out.push('0');
396        return;
397    }
398    if f.fract() == 0.0 && f.abs() < 1e16 {
399        let mut buf = itoa::Buffer::new();
400        out.push_str(buf.format(f as i64));
401        return;
402    }
403    write!(out, "{}", f).unwrap();
404}
405
406// ==================== String ====================
407
408fn write_string_value(s: &str, cfg: &Config, out: &mut String) {
409    if needs_quoting(s, false, cfg.delimiter) {
410        write_quoted(s, out);
411    } else {
412        out.push_str(s);
413    }
414}
415
416fn write_key(k: &str, cfg: &Config, out: &mut String) {
417    if needs_quoting(k, true, cfg.delimiter) {
418        write_quoted(k, out);
419    } else {
420        out.push_str(k);
421    }
422}
423
424fn needs_quoting(s: &str, is_key: bool, delimiter: u8) -> bool {
425    if s.is_empty() {
426        return true;
427    }
428    let bytes = s.as_bytes();
429
430    if is_key {
431        // Keys must match TOON identifier pattern: [a-zA-Z_][a-zA-Z0-9_.]*
432        let first = bytes[0];
433        if !(first.is_ascii_alphabetic() || first == b'_') {
434            return true;
435        }
436        for &b in &bytes[1..] {
437            if !(b.is_ascii_alphanumeric() || b == b'_' || b == b'.') {
438                return true;
439            }
440        }
441        return false;
442    }
443
444    // Value rules
445    match bytes[0] {
446        b'-' | b'[' | b'{' | b'"' | b'#' | b' ' | b'\t' => return true,
447        _ => {}
448    }
449    match bytes[bytes.len() - 1] {
450        b' ' | b'\t' => return true,
451        _ => {}
452    }
453    for &b in bytes {
454        if b == delimiter {
455            return true;
456        }
457        match b {
458            b':' | b'\n' | b'\r' | b'\t' | b'"' | b'\\' => return true,
459            _ => {}
460        }
461    }
462    if matches!(s, "true" | "false" | "null") {
463        return true;
464    }
465    looks_like_number(bytes)
466}
467
/// Report whether `bytes` is entirely a decimal number literal.
///
/// Accepted shape: optional `-`, one or more digits, optional `.` followed by
/// one or more digits, optional exponent (`e`/`E`, optional sign, one or more
/// digits). Leading zeros are accepted. Anything left over after that shape
/// makes the result `false`.
///
/// An empty slice returns `false` rather than panicking.
fn looks_like_number(bytes: &[u8]) -> bool {
    /// Strip a non-empty run of ASCII digits; `None` if there is no digit.
    fn eat_digits(b: &[u8]) -> Option<&[u8]> {
        let n = b.iter().take_while(|c| c.is_ascii_digit()).count();
        if n == 0 {
            None
        } else {
            Some(&b[n..])
        }
    }

    // Optional leading minus sign.
    let unsigned = match bytes.split_first() {
        Some((&b'-', tail)) => tail,
        _ => bytes,
    };

    // Mandatory integer part.
    let mut rest = match eat_digits(unsigned) {
        Some(r) => r,
        None => return false,
    };

    // Optional fraction: a '.' must be followed by at least one digit.
    if let Some((&b'.', frac)) = rest.split_first() {
        rest = match eat_digits(frac) {
            Some(r) => r,
            None => return false,
        };
    }

    // Optional exponent: marker, optional sign, at least one digit.
    if let Some((&marker, exp)) = rest.split_first() {
        if marker == b'e' || marker == b'E' {
            let exp = match exp.split_first() {
                Some((&sign, tail)) if sign == b'+' || sign == b'-' => tail,
                _ => exp,
            };
            rest = match eat_digits(exp) {
                Some(r) => r,
                None => return false,
            };
        }
    }

    // Any trailing bytes mean this is not a pure number.
    rest.is_empty()
}
511
/// Append `s` to `out` as a double-quoted string.
///
/// Only backslash, double quote, newline, carriage return and tab are escaped
/// (`\\`, `\"`, `\n`, `\r`, `\t`); every other byte is copied through unchanged.
fn write_quoted(s: &str, out: &mut String) {
    out.push('"');

    // Copy unescaped runs in one go instead of pushing character by character.
    let mut run_start = 0;
    for (idx, byte) in s.bytes().enumerate() {
        let escape = match byte {
            b'\\' => "\\\\",
            b'"' => "\\\"",
            b'\n' => "\\n",
            b'\r' => "\\r",
            b'\t' => "\\t",
            _ => continue,
        };
        // Every escaped byte is ASCII, so `idx` and `idx + 1` are always char
        // boundaries and plain `&str` slicing is safe (no `unsafe` required).
        out.push_str(&s[run_start..idx]);
        out.push_str(escape);
        run_start = idx + 1;
    }
    out.push_str(&s[run_start..]);

    out.push('"');
}