Skip to main content

_etoon/
toon.rs

1//! TOON encoder core (sonic-rs backend).
2//!
3//! Input: JSON bytes (from orjson.dumps on Python side).
4//! Output: TOON string, matching TOON spec v1.5.
5//!
6//! Delimiter is monomorphized via const generics (`DELIM: u8`) so the
7//! byte-match inner loops fold away when emitting default-comma output.
8
9use sonic_rs::{Array, JsonContainerTrait, JsonType, JsonValueTrait, Object, Value};
10use std::collections::HashSet;
11use std::fmt::Write as _;
12
/// Encoder configuration matching TOON spec v1.5 options.
///
/// Derives `Debug`, `PartialEq` and `Eq` in addition to `Clone`/`Copy` so a
/// configuration can be logged and compared by callers and tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Config {
    /// Delimiter between array/tabular values. Must be `,`, `\t`, or `|`.
    pub delimiter: u8,
    /// If true, fold single-key object chains into dot-notation keys (safe mode).
    pub key_folding: bool,
    /// Max fold depth (segments). None = unlimited. 0 disables folding.
    pub flatten_depth: Option<usize>,
}

impl Default for Config {
    /// Returns the default configuration: comma delimiter, key folding
    /// disabled, and no limit on fold depth.
    fn default() -> Self {
        Self {
            delimiter: b',',
            key_folding: false,
            flatten_depth: None,
        }
    }
}
33
34pub fn encode(json_bytes: &[u8]) -> Result<String, String> {
35    encode_with(json_bytes, &Config::default())
36}
37
38pub fn encode_with(json_bytes: &[u8], cfg: &Config) -> Result<String, String> {
39    let value: Value =
40        sonic_rs::from_slice(json_bytes).map_err(|e| format!("JSON parse error: {}", e))?;
41    let mut out = String::with_capacity(json_bytes.len());
42    match cfg.delimiter {
43        b',' => write_root::<b','>(&value, cfg, &mut out),
44        b'\t' => write_root::<b'\t'>(&value, cfg, &mut out),
45        b'|' => write_root::<b'|'>(&value, cfg, &mut out),
46        _ => return Err("delimiter must be ',', '\\t', or '|'".to_string()),
47    }
48    Ok(out)
49}
50
51fn write_root<const DELIM: u8>(v: &Value, cfg: &Config, out: &mut String) {
52    match v.get_type() {
53        JsonType::Object => {
54            let m = v.as_object().unwrap();
55            if !m.is_empty() {
56                // Key folding applies only at the top-level object (TOON spec v1.5).
57                write_object_body::<DELIM>(m, 0, cfg, cfg.key_folding, out);
58            }
59        }
60        JsonType::Array => write_array_suffix::<DELIM>(v.as_array().unwrap(), 0, cfg, out),
61        _ => write_scalar::<DELIM>(v, out),
62    }
63}
64
65fn write_object_body<const DELIM: u8>(
66    m: &Object,
67    indent: usize,
68    cfg: &Config,
69    allow_fold: bool,
70    out: &mut String,
71) {
72    let siblings: Option<HashSet<&str>> = if allow_fold {
73        Some(m.iter().map(|(k, _)| k).collect())
74    } else {
75        None
76    };
77
78    let mut first = true;
79    for (k, v) in m.iter() {
80        if !first {
81            out.push('\n');
82        }
83        first = false;
84        write_indent(indent, out);
85
86        if let Some(ref sibs) = siblings {
87            if let Some((joined, final_v)) = try_fold(k, v, cfg, sibs) {
88                write_key(&joined, out);
89                write_value_after_key::<DELIM>(final_v, indent, cfg, out);
90                continue;
91            }
92        }
93
94        write_key(k, out);
95        write_value_after_key::<DELIM>(v, indent, cfg, out);
96    }
97}
98
99fn try_fold<'a>(
100    k: &'a str,
101    v: &'a Value,
102    cfg: &Config,
103    siblings: &HashSet<&str>,
104) -> Option<(String, &'a Value)> {
105    let max_depth = cfg.flatten_depth.unwrap_or(usize::MAX);
106    if max_depth < 2 {
107        return None;
108    }
109
110    // Key segments must match TOON identifier pattern (safe mode).
111    if key_needs_quoting(k) {
112        return None;
113    }
114
115    let mut cur_v = v;
116    let mut path: Vec<&'a str> = vec![k];
117
118    loop {
119        if path.len() >= max_depth {
120            break;
121        }
122        let obj = match cur_v.get_type() {
123            JsonType::Object => cur_v.as_object().unwrap(),
124            _ => break,
125        };
126        if obj.len() != 1 {
127            break;
128        }
129        let (nk, nv) = obj.iter().next().unwrap();
130        if key_needs_quoting(nk) {
131            break;
132        }
133        path.push(nk);
134        cur_v = nv;
135    }
136
137    if path.len() < 2 {
138        return None;
139    }
140
141    let joined: String = path.join(".");
142    for &s in siblings {
143        if s != k && s == joined.as_str() {
144            return None;
145        }
146    }
147
148    Some((joined, cur_v))
149}
150
151fn write_value_after_key<const DELIM: u8>(
152    v: &Value,
153    key_indent: usize,
154    cfg: &Config,
155    out: &mut String,
156) {
157    match v.get_type() {
158        JsonType::Object => {
159            let child = v.as_object().unwrap();
160            if child.is_empty() {
161                out.push(':');
162            } else {
163                out.push_str(":\n");
164                // Nested object bodies never re-apply key folding (TOON spec: top-level only).
165                write_object_body::<DELIM>(child, key_indent + 1, cfg, false, out);
166            }
167        }
168        JsonType::Array => write_array_suffix::<DELIM>(v.as_array().unwrap(), key_indent, cfg, out),
169        _ => {
170            out.push_str(": ");
171            write_scalar::<DELIM>(v, out);
172        }
173    }
174}
175
176fn write_array_suffix<const DELIM: u8>(arr: &Array, indent: usize, cfg: &Config, out: &mut String) {
177    write!(out, "[{}", arr.len()).unwrap();
178    if DELIM != b',' {
179        out.push(DELIM as char);
180    }
181    out.push(']');
182
183    if arr.is_empty() {
184        out.push(':');
185        return;
186    }
187
188    if arr.iter().all(is_scalar) {
189        out.push_str(": ");
190        let mut first = true;
191        for v in arr.iter() {
192            if !first {
193                out.push(DELIM as char);
194            }
195            first = false;
196            write_scalar::<DELIM>(v, out);
197        }
198        return;
199    }
200
201    if let Some((keys, uniform_order)) = table_keys(arr) {
202        out.push('{');
203        for (i, k) in keys.iter().enumerate() {
204            if i > 0 {
205                out.push(DELIM as char);
206            }
207            write_key(k, out);
208        }
209        out.push_str("}:");
210        if uniform_order {
211            for item in arr.iter() {
212                let m = item.as_object().unwrap();
213                out.push('\n');
214                write_indent(indent + 1, out);
215                let mut first = true;
216                for (_, v) in m.iter() {
217                    if !first {
218                        out.push(DELIM as char);
219                    }
220                    first = false;
221                    write_scalar::<DELIM>(v, out);
222                }
223            }
224        } else {
225            for item in arr.iter() {
226                let m = item.as_object().unwrap();
227                out.push('\n');
228                write_indent(indent + 1, out);
229                let mut first = true;
230                for k in &keys {
231                    if !first {
232                        out.push(DELIM as char);
233                    }
234                    first = false;
235                    write_scalar::<DELIM>(m.get(k).unwrap(), out);
236                }
237            }
238        }
239        return;
240    }
241
242    out.push(':');
243    for item in arr.iter() {
244        out.push('\n');
245        write_indent(indent + 1, out);
246        out.push('-');
247        write_list_item::<DELIM>(item, indent + 1, cfg, out);
248    }
249}
250
251fn write_list_item<const DELIM: u8>(v: &Value, l: usize, cfg: &Config, out: &mut String) {
252    match v.get_type() {
253        JsonType::Object => {
254            let m = v.as_object().unwrap();
255            if !m.is_empty() {
256                out.push(' ');
257                write_list_item_object::<DELIM>(m, l, cfg, out);
258            }
259        }
260        JsonType::Array => {
261            out.push(' ');
262            write_array_suffix::<DELIM>(v.as_array().unwrap(), l, cfg, out);
263        }
264        _ => {
265            out.push(' ');
266            write_scalar::<DELIM>(v, out);
267        }
268    }
269}
270
271fn write_list_item_object<const DELIM: u8>(m: &Object, l: usize, cfg: &Config, out: &mut String) {
272    let mut first = true;
273    for (k, v) in m.iter() {
274        if !first {
275            out.push('\n');
276            write_indent(l + 1, out);
277        }
278        first = false;
279        write_key(k, out);
280        write_value_after_key::<DELIM>(v, l + 1, cfg, out);
281    }
282}
283
284// ==================== Helpers ====================
285
/// Precomputed indentation strings for levels 0 through 8 (two spaces per level).
const INDENTS: [&str; 9] = [
    "",
    "  ",
    "    ",
    "      ",
    "        ",
    "          ",
    "            ",
    "              ",
    "                ",
];

/// Appends the indentation for `level` (two spaces per level) to `out`.
///
/// Levels covered by `INDENTS` are copied from the table; deeper levels are
/// generated one space at a time.
#[inline]
fn write_indent(level: usize, out: &mut String) {
    match INDENTS.get(level) {
        Some(pad) => out.push_str(pad),
        None => out.extend(std::iter::repeat(' ').take(level * 2)),
    }
}
308
309fn is_scalar(v: &Value) -> bool {
310    !matches!(v.get_type(), JsonType::Object | JsonType::Array)
311}
312
313fn table_keys<'a>(arr: &'a Array) -> Option<(Vec<&'a str>, bool)> {
314    let first_v = arr.iter().next()?;
315    let first = first_v.as_object()?;
316    if first.is_empty() {
317        return None;
318    }
319    if !first.iter().all(|(_, v)| is_scalar(v)) {
320        return None;
321    }
322    let keys: Vec<&'a str> = first.iter().map(|(k, _)| k).collect();
323    let mut uniform_order = true;
324
325    for item in arr.iter().skip(1) {
326        let m = item.as_object()?;
327        if m.len() != keys.len() {
328            return None;
329        }
330        let mut row_iter = m.iter();
331        for k in &keys {
332            let (ik, iv) = row_iter.next()?;
333            if !is_scalar(iv) {
334                return None;
335            }
336            if ik != *k {
337                uniform_order = false;
338            }
339        }
340        if !uniform_order {
341            for k in &keys {
342                match m.get(k) {
343                    Some(v) if is_scalar(v) => {}
344                    _ => return None,
345                }
346            }
347        }
348    }
349    Some((keys, uniform_order))
350}
351
352// ==================== Scalar ====================
353
354#[inline]
355fn write_scalar<const DELIM: u8>(v: &Value, out: &mut String) {
356    match v.get_type() {
357        JsonType::Null => out.push_str("null"),
358        JsonType::Boolean => out.push_str(if v.as_bool().unwrap() {
359            "true"
360        } else {
361            "false"
362        }),
363        JsonType::Number => write_number(v, out),
364        JsonType::String => write_string_value::<DELIM>(v.as_str().unwrap(), out),
365        _ => unreachable!("write_scalar on non-scalar"),
366    }
367}
368
369fn write_number(v: &Value, out: &mut String) {
370    if let Some(i) = v.as_i64() {
371        let mut buf = itoa::Buffer::new();
372        out.push_str(buf.format(i));
373        return;
374    }
375    if let Some(u) = v.as_u64() {
376        let mut buf = itoa::Buffer::new();
377        out.push_str(buf.format(u));
378        return;
379    }
380    let raw = v.to_string();
381    if !raw.contains('.') && !raw.contains('e') && !raw.contains('E') {
382        out.push_str(&raw);
383        return;
384    }
385    if let Some(f) = v.as_f64() {
386        write_float(f, out);
387    } else {
388        out.push_str("null");
389    }
390}
391
392fn write_float(f: f64, out: &mut String) {
393    if !f.is_finite() {
394        out.push_str("null");
395        return;
396    }
397    if f == 0.0 {
398        out.push('0');
399        return;
400    }
401    if f.fract() == 0.0 && f.abs() < 1e16 {
402        let mut buf = itoa::Buffer::new();
403        out.push_str(buf.format(f as i64));
404        return;
405    }
406    write!(out, "{}", f).unwrap();
407}
408
409// ==================== String ====================
410
411#[inline]
412fn write_string_value<const DELIM: u8>(s: &str, out: &mut String) {
413    if value_needs_quoting::<DELIM>(s) {
414        write_quoted(s, out);
415    } else {
416        out.push_str(s);
417    }
418}
419
420fn write_key(k: &str, out: &mut String) {
421    if key_needs_quoting(k) {
422        write_quoted(k, out);
423    } else {
424        out.push_str(k);
425    }
426}
427
/// Returns true when a key cannot be written unquoted.
///
/// Unquoted keys must match `[@$#]?[a-zA-Z_][a-zA-Z0-9_.]*`: an optional
/// single sigil, then a letter or underscore, then any run of ASCII
/// letters, digits, underscores and dots. The empty string and a bare sigil
/// need quoting.
///
/// Sigil prefixes are allowed for ecosystem compatibility:
/// - `@` : AWS CloudWatch, Elasticsearch, Serilog, XML→JSON
/// - `$` : MongoDB, JSON Schema, AWS CloudFormation
/// - `#` : JSON-LD, Azure Resource Manager
#[inline]
fn key_needs_quoting(s: &str) -> bool {
    let bytes = s.as_bytes();

    // Drop at most one leading sigil.
    let ident = match bytes.first() {
        None => return true,
        Some(b'@') | Some(b'$') | Some(b'#') => &bytes[1..],
        Some(_) => bytes,
    };

    match ident.split_first() {
        // Nothing after the sigil.
        None => true,
        Some((head, tail)) => {
            let head_ok = head.is_ascii_alphabetic() || *head == b'_';
            let tail_ok = tail
                .iter()
                .all(|b| b.is_ascii_alphanumeric() || *b == b'_' || *b == b'.');
            !(head_ok && tail_ok)
        }
    }
}
459
460#[inline]
461fn value_needs_quoting<const DELIM: u8>(s: &str) -> bool {
462    if s.is_empty() {
463        return true;
464    }
465    let bytes = s.as_bytes();
466    match bytes[0] {
467        b'-' | b'[' | b'{' | b'"' | b'#' | b' ' | b'\t' => return true,
468        _ => {}
469    }
470    match bytes[bytes.len() - 1] {
471        b' ' | b'\t' => return true,
472        _ => {}
473    }
474    // DELIM is a compile-time constant, so this match collapses into the
475    // single match arm below when DELIM is in {',', '\t'} (already included),
476    // and stays as a separate branch only for DELIM = '|'.
477    for &b in bytes {
478        match b {
479            b':' | b'\n' | b'\r' | b'\t' | b'"' | b'\\' => return true,
480            _ if b == DELIM => return true,
481            _ => {}
482        }
483    }
484    if matches!(s, "true" | "false" | "null") {
485        return true;
486    }
487    looks_like_number(bytes)
488}
489
/// Returns true when `bytes` has the shape of a decimal number:
/// an optional `-`, one or more digits, an optional `.` followed by one or
/// more digits, and an optional exponent (`e`/`E`, optional sign, one or
/// more digits), with nothing left over.
///
/// Leading zeros are accepted (`007` counts as number-like). An empty slice
/// is not a number and returns false.
fn looks_like_number(bytes: &[u8]) -> bool {
    /// Splits off the leading run of ASCII digits; returns its length and the rest.
    fn take_digits(b: &[u8]) -> (usize, &[u8]) {
        let n = b.iter().take_while(|c| c.is_ascii_digit()).count();
        (n, &b[n..])
    }

    // Optional sign. `split_first` also handles the empty slice safely.
    let unsigned = match bytes.split_first() {
        Some((&b'-', tail)) => tail,
        _ => bytes,
    };

    // Integer part (required).
    let (int_len, mut rest) = take_digits(unsigned);
    if int_len == 0 {
        return false;
    }

    // Optional fraction: a dot must be followed by at least one digit.
    if let Some((&b'.', tail)) = rest.split_first() {
        let (frac_len, after) = take_digits(tail);
        if frac_len == 0 {
            return false;
        }
        rest = after;
    }

    // Optional exponent: marker, optional sign, at least one digit.
    if let Some((&marker, tail)) = rest.split_first() {
        if marker == b'e' || marker == b'E' {
            let exp_digits = match tail.split_first() {
                Some((&b'+', t)) | Some((&b'-', t)) => t,
                _ => tail,
            };
            let (exp_len, after) = take_digits(exp_digits);
            if exp_len == 0 {
                return false;
            }
            rest = after;
        }
    }

    rest.is_empty()
}
533
/// Writes `s` surrounded by double quotes, escaping backslash, double quote,
/// newline, carriage return and tab as `\\`, `\"`, `\n`, `\r` and `\t`.
/// All other characters are copied unchanged.
fn write_quoted(s: &str, out: &mut String) {
    out.push('"');

    // `start` marks the beginning of the pending run of unescaped text.
    let mut start = 0;
    for (i, b) in s.bytes().enumerate() {
        let escaped = match b {
            b'\\' => "\\\\",
            b'"' => "\\\"",
            b'\n' => "\\n",
            b'\r' => "\\r",
            b'\t' => "\\t",
            _ => continue,
        };
        // Every escaped byte is ASCII, so `i` and `i + 1` are always char
        // boundaries and safe slicing cannot panic here.
        out.push_str(&s[start..i]);
        out.push_str(escaped);
        start = i + 1;
    }
    out.push_str(&s[start..]);

    out.push('"');
}