Skip to main content

_etoon/
toon.rs

1//! TOON encoder core (sonic-rs backend).
2//!
3//! Input: JSON bytes (from orjson.dumps on Python side).
4//! Output: TOON string, matching TOON spec v1.5.
5//!
6//! Delimiter is monomorphized via const generics (`DELIM: u8`) so the
7//! byte-match inner loops fold away when emitting default-comma output.
8
9use sonic_rs::{Array, JsonContainerTrait, JsonType, JsonValueTrait, Object, Value};
10use std::collections::HashSet;
11use std::fmt::Write as _;
12
/// Encoder configuration matching TOON spec v1.5 options.
///
/// Start from [`Config::default`] and override individual fields as needed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Config {
    /// Delimiter between array/tabular values. Must be `,`, `\t`, or `|`.
    pub delimiter: u8,
    /// If true, fold single-key object chains into dot-notation keys (safe mode).
    pub key_folding: bool,
    /// Max fold depth (segments). None = unlimited. 0 disables folding.
    pub flatten_depth: Option<usize>,
}

impl Default for Config {
    /// Comma delimiter, key folding disabled, no fold-depth limit.
    fn default() -> Self {
        Self {
            delimiter: b',',
            key_folding: false,
            flatten_depth: None,
        }
    }
}
33
34pub fn encode(json_bytes: &[u8]) -> Result<String, String> {
35    encode_with(json_bytes, &Config::default())
36}
37
38pub fn encode_with(json_bytes: &[u8], cfg: &Config) -> Result<String, String> {
39    let value: Value =
40        sonic_rs::from_slice(json_bytes).map_err(|e| format!("JSON parse error: {}", e))?;
41    let mut out = String::with_capacity(json_bytes.len());
42    match cfg.delimiter {
43        b',' => write_root::<b','>(&value, cfg, &mut out),
44        b'\t' => write_root::<b'\t'>(&value, cfg, &mut out),
45        b'|' => write_root::<b'|'>(&value, cfg, &mut out),
46        _ => return Err("delimiter must be ',', '\\t', or '|'".to_string()),
47    }
48    Ok(out)
49}
50
51fn write_root<const DELIM: u8>(v: &Value, cfg: &Config, out: &mut String) {
52    match v.get_type() {
53        JsonType::Object => {
54            let m = v.as_object().unwrap();
55            if !m.is_empty() {
56                // Key folding applies only at the top-level object (TOON spec v1.5).
57                write_object_body::<DELIM>(m, 0, cfg, cfg.key_folding, out);
58            }
59        }
60        JsonType::Array => write_array_suffix::<DELIM>(v.as_array().unwrap(), 0, cfg, out),
61        _ => write_scalar::<DELIM>(v, out),
62    }
63}
64
65fn write_object_body<const DELIM: u8>(
66    m: &Object,
67    indent: usize,
68    cfg: &Config,
69    allow_fold: bool,
70    out: &mut String,
71) {
72    let siblings: Option<HashSet<&str>> = if allow_fold {
73        Some(m.iter().map(|(k, _)| k).collect())
74    } else {
75        None
76    };
77
78    let mut first = true;
79    for (k, v) in m.iter() {
80        if !first {
81            out.push('\n');
82        }
83        first = false;
84        write_indent(indent, out);
85
86        if let Some(ref sibs) = siblings {
87            if let Some((path, final_v)) = try_fold(k, v, cfg, sibs) {
88                for (i, seg) in path.iter().enumerate() {
89                    if i > 0 {
90                        out.push('.');
91                    }
92                    out.push_str(seg);
93                }
94                write_value_after_key::<DELIM>(final_v, indent, cfg, out);
95                continue;
96            }
97        }
98
99        write_key(k, out);
100        write_value_after_key::<DELIM>(v, indent, cfg, out);
101    }
102}
103
104fn try_fold<'a>(
105    k: &'a str,
106    v: &'a Value,
107    cfg: &Config,
108    siblings: &HashSet<&str>,
109) -> Option<(Vec<&'a str>, &'a Value)> {
110    let max_depth = cfg.flatten_depth.unwrap_or(usize::MAX);
111    if max_depth < 2 {
112        return None;
113    }
114
115    // Key segments must match TOON identifier pattern (safe mode).
116    if key_needs_quoting(k) {
117        return None;
118    }
119
120    let mut cur_v = v;
121    let mut path: Vec<&'a str> = vec![k];
122
123    loop {
124        if path.len() >= max_depth {
125            break;
126        }
127        let obj = match cur_v.get_type() {
128            JsonType::Object => cur_v.as_object().unwrap(),
129            _ => break,
130        };
131        if obj.len() != 1 {
132            break;
133        }
134        let (nk, nv) = obj.iter().next().unwrap();
135        if key_needs_quoting(nk) {
136            break;
137        }
138        path.push(nk);
139        cur_v = nv;
140    }
141
142    if path.len() < 2 {
143        return None;
144    }
145
146    let joined: String = path.join(".");
147    for &s in siblings {
148        if s != k && s == joined.as_str() {
149            return None;
150        }
151    }
152
153    Some((path, cur_v))
154}
155
156fn write_value_after_key<const DELIM: u8>(
157    v: &Value,
158    key_indent: usize,
159    cfg: &Config,
160    out: &mut String,
161) {
162    match v.get_type() {
163        JsonType::Object => {
164            let child = v.as_object().unwrap();
165            if child.is_empty() {
166                out.push(':');
167            } else {
168                out.push_str(":\n");
169                // Nested object bodies never re-apply key folding (TOON spec: top-level only).
170                write_object_body::<DELIM>(child, key_indent + 1, cfg, false, out);
171            }
172        }
173        JsonType::Array => write_array_suffix::<DELIM>(v.as_array().unwrap(), key_indent, cfg, out),
174        _ => {
175            out.push_str(": ");
176            write_scalar::<DELIM>(v, out);
177        }
178    }
179}
180
181fn write_array_suffix<const DELIM: u8>(arr: &Array, indent: usize, cfg: &Config, out: &mut String) {
182    write!(out, "[{}", arr.len()).unwrap();
183    if DELIM != b',' {
184        out.push(DELIM as char);
185    }
186    out.push(']');
187
188    if arr.is_empty() {
189        out.push(':');
190        return;
191    }
192
193    if arr.iter().all(is_scalar) {
194        out.push_str(": ");
195        let mut first = true;
196        for v in arr.iter() {
197            if !first {
198                out.push(DELIM as char);
199            }
200            first = false;
201            write_scalar::<DELIM>(v, out);
202        }
203        return;
204    }
205
206    if let Some((keys, uniform_order)) = table_keys(arr) {
207        out.push('{');
208        for (i, k) in keys.iter().enumerate() {
209            if i > 0 {
210                out.push(DELIM as char);
211            }
212            write_key(k, out);
213        }
214        out.push_str("}:");
215        if uniform_order {
216            for item in arr.iter() {
217                let m = item.as_object().unwrap();
218                out.push('\n');
219                write_indent(indent + 1, out);
220                let mut first = true;
221                for (_, v) in m.iter() {
222                    if !first {
223                        out.push(DELIM as char);
224                    }
225                    first = false;
226                    write_scalar::<DELIM>(v, out);
227                }
228            }
229        } else {
230            for item in arr.iter() {
231                let m = item.as_object().unwrap();
232                out.push('\n');
233                write_indent(indent + 1, out);
234                let mut first = true;
235                for k in &keys {
236                    if !first {
237                        out.push(DELIM as char);
238                    }
239                    first = false;
240                    write_scalar::<DELIM>(m.get(k).unwrap(), out);
241                }
242            }
243        }
244        return;
245    }
246
247    out.push(':');
248    for item in arr.iter() {
249        out.push('\n');
250        write_indent(indent + 1, out);
251        out.push('-');
252        write_list_item::<DELIM>(item, indent + 1, cfg, out);
253    }
254}
255
256fn write_list_item<const DELIM: u8>(v: &Value, l: usize, cfg: &Config, out: &mut String) {
257    match v.get_type() {
258        JsonType::Object => {
259            let m = v.as_object().unwrap();
260            if !m.is_empty() {
261                out.push(' ');
262                write_list_item_object::<DELIM>(m, l, cfg, out);
263            }
264        }
265        JsonType::Array => {
266            out.push(' ');
267            write_array_suffix::<DELIM>(v.as_array().unwrap(), l, cfg, out);
268        }
269        _ => {
270            out.push(' ');
271            write_scalar::<DELIM>(v, out);
272        }
273    }
274}
275
276fn write_list_item_object<const DELIM: u8>(m: &Object, l: usize, cfg: &Config, out: &mut String) {
277    let mut first = true;
278    for (k, v) in m.iter() {
279        if !first {
280            out.push('\n');
281            write_indent(l + 1, out);
282        }
283        first = false;
284        write_key(k, out);
285        write_value_after_key::<DELIM>(v, l + 1, cfg, out);
286    }
287}
288
289// ==================== Helpers ====================
290
/// Precomputed indentation strings for levels 0 through 8 (two spaces per level).
const INDENTS: [&str; 9] = [
    "",
    "  ",
    "    ",
    "      ",
    "        ",
    "          ",
    "            ",
    "              ",
    "                ",
];

/// Appends `level * 2` spaces to `out`, using the `INDENTS` table for the
/// levels it covers and a character loop for deeper ones.
#[inline]
fn write_indent(level: usize, out: &mut String) {
    match INDENTS.get(level) {
        Some(pad) => out.push_str(pad),
        None => (0..level * 2).for_each(|_| out.push(' ')),
    }
}
313
314fn is_scalar(v: &Value) -> bool {
315    !matches!(v.get_type(), JsonType::Object | JsonType::Array)
316}
317
318fn table_keys<'a>(arr: &'a Array) -> Option<(Vec<&'a str>, bool)> {
319    let first_v = arr.iter().next()?;
320    let first = first_v.as_object()?;
321    if first.is_empty() {
322        return None;
323    }
324    if !first.iter().all(|(_, v)| is_scalar(v)) {
325        return None;
326    }
327    let keys: Vec<&'a str> = first.iter().map(|(k, _)| k).collect();
328    let mut uniform_order = true;
329
330    for item in arr.iter().skip(1) {
331        let m = item.as_object()?;
332        if m.len() != keys.len() {
333            return None;
334        }
335        let mut row_iter = m.iter();
336        for k in &keys {
337            let (ik, iv) = row_iter.next()?;
338            if !is_scalar(iv) {
339                return None;
340            }
341            if ik != *k {
342                uniform_order = false;
343            }
344        }
345        if !uniform_order {
346            for k in &keys {
347                match m.get(k) {
348                    Some(v) if is_scalar(v) => {}
349                    _ => return None,
350                }
351            }
352        }
353    }
354    Some((keys, uniform_order))
355}
356
357// ==================== Scalar ====================
358
359#[inline]
360fn write_scalar<const DELIM: u8>(v: &Value, out: &mut String) {
361    match v.get_type() {
362        JsonType::Null => out.push_str("null"),
363        JsonType::Boolean => out.push_str(if v.as_bool().unwrap() {
364            "true"
365        } else {
366            "false"
367        }),
368        JsonType::Number => write_number(v, out),
369        JsonType::String => write_string_value::<DELIM>(v.as_str().unwrap(), out),
370        _ => unreachable!("write_scalar on non-scalar"),
371    }
372}
373
374fn write_number(v: &Value, out: &mut String) {
375    if let Some(i) = v.as_i64() {
376        let mut buf = itoa::Buffer::new();
377        out.push_str(buf.format(i));
378        return;
379    }
380    if let Some(u) = v.as_u64() {
381        let mut buf = itoa::Buffer::new();
382        out.push_str(buf.format(u));
383        return;
384    }
385    let raw = v.to_string();
386    if !raw.contains('.') && !raw.contains('e') && !raw.contains('E') {
387        out.push_str(&raw);
388        return;
389    }
390    if let Some(f) = v.as_f64() {
391        write_float(f, out);
392    } else {
393        out.push_str("null");
394    }
395}
396
397fn write_float(f: f64, out: &mut String) {
398    if !f.is_finite() {
399        out.push_str("null");
400        return;
401    }
402    if f == 0.0 {
403        out.push('0');
404        return;
405    }
406    if f.fract() == 0.0 && f.abs() < 1e16 {
407        let mut buf = itoa::Buffer::new();
408        out.push_str(buf.format(f as i64));
409        return;
410    }
411    write!(out, "{}", f).unwrap();
412}
413
414// ==================== String ====================
415
416#[inline]
417fn write_string_value<const DELIM: u8>(s: &str, out: &mut String) {
418    if value_needs_quoting::<DELIM>(s) {
419        write_quoted(s, out);
420    } else {
421        out.push_str(s);
422    }
423}
424
425fn write_key(k: &str, out: &mut String) {
426    if key_needs_quoting(k) {
427        write_quoted(k, out);
428    } else {
429        out.push_str(k);
430    }
431}
432
/// Returns true unless `s` matches the TOON identifier pattern
/// `[a-zA-Z_][a-zA-Z0-9_.]*`. The empty string always needs quoting.
#[inline]
fn key_needs_quoting(s: &str) -> bool {
    let mut bytes = s.bytes();

    let valid_start = matches!(bytes.next(), Some(b) if b.is_ascii_alphabetic() || b == b'_');
    if !valid_start {
        return true;
    }

    // Any non-ASCII byte fails the alphanumeric test, so multi-byte
    // characters force quoting.
    !bytes.all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.')
}
451
452#[inline]
453fn value_needs_quoting<const DELIM: u8>(s: &str) -> bool {
454    if s.is_empty() {
455        return true;
456    }
457    let bytes = s.as_bytes();
458    match bytes[0] {
459        b'-' | b'[' | b'{' | b'"' | b'#' | b' ' | b'\t' => return true,
460        _ => {}
461    }
462    match bytes[bytes.len() - 1] {
463        b' ' | b'\t' => return true,
464        _ => {}
465    }
466    // DELIM is a compile-time constant, so this match collapses into the
467    // single match arm below when DELIM is in {',', '\t'} (already included),
468    // and stays as a separate branch only for DELIM = '|'.
469    for &b in bytes {
470        match b {
471            b':' | b'\n' | b'\r' | b'\t' | b'"' | b'\\' => return true,
472            _ if b == DELIM => return true,
473            _ => {}
474        }
475    }
476    if matches!(s, "true" | "false" | "null") {
477        return true;
478    }
479    looks_like_number(bytes)
480}
481
/// Returns true when `bytes` has the shape of a decimal number:
/// optional `-`, one or more digits, optional `.` followed by one or more
/// digits, optional `e`/`E` with optional sign and one or more digits —
/// and nothing else.
///
/// Leading zeros are accepted (`05` counts as number-like). An empty slice
/// returns false rather than panicking.
fn looks_like_number(bytes: &[u8]) -> bool {
    /// Splits off the leading run of ASCII digits; returns its length and the rest.
    fn take_digits(s: &[u8]) -> (usize, &[u8]) {
        let n = s.iter().take_while(|b| b.is_ascii_digit()).count();
        (n, &s[n..])
    }

    let unsigned = match bytes.split_first() {
        Some((&b'-', tail)) => tail,
        _ => bytes,
    };

    let (int_len, mut rest) = take_digits(unsigned);
    if int_len == 0 {
        return false;
    }

    if let Some((&b'.', after_dot)) = rest.split_first() {
        let (frac_len, tail) = take_digits(after_dot);
        if frac_len == 0 {
            return false;
        }
        rest = tail;
    }

    if let Some((&(b'e' | b'E'), after_e)) = rest.split_first() {
        let digits_start = match after_e.split_first() {
            Some((&(b'+' | b'-'), tail)) => tail,
            _ => after_e,
        };
        let (exp_len, tail) = take_digits(digits_start);
        if exp_len == 0 {
            return false;
        }
        rest = tail;
    }

    rest.is_empty()
}
525
/// Appends `s` to `out` wrapped in double quotes, escaping `\`, `"`, newline,
/// carriage return and tab as `\\`, `\"`, `\n`, `\r` and `\t`. All other
/// characters are copied through unchanged.
fn write_quoted(s: &str, out: &mut String) {
    out.push('"');

    // Start of the run of bytes not yet copied to `out`.
    let mut pending = 0;
    for (i, b) in s.bytes().enumerate() {
        let escape = match b {
            b'\\' => "\\\\",
            b'"' => "\\\"",
            b'\n' => "\\n",
            b'\r' => "\\r",
            b'\t' => "\\t",
            _ => continue,
        };
        // Every escaped byte is ASCII, so `i` and `i + 1` are always char
        // boundaries and plain `str` slicing is safe here.
        out.push_str(&s[pending..i]);
        out.push_str(escape);
        pending = i + 1;
    }
    out.push_str(&s[pending..]);

    out.push('"');
}