toon_core/encoder.rs
//! TOON v3.0 Encoder — converts JSON into Token-Oriented Object Notation.
//!
//! TOON is a compact, human-readable format designed to minimize token usage when
//! feeding structured data to LLMs. The encoder targets the TOON v3.0 spec
//! (2025-11-24), including:
//!
//! - **Indentation-based nesting**: nested objects expressed via indentation, no braces/brackets
//! - **Inline arrays**: primitive arrays as `key[N]: v1,v2,v3`
//! - **Tabular arrays**: uniform object arrays as `key[N]{f1,f2}:\n  v1,v2\n  v3,v4`
//! - **Expanded lists**: mixed/complex arrays as `key[N]:\n  - item1\n  - item2`
//! - **Context-dependent quoting**: strings are quoted only when ambiguous (per delimiter scope)
//! - **Number normalization**: no exponents, no trailing zeros, -0 → 0
//!
//! # Example
//! ```
//! use toon_core::encode;
//! let json = r#"{"name":"Alice","age":30,"tags":["rust","wasm"]}"#;
//! let toon = encode(json).unwrap();
//! // name: Alice
//! // age: 30
//! // tags[2]: rust,wasm
//! ```
23
24use crate::error::Result;
25use serde_json::Value;
26
27/// Encode a JSON string into TOON v3.0 format.
28///
29/// Parses the input as JSON, then walks the value tree to produce a compact TOON
30/// representation. Returns an error if the input is not valid JSON.
31pub fn encode(json: &str) -> Result<String> {
32 let value: Value = serde_json::from_str(json)?;
33 let mut out = String::new();
34 encode_root(&value, &mut out);
35 Ok(out)
36}
37
38/// Top-level dispatch: objects emit fields, arrays emit root array syntax,
39/// primitives emit a bare value.
40fn encode_root(value: &Value, out: &mut String) {
41 match value {
42 Value::Object(map) => {
43 encode_object_fields(map, 0, out);
44 }
45 Value::Array(arr) => {
46 encode_root_array(arr, out);
47 }
48 _ => {
49 encode_primitive_value(value, QuoteContext::Document, out);
50 }
51 }
52}
53
54/// Encode a root-level array. Primitive arrays use inline syntax `[N]: v1,v2`;
55/// mixed/complex arrays use expanded list syntax `[N]:\n - item`.
56fn encode_root_array(arr: &[Value], out: &mut String) {
57 let len = arr.len();
58 if all_primitives(arr) {
59 out.push_str(&format!("[{}]: ", len));
60 encode_inline_values(arr, out);
61 } else {
62 out.push_str(&format!("[{}]:", len));
63 encode_list_items(arr, 0, out);
64 }
65}
66
67/// Emit all key-value pairs of an object at the given indentation depth.
68/// Each field appears on its own line; values are dispatched by type.
69///
70/// Relies on `serde_json::Map` with `preserve_order` feature to maintain
71/// the original JSON insertion order (IndexMap, not BTreeMap).
72fn encode_object_fields(map: &serde_json::Map<String, Value>, depth: usize, out: &mut String) {
73 let indent = make_indent(depth);
74 let mut first = true;
75 for (key, value) in map {
76 if !first {
77 out.push('\n');
78 }
79 first = false;
80 out.push_str(&indent);
81 out.push_str(&encode_key(key));
82 encode_field_value(key, value, depth, out);
83 }
84}
85
86/// Dispatch a field's value to the appropriate TOON encoding:
87/// - Empty objects → `key:`
88/// - Non-empty objects → `key:\n child_key: child_val`
89/// - Arrays → delegated to `encode_array_field` (inline/tabular/expanded)
90/// - Primitives → `key: value`
91fn encode_field_value(_key: &str, value: &Value, depth: usize, out: &mut String) {
92 match value {
93 Value::Object(map) if map.is_empty() => {
94 out.push(':');
95 }
96 Value::Object(map) => {
97 out.push(':');
98 out.push('\n');
99 encode_object_fields(map, depth + 1, out);
100 }
101 Value::Array(arr) => {
102 encode_array_field(arr, depth, out);
103 }
104 _ => {
105 out.push_str(": ");
106 encode_primitive_value(value, QuoteContext::Document, out);
107 }
108 }
109}
110
111/// Encode an array field value, selecting the most compact TOON representation:
112///
113/// 1. **Empty**: `key[0]:`
114/// 2. **Tabular**: all elements are objects with identical primitive-only keys →
115/// `key[N]{f1,f2}:\n v1,v2\n v3,v4`
116/// 3. **Inline**: all elements are primitives → `key[N]: v1,v2,v3`
117/// 4. **Expanded list**: mixed content → `key[N]:\n - item1\n - item2`
118fn encode_array_field(arr: &[Value], depth: usize, out: &mut String) {
119 let len = arr.len();
120
121 if arr.is_empty() {
122 out.push_str(&format!("[{}]:", len));
123 return;
124 }
125
126 // Tabular: uniform object arrays (greatest compression for repetitive data)
127 if let Some(fields) = detect_tabular(arr) {
128 out.push_str(&format!("[{}]{{{}}}:", len, fields.join(",")));
129 encode_tabular_rows(arr, &fields, depth, out);
130 return;
131 }
132
133 // Inline: all-primitive arrays on a single line
134 if all_primitives(arr) {
135 out.push_str(&format!("[{}]: ", len));
136 encode_inline_values(arr, out);
137 return;
138 }
139
140 // Expanded: complex/mixed arrays with "- " list markers
141 out.push_str(&format!("[{}]:", len));
142 encode_list_items(arr, depth, out);
143}
144
145/// Emit comma-separated primitive values on a single line: `v1,v2,v3`
146/// Quoting uses `InlineArray` context (comma is the active delimiter, not colon).
147fn encode_inline_values(arr: &[Value], out: &mut String) {
148 for (i, val) in arr.iter().enumerate() {
149 if i > 0 {
150 out.push(',');
151 }
152 encode_primitive_value(val, QuoteContext::InlineArray, out);
153 }
154}
155
156/// Emit tabular rows: each object's values as a comma-separated line, no keys repeated.
157/// Quoting uses `TabularCell` context (comma triggers quoting, not colon).
158fn encode_tabular_rows(arr: &[Value], fields: &[String], depth: usize, out: &mut String) {
159 let row_indent = make_indent(depth + 1);
160 for obj_val in arr {
161 out.push('\n');
162 out.push_str(&row_indent);
163 if let Value::Object(map) = obj_val {
164 for (i, field) in fields.iter().enumerate() {
165 if i > 0 {
166 out.push(',');
167 }
168 if let Some(val) = map.get(field) {
169 encode_primitive_value(val, QuoteContext::TabularCell, out);
170 }
171 }
172 }
173 }
174}
175
176/// Emit expanded list items with "- " markers. Each item can be:
177/// - A primitive value: `- hello`
178/// - An object: `- key1: val1\n key2: val2` (first field on hyphen line)
179/// - A nested array: `- [N]: v1,v2`
180fn encode_list_items(arr: &[Value], depth: usize, out: &mut String) {
181 let item_indent = make_indent(depth + 1);
182 for item in arr {
183 out.push('\n');
184 out.push_str(&item_indent);
185 out.push_str("- ");
186 match item {
187 Value::Object(map) => {
188 // First field on the hyphen line
189 let mut first = true;
190 for (key, value) in map {
191 if first {
192 first = false;
193 out.push_str(&encode_key(key));
194 encode_list_item_field_value(value, depth + 1, out);
195 } else {
196 out.push('\n');
197 // Sibling fields at same depth as "- " content
198 out.push_str(&make_indent(depth + 1));
199 out.push_str(" ");
200 out.push_str(&encode_key(key));
201 encode_list_item_field_value(value, depth + 1, out);
202 }
203 }
204 }
205 Value::Array(inner_arr) => {
206 // Nested array as list item
207 let len = inner_arr.len();
208 if all_primitives(inner_arr) {
209 out.push_str(&format!("[{}]: ", len));
210 encode_inline_values(inner_arr, out);
211 } else {
212 out.push_str(&format!("[{}]:", len));
213 encode_list_items(inner_arr, depth + 1, out);
214 }
215 }
216 _ => {
217 encode_primitive_value(item, QuoteContext::Document, out);
218 }
219 }
220 }
221}
222
223/// Encode a field value within a list item object. Differs from `encode_field_value`
224/// because nested objects inside list items use an extra indent level to account
225/// for the "- " prefix offset.
226fn encode_list_item_field_value(value: &Value, depth: usize, out: &mut String) {
227 match value {
228 Value::Object(map) if map.is_empty() => {
229 out.push(':');
230 }
231 Value::Object(map) => {
232 out.push(':');
233 out.push('\n');
234 // Nested object inside a list item: depth + 1 extra for the "- " offset
235 let nested_indent = make_indent(depth + 2);
236 let mut first = true;
237 for (key, val) in map {
238 if !first {
239 out.push('\n');
240 }
241 first = false;
242 out.push_str(&nested_indent);
243 out.push_str(&encode_key(key));
244 encode_field_value(key, val, depth + 2, out);
245 }
246 }
247 Value::Array(arr) => {
248 encode_array_field(arr, depth, out);
249 }
250 _ => {
251 out.push_str(": ");
252 encode_primitive_value(value, QuoteContext::Document, out);
253 }
254 }
255}
256
/// Position in the output where a primitive is being written.
///
/// `needs_quoting` uses this to decide which delimiter characters force a
/// string to be quoted.
#[derive(Copy, Clone, PartialEq)]
enum QuoteContext {
    /// An object field value, a list-item primitive, or a bare root primitive.
    Document,
    /// An element of an inline primitive array; comma is the active delimiter.
    InlineArray,
    /// A cell of a tabular row; comma is the active delimiter.
    TabularCell,
}
267
268/// Emit a primitive JSON value (null, bool, number, string) in TOON format.
269/// String quoting depends on the `QuoteContext` — different delimiters are
270/// "active" in different positions (see TOON v3.0 spec, delimiter scoping).
271fn encode_primitive_value(value: &Value, ctx: QuoteContext, out: &mut String) {
272 match value {
273 Value::Null => out.push_str("null"),
274 Value::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
275 Value::Number(n) => out.push_str(&format_number(n)),
276 Value::String(s) => encode_string_value(s, ctx, out),
277 _ => out.push_str("null"), // arrays/objects in primitive context
278 }
279}
280
281/// Format a JSON number per TOON v3.0 rules:
282/// - No scientific notation (exponents)
283/// - No leading zeros (except 0.x)
284/// - No trailing fractional zeros (3.10 → 3.1)
285/// - Negative zero normalizes to 0
286fn format_number(n: &serde_json::Number) -> String {
287 if let Some(i) = n.as_i64() {
288 return i.to_string();
289 }
290 if let Some(u) = n.as_u64() {
291 return u.to_string();
292 }
293 if let Some(f) = n.as_f64() {
294 if f.is_nan() || f.is_infinite() {
295 return "null".to_string();
296 }
297 // Normalize -0 to 0
298 let f = if f == 0.0 { 0.0 } else { f };
299 // Check if it's a whole number
300 if f.fract() == 0.0 && f.abs() < (i64::MAX as f64) {
301 return (f as i64).to_string();
302 }
303 // Format without trailing zeros
304 let s = format!("{}", f);
305 // Remove trailing zeros after decimal point
306 if s.contains('.') {
307 let trimmed = s.trim_end_matches('0');
308 let trimmed = trimmed.trim_end_matches('.');
309 trimmed.to_string()
310 } else {
311 s
312 }
313 } else {
314 "null".to_string()
315 }
316}
317
318/// Emit a string value, quoting and escaping only when necessary.
319/// Unquoted strings save 2 tokens (the quotes) per value — significant at scale.
320fn encode_string_value(s: &str, ctx: QuoteContext, out: &mut String) {
321 if needs_quoting(s, ctx) {
322 out.push('"');
323 for ch in s.chars() {
324 match ch {
325 '\\' => out.push_str("\\\\"),
326 '"' => out.push_str("\\\""),
327 '\n' => out.push_str("\\n"),
328 '\r' => out.push_str("\\r"),
329 '\t' => out.push_str("\\t"),
330 _ => out.push(ch),
331 }
332 }
333 out.push('"');
334 } else {
335 out.push_str(s);
336 }
337}
338
339/// Determine if a string value must be quoted to preserve TOON roundtrip fidelity.
340///
341/// A string MUST be quoted if it:
342/// - Is empty
343/// - Has leading/trailing whitespace
344/// - Looks like a boolean (`true`/`false`) or `null`
345/// - Looks numeric (would be decoded as a number instead of string)
346/// - Contains backslash, double quote, brackets, braces, or control chars
347/// - Starts with `-` (ambiguous with list item marker)
348/// - Contains the ACTIVE delimiter for the current context:
349/// - Document context: colon (`:`)
350/// - InlineArray/TabularCell context: comma (`,`)
351fn needs_quoting(s: &str, ctx: QuoteContext) -> bool {
352 // Empty string
353 if s.is_empty() {
354 return true;
355 }
356 // Leading or trailing whitespace
357 if s != s.trim() {
358 return true;
359 }
360 // Looks like bool or null
361 if s == "true" || s == "false" || s == "null" {
362 return true;
363 }
364 // Looks like a number (including leading-zero forms like "05")
365 if looks_numeric(s) {
366 return true;
367 }
368 // Contains backslash or double quote
369 if s.contains('\\') || s.contains('"') {
370 return true;
371 }
372 // Contains brackets or braces
373 if s.contains('[') || s.contains(']') || s.contains('{') || s.contains('}') {
374 return true;
375 }
376 // Contains control characters
377 if s.contains('\n') || s.contains('\r') || s.contains('\t') {
378 return true;
379 }
380 // Starts with hyphen (could be confused with list item marker "- ")
381 if s.starts_with('-') {
382 return true;
383 }
384 // Context-dependent delimiter quoting
385 match ctx {
386 QuoteContext::Document => {
387 // Colon triggers quoting in document context
388 if s.contains(':') {
389 return true;
390 }
391 }
392 QuoteContext::InlineArray | QuoteContext::TabularCell => {
393 // Active delimiter (comma by default) triggers quoting
394 if s.contains(',') {
395 return true;
396 }
397 }
398 }
399 false
400}
401
/// Check whether a string could be mistaken for a number and therefore has to
/// be quoted to keep its string type.
///
/// After an optional leading `-`, the remainder must consist solely of ASCII
/// digits, at most one `.` (before any exponent), at most one `e`/`E` (not in
/// first position) and `+`/`-` signs after the exponent marker, and it must
/// contain at least one digit. The scan is deliberately permissive: forms such
/// as `1.`, `.5` or `5e` are reported as numeric so they get quoted.
///
/// Leading-zero forms like `"05"` or `"0001"` are covered by the same scan.
/// Strings that merely start with `0` but contain other characters (`"0abc"`,
/// `"0x1F"`, `"0 items"`) are not numeric and are left unquoted.
fn looks_numeric(s: &str) -> bool {
    let rest = s.strip_prefix('-').unwrap_or(s);
    if rest.is_empty() {
        return false;
    }

    let mut has_dot = false;
    let mut has_e = false;
    let mut has_digit = false;
    for (i, b) in rest.bytes().enumerate() {
        match b {
            b'0'..=b'9' => has_digit = true,
            b'.' if !has_dot && !has_e => has_dot = true,
            b'e' | b'E' if !has_e && i > 0 => has_e = true,
            b'+' | b'-' if has_e => {}
            _ => return false,
        }
    }
    has_digit
}
438
439/// Encode an object key. Keys matching `^[A-Za-z_][A-Za-z0-9_.]*$` are emitted
440/// unquoted; all others are quoted with escape sequences.
441fn encode_key(key: &str) -> String {
442 if is_valid_unquoted_key(key) {
443 key.to_string()
444 } else {
445 let mut out = String::with_capacity(key.len() + 2);
446 out.push('"');
447 for ch in key.chars() {
448 match ch {
449 '\\' => out.push_str("\\\\"),
450 '"' => out.push_str("\\\""),
451 '\n' => out.push_str("\\n"),
452 '\r' => out.push_str("\\r"),
453 '\t' => out.push_str("\\t"),
454 _ => out.push(ch),
455 }
456 }
457 out.push('"');
458 out
459 }
460}
461
/// Reports whether `key` matches `^[A-Za-z_][A-Za-z0-9_.]*$` and may therefore
/// be written without quotes.
///
/// The empty string and any key containing a non-ASCII character are rejected.
fn is_valid_unquoted_key(key: &str) -> bool {
    // Working on bytes is safe here: every byte of a multi-byte UTF-8 sequence
    // is >= 0x80 and fails the ASCII class checks below.
    match key.as_bytes().split_first() {
        None => false,
        Some((&head, tail)) => {
            let head_ok = head.is_ascii_alphabetic() || head == b'_';
            head_ok
                && tail
                    .iter()
                    .all(|&b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.')
        }
    }
}
475
476/// Detect if an array is tabular: all elements are objects with identical key sets,
477/// all values are primitives (no nested arrays/objects).
478fn detect_tabular(arr: &[Value]) -> Option<Vec<String>> {
479 if arr.is_empty() {
480 return None;
481 }
482 // All must be objects
483 let first = arr[0].as_object()?;
484 let fields: Vec<String> = first.keys().cloned().collect();
485 if fields.is_empty() {
486 return None;
487 }
488 // All values in first object must be primitive
489 for val in first.values() {
490 if val.is_object() || val.is_array() {
491 return None;
492 }
493 }
494 // All subsequent objects must have the same keys with primitive values
495 for item in &arr[1..] {
496 let obj = item.as_object()?;
497 if obj.len() != fields.len() {
498 return None;
499 }
500 for field in &fields {
501 let val = obj.get(field)?;
502 if val.is_object() || val.is_array() {
503 return None;
504 }
505 }
506 }
507 Some(fields)
508}
509
510/// Check if all array elements are primitives (not objects or arrays).
511fn all_primitives(arr: &[Value]) -> bool {
512 arr.iter().all(|v| !v.is_object() && !v.is_array())
513}
514
/// Generate the indentation string for `depth` nesting levels, using two
/// spaces per level.
///
/// The width is spelled as a constant rather than a string literal so that it
/// cannot be silently altered by whitespace normalization of the source.
fn make_indent(depth: usize) -> String {
    const SPACES_PER_LEVEL: usize = 2;
    " ".repeat(depth * SPACES_PER_LEVEL)
}