toon-encode 0.1.1

Minimal TOON encoder — Token-Oriented Object Notation for LLM output
Documentation
//! Minimal TOON encoder — Token-Oriented Object Notation.
//!
//! TOON is a compact, human-readable encoding of JSON data for LLM prompts.
//! It declares field names once in tabular headers and uses CSV-like rows,
//! reducing token consumption by 30-50% for list-heavy responses.
//!
//! # Examples
//!
//! ```
//! use serde::Serialize;
//!
//! #[derive(Serialize)]
//! struct Item { name: String, value: i32 }
//!
//! let items = vec![
//!     Item { name: "alpha".into(), value: 1 },
//!     Item { name: "beta".into(), value: 2 },
//! ];
//! let toon = toon_encode::to_toon_string(&items).unwrap();
//! assert!(toon.contains("[2]{name,value}:"));
//! ```

use serde_json::Value;

/// One level of indentation; the encoders repeat it once per nesting depth.
const INDENT: &str = "  ";
/// Characters whose presence in a string makes `toon_quote` wrap it in quotes.
const TOON_SPECIAL: &[char] = &[',', ':', '"', '\\', '[', ']', '{', '}', '\n', '\r', '\t'];

/// Serialize `value` and render it as TOON text.
///
/// The value is first converted into a `serde_json::Value` tree, which is then
/// encoded starting at indentation depth 0.
///
/// # Errors
///
/// Returns the `serde_json::Error` reported by `serde_json::to_value` when
/// `value` cannot be converted into a JSON value.
pub fn to_toon_string<T: serde::Serialize>(value: &T) -> Result<String, serde_json::Error> {
    serde_json::to_value(value).map(|tree| encode_toon(&tree, 0))
}

/// Encode a `serde_json::Value` as a TOON string at the given indentation depth.
///
/// * Scalars (`null`, booleans, numbers, strings) are rendered on their own,
///   without any indentation; strings go through `toon_quote`.
/// * Arrays are rendered as a tabular block when `is_tabular` accepts them and
///   as a dash list otherwise; both start at `depth`.
/// * Objects are rendered as one `key: value` line per scalar field. A field
///   holding an object or array is rendered as a `key:` line followed by the
///   child encoded one level deeper.
///
/// An empty nested object or array contributes only its `key:` line, so the
/// output never contains blank lines or a trailing newline.
pub fn encode_toon(value: &Value, depth: usize) -> String {
    match value {
        Value::Null => "null".to_string(),
        Value::Bool(b) => b.to_string(),
        Value::Number(n) => n.to_string(),
        Value::String(s) => toon_quote(s),
        Value::Array(arr) if is_tabular(arr) => encode_tabular(arr, depth),
        Value::Array(arr) => encode_list(arr, depth),
        Value::Object(obj) => {
            // Only objects need the indentation prefix, so build it here.
            let indent = INDENT.repeat(depth);
            let mut lines = Vec::with_capacity(obj.len());
            for (key, child) in obj.iter() {
                let key = toon_quote(key);
                match child {
                    Value::Object(_) | Value::Array(_) => {
                        lines.push(format!("{indent}{key}:"));
                        let nested = encode_toon(child, depth + 1);
                        // Empty containers encode to "", which would otherwise
                        // show up as a blank line after the key.
                        if !nested.is_empty() {
                            lines.push(nested);
                        }
                    }
                    // Scalars ignore the depth argument; the key line carries the indent.
                    scalar => lines.push(format!("{indent}{key}: {}", encode_toon(scalar, 0))),
                }
            }
            lines.join("\n")
        }
    }
}

/// Decide whether an array can be rendered as a table.
///
/// Returns `true` only when the array is non-empty, every element is an
/// object, every object has exactly the same set of keys as the first one,
/// and none of the values is itself an object or an array.
fn is_tabular(arr: &[Value]) -> bool {
    /// True when the map holds no nested objects or arrays.
    fn is_flat(map: &serde_json::Map<String, Value>) -> bool {
        !map.values().any(|v| v.is_object() || v.is_array())
    }

    let Some((head, tail)) = arr.split_first() else {
        return false;
    };
    let Value::Object(template) = head else {
        return false;
    };
    if !is_flat(template) {
        return false;
    }

    for candidate in tail {
        let Value::Object(row) = candidate else {
            return false;
        };
        // Equal size plus "contains every template key" means identical key sets.
        let same_keys = row.len() == template.len()
            && template.keys().all(|k| row.contains_key(k.as_str()));
        if !(same_keys && is_flat(row)) {
            return false;
        }
    }
    true
}

/// Encode a tabular array as TOON with header row.
fn encode_tabular(arr: &[Value], depth: usize) -> String {
    let indent = INDENT.repeat(depth);
    let row_indent = INDENT.repeat(depth + 1);
    let Some(first) = arr[0].as_object() else {
        return String::new();
    };
    let fields: Vec<&String> = first.keys().collect();
    let header = fields
        .iter()
        .map(|f| f.as_str())
        .collect::<Vec<_>>()
        .join(",");
    let mut lines = vec![format!("{indent}[{}]{{{header}}}:", arr.len())];
    arr.iter().for_each(|row| {
        let Some(obj) = row.as_object() else { return };
        let vals: Vec<String> = fields
            .iter()
            .map(|f| encode_toon(&obj[f.as_str()], 0))
            .collect();
        lines.push(format!("{row_indent}{}", vals.join(",")));
    });
    lines.join("\n")
}

/// Encode a non-tabular array as TOON list.
fn encode_list(arr: &[Value], depth: usize) -> String {
    let row_indent = INDENT.repeat(depth);
    let mut lines = Vec::new();
    arr.iter().for_each(|v| {
        let encoded = encode_toon(v, depth + 1);
        if encoded.contains('\n') {
            lines.push(format!("{row_indent}-"));
            lines.push(encoded);
        } else {
            lines.push(format!("{row_indent}- {encoded}"));
        }
    });
    lines.join("\n")
}

/// Render a string as a TOON token, quoting it whenever the bare form would be
/// ambiguous.
///
/// The string is wrapped in double quotes when it:
///
/// * is empty or starts with `-`;
/// * contains any character from `TOON_SPECIAL`;
/// * has leading or trailing whitespace, which would otherwise be lost;
/// * is exactly `true`, `false` or `null`, which would otherwise read back as
///   a boolean or null;
/// * starts with a digit and parses as a number, which would otherwise read
///   back as a number.
///
/// Inside the quotes, `\`, `"`, newline, carriage return and tab are escaped as
/// `\\`, `\"`, `\n`, `\r` and `\t`. Any other string is returned unchanged.
fn toon_quote(s: &str) -> String {
    let reserved_word = matches!(s, "true" | "false" | "null");
    // A leading '-' is already handled below, so only digit-led text is checked.
    let numeric_like = s.starts_with(|c: char| c.is_ascii_digit()) && s.parse::<f64>().is_ok();
    let padded = s.trim().len() != s.len();

    let needs_quotes = s.is_empty()
        || s.starts_with('-')
        || s.contains(TOON_SPECIAL)
        || reserved_word
        || numeric_like
        || padded;
    if !needs_quotes {
        return s.to_string();
    }

    // Single pass over the input instead of one reallocation per escape kind.
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');
    for ch in s.chars() {
        match ch {
            '\\' => quoted.push_str("\\\\"),
            '"' => quoted.push_str("\\\""),
            '\n' => quoted.push_str("\\n"),
            '\r' => quoted.push_str("\\r"),
            '\t' => quoted.push_str("\\t"),
            other => quoted.push(other),
        }
    }
    quoted.push('"');
    quoted
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde::Serialize;
    use serde_json::json;

    /// Render a JSON value as a top-level (depth 0) TOON document.
    fn render(value: serde_json::Value) -> String {
        encode_toon(&value, 0)
    }

    /// Assert that `toon_quote` maps each input to its expected output.
    fn check_quotes(cases: &[(&str, &str)]) {
        for (raw, expected) in cases {
            assert_eq!(toon_quote(raw), *expected);
        }
    }

    // Quoting: plain strings pass through, risky ones get quoted and escaped.

    #[test]
    fn quote_plain() {
        check_quotes(&[("hello", "hello"), ("foo_bar", "foo_bar")]);
    }

    #[test]
    fn quote_special_chars() {
        check_quotes(&[
            ("a,b", "\"a,b\""),
            ("key: val", "\"key: val\""),
            ("", "\"\""),
        ]);
    }

    #[test]
    fn quote_control_chars() {
        check_quotes(&[
            ("line1\nline2", "\"line1\\nline2\""),
            ("col1\tcol2", "\"col1\\tcol2\""),
            ("cr\rhere", "\"cr\\rhere\""),
        ]);
    }

    #[test]
    fn tabular_with_newline_values() {
        let out = render(json!([
            {"name": "a", "body": "line1\nline2"},
            {"name": "b", "body": "single"},
        ]));
        // An embedded newline has to be escaped so that each record stays on one row.
        assert!(
            out.contains("\"line1\\nline2\""),
            "newline should be escaped, got: {out}"
        );
        let line_count = out.lines().count();
        assert_eq!(line_count, 3, "should be header + 2 rows, got: {out}");
    }

    #[test]
    fn quote_dash_start() {
        check_quotes(&[("-flag", "\"-flag\"")]);
    }

    #[test]
    fn quote_escapes() {
        check_quotes(&[
            ("say \"hi\"", "\"say \\\"hi\\\"\""),
            ("a\\b", "\"a\\\\b\""),
        ]);
    }

    // Scalars: each JSON primitive maps to a single token.

    #[test]
    fn encode_primitives() {
        let cases = [
            (json!(null), "null"),
            (json!(true), "true"),
            (json!(42), "42"),
            (json!("hello"), "hello"),
            (json!("a,b"), "\"a,b\""),
        ];
        for (value, expected) in cases {
            assert_eq!(encode_toon(&value, 0), expected);
        }
    }

    // Objects: one `key: value` line per scalar field.

    #[test]
    fn encode_flat_object() {
        let out = render(json!({"version": "0.5.5", "findings": 0}));
        for needle in ["version: 0.5.5", "findings: 0"] {
            assert!(out.contains(needle), "got: {out}");
        }
    }

    // Tabular arrays: header line plus comma-separated rows.

    #[test]
    fn encode_tabular_array() {
        let out = render(json!([
            {"name": "IOSP", "pct": 100.0},
            {"name": "CX", "pct": 99.8},
        ]));
        assert!(
            out.contains("[2]{name,pct}:"),
            "should have tabular header, got: {out}"
        );
        for row in ["IOSP,100.0", "CX,99.8"] {
            assert!(out.contains(row), "got: {out}");
        }
    }

    #[test]
    fn encode_empty_array_not_tabular() {
        assert_eq!(render(json!([])), "");
    }

    // Plain lists: one dash item per element.

    #[test]
    fn encode_primitive_list() {
        let out = render(json!(["a", "b", "c"]));
        for item in ["- a", "- b"] {
            assert!(out.contains(item), "got: {out}");
        }
    }

    #[test]
    fn encode_list_indentation() {
        let out = render(json!({"items": ["x", "y"]}));
        assert!(out.contains("items:"), "got: {out}");
        assert!(
            out.contains("  - x"),
            "items should be at 2-space indent, got: {out}"
        );
    }

    // Serde integration: arbitrary `Serialize` types go through `to_toon_string`.

    #[test]
    fn to_toon_string_with_struct() {
        #[derive(Serialize)]
        struct Item {
            name: String,
            value: i32,
        }

        let items: Vec<Item> = [("alpha", 1), ("beta", 2)]
            .into_iter()
            .map(|(name, value)| Item {
                name: name.to_string(),
                value,
            })
            .collect();

        let out = to_toon_string(&items).unwrap();
        for needle in ["[2]{name,value}:", "alpha,1", "beta,2"] {
            assert!(out.contains(needle), "got: {out}");
        }
    }

    #[test]
    fn to_toon_string_with_nested_struct() {
        #[derive(Serialize)]
        struct Entry {
            file: String,
            kind: String,
        }
        #[derive(Serialize)]
        struct Wrapper {
            results: Vec<Entry>,
            count: usize,
        }

        let results: Vec<Entry> = [("a.rs", "fn"), ("b.rs", "struct")]
            .into_iter()
            .map(|(file, kind)| Entry {
                file: file.to_string(),
                kind: kind.to_string(),
            })
            .collect();
        let wrapper = Wrapper { results, count: 2 };

        let out = to_toon_string(&wrapper).unwrap();
        for needle in ["results:", "[2]{file,kind}:", "count: 2"] {
            assert!(out.contains(needle), "got: {out}");
        }
    }
}