Skip to main content

aura_toon/
lib.rs

1//! # aura-toon
2//!
3//! Lightweight encoder for **TOON** — Token-Oriented Object Notation.
4//!
5//! TOON is a JSON-compatible serialization format designed to reduce token
6//! count when feeding structured data to LLMs. For typical payloads it
7//! produces 30–60% fewer tokens than equivalent JSON, with a tabular form
8//! for arrays of homogeneous objects.
9//!
10//! Spec: <https://github.com/toon-format/spec>
11//!
12//! ## Example
13//!
14//! ```
15//! use serde_json::json;
16//!
17//! let value = json!({
18//!     "snapshots": [
19//!         {"file": "main.rs", "trigger": "watcher", "ts": 123},
20//!         {"file": "lib.rs",  "trigger": "mcp",     "ts": 456}
21//!     ]
22//! });
23//!
24//! let toon = aura_toon::encode(&value);
25//! assert!(toon.contains("snapshots[2]{file,trigger,ts}:"));
26//! ```
27//!
28//! ## Caveman Mode
29//!
30//! For natural-language strings, `caveman()` strips articles, filler words,
31//! hedging, and pleasantries — producing terse, fragment-style output that
32//! uses 20–40% fewer tokens while preserving all technical meaning.
33//!
34//! ```
35//! let verbose = "I would be happy to help you with that. The function is currently \
36//!                failing because the variable is not properly initialized.";
37//! let terse = aura_toon::caveman(verbose);
38//! assert!(terse.contains("function"));
39//! assert!(!terse.contains("would be happy"));
40//! ```
41//!
42//! Originally extracted from [Aura](https://auravcs.com), the semantic
43//! version control engine.
44
45use serde_json::Value;
46
/// Text emitted once per nesting level at the start of an indented line (two spaces).
const INDENT: &str = "  ";
48
49/// Encode a serde_json::Value as a TOON string.
50pub fn encode(value: &Value) -> String {
51    let mut out = String::new();
52    encode_value(value, 0, &mut out);
53    // Remove trailing newline per spec
54    while out.ends_with('\n') {
55        out.pop();
56    }
57    out
58}
59
60fn encode_value(value: &Value, depth: usize, out: &mut String) {
61    match value {
62        Value::Null => out.push_str("null"),
63        Value::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
64        Value::Number(n) => out.push_str(&canonical_number(n)),
65        Value::String(s) => encode_string(s, ',', out),
66        Value::Array(arr) => encode_array(arr, depth, out),
67        Value::Object(obj) => encode_object_fields(obj, depth, out),
68    }
69}
70
71fn canonical_number(n: &serde_json::Number) -> String {
72    if let Some(i) = n.as_i64() {
73        return i.to_string();
74    }
75    if let Some(u) = n.as_u64() {
76        return u.to_string();
77    }
78    if let Some(f) = n.as_f64() {
79        if f.is_nan() || f.is_infinite() {
80            return "null".to_string();
81        }
82        if f == 0.0 {
83            return "0".to_string();
84        }
85        // Check if it's effectively an integer
86        if f.fract() == 0.0 && f.abs() < (i64::MAX as f64) {
87            return (f as i64).to_string();
88        }
89        // Remove trailing zeros
90        let s = format!("{}", f);
91        s
92    } else {
93        "null".to_string()
94    }
95}
96
97/// Quote a string value if needed per TOON spec section 7.2
98fn needs_quoting(s: &str, delimiter: char) -> bool {
99    if s.is_empty() {
100        return true;
101    }
102    if s.starts_with(' ') || s.ends_with(' ') {
103        return true;
104    }
105    if s == "true" || s == "false" || s == "null" {
106        return true;
107    }
108    if s.starts_with('-') {
109        return true;
110    }
111    // Numeric pattern check
112    if looks_numeric(s) {
113        return true;
114    }
115    // Contains special characters
116    for c in s.chars() {
117        if c == ':'
118            || c == '"'
119            || c == '\\'
120            || c == '['
121            || c == ']'
122            || c == '{'
123            || c == '}'
124            || c == '\n'
125            || c == '\r'
126            || c == '\t'
127        {
128            return true;
129        }
130        if c == delimiter {
131            return true;
132        }
133    }
134    false
135}
136
/// Report whether `s` would be read back as a number if left unquoted.
///
/// After an optional leading `-`, the accepted shape is: one or more digits,
/// an optional `.` followed by one or more digits, and an optional exponent
/// (`e`/`E`, optional sign, one or more digits), with nothing after it.
/// In addition, any text that starts with `0` immediately followed by another
/// digit is reported as numeric regardless of what comes later.
fn looks_numeric(s: &str) -> bool {
    // Consume a run of at least one ASCII digit; `None` when there is none.
    fn digits(bytes: &[u8]) -> Option<&[u8]> {
        let run = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
        if run == 0 {
            None
        } else {
            Some(&bytes[run..])
        }
    }

    // Every character of interest is ASCII, so scanning bytes is equivalent
    // to scanning chars.
    let unsigned = s.strip_prefix('-').unwrap_or(s).as_bytes();

    // Leading zero followed by another digit.
    if let [b'0', second, ..] = unsigned {
        if second.is_ascii_digit() {
            return true;
        }
    }

    // Integer part (mandatory).
    let mut rest = match digits(unsigned) {
        Some(after_int) => after_int,
        None => return false,
    };

    // Fractional part (optional, but a bare trailing '.' is rejected).
    if let Some((b'.', after_dot)) = rest.split_first() {
        rest = match digits(after_dot) {
            Some(after_frac) => after_frac,
            None => return false,
        };
    }

    // Exponent (optional, but must carry digits once the marker is present).
    if let Some((marker, after_marker)) = rest.split_first() {
        if *marker == b'e' || *marker == b'E' {
            let exponent = match after_marker.split_first() {
                Some((b'+', tail)) | Some((b'-', tail)) => tail,
                _ => after_marker,
            };
            rest = match digits(exponent) {
                Some(after_exp) => after_exp,
                None => return false,
            };
        }
    }

    rest.is_empty()
}
187
/// Return `s` with backslash, double quote, newline, carriage return and tab
/// replaced by their two-character backslash escapes. All other characters
/// are copied through unchanged.
fn escape_string(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let sequence = match ch {
            '\\' => Some("\\\\"),
            '"' => Some("\\\""),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        match sequence {
            Some(seq) => escaped.push_str(seq),
            None => escaped.push(ch),
        }
    }
    escaped
}
202
203fn encode_string(s: &str, delimiter: char, out: &mut String) {
204    if needs_quoting(s, delimiter) {
205        out.push('"');
206        out.push_str(&escape_string(s));
207        out.push('"');
208    } else {
209        out.push_str(s);
210    }
211}
212
/// Report whether an object key has to be quoted.
///
/// A key may stay bare only if it matches `^[A-Za-z_][A-Za-z0-9_.]*$`;
/// the empty key and everything else must be quoted.
fn key_needs_quoting(k: &str) -> bool {
    let mut rest = k.chars();
    match rest.next() {
        Some(head) if head.is_ascii_alphabetic() || head == '_' => {
            !rest.all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.')
        }
        // Empty key, or a first character outside [A-Za-z_].
        _ => true,
    }
}
230
231fn encode_key(k: &str, out: &mut String) {
232    if key_needs_quoting(k) {
233        out.push('"');
234        out.push_str(&escape_string(k));
235        out.push('"');
236    } else {
237        out.push_str(k);
238    }
239}
240
241fn indent(depth: usize, out: &mut String) {
242    for _ in 0..depth {
243        out.push_str(INDENT);
244    }
245}
246
247fn encode_object_fields(obj: &serde_json::Map<String, Value>, depth: usize, out: &mut String) {
248    for (key, val) in obj {
249        indent(depth, out);
250        encode_key(key, out);
251
252        match val {
253            Value::Object(inner) if !inner.is_empty() => {
254                out.push_str(":\n");
255                encode_object_fields(inner, depth + 1, out);
256            }
257            Value::Array(arr) => {
258                encode_array_header(key, arr, out);
259                encode_array_body(arr, depth, out);
260            }
261            _ => {
262                out.push_str(": ");
263                encode_value(val, depth + 1, out);
264                out.push('\n');
265            }
266        }
267    }
268}
269
270/// Try tabular form: all elements are objects with identical keys and all primitive values
271fn try_tabular(arr: &[Value]) -> Option<Vec<String>> {
272    if arr.is_empty() {
273        return None;
274    }
275    let mut fields: Option<Vec<String>> = None;
276    for item in arr {
277        let obj = item.as_object()?;
278        let keys: Vec<String> = obj.keys().cloned().collect();
279        // All values must be primitives
280        for val in obj.values() {
281            if val.is_object() || val.is_array() {
282                return None;
283            }
284        }
285        match &fields {
286            None => fields = Some(keys),
287            Some(f) => {
288                if keys.len() != f.len() || keys.iter().zip(f.iter()).any(|(a, b)| a != b) {
289                    return None;
290                }
291            }
292        }
293    }
294    fields
295}
296
297fn encode_array_header(_key: &str, arr: &[Value], out: &mut String) {
298    // Don't re-emit key, it was already emitted by caller
299    // Actually this is called from encode_object_fields which already emitted the key
300    // We need to emit [N] or [N]{fields}:
301    let len = arr.len();
302
303    if let Some(fields) = try_tabular(arr) {
304        out.push_str(&format!("[{}]{{", len));
305        for (i, f) in fields.iter().enumerate() {
306            if i > 0 {
307                out.push(',');
308            }
309            encode_key(f, out);
310        }
311        out.push_str("}:\n");
312    } else if all_primitives(arr) {
313        // Inline primitive array
314        out.push_str(&format!("[{}]: ", len));
315        for (i, val) in arr.iter().enumerate() {
316            if i > 0 {
317                out.push(',');
318            }
319            encode_primitive_value(val, ',', out);
320        }
321        out.push('\n');
322    } else {
323        out.push_str(&format!("[{}]:\n", len));
324    }
325}
326
327fn all_primitives(arr: &[Value]) -> bool {
328    arr.iter().all(|v| !v.is_object() && !v.is_array())
329}
330
331fn encode_primitive_value(val: &Value, delimiter: char, out: &mut String) {
332    match val {
333        Value::Null => out.push_str("null"),
334        Value::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
335        Value::Number(n) => out.push_str(&canonical_number(n)),
336        Value::String(s) => encode_string(s, delimiter, out),
337        _ => {}
338    }
339}
340
341fn encode_array_body(arr: &[Value], depth: usize, out: &mut String) {
342    if let Some(fields) = try_tabular(arr) {
343        // Tabular rows
344        for item in arr {
345            let obj = item.as_object().unwrap();
346            indent(depth + 1, out);
347            for (i, f) in fields.iter().enumerate() {
348                if i > 0 {
349                    out.push(',');
350                }
351                let val = obj.get(f).unwrap_or(&Value::Null);
352                encode_primitive_value(val, ',', out);
353            }
354            out.push('\n');
355        }
356    } else if all_primitives(arr) {
357        // Already handled inline in header
358    } else {
359        // Mixed/expansion form
360        for item in arr {
361            indent(depth + 1, out);
362            out.push_str("- ");
363            match item {
364                Value::Object(obj) if !obj.is_empty() => {
365                    // First field on hyphen line
366                    let mut iter = obj.iter();
367                    if let Some((k, v)) = iter.next() {
368                        encode_key(k, out);
369                        match v {
370                            Value::Object(inner) if !inner.is_empty() => {
371                                out.push_str(":\n");
372                                encode_object_fields(inner, depth + 2, out);
373                            }
374                            _ => {
375                                out.push_str(": ");
376                                encode_value(v, depth + 2, out);
377                                out.push('\n');
378                            }
379                        }
380                        // Remaining fields at depth+1
381                        for (k, v) in iter {
382                            indent(depth + 2, out);
383                            encode_key(k, out);
384                            match v {
385                                Value::Object(inner) if !inner.is_empty() => {
386                                    out.push_str(":\n");
387                                    encode_object_fields(inner, depth + 3, out);
388                                }
389                                _ => {
390                                    out.push_str(": ");
391                                    encode_value(v, depth + 3, out);
392                                    out.push('\n');
393                                }
394                            }
395                        }
396                    }
397                }
398                _ => {
399                    encode_value(item, depth + 2, out);
400                    out.push('\n');
401                }
402            }
403        }
404    }
405}
406
407fn encode_array(arr: &[Value], depth: usize, out: &mut String) {
408    // Root-level array (no key)
409    let len = arr.len();
410
411    if let Some(fields) = try_tabular(arr) {
412        out.push_str(&format!("[{}]{{", len));
413        for (i, f) in fields.iter().enumerate() {
414            if i > 0 {
415                out.push(',');
416            }
417            encode_key(f, out);
418        }
419        out.push_str("}:\n");
420        for item in arr {
421            let obj = item.as_object().unwrap();
422            indent(depth + 1, out);
423            for (i, f) in fields.iter().enumerate() {
424                if i > 0 {
425                    out.push(',');
426                }
427                let val = obj.get(f).unwrap_or(&Value::Null);
428                encode_primitive_value(val, ',', out);
429            }
430            out.push('\n');
431        }
432    } else if all_primitives(arr) {
433        out.push_str(&format!("[{}]: ", len));
434        for (i, val) in arr.iter().enumerate() {
435            if i > 0 {
436                out.push(',');
437            }
438            encode_primitive_value(val, ',', out);
439        }
440        out.push('\n');
441    } else {
442        out.push_str(&format!("[{}]:\n", len));
443        encode_array_body(arr, depth, out);
444    }
445}
446
447// ─── Caveman Mode ─────────────────────────────────────────────────────────
448// Strips low-information tokens from natural language to reduce LLM token
449// usage while preserving technical meaning. Drop articles, filler, hedging,
450// pleasantries. Collapse verbose phrases to short synonyms.
451
/// Strip filler words, articles, hedging, and pleasantries from prose.
///
/// Designed for LLM-to-LLM communication where politeness wastes tokens.
/// The text is processed in three passes:
///
/// 1. every occurrence of each [`STRIP_PHRASES`] entry is removed;
/// 2. every occurrence of each [`REPLACE_PHRASES`] entry is replaced by its
///    short form;
/// 3. the text is split on whitespace, tokens that equal a [`FILLER_WORDS`]
///    entry (ignoring surrounding ASCII punctuation) are dropped, and the
///    rest are re-joined with single spaces.
///
/// Phrase matching ignores ASCII case and only fires on word boundaries, so
/// `"Sure,"` is not removed from the middle of `"ensure,"`. Punctuation that
/// is left standing alone after a removal (a lone `.` or `,`) is attached to
/// the previous word, or dropped at the start of the text; punctuation that
/// is part of a token, as in `.gitignore`, is left untouched.
///
/// Because the text is re-joined with spaces, line breaks in the input are
/// not preserved.
///
/// # Example
///
/// ```
/// let input = "Sure! I'd be happy to help you with that. The function \
///              is currently not working because the variable has not been \
///              properly initialized in the constructor.";
/// let output = aura_toon::caveman(input);
/// assert!(!output.contains("Sure!"));
/// assert!(!output.contains("I'd be happy to help"));
/// assert!(output.contains("function"));
/// assert!(output.contains("variable"));
/// ```
pub fn caveman(text: &str) -> String {
    let mut result = text.to_string();

    // Phase 1: strip pleasantry phrases.
    for phrase in STRIP_PHRASES {
        result = replace_phrase(&result, phrase, "");
    }

    // Phase 2: replace verbose phrases with short synonyms.
    for (verbose, short) in REPLACE_PHRASES {
        result = replace_phrase(&result, verbose, short);
    }

    // Phase 3: drop filler tokens and tidy punctuation orphaned by phases 1-2.
    let mut kept: Vec<String> = Vec::new();
    for word in result.split_whitespace() {
        let core = word.trim_matches(|c: char| c.is_ascii_punctuation());
        if FILLER_WORDS.iter().any(|filler| filler.eq_ignore_ascii_case(core)) {
            continue;
        }

        if kept.is_empty() {
            // Nothing to attach to yet: a punctuation-only fragment is noise.
            if word.chars().all(|c| matches!(c, '.' | ',' | '!')) {
                continue;
            }
        } else if word.chars().all(|c| c == '.' || c == ',') {
            // Dangling "." / ",": glue it to the previous token unless that
            // would double the same mark ("foo,," or "foo..").
            if let Some(prev) = kept.last_mut() {
                if prev.chars().next_back() != word.chars().next() {
                    prev.push_str(word);
                }
            }
            continue;
        }

        kept.push(word.to_string());
    }

    kept.join(" ")
}

/// Replace every word-bounded, ASCII-case-insensitive occurrence of `phrase`
/// in `haystack` with `replacement`.
fn replace_phrase(haystack: &str, phrase: &str, replacement: &str) -> String {
    let hay = haystack.as_bytes();
    let pat = phrase.as_bytes();
    if pat.is_empty() || pat.len() > hay.len() {
        return haystack.to_string();
    }

    let mut out = String::with_capacity(haystack.len());
    let mut copied = 0; // start of the text not yet copied into `out`
    let mut i = 0;
    while i + pat.len() <= hay.len() {
        let end = i + pat.len();
        // A byte-wise match against a valid UTF-8 phrase can only start and
        // end on character boundaries of `haystack`, so the slices below are
        // always valid. Offsets are taken from the original text, never from
        // a case-converted copy whose byte length may differ.
        if hay[i..end].eq_ignore_ascii_case(pat) && on_word_boundaries(haystack, i, end) {
            out.push_str(&haystack[copied..i]);
            out.push_str(replacement);
            copied = end;
            i = end;
        } else {
            i += 1;
        }
    }
    out.push_str(&haystack[copied..]);
    out
}

/// True unless `text[start..end]` begins or ends in the middle of a word,
/// i.e. an alphanumeric edge of the match touches an alphanumeric neighbour.
fn on_word_boundaries(text: &str, start: usize, end: usize) -> bool {
    let matched = &text[start..end];
    let first_is_word = matched.chars().next().map_or(false, char::is_alphanumeric);
    let last_is_word = matched.chars().next_back().map_or(false, char::is_alphanumeric);
    let before_is_word = text[..start]
        .chars()
        .next_back()
        .map_or(false, char::is_alphanumeric);
    let after_is_word = text[end..].chars().next().map_or(false, char::is_alphanumeric);

    !(first_is_word && before_is_word) && !(last_is_word && after_is_word)
}

/// Pleasantry phrases stripped entirely (matched case-insensitively).
///
/// Order matters: longer phrases come before the shorter ones they contain.
const STRIP_PHRASES: &[&str] = &[
    "I would be happy to help you with that.",
    "I'd be happy to help you with that.",
    "I'd be happy to help with that.",
    "I would be happy to help with that.",
    "I'd be happy to help!",
    "I'd be happy to help.",
    "Sure! I'd be happy to",
    "Sure, I'd be happy to",
    "Sure! I can help with that.",
    "Sure, I can help with that.",
    "Sure thing!",
    "Sure!",
    "Sure,",
    "Of course!",
    "Of course,",
    "Absolutely!",
    "Absolutely,",
    "Let me help you with that.",
    "I'll help you with that.",
    "Great question!",
    "That's a great question.",
    "Good question!",
    "Here's what I found:",
    "Here is what I found:",
    "Let me explain.",
    "Let me break this down.",
    "I hope this helps!",
    "I hope that helps!",
    "Hope this helps!",
    "Let me know if you have any questions.",
    "Let me know if you need anything else.",
    "Feel free to ask if you have any questions.",
    "Don't hesitate to ask.",
    "Happy to help further!",
    "Is there anything else I can help with?",
    "Is there anything else you need?",
];

/// Verbose → short phrase replacements (matched case-insensitively, applied
/// in list order).
const REPLACE_PHRASES: &[(&str, &str)] = &[
    ("in order to", "to"),
    ("due to the fact that", "because"),
    ("for the purpose of", "for"),
    ("in the event that", "if"),
    ("at this point in time", "now"),
    ("at the present time", "now"),
    ("on the other hand", "but"),
    ("in addition to", "plus"),
    ("as a result of", "from"),
    ("with regard to", "re"),
    ("with respect to", "re"),
    ("in terms of", "for"),
    ("a large number of", "many"),
    ("a significant amount of", "much"),
    ("it is important to note that", "note:"),
    ("it should be noted that", "note:"),
    ("it is worth mentioning that", "note:"),
    ("please note that", "note:"),
    ("as you can see", ""),
    ("as mentioned above", ""),
    ("as previously mentioned", ""),
    ("is currently not working", "fails"),
    ("is not working", "fails"),
    ("is currently failing", "fails"),
    ("does not work", "fails"),
    ("has not been", "wasn't"),
    ("have not been", "weren't"),
    ("is not able to", "can't"),
    ("are not able to", "can't"),
    ("was not able to", "couldn't"),
    ("it appears that", ""),
    ("it seems that", ""),
    ("it looks like", ""),
    ("I believe that", ""),
    ("I think that", ""),
    ("in my opinion", ""),
    ("basically what happens is", ""),
    ("what's happening here is", ""),
    ("the reason for this is", "reason:"),
    ("the issue here is that", "issue:"),
    ("the problem is that", "problem:"),
    ("make sure to", "must"),
    ("you need to make sure", "must"),
    ("you'll want to", ""),
    ("you might want to", ""),
    ("you should consider", "consider"),
    ("it would be a good idea to", "should"),
    ("properly initialized", "initialized"),
    ("correctly configured", "configured"),
    ("successfully completed", "completed"),
];

/// Single filler words stripped when they appear as standalone tokens.
/// Entries are lowercase; tokens are compared ignoring ASCII case.
const FILLER_WORDS: &[&str] = &[
    "the",
    "a",
    "an", // articles
    "just",
    "really",
    "very", // intensifiers
    "quite",
    "rather",
    "fairly",
    "somewhat",
    "actually",
    "basically",
    "essentially",
    "literally",
    "obviously",
    "clearly",
    "simply",
    "merely",
    "certainly",
    "definitely",
    "perhaps",
    "maybe",
    "possibly",
    "potentially",
    "presumably",
    "however",
    "furthermore",
    "moreover",
    "additionally",
    "consequently",
    "therefore",
    "thus",
    "hence",
    "accordingly",
    "please",
    "kindly",
    "respective",
    "corresponding",
];
657
// Unit tests for the TOON encoder and for `caveman`. Most encoder tests use
// substring checks rather than comparing the whole output, so they do not
// depend on the order in which object keys are emitted.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Scalar fields of a flat object are written as `key: value` lines.
    #[test]
    fn test_simple_object() {
        let val = json!({"name": "aura", "version": "0.4.0", "active": true});
        let toon = encode(&val);
        assert!(toon.contains("name: aura"));
        assert!(toon.contains("version: 0.4.0"));
        assert!(toon.contains("active: true"));
    }

    /// A nested object gets a `key:` line and its fields indented one level.
    #[test]
    fn test_nested_object() {
        let val = json!({"server": {"name": "aura-vcs", "version": "1.0"}});
        let toon = encode(&val);
        assert!(toon.contains("server:\n"));
        assert!(toon.contains("  name: aura-vcs"));
    }

    /// An array of objects with identical keys and scalar values uses the
    /// tabular form: one header naming the fields, then one row per object.
    #[test]
    fn test_tabular_array() {
        let val = json!({
            "snapshots": [
                {"file": "main.rs", "trigger": "watcher", "ts": 123},
                {"file": "lib.rs", "trigger": "mcp", "ts": 456}
            ]
        });
        let toon = encode(&val);
        assert!(toon.contains("snapshots[2]{file,trigger,ts}:"));
        assert!(toon.contains("main.rs,watcher,123"));
    }

    /// A string containing `:` must be emitted in double quotes.
    #[test]
    fn test_quoting() {
        let val = json!({"msg": "hello world: test"});
        let toon = encode(&val);
        assert!(toon.contains("\"hello world: test\""));
    }

    /// An empty root object encodes to the empty string.
    #[test]
    fn test_empty_object() {
        let val = json!({});
        let toon = encode(&val);
        assert_eq!(toon, "");
    }

    /// An array of scalars is written inline on the header line.
    #[test]
    fn test_primitive_array() {
        let val = json!({"tags": ["rust", "git", "ai"]});
        let toon = encode(&val);
        assert!(toon.contains("tags[3]: rust,git,ai"));
    }

    // ── Caveman tests ──

    /// Pleasantry phrases are removed while the technical sentence survives.
    #[test]
    fn test_caveman_strips_pleasantries() {
        let input = "Sure! I'd be happy to help you with that. The function fails.";
        let output = caveman(input);
        assert!(!output.contains("Sure"));
        assert!(!output.contains("happy"));
        assert!(output.contains("function fails"));
    }

    /// Verbose phrases are swapped for their short forms, matching
    /// case-insensitively ("In order to" at the start of the sentence).
    #[test]
    fn test_caveman_replaces_verbose_phrases() {
        let input = "In order to fix the bug, due to the fact that the config is wrong.";
        let output = caveman(input);
        assert!(output.contains("to fix"));
        assert!(output.contains("because"));
        assert!(!output.contains("in order to"));
        assert!(!output.contains("due to the fact that"));
    }

    /// Standalone filler words are dropped; the words around them are kept.
    #[test]
    fn test_caveman_strips_filler_words() {
        let input = "The variable is actually just really not initialized.";
        let output = caveman(input);
        assert!(!output.contains("actually"));
        assert!(!output.contains("just"));
        assert!(!output.contains("really"));
        assert!(output.contains("variable"));
        assert!(output.contains("not initialized"));
    }

    /// Type names and generic syntax pass through untouched.
    #[test]
    fn test_caveman_preserves_technical_content() {
        let input = "HashMap<String, Vec<u8>> implements Clone and Send.";
        let output = caveman(input);
        assert!(output.contains("HashMap<String,"));
        assert!(output.contains("Clone"));
        assert!(output.contains("Send"));
    }

    /// Empty input yields empty output.
    #[test]
    fn test_caveman_empty_input() {
        assert_eq!(caveman(""), "");
    }

    /// On a typical verbose answer the output is more than 25% shorter (in
    /// bytes) and still contains the key technical nouns.
    #[test]
    fn test_caveman_token_reduction() {
        let verbose = "Sure! I'd be happy to help you with that. The function is currently \
                       not working because the variable has not been properly initialized \
                       in the constructor. In order to fix this, you need to make sure that \
                       the value is correctly configured before calling the method.";
        let terse = caveman(verbose);
        // Terse should be meaningfully shorter
        assert!(
            terse.len() < verbose.len() * 3 / 4,
            "Expected >25% reduction. Original: {} chars, caveman: {} chars",
            verbose.len(),
            terse.len()
        );
        // But must keep the technical content
        assert!(terse.contains("function"));
        assert!(terse.contains("variable"));
        assert!(terse.contains("constructor"));
    }
}