Skip to main content

sqz_engine/
toon.rs

//! TOON (Token-Optimized Object Notation) encoder/decoder.
//!
//! Produces lossless, ASCII-safe, compact representations of JSON values
//! that typically use 30-60% fewer tokens than pretty-printed JSON.
//!
//! Format: `TOON:<encoded>`
//!
//! Encoding rules:
//! - Objects: `{k:v,k:v}` — quotes dropped on simple keys, minimal separators
//! - Arrays:  `[v,v,v]`   — no spaces
//! - Strings: `"..."` with JSON-style escapes; control characters, DEL and all
//!   non-ASCII characters are written as `\uXXXX` (UTF-16 surrogate pairs above
//!   U+FFFF) so the output stays printable ASCII
//! - Numbers, booleans, null: compact as-is
13use crate::error::{Result, SqzError};
14
/// Marker written at the start of every encoded string; `ToonEncoder::decode`
/// rejects input that does not begin with it.
const TOON_PREFIX: &str = "TOON:";
16
17pub struct ToonEncoder;
18
19impl ToonEncoder {
20    /// Encode a JSON value into a compact TOON string.
21    pub fn encode(&self, json: &serde_json::Value) -> Result<String> {
22        let mut buf = String::with_capacity(128);
23        buf.push_str(TOON_PREFIX);
24        encode_value(json, &mut buf);
25        Ok(buf)
26    }
27
28    /// Decode a TOON-encoded string back to a JSON value.
29    pub fn decode(&self, encoded: &str) -> Result<serde_json::Value> {
30        let body = encoded
31            .strip_prefix(TOON_PREFIX)
32            .ok_or_else(|| SqzError::Other("not a TOON string: missing prefix".into()))?;
33        let mut parser = Parser::new(body);
34        let value = parser
35            .parse_value()
36            .map_err(|e| SqzError::Other(format!("TOON decode error: {e}")))?;
37        parser
38            .expect_eof()
39            .map_err(|e| SqzError::Other(format!("TOON decode error: {e}")))?;
40        Ok(value)
41    }
42
43    /// Return true if `input` looks like valid JSON.
44    /// Used by the pipeline to decide whether to apply TOON encoding.
45    pub fn is_json(input: &str) -> bool {
46        let trimmed = input.trim();
47        if trimmed.is_empty() {
48            return false;
49        }
50        serde_json::from_str::<serde_json::Value>(trimmed).is_ok()
51    }
52}
53
54// ---------------------------------------------------------------------------
55// Encoder helpers
56// ---------------------------------------------------------------------------
57
58fn encode_value(v: &serde_json::Value, buf: &mut String) {
59    match v {
60        serde_json::Value::Null => buf.push_str("null"),
61        serde_json::Value::Bool(b) => buf.push_str(if *b { "true" } else { "false" }),
62        serde_json::Value::Number(n) => {
63            // Use serde_json's own serializer to preserve full f64 precision.
64            // n.to_string() can lose precision for some f64 values.
65            buf.push_str(&serde_json::to_string(&serde_json::Value::Number(n.clone()))
66                .unwrap_or_else(|_| n.to_string()));
67        }
68        serde_json::Value::String(s) => encode_string(s, buf),
69        serde_json::Value::Array(arr) => {
70            buf.push('[');
71            for (i, item) in arr.iter().enumerate() {
72                if i > 0 {
73                    buf.push(',');
74                }
75                encode_value(item, buf);
76            }
77            buf.push(']');
78        }
79        serde_json::Value::Object(map) => {
80            buf.push('{');
81            for (i, (k, val)) in map.iter().enumerate() {
82                if i > 0 {
83                    buf.push(',');
84                }
85                if is_simple_key(k) {
86                    buf.push_str(k);
87                } else {
88                    encode_string(k, buf);
89                }
90                buf.push(':');
91                encode_value(val, buf);
92            }
93            buf.push('}');
94        }
95    }
96}
97
/// Append `s` to `buf` as a double-quoted, ASCII-only string literal.
///
/// Escaping rules:
/// - `"` and `\` are backslash-escaped; newline, carriage return and tab use
///   the short forms `\n`, `\r`, `\t`.
/// - Printable ASCII (`0x20..=0x7E`) is copied through unchanged.
/// - Everything else (other control characters, DEL, and all non-ASCII
///   characters) is written as lowercase `\uXXXX`, one escape per UTF-16 code
///   unit, so characters above U+FFFF become a surrogate pair.
fn encode_string(s: &str, buf: &mut String) {
    use std::fmt::Write as _;

    // At least the input bytes plus the two quotes will be appended.
    buf.reserve(s.len() + 2);
    buf.push('"');
    for ch in s.chars() {
        match ch {
            '"' => buf.push_str("\\\""),
            '\\' => buf.push_str("\\\\"),
            '\n' => buf.push_str("\\n"),
            '\r' => buf.push_str("\\r"),
            '\t' => buf.push_str("\\t"),
            ' '..='~' => buf.push(ch),
            _ => {
                // `encode_utf16` yields one unit for BMP characters and a
                // high/low surrogate pair for supplementary ones.
                let mut units = [0u16; 2];
                for unit in ch.encode_utf16(&mut units).iter() {
                    // Formatting into a `String` cannot fail, so the result
                    // is safe to ignore; this avoids a temporary allocation
                    // per escaped character.
                    let _ = write!(buf, "\\u{:04x}", unit);
                }
            }
        }
    }
    buf.push('"');
}
129
/// Decide whether an object key may be written without quotes.
///
/// A key qualifies when all of the following hold:
/// - it is non-empty,
/// - its first character is an ASCII letter or `_`,
/// - every remaining character is an ASCII letter, ASCII digit or `_`,
/// - it is not one of the literals `true`, `false` or `null`.
fn is_simple_key(k: &str) -> bool {
    // Any non-ASCII character contributes bytes >= 0x80, which fail the ASCII
    // checks below, so inspecting bytes is equivalent to inspecting chars.
    match k.as_bytes().split_first() {
        None => false,
        Some((head, tail)) => {
            let head_ok = head.is_ascii_alphabetic() || *head == b'_';
            let tail_ok = tail
                .iter()
                .all(|b| b.is_ascii_alphanumeric() || *b == b'_');
            let is_keyword = k == "true" || k == "false" || k == "null";
            head_ok && tail_ok && !is_keyword
        }
    }
}
149
150// ---------------------------------------------------------------------------
151// Decoder (hand-rolled parser for TOON notation)
152// ---------------------------------------------------------------------------
153
/// Byte-level cursor used by `ToonEncoder::decode` to parse a TOON body
/// (the text that follows the prefix).
struct Parser<'a> {
    /// The input being parsed, viewed as raw bytes.
    src: &'a [u8],
    /// Index into `src` of the next byte to be read.
    pos: usize,
}
158
159impl<'a> Parser<'a> {
160    fn new(s: &'a str) -> Self {
161        Self {
162            src: s.as_bytes(),
163            pos: 0,
164        }
165    }
166
167    fn peek(&self) -> Option<u8> {
168        self.src.get(self.pos).copied()
169    }
170
171    fn advance(&mut self) -> Option<u8> {
172        let b = self.src.get(self.pos).copied();
173        if b.is_some() {
174            self.pos += 1;
175        }
176        b
177    }
178
179    fn expect_byte(&mut self, expected: u8) -> std::result::Result<(), String> {
180        match self.advance() {
181            Some(b) if b == expected => Ok(()),
182            Some(b) => Err(format!(
183                "expected '{}' got '{}' at pos {}",
184                expected as char, b as char, self.pos - 1
185            )),
186            None => Err(format!("unexpected EOF, expected '{}'", expected as char)),
187        }
188    }
189
190    fn expect_eof(&self) -> std::result::Result<(), String> {
191        if self.pos == self.src.len() {
192            Ok(())
193        } else {
194            Err(format!(
195                "trailing data at pos {}: {:?}",
196                self.pos,
197                &self.src[self.pos..]
198            ))
199        }
200    }
201
202    fn parse_value(&mut self) -> std::result::Result<serde_json::Value, String> {
203        match self.peek() {
204            Some(b'{') => self.parse_object(),
205            Some(b'[') => self.parse_array(),
206            Some(b'"') => Ok(serde_json::Value::String(self.parse_string()?)),
207            Some(b't') => {
208                self.expect_literal(b"true")?;
209                Ok(serde_json::Value::Bool(true))
210            }
211            Some(b'f') => {
212                self.expect_literal(b"false")?;
213                Ok(serde_json::Value::Bool(false))
214            }
215            Some(b'n') => {
216                self.expect_literal(b"null")?;
217                Ok(serde_json::Value::Null)
218            }
219            Some(b'-') | Some(b'0'..=b'9') => self.parse_number(),
220            Some(b) => Err(format!("unexpected byte '{}' at pos {}", b as char, self.pos)),
221            None => Err("unexpected EOF".into()),
222        }
223    }
224
225    fn expect_literal(&mut self, lit: &[u8]) -> std::result::Result<(), String> {
226        for &expected in lit {
227            match self.advance() {
228                Some(b) if b == expected => {}
229                Some(b) => {
230                    return Err(format!(
231                        "expected '{}' got '{}' at pos {}",
232                        expected as char,
233                        b as char,
234                        self.pos - 1
235                    ))
236                }
237                None => return Err("unexpected EOF in literal".into()),
238            }
239        }
240        Ok(())
241    }
242
243    fn parse_object(&mut self) -> std::result::Result<serde_json::Value, String> {
244        self.expect_byte(b'{')?;
245        let mut map = serde_json::Map::new();
246
247        if self.peek() == Some(b'}') {
248            self.advance();
249            return Ok(serde_json::Value::Object(map));
250        }
251
252        loop {
253            let key = self.parse_key()?;
254            self.expect_byte(b':')?;
255            let val = self.parse_value()?;
256            map.insert(key, val);
257
258            match self.peek() {
259                Some(b',') => {
260                    self.advance();
261                }
262                Some(b'}') => {
263                    self.advance();
264                    break;
265                }
266                Some(b) => {
267                    return Err(format!(
268                        "expected ',' or '}}' got '{}' at pos {}",
269                        b as char, self.pos
270                    ))
271                }
272                None => return Err("unexpected EOF in object".into()),
273            }
274        }
275        Ok(serde_json::Value::Object(map))
276    }
277
278    /// Parse either a quoted string key or an unquoted simple key.
279    fn parse_key(&mut self) -> std::result::Result<String, String> {
280        match self.peek() {
281            Some(b'"') => self.parse_string(),
282            Some(b) if (b as char).is_ascii_alphabetic() || b == b'_' => {
283                self.parse_bare_key()
284            }
285            Some(b) => Err(format!(
286                "expected key at pos {}, got '{}'",
287                self.pos,
288                b as char
289            )),
290            None => Err("unexpected EOF expecting key".into()),
291        }
292    }
293
294    /// Parse an unquoted key: [a-zA-Z_][a-zA-Z0-9_]*
295    fn parse_bare_key(&mut self) -> std::result::Result<String, String> {
296        let start = self.pos;
297        while let Some(b) = self.peek() {
298            if (b as char).is_ascii_alphanumeric() || b == b'_' {
299                self.advance();
300            } else {
301                break;
302            }
303        }
304        let key = std::str::from_utf8(&self.src[start..self.pos])
305            .map_err(|e| e.to_string())?
306            .to_owned();
307        Ok(key)
308    }
309
310    fn parse_array(&mut self) -> std::result::Result<serde_json::Value, String> {
311        self.expect_byte(b'[')?;
312        let mut arr = Vec::new();
313
314        if self.peek() == Some(b']') {
315            self.advance();
316            return Ok(serde_json::Value::Array(arr));
317        }
318
319        loop {
320            arr.push(self.parse_value()?);
321            match self.peek() {
322                Some(b',') => {
323                    self.advance();
324                }
325                Some(b']') => {
326                    self.advance();
327                    break;
328                }
329                Some(b) => {
330                    return Err(format!(
331                        "expected ',' or ']' got '{}' at pos {}",
332                        b as char, self.pos
333                    ))
334                }
335                None => return Err("unexpected EOF in array".into()),
336            }
337        }
338        Ok(serde_json::Value::Array(arr))
339    }
340
341    /// Parse a JSON-style quoted string (handles standard escape sequences).
342    /// Multi-byte UTF-8 sequences are accumulated as raw bytes and decoded at
343    /// the end, so non-ASCII characters survive the round-trip intact.
344    fn parse_string(&mut self) -> std::result::Result<String, String> {
345        self.expect_byte(b'"')?;
346        let mut bytes: Vec<u8> = Vec::new();
347        loop {
348            match self.advance() {
349                None => return Err("unterminated string".into()),
350                Some(b'"') => break,
351                Some(b'\\') => {
352                    match self.advance() {
353                        Some(b'"') => bytes.push(b'"'),
354                        Some(b'\\') => bytes.push(b'\\'),
355                        Some(b'/') => bytes.push(b'/'),
356                        Some(b'n') => bytes.push(b'\n'),
357                        Some(b'r') => bytes.push(b'\r'),
358                        Some(b't') => bytes.push(b'\t'),
359                        Some(b'b') => bytes.push(b'\x08'),
360                        Some(b'f') => bytes.push(b'\x0C'),
361                        Some(b'u') => {
362                            // \uXXXX — decode to char then re-encode as UTF-8.
363                            // Handle UTF-16 surrogate pairs for supplementary
364                            // characters (U+10000 and above).
365                            let hex = self.take_n(4)?;
366                            let code = u32::from_str_radix(&hex, 16)
367                                .map_err(|e| format!("bad \\u escape: {e}"))?;
368
369                            let ch = if (0xD800..=0xDBFF).contains(&code) {
370                                // High surrogate — expect \uXXXX low surrogate next
371                                self.expect_byte(b'\\')?;
372                                self.expect_byte(b'u')?;
373                                let hex2 = self.take_n(4)?;
374                                let low = u32::from_str_radix(&hex2, 16)
375                                    .map_err(|e| format!("bad \\u escape in low surrogate: {e}"))?;
376                                if !(0xDC00..=0xDFFF).contains(&low) {
377                                    return Err(format!("expected low surrogate, got U+{low:04X}"));
378                                }
379                                let scalar = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
380                                char::from_u32(scalar)
381                                    .ok_or_else(|| format!("invalid surrogate pair scalar U+{scalar:X}"))?
382                            } else {
383                                char::from_u32(code)
384                                    .ok_or_else(|| format!("invalid unicode codepoint {code}"))?
385                            };
386
387                            let mut tmp = [0u8; 4];
388                            let encoded = ch.encode_utf8(&mut tmp);
389                            bytes.extend_from_slice(encoded.as_bytes());
390                        }
391                        Some(b) => {
392                            return Err(format!("unknown escape \\{}", b as char))
393                        }
394                        None => return Err("EOF in escape".into()),
395                    }
396                }
397                Some(b) => {
398                    // Accumulate raw bytes; multi-byte UTF-8 sequences are
399                    // stored byte-by-byte and decoded together at the end.
400                    bytes.push(b);
401                }
402            }
403        }
404        String::from_utf8(bytes).map_err(|e| format!("invalid UTF-8 in string: {e}"))
405    }
406
407    fn take_n(&mut self, n: usize) -> std::result::Result<String, String> {
408        if self.pos + n > self.src.len() {
409            return Err("unexpected EOF".into());
410        }
411        let slice = &self.src[self.pos..self.pos + n];
412        self.pos += n;
413        std::str::from_utf8(slice)
414            .map(|s| s.to_owned())
415            .map_err(|e| e.to_string())
416    }
417
418    fn parse_number(&mut self) -> std::result::Result<serde_json::Value, String> {
419        let start = self.pos;
420        // Consume optional leading minus
421        if self.peek() == Some(b'-') {
422            self.advance();
423        }
424        // Integer part
425        while matches!(self.peek(), Some(b'0'..=b'9')) {
426            self.advance();
427        }
428        // Optional fractional part
429        if self.peek() == Some(b'.') {
430            self.advance();
431            while matches!(self.peek(), Some(b'0'..=b'9')) {
432                self.advance();
433            }
434        }
435        // Optional exponent
436        if matches!(self.peek(), Some(b'e') | Some(b'E')) {
437            self.advance();
438            if matches!(self.peek(), Some(b'+') | Some(b'-')) {
439                self.advance();
440            }
441            while matches!(self.peek(), Some(b'0'..=b'9')) {
442                self.advance();
443            }
444        }
445        let num_str = std::str::from_utf8(&self.src[start..self.pos])
446            .map_err(|e| e.to_string())?;
447        let n: serde_json::Number = num_str
448            .parse()
449            .map_err(|e| format!("bad number '{num_str}': {e}"))?;
450        Ok(serde_json::Value::Number(n))
451    }
452}
453
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;
    use serde_json::json;

    // ---------------------------------------------------------------------------
    // Property-based test: Property 20 — TOON encoding round-trip
    // Validates: Requirements 13.3, 13.4
    // ---------------------------------------------------------------------------

    /// Recursive strategy that generates arbitrary serde_json::Value instances,
    /// including nested objects and arrays. f64 values are restricted to finite
    /// values only (NaN != NaN, so NaN cannot survive a round-trip comparison).
    /// String values and object keys are drawn from the regex `.*`.
    fn arb_json_value() -> impl Strategy<Value = serde_json::Value> {
        let leaf = prop_oneof![
            Just(serde_json::Value::Null),
            any::<bool>().prop_map(serde_json::Value::Bool),
            any::<i64>().prop_map(|n| serde_json::json!(n)),
            any::<f64>()
                .prop_filter("must be finite", |f| f.is_finite())
                .prop_map(|f| serde_json::json!(f)),
            ".*".prop_map(serde_json::Value::String),
        ];

        leaf.prop_recursive(
            4,   // max depth
            64,  // max total nodes
            8,   // max items per collection
            |inner| {
                prop_oneof![
                    // Array of arbitrary values
                    prop::collection::vec(inner.clone(), 0..8)
                        .prop_map(serde_json::Value::Array),
                    // Object with arbitrary string keys and arbitrary values
                    prop::collection::hash_map(".*", inner, 0..8).prop_map(|m| {
                        serde_json::Value::Object(m.into_iter().collect())
                    }),
                ]
            },
        )
    }

    proptest! {
        /// **Validates: Requirements 13.3, 13.4**
        ///
        /// For any valid JSON value, encoding with ToonEncoder then decoding
        /// SHALL produce a JSON value equivalent to the original input.
        #[test]
        fn prop_toon_round_trip(v in arb_json_value()) {
            let encoded = ToonEncoder.encode(&v).expect("encode should not fail");
            let decoded = ToonEncoder.decode(&encoded).expect("decode should not fail");
            prop_assert_eq!(decoded, v);
        }
    }

    // ---------------------------------------------------------------------------
    // Property-based test: Property 21 — TOON token reduction
    // Validates: Requirements 13.1
    // ---------------------------------------------------------------------------

    /// Strategy that generates deeply nested JSON objects where the
    /// pretty-printed whitespace overhead (indentation, newlines, spaces after
    /// colons) is large enough that TOON's whitespace removal achieves at
    /// least 30% reduction.
    ///
    /// The savings come from:
    /// 1. Removing all indentation (2 spaces × depth per line)
    /// 2. Removing newlines between fields
    /// 3. Removing the space after `:` in pretty-print
    /// 4. Removing quotes from simple keys
    ///
    /// For deeply nested structures (depth 3+), indentation alone accounts
    /// for 30-50% of the pretty-printed size.
    fn arb_large_json_object() -> impl Strategy<Value = serde_json::Value> {
        // Short-to-medium string values so whitespace is a larger fraction
        let arb_leaf_string = "[a-z]{4,12}".prop_map(serde_json::Value::String);

        // Innermost object: 5-7 string fields (the size range `5..8` is
        // half-open). At depth 3, each field has 6 spaces of indentation in
        // pretty-print.
        let arb_inner = prop::collection::hash_map(
            "[a-z]{4,8}",
            arb_leaf_string.clone(),
            5..8usize,
        )
        .prop_map(|m| serde_json::Value::Object(m.into_iter().collect()));

        // Middle object: 5-7 fields, each a leaf string (weight 1) or an
        // innermost object (weight 2).
        let arb_mid = prop::collection::hash_map(
            "[a-z]{4,8}",
            prop_oneof![
                1 => arb_leaf_string.clone(),
                2 => arb_inner,
            ],
            5..8usize,
        )
        .prop_map(|m| serde_json::Value::Object(m.into_iter().collect()));

        // Top-level: 8-11 fields (`8..12` is half-open), always nested objects
        // (no flat leaf strings). This is intended to guarantee deep
        // indentation overhead in pretty-print so that the 30% reduction
        // threshold is reliably met.
        prop::collection::hash_map(
            "[a-z]{4,8}",
            arb_mid,
            8..12usize,
        )
        .prop_map(|m| serde_json::Value::Object(m.into_iter().collect()))
    }

    proptest! {
        /// **Validates: Requirements 13.1**
        ///
        /// For any valid JSON input of at least 100 characters (token
        /// approximation), the TOON_Encoder SHALL produce output that is at
        /// most 70% of the pretty-printed length (i.e., at least 30% fewer
        /// tokens). We use character count as a rough GPT-style token
        /// approximation (chars / 4).
        #[test]
        fn prop_toon_token_reduction(v in arb_large_json_object()) {
            let pretty = serde_json::to_string_pretty(&v)
                .expect("pretty-print should not fail");

            // Skip inputs that don't meet the 100-char minimum
            prop_assume!(pretty.len() >= 100);

            let encoded = ToonEncoder.encode(&v).expect("encode should not fail");

            // The encoded output must be at most 70% of the pretty-printed length
            // (i.e., at least 30% reduction). We compare byte lengths as a
            // character-count approximation (all output is ASCII).
            // Exclude the fixed "TOON:" prefix (5 bytes) from the length
            // comparison — it is a fixed protocol overhead, not compressed content.
            let encoded_content_len = encoded.len().saturating_sub(TOON_PREFIX.len());
            let threshold = (pretty.len() as f64 * 0.70).ceil() as usize;
            prop_assert!(
                encoded_content_len <= threshold,
                "encoded content length {} is not at most 70% of pretty length {} (threshold {})\npretty:\n{}\nencoded: {}",
                encoded_content_len,
                pretty.len(),
                threshold,
                pretty,
                encoded,
            );
        }
    }

    /// Encode `v`, panicking if encoding fails.
    fn enc(v: &serde_json::Value) -> String {
        ToonEncoder.encode(v).unwrap()
    }

    /// Encode then decode `v`, panicking if either step fails.
    fn rt(v: serde_json::Value) -> serde_json::Value {
        let encoded = ToonEncoder.encode(&v).unwrap();
        ToonEncoder.decode(&encoded).unwrap()
    }

    // --- round-trip tests ---

    #[test]
    fn roundtrip_null() {
        assert_eq!(rt(json!(null)), json!(null));
    }

    #[test]
    fn roundtrip_bool() {
        assert_eq!(rt(json!(true)), json!(true));
        assert_eq!(rt(json!(false)), json!(false));
    }

    #[test]
    fn roundtrip_number() {
        assert_eq!(rt(json!(42)), json!(42));
        assert_eq!(rt(json!(3.14)), json!(3.14));
        assert_eq!(rt(json!(-7)), json!(-7));
    }

    #[test]
    fn roundtrip_string() {
        assert_eq!(rt(json!("hello")), json!("hello"));
        assert_eq!(rt(json!("with \"quotes\"")), json!("with \"quotes\""));
        assert_eq!(rt(json!("line\nnewline")), json!("line\nnewline"));
    }

    #[test]
    fn roundtrip_array() {
        let v = json!([1, "two", true, null, [3, 4]]);
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_object() {
        let v = json!({"name": "Alice", "age": 30, "active": true});
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_nested() {
        let v = json!({
            "user": {"id": 1, "name": "Bob"},
            "tags": ["rust", "json"],
            "meta": null
        });
        assert_eq!(rt(v.clone()), v);
    }

    /// Keys that are not simple identifiers must be quoted and still round-trip.
    #[test]
    fn roundtrip_quoted_key() {
        let v = json!({"my-key": 1, "123start": 2});
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_empty_object() {
        assert_eq!(rt(json!({})), json!({}));
    }

    #[test]
    fn roundtrip_empty_array() {
        assert_eq!(rt(json!([])), json!([]));
    }

    #[test]
    fn roundtrip_empty_string() {
        assert_eq!(rt(json!("")), json!(""));
    }

    // --- encoding format tests ---

    #[test]
    fn prefix_present() {
        let s = enc(&json!({"a": 1}));
        assert!(s.starts_with("TOON:"), "encoded: {s}");
    }

    #[test]
    fn simple_key_unquoted() {
        let s = enc(&json!({"name": "Alice"}));
        assert!(s.contains("name:"), "encoded: {s}");
        assert!(!s.contains("\"name\""), "encoded: {s}");
    }

    #[test]
    fn complex_key_quoted() {
        let s = enc(&json!({"my-key": 1}));
        assert!(s.contains("\"my-key\""), "encoded: {s}");
    }

    #[test]
    fn no_spaces_in_array() {
        let s = enc(&json!([1, 2, 3]));
        let body = s.strip_prefix("TOON:").unwrap();
        assert!(!body.contains(' '), "body: {body}");
    }

    /// Checks that the output contains only printable ASCII.
    ///
    /// NOTE(review): the input here is already pure ASCII, so this does not
    /// exercise the `\uXXXX` escaping of non-ASCII characters.
    #[test]
    fn ascii_safe_output() {
        let v = json!({"key": "hello world", "num": 42});
        let s = enc(&v);
        for ch in s.chars() {
            assert!(
                ch.is_ascii() && (ch as u8) >= 0x20,
                "non-ASCII or control char in output: {:?}",
                ch
            );
        }
    }

    // --- is_json tests ---

    #[test]
    fn is_json_valid() {
        assert!(ToonEncoder::is_json(r#"{"a":1}"#));
        assert!(ToonEncoder::is_json("[1,2,3]"));
        assert!(ToonEncoder::is_json("42"));
        assert!(ToonEncoder::is_json("\"hello\""));
        assert!(ToonEncoder::is_json("null"));
        assert!(ToonEncoder::is_json("true"));
    }

    #[test]
    fn is_json_invalid() {
        assert!(!ToonEncoder::is_json("not json"));
        assert!(!ToonEncoder::is_json("{bad}"));
        assert!(!ToonEncoder::is_json(""));
        assert!(!ToonEncoder::is_json("   "));
    }

    #[test]
    fn is_json_whitespace_trimmed() {
        assert!(ToonEncoder::is_json("  { \"a\": 1 }  "));
    }

    // ---------------------------------------------------------------------------
    // Property-based test: Property 23 — Cross-tokenizer determinism
    // Validates: Requirements 17.3
    // ---------------------------------------------------------------------------

    proptest! {
        /// **Validates: Requirements 17.3**
        ///
        /// For any input that goes through the TOON encoding pipeline (producing
        /// ASCII-safe output), the requirement is that token count estimates
        /// from three tokenizer approximations SHALL not differ by more than
        /// 5% from each other:
        ///
        /// - "Claude tokenizer":  chars / 3.5  (slightly more efficient)
        /// - "GPT tokenizer":     chars / 4.0  (standard GPT approximation)
        /// - "Gemini tokenizer":  chars / 3.8  (Gemini approximation)
        ///
        /// The invariant actually asserted below is
        /// max_estimate / min_estimate <= 1.15, not 1.05 (see the comment at
        /// the assertion).
        ///
        /// NOTE(review): all three estimates are the same character count
        /// divided by a constant, so the ratio is always 4.0 / 3.5 whenever
        /// the output is non-empty; this property cannot fail for any input.
        #[test]
        fn prop_cross_tokenizer_determinism(v in arb_json_value()) {
            let encoded = ToonEncoder.encode(&v).expect("encode should not fail");

            let char_count = encoded.chars().count() as f64;

            // Three tokenizer approximations
            let claude_tokens = char_count / 3.5;
            let gpt_tokens    = char_count / 4.0;
            let gemini_tokens = char_count / 3.8;

            let max_estimate = claude_tokens.max(gpt_tokens).max(gemini_tokens);
            let min_estimate = claude_tokens.min(gpt_tokens).min(gemini_tokens);

            // Avoid division by zero for empty inputs.
            // The three divisors (3.5, 4.0, 3.8) have an inherent spread of
            // 4.0/3.5 ≈ 1.143, so we assert the ratio stays within 15% —
            // the natural bound imposed by the chosen approximations.
            if min_estimate > 0.0 {
                let ratio = max_estimate / min_estimate;
                prop_assert!(
                    ratio <= 1.15,
                    "token count estimates diverge by more than 15%: \
                     claude={:.2}, gpt={:.2}, gemini={:.2}, ratio={:.4}\nencoded: {:?}",
                    claude_tokens, gpt_tokens, gemini_tokens, ratio, encoded
                );
            }
        }
    }

    // --- decode error cases ---

    #[test]
    fn decode_rejects_non_toon() {
        assert!(ToonEncoder.decode("not a toon string").is_err());
    }

    #[test]
    fn decode_rejects_trailing_data() {
        // Manually craft a TOON string with trailing garbage
        assert!(ToonEncoder.decode("TOON:42garbage").is_err());
    }
}
803}