1use crate::error::{Result, SqzError};
14
/// Marker that `ToonEncoder::encode` writes at the start of every encoded
/// string and that `ToonEncoder::decode` requires before it parses anything.
const TOON_PREFIX: &str = "TOON:";
16
17pub struct ToonEncoder;
18
19impl ToonEncoder {
20 pub fn encode(&self, json: &serde_json::Value) -> Result<String> {
22 let mut buf = String::with_capacity(128);
23 buf.push_str(TOON_PREFIX);
24 encode_value(json, &mut buf);
25 Ok(buf)
26 }
27
28 pub fn decode(&self, encoded: &str) -> Result<serde_json::Value> {
30 let body = encoded
31 .strip_prefix(TOON_PREFIX)
32 .ok_or_else(|| SqzError::Other("not a TOON string: missing prefix".into()))?;
33 let mut parser = Parser::new(body);
34 let value = parser
35 .parse_value()
36 .map_err(|e| SqzError::Other(format!("TOON decode error: {e}")))?;
37 parser
38 .expect_eof()
39 .map_err(|e| SqzError::Other(format!("TOON decode error: {e}")))?;
40 Ok(value)
41 }
42
43 pub fn is_json(input: &str) -> bool {
46 let trimmed = input.trim();
47 if trimmed.is_empty() {
48 return false;
49 }
50 serde_json::from_str::<serde_json::Value>(trimmed).is_ok()
51 }
52}
53
54fn encode_value(v: &serde_json::Value, buf: &mut String) {
59 match v {
60 serde_json::Value::Null => buf.push_str("null"),
61 serde_json::Value::Bool(b) => buf.push_str(if *b { "true" } else { "false" }),
62 serde_json::Value::Number(n) => {
63 buf.push_str(&serde_json::to_string(&serde_json::Value::Number(n.clone()))
66 .unwrap_or_else(|_| n.to_string()));
67 }
68 serde_json::Value::String(s) => encode_string(s, buf),
69 serde_json::Value::Array(arr) => {
70 buf.push('[');
71 for (i, item) in arr.iter().enumerate() {
72 if i > 0 {
73 buf.push(',');
74 }
75 encode_value(item, buf);
76 }
77 buf.push(']');
78 }
79 serde_json::Value::Object(map) => {
80 buf.push('{');
81 for (i, (k, val)) in map.iter().enumerate() {
82 if i > 0 {
83 buf.push(',');
84 }
85 if is_simple_key(k) {
86 buf.push_str(k);
87 } else {
88 encode_string(k, buf);
89 }
90 buf.push(':');
91 encode_value(val, buf);
92 }
93 buf.push('}');
94 }
95 }
96}
97
/// Appends `s` to `buf` as a double-quoted string containing only printable
/// ASCII.
///
/// `"` and `\` are backslash-escaped, newline/carriage-return/tab use their
/// short escapes, and every other character outside `' '..='~'` is written as
/// lowercase `\uXXXX` UTF-16 code units (two of them, a surrogate pair, for
/// characters above U+FFFF).
fn encode_string(s: &str, buf: &mut String) {
    buf.push('"');
    for ch in s.chars() {
        match ch {
            '"' => buf.push_str("\\\""),
            '\\' => buf.push_str("\\\\"),
            '\n' => buf.push_str("\\n"),
            '\r' => buf.push_str("\\r"),
            '\t' => buf.push_str("\\t"),
            // Remaining printable ASCII passes through untouched.
            ' '..='~' => buf.push(ch),
            // Control characters, DEL and everything non-ASCII.
            _ => {
                let mut units = [0u16; 2];
                for unit in ch.encode_utf16(&mut units).iter() {
                    buf.push_str(&format!("\\u{:04x}", unit));
                }
            }
        }
    }
    buf.push('"');
}
129
/// Returns `true` when `k` can be written as a bare (unquoted) object key.
///
/// A bare key is non-empty, starts with an ASCII letter or `_`, continues
/// with ASCII letters, digits or `_`, and is not one of the literals `true`,
/// `false` or `null`.
fn is_simple_key(k: &str) -> bool {
    let mut rest = k.chars();
    let starts_ok = match rest.next() {
        Some(first) => first == '_' || first.is_ascii_alphabetic(),
        None => false,
    };
    starts_ok
        && rest.all(|c| c == '_' || c.is_ascii_alphanumeric())
        && !["true", "false", "null"].contains(&k)
}
149
150struct Parser<'a> {
155 src: &'a [u8],
156 pos: usize,
157}
158
159impl<'a> Parser<'a> {
160 fn new(s: &'a str) -> Self {
161 Self {
162 src: s.as_bytes(),
163 pos: 0,
164 }
165 }
166
167 fn peek(&self) -> Option<u8> {
168 self.src.get(self.pos).copied()
169 }
170
171 fn advance(&mut self) -> Option<u8> {
172 let b = self.src.get(self.pos).copied();
173 if b.is_some() {
174 self.pos += 1;
175 }
176 b
177 }
178
179 fn expect_byte(&mut self, expected: u8) -> std::result::Result<(), String> {
180 match self.advance() {
181 Some(b) if b == expected => Ok(()),
182 Some(b) => Err(format!(
183 "expected '{}' got '{}' at pos {}",
184 expected as char, b as char, self.pos - 1
185 )),
186 None => Err(format!("unexpected EOF, expected '{}'", expected as char)),
187 }
188 }
189
190 fn expect_eof(&self) -> std::result::Result<(), String> {
191 if self.pos == self.src.len() {
192 Ok(())
193 } else {
194 Err(format!(
195 "trailing data at pos {}: {:?}",
196 self.pos,
197 &self.src[self.pos..]
198 ))
199 }
200 }
201
202 fn parse_value(&mut self) -> std::result::Result<serde_json::Value, String> {
203 match self.peek() {
204 Some(b'{') => self.parse_object(),
205 Some(b'[') => self.parse_array(),
206 Some(b'"') => Ok(serde_json::Value::String(self.parse_string()?)),
207 Some(b't') => {
208 self.expect_literal(b"true")?;
209 Ok(serde_json::Value::Bool(true))
210 }
211 Some(b'f') => {
212 self.expect_literal(b"false")?;
213 Ok(serde_json::Value::Bool(false))
214 }
215 Some(b'n') => {
216 self.expect_literal(b"null")?;
217 Ok(serde_json::Value::Null)
218 }
219 Some(b'-') | Some(b'0'..=b'9') => self.parse_number(),
220 Some(b) => Err(format!("unexpected byte '{}' at pos {}", b as char, self.pos)),
221 None => Err("unexpected EOF".into()),
222 }
223 }
224
225 fn expect_literal(&mut self, lit: &[u8]) -> std::result::Result<(), String> {
226 for &expected in lit {
227 match self.advance() {
228 Some(b) if b == expected => {}
229 Some(b) => {
230 return Err(format!(
231 "expected '{}' got '{}' at pos {}",
232 expected as char,
233 b as char,
234 self.pos - 1
235 ))
236 }
237 None => return Err("unexpected EOF in literal".into()),
238 }
239 }
240 Ok(())
241 }
242
243 fn parse_object(&mut self) -> std::result::Result<serde_json::Value, String> {
244 self.expect_byte(b'{')?;
245 let mut map = serde_json::Map::new();
246
247 if self.peek() == Some(b'}') {
248 self.advance();
249 return Ok(serde_json::Value::Object(map));
250 }
251
252 loop {
253 let key = self.parse_key()?;
254 self.expect_byte(b':')?;
255 let val = self.parse_value()?;
256 map.insert(key, val);
257
258 match self.peek() {
259 Some(b',') => {
260 self.advance();
261 }
262 Some(b'}') => {
263 self.advance();
264 break;
265 }
266 Some(b) => {
267 return Err(format!(
268 "expected ',' or '}}' got '{}' at pos {}",
269 b as char, self.pos
270 ))
271 }
272 None => return Err("unexpected EOF in object".into()),
273 }
274 }
275 Ok(serde_json::Value::Object(map))
276 }
277
278 fn parse_key(&mut self) -> std::result::Result<String, String> {
280 match self.peek() {
281 Some(b'"') => self.parse_string(),
282 Some(b) if (b as char).is_ascii_alphabetic() || b == b'_' => {
283 self.parse_bare_key()
284 }
285 Some(b) => Err(format!(
286 "expected key at pos {}, got '{}'",
287 self.pos,
288 b as char
289 )),
290 None => Err("unexpected EOF expecting key".into()),
291 }
292 }
293
294 fn parse_bare_key(&mut self) -> std::result::Result<String, String> {
296 let start = self.pos;
297 while let Some(b) = self.peek() {
298 if (b as char).is_ascii_alphanumeric() || b == b'_' {
299 self.advance();
300 } else {
301 break;
302 }
303 }
304 let key = std::str::from_utf8(&self.src[start..self.pos])
305 .map_err(|e| e.to_string())?
306 .to_owned();
307 Ok(key)
308 }
309
310 fn parse_array(&mut self) -> std::result::Result<serde_json::Value, String> {
311 self.expect_byte(b'[')?;
312 let mut arr = Vec::new();
313
314 if self.peek() == Some(b']') {
315 self.advance();
316 return Ok(serde_json::Value::Array(arr));
317 }
318
319 loop {
320 arr.push(self.parse_value()?);
321 match self.peek() {
322 Some(b',') => {
323 self.advance();
324 }
325 Some(b']') => {
326 self.advance();
327 break;
328 }
329 Some(b) => {
330 return Err(format!(
331 "expected ',' or ']' got '{}' at pos {}",
332 b as char, self.pos
333 ))
334 }
335 None => return Err("unexpected EOF in array".into()),
336 }
337 }
338 Ok(serde_json::Value::Array(arr))
339 }
340
341 fn parse_string(&mut self) -> std::result::Result<String, String> {
345 self.expect_byte(b'"')?;
346 let mut bytes: Vec<u8> = Vec::new();
347 loop {
348 match self.advance() {
349 None => return Err("unterminated string".into()),
350 Some(b'"') => break,
351 Some(b'\\') => {
352 match self.advance() {
353 Some(b'"') => bytes.push(b'"'),
354 Some(b'\\') => bytes.push(b'\\'),
355 Some(b'/') => bytes.push(b'/'),
356 Some(b'n') => bytes.push(b'\n'),
357 Some(b'r') => bytes.push(b'\r'),
358 Some(b't') => bytes.push(b'\t'),
359 Some(b'b') => bytes.push(b'\x08'),
360 Some(b'f') => bytes.push(b'\x0C'),
361 Some(b'u') => {
362 let hex = self.take_n(4)?;
366 let code = u32::from_str_radix(&hex, 16)
367 .map_err(|e| format!("bad \\u escape: {e}"))?;
368
369 let ch = if (0xD800..=0xDBFF).contains(&code) {
370 self.expect_byte(b'\\')?;
372 self.expect_byte(b'u')?;
373 let hex2 = self.take_n(4)?;
374 let low = u32::from_str_radix(&hex2, 16)
375 .map_err(|e| format!("bad \\u escape in low surrogate: {e}"))?;
376 if !(0xDC00..=0xDFFF).contains(&low) {
377 return Err(format!("expected low surrogate, got U+{low:04X}"));
378 }
379 let scalar = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
380 char::from_u32(scalar)
381 .ok_or_else(|| format!("invalid surrogate pair scalar U+{scalar:X}"))?
382 } else {
383 char::from_u32(code)
384 .ok_or_else(|| format!("invalid unicode codepoint {code}"))?
385 };
386
387 let mut tmp = [0u8; 4];
388 let encoded = ch.encode_utf8(&mut tmp);
389 bytes.extend_from_slice(encoded.as_bytes());
390 }
391 Some(b) => {
392 return Err(format!("unknown escape \\{}", b as char))
393 }
394 None => return Err("EOF in escape".into()),
395 }
396 }
397 Some(b) => {
398 bytes.push(b);
401 }
402 }
403 }
404 String::from_utf8(bytes).map_err(|e| format!("invalid UTF-8 in string: {e}"))
405 }
406
407 fn take_n(&mut self, n: usize) -> std::result::Result<String, String> {
408 if self.pos + n > self.src.len() {
409 return Err("unexpected EOF".into());
410 }
411 let slice = &self.src[self.pos..self.pos + n];
412 self.pos += n;
413 std::str::from_utf8(slice)
414 .map(|s| s.to_owned())
415 .map_err(|e| e.to_string())
416 }
417
418 fn parse_number(&mut self) -> std::result::Result<serde_json::Value, String> {
419 let start = self.pos;
420 if self.peek() == Some(b'-') {
422 self.advance();
423 }
424 while matches!(self.peek(), Some(b'0'..=b'9')) {
426 self.advance();
427 }
428 if self.peek() == Some(b'.') {
430 self.advance();
431 while matches!(self.peek(), Some(b'0'..=b'9')) {
432 self.advance();
433 }
434 }
435 if matches!(self.peek(), Some(b'e') | Some(b'E')) {
437 self.advance();
438 if matches!(self.peek(), Some(b'+') | Some(b'-')) {
439 self.advance();
440 }
441 while matches!(self.peek(), Some(b'0'..=b'9')) {
442 self.advance();
443 }
444 }
445 let num_str = std::str::from_utf8(&self.src[start..self.pos])
446 .map_err(|e| e.to_string())?;
447 let n: serde_json::Number = num_str
448 .parse()
449 .map_err(|e| format!("bad number '{num_str}': {e}"))?;
450 Ok(serde_json::Value::Number(n))
451 }
452}
453
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;
    use serde_json::json;

    /// Strategy for arbitrary JSON values: null, booleans, `i64`, finite
    /// `f64` and arbitrary strings as leaves, wrapped in arrays and objects
    /// of up to 8 entries, with `prop_recursive` arguments `(4, 64, 8)`.
    fn arb_json_value() -> impl Strategy<Value = serde_json::Value> {
        let leaf = prop_oneof![
            Just(serde_json::Value::Null),
            any::<bool>().prop_map(serde_json::Value::Bool),
            any::<i64>().prop_map(|n| serde_json::json!(n)),
            any::<f64>()
                .prop_filter("must be finite", |f| f.is_finite())
                .prop_map(|f| serde_json::json!(f)),
            ".*".prop_map(serde_json::Value::String),
        ];

        leaf.prop_recursive(4, 64, 8, |inner| {
            prop_oneof![
                prop::collection::vec(inner.clone(), 0..8).prop_map(serde_json::Value::Array),
                prop::collection::hash_map(".*", inner, 0..8)
                    .prop_map(|m| serde_json::Value::Object(m.into_iter().collect())),
            ]
        })
    }

    proptest! {
        /// Decoding an encoded value yields the original value.
        #[test]
        fn prop_toon_round_trip(v in arb_json_value()) {
            let encoded = ToonEncoder.encode(&v).expect("encode should not fail");
            let decoded = ToonEncoder.decode(&encoded).expect("decode should not fail");
            prop_assert_eq!(decoded, v);
        }
    }

    /// Strategy for a three-level object with short lowercase keys and string
    /// leaves: 8..12 top-level entries, each an object of 5..8 entries whose
    /// values are either a leaf string or another 5..8-entry object.
    fn arb_large_json_object() -> impl Strategy<Value = serde_json::Value> {
        let arb_leaf_string = "[a-z]{4,12}".prop_map(serde_json::Value::String);

        let arb_inner = prop::collection::hash_map("[a-z]{4,8}", arb_leaf_string.clone(), 5..8usize)
            .prop_map(|m| serde_json::Value::Object(m.into_iter().collect()));

        let arb_mid = prop::collection::hash_map(
            "[a-z]{4,8}",
            prop_oneof![
                1 => arb_leaf_string.clone(),
                2 => arb_inner,
            ],
            5..8usize,
        )
        .prop_map(|m| serde_json::Value::Object(m.into_iter().collect()));

        prop::collection::hash_map("[a-z]{4,8}", arb_mid, 8..12usize)
            .prop_map(|m| serde_json::Value::Object(m.into_iter().collect()))
    }

    proptest! {
        /// The encoded body (prefix excluded) is at most 70% of the length of
        /// the pretty-printed JSON for the same value.
        #[test]
        fn prop_toon_token_reduction(v in arb_large_json_object()) {
            let pretty = serde_json::to_string_pretty(&v)
                .expect("pretty-print should not fail");

            prop_assume!(pretty.len() >= 100);

            let encoded = ToonEncoder.encode(&v).expect("encode should not fail");

            let encoded_content_len = encoded.len().saturating_sub(TOON_PREFIX.len());
            let threshold = (pretty.len() as f64 * 0.70).ceil() as usize;
            prop_assert!(
                encoded_content_len <= threshold,
                "encoded content length {} is not at most 70% of pretty length {} (threshold {})\npretty:\n{}\nencoded: {}",
                encoded_content_len,
                pretty.len(),
                threshold,
                pretty,
                encoded,
            );
        }
    }

    /// Encodes `v`, panicking on failure.
    fn enc(v: &serde_json::Value) -> String {
        ToonEncoder.encode(v).unwrap()
    }

    /// Encodes then decodes `v`, panicking on failure of either step.
    fn rt(v: serde_json::Value) -> serde_json::Value {
        let encoded = ToonEncoder.encode(&v).unwrap();
        ToonEncoder.decode(&encoded).unwrap()
    }

    #[test]
    fn roundtrip_null() {
        assert_eq!(rt(json!(null)), json!(null));
    }

    #[test]
    fn roundtrip_bool() {
        assert_eq!(rt(json!(true)), json!(true));
        assert_eq!(rt(json!(false)), json!(false));
    }

    #[test]
    fn roundtrip_number() {
        assert_eq!(rt(json!(42)), json!(42));
        assert_eq!(rt(json!(3.14)), json!(3.14));
        assert_eq!(rt(json!(-7)), json!(-7));
    }

    #[test]
    fn roundtrip_string() {
        assert_eq!(rt(json!("hello")), json!("hello"));
        assert_eq!(rt(json!("with \"quotes\"")), json!("with \"quotes\""));
        assert_eq!(rt(json!("line\nnewline")), json!("line\nnewline"));
    }

    /// Non-ASCII text, a surrogate-pair character and control characters all
    /// survive the `\uXXXX` escaping in both values and keys.
    #[test]
    fn roundtrip_unicode() {
        let v = json!({"text": "naïve ☃ 😀", "ключ": "\u{7f}\u{1}"});
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_array() {
        let v = json!([1, "two", true, null, [3, 4]]);
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_object() {
        let v = json!({"name": "Alice", "age": 30, "active": true});
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_nested() {
        let v = json!({
            "user": {"id": 1, "name": "Bob"},
            "tags": ["rust", "json"],
            "meta": null
        });
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_quoted_key() {
        let v = json!({"my-key": 1, "123start": 2});
        assert_eq!(rt(v.clone()), v);
    }

    #[test]
    fn roundtrip_empty_object() {
        assert_eq!(rt(json!({})), json!({}));
    }

    #[test]
    fn roundtrip_empty_array() {
        assert_eq!(rt(json!([])), json!([]));
    }

    #[test]
    fn roundtrip_empty_string() {
        assert_eq!(rt(json!("")), json!(""));
    }

    #[test]
    fn prefix_present() {
        let s = enc(&json!({"a": 1}));
        assert!(s.starts_with("TOON:"), "encoded: {s}");
    }

    #[test]
    fn simple_key_unquoted() {
        let s = enc(&json!({"name": "Alice"}));
        assert!(s.contains("name:"), "encoded: {s}");
        assert!(!s.contains("\"name\""), "encoded: {s}");
    }

    #[test]
    fn complex_key_quoted() {
        let s = enc(&json!({"my-key": 1}));
        assert!(s.contains("\"my-key\""), "encoded: {s}");
    }

    #[test]
    fn no_spaces_in_array() {
        let s = enc(&json!([1, 2, 3]));
        let body = s.strip_prefix("TOON:").unwrap();
        assert!(!body.contains(' '), "body: {body}");
    }

    /// The output is printable ASCII even when the input holds non-ASCII
    /// text, a surrogate-pair character, DEL and a control character.
    #[test]
    fn ascii_safe_output() {
        let v = json!({
            "key": "hello world",
            "num": 42,
            "uni": "naïve ☃ 😀\u{7f}\n"
        });
        let s = enc(&v);
        for ch in s.chars() {
            assert!(
                ch.is_ascii() && (ch as u8) >= 0x20 && ch != '\u{7f}',
                "non-ASCII or control char in output: {:?}",
                ch
            );
        }
    }

    #[test]
    fn is_json_valid() {
        assert!(ToonEncoder::is_json(r#"{"a":1}"#));
        assert!(ToonEncoder::is_json("[1,2,3]"));
        assert!(ToonEncoder::is_json("42"));
        assert!(ToonEncoder::is_json("\"hello\""));
        assert!(ToonEncoder::is_json("null"));
        assert!(ToonEncoder::is_json("true"));
    }

    #[test]
    fn is_json_invalid() {
        assert!(!ToonEncoder::is_json("not json"));
        assert!(!ToonEncoder::is_json("{bad}"));
        assert!(!ToonEncoder::is_json(""));
        assert!(!ToonEncoder::is_json("   "));
    }

    #[test]
    fn is_json_whitespace_trimmed() {
        assert!(ToonEncoder::is_json("  { \"a\": 1 }  "));
    }

    proptest! {
        /// Compares three chars-per-token estimates of the encoded length.
        ///
        /// NOTE(review): all three estimates are the same character count
        /// divided by a constant, so the ratio is always 4.0 / 3.5 (about
        /// 1.143) whatever the input; as written this property cannot fail.
        #[test]
        fn prop_cross_tokenizer_determinism(v in arb_json_value()) {
            let encoded = ToonEncoder.encode(&v).expect("encode should not fail");

            let char_count = encoded.chars().count() as f64;

            let claude_tokens = char_count / 3.5;
            let gpt_tokens = char_count / 4.0;
            let gemini_tokens = char_count / 3.8;

            let max_estimate = claude_tokens.max(gpt_tokens).max(gemini_tokens);
            let min_estimate = claude_tokens.min(gpt_tokens).min(gemini_tokens);

            if min_estimate > 0.0 {
                let ratio = max_estimate / min_estimate;
                prop_assert!(
                    ratio <= 1.15,
                    "token count estimates diverge by more than 15%: \
                     claude={:.2}, gpt={:.2}, gemini={:.2}, ratio={:.4}\nencoded: {:?}",
                    claude_tokens, gpt_tokens, gemini_tokens, ratio, encoded
                );
            }
        }
    }

    #[test]
    fn decode_rejects_non_toon() {
        assert!(ToonEncoder.decode("not a toon string").is_err());
    }

    #[test]
    fn decode_rejects_trailing_data() {
        assert!(ToonEncoder.decode("TOON:42garbage").is_err());
    }
}