Skip to main content

rain_tokens/
rain_tokens.rs

//! Does MechGen's "digital rain" (dense-UTF-8, Matrix-style) actually compress
//! **LLM token streams**? Measured with the real BPE tokenizers (cl100k + o200k)
//! via `--features real-tokens`. Without that feature the numbers are heuristic
//! and the CJK token cost is *under*-counted — so for the real answer run:
//!
//!   cargo run -p agentic-eval --example rain_tokens --features real-tokens
//!
//! Hypothesis under test: pack each MechGen token into one dense glyph to shrink
//! the token stream. Spoiler (and the whole-project lesson): an LLM emits
//! *tokens*, not glyphs — BPE splits rare multi-byte CJK/katakana into multiple
//! tokens, so the dense stream costs MORE tokens even as it costs fewer chars.

use std::collections::HashMap;

use agentic_eval::tokens::Model;

/// Builds the glyph alphabet, identical in spirit to MechGen's `rain::encode`.
///
/// The alphabet is the concatenation, in this order, of three inclusive
/// code-point ranges: half-width katakana (U+FF66..=U+FF9D), full katakana
/// (U+30A1..=U+30FA) and CJK ideographs (U+4E00..=U+9FA5) — 21,048 glyphs in
/// total. The order matters: callers index into the returned vector.
fn glyphs() -> Vec<char> {
    const RANGES: [(u32, u32); 3] = [
        (0xFF66, 0xFF9D), // half-width katakana
        (0x30A1, 0x30FA), // full katakana
        (0x4E00, 0x9FA5), // CJK
    ];
    RANGES
        .iter()
        .flat_map(|&(lo, hi)| lo..=hi)
        // `char::from_u32` rejects invalid scalar values; none occur in these
        // ranges, so nothing is actually dropped.
        .filter_map(char::from_u32)
        .collect()
}

/// Lightweight tokenizer used to decide what gets one rain glyph.
///
/// A maximal run of alphanumeric characters and `_` forms one token. Note that
/// "alphanumeric" here is `char::is_alphanumeric`, i.e. Unicode-aware: accented
/// letters, kana and CJK ideographs extend a run just like `[A-Za-z0-9]` do.
/// Every other non-whitespace character is emitted as its own single-character
/// token, and whitespace only separates tokens (it is never emitted).
///
/// Returns the tokens in source order; an empty or all-whitespace input yields
/// an empty vector.
fn tokenize(src: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut word = String::new();
    for ch in src.chars() {
        if ch.is_alphanumeric() || ch == '_' {
            word.push(ch);
            continue;
        }
        // Any non-word character terminates the run collected so far.
        if !word.is_empty() {
            tokens.push(std::mem::take(&mut word));
        }
        if !ch.is_whitespace() {
            tokens.push(ch.to_string());
        }
    }
    // Flush a run that extends to the end of the input.
    if !word.is_empty() {
        tokens.push(word);
    }
    tokens
}

47/// Rain-encode: one glyph per token. Returns (stream, legend_text).
48fn rain(src: &str) -> (String, String) {
49    let alpha = glyphs();
50    let mut map: std::collections::HashMap<String, char> = std::collections::HashMap::new();
51    let mut legend: Vec<(char, String)> = Vec::new();
52    let mut stream = String::new();
53    for t in tokenize(src) {
54        let g = *map.entry(t.clone()).or_insert_with(|| {
55            let g = alpha[legend.len() % alpha.len()];
56            legend.push((g, t.clone()));
57            g
58        });
59        stream.push(g);
60    }
61    let legend_text = legend.iter().map(|(g, t)| format!("{g}\t{t}")).collect::<Vec<_>>().join("\n");
62    (stream, legend_text)
63}
64
/// Minimal standard base64 encoder (for the bytes-as-text comparison).
///
/// Uses the `A-Za-z0-9+/` alphabet with `=` padding, so the output length is
/// always a multiple of four. Empty input yields an empty string.
fn b64(bytes: &[u8]) -> String {
    const T: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    // Every (possibly partial) 3-byte group becomes exactly 4 output characters.
    let mut out = String::with_capacity((bytes.len() + 2) / 3 * 4);
    for chunk in bytes.chunks(3) {
        // Missing trailing bytes of the final group are treated as zero.
        let b0 = u32::from(chunk[0]);
        let b1 = chunk.get(1).copied().map_or(0, u32::from);
        let b2 = chunk.get(2).copied().map_or(0, u32::from);
        let n = (b0 << 16) | (b1 << 8) | b2;
        for k in 0..4 {
            // A group of `len` input bytes carries `len + 1` significant
            // sextets (indices 0..=len); the remaining positions are padding.
            if k <= chunk.len() {
                let idx = ((n >> (18 - 6 * k)) & 0x3f) as usize;
                out.push(char::from(T[idx]));
            } else {
                out.push('=');
            }
        }
    }
    out
}

/// Counts the Unicode scalar values (`char`s) in `s` — not its UTF-8 bytes.
fn chars(s: &str) -> usize {
    s.chars().count()
}

87fn main() {
88    let exact = Model::OpenAiGpt4.is_exact();
89    println!("=== MechGen 'digital rain' vs token streams (cl100k + o200k BPE) ===");
90    println!(
91        "tokenizer: {}\n",
92        if exact { "REAL tiktoken (exact)" } else { "HEURISTIC (CJK undercounted — rerun with --features real-tokens)" }
93    );
94
95    let samples: &[(&str, &str)] = &[
96        ("net", "net MLP { layer fc1: Linear(8, 16); layer act: ReLU; layer fc2: Linear(16, 4); forward { fc1 } }"),
97        ("fn", "fn factorial(n: u64) -> u64 { if n <= 1 { return 1; } n * factorial(n - 1) }"),
98        ("kb", "kb Family { fact parent(alice, bob); fact parent(bob, carol); rule gp(x: i32, z: i32) where parent(x, y), parent(y, z) { x } }"),
99    ];
100
101    println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "kind", "form", "cl100k tok", "o200k tok", "chars");
102    let cl = Model::OpenAiGpt4;
103    let o2 = Model::OpenAiGpt4o;
104    let (mut s_cl, mut s_o, mut r_cl, mut r_o, mut rl_cl, mut rl_o) = (0, 0, 0, 0, 0, 0);
105    for (name, src) in samples {
106        let (stream, legend) = rain(src);
107        let rl = format!("{legend}\n{stream}"); // rain + legend (one-off, reversible)
108        let bytes = src.as_bytes();
109        let base = b64(bytes);
110        let row = |label: &str, s: &str| {
111            println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "", label, cl.count(s), o2.count(s), chars(s));
112        };
113        println!("[{name}]");
114        row("source", src);
115        row("rain(stream)", &stream);
116        row("rain+legend", &rl);
117        row("base64(bytes)", &base);
118        s_cl += cl.count(src); s_o += o2.count(src);
119        r_cl += cl.count(&stream); r_o += o2.count(&stream);
120        rl_cl += cl.count(&rl); rl_o += o2.count(&rl);
121    }
122
123    println!("\nTOTALS (3 samples) — token ratio vs source (>1.0 = rain is WORSE):");
124    println!("  source            cl100k {s_cl:>4}   o200k {s_o:>4}");
125    println!("  rain(stream)      cl100k {r_cl:>4} ({:.2}x)   o200k {r_o:>4} ({:.2}x)", r_cl as f64 / s_cl as f64, r_o as f64 / s_o as f64);
126    println!("  rain+legend       cl100k {rl_cl:>4} ({:.2}x)   o200k {rl_o:>4} ({:.2}x)", rl_cl as f64 / s_cl as f64, rl_o as f64 / s_o as f64);
127
128    println!("\nVERDICT");
129    println!("  Digital rain shrinks CHARACTERS (~3x — the Matrix look) but the dense glyph");
130    println!("  stream costs MORE BPE tokens than the ASCII source (ratios above), because the");
131    println!("  tokenizer splits each rare multi-byte glyph into several tokens. Adding the legend");
132    println!("  (needed for reversibility) makes a one-off snippet far worse still. Even an");
133    println!("  amortized shared codebook (stream only) does not beat source on tokens.");
134    println!("  This is the project's token-floor finding, re-confirmed on the dense-symbol idea:");
135    println!("  an LLM emits TOKENS, not glyphs/bytes — the information (names/ops/dims) is the floor.");
136    if !exact {
137        println!("\n  (heuristic run: it counts ~1 token/char and UNDER-counts CJK — the real cl100k/o200k");
138        println!("   gap is larger. Rerun with --features real-tokens for the exact, even-worse numbers.)");
139    }
140}