// lean_ctx/core/neural/token_optimizer.rs

1//! Token-optimal encoding based on empirical lab results.
2//!
3//! Uses a lookup table (concept -> optimal representation) derived from
4//! Experiment C's cross-tokenizer analysis. Falls back to identity when
5//! no optimizations are known.
6
7use std::collections::HashMap;
8use std::path::Path;
9
/// Rewrites source text using a table of literal substring replacements.
///
/// `Debug` and `Clone` are derived so the optimizer can be logged and shared
/// by value; both are available because the only field is a std `HashMap`.
#[derive(Debug, Clone)]
pub struct TokenOptimizer {
    /// Literal pattern -> replacement text. Keys are matched as plain
    /// substrings (no regex, no word boundaries).
    replacements: HashMap<String, String>,
}
13
// Lab Experiment C (2026-04-02): Unicode symbols (λ, →, §, ∂, ⊕) INCREASE token count
// on GPT-4/GPT-4o tokenizers. English keywords already encode as 1 token each.
// Only use ASCII abbreviations that tokenizers handle well.
//
// Each entry is a `(pattern, replacement)` pair of literal substrings. Every
// entry must actually change the text: an identity pair only costs a scan per
// line and inflates the reported rule count.
const DEFAULT_OPTIMIZATIONS: &[(&str, &str)] = &[
    // Keyword / primitive-type abbreviations.
    ("function ", "fn "),
    ("boolean", "bool"),
    ("string", "str"),
    ("number", "num"),
    ("undefined", "undef"),
    ("console.log", "log"),
    ("export function ", "fn "),
    // Halve four-space indentation.
    ("    ", "  "),
    // Drop generic parameters from common container/result types.
    ("Result<T, E>", "Result"),
    ("Result<T,E>", "Result"),
    ("Option<T>", "Option"),
    ("Vec<String>", "Vec"),
    ("Vec<&str>", "Vec"),
    ("Vec<u8>", "Vec"),
    ("HashMap<String, String>", "HashMap"),
    ("HashMap<K, V>", "HashMap"),
    ("HashMap<K,V>", "HashMap"),
    ("BTreeMap<K, V>", "BTreeMap"),
    ("HashSet<String>", "HashSet"),
    ("Box<dyn Error>", "Box<Error>"),
    // Shorten fully-qualified std paths to their final segment(s).
    ("std::collections::HashMap", "HashMap"),
    ("std::collections::HashSet", "HashSet"),
    ("std::collections::BTreeMap", "BTreeMap"),
    ("std::path::PathBuf", "PathBuf"),
    ("std::path::Path", "Path"),
    ("std::sync::Arc", "Arc"),
    ("std::sync::Mutex", "Mutex"),
    ("std::io::Result", "io::Result"),
    ("std::fmt::Display", "Display"),
    ("std::fmt::Debug", "Debug"),
];
50
/// BPE-aligned formatting rules — empirically measured to save tokens on o200k_base.
/// Each rule reduces token count by exploiting the tokenizer's preferred byte sequences.
///
/// Entries are `(pattern, replacement)` pairs of literal substrings.
/// `TokenOptimizer::with_defaults` inserts them into the same map as
/// `DEFAULT_OPTIMIZATIONS`, so an entry here overrides a default with the same pattern.
///
/// NOTE(review): `TokenOptimizer::optimize_block` splits its input with
/// `str::lines()`, so the patterns below that contain `\n` can only match when
/// `optimize_line` is called directly with text that still has its newlines.
const BPE_ALIGNED_RULES: &[(&str, &str)] = &[
    // Whitespace around operators: `-> ` tokenizes as 2 tokens, `->` as 1
    (" -> ", "->"),
    (" => ", "=>"),
    // Trailing semicolons after blocks waste 1 token
    ("};", "}"),
    // Double newlines cost 1 extra token each (collapses three newlines to two)
    ("\n\n\n", "\n\n"),
    // Common verbose patterns
    (".to_string()", ".into()"),
    (".to_owned()", ".into()"),
    ("pub(crate) ", "pub "),
    ("pub(super) ", "pub "),
    // Python (both rules delete the matched text outright)
    ("self, ", ""),
    ("    pass\n", ""),
    // TypeScript/JS
    ("export default ", "export "),
    (": void", ""),
    (": undefined", ""),
    // Go
    ("func (", "fn ("),
    ("interface{}", "any"),
];
77
78impl TokenOptimizer {
79    pub fn load_or_default(model_dir: &Path) -> Self {
80        let config_path = model_dir.join("token_optimizer.json");
81        if config_path.exists() {
82            match Self::load_from_file(&config_path) {
83                Ok(opt) => {
84                    tracing::info!(
85                        "Token optimizer loaded ({} rules) from {:?}",
86                        opt.replacements.len(),
87                        config_path,
88                    );
89                    return opt;
90                }
91                Err(e) => {
92                    tracing::warn!("Failed to load token optimizer: {e}. Using defaults.");
93                }
94            }
95        }
96
97        Self::with_defaults()
98    }
99
100    pub fn with_defaults() -> Self {
101        let mut replacements: HashMap<String, String> = DEFAULT_OPTIMIZATIONS
102            .iter()
103            .map(|(k, v)| (k.to_string(), v.to_string()))
104            .collect();
105
106        for &(from, to) in BPE_ALIGNED_RULES {
107            replacements.insert(from.to_string(), to.to_string());
108        }
109
110        Self { replacements }
111    }
112
113    fn load_from_file(path: &Path) -> anyhow::Result<Self> {
114        let content = std::fs::read_to_string(path)?;
115        let data: HashMap<String, String> = serde_json::from_str(&content)?;
116        Ok(Self { replacements: data })
117    }
118
119    pub fn optimize<'a>(&'a self, _concept: &str, representation: &'a str) -> &'a str {
120        representation
121    }
122
123    pub fn optimize_line(&self, line: &str) -> String {
124        let mut result = line.to_string();
125        for (from, to) in &self.replacements {
126            result = result.replace(from.as_str(), to.as_str());
127        }
128        result = elide_lifetimes(&result);
129        result
130    }
131
132    pub fn optimize_block(&self, content: &str) -> String {
133        let optimized: Vec<String> = content
134            .lines()
135            .map(|line| self.optimize_line(line))
136            .collect();
137        let collapsed = collapse_closing_braces(&optimized);
138        collapsed.join("\n")
139    }
140
141    pub fn replacement_count(&self) -> usize {
142        self.replacements.len()
143    }
144
145    /// BPE cost oracle: measure the actual token cost of a string representation.
146    /// Used to pick the cheapest encoding when multiple are semantically equivalent.
147    pub fn token_cost(text: &str) -> usize {
148        crate::core::tokens::count_tokens(text)
149    }
150
151    /// Choose the cheaper representation between two semantically equivalent strings.
152    pub fn cheaper_repr<'a>(a: &'a str, b: &'a str) -> &'a str {
153        if Self::token_cost(a) <= Self::token_cost(b) {
154            a
155        } else {
156            b
157        }
158    }
159}
160
/// Strips the named lifetimes `'a`, `'b` and `'c` from reference types in `line`.
///
/// For each lifetime in turn, `&'x mut ` becomes `&mut ` and then `&'x `
/// becomes `&`. Only lifetimes written directly after `&` and followed by a
/// space are touched; `'static` and any other lifetime name are left alone.
fn elide_lifetimes(line: &str) -> String {
    // `'static` is deliberately not in this list: it must never be elided.
    ["'a", "'b", "'c"]
        .iter()
        .fold(line.to_owned(), |text, lifetime| {
            let mut_ref = format!("&{} mut ", lifetime);
            let shared_ref = format!("&{} ", lifetime);
            // The `mut` form goes first so `&'a mut T` does not turn into `&mut T`
            // via the shorter pattern leaving a stray `mut`-less match behind.
            text.replace(&mut_ref, "&mut ").replace(&shared_ref, "&")
        })
}
175
/// Compacts runs of consecutive "closer" lines.
///
/// A closer is a line whose trimmed text is exactly `}`, `};`, `);`, `});`
/// or `)`. Within one uninterrupted run:
/// - the first and second closers are emitted on their own lines, trimmed
///   (their indentation is discarded);
/// - the third is appended to the end of the second;
/// - the fourth and later closers are dropped.
///
/// Every other line is copied through unchanged and ends the current run.
///
/// NOTE(review): dropping closers past the third makes the output lossy (brace
/// depth can no longer be recovered) — confirm this is the intended trade-off.
fn collapse_closing_braces(lines: &[String]) -> Vec<String> {
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    let mut run_len = 0u32;

    for line in lines {
        let closer = line.trim();
        let is_closer = matches!(closer, "}" | "};" | ");" | "});" | ")");

        if !is_closer {
            run_len = 0;
            out.push(line.clone());
            continue;
        }

        run_len += 1;
        match run_len {
            1 | 2 => out.push(closer.to_string()),
            3 => {
                if let Some(previous) = out.last_mut() {
                    previous.push_str(closer);
                }
            }
            // Fourth and later closers in a run are discarded.
            _ => {}
        }
    }

    out
}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Keyword and primitive-type abbreviations from the default table.
    #[test]
    fn default_optimizations_apply() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("function hello() {"), "fn hello() {");
        assert_eq!(opt.optimize_line("boolean flag"), "bool flag");
    }

    /// Four-space indentation is halved.
    #[test]
    fn indentation_compresses() {
        let opt = TokenOptimizer::with_defaults();
        let input = "    let x = 1;";
        let output = opt.optimize_line(input);
        assert_eq!(output, "  let x = 1;");
    }

    /// Generic parameters and std paths are stripped; ` -> ` is tightened too.
    #[test]
    fn generic_types_simplify() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(
            opt.optimize_line("fn foo() -> Result<T, E>"),
            "fn foo()->Result"
        );
        assert_eq!(
            opt.optimize_line("fn bar() -> Option<T>"),
            "fn bar()->Option"
        );
        assert_eq!(
            opt.optimize_line("let v: Vec<String> = vec![]"),
            "let v: Vec = vec![]"
        );
        assert_eq!(
            opt.optimize_line("use std::collections::HashMap;"),
            "use HashMap;"
        );
    }

    /// `optimize_block` applies the line rules to every line and re-joins with `\n`.
    #[test]
    fn multiline_optimization() {
        let opt = TokenOptimizer::with_defaults();
        let input = "function hello() {\n    return 42;\n}";
        let output = opt.optimize_block(input);
        assert_eq!(output, "fn hello() {\n  return 42;\n}");
    }

    /// Named reference lifetimes are removed; `'static` is kept.
    #[test]
    fn lifetime_elision() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(
            opt.optimize_line("fn foo(&'a str) -> &'a str"),
            "fn foo(&str)->&str"
        );
        assert_eq!(opt.optimize_line("fn bar(&'a mut Vec)"), "fn bar(&mut Vec)");
        assert_eq!(
            opt.optimize_line("fn baz(&'static str)"),
            "fn baz(&'static str)",
            "'static must not be elided"
        );
    }

    /// A long run of closing braces leaves at most two brace-only lines and
    /// does not swallow the code that follows it.
    #[test]
    fn closing_brace_collapsing() {
        let opt = TokenOptimizer::with_defaults();
        let input = "fn main() {\n  inner() {\n    x\n  }\n}\n}\n}\n}\nfn next() {}";
        let output = opt.optimize_block(input);
        assert!(output.contains("fn next()"), "code after braces preserved");
        let brace_only_lines: Vec<_> = output.lines().filter(|l| l.trim() == "}").collect();
        assert!(
            brace_only_lines.len() <= 2,
            "should collapse 4+ closing braces"
        );
    }

    /// Fully-qualified std paths are shortened to their final segment.
    #[test]
    fn std_path_shortening() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("use std::path::PathBuf;"), "use PathBuf;");
        assert_eq!(opt.optimize_line("use std::sync::Arc;"), "use Arc;");
    }

    /// Spaces around `->` are removed.
    #[test]
    fn bpe_aligned_arrow_compression() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("fn foo() -> bool {"), "fn foo()->bool {");
    }

    /// The cost oracle reports a non-zero cost for non-empty text.
    #[test]
    fn bpe_cost_oracle_works() {
        let cost = TokenOptimizer::token_cost("hello world");
        assert!(cost > 0);
    }

    /// `cheaper_repr` must return a string whose cost is the minimum of BOTH
    /// inputs. Comparing only against the first argument would also pass for
    /// an implementation that always returns `a`.
    #[test]
    fn cheaper_repr_picks_shorter() {
        let spaced = "fn foo() -> bool";
        let compact = "fn foo()->bool";
        let result = TokenOptimizer::cheaper_repr(spaced, compact);
        let best = TokenOptimizer::token_cost(spaced).min(TokenOptimizer::token_cost(compact));
        assert_eq!(TokenOptimizer::token_cost(result), best);
        assert!(result == spaced || result == compact);
    }
}