Skip to main content

lean_ctx/core/neural/
token_optimizer.rs

1//! Token-optimal encoding based on empirical lab results.
2//!
3//! Uses a lookup table (concept -> optimal representation) derived from
4//! Experiment C's cross-tokenizer analysis. Falls back to identity when
5//! no optimizations are known.
6
7use std::path::Path;
8
/// Rewrites source text into a cheaper-to-tokenize form using literal
/// `(pattern, replacement)` substitutions.
///
/// Construct it with `TokenOptimizer::with_defaults` or
/// `TokenOptimizer::load_or_default`.
#[derive(Debug, Clone)]
pub struct TokenOptimizer {
    /// `(pattern, replacement)` pairs, applied in stored order by `optimize_line`.
    replacements: Vec<(String, String)>,
}
12
// Lab Experiment C (2026-04-02): Unicode symbols (λ, →, §, ∂, ⊕) INCREASE token count
// on GPT-4/GPT-4o tokenizers. English keywords already encode as 1 token each.
// Only use ASCII abbreviations that tokenizers handle well.

/// Built-in `(pattern, replacement)` substitutions used by `TokenOptimizer::with_defaults`.
///
/// Matching is plain, case-sensitive substring replacement, so a pattern such as
/// `"string"` also rewrites longer identifiers that contain it (for example
/// `to_string` becomes `to_str`).
///
/// Every entry must actually shorten the text: a rule whose replacement equals its
/// pattern only adds a wasted scan per line, so none is kept here.
const DEFAULT_OPTIMIZATIONS: &[(&str, &str)] = &[
    // Keyword / primitive-type abbreviations.
    ("function ", "fn "),
    ("boolean", "bool"),
    ("string", "str"),
    ("number", "num"),
    ("undefined", "undef"),
    ("console.log", "log"),
    ("export function ", "fn "),
    // Halve four-space indentation.
    ("    ", "  "),
    // Drop common generic parameter lists.
    ("Result<T, E>", "Result"),
    ("Result<T,E>", "Result"),
    ("Option<T>", "Option"),
    ("Vec<String>", "Vec"),
    ("Vec<&str>", "Vec"),
    ("Vec<u8>", "Vec"),
    ("HashMap<String, String>", "HashMap"),
    ("HashMap<K, V>", "HashMap"),
    ("HashMap<K,V>", "HashMap"),
    ("BTreeMap<K, V>", "BTreeMap"),
    ("HashSet<String>", "HashSet"),
    ("Box<dyn Error>", "Box<Error>"),
    // Shorten fully qualified std paths.
    ("std::collections::HashMap", "HashMap"),
    ("std::collections::HashSet", "HashSet"),
    ("std::collections::BTreeMap", "BTreeMap"),
    ("std::path::PathBuf", "PathBuf"),
    ("std::path::Path", "Path"),
    ("std::sync::Arc", "Arc"),
    ("std::sync::Mutex", "Mutex"),
    ("std::io::Result", "io::Result"),
    ("std::fmt::Display", "Display"),
    ("std::fmt::Debug", "Debug"),
];
49
/// BPE-aligned formatting rules — empirically measured to save tokens on o200k_base.
/// Only SAFE rules that never break semantics or compilability.
/// Dangerous rules removed after BPE Guard audit (2026-04-03):
///   REMOVED: `.to_string()->.into()` (not always equivalent, trait-dependent)
///   REMOVED: `.to_owned()->.into()` (same issue)
///   REMOVED: `self, ->""` (breaks Python method signatures)
///   REMOVED: `pass\n->""` (removes required Python stubs)
///   REMOVED: `}: ->}` (breaks struct initialization `Foo { field: val };`)
///   REMOVED: `: void->""` (breaks TypeScript explicit return types)
///   REMOVED: `: undefined->""` (breaks TypeScript type annotations)
///   REMOVED: `func (->fn (` (breaks Go method receivers)
///   REMOVED: `interface{}->any` (only valid in Go 1.18+)
///
/// Each entry is a `(pattern, replacement)` pair. `TokenOptimizer::with_defaults`
/// appends these to `DEFAULT_OPTIMIZATIONS` before the combined list is sorted.
///
/// NOTE(review): the `"\n\n\n"` rule can only match when `optimize_line` is handed
/// text that itself contains newlines. `optimize_block` splits its input into lines
/// first, so the rule never fires on that path — confirm whether that is intended.
/// NOTE(review): the `pub(crate) ` / `pub(super) ` and `export default ` rules change
/// visibility/export text — confirm they meet the "SAFE" bar stated above.
const BPE_ALIGNED_RULES: &[(&str, &str)] = &[
    // Drop the spaces around arrows.
    (" -> ", "->"),
    (" => ", "=>"),
    // Collapse one run of three newlines to two.
    ("\n\n\n", "\n\n"),
    // Shorten visibility / export qualifiers.
    ("pub(crate) ", "pub "),
    ("pub(super) ", "pub "),
    ("export default ", "export "),
];
70
71impl TokenOptimizer {
72    pub fn load_or_default(model_dir: &Path) -> Self {
73        let config_path = model_dir.join("token_optimizer.json");
74        if config_path.exists() {
75            match Self::load_from_file(&config_path) {
76                Ok(opt) => {
77                    tracing::info!(
78                        "Token optimizer loaded ({} rules) from {:?}",
79                        opt.replacements.len(),
80                        config_path,
81                    );
82                    return opt;
83                }
84                Err(e) => {
85                    tracing::warn!("Failed to load token optimizer: {e}. Using defaults.");
86                }
87            }
88        }
89
90        Self::with_defaults()
91    }
92
93    pub fn with_defaults() -> Self {
94        let mut rules: Vec<(String, String)> = DEFAULT_OPTIMIZATIONS
95            .iter()
96            .map(|(k, v)| (k.to_string(), v.to_string()))
97            .collect();
98        rules.extend(
99            BPE_ALIGNED_RULES
100                .iter()
101                .map(|(k, v)| (k.to_string(), v.to_string())),
102        );
103        Self {
104            replacements: sort_rules(rules),
105        }
106    }
107
108    fn load_from_file(path: &Path) -> anyhow::Result<Self> {
109        let content = std::fs::read_to_string(path)?;
110        let data: std::collections::HashMap<String, String> = serde_json::from_str(&content)?;
111        let rules: Vec<(String, String)> = data.into_iter().collect();
112        Ok(Self {
113            replacements: sort_rules(rules),
114        })
115    }
116
117    pub fn optimize_line(&self, line: &str) -> String {
118        let mut result = line.to_string();
119        for (from, to) in &self.replacements {
120            result = result.replace(from.as_str(), to.as_str());
121        }
122        result = elide_lifetimes(&result);
123        result
124    }
125
126    pub fn optimize_block(&self, content: &str) -> String {
127        let optimized: Vec<String> = content
128            .lines()
129            .map(|line| self.optimize_line(line))
130            .collect();
131        let collapsed = collapse_closing_braces(&optimized);
132        collapsed.join("\n")
133    }
134
135    pub fn replacement_count(&self) -> usize {
136        self.replacements.len()
137    }
138
139    /// BPE cost oracle: measure the actual token cost of a string representation.
140    /// Used to pick the cheapest encoding when multiple are semantically equivalent.
141    pub fn token_cost(text: &str) -> usize {
142        crate::core::tokens::count_tokens(text)
143    }
144
145    /// Choose the cheaper representation between two semantically equivalent strings.
146    pub fn cheaper_repr<'a>(a: &'a str, b: &'a str) -> &'a str {
147        if Self::token_cost(a) <= Self::token_cost(b) {
148            a
149        } else {
150            b
151        }
152    }
153}
154
/// Puts rules into a deterministic application order.
///
/// Rules with a longer pattern (by byte length) come first, so a specific pattern
/// is tried before any shorter pattern it contains. Ties are broken by comparing
/// the pattern and then the replacement lexically.
fn sort_rules(mut rules: Vec<(String, String)>) -> Vec<(String, String)> {
    rules.sort_by(|left, right| {
        // Descending pattern length; tuple comparison then covers pattern and
        // replacement in that order.
        right
            .0
            .len()
            .cmp(&left.0.len())
            .then_with(|| left.cmp(right))
    });
    rules
}
166
/// Strips the named lifetimes `'a`, `'b` and `'c` from reference types in `line`.
///
/// `&'a mut T` becomes `&mut T` and `&'a T` becomes `&T`. `'static` is
/// deliberately left untouched, as are lifetimes that are not directly attached
/// to a `&` (for example the declaration in `fn f<'a>`).
fn elide_lifetimes(line: &str) -> String {
    // (mutable-reference form, shared-reference form) for each elidable lifetime.
    // The `mut` form is replaced first so the shorter shared form cannot eat its prefix.
    const ELIDABLE: [(&str, &str); 3] = [
        ("&'a mut ", "&'a "),
        ("&'b mut ", "&'b "),
        ("&'c mut ", "&'c "),
    ];

    let mut result = line.to_string();
    // Every pattern starts with `&'`; without it nothing can match.
    if !result.contains("&'") {
        return result;
    }
    for &(with_mut, with_ref) in ELIDABLE.iter() {
        result = result.replace(with_mut, "&mut ");
        result = result.replace(with_ref, "&");
    }
    result
}
181
182fn collapse_closing_braces(lines: &[String]) -> Vec<String> {
183    let mut result: Vec<String> = Vec::with_capacity(lines.len());
184    let mut brace_run = 0u32;
185
186    for line in lines {
187        let trimmed = line.trim();
188        if matches!(trimmed, "}" | "};" | ");" | "});" | ")") {
189            brace_run += 1;
190            if brace_run <= 2 {
191                result.push(trimmed.to_string());
192            } else if brace_run == 3 {
193                if let Some(last) = result.last_mut() {
194                    last.push_str(trimmed);
195                }
196            }
197            continue;
198        }
199        brace_run = 0;
200        result.push(line.clone());
201    }
202    result
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    // Keyword and primitive-type abbreviations from the default table.
    #[test]
    fn default_optimizations_apply() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("function hello() {"), "fn hello() {");
        assert_eq!(opt.optimize_line("boolean flag"), "bool flag");
    }

    // A longer pattern must be applied before a shorter pattern it contains.
    #[test]
    fn longer_rule_takes_precedence() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("export function foo()"), "fn foo()");
    }

    #[test]
    fn indentation_compresses() {
        let opt = TokenOptimizer::with_defaults();
        let input = "    let x = 1;";
        let output = opt.optimize_line(input);
        assert_eq!(output, "  let x = 1;");
    }

    #[test]
    fn generic_types_simplify() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(
            opt.optimize_line("fn foo() -> Result<T, E>"),
            "fn foo()->Result"
        );
        assert_eq!(
            opt.optimize_line("fn bar() -> Option<T>"),
            "fn bar()->Option"
        );
        assert_eq!(
            opt.optimize_line("let v: Vec<String> = vec![]"),
            "let v: Vec = vec![]"
        );
        assert_eq!(
            opt.optimize_line("use std::collections::HashMap;"),
            "use HashMap;"
        );
    }

    #[test]
    fn multiline_optimization() {
        let opt = TokenOptimizer::with_defaults();
        let input = "function hello() {\n    return 42;\n}";
        let output = opt.optimize_block(input);
        assert_eq!(output, "fn hello() {\n  return 42;\n}");
    }

    #[test]
    fn lifetime_elision() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(
            opt.optimize_line("fn foo(&'a str) -> &'a str"),
            "fn foo(&str)->&str"
        );
        assert_eq!(opt.optimize_line("fn bar(&'a mut Vec)"), "fn bar(&mut Vec)");
        assert_eq!(
            opt.optimize_line("fn baz(&'static str)"),
            "fn baz(&'static str)",
            "'static must not be elided"
        );
    }

    #[test]
    fn closing_brace_collapsing() {
        let opt = TokenOptimizer::with_defaults();
        let input = "fn main() {\n  inner() {\n    x\n  }\n}\n}\n}\n}\nfn next() {}";
        let output = opt.optimize_block(input);
        assert!(output.contains("fn next()"), "code after braces preserved");
        let brace_only_lines: Vec<_> = output.lines().filter(|l| l.trim() == "}").collect();
        assert!(
            brace_only_lines.len() <= 2,
            "should collapse 4+ closing braces"
        );
    }

    #[test]
    fn std_path_shortening() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("use std::path::PathBuf;"), "use PathBuf;");
        assert_eq!(opt.optimize_line("use std::sync::Arc;"), "use Arc;");
    }

    #[test]
    fn bpe_aligned_arrow_compression() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("fn foo() -> bool {"), "fn foo()->bool {");
    }

    #[test]
    fn bpe_cost_oracle_works() {
        let cost = TokenOptimizer::token_cost("hello world");
        assert!(cost > 0);
    }

    // The chosen representation must be one of the inputs and no more expensive
    // than either of them; checking only one side would accept an implementation
    // that always returns its first argument.
    #[test]
    fn cheaper_repr_picks_shorter() {
        let spaced = "fn foo() -> bool";
        let compact = "fn foo()->bool";
        let result = TokenOptimizer::cheaper_repr(spaced, compact);
        assert!(
            result == spaced || result == compact,
            "result must be one of the candidates"
        );
        assert!(TokenOptimizer::token_cost(result) <= TokenOptimizer::token_cost(spaced));
        assert!(TokenOptimizer::token_cost(result) <= TokenOptimizer::token_cost(compact));
    }
}