Skip to main content

lean_ctx/core/neural/
token_optimizer.rs

1//! Token-optimal encoding based on empirical lab results.
2//!
3//! Uses a lookup table (concept -> optimal representation) derived from
4//! Experiment C's cross-tokenizer analysis. Falls back to identity when
5//! no optimizations are known.
6
7use std::collections::HashMap;
8use std::path::Path;
9
/// Rewrites source text into a form intended to cost fewer tokens.
///
/// The optimizer is a table of literal substring substitutions; no parsing of
/// the input language takes place.
#[derive(Debug, Clone)]
pub struct TokenOptimizer {
    /// Literal pattern -> replacement text. Keys are matched as plain
    /// substrings, not as regular expressions.
    replacements: HashMap<String, String>,
}
13
// Lab Experiment C (2026-04-02): Unicode symbols (λ, →, §, ∂, ⊕) INCREASE token count
// on GPT-4/GPT-4o tokenizers. English keywords already encode as 1 token each.
// Only use ASCII abbreviations that tokenizers handle well.
//
// Each entry is `(pattern, replacement)` and is matched as a literal substring.
// Some patterns contain others (e.g. "export function " contains "function "),
// so the outcome for such inputs depends on the order in which rules are applied.
// Every entry must actually change the text: a rule whose replacement equals its
// pattern is a no-op that still costs a scan per line.
const DEFAULT_OPTIMIZATIONS: &[(&str, &str)] = &[
    // Keyword and builtin abbreviations.
    ("function ", "fn "),
    ("boolean", "bool"),
    ("string", "str"),
    ("number", "num"),
    ("undefined", "undef"),
    ("console.log", "log"),
    ("export function ", "fn "),
    // Halve four-space indentation.
    ("    ", "  "),
    // Drop generic parameters from common container types.
    ("Result<T, E>", "Result"),
    ("Result<T,E>", "Result"),
    ("Option<T>", "Option"),
    ("Vec<String>", "Vec"),
    ("Vec<&str>", "Vec"),
    ("Vec<u8>", "Vec"),
    ("HashMap<String, String>", "HashMap"),
    ("HashMap<K, V>", "HashMap"),
    ("HashMap<K,V>", "HashMap"),
    ("BTreeMap<K, V>", "BTreeMap"),
    ("HashSet<String>", "HashSet"),
    ("Box<dyn Error>", "Box<Error>"),
    // Shorten fully qualified standard-library paths.
    ("std::collections::HashMap", "HashMap"),
    ("std::collections::HashSet", "HashSet"),
    ("std::collections::BTreeMap", "BTreeMap"),
    ("std::path::PathBuf", "PathBuf"),
    ("std::path::Path", "Path"),
    ("std::sync::Arc", "Arc"),
    ("std::sync::Mutex", "Mutex"),
    ("std::io::Result", "io::Result"),
    ("std::fmt::Display", "Display"),
    ("std::fmt::Debug", "Debug"),
];
50
/// Formatting rewrites aligned with BPE merges, empirically measured to save
/// tokens on o200k_base.
///
/// Only rules considered SAFE (never breaking semantics or compilability) are
/// kept. The BPE Guard audit of 2026-04-03 removed these dangerous rules:
///
/// - `.to_string()->.into()`: not always equivalent, trait-dependent
/// - `.to_owned()->.into()`: same issue
/// - `self, ->""`: breaks Python method signatures
/// - `pass\n->""`: removes required Python stubs
/// - `}: ->}`: breaks struct initialization `Foo { field: val };`
/// - `: void->""`: breaks TypeScript explicit return types
/// - `: undefined->""`: breaks TypeScript type annotations
/// - `func (->fn (`: breaks Go method receivers
/// - `interface{}->any`: only valid in Go 1.18+
const BPE_ALIGNED_RULES: &[(&str, &str)] = &[
    // Tighten whitespace around arrows.
    (" -> ", "->"),
    (" => ", "=>"),
    // Squeeze a triple newline down to a double newline.
    ("\n\n\n", "\n\n"),
    // Shorten visibility and export qualifiers.
    ("pub(crate) ", "pub "),
    ("pub(super) ", "pub "),
    ("export default ", "export "),
];
71
72impl TokenOptimizer {
73    pub fn load_or_default(model_dir: &Path) -> Self {
74        let config_path = model_dir.join("token_optimizer.json");
75        if config_path.exists() {
76            match Self::load_from_file(&config_path) {
77                Ok(opt) => {
78                    tracing::info!(
79                        "Token optimizer loaded ({} rules) from {:?}",
80                        opt.replacements.len(),
81                        config_path,
82                    );
83                    return opt;
84                }
85                Err(e) => {
86                    tracing::warn!("Failed to load token optimizer: {e}. Using defaults.");
87                }
88            }
89        }
90
91        Self::with_defaults()
92    }
93
94    pub fn with_defaults() -> Self {
95        let mut replacements: HashMap<String, String> = DEFAULT_OPTIMIZATIONS
96            .iter()
97            .map(|(k, v)| (k.to_string(), v.to_string()))
98            .collect();
99
100        for &(from, to) in BPE_ALIGNED_RULES {
101            replacements.insert(from.to_string(), to.to_string());
102        }
103
104        Self { replacements }
105    }
106
107    fn load_from_file(path: &Path) -> anyhow::Result<Self> {
108        let content = std::fs::read_to_string(path)?;
109        let data: HashMap<String, String> = serde_json::from_str(&content)?;
110        Ok(Self { replacements: data })
111    }
112
113    pub fn optimize<'a>(&'a self, _concept: &str, representation: &'a str) -> &'a str {
114        representation
115    }
116
117    pub fn optimize_line(&self, line: &str) -> String {
118        let mut result = line.to_string();
119        for (from, to) in &self.replacements {
120            result = result.replace(from.as_str(), to.as_str());
121        }
122        result = elide_lifetimes(&result);
123        result
124    }
125
126    pub fn optimize_block(&self, content: &str) -> String {
127        let optimized: Vec<String> = content
128            .lines()
129            .map(|line| self.optimize_line(line))
130            .collect();
131        let collapsed = collapse_closing_braces(&optimized);
132        collapsed.join("\n")
133    }
134
135    pub fn replacement_count(&self) -> usize {
136        self.replacements.len()
137    }
138
139    /// BPE cost oracle: measure the actual token cost of a string representation.
140    /// Used to pick the cheapest encoding when multiple are semantically equivalent.
141    pub fn token_cost(text: &str) -> usize {
142        crate::core::tokens::count_tokens(text)
143    }
144
145    /// Choose the cheaper representation between two semantically equivalent strings.
146    pub fn cheaper_repr<'a>(a: &'a str, b: &'a str) -> &'a str {
147        if Self::token_cost(a) <= Self::token_cost(b) {
148            a
149        } else {
150            b
151        }
152    }
153}
154
/// Strips the short named lifetimes `'a`, `'b` and `'c` from references.
///
/// `&'a T` becomes `&T` and `&'a mut T` becomes `&mut T`. Only occurrences
/// written as `&'x ` (ampersand, lifetime, one space) are affected; `'static`
/// and any other lifetime name are left untouched.
fn elide_lifetimes(line: &str) -> String {
    // Removing the `'x ` part right after `&` also covers the `mut` form,
    // since `&'x mut ` then reads `&mut `.
    const ELIDABLE: [&str; 3] = ["&'a ", "&'b ", "&'c "];

    ELIDABLE
        .iter()
        .fold(line.to_owned(), |text, &marker| text.replace(marker, "&"))
}
169
/// Merges runs of consecutive closer-only lines to save line breaks.
///
/// A line counts as a closer when, after trimming, it is exactly one of
/// `}`, `};`, `);`, `});` or `)`. Within a run of such lines:
///
/// - the first two are emitted as separate lines, trimmed of indentation;
/// - every further closer is appended to the last emitted line.
///
/// No closer is ever dropped, so the output contains the same closing tokens
/// as the input. All other lines are passed through unchanged and end the
/// current run.
fn collapse_closing_braces(lines: &[String]) -> Vec<String> {
    let mut result: Vec<String> = Vec::with_capacity(lines.len());
    let mut closer_run = 0usize;

    for line in lines {
        let trimmed = line.trim();
        if !matches!(trimmed, "}" | "};" | ");" | "});" | ")") {
            closer_run = 0;
            result.push(line.clone());
            continue;
        }

        closer_run += 1;
        if closer_run <= 2 {
            result.push(trimmed.to_string());
        } else if let Some(last) = result.last_mut() {
            // Third and later closers join the second one instead of being
            // discarded, keeping the output balanced.
            last.push_str(trimmed);
        }
    }
    result
}
192
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through `optimize_line` on a default-configured optimizer.
    fn line(input: &str) -> String {
        TokenOptimizer::with_defaults().optimize_line(input)
    }

    /// Runs `input` through `optimize_block` on a default-configured optimizer.
    fn block(input: &str) -> String {
        TokenOptimizer::with_defaults().optimize_block(input)
    }

    #[test]
    fn default_optimizations_apply() {
        assert_eq!(line("function hello() {"), "fn hello() {");
        assert_eq!(line("boolean flag"), "bool flag");
    }

    #[test]
    fn indentation_compresses() {
        // Four leading spaces shrink to two.
        assert_eq!(line("    let x = 1;"), "  let x = 1;");
    }

    #[test]
    fn generic_types_simplify() {
        let cases = [
            ("fn foo() -> Result<T, E>", "fn foo()->Result"),
            ("fn bar() -> Option<T>", "fn bar()->Option"),
            ("let v: Vec<String> = vec![]", "let v: Vec = vec![]"),
            ("use std::collections::HashMap;", "use HashMap;"),
        ];
        for (input, expected) in cases {
            assert_eq!(line(input), expected);
        }
    }

    #[test]
    fn multiline_optimization() {
        let source = "function hello() {\n    return 42;\n}";
        assert_eq!(block(source), "fn hello() {\n  return 42;\n}");
    }

    #[test]
    fn lifetime_elision() {
        assert_eq!(line("fn foo(&'a str) -> &'a str"), "fn foo(&str)->&str");
        assert_eq!(line("fn bar(&'a mut Vec)"), "fn bar(&mut Vec)");
        assert_eq!(
            line("fn baz(&'static str)"),
            "fn baz(&'static str)",
            "'static must not be elided"
        );
    }

    #[test]
    fn closing_brace_collapsing() {
        let source = "fn main() {\n  inner() {\n    x\n  }\n}\n}\n}\n}\nfn next() {}";
        let compact = block(source);
        assert!(compact.contains("fn next()"), "code after braces preserved");

        let lone_braces = compact.lines().filter(|l| l.trim() == "}").count();
        assert!(lone_braces <= 2, "should collapse 4+ closing braces");
    }

    #[test]
    fn std_path_shortening() {
        assert_eq!(line("use std::path::PathBuf;"), "use PathBuf;");
        assert_eq!(line("use std::sync::Arc;"), "use Arc;");
    }

    #[test]
    fn bpe_aligned_arrow_compression() {
        assert_eq!(line("fn foo() -> bool {"), "fn foo()->bool {");
    }

    #[test]
    fn bpe_cost_oracle_works() {
        assert!(TokenOptimizer::token_cost("hello world") > 0);
    }

    #[test]
    fn cheaper_repr_picks_shorter() {
        let spaced = "fn foo() -> bool";
        let picked = TokenOptimizer::cheaper_repr(spaced, "fn foo()->bool");
        // Whichever side wins, it can never cost more than the first candidate.
        assert!(TokenOptimizer::token_cost(picked) <= TokenOptimizer::token_cost(spaced));
    }
}