// lean_ctx/core/neural/token_optimizer.rs
//! Token-optimal encoding based on empirical lab results.
//!
//! Uses a lookup table (concept -> optimal representation) derived from
//! Experiment C's cross-tokenizer analysis. Falls back to identity when
//! no optimizations are known.

use std::collections::HashMap;
use std::path::Path;

/// Rewrites source text into a shorter, token-cheaper form using a table of
/// empirically validated string replacements (see `DEFAULT_OPTIMIZATIONS`).
pub struct TokenOptimizer {
    // Maps a source pattern to the shorter representation it is replaced with.
    // Populated either from `token_optimizer.json` or the built-in defaults.
    replacements: HashMap<String, String>,
}

// Lab Experiment C (2026-04-02): Unicode symbols (λ, →, §, ∂, ⊕) INCREASE token count
// on GPT-4/GPT-4o tokenizers. English keywords already encode as 1 token each.
// Only use ASCII abbreviations that tokenizers handle well.
//
// Invariant: no entry maps a pattern to itself — an identity pair is a wasted
// full-string scan per line. (A former `("Arc<Mutex<", "Arc<Mutex<")` no-op
// entry was removed for exactly that reason.)
const DEFAULT_OPTIMIZATIONS: &[(&str, &str)] = &[
    ("function ", "fn "),
    ("boolean", "bool"),
    ("string", "str"),
    ("number", "num"),
    ("undefined", "undef"),
    ("console.log", "log"),
    ("export function ", "fn "),
    ("    ", "  "),
    ("Result<T, E>", "Result"),
    ("Result<T,E>", "Result"),
    ("Option<T>", "Option"),
    ("Vec<String>", "Vec"),
    ("Vec<&str>", "Vec"),
    ("Vec<u8>", "Vec"),
    ("HashMap<String, String>", "HashMap"),
    ("HashMap<K, V>", "HashMap"),
    ("HashMap<K,V>", "HashMap"),
    ("BTreeMap<K, V>", "BTreeMap"),
    ("HashSet<String>", "HashSet"),
    ("Box<dyn Error>", "Box<Error>"),
    ("std::collections::HashMap", "HashMap"),
    ("std::collections::HashSet", "HashSet"),
    ("std::collections::BTreeMap", "BTreeMap"),
    ("std::path::PathBuf", "PathBuf"),
    ("std::path::Path", "Path"),
    ("std::sync::Arc", "Arc"),
    ("std::sync::Mutex", "Mutex"),
    ("std::io::Result", "io::Result"),
    ("std::fmt::Display", "Display"),
    ("std::fmt::Debug", "Debug"),
]
;

51impl TokenOptimizer {
52    pub fn load_or_default(model_dir: &Path) -> Self {
53        let config_path = model_dir.join("token_optimizer.json");
54        if config_path.exists() {
55            match Self::load_from_file(&config_path) {
56                Ok(opt) => {
57                    tracing::info!(
58                        "Token optimizer loaded ({} rules) from {:?}",
59                        opt.replacements.len(),
60                        config_path,
61                    );
62                    return opt;
63                }
64                Err(e) => {
65                    tracing::warn!("Failed to load token optimizer: {e}. Using defaults.");
66                }
67            }
68        }
69
70        Self::with_defaults()
71    }
72
73    pub fn with_defaults() -> Self {
74        let replacements: HashMap<String, String> = DEFAULT_OPTIMIZATIONS
75            .iter()
76            .map(|(k, v)| (k.to_string(), v.to_string()))
77            .collect();
78
79        Self { replacements }
80    }
81
82    fn load_from_file(path: &Path) -> anyhow::Result<Self> {
83        let content = std::fs::read_to_string(path)?;
84        let data: HashMap<String, String> = serde_json::from_str(&content)?;
85        Ok(Self { replacements: data })
86    }
87
88    pub fn optimize<'a>(&'a self, _concept: &str, representation: &'a str) -> &'a str {
89        representation
90    }
91
92    pub fn optimize_line(&self, line: &str) -> String {
93        let mut result = line.to_string();
94        for (from, to) in &self.replacements {
95            result = result.replace(from.as_str(), to.as_str());
96        }
97        result = elide_lifetimes(&result);
98        result
99    }
100
101    pub fn optimize_block(&self, content: &str) -> String {
102        let optimized: Vec<String> = content
103            .lines()
104            .map(|line| self.optimize_line(line))
105            .collect();
106        let collapsed = collapse_closing_braces(&optimized);
107        collapsed.join("\n")
108    }
109
110    pub fn replacement_count(&self) -> usize {
111        self.replacements.len()
112    }
113}
114
/// Strips explicit named lifetimes from reference syntax (`&'a str` -> `&str`,
/// `&'a mut T` -> `&mut T`) to save tokens.
///
/// `'static` is deliberately NOT in the pattern list: it is semantically
/// meaningful and must survive. (The old version listed it and then skipped it
/// with a `continue` — dead code, now removed.)
fn elide_lifetimes(line: &str) -> String {
    let mut result = line.to_string();
    for pat in ["'a ", "'b ", "'c "] {
        // Replace the `mut` form first so both spellings collapse cleanly.
        result = result.replace(&format!("&{pat}mut "), "&mut ");
        result = result.replace(&format!("&{pat}"), "&");
    }
    result
}

/// Collapses consecutive closing-delimiter-only lines to save newline tokens.
///
/// The first two lines of a run are kept as separate lines; every delimiter
/// from the third onward is appended to the previously kept line. No
/// delimiter is ever discarded, so the brace/paren balance of the input is
/// preserved. (The old version appended only the third delimiter and silently
/// DROPPED the fourth and later ones, producing unbalanced output.)
fn collapse_closing_braces(lines: &[String]) -> Vec<String> {
    let mut result: Vec<String> = Vec::with_capacity(lines.len());
    let mut brace_run = 0u32;

    for line in lines {
        let trimmed = line.trim();
        if matches!(trimmed, "}" | "};" | ");" | "});" | ")") {
            brace_run += 1;
            if brace_run <= 2 {
                result.push(trimmed.to_string());
            } else if let Some(last) = result.last_mut() {
                // Merge the 3rd, 4th, ... delimiters onto the previous line.
                last.push_str(trimmed);
            } else {
                // Defensive: a run can't start with an empty result today
                // (run position 1 always pushes), but never drop a delimiter.
                result.push(trimmed.to_string());
            }
            continue;
        }
        brace_run = 0;
        result.push(line.clone());
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    // Basic keyword abbreviations from the default table.
    #[test]
    fn default_optimizations_apply() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("function hello() {"), "fn hello() {");
        assert_eq!(opt.optimize_line("boolean flag"), "bool flag");
    }

    // 4-space indent halves to 2 spaces via the "    " -> "  " rule.
    #[test]
    fn indentation_compresses() {
        let opt = TokenOptimizer::with_defaults();
        let input = "    let x = 1;";
        let output = opt.optimize_line(input);
        assert_eq!(output, "  let x = 1;");
    }

    // Generic parameter lists and fully-qualified std paths are shortened.
    #[test]
    fn generic_types_simplify() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(
            opt.optimize_line("fn foo() -> Result<T, E>"),
            "fn foo() -> Result"
        );
        assert_eq!(
            opt.optimize_line("fn bar() -> Option<T>"),
            "fn bar() -> Option"
        );
        assert_eq!(
            opt.optimize_line("let v: Vec<String> = vec![]"),
            "let v: Vec = vec![]"
        );
        assert_eq!(
            opt.optimize_line("use std::collections::HashMap;"),
            "use HashMap;"
        );
    }

    // optimize_block applies line rules to every line and rejoins with '\n'.
    #[test]
    fn multiline_optimization() {
        let opt = TokenOptimizer::with_defaults();
        let input = "function hello() {\n    return 42;\n}";
        let output = opt.optimize_block(input);
        assert_eq!(output, "fn hello() {\n  return 42;\n}");
    }

    // Named lifetimes are elided; 'static carries meaning and must survive.
    #[test]
    fn lifetime_elision() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(
            opt.optimize_line("fn foo(&'a str) -> &'a str"),
            "fn foo(&str) -> &str"
        );
        assert_eq!(opt.optimize_line("fn bar(&'a mut Vec)"), "fn bar(&mut Vec)");
        assert_eq!(
            opt.optimize_line("fn baz(&'static str)"),
            "fn baz(&'static str)",
            "'static must not be elided"
        );
    }

    // Long runs of closing-brace-only lines collapse, and code following the
    // run is untouched.
    #[test]
    fn closing_brace_collapsing() {
        let opt = TokenOptimizer::with_defaults();
        let input = "fn main() {\n  inner() {\n    x\n  }\n}\n}\n}\n}\nfn next() {}";
        let output = opt.optimize_block(input);
        assert!(output.contains("fn next()"), "code after braces preserved");
        let brace_only_lines: Vec<_> = output.lines().filter(|l| l.trim() == "}").collect();
        assert!(
            brace_only_lines.len() <= 2,
            "should collapse 4+ closing braces"
        );
    }

    // Fully-qualified std::path / std::sync imports shorten to the leaf name.
    #[test]
    fn std_path_shortening() {
        let opt = TokenOptimizer::with_defaults();
        assert_eq!(opt.optimize_line("use std::path::PathBuf;"), "use PathBuf;");
        assert_eq!(opt.optimize_line("use std::sync::Arc;"), "use Arc;");
    }
}