Skip to main content

lean_ctx/core/
symbol_map.rs

1use std::collections::HashMap;
2
3use crate::core::tokens::count_tokens;
4
/// Expands to a `&'static regex::Regex` for the given pattern.
///
/// Each expansion site owns its own `OnceLock`, so the pattern is compiled the
/// first time that site is reached and reused on every later call. The pattern
/// must be a literal (it is fed to `concat!` for the panic message); an invalid
/// pattern is a programming error and panics on first use.
macro_rules! static_regex {
    ($src:expr) => {{
        static COMPILED: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        // Named initializer keeps the compile-once step separate from the lookup.
        let compile = || {
            regex::Regex::new($src).expect(concat!("BUG: invalid static regex: ", $src))
        };
        COMPILED.get_or_init(compile)
    }};
}
13
/// Identifiers shorter than this many bytes are never registered or extracted.
const MIN_IDENT_LENGTH: usize = 6;

/// Leading character of every generated short id (`α1`, `α2`, ...).
const SHORT_ID_PREFIX: char = 'α';
16
17/// Whether alpha/§MAP identifier substitution should be applied to tool output.
18///
19/// Activation order:
20/// 1. `LEAN_CTX_SYMBOL_MAP=1` env var → force on
21/// 2. `LEAN_CTX_SYMBOL_MAP=0` env var → force off
22/// 3. `symbol_map_auto = true` in config + project >50 source files → auto-on
23/// 4. Default: off (the abbreviated form hinders editing; opt-in only)
24pub fn substitution_enabled() -> bool {
25    if let Ok(v) = std::env::var("LEAN_CTX_SYMBOL_MAP") {
26        return v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("on");
27    }
28    let cfg = crate::core::config::Config::load();
29    if cfg.symbol_map_auto {
30        return auto_detect_large_project();
31    }
32    false
33}
34
35fn auto_detect_large_project() -> bool {
36    use std::sync::OnceLock;
37    static DETECTED: OnceLock<bool> = OnceLock::new();
38    *DETECTED.get_or_init(|| {
39        let cwd = std::env::current_dir().unwrap_or_default();
40        let source_exts = [
41            "rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "rb", "cpp", "c", "h",
42        ];
43        let count = ignore::WalkBuilder::new(&cwd)
44            .hidden(true)
45            .max_depth(Some(6))
46            .git_ignore(true)
47            .build()
48            .filter_map(std::result::Result::ok)
49            .filter(|e| {
50                e.file_type().is_some_and(|ft| ft.is_file())
51                    && e.path()
52                        .extension()
53                        .and_then(|ext| ext.to_str())
54                        .is_some_and(|ext| source_exts.contains(&ext))
55            })
56            .take(51)
57            .count();
58        count > 50
59    })
60}
61
/// Table of long identifiers and the short `α<N>` ids that stand in for them.
#[derive(Clone, Debug)]
pub struct SymbolMap {
    /// Full identifier → assigned short id.
    forward: HashMap<String, String>,
    /// Numeric suffix that the next newly registered identifier will receive.
    next_id: usize,
}
67
68impl Default for SymbolMap {
69    fn default() -> Self {
70        Self::new()
71    }
72}
73
74impl SymbolMap {
75    pub fn new() -> Self {
76        Self {
77            forward: HashMap::new(),
78            next_id: 1,
79        }
80    }
81
82    pub fn register(&mut self, identifier: &str) -> Option<String> {
83        if identifier.len() < MIN_IDENT_LENGTH {
84            return None;
85        }
86
87        if let Some(existing) = self.forward.get(identifier) {
88            return Some(existing.clone());
89        }
90
91        let short_id = format!("{SHORT_ID_PREFIX}{}", self.next_id);
92        self.next_id += 1;
93        self.forward
94            .insert(identifier.to_string(), short_id.clone());
95        Some(short_id)
96    }
97
98    pub fn apply(&self, text: &str) -> String {
99        if self.forward.is_empty() {
100            return text.to_string();
101        }
102
103        let mut sorted: Vec<(&String, &String)> = self.forward.iter().collect();
104        sorted.sort_by_key(|x| std::cmp::Reverse(x.0.len()));
105
106        let mut result = text.to_string();
107        for (long, short) in &sorted {
108            result = result.replace(long.as_str(), short.as_str());
109        }
110        result
111    }
112
113    pub fn format_table(&self) -> String {
114        if self.forward.is_empty() {
115            return String::new();
116        }
117
118        let mut entries: Vec<(&String, &String)> = self.forward.iter().collect();
119        entries.sort_by_key(|(_, v)| {
120            v.trim_start_matches(SHORT_ID_PREFIX)
121                .parse::<usize>()
122                .unwrap_or(0)
123        });
124
125        let mut table = String::from("\n§MAP:");
126        for (long, short) in &entries {
127            table.push_str(&format!("\n  {short}={long}"));
128        }
129        table
130    }
131
132    pub fn len(&self) -> usize {
133        self.forward.len()
134    }
135
136    pub fn is_empty(&self) -> bool {
137        self.forward.is_empty()
138    }
139}
140
141/// MAP entry cost in tokens: "  αN=identifier\n" ≈ short_id_tokens + ident_tokens + 2 (= and newline)
142const MAP_ENTRY_OVERHEAD: usize = 2;
143
144/// ROI-based decision: register only when total savings exceed the MAP entry cost.
145/// savings = occurrences * (tokens(ident) - tokens(short_id))
146/// cost    = tokens(ident) + tokens(short_id) + MAP_ENTRY_OVERHEAD
147pub fn should_register(identifier: &str, occurrences: usize, next_id: usize) -> bool {
148    if identifier.len() < MIN_IDENT_LENGTH {
149        return false;
150    }
151    let ident_tokens = count_tokens(identifier);
152    let short_id = format!("{SHORT_ID_PREFIX}{next_id}");
153    let short_tokens = count_tokens(&short_id);
154
155    let token_saving_per_use = ident_tokens.saturating_sub(short_tokens);
156    if token_saving_per_use == 0 {
157        return false;
158    }
159
160    let total_savings = occurrences * token_saving_per_use;
161    let entry_cost = ident_tokens + short_tokens + MAP_ENTRY_OVERHEAD;
162
163    total_savings > entry_cost
164}
165
166pub fn extract_identifiers(content: &str, ext: &str) -> Vec<String> {
167    let ident_re = static_regex!(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b");
168
169    let mut seen = HashMap::new();
170    for mat in ident_re.find_iter(content) {
171        let word = mat.as_str();
172        if word.len() >= MIN_IDENT_LENGTH && !is_keyword(word, ext) {
173            *seen.entry(word.to_string()).or_insert(0usize) += 1;
174        }
175    }
176
177    let mut next_id = 1usize;
178    let mut idents: Vec<(String, usize)> = seen
179        .into_iter()
180        .filter(|(ident, count)| {
181            let pass = should_register(ident, *count, next_id);
182            if pass {
183                next_id += 1;
184            }
185            pass
186        })
187        .collect();
188
189    idents.sort_by(|a, b| {
190        let savings_a = a.0.len() * a.1;
191        let savings_b = b.0.len() * b.1;
192        savings_b.cmp(&savings_a)
193    });
194
195    idents.into_iter().map(|(s, _)| s).collect()
196}
197
/// Reports whether `word` is on the reserved-word list for the language
/// identified by file extension `ext`.
///
/// Only `rs`, `ts`/`tsx`/`js`/`jsx` and `py` have a list; every other extension
/// yields `false`. Matching is exact and case-sensitive.
fn is_keyword(word: &str, ext: &str) -> bool {
    const RUST: &[&str] = &["continue", "default", "return", "struct", "unsafe", "where"];
    const JS_TS: &[&str] = &[
        "constructor",
        "arguments",
        "undefined",
        "prototype",
        "instanceof",
    ];
    const PYTHON: &[&str] = &["continue", "lambda", "return", "import", "class"];

    let reserved = match ext {
        "rs" => RUST,
        "ts" | "tsx" | "js" | "jsx" => JS_TS,
        "py" => PYTHON,
        _ => return false,
    };
    reserved.contains(&word)
}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Long enough to span many BPE tokens.
    const LONG_IDENT: &str = "authenticate_user_credentials_handler";

    #[test]
    fn test_should_register_short_ident_rejected() {
        // Below the minimum length, no occurrence count can justify an entry.
        for (ident, occurrences) in [("foo", 100), ("bar", 50), ("x", 1000)] {
            assert!(!should_register(ident, occurrences, 1));
        }
    }

    #[test]
    fn test_should_register_roi_positive() {
        // Five uses of a very long identifier pay for the MAP entry.
        assert!(should_register(LONG_IDENT, 5, 1));
    }

    #[test]
    fn test_should_register_roi_negative_single_use() {
        // A single use cannot recoup the MAP entry cost.
        assert!(!should_register(LONG_IDENT, 1, 1));
    }

    #[test]
    fn test_should_register_roi_scales_with_frequency() {
        let ident = "configuration_manager_instance";
        let accepted_when_rare = should_register(ident, 2, 1);
        let accepted_when_frequent = should_register(ident, 10, 1);
        // Anything accepted at low frequency must also be accepted at high frequency.
        if accepted_when_rare {
            assert!(accepted_when_frequent);
        }
    }

    #[test]
    fn test_extract_identifiers_roi_filtering() {
        // Five repetitions make the ROI positive; "short" is under the length floor.
        let mut content = [LONG_IDENT; 5].join(" ");
        content.push_str(" short");

        let found = extract_identifiers(&content, "rs");
        assert!(found.iter().any(|ident| ident == LONG_IDENT));
        assert!(!found.iter().any(|ident| ident == "short"));
    }

    #[test]
    fn test_register_returns_existing() {
        let mut symbols = SymbolMap::new();
        let initial = symbols.register("validateToken");
        let repeated = symbols.register("validateToken");
        assert_eq!(initial, repeated);
    }

    #[test]
    fn test_apply_replaces_identifiers() {
        let mut symbols = SymbolMap::new();
        symbols.register("validateToken");

        let rewritten = symbols.apply("call validateToken here");
        assert!(rewritten.contains("α1"));
        assert!(!rewritten.contains("validateToken"));
    }

    #[test]
    fn test_format_table_output() {
        let mut symbols = SymbolMap::new();
        symbols.register("validateToken");

        let legend = symbols.format_table();
        assert!(legend.contains("§MAP:"));
        assert!(legend.contains("α1=validateToken"));
    }
}