// Source: lean_ctx/core/symbol_map.rs

1use std::collections::HashMap;
2
3use crate::core::tokens::count_tokens;
4
/// Minimum identifier length for an identifier to be eligible for a short alias.
/// Compared against `str::len`, so the unit is bytes, not characters.
const MIN_IDENT_LENGTH: usize = 6;
/// Character that starts every generated short ID; a running number follows it (e.g. `α1`).
const SHORT_ID_PREFIX: char = 'α';
7
8#[derive(Debug, Clone)]
9pub struct SymbolMap {
10    forward: HashMap<String, String>,
11    next_id: usize,
12}
13
14impl Default for SymbolMap {
15    fn default() -> Self {
16        Self::new()
17    }
18}
19
20impl SymbolMap {
21    pub fn new() -> Self {
22        Self {
23            forward: HashMap::new(),
24            next_id: 1,
25        }
26    }
27
28    pub fn register(&mut self, identifier: &str) -> Option<String> {
29        if identifier.len() < MIN_IDENT_LENGTH {
30            return None;
31        }
32
33        if let Some(existing) = self.forward.get(identifier) {
34            return Some(existing.clone());
35        }
36
37        let short_id = format!("{SHORT_ID_PREFIX}{}", self.next_id);
38        self.next_id += 1;
39        self.forward
40            .insert(identifier.to_string(), short_id.clone());
41        Some(short_id)
42    }
43
44    pub fn apply(&self, text: &str) -> String {
45        if self.forward.is_empty() {
46            return text.to_string();
47        }
48
49        let mut sorted: Vec<(&String, &String)> = self.forward.iter().collect();
50        sorted.sort_by_key(|x| std::cmp::Reverse(x.0.len()));
51
52        let mut result = text.to_string();
53        for (long, short) in &sorted {
54            result = result.replace(long.as_str(), short.as_str());
55        }
56        result
57    }
58
59    pub fn format_table(&self) -> String {
60        if self.forward.is_empty() {
61            return String::new();
62        }
63
64        let mut entries: Vec<(&String, &String)> = self.forward.iter().collect();
65        entries.sort_by_key(|(_, v)| {
66            v.trim_start_matches(SHORT_ID_PREFIX)
67                .parse::<usize>()
68                .unwrap_or(0)
69        });
70
71        let mut table = String::from("\n§MAP:");
72        for (long, short) in &entries {
73            table.push_str(&format!("\n  {short}={long}"));
74        }
75        table
76    }
77
78    pub fn len(&self) -> usize {
79        self.forward.len()
80    }
81
82    pub fn is_empty(&self) -> bool {
83        self.forward.is_empty()
84    }
85}
86
/// Fixed part of a MAP entry's token cost.
///
/// One entry is rendered as a new line of the form `  αN=identifier`, which costs roughly
/// `tokens(short_id) + tokens(identifier) + 2`; the 2 accounts for the `=` and the newline.
const MAP_ENTRY_OVERHEAD: usize = 2;
89
90/// ROI-based decision: register only when total savings exceed the MAP entry cost.
91/// savings = occurrences * (tokens(ident) - tokens(short_id))
92/// cost    = tokens(ident) + tokens(short_id) + MAP_ENTRY_OVERHEAD
93pub fn should_register(identifier: &str, occurrences: usize, next_id: usize) -> bool {
94    if identifier.len() < MIN_IDENT_LENGTH {
95        return false;
96    }
97    let ident_tokens = count_tokens(identifier);
98    let short_id = format!("{SHORT_ID_PREFIX}{next_id}");
99    let short_tokens = count_tokens(&short_id);
100
101    let token_saving_per_use = ident_tokens.saturating_sub(short_tokens);
102    if token_saving_per_use == 0 {
103        return false;
104    }
105
106    let total_savings = occurrences * token_saving_per_use;
107    let entry_cost = ident_tokens + short_tokens + MAP_ENTRY_OVERHEAD;
108
109    total_savings > entry_cost
110}
111
112pub fn extract_identifiers(content: &str, ext: &str) -> Vec<String> {
113    let ident_re = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b").unwrap();
114
115    let mut seen = HashMap::new();
116    for mat in ident_re.find_iter(content) {
117        let word = mat.as_str();
118        if word.len() >= MIN_IDENT_LENGTH && !is_keyword(word, ext) {
119            *seen.entry(word.to_string()).or_insert(0usize) += 1;
120        }
121    }
122
123    let mut next_id = 1usize;
124    let mut idents: Vec<(String, usize)> = seen
125        .into_iter()
126        .filter(|(ident, count)| {
127            let pass = should_register(ident, *count, next_id);
128            if pass {
129                next_id += 1;
130            }
131            pass
132        })
133        .collect();
134
135    idents.sort_by(|a, b| {
136        let savings_a = a.0.len() * a.1;
137        let savings_b = b.0.len() * b.1;
138        savings_b.cmp(&savings_a)
139    });
140
141    idents.into_iter().map(|(s, _)| s).collect()
142}
143
/// Reports whether `word` is on the exclusion list for the file extension `ext`.
///
/// Lists exist for `rs`, for `ts`/`tsx`/`js`/`jsx`, and for `py`; any other extension has
/// no exclusions, so the function returns `false`.
fn is_keyword(word: &str, ext: &str) -> bool {
    let reserved: &[&str] = match ext {
        "rs" => &["continue", "default", "return", "struct", "unsafe", "where"],
        "ts" | "tsx" | "js" | "jsx" => &[
            "constructor",
            "arguments",
            "undefined",
            "prototype",
            "instanceof",
        ],
        "py" => &["continue", "lambda", "return", "import", "class"],
        _ => &[],
    };
    reserved.contains(&word)
}
158
#[cfg(test)]
mod tests {
    use super::*;

    /// Long identifier shared by the ROI tests.
    const LONG_IDENT: &str = "authenticate_user_credentials_handler";

    /// Identifiers below the minimum length are rejected no matter how often they occur.
    #[test]
    fn test_should_register_short_ident_rejected() {
        for (ident, occurrences) in [("foo", 100), ("bar", 50), ("x", 1000)] {
            assert!(!should_register(ident, occurrences, 1));
        }
    }

    /// A very long identifier (many BPE tokens) used five times is worth aliasing.
    #[test]
    fn test_should_register_roi_positive() {
        let accepted = should_register(LONG_IDENT, 5, 1);
        assert!(accepted);
    }

    /// The same identifier used once does not cover the cost of its MAP entry.
    #[test]
    fn test_should_register_roi_negative_single_use() {
        let accepted = should_register(LONG_IDENT, 1, 1);
        assert!(!accepted);
    }

    /// Acceptance at a low frequency implies acceptance at a higher one.
    #[test]
    fn test_should_register_roi_scales_with_frequency() {
        let ident = "configuration_manager_instance";
        let passes_at_low = should_register(ident, 2, 1);
        let passes_at_high = should_register(ident, 10, 1);
        if passes_at_low {
            assert!(passes_at_high);
        }
    }

    /// Frequent long identifiers are extracted; short words are not.
    #[test]
    fn test_extract_identifiers_roi_filtering() {
        let content = format!("{} short", [LONG_IDENT; 5].join(" "));
        let result = extract_identifiers(&content, "rs");
        assert!(result.iter().any(|ident| ident == LONG_IDENT));
        assert!(result.iter().all(|ident| ident != "short"));
    }

    /// Registering twice yields the same alias.
    #[test]
    fn test_register_returns_existing() {
        let mut map = SymbolMap::new();
        let first = map.register("validateToken");
        let second = map.register("validateToken");
        assert_eq!(first, second);
    }

    /// `apply` swaps a registered identifier for its alias.
    #[test]
    fn test_apply_replaces_identifiers() {
        let mut map = SymbolMap::new();
        map.register("validateToken");
        let rewritten = map.apply("call validateToken here");
        assert!(rewritten.contains("α1"));
        assert!(!rewritten.contains("validateToken"));
    }

    /// The rendered table carries the header and the alias/identifier pair.
    #[test]
    fn test_format_table_output() {
        let mut map = SymbolMap::new();
        map.register("validateToken");
        let table = map.format_table();
        for needle in ["§MAP:", "α1=validateToken"] {
            assert!(table.contains(needle));
        }
    }
}