Skip to main content

lean_ctx/core/
symbol_map.rs

1use std::collections::HashMap;
2
3use crate::core::tokens::count_tokens;
4
/// Identifiers shorter than this many bytes are never given an alias.
const MIN_IDENT_LENGTH: usize = 6;
/// First character of every generated alias (`α1`, `α2`, ...).
const SHORT_ID_PREFIX: char = 'α';

/// Table of long identifiers and the short `αN` aliases assigned to them.
///
/// Aliases are handed out in registration order, starting at `α1`.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct SymbolMap {
    /// Long identifier -> short alias.
    forward: HashMap<String, String>,
    /// Numeric suffix the next newly registered identifier will receive.
    next_id: usize,
}

impl Default for SymbolMap {
    /// Same as [`SymbolMap::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl SymbolMap {
    /// Creates an empty map whose first alias will be `α1`.
    pub fn new() -> Self {
        Self {
            forward: HashMap::new(),
            next_id: 1,
        }
    }

    /// Returns the alias for `identifier`, allocating a new one on first sight.
    ///
    /// Returns `None` (and registers nothing) when `identifier` is shorter than
    /// `MIN_IDENT_LENGTH` bytes. Registering the same identifier again returns
    /// the alias it already has.
    pub fn register(&mut self, identifier: &str) -> Option<String> {
        if identifier.len() < MIN_IDENT_LENGTH {
            return None;
        }

        if let Some(existing) = self.forward.get(identifier) {
            return Some(existing.clone());
        }

        let short_id = format!("{SHORT_ID_PREFIX}{}", self.next_id);
        self.next_id += 1;
        self.forward
            .insert(identifier.to_string(), short_id.clone());
        Some(short_id)
    }

    /// Returns `text` with every whole-word occurrence of a registered
    /// identifier replaced by its alias.
    ///
    /// An occurrence that is glued to another word character (alphanumeric or
    /// `_`) is left untouched. Without that rule `handler2` would turn into
    /// `α12`, which is indistinguishable from alias number 12, and unrelated
    /// longer names such as `my_handler` would be partially rewritten.
    ///
    /// Identifiers are processed longest first; identifiers of equal length are
    /// processed in lexicographic order so the output does not depend on hash
    /// map iteration order.
    pub fn apply(&self, text: &str) -> String {
        if self.forward.is_empty() {
            return text.to_string();
        }

        let mut ordered: Vec<(&str, &str)> = self
            .forward
            .iter()
            .map(|(long, short)| (long.as_str(), short.as_str()))
            .collect();
        ordered.sort_unstable_by(|a, b| b.0.len().cmp(&a.0.len()).then_with(|| a.0.cmp(b.0)));

        let mut result = text.to_string();
        for (long, short) in ordered {
            if let Some(updated) = Self::replace_whole(&result, long, short) {
                result = updated;
            }
        }
        result
    }

    /// Renders the alias table as `"\n§MAP:"` followed by one
    /// `"\n  αN=identifier"` line per entry, ordered by alias number.
    ///
    /// Returns an empty string when nothing is registered.
    pub fn format_table(&self) -> String {
        if self.forward.is_empty() {
            return String::new();
        }

        let mut entries: Vec<(&str, &str)> = self
            .forward
            .iter()
            .map(|(long, short)| (long.as_str(), short.as_str()))
            .collect();
        // Cached key: each alias suffix is parsed once instead of per comparison.
        entries.sort_by_cached_key(|(_, short)| {
            short
                .trim_start_matches(SHORT_ID_PREFIX)
                .parse::<usize>()
                .unwrap_or(0)
        });

        let mut table = String::from("\n§MAP:");
        for (long, short) in entries {
            table.push_str("\n  ");
            table.push_str(short);
            table.push('=');
            table.push_str(long);
        }
        table
    }

    /// Number of registered identifiers.
    #[allow(dead_code)]
    pub fn len(&self) -> usize {
        self.forward.len()
    }

    /// `true` when no identifier has been registered.
    #[allow(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.forward.is_empty()
    }

    /// Characters that can continue an identifier-like word.
    fn is_word_char(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }

    /// Replaces the occurrences of `needle` in `haystack` that are not glued to
    /// a neighbouring word character; returns `None` when nothing was replaced.
    ///
    /// A side of `needle` is only guarded when `needle` itself starts/ends with
    /// a word character on that side.
    fn replace_whole(haystack: &str, needle: &str, replacement: &str) -> Option<String> {
        if needle.is_empty() {
            return None;
        }
        let guard_front = needle.chars().next().map_or(false, Self::is_word_char);
        let guard_back = needle
            .chars()
            .next_back()
            .map_or(false, Self::is_word_char);

        let mut rewritten: Option<String> = None;
        let mut copied = 0; // bytes of `haystack` already emitted
        let mut cursor = 0; // where the next search starts
        while let Some(offset) = haystack[cursor..].find(needle) {
            let start = cursor + offset;
            let end = start + needle.len();

            let glued_front = guard_front
                && haystack[..start]
                    .chars()
                    .next_back()
                    .map_or(false, Self::is_word_char);
            let glued_back = guard_back
                && haystack[end..]
                    .chars()
                    .next()
                    .map_or(false, Self::is_word_char);

            if glued_front || glued_back {
                // Part of a longer word: step one character forward and retry.
                cursor = start + haystack[start..].chars().next().map_or(1, char::len_utf8);
                continue;
            }

            let out = rewritten.get_or_insert_with(|| String::with_capacity(haystack.len()));
            out.push_str(&haystack[copied..start]);
            out.push_str(replacement);
            copied = end;
            cursor = end;
        }

        rewritten.map(|mut out| {
            out.push_str(&haystack[copied..]);
            out
        })
    }
}
89
90/// MAP entry cost in tokens: "  αN=identifier\n" ≈ short_id_tokens + ident_tokens + 2 (= and newline)
91const MAP_ENTRY_OVERHEAD: usize = 2;
92
93/// ROI-based decision: register only when total savings exceed the MAP entry cost.
94/// savings = occurrences * (tokens(ident) - tokens(short_id))
95/// cost    = tokens(ident) + tokens(short_id) + MAP_ENTRY_OVERHEAD
96pub fn should_register(identifier: &str, occurrences: usize, next_id: usize) -> bool {
97    if identifier.len() < MIN_IDENT_LENGTH {
98        return false;
99    }
100    let ident_tokens = count_tokens(identifier);
101    let short_id = format!("{SHORT_ID_PREFIX}{next_id}");
102    let short_tokens = count_tokens(&short_id);
103
104    let token_saving_per_use = ident_tokens.saturating_sub(short_tokens);
105    if token_saving_per_use == 0 {
106        return false;
107    }
108
109    let total_savings = occurrences * token_saving_per_use;
110    let entry_cost = ident_tokens + short_tokens + MAP_ENTRY_OVERHEAD;
111
112    total_savings > entry_cost
113}
114
115pub fn extract_identifiers(content: &str, ext: &str) -> Vec<String> {
116    let ident_re = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b").unwrap();
117
118    let mut seen = HashMap::new();
119    for mat in ident_re.find_iter(content) {
120        let word = mat.as_str();
121        if word.len() >= MIN_IDENT_LENGTH && !is_keyword(word, ext) {
122            *seen.entry(word.to_string()).or_insert(0usize) += 1;
123        }
124    }
125
126    let mut next_id = 1usize;
127    let mut idents: Vec<(String, usize)> = seen
128        .into_iter()
129        .filter(|(ident, count)| {
130            let pass = should_register(ident, *count, next_id);
131            if pass {
132                next_id += 1;
133            }
134            pass
135        })
136        .collect();
137
138    idents.sort_by(|a, b| {
139        let savings_a = a.0.len() * a.1;
140        let savings_b = b.0.len() * b.1;
141        savings_b.cmp(&savings_a)
142    });
143
144    idents.into_iter().map(|(s, _)| s).collect()
145}
146
/// Tells whether `word` is on the reserved-word list for the file extension `ext`.
///
/// Only the `rs`, `ts`/`tsx`/`js`/`jsx` and `py` extensions have a list; any other
/// extension yields `false`. Matching is exact and case-sensitive.
fn is_keyword(word: &str, ext: &str) -> bool {
    const RUST_WORDS: &[&str] = &["continue", "default", "return", "struct", "unsafe", "where"];
    const SCRIPT_WORDS: &[&str] = &[
        "constructor",
        "arguments",
        "undefined",
        "prototype",
        "instanceof",
    ];
    const PYTHON_WORDS: &[&str] = &["continue", "lambda", "return", "import", "class"];

    let reserved = match ext {
        "rs" => RUST_WORDS,
        "ts" | "tsx" | "js" | "jsx" => SCRIPT_WORDS,
        "py" => PYTHON_WORDS,
        _ => return false,
    };
    reserved.contains(&word)
}
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Long snake_case name shared by the ROI tests.
    const LONG_IDENT: &str = "authenticate_user_credentials_handler";

    /// Identifiers below the minimum length are rejected however often they occur.
    #[test]
    fn test_should_register_short_ident_rejected() {
        for (ident, occurrences) in [("foo", 100), ("bar", 50), ("x", 1000)] {
            assert!(!should_register(ident, occurrences, 1));
        }
    }

    /// A very long identifier used five times is expected to pay for its MAP entry.
    #[test]
    fn test_should_register_roi_positive() {
        let accepted = should_register(LONG_IDENT, 5, 1);
        assert!(accepted);
    }

    /// The same identifier used once is expected to cost more than it saves.
    #[test]
    fn test_should_register_roi_negative_single_use() {
        let accepted = should_register(LONG_IDENT, 1, 1);
        assert!(!accepted);
    }

    /// Acceptance must be monotonic in frequency: passing at 2 uses implies passing at 10.
    #[test]
    fn test_should_register_roi_scales_with_frequency() {
        let ident = "configuration_manager_instance";
        let passes_at_low = should_register(ident, 2, 1);
        let passes_at_high = should_register(ident, 10, 1);
        if passes_at_low {
            assert!(passes_at_high);
        }
    }

    /// Extraction keeps the frequent long identifier and drops the short word.
    #[test]
    fn test_extract_identifiers_roi_filtering() {
        let mut content = vec![LONG_IDENT; 5].join(" ");
        content.push_str(" short");

        let result = extract_identifiers(&content, "rs");

        assert!(result.iter().any(|found| found == LONG_IDENT));
        assert!(!result.iter().any(|found| found == "short"));
    }

    /// Registering the same identifier twice yields the same alias.
    #[test]
    fn test_register_returns_existing() {
        let mut map = SymbolMap::new();
        let first = map.register("validateToken");
        let second = map.register("validateToken");
        assert_eq!(first, second);
    }

    /// `apply` substitutes the alias for the registered identifier.
    #[test]
    fn test_apply_replaces_identifiers() {
        let mut map = SymbolMap::new();
        map.register("validateToken");

        let rewritten = map.apply("call validateToken here");

        assert!(rewritten.contains("α1"));
        assert!(!rewritten.contains("validateToken"));
    }

    /// The rendered table carries the header and an `alias=identifier` row.
    #[test]
    fn test_format_table_output() {
        let mut map = SymbolMap::new();
        map.register("validateToken");

        let table = map.format_table();

        for expected in ["§MAP:", "α1=validateToken"] {
            assert!(table.contains(expected));
        }
    }
}