Skip to main content

lean_ctx/core/
symbol_map.rs

1use std::collections::HashMap;
2
3use crate::core::tokens::count_tokens;
4
/// Expands to an expression yielding a reference to a lazily compiled,
/// process-wide `regex::Regex` for the given pattern.
///
/// Each expansion site owns its own `OnceLock`, so the pattern is compiled the
/// first time that site is evaluated and reused afterwards. The pattern has to
/// be a literal because it is also spliced into the panic message via
/// `concat!`; an invalid pattern is treated as a programming error and panics.
macro_rules! static_regex {
    ($re_src:expr) => {{
        static COMPILED: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        let build = || {
            regex::Regex::new($re_src).expect(concat!("BUG: invalid static regex: ", $re_src))
        };
        COMPILED.get_or_init(build)
    }};
}
13
/// Identifiers whose byte length is below this threshold are never abbreviated.
const MIN_IDENT_LENGTH: usize = 6;
/// Leading character of every generated short id (`α1`, `α2`, …).
const SHORT_ID_PREFIX: char = 'α';

/// Table that assigns short `α<N>` aliases to long identifiers and rewrites
/// text to use those aliases.
#[derive(Debug, Clone)]
pub struct SymbolMap {
    /// Long identifier → short id.
    forward: HashMap<String, String>,
    /// Number that the next freshly created short id will carry.
    next_id: usize,
}

impl Default for SymbolMap {
    /// Same as [`SymbolMap::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl SymbolMap {
    /// Creates an empty map whose first short id will be `α1`.
    pub fn new() -> Self {
        Self {
            forward: HashMap::new(),
            next_id: 1,
        }
    }

    /// Registers `identifier` and returns its short id.
    ///
    /// Returns `None` (and registers nothing) when the identifier is shorter
    /// than `MIN_IDENT_LENGTH` bytes. Registering the same identifier again
    /// returns the id it already has instead of allocating a new one.
    pub fn register(&mut self, identifier: &str) -> Option<String> {
        if identifier.len() < MIN_IDENT_LENGTH {
            return None;
        }

        if let Some(existing) = self.forward.get(identifier) {
            return Some(existing.clone());
        }

        let short_id = format!("{SHORT_ID_PREFIX}{}", self.next_id);
        self.next_id += 1;
        self.forward
            .insert(identifier.to_string(), short_id.clone());
        Some(short_id)
    }

    /// Returns `text` with every whole-word occurrence of a registered
    /// identifier replaced by its short id.
    ///
    /// An occurrence that is glued to further identifier characters (letters,
    /// digits, `_`) is left untouched. Replacing inside a longer word would
    /// turn e.g. `validateToken2` into `α12`, which cannot be told apart from
    /// the alias of the twelfth symbol.
    ///
    /// Longer identifiers are substituted first so that a registered name that
    /// contains another registered name wins; equal lengths are ordered by
    /// name so the result never depends on hash-map iteration order.
    pub fn apply(&self, text: &str) -> String {
        if self.forward.is_empty() {
            return text.to_string();
        }

        let mut pairs: Vec<(&str, &str)> = self
            .forward
            .iter()
            .map(|(long, short)| (long.as_str(), short.as_str()))
            .collect();
        pairs.sort_unstable_by(|a, b| b.0.len().cmp(&a.0.len()).then_with(|| a.0.cmp(b.0)));

        let mut result = text.to_string();
        for (long, short) in pairs {
            // Cheap pre-check: skip the rebuild when the name is absent.
            if result.contains(long) {
                result = Self::replace_whole_words(&result, long, short);
            }
        }
        result
    }

    /// Renders the mapping as a `§MAP:` block with one `short=long` line per
    /// entry, ordered by the numeric part of the short id.
    ///
    /// Returns an empty string when nothing is registered.
    pub fn format_table(&self) -> String {
        if self.forward.is_empty() {
            return String::new();
        }

        let mut entries: Vec<(&str, &str)> = self
            .forward
            .iter()
            .map(|(long, short)| (long.as_str(), short.as_str()))
            .collect();
        // Numeric sort so that α2 comes before α10.
        entries.sort_by_cached_key(|(_, short)| {
            short
                .trim_start_matches(SHORT_ID_PREFIX)
                .parse::<usize>()
                .unwrap_or(0)
        });

        let mut table = String::from("\n§MAP:");
        for (long, short) in entries {
            table.push_str("\n  ");
            table.push_str(short);
            table.push('=');
            table.push_str(long);
        }
        table
    }

    /// Number of registered identifiers.
    pub fn len(&self) -> usize {
        self.forward.len()
    }

    /// `true` when no identifier has been registered.
    pub fn is_empty(&self) -> bool {
        self.forward.is_empty()
    }

    /// Whether `c` can be part of an identifier-like word.
    fn is_word_char(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }

    /// Replaces occurrences of `needle` in `haystack` that are not glued to
    /// neighbouring word characters.
    ///
    /// A side of the occurrence is only checked when the needle itself starts
    /// (or ends) with a word character on that side.
    fn replace_whole_words(haystack: &str, needle: &str, replacement: &str) -> String {
        let Some(first) = needle.chars().next() else {
            return haystack.to_string();
        };
        let guard_front = Self::is_word_char(first);
        let guard_back = needle.chars().next_back().is_some_and(Self::is_word_char);

        let mut out = String::with_capacity(haystack.len());
        let mut copied_to = 0; // haystack[..copied_to] is already reflected in `out`
        let mut search_from = 0;
        while let Some(offset) = haystack[search_from..].find(needle) {
            let start = search_from + offset;
            let end = start + needle.len();

            let glued_front = guard_front
                && haystack[..start]
                    .chars()
                    .next_back()
                    .is_some_and(Self::is_word_char);
            let glued_back =
                guard_back && haystack[end..].chars().next().is_some_and(Self::is_word_char);

            if glued_front || glued_back {
                // Step over one character only, so an overlapping valid
                // occurrence is still found.
                search_from = start + first.len_utf8();
                continue;
            }

            out.push_str(&haystack[copied_to..start]);
            out.push_str(replacement);
            copied_to = end;
            search_from = end;
        }
        out.push_str(&haystack[copied_to..]);
        out
    }
}
95
96/// MAP entry cost in tokens: "  αN=identifier\n" ≈ short_id_tokens + ident_tokens + 2 (= and newline)
97const MAP_ENTRY_OVERHEAD: usize = 2;
98
99/// ROI-based decision: register only when total savings exceed the MAP entry cost.
100/// savings = occurrences * (tokens(ident) - tokens(short_id))
101/// cost    = tokens(ident) + tokens(short_id) + MAP_ENTRY_OVERHEAD
102pub fn should_register(identifier: &str, occurrences: usize, next_id: usize) -> bool {
103    if identifier.len() < MIN_IDENT_LENGTH {
104        return false;
105    }
106    let ident_tokens = count_tokens(identifier);
107    let short_id = format!("{SHORT_ID_PREFIX}{next_id}");
108    let short_tokens = count_tokens(&short_id);
109
110    let token_saving_per_use = ident_tokens.saturating_sub(short_tokens);
111    if token_saving_per_use == 0 {
112        return false;
113    }
114
115    let total_savings = occurrences * token_saving_per_use;
116    let entry_cost = ident_tokens + short_tokens + MAP_ENTRY_OVERHEAD;
117
118    total_savings > entry_cost
119}
120
121pub fn extract_identifiers(content: &str, ext: &str) -> Vec<String> {
122    let ident_re = static_regex!(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b");
123
124    let mut seen = HashMap::new();
125    for mat in ident_re.find_iter(content) {
126        let word = mat.as_str();
127        if word.len() >= MIN_IDENT_LENGTH && !is_keyword(word, ext) {
128            *seen.entry(word.to_string()).or_insert(0usize) += 1;
129        }
130    }
131
132    let mut next_id = 1usize;
133    let mut idents: Vec<(String, usize)> = seen
134        .into_iter()
135        .filter(|(ident, count)| {
136            let pass = should_register(ident, *count, next_id);
137            if pass {
138                next_id += 1;
139            }
140            pass
141        })
142        .collect();
143
144    idents.sort_by(|a, b| {
145        let savings_a = a.0.len() * a.1;
146        let savings_b = b.0.len() * b.1;
147        savings_b.cmp(&savings_a)
148    });
149
150    idents.into_iter().map(|(s, _)| s).collect()
151}
152
/// Reports whether `word` is on the exclusion list for the file extension
/// `ext`.
///
/// Lists exist for `rs`, for `ts`/`tsx`/`js`/`jsx`, and for `py`; any other
/// extension has no list, so nothing is excluded for it.
fn is_keyword(word: &str, ext: &str) -> bool {
    const RUST_WORDS: &[&str] = &["continue", "default", "return", "struct", "unsafe", "where"];
    const SCRIPT_WORDS: &[&str] = &[
        "constructor",
        "arguments",
        "undefined",
        "prototype",
        "instanceof",
    ];
    const PYTHON_WORDS: &[&str] = &["continue", "lambda", "return", "import", "class"];

    let excluded: &[&str] = match ext {
        "rs" => RUST_WORDS,
        "ts" | "tsx" | "js" | "jsx" => SCRIPT_WORDS,
        "py" => PYTHON_WORDS,
        _ => return false,
    };
    excluded.contains(&word)
}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Long identifier shared by the ROI tests.
    const LONG_IDENT: &str = "authenticate_user_credentials_handler";

    /// Names below the minimum length are rejected however often they occur.
    #[test]
    fn test_should_register_short_ident_rejected() {
        for (ident, occurrences) in [("foo", 100), ("bar", 50), ("x", 1000)] {
            assert!(!should_register(ident, occurrences, 1));
        }
    }

    /// A very long identifier used five times is expected to repay its MAP entry.
    #[test]
    fn test_should_register_roi_positive() {
        let worthwhile = should_register(LONG_IDENT, 5, 1);
        assert!(worthwhile);
    }

    /// The same identifier used once is expected not to repay its MAP entry.
    #[test]
    fn test_should_register_roi_negative_single_use() {
        let worthwhile = should_register(LONG_IDENT, 1, 1);
        assert!(!worthwhile);
    }

    /// Accepting at a low frequency implies accepting at a higher one.
    #[test]
    fn test_should_register_roi_scales_with_frequency() {
        let ident = "configuration_manager_instance";
        let accepted_at = |occurrences| should_register(ident, occurrences, 1);
        assert!(accepted_at(10) || !accepted_at(2));
    }

    /// Extraction keeps the repeated long identifier and drops the short word.
    #[test]
    fn test_extract_identifiers_roi_filtering() {
        let mut content = vec![LONG_IDENT; 5].join(" ");
        content.push_str(" short");

        let result = extract_identifiers(&content, "rs");

        assert!(result.contains(&LONG_IDENT.to_string()));
        assert!(!result.contains(&"short".to_string()));
    }

    /// Registering the same name twice yields the same short id.
    #[test]
    fn test_register_returns_existing() {
        let mut map = SymbolMap::new();
        let ids = [map.register("validateToken"), map.register("validateToken")];
        assert_eq!(ids[0], ids[1]);
    }

    /// `apply` swaps the registered name for its short id.
    #[test]
    fn test_apply_replaces_identifiers() {
        let mut map = SymbolMap::new();
        map.register("validateToken");

        let rewritten = map.apply("call validateToken here");

        assert!(rewritten.contains("α1"));
        assert!(!rewritten.contains("validateToken"));
    }

    /// The table carries the header and a `short=long` entry.
    #[test]
    fn test_format_table_output() {
        let mut map = SymbolMap::new();
        map.register("validateToken");

        let rendered = map.format_table();

        for expected in ["§MAP:", "α1=validateToken"] {
            assert!(rendered.contains(expected));
        }
    }
}