use super::estimate_tokens;
use std::collections::{HashMap, HashSet};
const MIN_IDENT_LENGTH: usize = 6;
pub struct AnchorGenerator {
index: usize,
}
impl Default for AnchorGenerator {
fn default() -> Self {
Self::new()
}
}
impl AnchorGenerator {
pub fn new() -> Self {
Self { index: 0 }
}
}
impl Iterator for AnchorGenerator {
type Item = String;
fn next(&mut self) -> Option<Self::Item> {
let mut num = self.index;
self.index += 1;
let mut result = String::new();
loop {
let rem = num % 52;
let chr = if rem < 26 {
(b'A' + rem as u8) as char
} else {
(b'a' + (rem - 26) as u8) as char
};
result.insert(0, chr);
if num < 52 {
break;
}
num = (num / 52) - 1;
}
Some(result)
}
}
#[derive(Debug, Clone)]
pub struct SymbolMap {
forward: HashMap<String, String>,
existing_words: HashSet<String>,
}
impl SymbolMap {
pub fn new(content: &str) -> Self {
let ident_re = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b").unwrap();
let mut existing_words = HashSet::new();
for mat in ident_re.find_iter(content) {
existing_words.insert(mat.as_str().to_string());
}
Self {
forward: HashMap::new(),
existing_words,
}
}
pub fn register(&mut self, identifier: &str) -> Option<String> {
if identifier.len() < MIN_IDENT_LENGTH {
return None;
}
if let Some(existing) = self.forward.get(identifier) {
return Some(existing.clone());
}
let mut generator = AnchorGenerator::new();
loop {
let short_id = generator.next().unwrap();
if !self.existing_words.contains(&short_id) {
self.forward
.insert(identifier.to_string(), short_id.clone());
self.existing_words.insert(short_id.clone());
return Some(short_id);
}
}
}
pub fn apply(&self, text: &str) -> String {
if self.forward.is_empty() {
return text.to_string();
}
let mut sorted: Vec<(&String, &String)> = self.forward.iter().collect();
sorted.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
let mut result = text.to_string();
for (long, short) in &sorted {
result = result.replace(long.as_str(), short.as_str());
}
result
}
pub fn format_table(&self) -> String {
if self.forward.is_empty() {
return String::new();
}
let mut entries: Vec<(&String, &String)> = self.forward.iter().collect();
entries.sort_by(|(_, a), (_, b)| a.len().cmp(&b.len()).then(a.cmp(b)));
let mut table = String::from("\n[MAP]:");
for (long, short) in &entries {
table.push_str(&format!("\n {}={}", short, long));
}
table
}
pub fn len(&self) -> usize {
self.forward.len()
}
pub fn is_empty(&self) -> bool {
self.forward.is_empty()
}
}
const MAP_ENTRY_OVERHEAD: usize = 2;
pub fn should_register(identifier: &str, occurrences: usize) -> bool {
if identifier.len() < MIN_IDENT_LENGTH {
return false;
}
let ident_tokens = estimate_tokens(identifier);
let short_tokens = 1;
let token_saving_per_use = ident_tokens.saturating_sub(short_tokens);
if token_saving_per_use == 0 {
return false;
}
let total_savings = occurrences * token_saving_per_use;
let entry_cost = ident_tokens + short_tokens + MAP_ENTRY_OVERHEAD;
total_savings > entry_cost
}
fn is_keyword(word: &str, ext: &str) -> bool {
match ext {
"rs" => matches!(
word,
"continue" | "default" | "return" | "struct" | "unsafe" | "where" | "match" | "impl"
),
"ts" | "tsx" | "js" | "jsx" => matches!(
word,
"constructor" | "arguments" | "undefined" | "prototype" | "instanceof" | "function"
),
"py" => matches!(
word,
"continue" | "lambda" | "return" | "import" | "class" | "def"
),
"go" => matches!(
word,
"continue" | "default" | "return" | "struct" | "interface" | "func"
),
_ => false,
}
}
pub fn extract_identifiers(content: &str, ext: &str) -> Vec<String> {
let ident_re = regex::Regex::new(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b").unwrap();
let mut seen = HashMap::new();
for mat in ident_re.find_iter(content) {
let word = mat.as_str();
if word.len() >= MIN_IDENT_LENGTH && !is_keyword(word, ext) {
*seen.entry(word.to_string()).or_insert(0usize) += 1;
}
}
let mut idents: Vec<(String, usize)> = seen
.into_iter()
.filter(|(ident, count)| should_register(ident, *count))
.collect();
idents.sort_by(|a, b| {
let savings_a = a.0.len() * a.1;
let savings_b = b.0.len() * b.1;
savings_b.cmp(&savings_a)
});
idents.into_iter().map(|(s, _)| s).collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_anchor_generator() {
let mut gen = AnchorGenerator::new();
assert_eq!(gen.next().unwrap(), "A");
assert_eq!(gen.next().unwrap(), "B");
let mut gen2 = AnchorGenerator::new();
for _ in 0..26 {
gen2.next();
}
assert_eq!(gen2.next().unwrap(), "a");
for _ in 0..25 {
gen2.next();
}
assert_eq!(gen2.next().unwrap(), "AA");
assert_eq!(gen2.next().unwrap(), "AB");
}
#[test]
fn test_symbol_map_apply() {
let content = "fn my_long_function_name() {}";
let mut map = SymbolMap::new(content);
map.register("my_long_function_name");
let result = map.apply(content);
assert_eq!(result, "fn A() {}");
assert!(map.format_table().contains("A=my_long_function_name"));
}
#[test]
fn test_collision_prevention() {
let content = "let A = 0; let B = 1; let C = 2; let custom_long_var = 5;";
let mut map = SymbolMap::new(content);
let assigned = map.register("custom_long_var").unwrap();
assert_eq!(assigned, "D"); }
#[test]
fn test_keyword_exclusion() {
assert!(is_keyword("continue", "rs"));
assert!(is_keyword("interface", "go"));
assert!(!is_keyword("my_var", "rs"));
let content = "fn continue() {} fn my_custom_ident() {}";
let idents = extract_identifiers(content, "rs");
assert!(!idents.contains(&"continue".to_string()));
}
}