use std::collections::HashMap;
use strsim::normalized_levenshtein;
/// A group of prompts that were judged similar enough to be treated as one.
#[derive(Clone, Debug)]
pub struct PromptCluster {
    /// The prompt text that represents the whole cluster.
    pub canonical: String,
    /// Every distinct prompt text that was merged into this cluster,
    /// including the canonical one.
    pub variants: Vec<String>,
    /// Total number of occurrences across all variants.
    pub count: usize,
    /// The largest timestamp recorded for any variant in the cluster.
    pub latest_timestamp: i64,
}
/// Groups near-duplicate prompts into clusters using a string-similarity
/// threshold.
#[derive(Debug, Clone)]
pub struct FuzzyDeduper {
    /// Minimum similarity score (compared with `>=`) at which two prompts are
    /// considered the same.
    threshold: f64,
    /// Minimum length, in bytes, a normalized prompt must have to be kept.
    min_length: usize,
}
impl FuzzyDeduper {
    /// Creates a deduper.
    ///
    /// * `threshold` - minimum `normalized_levenshtein` score (compared with
    ///   `>=`) for two normalized prompts to be merged into one cluster.
    /// * `min_length` - minimum length in bytes (`str::len`) a normalized
    ///   prompt must have; shorter prompts are dropped.
    pub fn new(threshold: f64, min_length: usize) -> Self {
        Self {
            threshold,
            min_length,
        }
    }

    /// Clusters `(prompt, timestamp)` pairs into groups of similar prompts.
    ///
    /// Processing happens in three steps:
    ///
    /// 1. Each prompt is normalized (trimmed, lowercased, noise discarded) and
    ///    exact duplicates are collapsed, keeping an occurrence count and the
    ///    largest timestamp seen.
    /// 2. The distinct prompts are ordered by descending count, with ties
    ///    broken by prompt text so the result is reproducible.
    /// 3. Each prompt joins the first existing cluster whose canonical text it
    ///    is similar to; otherwise it starts a new cluster and becomes that
    ///    cluster's canonical text.
    ///
    /// The returned clusters are in creation order, which is not necessarily
    /// ordered by the final `count`; use [`FuzzyDeduper::sort_by_count`] or
    /// [`FuzzyDeduper::sort_by_latest`] to reorder them.
    pub fn cluster(&self, prompts: Vec<(String, i64)>) -> Vec<PromptCluster> {
        let mut counts: HashMap<String, (usize, i64)> = HashMap::new();
        for (prompt, timestamp) in prompts {
            let normalized = self.normalize(&prompt);
            if normalized.is_empty() || normalized.len() < self.min_length {
                continue;
            }
            // Seed the latest timestamp with the first observed value rather
            // than 0, so prompts whose timestamps are all negative are
            // reported correctly.
            counts
                .entry(normalized)
                .and_modify(|seen| {
                    seen.0 += 1;
                    seen.1 = seen.1.max(timestamp);
                })
                .or_insert((1, timestamp));
        }

        let mut items: Vec<(String, usize, i64)> = counts
            .into_iter()
            .map(|(text, (count, latest))| (text, count, latest))
            .collect();
        // HashMap iteration order is arbitrary, so sorting by count alone
        // would let ties land in a different order on every run and change
        // which prompt becomes canonical. The text tie-break makes the order
        // total (the texts are unique map keys), so an unstable sort is safe.
        items.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

        let mut clusters: Vec<PromptCluster> = Vec::new();
        for (prompt, count, timestamp) in items {
            let matching = clusters
                .iter_mut()
                .find(|cluster| self.is_similar(&prompt, &cluster.canonical));
            match matching {
                Some(cluster) => {
                    cluster.variants.push(prompt);
                    cluster.count += count;
                    cluster.latest_timestamp = cluster.latest_timestamp.max(timestamp);
                }
                None => clusters.push(PromptCluster {
                    canonical: prompt.clone(),
                    variants: vec![prompt],
                    count,
                    latest_timestamp: timestamp,
                }),
            }
        }
        clusters
    }

    /// Sorts clusters in place by descending `count`. The sort is stable, so
    /// clusters with equal counts keep their relative order.
    pub fn sort_by_count(clusters: &mut [PromptCluster]) {
        clusters.sort_by(|a, b| b.count.cmp(&a.count));
    }

    /// Sorts clusters in place by descending `latest_timestamp`. The sort is
    /// stable, so clusters with equal timestamps keep their relative order.
    pub fn sort_by_latest(clusters: &mut [PromptCluster]) {
        clusters.sort_by(|a, b| b.latest_timestamp.cmp(&a.latest_timestamp));
    }

    /// Trims and lowercases `s`, returning an empty string when the text
    /// matches one of the noise patterns below.
    ///
    /// The patterns are matched against the lowercased text, which is why
    /// they are all written in lowercase.
    fn normalize(&self, s: &str) -> String {
        // Substrings that mark the text as noise wherever they appear.
        const NOISE_SUBSTRINGS: &[&str] = &[
            "import ",
            "export ",
            "const ",
            "function ",
            "interface ",
            ".js:",
            ".ts:",
            ".tsx:",
            "chunk-",
            "requestanimationframe",
            "installhook",
        ];
        // Prefixes that mark the text as noise when it starts with them.
        const NOISE_PREFIXES: &[&str] = &["//", "/*", "```", "[", "{", "<"];

        let s = s.trim().to_lowercase();
        let is_noise = NOISE_PREFIXES.iter().any(|prefix| s.starts_with(*prefix))
            || NOISE_SUBSTRINGS.iter().any(|needle| s.contains(*needle));
        if is_noise {
            String::new()
        } else {
            s
        }
    }

    /// Returns `true` when `a` and `b` are equal or their
    /// `normalized_levenshtein` score is at least `self.threshold`.
    ///
    /// Pairs whose byte lengths differ by more than a factor of two are
    /// rejected up front without computing the edit distance.
    fn is_similar(&self, a: &str, b: &str) -> bool {
        if a == b {
            return true;
        }
        // `a != b` here, so at least one string is non-empty and the
        // denominator cannot be zero.
        let shorter = a.len().min(b.len()) as f64;
        let longer = a.len().max(b.len()) as f64;
        if shorter / longer < 0.5 {
            return false;
        }
        normalized_levenshtein(a, b) >= self.threshold
    }
}
impl Default for FuzzyDeduper {
fn default() -> Self {
Self::new(0.8, 4)
}
}