use std::path::PathBuf;
use rustc_hash::{FxHashMap, FxHashSet};
use xxhash_rust::xxh3::xxh3_64;
use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
pub const FINGERPRINT_PREFIX: &str = "dup:";
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CloneFingerprintKey {
representative_fragment: String,
token_count: usize,
line_count: usize,
instance_count: usize,
first_file: Option<PathBuf>,
first_start_line: usize,
first_end_line: usize,
}
impl CloneFingerprintKey {
#[must_use]
pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
let first = instances.first();
Self {
representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
token_count,
line_count,
instance_count: instances.len(),
first_file: first.map(|inst| inst.file.clone()),
first_start_line: first.map_or(0, |inst| inst.start_line),
first_end_line: first.map_or(0, |inst| inst.end_line),
}
}
fn from_group(group: &CloneGroup) -> Self {
Self::from_parts(&group.instances, group.token_count, group.line_count)
}
fn representative_fragment(&self) -> &str {
&self.representative_fragment
}
}
#[derive(Debug, Clone)]
pub struct CloneFingerprintSet {
by_key: FxHashMap<CloneFingerprintKey, String>,
key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
}
impl CloneFingerprintSet {
#[must_use]
pub fn from_groups(groups: &[CloneGroup]) -> Self {
let entries: Vec<_> = groups
.iter()
.map(|group| {
let key = CloneFingerprintKey::from_group(group);
let hash = hash_fragment(key.representative_fragment());
(key, hash)
})
.collect();
Self::from_hashed_entries(&entries)
}
#[must_use]
pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
}
#[must_use]
pub fn fingerprint_for_parts(
&self,
instances: &[CloneInstance],
token_count: usize,
line_count: usize,
) -> String {
self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
instances,
token_count,
line_count,
))
}
#[must_use]
pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
self.by_key
.get(key)
.cloned()
.unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
}
#[must_use]
pub fn find_group<'a>(
&self,
groups: &'a [CloneGroup],
fingerprint: &str,
) -> Option<&'a CloneGroup> {
let key = self.key_by_fingerprint.get(fingerprint)?;
groups
.iter()
.find(|group| CloneFingerprintKey::from_group(group) == *key)
}
fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
for (_, hash) in entries {
*short_counts.entry(*hash as u32).or_insert(0) += 1;
*full_counts.entry(*hash).or_insert(0) += 1;
}
let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
let mut by_key = FxHashMap::default();
let mut key_by_fingerprint = FxHashMap::default();
for (key, hash) in entries {
let short = *hash as u32;
let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
short_handle
} else {
ambiguous_short_handles.insert(short_handle);
let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
if full_counts.get(hash).copied().unwrap_or(0) == 1 {
full_handle
} else {
let ordinal = full_ordinals.entry(*hash).or_insert(0);
*ordinal += 1;
format!("{full_handle}-{ordinal}")
}
};
key_by_fingerprint.insert(fingerprint.clone(), key.clone());
by_key.insert(key.clone(), fingerprint);
}
for handle in ambiguous_short_handles {
key_by_fingerprint.remove(&handle);
}
Self {
by_key,
key_by_fingerprint,
}
}
}
#[must_use]
pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
fingerprint_for_fragment(representative)
}
#[must_use]
pub fn fingerprint_for_fragment(fragment: &str) -> String {
let hash = hash_fragment(fragment);
format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
}
fn hash_fragment(fragment: &str) -> u64 {
if fragment.as_bytes().contains(&b'\r') {
xxh3_64(fragment.replace('\r', "").as_bytes())
} else {
xxh3_64(fragment.as_bytes())
}
}
#[must_use]
pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
RefactoringSuggestion {
kind: RefactoringKind::ExtractFunction,
description: format!(
"Extract the shared {}-line block into one function and call it from {} sites",
group.line_count,
group.instances.len(),
),
estimated_savings,
}
}
#[must_use]
pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
for word in identifier_words(fragment) {
if is_generic_identifier(word) {
continue;
}
*counts.entry(word).or_insert(0) += 1;
}
let mut candidates: Vec<_> = counts
.into_iter()
.map(|(word, count)| IdentifierCandidate {
word,
count,
score: identifier_score(word, count),
})
.collect();
candidates.sort_by(|a, b| {
b.score
.cmp(&a.score)
.then_with(|| b.count.cmp(&a.count))
.then_with(|| a.word.cmp(b.word))
});
let best = candidates.first()?;
if best.count < 2 {
return None;
}
let runner_up = candidates.get(1);
if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
return None;
}
if is_plain_single_token(best.word) {
let next_count = runner_up.map_or(0, |candidate| candidate.count);
if best.count < 3 || best.count < next_count + 2 {
return None;
}
}
Some(best.word.to_string())
}
#[derive(Debug)]
struct IdentifierCandidate<'a> {
word: &'a str,
count: usize,
score: usize,
}
fn identifier_score(word: &str, count: usize) -> usize {
let quality_bonus = if has_identifier_separator_or_case_transition(word) {
5
} else if word.chars().count() >= 8 {
2
} else {
0
};
count * 5 + quality_bonus
}
fn is_plain_single_token(word: &str) -> bool {
!has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
}
fn has_identifier_separator_or_case_transition(word: &str) -> bool {
word.contains('_')
|| word.contains('$')
|| word
.chars()
.collect::<Vec<_>>()
.windows(2)
.any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
}
fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
source
.split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
.filter(|word| {
!word.is_empty()
&& word
.chars()
.next()
.is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
})
}
fn is_generic_identifier(word: &str) -> bool {
if word.chars().count() == 1 {
return true;
}
matches!(
word,
"data" | "result" | "results" | "item" | "items" | "value" | "values" | "val"
| "obj" | "object" | "arr" | "array" | "list" | "map" | "set" | "key" | "keys"
| "tmp" | "temp" | "acc" | "cur" | "curr" | "prev" | "next" | "node" | "el"
| "elem" | "element" | "args" | "arg" | "opts" | "options" | "params" | "param"
| "props" | "ctx" | "context" | "res" | "req" | "err" | "error" | "fn" | "cb"
| "callback" | "out" | "input" | "output" | "name" | "id" | "index" | "idx"
| "x" | "y" | "z" | "i" | "j" | "k" | "n" | "m" | "a" | "b" | "c" | "e" | "_"
| "const" | "let" | "var" | "function" | "return" | "if" | "else" | "for"
| "while" | "do" | "switch" | "case" | "break" | "continue" | "new" | "this"
| "true" | "false" | "null" | "undefined" | "void" | "typeof" | "instanceof"
| "in" | "of" | "class" | "extends" | "super" | "import" | "export" | "from"
| "default" | "async" | "await" | "yield" | "type" | "interface" | "enum"
| "as" | "is" | "keyof" | "readonly" | "public" | "private" | "protected"
| "static" | "get" | "delete" | "throw" | "try" | "catch" | "finally"
| "string" | "number" | "boolean" | "any" | "unknown" | "never" | "bigint"
| "symbol"
| "Math" | "JSON" | "Object" | "Array" | "Promise" | "BigInt" | "Number"
| "String" | "Boolean" | "Symbol" | "RegExp" | "Date"
)
}
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use super::*;
fn instance(fragment: &str) -> CloneInstance {
CloneInstance {
file: PathBuf::from("a.ts"),
start_line: 1,
end_line: 5,
start_col: 0,
end_col: 0,
fragment: fragment.to_string(),
}
}
fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
CloneGroup {
instances: fragments.iter().map(|f| instance(f)).collect(),
token_count: 40,
line_count,
}
}
#[test]
fn fingerprint_is_stable_and_prefixed() {
let g = group(&["foo(bar)", "foo(baz)"], 3);
let fp1 = clone_fingerprint(&g.instances);
let fp2 = clone_fingerprint(&g.instances);
assert_eq!(fp1, fp2);
assert!(fp1.starts_with("dup:"));
assert_eq!(fp1.len(), "dup:".len() + 8);
}
#[test]
fn fingerprint_is_sibling_stable() {
let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
let before = clone_fingerprint(&group_a.instances);
let _group_b_edited = group(&["totallyDifferentBody()"], 2);
let after = clone_fingerprint(&group_a.instances);
assert_eq!(before, after);
}
#[test]
fn fingerprint_differs_for_different_content() {
let a = group(&["alpha()"], 2);
let b = group(&["beta()"], 2);
assert_ne!(
clone_fingerprint(&a.instances),
clone_fingerprint(&b.instances)
);
}
#[test]
fn fingerprint_set_widens_only_colliding_short_handles() {
let a = group(&["alpha()"], 2);
let b = group(&["beta()"], 2);
let c = group(&["gamma()"], 2);
let entries = vec![
(
CloneFingerprintKey::from_group(&a),
0x0000_0001_1234_5678_u64,
),
(
CloneFingerprintKey::from_group(&b),
0x0000_0002_1234_5678_u64,
),
(
CloneFingerprintKey::from_group(&c),
0x0000_0003_8765_4321_u64,
),
];
let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
assert_eq!(
fingerprints.fingerprint_for_group(&a),
"dup:0000000112345678"
);
assert_eq!(
fingerprints.fingerprint_for_group(&b),
"dup:0000000212345678"
);
assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
assert!(
fingerprints
.find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
.is_none()
);
assert_eq!(
fingerprints
.find_group(&[a, b, c], "dup:0000000212345678")
.and_then(|group| group.instances.first())
.map(|inst| inst.fragment.as_str()),
Some("beta()")
);
}
#[test]
fn fingerprint_set_suffixes_full_hash_collisions() {
let a = group(&["alpha()"], 2);
let b = group(&["beta()"], 2);
let entries = vec![
(
CloneFingerprintKey::from_group(&a),
0x0000_0001_1234_5678_u64,
),
(
CloneFingerprintKey::from_group(&b),
0x0000_0001_1234_5678_u64,
),
];
let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
assert_eq!(
fingerprints.fingerprint_for_group(&a),
"dup:0000000112345678-1"
);
assert_eq!(
fingerprints.fingerprint_for_group(&b),
"dup:0000000112345678-2"
);
assert!(
fingerprints
.find_group(&[a.clone(), b.clone()], "dup:12345678")
.is_none()
);
assert!(
fingerprints
.find_group(&[a, b], "dup:0000000112345678")
.is_none()
);
}
#[test]
fn group_suggestion_savings_is_lines_times_extra_copies() {
let g = group(&["x", "x", "x"], 10); let suggestion = group_refactoring_suggestion(&g);
assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
assert_eq!(suggestion.estimated_savings, 20); }
#[test]
fn dominant_identifier_picks_repeated_domain_name() {
let g = group(
&["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
3,
);
assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
}
#[test]
fn dominant_identifier_none_on_generic() {
let g = group(&["const data = result.map((item) => item.value);"], 3);
assert_eq!(dominant_identifier(&g), None);
}
#[test]
fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
let g = group(
&["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
4,
);
assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
assert_eq!(dominant_identifier(&only_keywords), None);
let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
assert_eq!(dominant_identifier(&g_global), None);
}
#[test]
fn dominant_identifier_none_on_single_letter_type_param() {
let g = group(
&["function id<T>(x: T): T { const a: T = x; return a as T; }"],
3,
);
assert_eq!(dominant_identifier(&g), None);
}
#[test]
fn dominant_identifier_none_on_tie() {
let g = group(&["alpha(); beta();"], 2); assert_eq!(dominant_identifier(&g), None);
}
#[test]
fn dominant_identifier_prefers_structured_names() {
let g = group(
&["parseSchema(input); parseSchema(cache); helper(); helper();"],
3,
);
assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
}
#[test]
fn dominant_identifier_requires_plain_token_margin() {
let low_signal = group(&["schema(); schema(); parseUser();"], 3);
assert_eq!(dominant_identifier(&low_signal), None);
let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
}
#[test]
fn dominant_identifier_is_stable_across_word_order() {
let first = group(
&["helper(); parseSchema(input); helper(); parseSchema(cache);"],
3,
);
let second = group(
&["parseSchema(input); helper(); parseSchema(cache); helper();"],
3,
);
assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
}
}