use std::path::PathBuf;
use rustc_hash::{FxHashMap, FxHashSet};
use xxhash_rust::xxh3::xxh3_64;
use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
pub const FINGERPRINT_PREFIX: &str = "dup:";
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CloneFingerprintKey {
representative_fragment: String,
token_count: usize,
line_count: usize,
instance_count: usize,
first_file: Option<PathBuf>,
first_start_line: usize,
first_end_line: usize,
}
impl CloneFingerprintKey {
#[must_use]
pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
let first = instances.first();
Self {
representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
token_count,
line_count,
instance_count: instances.len(),
first_file: first.map(|inst| inst.file.clone()),
first_start_line: first.map_or(0, |inst| inst.start_line),
first_end_line: first.map_or(0, |inst| inst.end_line),
}
}
fn from_group(group: &CloneGroup) -> Self {
Self::from_parts(&group.instances, group.token_count, group.line_count)
}
fn representative_fragment(&self) -> &str {
&self.representative_fragment
}
}
#[derive(Debug, Clone)]
pub struct CloneFingerprintSet {
by_key: FxHashMap<CloneFingerprintKey, String>,
key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
}
impl CloneFingerprintSet {
#[must_use]
pub fn from_groups(groups: &[CloneGroup]) -> Self {
let entries: Vec<_> = groups
.iter()
.map(|group| {
let key = CloneFingerprintKey::from_group(group);
let hash = hash_fragment(key.representative_fragment());
(key, hash)
})
.collect();
Self::from_hashed_entries(&entries)
}
#[must_use]
pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
}
#[must_use]
pub fn fingerprint_for_parts(
&self,
instances: &[CloneInstance],
token_count: usize,
line_count: usize,
) -> String {
self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
instances,
token_count,
line_count,
))
}
#[must_use]
pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
self.by_key
.get(key)
.cloned()
.unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
}
#[must_use]
pub fn find_group<'a>(
&self,
groups: &'a [CloneGroup],
fingerprint: &str,
) -> Option<&'a CloneGroup> {
let key = self.key_by_fingerprint.get(fingerprint)?;
groups
.iter()
.find(|group| CloneFingerprintKey::from_group(group) == *key)
}
fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
for (_, hash) in entries {
*short_counts.entry(*hash as u32).or_insert(0) += 1;
*full_counts.entry(*hash).or_insert(0) += 1;
}
let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
let mut by_key = FxHashMap::default();
let mut key_by_fingerprint = FxHashMap::default();
for (key, hash) in entries {
let short = *hash as u32;
let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
short_handle
} else {
ambiguous_short_handles.insert(short_handle);
let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
if full_counts.get(hash).copied().unwrap_or(0) == 1 {
full_handle
} else {
let ordinal = full_ordinals.entry(*hash).or_insert(0);
*ordinal += 1;
format!("{full_handle}-{ordinal}")
}
};
key_by_fingerprint.insert(fingerprint.clone(), key.clone());
by_key.insert(key.clone(), fingerprint);
}
for handle in ambiguous_short_handles {
key_by_fingerprint.remove(&handle);
}
Self {
by_key,
key_by_fingerprint,
}
}
}
#[must_use]
pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
fingerprint_for_fragment(representative)
}
#[must_use]
pub fn fingerprint_for_fragment(fragment: &str) -> String {
let hash = hash_fragment(fragment);
format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
}
fn hash_fragment(fragment: &str) -> u64 {
if fragment.as_bytes().contains(&b'\r') {
xxh3_64(fragment.replace('\r', "").as_bytes())
} else {
xxh3_64(fragment.as_bytes())
}
}
#[must_use]
pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
RefactoringSuggestion {
kind: RefactoringKind::ExtractFunction,
description: format!(
"Extract the shared {}-line block into one function and call it from {} sites",
group.line_count,
group.instances.len(),
),
estimated_savings,
}
}
#[must_use]
pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
for word in identifier_words(fragment) {
if is_generic_identifier(word) {
continue;
}
*counts.entry(word).or_insert(0) += 1;
}
let mut candidates: Vec<_> = counts
.into_iter()
.map(|(word, count)| IdentifierCandidate {
word,
count,
score: identifier_score(word, count),
})
.collect();
candidates.sort_by(|a, b| {
b.score
.cmp(&a.score)
.then_with(|| b.count.cmp(&a.count))
.then_with(|| a.word.cmp(b.word))
});
let best = candidates.first()?;
if best.count < 2 {
return None;
}
let runner_up = candidates.get(1);
if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
return None;
}
if is_plain_single_token(best.word) {
let next_count = runner_up.map_or(0, |candidate| candidate.count);
if best.count < 3 || best.count < next_count + 2 {
return None;
}
}
Some(best.word.to_string())
}
#[derive(Debug)]
struct IdentifierCandidate<'a> {
word: &'a str,
count: usize,
score: usize,
}
fn identifier_score(word: &str, count: usize) -> usize {
let quality_bonus = if has_identifier_separator_or_case_transition(word) {
5
} else if word.chars().count() >= 8 {
2
} else {
0
};
count * 5 + quality_bonus
}
fn is_plain_single_token(word: &str) -> bool {
!has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
}
fn has_identifier_separator_or_case_transition(word: &str) -> bool {
word.contains('_')
|| word.contains('$')
|| word
.chars()
.collect::<Vec<_>>()
.windows(2)
.any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
}
fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
source
.split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
.filter(|word| {
!word.is_empty()
&& word
.chars()
.next()
.is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
})
}
const GENERIC_IDENTIFIERS: &[&str] = &[
"data",
"result",
"results",
"item",
"items",
"value",
"values",
"val",
"obj",
"object",
"arr",
"array",
"list",
"map",
"set",
"key",
"keys",
"tmp",
"temp",
"acc",
"cur",
"curr",
"prev",
"next",
"node",
"el",
"elem",
"element",
"args",
"arg",
"opts",
"options",
"params",
"param",
"props",
"ctx",
"context",
"res",
"req",
"err",
"error",
"fn",
"cb",
"callback",
"out",
"input",
"output",
"name",
"id",
"index",
"idx",
"x",
"y",
"z",
"i",
"j",
"k",
"n",
"m",
"a",
"b",
"c",
"e",
"_",
"const",
"let",
"var",
"function",
"return",
"if",
"else",
"for",
"while",
"do",
"switch",
"case",
"break",
"continue",
"new",
"this",
"true",
"false",
"null",
"undefined",
"void",
"typeof",
"instanceof",
"in",
"of",
"class",
"extends",
"super",
"import",
"export",
"from",
"default",
"async",
"await",
"yield",
"type",
"interface",
"enum",
"as",
"is",
"keyof",
"readonly",
"public",
"private",
"protected",
"static",
"get",
"delete",
"throw",
"try",
"catch",
"finally",
"string",
"number",
"boolean",
"any",
"unknown",
"never",
"bigint",
"symbol",
"Math",
"JSON",
"Object",
"Array",
"Promise",
"BigInt",
"Number",
"String",
"Boolean",
"Symbol",
"RegExp",
"Date",
];
fn is_generic_identifier(word: &str) -> bool {
word.chars().count() == 1 || GENERIC_IDENTIFIERS.contains(&word)
}
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use super::*;
fn instance(fragment: &str) -> CloneInstance {
CloneInstance {
file: PathBuf::from("a.ts"),
start_line: 1,
end_line: 5,
start_col: 0,
end_col: 0,
fragment: fragment.to_string(),
}
}
fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
CloneGroup {
instances: fragments.iter().map(|f| instance(f)).collect(),
token_count: 40,
line_count,
}
}
#[test]
fn fingerprint_is_stable_and_prefixed() {
let g = group(&["foo(bar)", "foo(baz)"], 3);
let fp1 = clone_fingerprint(&g.instances);
let fp2 = clone_fingerprint(&g.instances);
assert_eq!(fp1, fp2);
assert!(fp1.starts_with("dup:"));
assert_eq!(fp1.len(), "dup:".len() + 8);
}
#[test]
fn fingerprint_is_sibling_stable() {
let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
let before = clone_fingerprint(&group_a.instances);
let _group_b_edited = group(&["totallyDifferentBody()"], 2);
let after = clone_fingerprint(&group_a.instances);
assert_eq!(before, after);
}
#[test]
fn fingerprint_differs_for_different_content() {
let a = group(&["alpha()"], 2);
let b = group(&["beta()"], 2);
assert_ne!(
clone_fingerprint(&a.instances),
clone_fingerprint(&b.instances)
);
}
#[test]
fn fingerprint_set_widens_only_colliding_short_handles() {
let a = group(&["alpha()"], 2);
let b = group(&["beta()"], 2);
let c = group(&["gamma()"], 2);
let entries = vec![
(
CloneFingerprintKey::from_group(&a),
0x0000_0001_1234_5678_u64,
),
(
CloneFingerprintKey::from_group(&b),
0x0000_0002_1234_5678_u64,
),
(
CloneFingerprintKey::from_group(&c),
0x0000_0003_8765_4321_u64,
),
];
let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
assert_eq!(
fingerprints.fingerprint_for_group(&a),
"dup:0000000112345678"
);
assert_eq!(
fingerprints.fingerprint_for_group(&b),
"dup:0000000212345678"
);
assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
assert!(
fingerprints
.find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
.is_none()
);
assert_eq!(
fingerprints
.find_group(&[a, b, c], "dup:0000000212345678")
.and_then(|group| group.instances.first())
.map(|inst| inst.fragment.as_str()),
Some("beta()")
);
}
#[test]
fn fingerprint_set_suffixes_full_hash_collisions() {
let a = group(&["alpha()"], 2);
let b = group(&["beta()"], 2);
let entries = vec![
(
CloneFingerprintKey::from_group(&a),
0x0000_0001_1234_5678_u64,
),
(
CloneFingerprintKey::from_group(&b),
0x0000_0001_1234_5678_u64,
),
];
let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
assert_eq!(
fingerprints.fingerprint_for_group(&a),
"dup:0000000112345678-1"
);
assert_eq!(
fingerprints.fingerprint_for_group(&b),
"dup:0000000112345678-2"
);
assert!(
fingerprints
.find_group(&[a.clone(), b.clone()], "dup:12345678")
.is_none()
);
assert!(
fingerprints
.find_group(&[a, b], "dup:0000000112345678")
.is_none()
);
}
#[test]
fn group_suggestion_savings_is_lines_times_extra_copies() {
let g = group(&["x", "x", "x"], 10); let suggestion = group_refactoring_suggestion(&g);
assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
assert_eq!(suggestion.estimated_savings, 20); }
#[test]
fn dominant_identifier_picks_repeated_domain_name() {
let g = group(
&["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
3,
);
assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
}
#[test]
fn dominant_identifier_none_on_generic() {
let g = group(&["const data = result.map((item) => item.value);"], 3);
assert_eq!(dominant_identifier(&g), None);
}
#[test]
fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
let g = group(
&["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
4,
);
assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
assert_eq!(dominant_identifier(&only_keywords), None);
let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
assert_eq!(dominant_identifier(&g_global), None);
}
#[test]
fn dominant_identifier_none_on_single_letter_type_param() {
let g = group(
&["function id<T>(x: T): T { const a: T = x; return a as T; }"],
3,
);
assert_eq!(dominant_identifier(&g), None);
}
#[test]
fn dominant_identifier_none_on_tie() {
let g = group(&["alpha(); beta();"], 2); assert_eq!(dominant_identifier(&g), None);
}
#[test]
fn dominant_identifier_prefers_structured_names() {
let g = group(
&["parseSchema(input); parseSchema(cache); helper(); helper();"],
3,
);
assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
}
#[test]
fn dominant_identifier_requires_plain_token_margin() {
let low_signal = group(&["schema(); schema(); parseUser();"], 3);
assert_eq!(dominant_identifier(&low_signal), None);
let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
}
#[test]
fn dominant_identifier_is_stable_across_word_order() {
let first = group(
&["helper(); parseSchema(input); helper(); parseSchema(cache);"],
3,
);
let second = group(
&["parseSchema(input); helper(); parseSchema(cache); helper();"],
3,
);
assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
}
}