1use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28 representative_fragment: String,
29 token_count: usize,
30 line_count: usize,
31 instance_count: usize,
32 first_file: Option<PathBuf>,
33 first_start_line: usize,
34 first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38 #[must_use]
40 pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41 let first = instances.first();
42 Self {
43 representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44 token_count,
45 line_count,
46 instance_count: instances.len(),
47 first_file: first.map(|inst| inst.file.clone()),
48 first_start_line: first.map_or(0, |inst| inst.start_line),
49 first_end_line: first.map_or(0, |inst| inst.end_line),
50 }
51 }
52
53 fn from_group(group: &CloneGroup) -> Self {
54 Self::from_parts(&group.instances, group.token_count, group.line_count)
55 }
56
57 fn representative_fragment(&self) -> &str {
58 &self.representative_fragment
59 }
60}
61
62#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70 by_key: FxHashMap<CloneFingerprintKey, String>,
71 key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75 #[must_use]
77 pub fn from_groups(groups: &[CloneGroup]) -> Self {
78 let entries: Vec<_> = groups
79 .iter()
80 .map(|group| {
81 let key = CloneFingerprintKey::from_group(group);
82 let hash = hash_fragment(key.representative_fragment());
83 (key, hash)
84 })
85 .collect();
86 Self::from_hashed_entries(&entries)
87 }
88
89 #[must_use]
91 pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92 self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93 }
94
95 #[must_use]
97 pub fn fingerprint_for_parts(
98 &self,
99 instances: &[CloneInstance],
100 token_count: usize,
101 line_count: usize,
102 ) -> String {
103 self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104 instances,
105 token_count,
106 line_count,
107 ))
108 }
109
110 #[must_use]
113 pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114 self.by_key
115 .get(key)
116 .cloned()
117 .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118 }
119
120 #[must_use]
126 pub fn find_group<'a>(
127 &self,
128 groups: &'a [CloneGroup],
129 fingerprint: &str,
130 ) -> Option<&'a CloneGroup> {
131 let key = self.key_by_fingerprint.get(fingerprint)?;
132 groups
133 .iter()
134 .find(|group| CloneFingerprintKey::from_group(group) == *key)
135 }
136
137 fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138 let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139 let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140 for (_, hash) in entries {
141 *short_counts.entry(*hash as u32).or_insert(0) += 1;
142 *full_counts.entry(*hash).or_insert(0) += 1;
143 }
144
145 let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146 let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147 let mut by_key = FxHashMap::default();
148 let mut key_by_fingerprint = FxHashMap::default();
149
150 for (key, hash) in entries {
151 let short = *hash as u32;
152 let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153 let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154 short_handle
155 } else {
156 ambiguous_short_handles.insert(short_handle);
157 let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158 if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159 full_handle
160 } else {
161 let ordinal = full_ordinals.entry(*hash).or_insert(0);
162 *ordinal += 1;
163 format!("{full_handle}-{ordinal}")
164 }
165 };
166
167 key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168 by_key.insert(key.clone(), fingerprint);
169 }
170
171 for handle in ambiguous_short_handles {
172 key_by_fingerprint.remove(&handle);
173 }
174
175 Self {
176 by_key,
177 key_by_fingerprint,
178 }
179 }
180}
181
182#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203 let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204 fingerprint_for_fragment(representative)
205}
206
207#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214 let hash = hash_fragment(fragment);
215 format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
218}
219
220fn hash_fragment(fragment: &str) -> u64 {
229 if fragment.as_bytes().contains(&b'\r') {
230 xxh3_64(fragment.replace('\r', "").as_bytes())
231 } else {
232 xxh3_64(fragment.as_bytes())
233 }
234}
235
236#[must_use]
242pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
243 let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
244 RefactoringSuggestion {
245 kind: RefactoringKind::ExtractFunction,
246 description: format!(
247 "Extract the shared {}-line block into one function and call it from {} sites",
248 group.line_count,
249 group.instances.len(),
250 ),
251 estimated_savings,
252 }
253}
254
255#[must_use]
264pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
265 let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
266 let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
267 for word in identifier_words(fragment) {
268 if is_generic_identifier(word) {
269 continue;
270 }
271 *counts.entry(word).or_insert(0) += 1;
272 }
273
274 let mut candidates: Vec<_> = counts
275 .into_iter()
276 .map(|(word, count)| IdentifierCandidate {
277 word,
278 count,
279 score: identifier_score(word, count),
280 })
281 .collect();
282 candidates.sort_by(|a, b| {
283 b.score
284 .cmp(&a.score)
285 .then_with(|| b.count.cmp(&a.count))
286 .then_with(|| a.word.cmp(b.word))
287 });
288
289 let best = candidates.first()?;
290 if best.count < 2 {
291 return None;
292 }
293
294 let runner_up = candidates.get(1);
295 if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
296 return None;
297 }
298
299 if is_plain_single_token(best.word) {
300 let next_count = runner_up.map_or(0, |candidate| candidate.count);
301 if best.count < 3 || best.count < next_count + 2 {
302 return None;
303 }
304 }
305
306 Some(best.word.to_string())
307}
308
309#[derive(Debug)]
310struct IdentifierCandidate<'a> {
311 word: &'a str,
312 count: usize,
313 score: usize,
314}
315
316fn identifier_score(word: &str, count: usize) -> usize {
317 let quality_bonus = if has_identifier_separator_or_case_transition(word) {
318 5
319 } else if word.chars().count() >= 8 {
320 2
321 } else {
322 0
323 };
324 count * 5 + quality_bonus
325}
326
327fn is_plain_single_token(word: &str) -> bool {
328 !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
329}
330
331fn has_identifier_separator_or_case_transition(word: &str) -> bool {
332 word.contains('_')
333 || word.contains('$')
334 || word
335 .chars()
336 .collect::<Vec<_>>()
337 .windows(2)
338 .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
339}
340
341fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
343 source
344 .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
345 .filter(|word| {
346 !word.is_empty()
347 && word
348 .chars()
349 .next()
350 .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
351 })
352}
353
354fn is_generic_identifier(word: &str) -> bool {
357 if word.chars().count() == 1 {
361 return true;
362 }
363 matches!(
364 word,
365 "data" | "result" | "results" | "item" | "items" | "value" | "values" | "val"
367 | "obj" | "object" | "arr" | "array" | "list" | "map" | "set" | "key" | "keys"
368 | "tmp" | "temp" | "acc" | "cur" | "curr" | "prev" | "next" | "node" | "el"
369 | "elem" | "element" | "args" | "arg" | "opts" | "options" | "params" | "param"
370 | "props" | "ctx" | "context" | "res" | "req" | "err" | "error" | "fn" | "cb"
371 | "callback" | "out" | "input" | "output" | "name" | "id" | "index" | "idx"
372 | "x" | "y" | "z" | "i" | "j" | "k" | "n" | "m" | "a" | "b" | "c" | "e" | "_"
374 | "const" | "let" | "var" | "function" | "return" | "if" | "else" | "for"
376 | "while" | "do" | "switch" | "case" | "break" | "continue" | "new" | "this"
377 | "true" | "false" | "null" | "undefined" | "void" | "typeof" | "instanceof"
378 | "in" | "of" | "class" | "extends" | "super" | "import" | "export" | "from"
379 | "default" | "async" | "await" | "yield" | "type" | "interface" | "enum"
380 | "as" | "is" | "keyof" | "readonly" | "public" | "private" | "protected"
381 | "static" | "get" | "delete" | "throw" | "try" | "catch" | "finally"
382 | "string" | "number" | "boolean" | "any" | "unknown" | "never" | "bigint"
385 | "symbol"
386 | "Math" | "JSON" | "Object" | "Array" | "Promise" | "BigInt" | "Number"
388 | "String" | "Boolean" | "Symbol" | "RegExp" | "Date"
389 )
390}
391
392#[cfg(test)]
393mod tests {
394 use std::path::PathBuf;
395
396 use super::*;
397
398 fn instance(fragment: &str) -> CloneInstance {
399 CloneInstance {
400 file: PathBuf::from("a.ts"),
401 start_line: 1,
402 end_line: 5,
403 start_col: 0,
404 end_col: 0,
405 fragment: fragment.to_string(),
406 }
407 }
408
409 fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
410 CloneGroup {
411 instances: fragments.iter().map(|f| instance(f)).collect(),
412 token_count: 40,
413 line_count,
414 }
415 }
416
417 #[test]
418 fn fingerprint_is_stable_and_prefixed() {
419 let g = group(&["foo(bar)", "foo(baz)"], 3);
420 let fp1 = clone_fingerprint(&g.instances);
421 let fp2 = clone_fingerprint(&g.instances);
422 assert_eq!(fp1, fp2);
423 assert!(fp1.starts_with("dup:"));
424 assert_eq!(fp1.len(), "dup:".len() + 8);
425 }
426
427 #[test]
428 fn fingerprint_is_sibling_stable() {
429 let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
431 let before = clone_fingerprint(&group_a.instances);
432 let _group_b_edited = group(&["totallyDifferentBody()"], 2);
433 let after = clone_fingerprint(&group_a.instances);
434 assert_eq!(before, after);
435 }
436
437 #[test]
438 fn fingerprint_differs_for_different_content() {
439 let a = group(&["alpha()"], 2);
440 let b = group(&["beta()"], 2);
441 assert_ne!(
442 clone_fingerprint(&a.instances),
443 clone_fingerprint(&b.instances)
444 );
445 }
446
447 #[test]
448 fn fingerprint_set_widens_only_colliding_short_handles() {
449 let a = group(&["alpha()"], 2);
450 let b = group(&["beta()"], 2);
451 let c = group(&["gamma()"], 2);
452 let entries = vec![
453 (
454 CloneFingerprintKey::from_group(&a),
455 0x0000_0001_1234_5678_u64,
456 ),
457 (
458 CloneFingerprintKey::from_group(&b),
459 0x0000_0002_1234_5678_u64,
460 ),
461 (
462 CloneFingerprintKey::from_group(&c),
463 0x0000_0003_8765_4321_u64,
464 ),
465 ];
466
467 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
468
469 assert_eq!(
470 fingerprints.fingerprint_for_group(&a),
471 "dup:0000000112345678"
472 );
473 assert_eq!(
474 fingerprints.fingerprint_for_group(&b),
475 "dup:0000000212345678"
476 );
477 assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
478 assert!(
479 fingerprints
480 .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
481 .is_none()
482 );
483 assert_eq!(
484 fingerprints
485 .find_group(&[a, b, c], "dup:0000000212345678")
486 .and_then(|group| group.instances.first())
487 .map(|inst| inst.fragment.as_str()),
488 Some("beta()")
489 );
490 }
491
492 #[test]
493 fn fingerprint_set_suffixes_full_hash_collisions() {
494 let a = group(&["alpha()"], 2);
495 let b = group(&["beta()"], 2);
496 let entries = vec![
497 (
498 CloneFingerprintKey::from_group(&a),
499 0x0000_0001_1234_5678_u64,
500 ),
501 (
502 CloneFingerprintKey::from_group(&b),
503 0x0000_0001_1234_5678_u64,
504 ),
505 ];
506
507 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
508
509 assert_eq!(
510 fingerprints.fingerprint_for_group(&a),
511 "dup:0000000112345678-1"
512 );
513 assert_eq!(
514 fingerprints.fingerprint_for_group(&b),
515 "dup:0000000112345678-2"
516 );
517 assert!(
518 fingerprints
519 .find_group(&[a.clone(), b.clone()], "dup:12345678")
520 .is_none()
521 );
522 assert!(
523 fingerprints
524 .find_group(&[a, b], "dup:0000000112345678")
525 .is_none()
526 );
527 }
528
529 #[test]
530 fn group_suggestion_savings_is_lines_times_extra_copies() {
531 let g = group(&["x", "x", "x"], 10); let suggestion = group_refactoring_suggestion(&g);
533 assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
534 assert_eq!(suggestion.estimated_savings, 20); }
536
537 #[test]
538 fn dominant_identifier_picks_repeated_domain_name() {
539 let g = group(
540 &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
541 3,
542 );
543 assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
544 }
545
546 #[test]
547 fn dominant_identifier_none_on_generic() {
548 let g = group(&["const data = result.map((item) => item.value);"], 3);
549 assert_eq!(dominant_identifier(&g), None);
550 }
551
552 #[test]
553 fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
554 let g = group(
558 &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
559 4,
560 );
561 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
562 let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
563 assert_eq!(dominant_identifier(&only_keywords), None);
564 let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
565 assert_eq!(dominant_identifier(&g_global), None);
566 }
567
568 #[test]
569 fn dominant_identifier_none_on_single_letter_type_param() {
570 let g = group(
572 &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
573 3,
574 );
575 assert_eq!(dominant_identifier(&g), None);
576 }
577
578 #[test]
579 fn dominant_identifier_none_on_tie() {
580 let g = group(&["alpha(); beta();"], 2); assert_eq!(dominant_identifier(&g), None);
582 }
583
584 #[test]
585 fn dominant_identifier_prefers_structured_names() {
586 let g = group(
587 &["parseSchema(input); parseSchema(cache); helper(); helper();"],
588 3,
589 );
590 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
591 }
592
593 #[test]
594 fn dominant_identifier_requires_plain_token_margin() {
595 let low_signal = group(&["schema(); schema(); parseUser();"], 3);
596 assert_eq!(dominant_identifier(&low_signal), None);
597
598 let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
599 assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
600 }
601
602 #[test]
603 fn dominant_identifier_is_stable_across_word_order() {
604 let first = group(
605 &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
606 3,
607 );
608 let second = group(
609 &["parseSchema(input); helper(); parseSchema(cache); helper();"],
610 3,
611 );
612
613 assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
614 assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
615 }
616}