1use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28 representative_fragment: String,
29 token_count: usize,
30 line_count: usize,
31 instance_count: usize,
32 first_file: Option<PathBuf>,
33 first_start_line: usize,
34 first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38 #[must_use]
40 pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41 let first = instances.first();
42 Self {
43 representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44 token_count,
45 line_count,
46 instance_count: instances.len(),
47 first_file: first.map(|inst| inst.file.clone()),
48 first_start_line: first.map_or(0, |inst| inst.start_line),
49 first_end_line: first.map_or(0, |inst| inst.end_line),
50 }
51 }
52
53 fn from_group(group: &CloneGroup) -> Self {
54 Self::from_parts(&group.instances, group.token_count, group.line_count)
55 }
56
57 fn representative_fragment(&self) -> &str {
58 &self.representative_fragment
59 }
60}
61
62#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70 by_key: FxHashMap<CloneFingerprintKey, String>,
71 key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75 #[must_use]
77 pub fn from_groups(groups: &[CloneGroup]) -> Self {
78 let entries: Vec<_> = groups
79 .iter()
80 .map(|group| {
81 let key = CloneFingerprintKey::from_group(group);
82 let hash = hash_fragment(key.representative_fragment());
83 (key, hash)
84 })
85 .collect();
86 Self::from_hashed_entries(&entries)
87 }
88
89 #[must_use]
91 pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92 self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93 }
94
95 #[must_use]
97 pub fn fingerprint_for_parts(
98 &self,
99 instances: &[CloneInstance],
100 token_count: usize,
101 line_count: usize,
102 ) -> String {
103 self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104 instances,
105 token_count,
106 line_count,
107 ))
108 }
109
110 #[must_use]
113 pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114 self.by_key
115 .get(key)
116 .cloned()
117 .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118 }
119
120 #[must_use]
126 pub fn find_group<'a>(
127 &self,
128 groups: &'a [CloneGroup],
129 fingerprint: &str,
130 ) -> Option<&'a CloneGroup> {
131 let key = self.key_by_fingerprint.get(fingerprint)?;
132 groups
133 .iter()
134 .find(|group| CloneFingerprintKey::from_group(group) == *key)
135 }
136
137 fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138 let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139 let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140 for (_, hash) in entries {
141 *short_counts.entry(*hash as u32).or_insert(0) += 1;
142 *full_counts.entry(*hash).or_insert(0) += 1;
143 }
144
145 let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146 let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147 let mut by_key = FxHashMap::default();
148 let mut key_by_fingerprint = FxHashMap::default();
149
150 for (key, hash) in entries {
151 let short = *hash as u32;
152 let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153 let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154 short_handle
155 } else {
156 ambiguous_short_handles.insert(short_handle);
157 let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158 if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159 full_handle
160 } else {
161 let ordinal = full_ordinals.entry(*hash).or_insert(0);
162 *ordinal += 1;
163 format!("{full_handle}-{ordinal}")
164 }
165 };
166
167 key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168 by_key.insert(key.clone(), fingerprint);
169 }
170
171 for handle in ambiguous_short_handles {
172 key_by_fingerprint.remove(&handle);
173 }
174
175 Self {
176 by_key,
177 key_by_fingerprint,
178 }
179 }
180}
181
182#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203 let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204 fingerprint_for_fragment(representative)
205}
206
207#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214 let hash = hash_fragment(fragment);
215 format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
216}
217
218fn hash_fragment(fragment: &str) -> u64 {
227 if fragment.as_bytes().contains(&b'\r') {
228 xxh3_64(fragment.replace('\r', "").as_bytes())
229 } else {
230 xxh3_64(fragment.as_bytes())
231 }
232}
233
234#[must_use]
240pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
241 let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
242 RefactoringSuggestion {
243 kind: RefactoringKind::ExtractFunction,
244 description: format!(
245 "Extract the shared {}-line block into one function and call it from {} sites",
246 group.line_count,
247 group.instances.len(),
248 ),
249 estimated_savings,
250 }
251}
252
253#[must_use]
262pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
263 let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
264 let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
265 for word in identifier_words(fragment) {
266 if is_generic_identifier(word) {
267 continue;
268 }
269 *counts.entry(word).or_insert(0) += 1;
270 }
271
272 let mut candidates: Vec<_> = counts
273 .into_iter()
274 .map(|(word, count)| IdentifierCandidate {
275 word,
276 count,
277 score: identifier_score(word, count),
278 })
279 .collect();
280 candidates.sort_by(|a, b| {
281 b.score
282 .cmp(&a.score)
283 .then_with(|| b.count.cmp(&a.count))
284 .then_with(|| a.word.cmp(b.word))
285 });
286
287 let best = candidates.first()?;
288 if best.count < 2 {
289 return None;
290 }
291
292 let runner_up = candidates.get(1);
293 if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
294 return None;
295 }
296
297 if is_plain_single_token(best.word) {
298 let next_count = runner_up.map_or(0, |candidate| candidate.count);
299 if best.count < 3 || best.count < next_count + 2 {
300 return None;
301 }
302 }
303
304 Some(best.word.to_string())
305}
306
307#[derive(Debug)]
308struct IdentifierCandidate<'a> {
309 word: &'a str,
310 count: usize,
311 score: usize,
312}
313
314fn identifier_score(word: &str, count: usize) -> usize {
315 let quality_bonus = if has_identifier_separator_or_case_transition(word) {
316 5
317 } else if word.chars().count() >= 8 {
318 2
319 } else {
320 0
321 };
322 count * 5 + quality_bonus
323}
324
325fn is_plain_single_token(word: &str) -> bool {
326 !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
327}
328
329fn has_identifier_separator_or_case_transition(word: &str) -> bool {
330 word.contains('_')
331 || word.contains('$')
332 || word
333 .chars()
334 .collect::<Vec<_>>()
335 .windows(2)
336 .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
337}
338
339fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
341 source
342 .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
343 .filter(|word| {
344 !word.is_empty()
345 && word
346 .chars()
347 .next()
348 .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
349 })
350}
351
352const GENERIC_IDENTIFIERS: &[&str] = &[
355 "data",
356 "result",
357 "results",
358 "item",
359 "items",
360 "value",
361 "values",
362 "val",
363 "obj",
364 "object",
365 "arr",
366 "array",
367 "list",
368 "map",
369 "set",
370 "key",
371 "keys",
372 "tmp",
373 "temp",
374 "acc",
375 "cur",
376 "curr",
377 "prev",
378 "next",
379 "node",
380 "el",
381 "elem",
382 "element",
383 "args",
384 "arg",
385 "opts",
386 "options",
387 "params",
388 "param",
389 "props",
390 "ctx",
391 "context",
392 "res",
393 "req",
394 "err",
395 "error",
396 "fn",
397 "cb",
398 "callback",
399 "out",
400 "input",
401 "output",
402 "name",
403 "id",
404 "index",
405 "idx",
406 "x",
407 "y",
408 "z",
409 "i",
410 "j",
411 "k",
412 "n",
413 "m",
414 "a",
415 "b",
416 "c",
417 "e",
418 "_",
419 "const",
420 "let",
421 "var",
422 "function",
423 "return",
424 "if",
425 "else",
426 "for",
427 "while",
428 "do",
429 "switch",
430 "case",
431 "break",
432 "continue",
433 "new",
434 "this",
435 "true",
436 "false",
437 "null",
438 "undefined",
439 "void",
440 "typeof",
441 "instanceof",
442 "in",
443 "of",
444 "class",
445 "extends",
446 "super",
447 "import",
448 "export",
449 "from",
450 "default",
451 "async",
452 "await",
453 "yield",
454 "type",
455 "interface",
456 "enum",
457 "as",
458 "is",
459 "keyof",
460 "readonly",
461 "public",
462 "private",
463 "protected",
464 "static",
465 "get",
466 "delete",
467 "throw",
468 "try",
469 "catch",
470 "finally",
471 "string",
472 "number",
473 "boolean",
474 "any",
475 "unknown",
476 "never",
477 "bigint",
478 "symbol",
479 "Math",
480 "JSON",
481 "Object",
482 "Array",
483 "Promise",
484 "BigInt",
485 "Number",
486 "String",
487 "Boolean",
488 "Symbol",
489 "RegExp",
490 "Date",
491];
492
493fn is_generic_identifier(word: &str) -> bool {
494 word.chars().count() == 1 || GENERIC_IDENTIFIERS.contains(&word)
495}
496
497#[cfg(test)]
498mod tests {
499 use std::path::PathBuf;
500
501 use super::*;
502
503 fn instance(fragment: &str) -> CloneInstance {
504 CloneInstance {
505 file: PathBuf::from("a.ts"),
506 start_line: 1,
507 end_line: 5,
508 start_col: 0,
509 end_col: 0,
510 fragment: fragment.to_string(),
511 }
512 }
513
514 fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
515 CloneGroup {
516 instances: fragments.iter().map(|f| instance(f)).collect(),
517 token_count: 40,
518 line_count,
519 }
520 }
521
522 #[test]
523 fn fingerprint_is_stable_and_prefixed() {
524 let g = group(&["foo(bar)", "foo(baz)"], 3);
525 let fp1 = clone_fingerprint(&g.instances);
526 let fp2 = clone_fingerprint(&g.instances);
527 assert_eq!(fp1, fp2);
528 assert!(fp1.starts_with("dup:"));
529 assert_eq!(fp1.len(), "dup:".len() + 8);
530 }
531
532 #[test]
533 fn fingerprint_is_sibling_stable() {
534 let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
535 let before = clone_fingerprint(&group_a.instances);
536 let _group_b_edited = group(&["totallyDifferentBody()"], 2);
537 let after = clone_fingerprint(&group_a.instances);
538 assert_eq!(before, after);
539 }
540
541 #[test]
542 fn fingerprint_differs_for_different_content() {
543 let a = group(&["alpha()"], 2);
544 let b = group(&["beta()"], 2);
545 assert_ne!(
546 clone_fingerprint(&a.instances),
547 clone_fingerprint(&b.instances)
548 );
549 }
550
551 #[test]
552 fn fingerprint_set_widens_only_colliding_short_handles() {
553 let a = group(&["alpha()"], 2);
554 let b = group(&["beta()"], 2);
555 let c = group(&["gamma()"], 2);
556 let entries = vec![
557 (
558 CloneFingerprintKey::from_group(&a),
559 0x0000_0001_1234_5678_u64,
560 ),
561 (
562 CloneFingerprintKey::from_group(&b),
563 0x0000_0002_1234_5678_u64,
564 ),
565 (
566 CloneFingerprintKey::from_group(&c),
567 0x0000_0003_8765_4321_u64,
568 ),
569 ];
570
571 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
572
573 assert_eq!(
574 fingerprints.fingerprint_for_group(&a),
575 "dup:0000000112345678"
576 );
577 assert_eq!(
578 fingerprints.fingerprint_for_group(&b),
579 "dup:0000000212345678"
580 );
581 assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
582 assert!(
583 fingerprints
584 .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
585 .is_none()
586 );
587 assert_eq!(
588 fingerprints
589 .find_group(&[a, b, c], "dup:0000000212345678")
590 .and_then(|group| group.instances.first())
591 .map(|inst| inst.fragment.as_str()),
592 Some("beta()")
593 );
594 }
595
596 #[test]
597 fn fingerprint_set_suffixes_full_hash_collisions() {
598 let a = group(&["alpha()"], 2);
599 let b = group(&["beta()"], 2);
600 let entries = vec![
601 (
602 CloneFingerprintKey::from_group(&a),
603 0x0000_0001_1234_5678_u64,
604 ),
605 (
606 CloneFingerprintKey::from_group(&b),
607 0x0000_0001_1234_5678_u64,
608 ),
609 ];
610
611 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
612
613 assert_eq!(
614 fingerprints.fingerprint_for_group(&a),
615 "dup:0000000112345678-1"
616 );
617 assert_eq!(
618 fingerprints.fingerprint_for_group(&b),
619 "dup:0000000112345678-2"
620 );
621 assert!(
622 fingerprints
623 .find_group(&[a.clone(), b.clone()], "dup:12345678")
624 .is_none()
625 );
626 assert!(
627 fingerprints
628 .find_group(&[a, b], "dup:0000000112345678")
629 .is_none()
630 );
631 }
632
633 #[test]
634 fn group_suggestion_savings_is_lines_times_extra_copies() {
635 let g = group(&["x", "x", "x"], 10); let suggestion = group_refactoring_suggestion(&g);
637 assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
638 assert_eq!(suggestion.estimated_savings, 20); }
640
641 #[test]
642 fn dominant_identifier_picks_repeated_domain_name() {
643 let g = group(
644 &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
645 3,
646 );
647 assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
648 }
649
650 #[test]
651 fn dominant_identifier_none_on_generic() {
652 let g = group(&["const data = result.map((item) => item.value);"], 3);
653 assert_eq!(dominant_identifier(&g), None);
654 }
655
656 #[test]
657 fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
658 let g = group(
659 &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
660 4,
661 );
662 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
663 let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
664 assert_eq!(dominant_identifier(&only_keywords), None);
665 let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
666 assert_eq!(dominant_identifier(&g_global), None);
667 }
668
669 #[test]
670 fn dominant_identifier_none_on_single_letter_type_param() {
671 let g = group(
672 &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
673 3,
674 );
675 assert_eq!(dominant_identifier(&g), None);
676 }
677
678 #[test]
679 fn dominant_identifier_none_on_tie() {
680 let g = group(&["alpha(); beta();"], 2); assert_eq!(dominant_identifier(&g), None);
682 }
683
684 #[test]
685 fn dominant_identifier_prefers_structured_names() {
686 let g = group(
687 &["parseSchema(input); parseSchema(cache); helper(); helper();"],
688 3,
689 );
690 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
691 }
692
693 #[test]
694 fn dominant_identifier_requires_plain_token_margin() {
695 let low_signal = group(&["schema(); schema(); parseUser();"], 3);
696 assert_eq!(dominant_identifier(&low_signal), None);
697
698 let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
699 assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
700 }
701
702 #[test]
703 fn dominant_identifier_is_stable_across_word_order() {
704 let first = group(
705 &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
706 3,
707 );
708 let second = group(
709 &["parseSchema(input); helper(); parseSchema(cache); helper();"],
710 3,
711 );
712
713 assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
714 assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
715 }
716}