1use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28 representative_fragment: String,
29 token_count: usize,
30 line_count: usize,
31 instance_count: usize,
32 first_file: Option<PathBuf>,
33 first_start_line: usize,
34 first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38 #[must_use]
40 pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41 let first = instances.first();
42 Self {
43 representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44 token_count,
45 line_count,
46 instance_count: instances.len(),
47 first_file: first.map(|inst| inst.file.clone()),
48 first_start_line: first.map_or(0, |inst| inst.start_line),
49 first_end_line: first.map_or(0, |inst| inst.end_line),
50 }
51 }
52
53 fn from_group(group: &CloneGroup) -> Self {
54 Self::from_parts(&group.instances, group.token_count, group.line_count)
55 }
56
57 fn representative_fragment(&self) -> &str {
58 &self.representative_fragment
59 }
60}
61
62#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70 by_key: FxHashMap<CloneFingerprintKey, String>,
71 key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75 #[must_use]
77 pub fn from_groups(groups: &[CloneGroup]) -> Self {
78 let entries: Vec<_> = groups
79 .iter()
80 .map(|group| {
81 let key = CloneFingerprintKey::from_group(group);
82 let hash = hash_fragment(key.representative_fragment());
83 (key, hash)
84 })
85 .collect();
86 Self::from_hashed_entries(&entries)
87 }
88
89 #[must_use]
91 pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92 self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93 }
94
95 #[must_use]
97 pub fn fingerprint_for_parts(
98 &self,
99 instances: &[CloneInstance],
100 token_count: usize,
101 line_count: usize,
102 ) -> String {
103 self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104 instances,
105 token_count,
106 line_count,
107 ))
108 }
109
110 #[must_use]
113 pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114 self.by_key
115 .get(key)
116 .cloned()
117 .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118 }
119
120 #[must_use]
126 pub fn find_group<'a>(
127 &self,
128 groups: &'a [CloneGroup],
129 fingerprint: &str,
130 ) -> Option<&'a CloneGroup> {
131 let key = self.key_by_fingerprint.get(fingerprint)?;
132 groups
133 .iter()
134 .find(|group| CloneFingerprintKey::from_group(group) == *key)
135 }
136
137 fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138 let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139 let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140 for (_, hash) in entries {
141 *short_counts.entry(*hash as u32).or_insert(0) += 1;
142 *full_counts.entry(*hash).or_insert(0) += 1;
143 }
144
145 let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146 let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147 let mut by_key = FxHashMap::default();
148 let mut key_by_fingerprint = FxHashMap::default();
149
150 for (key, hash) in entries {
151 let short = *hash as u32;
152 let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153 let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154 short_handle
155 } else {
156 ambiguous_short_handles.insert(short_handle);
157 let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158 if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159 full_handle
160 } else {
161 let ordinal = full_ordinals.entry(*hash).or_insert(0);
162 *ordinal += 1;
163 format!("{full_handle}-{ordinal}")
164 }
165 };
166
167 key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168 by_key.insert(key.clone(), fingerprint);
169 }
170
171 for handle in ambiguous_short_handles {
172 key_by_fingerprint.remove(&handle);
173 }
174
175 Self {
176 by_key,
177 key_by_fingerprint,
178 }
179 }
180}
181
182#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203 let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204 fingerprint_for_fragment(representative)
205}
206
207#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214 let hash = hash_fragment(fragment);
215 format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
216}
217
218fn hash_fragment(fragment: &str) -> u64 {
227 if fragment.as_bytes().contains(&b'\r') {
228 xxh3_64(fragment.replace('\r', "").as_bytes())
229 } else {
230 xxh3_64(fragment.as_bytes())
231 }
232}
233
234#[must_use]
240pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
241 let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
242 RefactoringSuggestion {
243 kind: RefactoringKind::ExtractFunction,
244 description: format!(
245 "Extract the shared {}-line block into one function and call it from {} sites",
246 group.line_count,
247 group.instances.len(),
248 ),
249 estimated_savings,
250 }
251}
252
253#[must_use]
262pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
263 let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
264 let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
265 for word in identifier_words(fragment) {
266 if is_generic_identifier(word) {
267 continue;
268 }
269 *counts.entry(word).or_insert(0) += 1;
270 }
271
272 let mut candidates: Vec<_> = counts
273 .into_iter()
274 .map(|(word, count)| IdentifierCandidate {
275 word,
276 count,
277 score: identifier_score(word, count),
278 })
279 .collect();
280 candidates.sort_by(|a, b| {
281 b.score
282 .cmp(&a.score)
283 .then_with(|| b.count.cmp(&a.count))
284 .then_with(|| a.word.cmp(b.word))
285 });
286
287 let best = candidates.first()?;
288 if best.count < 2 {
289 return None;
290 }
291
292 let runner_up = candidates.get(1);
293 if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
294 return None;
295 }
296
297 if is_plain_single_token(best.word) {
298 let next_count = runner_up.map_or(0, |candidate| candidate.count);
299 if best.count < 3 || best.count < next_count + 2 {
300 return None;
301 }
302 }
303
304 Some(best.word.to_string())
305}
306
307#[derive(Debug)]
308struct IdentifierCandidate<'a> {
309 word: &'a str,
310 count: usize,
311 score: usize,
312}
313
314fn identifier_score(word: &str, count: usize) -> usize {
315 let quality_bonus = if has_identifier_separator_or_case_transition(word) {
316 5
317 } else if word.chars().count() >= 8 {
318 2
319 } else {
320 0
321 };
322 count * 5 + quality_bonus
323}
324
325fn is_plain_single_token(word: &str) -> bool {
326 !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
327}
328
329fn has_identifier_separator_or_case_transition(word: &str) -> bool {
330 if word.contains('_') || word.contains('$') {
331 return true;
332 }
333
334 let mut previous = None;
335 for ch in word.chars() {
336 if previous.is_some_and(|prev: char| prev.is_ascii_lowercase() && ch.is_ascii_uppercase()) {
337 return true;
338 }
339 previous = Some(ch);
340 }
341 false
342}
343
344fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
346 source
347 .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
348 .filter(|word| {
349 !word.is_empty()
350 && word
351 .chars()
352 .next()
353 .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
354 })
355}
356
357const GENERIC_IDENTIFIERS: &[&str] = &[
360 "data",
361 "result",
362 "results",
363 "item",
364 "items",
365 "value",
366 "values",
367 "val",
368 "obj",
369 "object",
370 "arr",
371 "array",
372 "list",
373 "map",
374 "set",
375 "key",
376 "keys",
377 "tmp",
378 "temp",
379 "acc",
380 "cur",
381 "curr",
382 "prev",
383 "next",
384 "node",
385 "el",
386 "elem",
387 "element",
388 "args",
389 "arg",
390 "opts",
391 "options",
392 "params",
393 "param",
394 "props",
395 "ctx",
396 "context",
397 "res",
398 "req",
399 "err",
400 "error",
401 "fn",
402 "cb",
403 "callback",
404 "out",
405 "input",
406 "output",
407 "name",
408 "id",
409 "index",
410 "idx",
411 "x",
412 "y",
413 "z",
414 "i",
415 "j",
416 "k",
417 "n",
418 "m",
419 "a",
420 "b",
421 "c",
422 "e",
423 "_",
424 "const",
425 "let",
426 "var",
427 "function",
428 "return",
429 "if",
430 "else",
431 "for",
432 "while",
433 "do",
434 "switch",
435 "case",
436 "break",
437 "continue",
438 "new",
439 "this",
440 "true",
441 "false",
442 "null",
443 "undefined",
444 "void",
445 "typeof",
446 "instanceof",
447 "in",
448 "of",
449 "class",
450 "extends",
451 "super",
452 "import",
453 "export",
454 "from",
455 "default",
456 "async",
457 "await",
458 "yield",
459 "type",
460 "interface",
461 "enum",
462 "as",
463 "is",
464 "keyof",
465 "readonly",
466 "public",
467 "private",
468 "protected",
469 "static",
470 "get",
471 "delete",
472 "throw",
473 "try",
474 "catch",
475 "finally",
476 "string",
477 "number",
478 "boolean",
479 "any",
480 "unknown",
481 "never",
482 "bigint",
483 "symbol",
484 "Math",
485 "JSON",
486 "Object",
487 "Array",
488 "Promise",
489 "BigInt",
490 "Number",
491 "String",
492 "Boolean",
493 "Symbol",
494 "RegExp",
495 "Date",
496];
497
498fn is_generic_identifier(word: &str) -> bool {
499 word.chars().count() == 1 || GENERIC_IDENTIFIERS.contains(&word)
500}
501
502#[cfg(test)]
503mod tests {
504 use std::path::PathBuf;
505
506 use super::*;
507
508 fn instance(fragment: &str) -> CloneInstance {
509 CloneInstance {
510 file: PathBuf::from("a.ts"),
511 start_line: 1,
512 end_line: 5,
513 start_col: 0,
514 end_col: 0,
515 fragment: fragment.to_string(),
516 }
517 }
518
519 fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
520 CloneGroup {
521 instances: fragments.iter().map(|f| instance(f)).collect(),
522 token_count: 40,
523 line_count,
524 }
525 }
526
527 #[test]
528 fn fingerprint_is_stable_and_prefixed() {
529 let g = group(&["foo(bar)", "foo(baz)"], 3);
530 let fp1 = clone_fingerprint(&g.instances);
531 let fp2 = clone_fingerprint(&g.instances);
532 assert_eq!(fp1, fp2);
533 assert!(fp1.starts_with("dup:"));
534 assert_eq!(fp1.len(), "dup:".len() + 8);
535 }
536
537 #[test]
538 fn fingerprint_is_sibling_stable() {
539 let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
540 let before = clone_fingerprint(&group_a.instances);
541 let _group_b_edited = group(&["totallyDifferentBody()"], 2);
542 let after = clone_fingerprint(&group_a.instances);
543 assert_eq!(before, after);
544 }
545
546 #[test]
547 fn fingerprint_differs_for_different_content() {
548 let a = group(&["alpha()"], 2);
549 let b = group(&["beta()"], 2);
550 assert_ne!(
551 clone_fingerprint(&a.instances),
552 clone_fingerprint(&b.instances)
553 );
554 }
555
556 #[test]
557 fn fingerprint_set_widens_only_colliding_short_handles() {
558 let a = group(&["alpha()"], 2);
559 let b = group(&["beta()"], 2);
560 let c = group(&["gamma()"], 2);
561 let entries = vec![
562 (
563 CloneFingerprintKey::from_group(&a),
564 0x0000_0001_1234_5678_u64,
565 ),
566 (
567 CloneFingerprintKey::from_group(&b),
568 0x0000_0002_1234_5678_u64,
569 ),
570 (
571 CloneFingerprintKey::from_group(&c),
572 0x0000_0003_8765_4321_u64,
573 ),
574 ];
575
576 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
577
578 assert_eq!(
579 fingerprints.fingerprint_for_group(&a),
580 "dup:0000000112345678"
581 );
582 assert_eq!(
583 fingerprints.fingerprint_for_group(&b),
584 "dup:0000000212345678"
585 );
586 assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
587 assert!(
588 fingerprints
589 .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
590 .is_none()
591 );
592 assert_eq!(
593 fingerprints
594 .find_group(&[a, b, c], "dup:0000000212345678")
595 .and_then(|group| group.instances.first())
596 .map(|inst| inst.fragment.as_str()),
597 Some("beta()")
598 );
599 }
600
601 #[test]
602 fn fingerprint_set_suffixes_full_hash_collisions() {
603 let a = group(&["alpha()"], 2);
604 let b = group(&["beta()"], 2);
605 let entries = vec![
606 (
607 CloneFingerprintKey::from_group(&a),
608 0x0000_0001_1234_5678_u64,
609 ),
610 (
611 CloneFingerprintKey::from_group(&b),
612 0x0000_0001_1234_5678_u64,
613 ),
614 ];
615
616 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
617
618 assert_eq!(
619 fingerprints.fingerprint_for_group(&a),
620 "dup:0000000112345678-1"
621 );
622 assert_eq!(
623 fingerprints.fingerprint_for_group(&b),
624 "dup:0000000112345678-2"
625 );
626 assert!(
627 fingerprints
628 .find_group(&[a.clone(), b.clone()], "dup:12345678")
629 .is_none()
630 );
631 assert!(
632 fingerprints
633 .find_group(&[a, b], "dup:0000000112345678")
634 .is_none()
635 );
636 }
637
638 #[test]
639 fn group_suggestion_savings_is_lines_times_extra_copies() {
640 let g = group(&["x", "x", "x"], 10); let suggestion = group_refactoring_suggestion(&g);
642 assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
643 assert_eq!(suggestion.estimated_savings, 20); }
645
646 #[test]
647 fn dominant_identifier_picks_repeated_domain_name() {
648 let g = group(
649 &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
650 3,
651 );
652 assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
653 }
654
655 #[test]
656 fn dominant_identifier_none_on_generic() {
657 let g = group(&["const data = result.map((item) => item.value);"], 3);
658 assert_eq!(dominant_identifier(&g), None);
659 }
660
661 #[test]
662 fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
663 let g = group(
664 &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
665 4,
666 );
667 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
668 let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
669 assert_eq!(dominant_identifier(&only_keywords), None);
670 let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
671 assert_eq!(dominant_identifier(&g_global), None);
672 }
673
674 #[test]
675 fn dominant_identifier_none_on_single_letter_type_param() {
676 let g = group(
677 &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
678 3,
679 );
680 assert_eq!(dominant_identifier(&g), None);
681 }
682
683 #[test]
684 fn dominant_identifier_none_on_tie() {
685 let g = group(&["alpha(); beta();"], 2); assert_eq!(dominant_identifier(&g), None);
687 }
688
689 #[test]
690 fn dominant_identifier_prefers_structured_names() {
691 let g = group(
692 &["parseSchema(input); parseSchema(cache); helper(); helper();"],
693 3,
694 );
695 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
696 }
697
698 #[test]
699 fn dominant_identifier_requires_plain_token_margin() {
700 let low_signal = group(&["schema(); schema(); parseUser();"], 3);
701 assert_eq!(dominant_identifier(&low_signal), None);
702
703 let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
704 assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
705 }
706
707 #[test]
708 fn dominant_identifier_is_stable_across_word_order() {
709 let first = group(
710 &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
711 3,
712 );
713 let second = group(
714 &["parseSchema(input); helper(); parseSchema(cache); helper();"],
715 3,
716 );
717
718 assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
719 assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
720 }
721}