1use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28 representative_fragment: String,
29 token_count: usize,
30 line_count: usize,
31 instance_count: usize,
32 first_file: Option<PathBuf>,
33 first_start_line: usize,
34 first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38 #[must_use]
40 pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41 let first = instances.first();
42 Self {
43 representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44 token_count,
45 line_count,
46 instance_count: instances.len(),
47 first_file: first.map(|inst| inst.file.clone()),
48 first_start_line: first.map_or(0, |inst| inst.start_line),
49 first_end_line: first.map_or(0, |inst| inst.end_line),
50 }
51 }
52
53 fn from_group(group: &CloneGroup) -> Self {
54 Self::from_parts(&group.instances, group.token_count, group.line_count)
55 }
56
57 fn representative_fragment(&self) -> &str {
58 &self.representative_fragment
59 }
60}
61
62#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70 by_key: FxHashMap<CloneFingerprintKey, String>,
71 key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75 #[must_use]
77 pub fn from_groups(groups: &[CloneGroup]) -> Self {
78 let entries: Vec<_> = groups
79 .iter()
80 .map(|group| {
81 let key = CloneFingerprintKey::from_group(group);
82 let hash = hash_fragment(key.representative_fragment());
83 (key, hash)
84 })
85 .collect();
86 Self::from_hashed_entries(&entries)
87 }
88
89 #[must_use]
91 pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92 self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93 }
94
95 #[must_use]
97 pub fn fingerprint_for_parts(
98 &self,
99 instances: &[CloneInstance],
100 token_count: usize,
101 line_count: usize,
102 ) -> String {
103 self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104 instances,
105 token_count,
106 line_count,
107 ))
108 }
109
110 #[must_use]
113 pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114 self.by_key
115 .get(key)
116 .cloned()
117 .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118 }
119
120 #[must_use]
126 pub fn find_group<'a>(
127 &self,
128 groups: &'a [CloneGroup],
129 fingerprint: &str,
130 ) -> Option<&'a CloneGroup> {
131 let key = self.key_by_fingerprint.get(fingerprint)?;
132 groups
133 .iter()
134 .find(|group| CloneFingerprintKey::from_group(group) == *key)
135 }
136
137 fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138 let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139 let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140 for (_, hash) in entries {
141 *short_counts.entry(*hash as u32).or_insert(0) += 1;
142 *full_counts.entry(*hash).or_insert(0) += 1;
143 }
144
145 let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146 let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147 let mut by_key = FxHashMap::default();
148 let mut key_by_fingerprint = FxHashMap::default();
149
150 for (key, hash) in entries {
151 let short = *hash as u32;
152 let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153 let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154 short_handle
155 } else {
156 ambiguous_short_handles.insert(short_handle);
157 let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158 if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159 full_handle
160 } else {
161 let ordinal = full_ordinals.entry(*hash).or_insert(0);
162 *ordinal += 1;
163 format!("{full_handle}-{ordinal}")
164 }
165 };
166
167 key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168 by_key.insert(key.clone(), fingerprint);
169 }
170
171 for handle in ambiguous_short_handles {
172 key_by_fingerprint.remove(&handle);
173 }
174
175 Self {
176 by_key,
177 key_by_fingerprint,
178 }
179 }
180}
181
182#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203 let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204 fingerprint_for_fragment(representative)
205}
206
207#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214 let hash = hash_fragment(fragment);
215 format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
216}
217
218fn hash_fragment(fragment: &str) -> u64 {
227 if fragment.as_bytes().contains(&b'\r') {
228 xxh3_64(fragment.replace('\r', "").as_bytes())
229 } else {
230 xxh3_64(fragment.as_bytes())
231 }
232}
233
234#[must_use]
240pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
241 let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
242 RefactoringSuggestion {
243 kind: RefactoringKind::ExtractFunction,
244 description: format!(
245 "Extract the shared {}-line block into one function and call it from {} sites",
246 group.line_count,
247 group.instances.len(),
248 ),
249 estimated_savings,
250 }
251}
252
253#[must_use]
262pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
263 let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
264 let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
265 for word in identifier_words(fragment) {
266 if is_generic_identifier(word) {
267 continue;
268 }
269 *counts.entry(word).or_insert(0) += 1;
270 }
271
272 let mut candidates: Vec<_> = counts
273 .into_iter()
274 .map(|(word, count)| IdentifierCandidate {
275 word,
276 count,
277 score: identifier_score(word, count),
278 })
279 .collect();
280 candidates.sort_by(|a, b| {
281 b.score
282 .cmp(&a.score)
283 .then_with(|| b.count.cmp(&a.count))
284 .then_with(|| a.word.cmp(b.word))
285 });
286
287 let best = candidates.first()?;
288 if best.count < 2 {
289 return None;
290 }
291
292 let runner_up = candidates.get(1);
293 if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
294 return None;
295 }
296
297 if is_plain_single_token(best.word) {
298 let next_count = runner_up.map_or(0, |candidate| candidate.count);
299 if best.count < 3 || best.count < next_count + 2 {
300 return None;
301 }
302 }
303
304 Some(best.word.to_string())
305}
306
307#[derive(Debug)]
308struct IdentifierCandidate<'a> {
309 word: &'a str,
310 count: usize,
311 score: usize,
312}
313
314fn identifier_score(word: &str, count: usize) -> usize {
315 let quality_bonus = if has_identifier_separator_or_case_transition(word) {
316 5
317 } else if word.chars().count() >= 8 {
318 2
319 } else {
320 0
321 };
322 count * 5 + quality_bonus
323}
324
325fn is_plain_single_token(word: &str) -> bool {
326 !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
327}
328
329fn has_identifier_separator_or_case_transition(word: &str) -> bool {
330 word.contains('_')
331 || word.contains('$')
332 || word
333 .chars()
334 .collect::<Vec<_>>()
335 .windows(2)
336 .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
337}
338
339fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
341 source
342 .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
343 .filter(|word| {
344 !word.is_empty()
345 && word
346 .chars()
347 .next()
348 .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
349 })
350}
351
352fn is_generic_identifier(word: &str) -> bool {
355 if word.chars().count() == 1 {
356 return true;
357 }
358 matches!(
359 word,
360 "data"
361 | "result"
362 | "results"
363 | "item"
364 | "items"
365 | "value"
366 | "values"
367 | "val"
368 | "obj"
369 | "object"
370 | "arr"
371 | "array"
372 | "list"
373 | "map"
374 | "set"
375 | "key"
376 | "keys"
377 | "tmp"
378 | "temp"
379 | "acc"
380 | "cur"
381 | "curr"
382 | "prev"
383 | "next"
384 | "node"
385 | "el"
386 | "elem"
387 | "element"
388 | "args"
389 | "arg"
390 | "opts"
391 | "options"
392 | "params"
393 | "param"
394 | "props"
395 | "ctx"
396 | "context"
397 | "res"
398 | "req"
399 | "err"
400 | "error"
401 | "fn"
402 | "cb"
403 | "callback"
404 | "out"
405 | "input"
406 | "output"
407 | "name"
408 | "id"
409 | "index"
410 | "idx"
411 | "x"
412 | "y"
413 | "z"
414 | "i"
415 | "j"
416 | "k"
417 | "n"
418 | "m"
419 | "a"
420 | "b"
421 | "c"
422 | "e"
423 | "_"
424 | "const"
425 | "let"
426 | "var"
427 | "function"
428 | "return"
429 | "if"
430 | "else"
431 | "for"
432 | "while"
433 | "do"
434 | "switch"
435 | "case"
436 | "break"
437 | "continue"
438 | "new"
439 | "this"
440 | "true"
441 | "false"
442 | "null"
443 | "undefined"
444 | "void"
445 | "typeof"
446 | "instanceof"
447 | "in"
448 | "of"
449 | "class"
450 | "extends"
451 | "super"
452 | "import"
453 | "export"
454 | "from"
455 | "default"
456 | "async"
457 | "await"
458 | "yield"
459 | "type"
460 | "interface"
461 | "enum"
462 | "as"
463 | "is"
464 | "keyof"
465 | "readonly"
466 | "public"
467 | "private"
468 | "protected"
469 | "static"
470 | "get"
471 | "delete"
472 | "throw"
473 | "try"
474 | "catch"
475 | "finally"
476 | "string"
477 | "number"
478 | "boolean"
479 | "any"
480 | "unknown"
481 | "never"
482 | "bigint"
483 | "symbol"
484 | "Math"
485 | "JSON"
486 | "Object"
487 | "Array"
488 | "Promise"
489 | "BigInt"
490 | "Number"
491 | "String"
492 | "Boolean"
493 | "Symbol"
494 | "RegExp"
495 | "Date"
496 )
497}
498
499#[cfg(test)]
500mod tests {
501 use std::path::PathBuf;
502
503 use super::*;
504
505 fn instance(fragment: &str) -> CloneInstance {
506 CloneInstance {
507 file: PathBuf::from("a.ts"),
508 start_line: 1,
509 end_line: 5,
510 start_col: 0,
511 end_col: 0,
512 fragment: fragment.to_string(),
513 }
514 }
515
516 fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
517 CloneGroup {
518 instances: fragments.iter().map(|f| instance(f)).collect(),
519 token_count: 40,
520 line_count,
521 }
522 }
523
524 #[test]
525 fn fingerprint_is_stable_and_prefixed() {
526 let g = group(&["foo(bar)", "foo(baz)"], 3);
527 let fp1 = clone_fingerprint(&g.instances);
528 let fp2 = clone_fingerprint(&g.instances);
529 assert_eq!(fp1, fp2);
530 assert!(fp1.starts_with("dup:"));
531 assert_eq!(fp1.len(), "dup:".len() + 8);
532 }
533
534 #[test]
535 fn fingerprint_is_sibling_stable() {
536 let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
537 let before = clone_fingerprint(&group_a.instances);
538 let _group_b_edited = group(&["totallyDifferentBody()"], 2);
539 let after = clone_fingerprint(&group_a.instances);
540 assert_eq!(before, after);
541 }
542
543 #[test]
544 fn fingerprint_differs_for_different_content() {
545 let a = group(&["alpha()"], 2);
546 let b = group(&["beta()"], 2);
547 assert_ne!(
548 clone_fingerprint(&a.instances),
549 clone_fingerprint(&b.instances)
550 );
551 }
552
553 #[test]
554 fn fingerprint_set_widens_only_colliding_short_handles() {
555 let a = group(&["alpha()"], 2);
556 let b = group(&["beta()"], 2);
557 let c = group(&["gamma()"], 2);
558 let entries = vec![
559 (
560 CloneFingerprintKey::from_group(&a),
561 0x0000_0001_1234_5678_u64,
562 ),
563 (
564 CloneFingerprintKey::from_group(&b),
565 0x0000_0002_1234_5678_u64,
566 ),
567 (
568 CloneFingerprintKey::from_group(&c),
569 0x0000_0003_8765_4321_u64,
570 ),
571 ];
572
573 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
574
575 assert_eq!(
576 fingerprints.fingerprint_for_group(&a),
577 "dup:0000000112345678"
578 );
579 assert_eq!(
580 fingerprints.fingerprint_for_group(&b),
581 "dup:0000000212345678"
582 );
583 assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
584 assert!(
585 fingerprints
586 .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
587 .is_none()
588 );
589 assert_eq!(
590 fingerprints
591 .find_group(&[a, b, c], "dup:0000000212345678")
592 .and_then(|group| group.instances.first())
593 .map(|inst| inst.fragment.as_str()),
594 Some("beta()")
595 );
596 }
597
598 #[test]
599 fn fingerprint_set_suffixes_full_hash_collisions() {
600 let a = group(&["alpha()"], 2);
601 let b = group(&["beta()"], 2);
602 let entries = vec![
603 (
604 CloneFingerprintKey::from_group(&a),
605 0x0000_0001_1234_5678_u64,
606 ),
607 (
608 CloneFingerprintKey::from_group(&b),
609 0x0000_0001_1234_5678_u64,
610 ),
611 ];
612
613 let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
614
615 assert_eq!(
616 fingerprints.fingerprint_for_group(&a),
617 "dup:0000000112345678-1"
618 );
619 assert_eq!(
620 fingerprints.fingerprint_for_group(&b),
621 "dup:0000000112345678-2"
622 );
623 assert!(
624 fingerprints
625 .find_group(&[a.clone(), b.clone()], "dup:12345678")
626 .is_none()
627 );
628 assert!(
629 fingerprints
630 .find_group(&[a, b], "dup:0000000112345678")
631 .is_none()
632 );
633 }
634
635 #[test]
636 fn group_suggestion_savings_is_lines_times_extra_copies() {
637 let g = group(&["x", "x", "x"], 10); let suggestion = group_refactoring_suggestion(&g);
639 assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
640 assert_eq!(suggestion.estimated_savings, 20); }
642
643 #[test]
644 fn dominant_identifier_picks_repeated_domain_name() {
645 let g = group(
646 &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
647 3,
648 );
649 assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
650 }
651
652 #[test]
653 fn dominant_identifier_none_on_generic() {
654 let g = group(&["const data = result.map((item) => item.value);"], 3);
655 assert_eq!(dominant_identifier(&g), None);
656 }
657
658 #[test]
659 fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
660 let g = group(
661 &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
662 4,
663 );
664 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
665 let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
666 assert_eq!(dominant_identifier(&only_keywords), None);
667 let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
668 assert_eq!(dominant_identifier(&g_global), None);
669 }
670
671 #[test]
672 fn dominant_identifier_none_on_single_letter_type_param() {
673 let g = group(
674 &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
675 3,
676 );
677 assert_eq!(dominant_identifier(&g), None);
678 }
679
680 #[test]
681 fn dominant_identifier_none_on_tie() {
682 let g = group(&["alpha(); beta();"], 2); assert_eq!(dominant_identifier(&g), None);
684 }
685
686 #[test]
687 fn dominant_identifier_prefers_structured_names() {
688 let g = group(
689 &["parseSchema(input); parseSchema(cache); helper(); helper();"],
690 3,
691 );
692 assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
693 }
694
695 #[test]
696 fn dominant_identifier_requires_plain_token_margin() {
697 let low_signal = group(&["schema(); schema(); parseUser();"], 3);
698 assert_eq!(dominant_identifier(&low_signal), None);
699
700 let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
701 assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
702 }
703
704 #[test]
705 fn dominant_identifier_is_stable_across_word_order() {
706 let first = group(
707 &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
708 3,
709 );
710 let second = group(
711 &["parseSchema(input); helper(); parseSchema(cache); helper();"],
712 3,
713 );
714
715 assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
716 assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
717 }
718}