Skip to main content

fallow_core/duplicates/
deepdive.rs

1//! Deep-dive helpers for the `fallow dupes --trace` inspector: a stable
2//! content fingerprint that addresses a clone group across runs, a group-level
3//! refactoring suggestion, and a best-effort "dominant identifier" name for the
4//! extracted function.
5//!
6//! These are pure functions over [`CloneInstance`] / [`CloneGroup`] so every
7//! surface (human listing, `--trace dup:<fp>` lookup, the typed JSON wrappers,
8//! and `trace_clone`) computes the same values without storing a field on the
9//! core [`CloneGroup`] struct.
10
11use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18/// Prefix marking a clone-group fingerprint addressable via `--trace`.
19pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21/// Canonical identity for a clone group when assigning report-scoped handles.
22///
23/// The representative fragment keeps the handle content-derived. The structural
24/// and location fields make otherwise identical wrappers addressable when tests
25/// or future grouping modes present them as separate report entries.
26#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28    representative_fragment: String,
29    token_count: usize,
30    line_count: usize,
31    instance_count: usize,
32    first_file: Option<PathBuf>,
33    first_start_line: usize,
34    first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38    /// Build a fingerprint key from clone-group parts.
39    #[must_use]
40    pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41        let first = instances.first();
42        Self {
43            representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44            token_count,
45            line_count,
46            instance_count: instances.len(),
47            first_file: first.map(|inst| inst.file.clone()),
48            first_start_line: first.map_or(0, |inst| inst.start_line),
49            first_end_line: first.map_or(0, |inst| inst.end_line),
50        }
51    }
52
53    fn from_group(group: &CloneGroup) -> Self {
54        Self::from_parts(&group.instances, group.token_count, group.line_count)
55    }
56
57    fn representative_fragment(&self) -> &str {
58        &self.representative_fragment
59    }
60}
61
62/// Report-scoped clone fingerprint assignment.
63///
64/// Most reports retain the short `dup:<8hex>` handle. If two report entries
65/// collide on those low 32 bits, only the colliding entries widen to
66/// `dup:<16hex>`. If a full 64-bit collision ever occurs inside one report,
67/// every entry in that collision bucket receives a deterministic numeric suffix.
68#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70    by_key: FxHashMap<CloneFingerprintKey, String>,
71    key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75    /// Assign collision-free fingerprints for the report's clone groups.
76    #[must_use]
77    pub fn from_groups(groups: &[CloneGroup]) -> Self {
78        let entries: Vec<_> = groups
79            .iter()
80            .map(|group| {
81                let key = CloneFingerprintKey::from_group(group);
82                let hash = hash_fragment(key.representative_fragment());
83                (key, hash)
84            })
85            .collect();
86        Self::from_hashed_entries(&entries)
87    }
88
89    /// Return the assigned fingerprint for a clone group.
90    #[must_use]
91    pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92        self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93    }
94
95    /// Return the assigned fingerprint for clone-group parts.
96    #[must_use]
97    pub fn fingerprint_for_parts(
98        &self,
99        instances: &[CloneInstance],
100        token_count: usize,
101        line_count: usize,
102    ) -> String {
103        self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104            instances,
105            token_count,
106            line_count,
107        ))
108    }
109
110    /// Return the assigned fingerprint for a key, falling back to the legacy
111    /// short content handle when the key was not present in this report.
112    #[must_use]
113    pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114        self.by_key
115            .get(key)
116            .cloned()
117            .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118    }
119
120    /// Find the group addressed by an assigned fingerprint.
121    ///
122    /// Ambiguous short handles created by low-32 collisions are intentionally
123    /// absent from the lookup table, so callers get `None` instead of the first
124    /// matching group.
125    #[must_use]
126    pub fn find_group<'a>(
127        &self,
128        groups: &'a [CloneGroup],
129        fingerprint: &str,
130    ) -> Option<&'a CloneGroup> {
131        let key = self.key_by_fingerprint.get(fingerprint)?;
132        groups
133            .iter()
134            .find(|group| CloneFingerprintKey::from_group(group) == *key)
135    }
136
137    fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138        let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139        let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140        for (_, hash) in entries {
141            *short_counts.entry(*hash as u32).or_insert(0) += 1;
142            *full_counts.entry(*hash).or_insert(0) += 1;
143        }
144
145        let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146        let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147        let mut by_key = FxHashMap::default();
148        let mut key_by_fingerprint = FxHashMap::default();
149
150        for (key, hash) in entries {
151            let short = *hash as u32;
152            let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153            let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154                short_handle
155            } else {
156                ambiguous_short_handles.insert(short_handle);
157                let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158                if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159                    full_handle
160                } else {
161                    let ordinal = full_ordinals.entry(*hash).or_insert(0);
162                    *ordinal += 1;
163                    format!("{full_handle}-{ordinal}")
164                }
165            };
166
167            key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168            by_key.insert(key.clone(), fingerprint);
169        }
170
171        for handle in ambiguous_short_handles {
172            key_by_fingerprint.remove(&handle);
173        }
174
175        Self {
176            by_key,
177            key_by_fingerprint,
178        }
179    }
180}
181
182/// Compute the legacy short content fingerprint for a clone group from its
183/// instances.
184///
185/// The fingerprint is derived from the representative instance's raw source
186/// fragment (the first instance after [`super::types::DuplicationReport::sort`],
187/// which orders instances by `(file, line)`), so it is:
188///
189/// - content-derived, not line-derived (moving a clone down a file does not
190///   change it),
191/// - sibling-stable (editing one clone group never changes another group's
192///   fingerprint, since each hashes only its own content),
193///
194/// Use [`CloneFingerprintSet`] for user-facing report output, since it widens
195/// only the rare colliding handles while preserving this short form for the
196/// common case.
197///
198/// Hashes the empty string for an empty group (never produced by the detector,
199/// which guarantees `>= 2` instances), so the result is still a well-formed
200/// `dup:<8hex>` handle.
201#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203    let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204    fingerprint_for_fragment(representative)
205}
206
207/// Compute the fingerprint directly from a representative source fragment.
208///
209/// Use when the instances are wrapped (e.g. `--group-by` attributed instances)
210/// but the representative fragment is the same as the bare clone group's, so the
211/// fingerprint matches the top-level `clone_groups[].fingerprint` for the clone.
212#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214    let hash = hash_fragment(fragment);
215    format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
216}
217
218/// Hash a representative fragment, normalizing CRLF to LF first.
219///
220/// A clone group must get the same `dup:` fingerprint whether its source was
221/// checked out with Windows (`\r\n`) or Unix (`\n`) line endings; otherwise the
222/// same code yields different handles on a Windows dev machine versus a Linux CI
223/// runner, breaking `dupes --trace dup:<id>` and any fingerprint-keyed baseline
224/// across platforms. Stripping `\r` is a no-op on Unix-checkout fragments, so
225/// existing fingerprints are unchanged.
226fn hash_fragment(fragment: &str) -> u64 {
227    if fragment.as_bytes().contains(&b'\r') {
228        xxh3_64(fragment.replace('\r', "").as_bytes())
229    } else {
230        xxh3_64(fragment.as_bytes())
231    }
232}
233
234/// Build a per-group `ExtractFunction` refactoring suggestion.
235///
236/// Mirrors the per-group branch of [`super::families`]'s `generate_suggestions`:
237/// the savings is `(instances - 1)` copies of the group's line count, since one
238/// copy survives as the extracted function and the rest collapse to call sites.
239#[must_use]
240pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
241    let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
242    RefactoringSuggestion {
243        kind: RefactoringKind::ExtractFunction,
244        description: format!(
245            "Extract the shared {}-line block into one function and call it from {} sites",
246            group.line_count,
247            group.instances.len(),
248        ),
249        estimated_savings,
250    }
251}
252
253/// Best-effort name for the extracted function, derived from the most frequent
254/// non-generic identifier in the representative fragment.
255///
256/// Returns `None` when the dominant identifier is generic (`data`, `result`,
257/// loop counters), appears only once, or ties with another, so absence is the
258/// low-confidence signal for both human and agent consumers. This is a
259/// lexical heuristic over the raw fragment, not an AST analysis; it is advisory
260/// and consumers should verify before applying.
261#[must_use]
262pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
263    let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
264    let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
265    for word in identifier_words(fragment) {
266        if is_generic_identifier(word) {
267            continue;
268        }
269        *counts.entry(word).or_insert(0) += 1;
270    }
271
272    let mut candidates: Vec<_> = counts
273        .into_iter()
274        .map(|(word, count)| IdentifierCandidate {
275            word,
276            count,
277            score: identifier_score(word, count),
278        })
279        .collect();
280    candidates.sort_by(|a, b| {
281        b.score
282            .cmp(&a.score)
283            .then_with(|| b.count.cmp(&a.count))
284            .then_with(|| a.word.cmp(b.word))
285    });
286
287    let best = candidates.first()?;
288    if best.count < 2 {
289        return None;
290    }
291
292    let runner_up = candidates.get(1);
293    if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
294        return None;
295    }
296
297    if is_plain_single_token(best.word) {
298        let next_count = runner_up.map_or(0, |candidate| candidate.count);
299        if best.count < 3 || best.count < next_count + 2 {
300            return None;
301        }
302    }
303
304    Some(best.word.to_string())
305}
306
307#[derive(Debug)]
308struct IdentifierCandidate<'a> {
309    word: &'a str,
310    count: usize,
311    score: usize,
312}
313
314fn identifier_score(word: &str, count: usize) -> usize {
315    let quality_bonus = if has_identifier_separator_or_case_transition(word) {
316        5
317    } else if word.chars().count() >= 8 {
318        2
319    } else {
320        0
321    };
322    count * 5 + quality_bonus
323}
324
325fn is_plain_single_token(word: &str) -> bool {
326    !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
327}
328
329fn has_identifier_separator_or_case_transition(word: &str) -> bool {
330    word.contains('_')
331        || word.contains('$')
332        || word
333            .chars()
334            .collect::<Vec<_>>()
335            .windows(2)
336            .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
337}
338
339/// Yield identifier-like words (`[A-Za-z_$][A-Za-z0-9_$]*`) from raw source.
340fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
341    source
342        .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
343        .filter(|word| {
344            !word.is_empty()
345                && word
346                    .chars()
347                    .next()
348                    .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
349        })
350}
351
352/// Identifiers too generic to make a useful extracted-function name, plus the
353/// reserved words that show up as bare tokens in a fragment.
354const GENERIC_IDENTIFIERS: &[&str] = &[
355    "data",
356    "result",
357    "results",
358    "item",
359    "items",
360    "value",
361    "values",
362    "val",
363    "obj",
364    "object",
365    "arr",
366    "array",
367    "list",
368    "map",
369    "set",
370    "key",
371    "keys",
372    "tmp",
373    "temp",
374    "acc",
375    "cur",
376    "curr",
377    "prev",
378    "next",
379    "node",
380    "el",
381    "elem",
382    "element",
383    "args",
384    "arg",
385    "opts",
386    "options",
387    "params",
388    "param",
389    "props",
390    "ctx",
391    "context",
392    "res",
393    "req",
394    "err",
395    "error",
396    "fn",
397    "cb",
398    "callback",
399    "out",
400    "input",
401    "output",
402    "name",
403    "id",
404    "index",
405    "idx",
406    "x",
407    "y",
408    "z",
409    "i",
410    "j",
411    "k",
412    "n",
413    "m",
414    "a",
415    "b",
416    "c",
417    "e",
418    "_",
419    "const",
420    "let",
421    "var",
422    "function",
423    "return",
424    "if",
425    "else",
426    "for",
427    "while",
428    "do",
429    "switch",
430    "case",
431    "break",
432    "continue",
433    "new",
434    "this",
435    "true",
436    "false",
437    "null",
438    "undefined",
439    "void",
440    "typeof",
441    "instanceof",
442    "in",
443    "of",
444    "class",
445    "extends",
446    "super",
447    "import",
448    "export",
449    "from",
450    "default",
451    "async",
452    "await",
453    "yield",
454    "type",
455    "interface",
456    "enum",
457    "as",
458    "is",
459    "keyof",
460    "readonly",
461    "public",
462    "private",
463    "protected",
464    "static",
465    "get",
466    "delete",
467    "throw",
468    "try",
469    "catch",
470    "finally",
471    "string",
472    "number",
473    "boolean",
474    "any",
475    "unknown",
476    "never",
477    "bigint",
478    "symbol",
479    "Math",
480    "JSON",
481    "Object",
482    "Array",
483    "Promise",
484    "BigInt",
485    "Number",
486    "String",
487    "Boolean",
488    "Symbol",
489    "RegExp",
490    "Date",
491];
492
493fn is_generic_identifier(word: &str) -> bool {
494    word.chars().count() == 1 || GENERIC_IDENTIFIERS.contains(&word)
495}
496
497#[cfg(test)]
498mod tests {
499    use std::path::PathBuf;
500
501    use super::*;
502
503    fn instance(fragment: &str) -> CloneInstance {
504        CloneInstance {
505            file: PathBuf::from("a.ts"),
506            start_line: 1,
507            end_line: 5,
508            start_col: 0,
509            end_col: 0,
510            fragment: fragment.to_string(),
511        }
512    }
513
514    fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
515        CloneGroup {
516            instances: fragments.iter().map(|f| instance(f)).collect(),
517            token_count: 40,
518            line_count,
519        }
520    }
521
522    #[test]
523    fn fingerprint_is_stable_and_prefixed() {
524        let g = group(&["foo(bar)", "foo(baz)"], 3);
525        let fp1 = clone_fingerprint(&g.instances);
526        let fp2 = clone_fingerprint(&g.instances);
527        assert_eq!(fp1, fp2);
528        assert!(fp1.starts_with("dup:"));
529        assert_eq!(fp1.len(), "dup:".len() + 8);
530    }
531
532    #[test]
533    fn fingerprint_is_sibling_stable() {
534        let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
535        let before = clone_fingerprint(&group_a.instances);
536        let _group_b_edited = group(&["totallyDifferentBody()"], 2);
537        let after = clone_fingerprint(&group_a.instances);
538        assert_eq!(before, after);
539    }
540
541    #[test]
542    fn fingerprint_differs_for_different_content() {
543        let a = group(&["alpha()"], 2);
544        let b = group(&["beta()"], 2);
545        assert_ne!(
546            clone_fingerprint(&a.instances),
547            clone_fingerprint(&b.instances)
548        );
549    }
550
551    #[test]
552    fn fingerprint_set_widens_only_colliding_short_handles() {
553        let a = group(&["alpha()"], 2);
554        let b = group(&["beta()"], 2);
555        let c = group(&["gamma()"], 2);
556        let entries = vec![
557            (
558                CloneFingerprintKey::from_group(&a),
559                0x0000_0001_1234_5678_u64,
560            ),
561            (
562                CloneFingerprintKey::from_group(&b),
563                0x0000_0002_1234_5678_u64,
564            ),
565            (
566                CloneFingerprintKey::from_group(&c),
567                0x0000_0003_8765_4321_u64,
568            ),
569        ];
570
571        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
572
573        assert_eq!(
574            fingerprints.fingerprint_for_group(&a),
575            "dup:0000000112345678"
576        );
577        assert_eq!(
578            fingerprints.fingerprint_for_group(&b),
579            "dup:0000000212345678"
580        );
581        assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
582        assert!(
583            fingerprints
584                .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
585                .is_none()
586        );
587        assert_eq!(
588            fingerprints
589                .find_group(&[a, b, c], "dup:0000000212345678")
590                .and_then(|group| group.instances.first())
591                .map(|inst| inst.fragment.as_str()),
592            Some("beta()")
593        );
594    }
595
596    #[test]
597    fn fingerprint_set_suffixes_full_hash_collisions() {
598        let a = group(&["alpha()"], 2);
599        let b = group(&["beta()"], 2);
600        let entries = vec![
601            (
602                CloneFingerprintKey::from_group(&a),
603                0x0000_0001_1234_5678_u64,
604            ),
605            (
606                CloneFingerprintKey::from_group(&b),
607                0x0000_0001_1234_5678_u64,
608            ),
609        ];
610
611        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
612
613        assert_eq!(
614            fingerprints.fingerprint_for_group(&a),
615            "dup:0000000112345678-1"
616        );
617        assert_eq!(
618            fingerprints.fingerprint_for_group(&b),
619            "dup:0000000112345678-2"
620        );
621        assert!(
622            fingerprints
623                .find_group(&[a.clone(), b.clone()], "dup:12345678")
624                .is_none()
625        );
626        assert!(
627            fingerprints
628                .find_group(&[a, b], "dup:0000000112345678")
629                .is_none()
630        );
631    }
632
633    #[test]
634    fn group_suggestion_savings_is_lines_times_extra_copies() {
635        let g = group(&["x", "x", "x"], 10); // 3 instances, 10 lines
636        let suggestion = group_refactoring_suggestion(&g);
637        assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
638        assert_eq!(suggestion.estimated_savings, 20); // 10 * (3 - 1)
639    }
640
641    #[test]
642    fn dominant_identifier_picks_repeated_domain_name() {
643        let g = group(
644            &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
645            3,
646        );
647        assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
648    }
649
650    #[test]
651    fn dominant_identifier_none_on_generic() {
652        let g = group(&["const data = result.map((item) => item.value);"], 3);
653        assert_eq!(dominant_identifier(&g), None);
654    }
655
656    #[test]
657    fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
658        let g = group(
659            &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
660            4,
661        );
662        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
663        let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
664        assert_eq!(dominant_identifier(&only_keywords), None);
665        let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
666        assert_eq!(dominant_identifier(&g_global), None);
667    }
668
669    #[test]
670    fn dominant_identifier_none_on_single_letter_type_param() {
671        let g = group(
672            &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
673            3,
674        );
675        assert_eq!(dominant_identifier(&g), None);
676    }
677
678    #[test]
679    fn dominant_identifier_none_on_tie() {
680        let g = group(&["alpha(); beta();"], 2); // each appears once, no count >= 2
681        assert_eq!(dominant_identifier(&g), None);
682    }
683
684    #[test]
685    fn dominant_identifier_prefers_structured_names() {
686        let g = group(
687            &["parseSchema(input); parseSchema(cache); helper(); helper();"],
688            3,
689        );
690        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
691    }
692
693    #[test]
694    fn dominant_identifier_requires_plain_token_margin() {
695        let low_signal = group(&["schema(); schema(); parseUser();"], 3);
696        assert_eq!(dominant_identifier(&low_signal), None);
697
698        let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
699        assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
700    }
701
702    #[test]
703    fn dominant_identifier_is_stable_across_word_order() {
704        let first = group(
705            &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
706            3,
707        );
708        let second = group(
709            &["parseSchema(input); helper(); parseSchema(cache); helper();"],
710            3,
711        );
712
713        assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
714        assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
715    }
716}