Skip to main content

fallow_core/duplicates/
deepdive.rs

1//! Deep-dive helpers for the `fallow dupes --trace` inspector: a stable
2//! content fingerprint that addresses a clone group across runs, a group-level
3//! refactoring suggestion, and a best-effort "dominant identifier" name for the
4//! extracted function.
5//!
6//! These are pure functions over [`CloneInstance`] / [`CloneGroup`] so every
7//! surface (human listing, `--trace dup:<fp>` lookup, the typed JSON wrappers,
8//! and `trace_clone`) computes the same values without storing a field on the
9//! core [`CloneGroup`] struct.
10
11use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18/// Prefix marking a clone-group fingerprint addressable via `--trace`.
19pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21/// Canonical identity for a clone group when assigning report-scoped handles.
22///
23/// The representative fragment keeps the handle content-derived. The structural
24/// and location fields make otherwise identical wrappers addressable when tests
25/// or future grouping modes present them as separate report entries.
26#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28    representative_fragment: String,
29    token_count: usize,
30    line_count: usize,
31    instance_count: usize,
32    first_file: Option<PathBuf>,
33    first_start_line: usize,
34    first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38    /// Build a fingerprint key from clone-group parts.
39    #[must_use]
40    pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41        let first = instances.first();
42        Self {
43            representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44            token_count,
45            line_count,
46            instance_count: instances.len(),
47            first_file: first.map(|inst| inst.file.clone()),
48            first_start_line: first.map_or(0, |inst| inst.start_line),
49            first_end_line: first.map_or(0, |inst| inst.end_line),
50        }
51    }
52
53    fn from_group(group: &CloneGroup) -> Self {
54        Self::from_parts(&group.instances, group.token_count, group.line_count)
55    }
56
57    fn representative_fragment(&self) -> &str {
58        &self.representative_fragment
59    }
60}
61
62/// Report-scoped clone fingerprint assignment.
63///
64/// Most reports retain the short `dup:<8hex>` handle. If two report entries
65/// collide on those low 32 bits, only the colliding entries widen to
66/// `dup:<16hex>`. If a full 64-bit collision ever occurs inside one report,
67/// every entry in that collision bucket receives a deterministic numeric suffix.
68#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70    by_key: FxHashMap<CloneFingerprintKey, String>,
71    key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75    /// Assign collision-free fingerprints for the report's clone groups.
76    #[must_use]
77    pub fn from_groups(groups: &[CloneGroup]) -> Self {
78        let entries: Vec<_> = groups
79            .iter()
80            .map(|group| {
81                let key = CloneFingerprintKey::from_group(group);
82                let hash = hash_fragment(key.representative_fragment());
83                (key, hash)
84            })
85            .collect();
86        Self::from_hashed_entries(&entries)
87    }
88
89    /// Return the assigned fingerprint for a clone group.
90    #[must_use]
91    pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92        self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93    }
94
95    /// Return the assigned fingerprint for clone-group parts.
96    #[must_use]
97    pub fn fingerprint_for_parts(
98        &self,
99        instances: &[CloneInstance],
100        token_count: usize,
101        line_count: usize,
102    ) -> String {
103        self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104            instances,
105            token_count,
106            line_count,
107        ))
108    }
109
110    /// Return the assigned fingerprint for a key, falling back to the legacy
111    /// short content handle when the key was not present in this report.
112    #[must_use]
113    pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114        self.by_key
115            .get(key)
116            .cloned()
117            .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118    }
119
120    /// Find the group addressed by an assigned fingerprint.
121    ///
122    /// Ambiguous short handles created by low-32 collisions are intentionally
123    /// absent from the lookup table, so callers get `None` instead of the first
124    /// matching group.
125    #[must_use]
126    pub fn find_group<'a>(
127        &self,
128        groups: &'a [CloneGroup],
129        fingerprint: &str,
130    ) -> Option<&'a CloneGroup> {
131        let key = self.key_by_fingerprint.get(fingerprint)?;
132        groups
133            .iter()
134            .find(|group| CloneFingerprintKey::from_group(group) == *key)
135    }
136
137    fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138        let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139        let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140        for (_, hash) in entries {
141            *short_counts.entry(*hash as u32).or_insert(0) += 1;
142            *full_counts.entry(*hash).or_insert(0) += 1;
143        }
144
145        let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146        let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147        let mut by_key = FxHashMap::default();
148        let mut key_by_fingerprint = FxHashMap::default();
149
150        for (key, hash) in entries {
151            let short = *hash as u32;
152            let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153            let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154                short_handle
155            } else {
156                ambiguous_short_handles.insert(short_handle);
157                let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158                if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159                    full_handle
160                } else {
161                    let ordinal = full_ordinals.entry(*hash).or_insert(0);
162                    *ordinal += 1;
163                    format!("{full_handle}-{ordinal}")
164                }
165            };
166
167            key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168            by_key.insert(key.clone(), fingerprint);
169        }
170
171        for handle in ambiguous_short_handles {
172            key_by_fingerprint.remove(&handle);
173        }
174
175        Self {
176            by_key,
177            key_by_fingerprint,
178        }
179    }
180}
181
182/// Compute the legacy short content fingerprint for a clone group from its
183/// instances.
184///
185/// The fingerprint is derived from the representative instance's raw source
186/// fragment (the first instance after [`super::types::DuplicationReport::sort`],
187/// which orders instances by `(file, line)`), so it is:
188///
189/// - content-derived, not line-derived (moving a clone down a file does not
190///   change it),
191/// - sibling-stable (editing one clone group never changes another group's
192///   fingerprint, since each hashes only its own content),
193///
194/// Use [`CloneFingerprintSet`] for user-facing report output, since it widens
195/// only the rare colliding handles while preserving this short form for the
196/// common case.
197///
198/// Hashes the empty string for an empty group (never produced by the detector,
199/// which guarantees `>= 2` instances), so the result is still a well-formed
200/// `dup:<8hex>` handle.
201#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203    let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204    fingerprint_for_fragment(representative)
205}
206
207/// Compute the fingerprint directly from a representative source fragment.
208///
209/// Use when the instances are wrapped (e.g. `--group-by` attributed instances)
210/// but the representative fragment is the same as the bare clone group's, so the
211/// fingerprint matches the top-level `clone_groups[].fingerprint` for the clone.
212#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214    let hash = hash_fragment(fragment);
215    format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
216}
217
218/// Hash a representative fragment, normalizing CRLF to LF first.
219///
220/// A clone group must get the same `dup:` fingerprint whether its source was
221/// checked out with Windows (`\r\n`) or Unix (`\n`) line endings; otherwise the
222/// same code yields different handles on a Windows dev machine versus a Linux CI
223/// runner, breaking `dupes --trace dup:<id>` and any fingerprint-keyed baseline
224/// across platforms. Stripping `\r` is a no-op on Unix-checkout fragments, so
225/// existing fingerprints are unchanged.
226fn hash_fragment(fragment: &str) -> u64 {
227    if fragment.as_bytes().contains(&b'\r') {
228        xxh3_64(fragment.replace('\r', "").as_bytes())
229    } else {
230        xxh3_64(fragment.as_bytes())
231    }
232}
233
234/// Build a per-group `ExtractFunction` refactoring suggestion.
235///
236/// Mirrors the per-group branch of [`super::families`]'s `generate_suggestions`:
237/// the savings is `(instances - 1)` copies of the group's line count, since one
238/// copy survives as the extracted function and the rest collapse to call sites.
239#[must_use]
240pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
241    let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
242    RefactoringSuggestion {
243        kind: RefactoringKind::ExtractFunction,
244        description: format!(
245            "Extract the shared {}-line block into one function and call it from {} sites",
246            group.line_count,
247            group.instances.len(),
248        ),
249        estimated_savings,
250    }
251}
252
253/// Best-effort name for the extracted function, derived from the most frequent
254/// non-generic identifier in the representative fragment.
255///
256/// Returns `None` when the dominant identifier is generic (`data`, `result`,
257/// loop counters), appears only once, or ties with another, so absence is the
258/// low-confidence signal for both human and agent consumers. This is a
259/// lexical heuristic over the raw fragment, not an AST analysis; it is advisory
260/// and consumers should verify before applying.
261#[must_use]
262pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
263    let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
264    let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
265    for word in identifier_words(fragment) {
266        if is_generic_identifier(word) {
267            continue;
268        }
269        *counts.entry(word).or_insert(0) += 1;
270    }
271
272    let mut candidates: Vec<_> = counts
273        .into_iter()
274        .map(|(word, count)| IdentifierCandidate {
275            word,
276            count,
277            score: identifier_score(word, count),
278        })
279        .collect();
280    candidates.sort_by(|a, b| {
281        b.score
282            .cmp(&a.score)
283            .then_with(|| b.count.cmp(&a.count))
284            .then_with(|| a.word.cmp(b.word))
285    });
286
287    let best = candidates.first()?;
288    if best.count < 2 {
289        return None;
290    }
291
292    let runner_up = candidates.get(1);
293    if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
294        return None;
295    }
296
297    if is_plain_single_token(best.word) {
298        let next_count = runner_up.map_or(0, |candidate| candidate.count);
299        if best.count < 3 || best.count < next_count + 2 {
300            return None;
301        }
302    }
303
304    Some(best.word.to_string())
305}
306
307#[derive(Debug)]
308struct IdentifierCandidate<'a> {
309    word: &'a str,
310    count: usize,
311    score: usize,
312}
313
314fn identifier_score(word: &str, count: usize) -> usize {
315    let quality_bonus = if has_identifier_separator_or_case_transition(word) {
316        5
317    } else if word.chars().count() >= 8 {
318        2
319    } else {
320        0
321    };
322    count * 5 + quality_bonus
323}
324
325fn is_plain_single_token(word: &str) -> bool {
326    !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
327}
328
329fn has_identifier_separator_or_case_transition(word: &str) -> bool {
330    if word.contains('_') || word.contains('$') {
331        return true;
332    }
333
334    let mut previous = None;
335    for ch in word.chars() {
336        if previous.is_some_and(|prev: char| prev.is_ascii_lowercase() && ch.is_ascii_uppercase()) {
337            return true;
338        }
339        previous = Some(ch);
340    }
341    false
342}
343
344/// Yield identifier-like words (`[A-Za-z_$][A-Za-z0-9_$]*`) from raw source.
345fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
346    source
347        .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
348        .filter(|word| {
349            !word.is_empty()
350                && word
351                    .chars()
352                    .next()
353                    .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
354        })
355}
356
357/// Identifiers too generic to make a useful extracted-function name, plus the
358/// reserved words that show up as bare tokens in a fragment.
359const GENERIC_IDENTIFIERS: &[&str] = &[
360    "data",
361    "result",
362    "results",
363    "item",
364    "items",
365    "value",
366    "values",
367    "val",
368    "obj",
369    "object",
370    "arr",
371    "array",
372    "list",
373    "map",
374    "set",
375    "key",
376    "keys",
377    "tmp",
378    "temp",
379    "acc",
380    "cur",
381    "curr",
382    "prev",
383    "next",
384    "node",
385    "el",
386    "elem",
387    "element",
388    "args",
389    "arg",
390    "opts",
391    "options",
392    "params",
393    "param",
394    "props",
395    "ctx",
396    "context",
397    "res",
398    "req",
399    "err",
400    "error",
401    "fn",
402    "cb",
403    "callback",
404    "out",
405    "input",
406    "output",
407    "name",
408    "id",
409    "index",
410    "idx",
411    "x",
412    "y",
413    "z",
414    "i",
415    "j",
416    "k",
417    "n",
418    "m",
419    "a",
420    "b",
421    "c",
422    "e",
423    "_",
424    "const",
425    "let",
426    "var",
427    "function",
428    "return",
429    "if",
430    "else",
431    "for",
432    "while",
433    "do",
434    "switch",
435    "case",
436    "break",
437    "continue",
438    "new",
439    "this",
440    "true",
441    "false",
442    "null",
443    "undefined",
444    "void",
445    "typeof",
446    "instanceof",
447    "in",
448    "of",
449    "class",
450    "extends",
451    "super",
452    "import",
453    "export",
454    "from",
455    "default",
456    "async",
457    "await",
458    "yield",
459    "type",
460    "interface",
461    "enum",
462    "as",
463    "is",
464    "keyof",
465    "readonly",
466    "public",
467    "private",
468    "protected",
469    "static",
470    "get",
471    "delete",
472    "throw",
473    "try",
474    "catch",
475    "finally",
476    "string",
477    "number",
478    "boolean",
479    "any",
480    "unknown",
481    "never",
482    "bigint",
483    "symbol",
484    "Math",
485    "JSON",
486    "Object",
487    "Array",
488    "Promise",
489    "BigInt",
490    "Number",
491    "String",
492    "Boolean",
493    "Symbol",
494    "RegExp",
495    "Date",
496];
497
498fn is_generic_identifier(word: &str) -> bool {
499    word.chars().count() == 1 || GENERIC_IDENTIFIERS.contains(&word)
500}
501
502#[cfg(test)]
503mod tests {
504    use std::path::PathBuf;
505
506    use super::*;
507
508    fn instance(fragment: &str) -> CloneInstance {
509        CloneInstance {
510            file: PathBuf::from("a.ts"),
511            start_line: 1,
512            end_line: 5,
513            start_col: 0,
514            end_col: 0,
515            fragment: fragment.to_string(),
516        }
517    }
518
519    fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
520        CloneGroup {
521            instances: fragments.iter().map(|f| instance(f)).collect(),
522            token_count: 40,
523            line_count,
524        }
525    }
526
527    #[test]
528    fn fingerprint_is_stable_and_prefixed() {
529        let g = group(&["foo(bar)", "foo(baz)"], 3);
530        let fp1 = clone_fingerprint(&g.instances);
531        let fp2 = clone_fingerprint(&g.instances);
532        assert_eq!(fp1, fp2);
533        assert!(fp1.starts_with("dup:"));
534        assert_eq!(fp1.len(), "dup:".len() + 8);
535    }
536
537    #[test]
538    fn fingerprint_is_sibling_stable() {
539        let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
540        let before = clone_fingerprint(&group_a.instances);
541        let _group_b_edited = group(&["totallyDifferentBody()"], 2);
542        let after = clone_fingerprint(&group_a.instances);
543        assert_eq!(before, after);
544    }
545
546    #[test]
547    fn fingerprint_differs_for_different_content() {
548        let a = group(&["alpha()"], 2);
549        let b = group(&["beta()"], 2);
550        assert_ne!(
551            clone_fingerprint(&a.instances),
552            clone_fingerprint(&b.instances)
553        );
554    }
555
556    #[test]
557    fn fingerprint_set_widens_only_colliding_short_handles() {
558        let a = group(&["alpha()"], 2);
559        let b = group(&["beta()"], 2);
560        let c = group(&["gamma()"], 2);
561        let entries = vec![
562            (
563                CloneFingerprintKey::from_group(&a),
564                0x0000_0001_1234_5678_u64,
565            ),
566            (
567                CloneFingerprintKey::from_group(&b),
568                0x0000_0002_1234_5678_u64,
569            ),
570            (
571                CloneFingerprintKey::from_group(&c),
572                0x0000_0003_8765_4321_u64,
573            ),
574        ];
575
576        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
577
578        assert_eq!(
579            fingerprints.fingerprint_for_group(&a),
580            "dup:0000000112345678"
581        );
582        assert_eq!(
583            fingerprints.fingerprint_for_group(&b),
584            "dup:0000000212345678"
585        );
586        assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
587        assert!(
588            fingerprints
589                .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
590                .is_none()
591        );
592        assert_eq!(
593            fingerprints
594                .find_group(&[a, b, c], "dup:0000000212345678")
595                .and_then(|group| group.instances.first())
596                .map(|inst| inst.fragment.as_str()),
597            Some("beta()")
598        );
599    }
600
601    #[test]
602    fn fingerprint_set_suffixes_full_hash_collisions() {
603        let a = group(&["alpha()"], 2);
604        let b = group(&["beta()"], 2);
605        let entries = vec![
606            (
607                CloneFingerprintKey::from_group(&a),
608                0x0000_0001_1234_5678_u64,
609            ),
610            (
611                CloneFingerprintKey::from_group(&b),
612                0x0000_0001_1234_5678_u64,
613            ),
614        ];
615
616        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
617
618        assert_eq!(
619            fingerprints.fingerprint_for_group(&a),
620            "dup:0000000112345678-1"
621        );
622        assert_eq!(
623            fingerprints.fingerprint_for_group(&b),
624            "dup:0000000112345678-2"
625        );
626        assert!(
627            fingerprints
628                .find_group(&[a.clone(), b.clone()], "dup:12345678")
629                .is_none()
630        );
631        assert!(
632            fingerprints
633                .find_group(&[a, b], "dup:0000000112345678")
634                .is_none()
635        );
636    }
637
638    #[test]
639    fn group_suggestion_savings_is_lines_times_extra_copies() {
640        let g = group(&["x", "x", "x"], 10); // 3 instances, 10 lines
641        let suggestion = group_refactoring_suggestion(&g);
642        assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
643        assert_eq!(suggestion.estimated_savings, 20); // 10 * (3 - 1)
644    }
645
646    #[test]
647    fn dominant_identifier_picks_repeated_domain_name() {
648        let g = group(
649            &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
650            3,
651        );
652        assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
653    }
654
655    #[test]
656    fn dominant_identifier_none_on_generic() {
657        let g = group(&["const data = result.map((item) => item.value);"], 3);
658        assert_eq!(dominant_identifier(&g), None);
659    }
660
661    #[test]
662    fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
663        let g = group(
664            &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
665            4,
666        );
667        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
668        let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
669        assert_eq!(dominant_identifier(&only_keywords), None);
670        let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
671        assert_eq!(dominant_identifier(&g_global), None);
672    }
673
674    #[test]
675    fn dominant_identifier_none_on_single_letter_type_param() {
676        let g = group(
677            &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
678            3,
679        );
680        assert_eq!(dominant_identifier(&g), None);
681    }
682
683    #[test]
684    fn dominant_identifier_none_on_tie() {
685        let g = group(&["alpha(); beta();"], 2); // each appears once, no count >= 2
686        assert_eq!(dominant_identifier(&g), None);
687    }
688
689    #[test]
690    fn dominant_identifier_prefers_structured_names() {
691        let g = group(
692            &["parseSchema(input); parseSchema(cache); helper(); helper();"],
693            3,
694        );
695        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
696    }
697
698    #[test]
699    fn dominant_identifier_requires_plain_token_margin() {
700        let low_signal = group(&["schema(); schema(); parseUser();"], 3);
701        assert_eq!(dominant_identifier(&low_signal), None);
702
703        let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
704        assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
705    }
706
707    #[test]
708    fn dominant_identifier_is_stable_across_word_order() {
709        let first = group(
710            &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
711            3,
712        );
713        let second = group(
714            &["parseSchema(input); helper(); parseSchema(cache); helper();"],
715            3,
716        );
717
718        assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
719        assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
720    }
721}