Skip to main content

fallow_core/duplicates/
deepdive.rs

1//! Deep-dive helpers for the `fallow dupes --trace` inspector: a stable
2//! content fingerprint that addresses a clone group across runs, a group-level
3//! refactoring suggestion, and a best-effort "dominant identifier" name for the
4//! extracted function.
5//!
6//! These are pure functions over [`CloneInstance`] / [`CloneGroup`] so every
7//! surface (human listing, `--trace dup:<fp>` lookup, the typed JSON wrappers,
8//! and `trace_clone`) computes the same values without storing a field on the
9//! core [`CloneGroup`] struct.
10
11use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18/// Prefix marking a clone-group fingerprint addressable via `--trace`.
19pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21/// Canonical identity for a clone group when assigning report-scoped handles.
22///
23/// The representative fragment keeps the handle content-derived. The structural
24/// and location fields make otherwise identical wrappers addressable when tests
25/// or future grouping modes present them as separate report entries.
26#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28    representative_fragment: String,
29    token_count: usize,
30    line_count: usize,
31    instance_count: usize,
32    first_file: Option<PathBuf>,
33    first_start_line: usize,
34    first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38    /// Build a fingerprint key from clone-group parts.
39    #[must_use]
40    pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41        let first = instances.first();
42        Self {
43            representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44            token_count,
45            line_count,
46            instance_count: instances.len(),
47            first_file: first.map(|inst| inst.file.clone()),
48            first_start_line: first.map_or(0, |inst| inst.start_line),
49            first_end_line: first.map_or(0, |inst| inst.end_line),
50        }
51    }
52
53    fn from_group(group: &CloneGroup) -> Self {
54        Self::from_parts(&group.instances, group.token_count, group.line_count)
55    }
56
57    fn representative_fragment(&self) -> &str {
58        &self.representative_fragment
59    }
60}
61
62/// Report-scoped clone fingerprint assignment.
63///
64/// Most reports retain the short `dup:<8hex>` handle. If two report entries
65/// collide on those low 32 bits, only the colliding entries widen to
66/// `dup:<16hex>`. If a full 64-bit collision ever occurs inside one report,
67/// every entry in that collision bucket receives a deterministic numeric suffix.
68#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70    by_key: FxHashMap<CloneFingerprintKey, String>,
71    key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75    /// Assign collision-free fingerprints for the report's clone groups.
76    #[must_use]
77    pub fn from_groups(groups: &[CloneGroup]) -> Self {
78        let entries: Vec<_> = groups
79            .iter()
80            .map(|group| {
81                let key = CloneFingerprintKey::from_group(group);
82                let hash = hash_fragment(key.representative_fragment());
83                (key, hash)
84            })
85            .collect();
86        Self::from_hashed_entries(&entries)
87    }
88
89    /// Return the assigned fingerprint for a clone group.
90    #[must_use]
91    pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92        self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93    }
94
95    /// Return the assigned fingerprint for clone-group parts.
96    #[must_use]
97    pub fn fingerprint_for_parts(
98        &self,
99        instances: &[CloneInstance],
100        token_count: usize,
101        line_count: usize,
102    ) -> String {
103        self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104            instances,
105            token_count,
106            line_count,
107        ))
108    }
109
110    /// Return the assigned fingerprint for a key, falling back to the legacy
111    /// short content handle when the key was not present in this report.
112    #[must_use]
113    pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114        self.by_key
115            .get(key)
116            .cloned()
117            .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118    }
119
120    /// Find the group addressed by an assigned fingerprint.
121    ///
122    /// Ambiguous short handles created by low-32 collisions are intentionally
123    /// absent from the lookup table, so callers get `None` instead of the first
124    /// matching group.
125    #[must_use]
126    pub fn find_group<'a>(
127        &self,
128        groups: &'a [CloneGroup],
129        fingerprint: &str,
130    ) -> Option<&'a CloneGroup> {
131        let key = self.key_by_fingerprint.get(fingerprint)?;
132        groups
133            .iter()
134            .find(|group| CloneFingerprintKey::from_group(group) == *key)
135    }
136
137    fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138        let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139        let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140        for (_, hash) in entries {
141            *short_counts.entry(*hash as u32).or_insert(0) += 1;
142            *full_counts.entry(*hash).or_insert(0) += 1;
143        }
144
145        let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146        let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147        let mut by_key = FxHashMap::default();
148        let mut key_by_fingerprint = FxHashMap::default();
149
150        for (key, hash) in entries {
151            let short = *hash as u32;
152            let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153            let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154                short_handle
155            } else {
156                ambiguous_short_handles.insert(short_handle);
157                let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158                if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159                    full_handle
160                } else {
161                    let ordinal = full_ordinals.entry(*hash).or_insert(0);
162                    *ordinal += 1;
163                    format!("{full_handle}-{ordinal}")
164                }
165            };
166
167            key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168            by_key.insert(key.clone(), fingerprint);
169        }
170
171        for handle in ambiguous_short_handles {
172            key_by_fingerprint.remove(&handle);
173        }
174
175        Self {
176            by_key,
177            key_by_fingerprint,
178        }
179    }
180}
181
182/// Compute the legacy short content fingerprint for a clone group from its
183/// instances.
184///
185/// The fingerprint is derived from the representative instance's raw source
186/// fragment (the first instance after [`super::types::DuplicationReport::sort`],
187/// which orders instances by `(file, line)`), so it is:
188///
189/// - content-derived, not line-derived (moving a clone down a file does not
190///   change it),
191/// - sibling-stable (editing one clone group never changes another group's
192///   fingerprint, since each hashes only its own content),
193///
194/// Use [`CloneFingerprintSet`] for user-facing report output, since it widens
195/// only the rare colliding handles while preserving this short form for the
196/// common case.
197///
198/// Hashes the empty string for an empty group (never produced by the detector,
199/// which guarantees `>= 2` instances), so the result is still a well-formed
200/// `dup:<8hex>` handle.
201#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203    let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204    fingerprint_for_fragment(representative)
205}
206
207/// Compute the fingerprint directly from a representative source fragment.
208///
209/// Use when the instances are wrapped (e.g. `--group-by` attributed instances)
210/// but the representative fragment is the same as the bare clone group's, so the
211/// fingerprint matches the top-level `clone_groups[].fingerprint` for the clone.
212#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214    let hash = hash_fragment(fragment);
215    format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
216}
217
218/// Hash a representative fragment, normalizing CRLF to LF first.
219///
220/// A clone group must get the same `dup:` fingerprint whether its source was
221/// checked out with Windows (`\r\n`) or Unix (`\n`) line endings; otherwise the
222/// same code yields different handles on a Windows dev machine versus a Linux CI
223/// runner, breaking `dupes --trace dup:<id>` and any fingerprint-keyed baseline
224/// across platforms. Stripping `\r` is a no-op on Unix-checkout fragments, so
225/// existing fingerprints are unchanged.
226fn hash_fragment(fragment: &str) -> u64 {
227    if fragment.as_bytes().contains(&b'\r') {
228        xxh3_64(fragment.replace('\r', "").as_bytes())
229    } else {
230        xxh3_64(fragment.as_bytes())
231    }
232}
233
234/// Build a per-group `ExtractFunction` refactoring suggestion.
235///
236/// Mirrors the per-group branch of [`super::families`]'s `generate_suggestions`:
237/// the savings is `(instances - 1)` copies of the group's line count, since one
238/// copy survives as the extracted function and the rest collapse to call sites.
239#[must_use]
240pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
241    let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
242    RefactoringSuggestion {
243        kind: RefactoringKind::ExtractFunction,
244        description: format!(
245            "Extract the shared {}-line block into one function and call it from {} sites",
246            group.line_count,
247            group.instances.len(),
248        ),
249        estimated_savings,
250    }
251}
252
253/// Best-effort name for the extracted function, derived from the most frequent
254/// non-generic identifier in the representative fragment.
255///
256/// Returns `None` when the dominant identifier is generic (`data`, `result`,
257/// loop counters), appears only once, or ties with another, so absence is the
258/// low-confidence signal for both human and agent consumers. This is a
259/// lexical heuristic over the raw fragment, not an AST analysis; it is advisory
260/// and consumers should verify before applying.
261#[must_use]
262pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
263    let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
264    let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
265    for word in identifier_words(fragment) {
266        if is_generic_identifier(word) {
267            continue;
268        }
269        *counts.entry(word).or_insert(0) += 1;
270    }
271
272    let mut candidates: Vec<_> = counts
273        .into_iter()
274        .map(|(word, count)| IdentifierCandidate {
275            word,
276            count,
277            score: identifier_score(word, count),
278        })
279        .collect();
280    candidates.sort_by(|a, b| {
281        b.score
282            .cmp(&a.score)
283            .then_with(|| b.count.cmp(&a.count))
284            .then_with(|| a.word.cmp(b.word))
285    });
286
287    let best = candidates.first()?;
288    if best.count < 2 {
289        return None;
290    }
291
292    let runner_up = candidates.get(1);
293    if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
294        return None;
295    }
296
297    if is_plain_single_token(best.word) {
298        let next_count = runner_up.map_or(0, |candidate| candidate.count);
299        if best.count < 3 || best.count < next_count + 2 {
300            return None;
301        }
302    }
303
304    Some(best.word.to_string())
305}
306
307#[derive(Debug)]
308struct IdentifierCandidate<'a> {
309    word: &'a str,
310    count: usize,
311    score: usize,
312}
313
314fn identifier_score(word: &str, count: usize) -> usize {
315    let quality_bonus = if has_identifier_separator_or_case_transition(word) {
316        5
317    } else if word.chars().count() >= 8 {
318        2
319    } else {
320        0
321    };
322    count * 5 + quality_bonus
323}
324
325fn is_plain_single_token(word: &str) -> bool {
326    !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
327}
328
329fn has_identifier_separator_or_case_transition(word: &str) -> bool {
330    word.contains('_')
331        || word.contains('$')
332        || word
333            .chars()
334            .collect::<Vec<_>>()
335            .windows(2)
336            .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
337}
338
339/// Yield identifier-like words (`[A-Za-z_$][A-Za-z0-9_$]*`) from raw source.
340fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
341    source
342        .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
343        .filter(|word| {
344            !word.is_empty()
345                && word
346                    .chars()
347                    .next()
348                    .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
349        })
350}
351
352/// Identifiers too generic to make a useful extracted-function name, plus the
353/// reserved words that show up as bare tokens in a fragment.
354fn is_generic_identifier(word: &str) -> bool {
355    if word.chars().count() == 1 {
356        return true;
357    }
358    matches!(
359        word,
360        "data"
361            | "result"
362            | "results"
363            | "item"
364            | "items"
365            | "value"
366            | "values"
367            | "val"
368            | "obj"
369            | "object"
370            | "arr"
371            | "array"
372            | "list"
373            | "map"
374            | "set"
375            | "key"
376            | "keys"
377            | "tmp"
378            | "temp"
379            | "acc"
380            | "cur"
381            | "curr"
382            | "prev"
383            | "next"
384            | "node"
385            | "el"
386            | "elem"
387            | "element"
388            | "args"
389            | "arg"
390            | "opts"
391            | "options"
392            | "params"
393            | "param"
394            | "props"
395            | "ctx"
396            | "context"
397            | "res"
398            | "req"
399            | "err"
400            | "error"
401            | "fn"
402            | "cb"
403            | "callback"
404            | "out"
405            | "input"
406            | "output"
407            | "name"
408            | "id"
409            | "index"
410            | "idx"
411            | "x"
412            | "y"
413            | "z"
414            | "i"
415            | "j"
416            | "k"
417            | "n"
418            | "m"
419            | "a"
420            | "b"
421            | "c"
422            | "e"
423            | "_"
424            | "const"
425            | "let"
426            | "var"
427            | "function"
428            | "return"
429            | "if"
430            | "else"
431            | "for"
432            | "while"
433            | "do"
434            | "switch"
435            | "case"
436            | "break"
437            | "continue"
438            | "new"
439            | "this"
440            | "true"
441            | "false"
442            | "null"
443            | "undefined"
444            | "void"
445            | "typeof"
446            | "instanceof"
447            | "in"
448            | "of"
449            | "class"
450            | "extends"
451            | "super"
452            | "import"
453            | "export"
454            | "from"
455            | "default"
456            | "async"
457            | "await"
458            | "yield"
459            | "type"
460            | "interface"
461            | "enum"
462            | "as"
463            | "is"
464            | "keyof"
465            | "readonly"
466            | "public"
467            | "private"
468            | "protected"
469            | "static"
470            | "get"
471            | "delete"
472            | "throw"
473            | "try"
474            | "catch"
475            | "finally"
476            | "string"
477            | "number"
478            | "boolean"
479            | "any"
480            | "unknown"
481            | "never"
482            | "bigint"
483            | "symbol"
484            | "Math"
485            | "JSON"
486            | "Object"
487            | "Array"
488            | "Promise"
489            | "BigInt"
490            | "Number"
491            | "String"
492            | "Boolean"
493            | "Symbol"
494            | "RegExp"
495            | "Date"
496    )
497}
498
499#[cfg(test)]
500mod tests {
501    use std::path::PathBuf;
502
503    use super::*;
504
505    fn instance(fragment: &str) -> CloneInstance {
506        CloneInstance {
507            file: PathBuf::from("a.ts"),
508            start_line: 1,
509            end_line: 5,
510            start_col: 0,
511            end_col: 0,
512            fragment: fragment.to_string(),
513        }
514    }
515
516    fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
517        CloneGroup {
518            instances: fragments.iter().map(|f| instance(f)).collect(),
519            token_count: 40,
520            line_count,
521        }
522    }
523
524    #[test]
525    fn fingerprint_is_stable_and_prefixed() {
526        let g = group(&["foo(bar)", "foo(baz)"], 3);
527        let fp1 = clone_fingerprint(&g.instances);
528        let fp2 = clone_fingerprint(&g.instances);
529        assert_eq!(fp1, fp2);
530        assert!(fp1.starts_with("dup:"));
531        assert_eq!(fp1.len(), "dup:".len() + 8);
532    }
533
534    #[test]
535    fn fingerprint_is_sibling_stable() {
536        let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
537        let before = clone_fingerprint(&group_a.instances);
538        let _group_b_edited = group(&["totallyDifferentBody()"], 2);
539        let after = clone_fingerprint(&group_a.instances);
540        assert_eq!(before, after);
541    }
542
543    #[test]
544    fn fingerprint_differs_for_different_content() {
545        let a = group(&["alpha()"], 2);
546        let b = group(&["beta()"], 2);
547        assert_ne!(
548            clone_fingerprint(&a.instances),
549            clone_fingerprint(&b.instances)
550        );
551    }
552
553    #[test]
554    fn fingerprint_set_widens_only_colliding_short_handles() {
555        let a = group(&["alpha()"], 2);
556        let b = group(&["beta()"], 2);
557        let c = group(&["gamma()"], 2);
558        let entries = vec![
559            (
560                CloneFingerprintKey::from_group(&a),
561                0x0000_0001_1234_5678_u64,
562            ),
563            (
564                CloneFingerprintKey::from_group(&b),
565                0x0000_0002_1234_5678_u64,
566            ),
567            (
568                CloneFingerprintKey::from_group(&c),
569                0x0000_0003_8765_4321_u64,
570            ),
571        ];
572
573        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
574
575        assert_eq!(
576            fingerprints.fingerprint_for_group(&a),
577            "dup:0000000112345678"
578        );
579        assert_eq!(
580            fingerprints.fingerprint_for_group(&b),
581            "dup:0000000212345678"
582        );
583        assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
584        assert!(
585            fingerprints
586                .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
587                .is_none()
588        );
589        assert_eq!(
590            fingerprints
591                .find_group(&[a, b, c], "dup:0000000212345678")
592                .and_then(|group| group.instances.first())
593                .map(|inst| inst.fragment.as_str()),
594            Some("beta()")
595        );
596    }
597
598    #[test]
599    fn fingerprint_set_suffixes_full_hash_collisions() {
600        let a = group(&["alpha()"], 2);
601        let b = group(&["beta()"], 2);
602        let entries = vec![
603            (
604                CloneFingerprintKey::from_group(&a),
605                0x0000_0001_1234_5678_u64,
606            ),
607            (
608                CloneFingerprintKey::from_group(&b),
609                0x0000_0001_1234_5678_u64,
610            ),
611        ];
612
613        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
614
615        assert_eq!(
616            fingerprints.fingerprint_for_group(&a),
617            "dup:0000000112345678-1"
618        );
619        assert_eq!(
620            fingerprints.fingerprint_for_group(&b),
621            "dup:0000000112345678-2"
622        );
623        assert!(
624            fingerprints
625                .find_group(&[a.clone(), b.clone()], "dup:12345678")
626                .is_none()
627        );
628        assert!(
629            fingerprints
630                .find_group(&[a, b], "dup:0000000112345678")
631                .is_none()
632        );
633    }
634
635    #[test]
636    fn group_suggestion_savings_is_lines_times_extra_copies() {
637        let g = group(&["x", "x", "x"], 10); // 3 instances, 10 lines
638        let suggestion = group_refactoring_suggestion(&g);
639        assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
640        assert_eq!(suggestion.estimated_savings, 20); // 10 * (3 - 1)
641    }
642
643    #[test]
644    fn dominant_identifier_picks_repeated_domain_name() {
645        let g = group(
646            &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
647            3,
648        );
649        assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
650    }
651
652    #[test]
653    fn dominant_identifier_none_on_generic() {
654        let g = group(&["const data = result.map((item) => item.value);"], 3);
655        assert_eq!(dominant_identifier(&g), None);
656    }
657
658    #[test]
659    fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
660        let g = group(
661            &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
662            4,
663        );
664        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
665        let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
666        assert_eq!(dominant_identifier(&only_keywords), None);
667        let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
668        assert_eq!(dominant_identifier(&g_global), None);
669    }
670
671    #[test]
672    fn dominant_identifier_none_on_single_letter_type_param() {
673        let g = group(
674            &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
675            3,
676        );
677        assert_eq!(dominant_identifier(&g), None);
678    }
679
680    #[test]
681    fn dominant_identifier_none_on_tie() {
682        let g = group(&["alpha(); beta();"], 2); // each appears once, no count >= 2
683        assert_eq!(dominant_identifier(&g), None);
684    }
685
686    #[test]
687    fn dominant_identifier_prefers_structured_names() {
688        let g = group(
689            &["parseSchema(input); parseSchema(cache); helper(); helper();"],
690            3,
691        );
692        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
693    }
694
695    #[test]
696    fn dominant_identifier_requires_plain_token_margin() {
697        let low_signal = group(&["schema(); schema(); parseUser();"], 3);
698        assert_eq!(dominant_identifier(&low_signal), None);
699
700        let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
701        assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
702    }
703
704    #[test]
705    fn dominant_identifier_is_stable_across_word_order() {
706        let first = group(
707            &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
708            3,
709        );
710        let second = group(
711            &["parseSchema(input); helper(); parseSchema(cache); helper();"],
712            3,
713        );
714
715        assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
716        assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
717    }
718}