Skip to main content

fallow_core/duplicates/
deepdive.rs

1//! Deep-dive helpers for the `fallow dupes --trace` inspector: a stable
2//! content fingerprint that addresses a clone group across runs, a group-level
3//! refactoring suggestion, and a best-effort "dominant identifier" name for the
4//! extracted function.
5//!
6//! These are pure functions over [`CloneInstance`] / [`CloneGroup`] so every
7//! surface (human listing, `--trace dup:<fp>` lookup, the typed JSON wrappers,
8//! and `trace_clone`) computes the same values without storing a field on the
9//! core [`CloneGroup`] struct.
10
11use std::path::PathBuf;
12
13use rustc_hash::{FxHashMap, FxHashSet};
14use xxhash_rust::xxh3::xxh3_64;
15
16use super::types::{CloneGroup, CloneInstance, RefactoringKind, RefactoringSuggestion};
17
18/// Prefix marking a clone-group fingerprint addressable via `--trace`.
19pub const FINGERPRINT_PREFIX: &str = "dup:";
20
21/// Canonical identity for a clone group when assigning report-scoped handles.
22///
23/// The representative fragment keeps the handle content-derived. The structural
24/// and location fields make otherwise identical wrappers addressable when tests
25/// or future grouping modes present them as separate report entries.
26#[derive(Debug, Clone, PartialEq, Eq, Hash)]
27pub struct CloneFingerprintKey {
28    representative_fragment: String,
29    token_count: usize,
30    line_count: usize,
31    instance_count: usize,
32    first_file: Option<PathBuf>,
33    first_start_line: usize,
34    first_end_line: usize,
35}
36
37impl CloneFingerprintKey {
38    /// Build a fingerprint key from clone-group parts.
39    #[must_use]
40    pub fn from_parts(instances: &[CloneInstance], token_count: usize, line_count: usize) -> Self {
41        let first = instances.first();
42        Self {
43            representative_fragment: first.map_or_else(String::new, |inst| inst.fragment.clone()),
44            token_count,
45            line_count,
46            instance_count: instances.len(),
47            first_file: first.map(|inst| inst.file.clone()),
48            first_start_line: first.map_or(0, |inst| inst.start_line),
49            first_end_line: first.map_or(0, |inst| inst.end_line),
50        }
51    }
52
53    fn from_group(group: &CloneGroup) -> Self {
54        Self::from_parts(&group.instances, group.token_count, group.line_count)
55    }
56
57    fn representative_fragment(&self) -> &str {
58        &self.representative_fragment
59    }
60}
61
62/// Report-scoped clone fingerprint assignment.
63///
64/// Most reports retain the short `dup:<8hex>` handle. If two report entries
65/// collide on those low 32 bits, only the colliding entries widen to
66/// `dup:<16hex>`. If a full 64-bit collision ever occurs inside one report,
67/// every entry in that collision bucket receives a deterministic numeric suffix.
68#[derive(Debug, Clone)]
69pub struct CloneFingerprintSet {
70    by_key: FxHashMap<CloneFingerprintKey, String>,
71    key_by_fingerprint: FxHashMap<String, CloneFingerprintKey>,
72}
73
74impl CloneFingerprintSet {
75    /// Assign collision-free fingerprints for the report's clone groups.
76    #[must_use]
77    pub fn from_groups(groups: &[CloneGroup]) -> Self {
78        let entries: Vec<_> = groups
79            .iter()
80            .map(|group| {
81                let key = CloneFingerprintKey::from_group(group);
82                let hash = hash_fragment(key.representative_fragment());
83                (key, hash)
84            })
85            .collect();
86        Self::from_hashed_entries(&entries)
87    }
88
89    /// Return the assigned fingerprint for a clone group.
90    #[must_use]
91    pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
92        self.fingerprint_for_key(&CloneFingerprintKey::from_group(group))
93    }
94
95    /// Return the assigned fingerprint for clone-group parts.
96    #[must_use]
97    pub fn fingerprint_for_parts(
98        &self,
99        instances: &[CloneInstance],
100        token_count: usize,
101        line_count: usize,
102    ) -> String {
103        self.fingerprint_for_key(&CloneFingerprintKey::from_parts(
104            instances,
105            token_count,
106            line_count,
107        ))
108    }
109
110    /// Return the assigned fingerprint for a key, falling back to the legacy
111    /// short content handle when the key was not present in this report.
112    #[must_use]
113    pub fn fingerprint_for_key(&self, key: &CloneFingerprintKey) -> String {
114        self.by_key
115            .get(key)
116            .cloned()
117            .unwrap_or_else(|| fingerprint_for_fragment(key.representative_fragment.as_str()))
118    }
119
120    /// Find the group addressed by an assigned fingerprint.
121    ///
122    /// Ambiguous short handles created by low-32 collisions are intentionally
123    /// absent from the lookup table, so callers get `None` instead of the first
124    /// matching group.
125    #[must_use]
126    pub fn find_group<'a>(
127        &self,
128        groups: &'a [CloneGroup],
129        fingerprint: &str,
130    ) -> Option<&'a CloneGroup> {
131        let key = self.key_by_fingerprint.get(fingerprint)?;
132        groups
133            .iter()
134            .find(|group| CloneFingerprintKey::from_group(group) == *key)
135    }
136
137    fn from_hashed_entries(entries: &[(CloneFingerprintKey, u64)]) -> Self {
138        let mut short_counts: FxHashMap<u32, usize> = FxHashMap::default();
139        let mut full_counts: FxHashMap<u64, usize> = FxHashMap::default();
140        for (_, hash) in entries {
141            *short_counts.entry(*hash as u32).or_insert(0) += 1;
142            *full_counts.entry(*hash).or_insert(0) += 1;
143        }
144
145        let mut full_ordinals: FxHashMap<u64, usize> = FxHashMap::default();
146        let mut ambiguous_short_handles: FxHashSet<String> = FxHashSet::default();
147        let mut by_key = FxHashMap::default();
148        let mut key_by_fingerprint = FxHashMap::default();
149
150        for (key, hash) in entries {
151            let short = *hash as u32;
152            let short_handle = format!("{FINGERPRINT_PREFIX}{short:08x}");
153            let fingerprint = if short_counts.get(&short).copied().unwrap_or(0) == 1 {
154                short_handle
155            } else {
156                ambiguous_short_handles.insert(short_handle);
157                let full_handle = format!("{FINGERPRINT_PREFIX}{hash:016x}");
158                if full_counts.get(hash).copied().unwrap_or(0) == 1 {
159                    full_handle
160                } else {
161                    let ordinal = full_ordinals.entry(*hash).or_insert(0);
162                    *ordinal += 1;
163                    format!("{full_handle}-{ordinal}")
164                }
165            };
166
167            key_by_fingerprint.insert(fingerprint.clone(), key.clone());
168            by_key.insert(key.clone(), fingerprint);
169        }
170
171        for handle in ambiguous_short_handles {
172            key_by_fingerprint.remove(&handle);
173        }
174
175        Self {
176            by_key,
177            key_by_fingerprint,
178        }
179    }
180}
181
182/// Compute the legacy short content fingerprint for a clone group from its
183/// instances.
184///
185/// The fingerprint is derived from the representative instance's raw source
186/// fragment (the first instance after [`super::types::DuplicationReport::sort`],
187/// which orders instances by `(file, line)`), so it is:
188///
189/// - content-derived, not line-derived (moving a clone down a file does not
190///   change it),
191/// - sibling-stable (editing one clone group never changes another group's
192///   fingerprint, since each hashes only its own content),
193///
194/// Use [`CloneFingerprintSet`] for user-facing report output, since it widens
195/// only the rare colliding handles while preserving this short form for the
196/// common case.
197///
198/// Hashes the empty string for an empty group (never produced by the detector,
199/// which guarantees `>= 2` instances), so the result is still a well-formed
200/// `dup:<8hex>` handle.
201#[must_use]
202pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
203    let representative = instances.first().map_or("", |inst| inst.fragment.as_str());
204    fingerprint_for_fragment(representative)
205}
206
207/// Compute the fingerprint directly from a representative source fragment.
208///
209/// Use when the instances are wrapped (e.g. `--group-by` attributed instances)
210/// but the representative fragment is the same as the bare clone group's, so the
211/// fingerprint matches the top-level `clone_groups[].fingerprint` for the clone.
212#[must_use]
213pub fn fingerprint_for_fragment(fragment: &str) -> String {
214    let hash = hash_fragment(fragment);
215    // Low 32 bits give an 8-hex handle: ~4e9 space, ample for a single report's
216    // clone-group count while staying short enough to type after `--trace`.
217    format!("{FINGERPRINT_PREFIX}{:08x}", hash as u32)
218}
219
220/// Hash a representative fragment, normalizing CRLF to LF first.
221///
222/// A clone group must get the same `dup:` fingerprint whether its source was
223/// checked out with Windows (`\r\n`) or Unix (`\n`) line endings; otherwise the
224/// same code yields different handles on a Windows dev machine versus a Linux CI
225/// runner, breaking `dupes --trace dup:<id>` and any fingerprint-keyed baseline
226/// across platforms. Stripping `\r` is a no-op on Unix-checkout fragments, so
227/// existing fingerprints are unchanged.
228fn hash_fragment(fragment: &str) -> u64 {
229    if fragment.as_bytes().contains(&b'\r') {
230        xxh3_64(fragment.replace('\r', "").as_bytes())
231    } else {
232        xxh3_64(fragment.as_bytes())
233    }
234}
235
236/// Build a per-group `ExtractFunction` refactoring suggestion.
237///
238/// Mirrors the per-group branch of [`super::families`]'s `generate_suggestions`:
239/// the savings is `(instances - 1)` copies of the group's line count, since one
240/// copy survives as the extracted function and the rest collapse to call sites.
241#[must_use]
242pub fn group_refactoring_suggestion(group: &CloneGroup) -> RefactoringSuggestion {
243    let estimated_savings = group.line_count * group.instances.len().saturating_sub(1);
244    RefactoringSuggestion {
245        kind: RefactoringKind::ExtractFunction,
246        description: format!(
247            "Extract the shared {}-line block into one function and call it from {} sites",
248            group.line_count,
249            group.instances.len(),
250        ),
251        estimated_savings,
252    }
253}
254
255/// Best-effort name for the extracted function, derived from the most frequent
256/// non-generic identifier in the representative fragment.
257///
258/// Returns `None` when the dominant identifier is generic (`data`, `result`,
259/// loop counters), appears only once, or ties with another, so absence is the
260/// low-confidence signal for both human and agent consumers. This is a
261/// lexical heuristic over the raw fragment, not an AST analysis; it is advisory
262/// and consumers should verify before applying.
263#[must_use]
264pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
265    let fragment = group.instances.first().map(|inst| inst.fragment.as_str())?;
266    let mut counts: FxHashMap<&str, usize> = FxHashMap::default();
267    for word in identifier_words(fragment) {
268        if is_generic_identifier(word) {
269            continue;
270        }
271        *counts.entry(word).or_insert(0) += 1;
272    }
273
274    let mut candidates: Vec<_> = counts
275        .into_iter()
276        .map(|(word, count)| IdentifierCandidate {
277            word,
278            count,
279            score: identifier_score(word, count),
280        })
281        .collect();
282    candidates.sort_by(|a, b| {
283        b.score
284            .cmp(&a.score)
285            .then_with(|| b.count.cmp(&a.count))
286            .then_with(|| a.word.cmp(b.word))
287    });
288
289    let best = candidates.first()?;
290    if best.count < 2 {
291        return None;
292    }
293
294    let runner_up = candidates.get(1);
295    if runner_up.is_some_and(|next| best.score.saturating_sub(next.score) < 2) {
296        return None;
297    }
298
299    if is_plain_single_token(best.word) {
300        let next_count = runner_up.map_or(0, |candidate| candidate.count);
301        if best.count < 3 || best.count < next_count + 2 {
302            return None;
303        }
304    }
305
306    Some(best.word.to_string())
307}
308
309#[derive(Debug)]
310struct IdentifierCandidate<'a> {
311    word: &'a str,
312    count: usize,
313    score: usize,
314}
315
316fn identifier_score(word: &str, count: usize) -> usize {
317    let quality_bonus = if has_identifier_separator_or_case_transition(word) {
318        5
319    } else if word.chars().count() >= 8 {
320        2
321    } else {
322        0
323    };
324    count * 5 + quality_bonus
325}
326
327fn is_plain_single_token(word: &str) -> bool {
328    !has_identifier_separator_or_case_transition(word) && word.chars().count() < 8
329}
330
331fn has_identifier_separator_or_case_transition(word: &str) -> bool {
332    word.contains('_')
333        || word.contains('$')
334        || word
335            .chars()
336            .collect::<Vec<_>>()
337            .windows(2)
338            .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
339}
340
341/// Yield identifier-like words (`[A-Za-z_$][A-Za-z0-9_$]*`) from raw source.
342fn identifier_words(source: &str) -> impl Iterator<Item = &str> {
343    source
344        .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$'))
345        .filter(|word| {
346            !word.is_empty()
347                && word
348                    .chars()
349                    .next()
350                    .is_some_and(|c| c.is_ascii_alphabetic() || c == '_' || c == '$')
351        })
352}
353
354/// Identifiers too generic to make a useful extracted-function name, plus the
355/// reserved words that show up as bare tokens in a fragment.
356fn is_generic_identifier(word: &str) -> bool {
357    // Single-character names are never useful as an extracted-function name:
358    // loop counters (`i`, `n`), lambda params (`x`), and generic type params
359    // (`T`, `U`, `K`, `V`) all collapse here regardless of case.
360    if word.chars().count() == 1 {
361        return true;
362    }
363    matches!(
364        word,
365        // generic value / collection names
366        "data" | "result" | "results" | "item" | "items" | "value" | "values" | "val"
367            | "obj" | "object" | "arr" | "array" | "list" | "map" | "set" | "key" | "keys"
368            | "tmp" | "temp" | "acc" | "cur" | "curr" | "prev" | "next" | "node" | "el"
369            | "elem" | "element" | "args" | "arg" | "opts" | "options" | "params" | "param"
370            | "props" | "ctx" | "context" | "res" | "req" | "err" | "error" | "fn" | "cb"
371            | "callback" | "out" | "input" | "output" | "name" | "id" | "index" | "idx"
372            // single-letter loop / lambda vars
373            | "x" | "y" | "z" | "i" | "j" | "k" | "n" | "m" | "a" | "b" | "c" | "e" | "_"
374            // JS / TS keywords that appear as bare words in fragments
375            | "const" | "let" | "var" | "function" | "return" | "if" | "else" | "for"
376            | "while" | "do" | "switch" | "case" | "break" | "continue" | "new" | "this"
377            | "true" | "false" | "null" | "undefined" | "void" | "typeof" | "instanceof"
378            | "in" | "of" | "class" | "extends" | "super" | "import" | "export" | "from"
379            | "default" | "async" | "await" | "yield" | "type" | "interface" | "enum"
380            | "as" | "is" | "keyof" | "readonly" | "public" | "private" | "protected"
381            | "static" | "get" | "delete" | "throw" | "try" | "catch" | "finally"
382            // TS primitive / utility type keywords (dominate type-heavy code like
383            // schema libraries, where they would otherwise win the frequency count)
384            | "string" | "number" | "boolean" | "any" | "unknown" | "never" | "bigint"
385            | "symbol"
386            // common JS globals that are never a useful extracted-function name
387            | "Math" | "JSON" | "Object" | "Array" | "Promise" | "BigInt" | "Number"
388            | "String" | "Boolean" | "Symbol" | "RegExp" | "Date"
389    )
390}
391
392#[cfg(test)]
393mod tests {
394    use std::path::PathBuf;
395
396    use super::*;
397
398    fn instance(fragment: &str) -> CloneInstance {
399        CloneInstance {
400            file: PathBuf::from("a.ts"),
401            start_line: 1,
402            end_line: 5,
403            start_col: 0,
404            end_col: 0,
405            fragment: fragment.to_string(),
406        }
407    }
408
409    fn group(fragments: &[&str], line_count: usize) -> CloneGroup {
410        CloneGroup {
411            instances: fragments.iter().map(|f| instance(f)).collect(),
412            token_count: 40,
413            line_count,
414        }
415    }
416
417    #[test]
418    fn fingerprint_is_stable_and_prefixed() {
419        let g = group(&["foo(bar)", "foo(baz)"], 3);
420        let fp1 = clone_fingerprint(&g.instances);
421        let fp2 = clone_fingerprint(&g.instances);
422        assert_eq!(fp1, fp2);
423        assert!(fp1.starts_with("dup:"));
424        assert_eq!(fp1.len(), "dup:".len() + 8);
425    }
426
427    #[test]
428    fn fingerprint_is_sibling_stable() {
429        // Editing group B's content must not change group A's fingerprint.
430        let group_a = group(&["computeInvoiceTotal(order)", "computeInvoiceTotal(o)"], 4);
431        let before = clone_fingerprint(&group_a.instances);
432        let _group_b_edited = group(&["totallyDifferentBody()"], 2);
433        let after = clone_fingerprint(&group_a.instances);
434        assert_eq!(before, after);
435    }
436
437    #[test]
438    fn fingerprint_differs_for_different_content() {
439        let a = group(&["alpha()"], 2);
440        let b = group(&["beta()"], 2);
441        assert_ne!(
442            clone_fingerprint(&a.instances),
443            clone_fingerprint(&b.instances)
444        );
445    }
446
447    #[test]
448    fn fingerprint_set_widens_only_colliding_short_handles() {
449        let a = group(&["alpha()"], 2);
450        let b = group(&["beta()"], 2);
451        let c = group(&["gamma()"], 2);
452        let entries = vec![
453            (
454                CloneFingerprintKey::from_group(&a),
455                0x0000_0001_1234_5678_u64,
456            ),
457            (
458                CloneFingerprintKey::from_group(&b),
459                0x0000_0002_1234_5678_u64,
460            ),
461            (
462                CloneFingerprintKey::from_group(&c),
463                0x0000_0003_8765_4321_u64,
464            ),
465        ];
466
467        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
468
469        assert_eq!(
470            fingerprints.fingerprint_for_group(&a),
471            "dup:0000000112345678"
472        );
473        assert_eq!(
474            fingerprints.fingerprint_for_group(&b),
475            "dup:0000000212345678"
476        );
477        assert_eq!(fingerprints.fingerprint_for_group(&c), "dup:87654321");
478        assert!(
479            fingerprints
480                .find_group(&[a.clone(), b.clone(), c.clone()], "dup:12345678")
481                .is_none()
482        );
483        assert_eq!(
484            fingerprints
485                .find_group(&[a, b, c], "dup:0000000212345678")
486                .and_then(|group| group.instances.first())
487                .map(|inst| inst.fragment.as_str()),
488            Some("beta()")
489        );
490    }
491
492    #[test]
493    fn fingerprint_set_suffixes_full_hash_collisions() {
494        let a = group(&["alpha()"], 2);
495        let b = group(&["beta()"], 2);
496        let entries = vec![
497            (
498                CloneFingerprintKey::from_group(&a),
499                0x0000_0001_1234_5678_u64,
500            ),
501            (
502                CloneFingerprintKey::from_group(&b),
503                0x0000_0001_1234_5678_u64,
504            ),
505        ];
506
507        let fingerprints = CloneFingerprintSet::from_hashed_entries(&entries);
508
509        assert_eq!(
510            fingerprints.fingerprint_for_group(&a),
511            "dup:0000000112345678-1"
512        );
513        assert_eq!(
514            fingerprints.fingerprint_for_group(&b),
515            "dup:0000000112345678-2"
516        );
517        assert!(
518            fingerprints
519                .find_group(&[a.clone(), b.clone()], "dup:12345678")
520                .is_none()
521        );
522        assert!(
523            fingerprints
524                .find_group(&[a, b], "dup:0000000112345678")
525                .is_none()
526        );
527    }
528
529    #[test]
530    fn group_suggestion_savings_is_lines_times_extra_copies() {
531        let g = group(&["x", "x", "x"], 10); // 3 instances, 10 lines
532        let suggestion = group_refactoring_suggestion(&g);
533        assert_eq!(suggestion.kind, RefactoringKind::ExtractFunction);
534        assert_eq!(suggestion.estimated_savings, 20); // 10 * (3 - 1)
535    }
536
537    #[test]
538    fn dominant_identifier_picks_repeated_domain_name() {
539        let g = group(
540            &["function buildInvoice(invoice) { return invoice.total + invoice.tax; }"],
541            3,
542        );
543        assert_eq!(dominant_identifier(&g).as_deref(), Some("invoice"));
544    }
545
546    #[test]
547    fn dominant_identifier_none_on_generic() {
548        let g = group(&["const data = result.map((item) => item.value);"], 3);
549        assert_eq!(dominant_identifier(&g), None);
550    }
551
552    #[test]
553    fn dominant_identifier_skips_ts_primitive_keywords_and_globals() {
554        // Type-heavy code (schema libraries) repeats `string`/`number`/`any`;
555        // they must never become the proposed name. The real domain identifier
556        // wins instead.
557        let g = group(
558            &["const parseUser = z.string(); parseUser(z.number()); parseUser.or(z.string());"],
559            4,
560        );
561        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseUser"));
562        let only_keywords = group(&["const x: string = y as string; return x as any;"], 3);
563        assert_eq!(dominant_identifier(&only_keywords), None);
564        let g_global = group(&["Math.max(Math.floor(Math.abs(v)), 0)"], 3);
565        assert_eq!(dominant_identifier(&g_global), None);
566    }
567
568    #[test]
569    fn dominant_identifier_none_on_single_letter_type_param() {
570        // A generic type param repeated many times must not become the name.
571        let g = group(
572            &["function id<T>(x: T): T { const a: T = x; return a as T; }"],
573            3,
574        );
575        assert_eq!(dominant_identifier(&g), None);
576    }
577
578    #[test]
579    fn dominant_identifier_none_on_tie() {
580        let g = group(&["alpha(); beta();"], 2); // each appears once, no count >= 2
581        assert_eq!(dominant_identifier(&g), None);
582    }
583
584    #[test]
585    fn dominant_identifier_prefers_structured_names() {
586        let g = group(
587            &["parseSchema(input); parseSchema(cache); helper(); helper();"],
588            3,
589        );
590        assert_eq!(dominant_identifier(&g).as_deref(), Some("parseSchema"));
591    }
592
593    #[test]
594    fn dominant_identifier_requires_plain_token_margin() {
595        let low_signal = group(&["schema(); schema(); parseUser();"], 3);
596        assert_eq!(dominant_identifier(&low_signal), None);
597
598        let strong = group(&["schema(); schema(); schema(); schema(); parseUser();"], 3);
599        assert_eq!(dominant_identifier(&strong).as_deref(), Some("schema"));
600    }
601
602    #[test]
603    fn dominant_identifier_is_stable_across_word_order() {
604        let first = group(
605            &["helper(); parseSchema(input); helper(); parseSchema(cache);"],
606            3,
607        );
608        let second = group(
609            &["parseSchema(input); helper(); parseSchema(cache); helper();"],
610            3,
611        );
612
613        assert_eq!(dominant_identifier(&first), dominant_identifier(&second));
614        assert_eq!(dominant_identifier(&first).as_deref(), Some("parseSchema"));
615    }
616}