Skip to main content

fallow_engine/
duplicates.rs

//! Duplication result types, fingerprint helpers, stats recomputation, and
//! detection entry points exposed through the engine boundary.
2
3use std::path::{Path, PathBuf};
4
5use fallow_config::DuplicatesConfig;
6use fallow_types::discover::DiscoveredFile;
7use rustc_hash::{FxHashMap, FxHashSet};
8
9use crate::DuplicationAnalysis;
10
/// Clone fingerprint prefix, re-exported from
/// `fallow_core::duplicates::FINGERPRINT_PREFIX` under the engine's own path.
pub const FINGERPRINT_PREFIX: &str = fallow_core::duplicates::FINGERPRINT_PREFIX;

/// Engine-level alias for `fallow_types::duplicates::CloneGroup`.
pub type CloneGroup = fallow_types::duplicates::CloneGroup;
/// Engine-level alias for `fallow_types::duplicates::CloneInstance`.
pub type CloneInstance = fallow_types::duplicates::CloneInstance;
/// Engine-level alias for `fallow_types::duplicates::DefaultIgnoreSkips`.
pub type DefaultIgnoreSkips = fallow_types::duplicates::DefaultIgnoreSkips;
/// Engine-level alias for `fallow_types::duplicates::DuplicationReport`.
pub type DuplicationReport = fallow_types::duplicates::DuplicationReport;
/// Engine-level alias for `fallow_types::duplicates::DuplicationStats`.
pub type DuplicationStats = fallow_types::duplicates::DuplicationStats;
18
19/// Report-scoped clone fingerprint assignment exposed through the engine boundary.
20#[derive(Debug, Clone)]
21pub struct CloneFingerprintSet {
22    inner: fallow_core::duplicates::CloneFingerprintSet,
23}
24
25impl CloneFingerprintSet {
26    /// Assign collision-free fingerprints for the report's clone groups.
27    #[must_use]
28    pub fn from_groups(groups: &[CloneGroup]) -> Self {
29        Self {
30            inner: fallow_core::duplicates::CloneFingerprintSet::from_groups(groups),
31        }
32    }
33
34    /// Return the assigned fingerprint for a clone group.
35    #[must_use]
36    pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
37        self.inner.fingerprint_for_group(group)
38    }
39
40    /// Return the assigned fingerprint for clone-group parts.
41    #[must_use]
42    pub fn fingerprint_for_parts(
43        &self,
44        instances: &[CloneInstance],
45        token_count: usize,
46        line_count: usize,
47    ) -> String {
48        self.inner
49            .fingerprint_for_parts(instances, token_count, line_count)
50    }
51
52    /// Find the group addressed by an assigned fingerprint.
53    #[must_use]
54    pub fn find_group<'a>(
55        &self,
56        groups: &'a [CloneGroup],
57        fingerprint: &str,
58    ) -> Option<&'a CloneGroup> {
59        self.inner.find_group(groups, fingerprint)
60    }
61}
62
63/// Compute the stable fingerprint for a clone group.
64#[must_use]
65pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
66    fallow_core::duplicates::clone_fingerprint(instances)
67}
68
69/// Compute a clone fingerprint directly from a representative source fragment.
70#[must_use]
71pub fn fingerprint_for_fragment(fragment: &str) -> String {
72    fallow_core::duplicates::fingerprint_for_fragment(fragment)
73}
74
75/// Return the best-effort dominant identifier for a clone group.
76#[must_use]
77pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
78    fallow_core::duplicates::dominant_identifier(group)
79}
80
81/// Refresh clone-family and mirrored-directory fields after clone groups change.
82pub fn refresh_clone_families(report: &mut DuplicationReport, root: &Path) {
83    report.clone_families =
84        fallow_core::duplicates::families::group_into_families(&report.clone_groups, root);
85    report.mirrored_directories = fallow_core::duplicates::families::detect_mirrored_directories(
86        &report.clone_families,
87        root,
88    );
89}
90
91/// Recompute duplication statistics after clone groups have been filtered.
92///
93/// Uses per-file line deduplication, matching the detector's stats model, so
94/// overlapping clone instances do not inflate the duplicated line count.
95#[must_use]
96pub fn recompute_stats(report: &DuplicationReport) -> DuplicationStats {
97    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
98    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
99    let mut duplicated_tokens = 0usize;
100    let mut clone_instances = 0usize;
101
102    for group in &report.clone_groups {
103        for instance in &group.instances {
104            files_with_clones.insert(&instance.file);
105            clone_instances += 1;
106            let lines = file_dup_lines.entry(&instance.file).or_default();
107            for line in instance.start_line..=instance.end_line {
108                lines.insert(line);
109            }
110        }
111        duplicated_tokens += group.token_count * group.instances.len();
112    }
113
114    let duplicated_lines: usize = file_dup_lines.values().map(FxHashSet::len).sum();
115
116    DuplicationStats {
117        total_files: report.stats.total_files,
118        files_with_clones: files_with_clones.len(),
119        total_lines: report.stats.total_lines,
120        duplicated_lines,
121        total_tokens: report.stats.total_tokens,
122        duplicated_tokens,
123        clone_groups: report.clone_groups.len(),
124        clone_instances,
125        duplication_percentage: if report.stats.total_lines > 0 {
126            (duplicated_lines as f64 / report.stats.total_lines as f64) * 100.0
127        } else {
128            0.0
129        },
130        clone_groups_below_min_occurrences: report.stats.clone_groups_below_min_occurrences,
131    }
132}
133
134/// Compare two JS/TS sources by duplicate-token kind sequence.
135///
136/// This keeps CLI audit's non-behavioral change check from depending on the
137/// tokenizer module shape.
138#[must_use]
139pub fn source_token_kinds_equivalent(
140    path: &Path,
141    current: &str,
142    base: &str,
143    cross_language: bool,
144) -> bool {
145    let current_tokens =
146        fallow_core::duplicates::tokenize::tokenize_file(path, current, cross_language);
147    let base_tokens = fallow_core::duplicates::tokenize::tokenize_file(path, base, cross_language);
148    current_tokens
149        .tokens
150        .iter()
151        .map(|token| &token.kind)
152        .eq(base_tokens.tokens.iter().map(|token| &token.kind))
153}
154
155/// Run duplication detection on a discovered file set.
156#[must_use]
157pub fn find_duplicates(
158    root: &Path,
159    files: &[DiscoveredFile],
160    config: &DuplicatesConfig,
161) -> DuplicationReport {
162    fallow_core::duplicates::find_duplicates(root, files, config)
163}
164
165/// Run cached duplication detection inside the engine boundary.
166#[must_use]
167pub fn find_duplicates_cached(
168    root: &Path,
169    files: &[DiscoveredFile],
170    config: &DuplicatesConfig,
171    cache_dir: &Path,
172) -> DuplicationReport {
173    fallow_core::duplicates::find_duplicates_cached(root, files, config, cache_dir)
174}
175
176/// Run duplication detection and include metadata about built-in ignored files.
177#[must_use]
178pub fn find_duplicates_with_defaults(
179    root: &Path,
180    files: &[DiscoveredFile],
181    config: &DuplicatesConfig,
182    cache_dir: Option<&Path>,
183) -> DuplicationAnalysis {
184    let (report, default_ignore_skips) = if let Some(cache_dir) = cache_dir {
185        fallow_core::duplicates::find_duplicates_cached_with_default_ignore_skips(
186            root, files, config, cache_dir,
187        )
188    } else {
189        fallow_core::duplicates::find_duplicates_with_default_ignore_skips(root, files, config)
190    };
191    DuplicationAnalysis {
192        report,
193        default_ignore_skips,
194    }
195}
196
197/// Run focused duplication detection and include metadata about built-in ignored files.
198#[must_use]
199pub fn find_duplicates_touching_files_with_defaults(
200    root: &Path,
201    files: &[DiscoveredFile],
202    config: &DuplicatesConfig,
203    changed_files: &[PathBuf],
204    cache_dir: Option<&Path>,
205) -> DuplicationAnalysis {
206    let changed_files = changed_files.iter().cloned().collect::<FxHashSet<_>>();
207    let (report, default_ignore_skips) = if let Some(cache_dir) = cache_dir {
208        fallow_core::duplicates::find_duplicates_touching_files_cached_with_default_ignore_skips(
209            root,
210            files,
211            config,
212            &changed_files,
213            cache_dir,
214        )
215    } else {
216        fallow_core::duplicates::find_duplicates_touching_files_with_default_ignore_skips(
217            root,
218            files,
219            config,
220            &changed_files,
221        )
222    };
223    DuplicationAnalysis {
224        report,
225        default_ignore_skips,
226    }
227}
228
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    /// Clone instance spanning `start_line..=end_line` of `file`, with zeroed
    /// columns and an empty fragment.
    fn instance_at(file: &str, start_line: usize, end_line: usize) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(file),
            start_line,
            end_line,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Report holding `clone_groups`, with the pass-through stats fields set
    /// to recognisable values and everything else left at its default.
    fn report_with(clone_groups: Vec<CloneGroup>) -> DuplicationReport {
        let stats = DuplicationStats {
            total_files: 3,
            total_lines: 100,
            total_tokens: 1_000,
            clone_groups_below_min_occurrences: 4,
            ..DuplicationStats::default()
        };

        DuplicationReport {
            clone_groups,
            clone_families: Vec::new(),
            mirrored_directories: Vec::new(),
            stats,
        }
    }

    #[test]
    fn recompute_stats_deduplicates_overlapping_lines_per_file() {
        // Both groups touch src/a.ts, on overlapping ranges 1..=10 and 5..=12.
        let first = CloneGroup {
            instances: vec![
                instance_at("src/a.ts", 1, 10),
                instance_at("src/b.ts", 20, 24),
            ],
            token_count: 30,
            line_count: 10,
        };
        let second = CloneGroup {
            instances: vec![
                instance_at("src/a.ts", 5, 12),
                instance_at("src/c.ts", 40, 44),
            ],
            token_count: 20,
            line_count: 8,
        };
        let input = report_with(vec![first, second]);

        let stats = recompute_stats(&input);

        assert_eq!(stats.total_files, 3);
        assert_eq!(stats.files_with_clones, 3);
        assert_eq!(stats.total_lines, 100);
        // a.ts: lines 1..=12 (12), b.ts: 20..=24 (5), c.ts: 40..=44 (5).
        assert_eq!(stats.duplicated_lines, 22);
        assert_eq!(stats.total_tokens, 1_000);
        // 30 tokens * 2 instances + 20 tokens * 2 instances.
        assert_eq!(stats.duplicated_tokens, 100);
        assert_eq!(stats.clone_groups, 2);
        assert_eq!(stats.clone_instances, 4);
        assert!((stats.duplication_percentage - 22.0).abs() < f64::EPSILON);
        assert_eq!(stats.clone_groups_below_min_occurrences, 4);
    }

    #[test]
    fn recompute_stats_handles_zero_total_lines() {
        let single_line_group = CloneGroup {
            instances: vec![instance_at("src/a.ts", 1, 1)],
            token_count: 5,
            line_count: 1,
        };
        let mut input = report_with(vec![single_line_group]);
        input.stats.total_lines = 0;

        let stats = recompute_stats(&input);

        // The duplicated line is still counted; only the percentage falls back to zero.
        assert_eq!(stats.duplicated_lines, 1);
        assert!(stats.duplication_percentage.abs() < f64::EPSILON);
    }

    #[test]
    fn clone_fingerprint_set_delegates_without_leaking_core_type() {
        // Same fragment on a single line of two different files.
        let with_fragment = |file: &str, line: usize| CloneInstance {
            fragment: String::from("const value = 1;"),
            ..instance_at(file, line, line)
        };
        let groups = vec![CloneGroup {
            instances: vec![with_fragment("src/a.ts", 1), with_fragment("src/b.ts", 2)],
            token_count: 5,
            line_count: 1,
        }];

        let fingerprints = CloneFingerprintSet::from_groups(&groups);
        let fingerprint = fingerprints.fingerprint_for_group(&groups[0]);

        assert!(fingerprint.starts_with(FINGERPRINT_PREFIX));
        assert!(fingerprints.find_group(&groups, &fingerprint).is_some());
    }
}