Skip to main content

fallow_engine/
duplicates.rs

//! Duplication result types, fingerprint helpers, and detection entry points exposed through the engine boundary.
2
3use std::path::{Path, PathBuf};
4
5use fallow_config::DuplicatesConfig;
6use fallow_types::discover::DiscoveredFile;
7use rustc_hash::{FxHashMap, FxHashSet};
8
9use crate::{core_backend, results::DuplicationAnalysis};
10
/// Literal prefix (`"dup:"`) associated with clone fingerprints.
///
/// This module's tests assert that fingerprints assigned through
/// `CloneFingerprintSet` start with this prefix.
pub const FINGERPRINT_PREFIX: &str = "dup:";
12
// Aliases for the shared `fallow_types::duplicates` types, so code written against
// this module can name them without importing `fallow_types` directly.

/// Alias for `fallow_types::duplicates::CloneGroup`.
pub type CloneGroup = fallow_types::duplicates::CloneGroup;
/// Alias for `fallow_types::duplicates::CloneInstance`.
pub type CloneInstance = fallow_types::duplicates::CloneInstance;
/// Alias for `fallow_types::duplicates::DefaultIgnoreSkips`.
pub type DefaultIgnoreSkips = fallow_types::duplicates::DefaultIgnoreSkips;
/// Alias for `fallow_types::duplicates::DuplicationReport`.
pub type DuplicationReport = fallow_types::duplicates::DuplicationReport;
/// Alias for `fallow_types::duplicates::DuplicationStats`.
pub type DuplicationStats = fallow_types::duplicates::DuplicationStats;
18
/// Report-scoped clone fingerprint assignment exposed through the engine boundary.
///
/// Wraps `core_backend::BackendCloneFingerprintSet` in a private field. Every
/// method forwards to the wrapped value, and the public signatures only mention
/// the alias types declared in this module plus standard-library types.
#[derive(Debug, Clone)]
pub struct CloneFingerprintSet {
    // Private so the backend type is not part of this struct's public surface.
    inner: core_backend::BackendCloneFingerprintSet,
}

impl CloneFingerprintSet {
    /// Assign collision-free fingerprints for the report's clone groups.
    ///
    /// Builds the backend set from `groups` and wraps it.
    #[must_use]
    pub fn from_groups(groups: &[CloneGroup]) -> Self {
        Self {
            inner: core_backend::BackendCloneFingerprintSet::from_groups(groups),
        }
    }

    /// Return the assigned fingerprint for a clone group.
    ///
    /// Forwards `group` to the wrapped backend set and returns its answer as an
    /// owned `String`.
    #[must_use]
    pub fn fingerprint_for_group(&self, group: &CloneGroup) -> String {
        // NOTE(review): presumably `group` should be one of the groups passed to
        // `from_groups`; behavior for other groups is decided by the backend — confirm.
        self.inner.fingerprint_for_group(group)
    }

    /// Return the assigned fingerprint for clone-group parts.
    ///
    /// Same lookup as [`Self::fingerprint_for_group`], but takes the pieces
    /// (`instances`, `token_count`, `line_count`) individually instead of a
    /// whole `CloneGroup`. All three are forwarded to the backend unchanged.
    #[must_use]
    pub fn fingerprint_for_parts(
        &self,
        instances: &[CloneInstance],
        token_count: usize,
        line_count: usize,
    ) -> String {
        self.inner
            .fingerprint_for_parts(instances, token_count, line_count)
    }

    /// Find the group addressed by an assigned fingerprint.
    ///
    /// The returned reference borrows from `groups` (lifetime `'a`), not from
    /// `self`, so it may outlive this fingerprint set. The `Option` is whatever
    /// the backend lookup returns.
    #[must_use]
    pub fn find_group<'a>(
        &self,
        groups: &'a [CloneGroup],
        fingerprint: &str,
    ) -> Option<&'a CloneGroup> {
        // NOTE(review): presumably `None` means no group in `groups` carries
        // `fingerprint` — confirm against the backend implementation.
        self.inner.find_group(groups, fingerprint)
    }
}
62
/// Compute the stable fingerprint for a clone group.
///
/// Engine-boundary wrapper: forwards `instances` unchanged to
/// `core_backend::clone_fingerprint` and returns the resulting `String`.
#[must_use]
pub fn clone_fingerprint(instances: &[CloneInstance]) -> String {
    // NOTE(review): unlike `CloneFingerprintSet`, this takes no report context, so
    // presumably it performs no collision handling across groups — confirm.
    core_backend::clone_fingerprint(instances)
}
68
/// Compute a clone fingerprint directly from a representative source fragment.
///
/// Engine-boundary wrapper: forwards `fragment` unchanged to
/// `core_backend::fingerprint_for_fragment` and returns the resulting `String`.
#[must_use]
pub fn fingerprint_for_fragment(fragment: &str) -> String {
    core_backend::fingerprint_for_fragment(fragment)
}
74
/// Return the best-effort dominant identifier for a clone group.
///
/// Engine-boundary wrapper: forwards `group` to
/// `core_backend::dominant_identifier` and returns its `Option<String>` as-is.
#[must_use]
pub fn dominant_identifier(group: &CloneGroup) -> Option<String> {
    // NOTE(review): presumably `None` means the backend could not pick an
    // identifier for this group — confirm against the backend.
    core_backend::dominant_identifier(group)
}
80
/// Refresh clone-family and mirrored-directory fields after clone groups change.
///
/// Engine-boundary wrapper: hands `report` (mutably) and `root` to
/// `core_backend::refresh_clone_families`, which updates the report in place.
/// Nothing is returned.
pub fn refresh_clone_families(report: &mut DuplicationReport, root: &Path) {
    // NOTE(review): `root` is presumably the project root the report's paths
    // are resolved against — confirm with the backend.
    core_backend::refresh_clone_families(report, root);
}
85
86/// Recompute duplication statistics after clone groups have been filtered.
87///
88/// Uses per-file line deduplication, matching the detector's stats model, so
89/// overlapping clone instances do not inflate the duplicated line count.
90#[must_use]
91pub fn recompute_stats(report: &DuplicationReport) -> DuplicationStats {
92    let mut files_with_clones: FxHashSet<&Path> = FxHashSet::default();
93    let mut file_dup_lines: FxHashMap<&Path, FxHashSet<usize>> = FxHashMap::default();
94    let mut duplicated_tokens = 0usize;
95    let mut clone_instances = 0usize;
96
97    for group in &report.clone_groups {
98        for instance in &group.instances {
99            files_with_clones.insert(&instance.file);
100            clone_instances += 1;
101            let lines = file_dup_lines.entry(&instance.file).or_default();
102            for line in instance.start_line..=instance.end_line {
103                lines.insert(line);
104            }
105        }
106        duplicated_tokens += group.token_count * group.instances.len();
107    }
108
109    let duplicated_lines: usize = file_dup_lines.values().map(FxHashSet::len).sum();
110
111    DuplicationStats {
112        total_files: report.stats.total_files,
113        files_with_clones: files_with_clones.len(),
114        total_lines: report.stats.total_lines,
115        duplicated_lines,
116        total_tokens: report.stats.total_tokens,
117        duplicated_tokens,
118        clone_groups: report.clone_groups.len(),
119        clone_instances,
120        duplication_percentage: if report.stats.total_lines > 0 {
121            (duplicated_lines as f64 / report.stats.total_lines as f64) * 100.0
122        } else {
123            0.0
124        },
125        clone_groups_below_min_occurrences: report.stats.clone_groups_below_min_occurrences,
126    }
127}
128
/// Compare two JS/TS sources by duplicate-token kind sequence.
///
/// This keeps CLI audit's non-behavioral change check from depending on the
/// tokenizer module shape.
///
/// Engine-boundary wrapper: `path`, `current`, `base` and `cross_language` are
/// forwarded unchanged to `core_backend::source_token_kinds_equivalent`, and
/// its `bool` verdict is returned as-is.
#[must_use]
pub fn source_token_kinds_equivalent(
    path: &Path,
    current: &str,
    base: &str,
    cross_language: bool,
) -> bool {
    // NOTE(review): presumably `path` selects the source type used for tokenizing
    // and `cross_language` mirrors the duplicates config option — confirm.
    core_backend::source_token_kinds_equivalent(path, current, base, cross_language)
}
142
/// Run duplication detection on a discovered file set.
///
/// Engine-boundary wrapper: forwards `root`, `files` and `config` unchanged to
/// `core_backend::find_duplicates` and returns its `DuplicationReport`.
#[must_use]
pub fn find_duplicates(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
) -> DuplicationReport {
    core_backend::find_duplicates(root, files, config)
}
152
/// Run cached duplication detection inside the engine boundary.
///
/// Engine-boundary wrapper: forwards `root`, `files`, `config` and `cache_dir`
/// unchanged to `core_backend::find_duplicates_cached` and returns its
/// `DuplicationReport`.
#[must_use]
pub fn find_duplicates_cached(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    cache_dir: &Path,
) -> DuplicationReport {
    // NOTE(review): how `cache_dir` is read or written is decided entirely by
    // the backend; nothing in this wrapper touches the filesystem.
    core_backend::find_duplicates_cached(root, files, config, cache_dir)
}
163
/// Run duplication detection and include metadata about built-in ignored files.
///
/// Engine-boundary wrapper: forwards all arguments unchanged to
/// `core_backend::find_duplicates_with_defaults` and returns its
/// `DuplicationAnalysis`.
#[must_use]
pub fn find_duplicates_with_defaults(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    cache_dir: Option<&Path>,
) -> DuplicationAnalysis {
    // NOTE(review): presumably `cache_dir == None` runs detection without the
    // cache — confirm against the backend.
    core_backend::find_duplicates_with_defaults(root, files, config, cache_dir)
}
174
/// Run focused duplication detection and include metadata about built-in ignored files.
///
/// Engine-boundary wrapper: forwards all arguments unchanged to
/// `core_backend::find_duplicates_touching_files_with_defaults` and returns its
/// `DuplicationAnalysis`.
#[must_use]
pub fn find_duplicates_touching_files_with_defaults(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    changed_files: &[PathBuf],
    cache_dir: Option<&Path>,
) -> DuplicationAnalysis {
    // NOTE(review): presumably the result is limited to clone groups that touch
    // at least one of `changed_files`, and `cache_dir == None` skips the cache —
    // confirm both against the backend.
    core_backend::find_duplicates_touching_files_with_defaults(
        root,
        files,
        config,
        changed_files,
        cache_dir,
    )
}
192
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    /// Build a clone instance spanning `start_line..=end_line` of `file`, with
    /// zeroed columns and an empty fragment.
    fn instance_at(file: &str, start_line: usize, end_line: usize) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(file),
            start_line,
            end_line,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Build a single-line clone instance that carries `fragment` as its source text.
    fn instance_with_fragment(file: &str, line: usize, fragment: &str) -> CloneInstance {
        let mut built = instance_at(file, line, line);
        built.fragment = fragment.to_string();
        built
    }

    /// Wrap `clone_groups` in a report whose pass-through stats fields hold
    /// recognisable non-default values.
    fn report_with_groups(clone_groups: Vec<CloneGroup>) -> DuplicationReport {
        let stats = DuplicationStats {
            total_files: 3,
            total_lines: 100,
            total_tokens: 1_000,
            clone_groups_below_min_occurrences: 4,
            ..DuplicationStats::default()
        };

        DuplicationReport {
            clone_groups,
            clone_families: Vec::new(),
            mirrored_directories: Vec::new(),
            stats,
        }
    }

    #[test]
    fn recompute_stats_deduplicates_overlapping_lines_per_file() {
        // `src/a.ts` appears in both groups with overlapping ranges (1..=10 and
        // 5..=12), which must collapse to 12 distinct lines.
        let first = CloneGroup {
            instances: vec![
                instance_at("src/a.ts", 1, 10),
                instance_at("src/b.ts", 20, 24),
            ],
            token_count: 30,
            line_count: 10,
        };
        let second = CloneGroup {
            instances: vec![
                instance_at("src/a.ts", 5, 12),
                instance_at("src/c.ts", 40, 44),
            ],
            token_count: 20,
            line_count: 8,
        };
        let input = report_with_groups(vec![first, second]);

        let stats = recompute_stats(&input);

        assert_eq!(stats.total_files, 3);
        assert_eq!(stats.files_with_clones, 3);
        assert_eq!(stats.total_lines, 100);
        assert_eq!(stats.duplicated_lines, 22);
        assert_eq!(stats.total_tokens, 1_000);
        assert_eq!(stats.duplicated_tokens, 100);
        assert_eq!(stats.clone_groups, 2);
        assert_eq!(stats.clone_instances, 4);
        assert!((stats.duplication_percentage - 22.0).abs() < f64::EPSILON);
        assert_eq!(stats.clone_groups_below_min_occurrences, 4);
    }

    #[test]
    fn recompute_stats_handles_zero_total_lines() {
        let only_group = CloneGroup {
            instances: vec![instance_at("src/a.ts", 1, 1)],
            token_count: 5,
            line_count: 1,
        };
        let mut input = report_with_groups(vec![only_group]);
        // A zero line total must not produce a division by zero.
        input.stats.total_lines = 0;

        let stats = recompute_stats(&input);

        assert_eq!(stats.duplicated_lines, 1);
        assert!(stats.duplication_percentage.abs() < f64::EPSILON);
    }

    #[test]
    fn clone_fingerprint_set_delegates_without_leaking_core_type() {
        let snippet = "const value = 1;";
        let groups = vec![CloneGroup {
            instances: vec![
                instance_with_fragment("src/a.ts", 1, snippet),
                instance_with_fragment("src/b.ts", 2, snippet),
            ],
            token_count: 5,
            line_count: 1,
        }];

        let assigned = CloneFingerprintSet::from_groups(&groups);
        let fingerprint = assigned.fingerprint_for_group(&groups[0]);

        assert!(fingerprint.starts_with(FINGERPRINT_PREFIX));
        assert!(assigned.find_group(&groups, &fingerprint).is_some());
    }
}