fallow_core/duplicates/detect/
mod.rs1mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28struct FileData {
30 path: PathBuf,
31 hashed_tokens: Vec<HashedToken>,
32 file_tokens: FileTokens,
33 atomic_invocation_spans: Vec<Span>,
34}
35
36pub struct CloneDetector {
42 min_tokens: usize,
44 min_lines: usize,
46 skip_local: bool,
48}
49
50impl CloneDetector {
51 #[must_use]
53 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54 Self {
55 min_tokens,
56 min_lines,
57 skip_local,
58 }
59 }
60
61 pub fn detect(
66 &self,
67 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68 ) -> DuplicationReport {
69 self.detect_inner(file_data, None)
70 }
71
72 pub fn detect_touching_files(
79 &self,
80 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81 focus_files: &FxHashSet<PathBuf>,
82 ) -> DuplicationReport {
83 self.detect_inner(file_data, Some(focus_files))
84 }
85
86 fn detect_inner(
87 &self,
88 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89 focus_files: Option<&FxHashSet<PathBuf>>,
90 ) -> DuplicationReport {
91 let _span = tracing::info_span!("clone_detect").entered();
92
93 if file_data.is_empty() || self.min_tokens == 0 {
94 return empty_report(0);
95 }
96
97 let files: Vec<FileData> = file_data
98 .into_iter()
99 .map(|(path, hashed_tokens, file_tokens)| FileData {
100 atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101 path,
102 hashed_tokens,
103 file_tokens,
104 })
105 .collect();
106
107 let total_files = files.len();
109 let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
110 let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
111 let focus_file_ids = focus_files.map(|focus| {
112 files
113 .iter()
114 .map(|file| focus.contains(&file.path))
115 .collect::<Vec<_>>()
116 });
117
118 tracing::debug!(
119 total_files,
120 total_tokens,
121 total_lines,
122 focused_files = focus_file_ids
123 .as_ref()
124 .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
125 "clone detection input"
126 );
127
128 let t0 = std::time::Instant::now();
130 let ranked_files = ranking::rank_reduce(&files);
131 let rank_time = t0.elapsed();
132 let unique_ranks: usize = ranked_files
133 .iter()
134 .flat_map(|f| f.iter())
135 .copied()
136 .max()
137 .map_or(0, |m| m as usize + 1);
138 tracing::debug!(
139 elapsed_us = rank_time.as_micros(),
140 unique_ranks,
141 "step1_rank_reduce"
142 );
143
144 let t0 = std::time::Instant::now();
146 let (text, file_of, file_offsets) =
147 concatenation::concatenate_with_sentinels(&ranked_files);
148 let concat_time = t0.elapsed();
149 tracing::debug!(
150 elapsed_us = concat_time.as_micros(),
151 concat_len = text.len(),
152 "step2_concatenate"
153 );
154
155 if text.is_empty() {
156 return empty_report(total_files);
157 }
158
159 let t0 = std::time::Instant::now();
161 let sa = suffix_array::build_suffix_array(&text);
162 let sa_time = t0.elapsed();
163 tracing::debug!(
164 elapsed_us = sa_time.as_micros(),
165 n = text.len(),
166 "step3_suffix_array"
167 );
168
169 let t0 = std::time::Instant::now();
171 let lcp_arr = lcp::build_lcp(&text, &sa);
172 let lcp_time = t0.elapsed();
173 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
174
175 let t0 = std::time::Instant::now();
177 let raw_groups = extraction::extract_clone_groups(
178 &sa,
179 &lcp_arr,
180 &file_of,
181 &file_offsets,
182 self.min_tokens,
183 &files,
184 focus_file_ids.as_deref(),
185 );
186 let extract_time = t0.elapsed();
187 tracing::debug!(
188 elapsed_us = extract_time.as_micros(),
189 raw_groups = raw_groups.len(),
190 "step5_extract_groups"
191 );
192
193 let t0 = std::time::Instant::now();
195 let clone_groups =
196 filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
197 let build_time = t0.elapsed();
198 tracing::debug!(
199 elapsed_us = build_time.as_micros(),
200 final_groups = clone_groups.len(),
201 "step6_build_groups"
202 );
203
204 let t0 = std::time::Instant::now();
206 let stats =
207 statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
208 let stats_time = t0.elapsed();
209 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
210
211 tracing::info!(
212 total_us = (rank_time
213 + concat_time
214 + sa_time
215 + lcp_time
216 + extract_time
217 + build_time
218 + stats_time)
219 .as_micros(),
220 rank_us = rank_time.as_micros(),
221 sa_us = sa_time.as_micros(),
222 lcp_us = lcp_time.as_micros(),
223 extract_us = extract_time.as_micros(),
224 build_us = build_time.as_micros(),
225 stats_us = stats_time.as_micros(),
226 total_tokens,
227 clone_groups = clone_groups.len(),
228 "clone detection complete"
229 );
230
231 DuplicationReport {
232 clone_groups,
233 clone_families: vec![], mirrored_directories: vec![],
235 stats,
236 }
237 }
238}
239
240const fn empty_report(total_files: usize) -> DuplicationReport {
242 DuplicationReport {
243 clone_groups: Vec::new(),
244 clone_families: Vec::new(),
245 mirrored_directories: Vec::new(),
246 stats: DuplicationStats {
247 total_files,
248 files_with_clones: 0,
249 total_lines: 0,
250 duplicated_lines: 0,
251 total_tokens: 0,
252 duplicated_tokens: 0,
253 clone_groups: 0,
254 clone_instances: 0,
255 duplication_percentage: 0.0,
256 clone_groups_below_min_occurrences: 0,
257 },
258 }
259}