fallow_core/duplicates/detect/
mod.rs1mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use oxc_span::Span;
22use rustc_hash::FxHashSet;
23
24use super::normalize::HashedToken;
25use super::tokenize::FileTokens;
26use super::types::{DuplicationReport, DuplicationStats};
27
28struct FileData {
30 path: PathBuf,
31 hashed_tokens: Vec<HashedToken>,
32 file_tokens: FileTokens,
33 atomic_invocation_spans: Vec<Span>,
34}
35
36pub struct CloneDetector {
42 min_tokens: usize,
44 min_lines: usize,
46 skip_local: bool,
48}
49
50impl CloneDetector {
51 #[must_use]
53 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
54 Self {
55 min_tokens,
56 min_lines,
57 skip_local,
58 }
59 }
60
61 pub fn detect(
66 &self,
67 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
68 ) -> DuplicationReport {
69 self.detect_inner(file_data, None)
70 }
71
72 pub fn detect_touching_files(
79 &self,
80 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
81 focus_files: &FxHashSet<PathBuf>,
82 ) -> DuplicationReport {
83 self.detect_inner(file_data, Some(focus_files))
84 }
85
86 fn detect_inner(
87 &self,
88 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
89 focus_files: Option<&FxHashSet<PathBuf>>,
90 ) -> DuplicationReport {
91 let _span = tracing::info_span!("clone_detect").entered();
92
93 if file_data.is_empty() || self.min_tokens == 0 {
94 return empty_report(0);
95 }
96
97 let files: Vec<FileData> = file_data
98 .into_iter()
99 .map(|(path, hashed_tokens, file_tokens)| FileData {
100 atomic_invocation_spans: file_tokens.atomic_invocation_spans.clone(),
101 path,
102 hashed_tokens,
103 file_tokens,
104 })
105 .collect();
106
107 let total_files = files.len();
108 let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
109 let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
110 let focus_file_ids = focus_files.map(|focus| {
111 let normalized: rustc_hash::FxHashSet<std::path::PathBuf> = focus
112 .iter()
113 .map(|p| dunce::simplified(p).to_path_buf())
114 .collect();
115 files
116 .iter()
117 .map(|file| normalized.contains(dunce::simplified(&file.path)))
118 .collect::<Vec<_>>()
119 });
120
121 tracing::debug!(
122 total_files,
123 total_tokens,
124 total_lines,
125 focused_files = focus_file_ids
126 .as_ref()
127 .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
128 "clone detection input"
129 );
130
131 let t0 = std::time::Instant::now();
132 let ranked_files = ranking::rank_reduce(&files);
133 let rank_time = t0.elapsed();
134 let unique_ranks: usize = ranked_files
135 .iter()
136 .flat_map(|f| f.iter())
137 .copied()
138 .max()
139 .map_or(0, |m| m as usize + 1);
140 tracing::debug!(
141 elapsed_us = rank_time.as_micros(),
142 unique_ranks,
143 "step1_rank_reduce"
144 );
145
146 let t0 = std::time::Instant::now();
147 let (text, file_of, file_offsets) =
148 concatenation::concatenate_with_sentinels(&ranked_files);
149 let concat_time = t0.elapsed();
150 tracing::debug!(
151 elapsed_us = concat_time.as_micros(),
152 concat_len = text.len(),
153 "step2_concatenate"
154 );
155
156 if text.is_empty() {
157 return empty_report(total_files);
158 }
159
160 let t0 = std::time::Instant::now();
161 let sa = suffix_array::build_suffix_array(&text);
162 let sa_time = t0.elapsed();
163 tracing::debug!(
164 elapsed_us = sa_time.as_micros(),
165 n = text.len(),
166 "step3_suffix_array"
167 );
168
169 let t0 = std::time::Instant::now();
170 let lcp_arr = lcp::build_lcp(&text, &sa);
171 let lcp_time = t0.elapsed();
172 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
173
174 let t0 = std::time::Instant::now();
175 let raw_groups = extraction::extract_clone_groups(
176 &sa,
177 &lcp_arr,
178 &file_of,
179 &file_offsets,
180 self.min_tokens,
181 &files,
182 focus_file_ids.as_deref(),
183 );
184 let extract_time = t0.elapsed();
185 tracing::debug!(
186 elapsed_us = extract_time.as_micros(),
187 raw_groups = raw_groups.len(),
188 "step5_extract_groups"
189 );
190
191 let t0 = std::time::Instant::now();
192 let clone_groups =
193 filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
194 let build_time = t0.elapsed();
195 tracing::debug!(
196 elapsed_us = build_time.as_micros(),
197 final_groups = clone_groups.len(),
198 "step6_build_groups"
199 );
200
201 let t0 = std::time::Instant::now();
202 let stats =
203 statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
204 let stats_time = t0.elapsed();
205 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
206
207 tracing::info!(
208 total_us = (rank_time
209 + concat_time
210 + sa_time
211 + lcp_time
212 + extract_time
213 + build_time
214 + stats_time)
215 .as_micros(),
216 rank_us = rank_time.as_micros(),
217 sa_us = sa_time.as_micros(),
218 lcp_us = lcp_time.as_micros(),
219 extract_us = extract_time.as_micros(),
220 build_us = build_time.as_micros(),
221 stats_us = stats_time.as_micros(),
222 total_tokens,
223 clone_groups = clone_groups.len(),
224 "clone detection complete"
225 );
226
227 DuplicationReport {
228 clone_groups,
229 clone_families: vec![], mirrored_directories: vec![],
231 stats,
232 }
233 }
234}
235
236const fn empty_report(total_files: usize) -> DuplicationReport {
238 DuplicationReport {
239 clone_groups: Vec::new(),
240 clone_families: Vec::new(),
241 mirrored_directories: Vec::new(),
242 stats: DuplicationStats {
243 total_files,
244 files_with_clones: 0,
245 total_lines: 0,
246 duplicated_lines: 0,
247 total_tokens: 0,
248 duplicated_tokens: 0,
249 clone_groups: 0,
250 clone_instances: 0,
251 duplication_percentage: 0.0,
252 clone_groups_below_min_occurrences: 0,
253 },
254 }
255}