fallow_core/duplicates/detect/mod.rs

mod concatenation;
mod extraction;
mod filtering;
mod lcp;
mod ranking;
mod statistics;
mod suffix_array;
mod utils;

#[cfg(test)]
mod tests;

use std::path::PathBuf;

use rustc_hash::FxHashSet;

use super::normalize::HashedToken;
use super::tokenize::FileTokens;
use super::types::{DuplicationReport, DuplicationStats};

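/// Per-file input to clone detection: the source path, its hashed token
/// stream, and the raw tokens with line information.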
struct FileData {
    path: PathBuf,
    hashed_tokens: Vec<HashedToken>,
    file_tokens: FileTokens,
}

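/// Token-based clone detector: rank-reduces hashed tokens, concatenates all
/// files with sentinels, and finds repeats via suffix and LCP arrays.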
pub struct CloneDetector {
    /// Minimum clone length, in tokens.
    min_tokens: usize,
    /// Minimum clone length, in lines.
    min_lines: usize,
    /// Whether to skip local clones when building groups.
    skip_local: bool,
}

impl CloneDetector {
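    /// Create a detector with the given size thresholds.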
    #[must_use]
    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }

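    /// Detect clones across all files in `file_data`.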
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        self.detect_inner(file_data, None)
    }

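    /// Detect clones, keeping only groups that touch at least one file in
    /// `focus_files`.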
    pub fn detect_touching_files(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
        focus_files: &FxHashSet<PathBuf>,
    ) -> DuplicationReport {
        self.detect_inner(file_data, Some(focus_files))
    }

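    /// Shared pipeline: rank-reduce tokens, concatenate with sentinels, build
    /// the suffix and LCP arrays, extract candidate groups, filter them, and
    /// compute summary statistics.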
    fn detect_inner(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
        focus_files: Option<&FxHashSet<PathBuf>>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        // Nothing to detect for empty input or a zero token threshold.
        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(file_data.len());
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
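        // Per-file flag, indexed by file id: true when the file is in the focus set.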
        let focus_file_ids = focus_files.map(|focus| {
            files
                .iter()
                .map(|file| focus.contains(&file.path))
                .collect::<Vec<_>>()
        });

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            focused_files = focus_file_ids
                .as_ref()
                .map_or(0, |ids| ids.iter().filter(|&&is_focus| is_focus).count()),
            "clone detection input"
        );

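        // Step 1: rank-reduce token hashes to a dense alphabet for the suffix array.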
        let t0 = std::time::Instant::now();
        let ranked_files = ranking::rank_reduce(&files);
        let rank_time = t0.elapsed();
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );

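        // Step 2: concatenate the ranked files, separated by sentinel values.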
        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) =
            concatenation::concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }

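        // Step 3: build the suffix array over the concatenated sequence.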
        let t0 = std::time::Instant::now();
        let sa = suffix_array::build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );

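        // Step 4: longest common prefixes between adjacent suffixes.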
        let t0 = std::time::Instant::now();
        let lcp_arr = lcp::build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");

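        // Step 5: scan the LCP array for repeats of at least `min_tokens` tokens.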
        let t0 = std::time::Instant::now();
        let raw_groups = extraction::extract_clone_groups(
            &sa,
            &lcp_arr,
            &file_of,
            &file_offsets,
            self.min_tokens,
            &files,
            focus_file_ids.as_deref(),
        );
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );

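        // Step 6: assemble final groups, applying `min_lines` and locality filters.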
        let t0 = std::time::Instant::now();
        let clone_groups =
            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );

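        // Step 7: aggregate duplication statistics for the report.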
        let t0 = std::time::Instant::now();
        let stats =
            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            concat_us = concat_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            clone_families: vec![],
            mirrored_directories: vec![],
            stats,
        }
    }
}

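/// Report with zeroed statistics, used when there is nothing to analyze.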
const fn empty_report(total_files: usize) -> DuplicationReport {
    DuplicationReport {
        clone_groups: Vec::new(),
        clone_families: Vec::new(),
        mirrored_directories: Vec::new(),
        stats: DuplicationStats {
            total_files,
            files_with_clones: 0,
            total_lines: 0,
            duplicated_lines: 0,
            total_tokens: 0,
            duplicated_tokens: 0,
            clone_groups: 0,
            clone_instances: 0,
            duplication_percentage: 0.0,
        },
    }
}