//! Clone-detection pipeline (`fallow_core/duplicates/detect/mod.rs`).
mod concatenation;
8mod extraction;
9mod filtering;
10mod lcp;
11mod ranking;
12mod statistics;
13mod suffix_array;
14mod utils;
15
16#[cfg(test)]
17mod tests;
18
19use std::path::PathBuf;
20
21use super::normalize::HashedToken;
22use super::tokenize::FileTokens;
23use super::types::{DuplicationReport, DuplicationStats};
24
/// Per-file input bundle consumed by the detection pipeline.
struct FileData {
    /// Path of the analyzed source file.
    path: PathBuf,
    /// Hashed token stream produced by `super::normalize`; its length is the
    /// file's token count, and it feeds the rank-reduction step.
    hashed_tokens: Vec<HashedToken>,
    /// Token/line metadata from `super::tokenize`; `line_count` is read here,
    /// the rest is used downstream to map matches back to source positions.
    file_tokens: FileTokens,
}
31
/// Detects duplicated token sequences ("clones") across a set of files.
pub struct CloneDetector {
    /// Minimum clone length in tokens; `0` disables detection entirely.
    min_tokens: usize,
    /// Minimum clone length in source lines, enforced during group building.
    min_lines: usize,
    /// Forwarded to `filtering::build_groups`; presumably suppresses clones
    /// confined to a single file — TODO confirm against `filtering`.
    skip_local: bool,
}
45
46impl CloneDetector {
47 #[must_use]
49 pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
50 Self {
51 min_tokens,
52 min_lines,
53 skip_local,
54 }
55 }
56
57 pub fn detect(
62 &self,
63 file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
64 ) -> DuplicationReport {
65 let _span = tracing::info_span!("clone_detect").entered();
66
67 if file_data.is_empty() || self.min_tokens == 0 {
68 return empty_report(0);
69 }
70
71 let files: Vec<FileData> = file_data
72 .into_iter()
73 .map(|(path, hashed_tokens, file_tokens)| FileData {
74 path,
75 hashed_tokens,
76 file_tokens,
77 })
78 .collect();
79
80 let total_files = files.len();
82 let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
83 let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();
84
85 tracing::debug!(
86 total_files,
87 total_tokens,
88 total_lines,
89 "clone detection input"
90 );
91
92 let t0 = std::time::Instant::now();
94 let ranked_files = ranking::rank_reduce(&files);
95 let rank_time = t0.elapsed();
96 let unique_ranks: usize = ranked_files
97 .iter()
98 .flat_map(|f| f.iter())
99 .copied()
100 .max()
101 .map_or(0, |m| m as usize + 1);
102 tracing::debug!(
103 elapsed_us = rank_time.as_micros(),
104 unique_ranks,
105 "step1_rank_reduce"
106 );
107
108 let t0 = std::time::Instant::now();
110 let (text, file_of, file_offsets) =
111 concatenation::concatenate_with_sentinels(&ranked_files);
112 let concat_time = t0.elapsed();
113 tracing::debug!(
114 elapsed_us = concat_time.as_micros(),
115 concat_len = text.len(),
116 "step2_concatenate"
117 );
118
119 if text.is_empty() {
120 return empty_report(total_files);
121 }
122
123 let t0 = std::time::Instant::now();
125 let sa = suffix_array::build_suffix_array(&text);
126 let sa_time = t0.elapsed();
127 tracing::debug!(
128 elapsed_us = sa_time.as_micros(),
129 n = text.len(),
130 "step3_suffix_array"
131 );
132
133 let t0 = std::time::Instant::now();
135 let lcp_arr = lcp::build_lcp(&text, &sa);
136 let lcp_time = t0.elapsed();
137 tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
138
139 let t0 = std::time::Instant::now();
141 let raw_groups = extraction::extract_clone_groups(
142 &sa,
143 &lcp_arr,
144 &file_of,
145 &file_offsets,
146 self.min_tokens,
147 &files,
148 );
149 let extract_time = t0.elapsed();
150 tracing::debug!(
151 elapsed_us = extract_time.as_micros(),
152 raw_groups = raw_groups.len(),
153 "step5_extract_groups"
154 );
155
156 let t0 = std::time::Instant::now();
158 let clone_groups =
159 filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
160 let build_time = t0.elapsed();
161 tracing::debug!(
162 elapsed_us = build_time.as_micros(),
163 final_groups = clone_groups.len(),
164 "step6_build_groups"
165 );
166
167 let t0 = std::time::Instant::now();
169 let stats =
170 statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
171 let stats_time = t0.elapsed();
172 tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");
173
174 tracing::info!(
175 total_us = (rank_time
176 + concat_time
177 + sa_time
178 + lcp_time
179 + extract_time
180 + build_time
181 + stats_time)
182 .as_micros(),
183 rank_us = rank_time.as_micros(),
184 sa_us = sa_time.as_micros(),
185 lcp_us = lcp_time.as_micros(),
186 extract_us = extract_time.as_micros(),
187 build_us = build_time.as_micros(),
188 stats_us = stats_time.as_micros(),
189 total_tokens,
190 clone_groups = clone_groups.len(),
191 "clone detection complete"
192 );
193
194 DuplicationReport {
195 clone_groups,
196 clone_families: vec![], mirrored_directories: vec![],
198 stats,
199 }
200 }
201}
202
203const fn empty_report(total_files: usize) -> DuplicationReport {
205 DuplicationReport {
206 clone_groups: Vec::new(),
207 clone_families: Vec::new(),
208 mirrored_directories: Vec::new(),
209 stats: DuplicationStats {
210 total_files,
211 files_with_clones: 0,
212 total_lines: 0,
213 duplicated_lines: 0,
214 total_tokens: 0,
215 duplicated_tokens: 0,
216 clone_groups: 0,
217 clone_instances: 0,
218 duplication_percentage: 0.0,
219 },
220 }
221}