fallow_core/duplicates/detect/mod.rs

mod concatenation;
mod extraction;
mod filtering;
mod lcp;
mod ranking;
mod statistics;
mod suffix_array;
mod utils;

#[cfg(test)]
mod tests;
use std::path::PathBuf;

use super::normalize::HashedToken;
use super::tokenize::FileTokens;
use super::types::{DuplicationReport, DuplicationStats};
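/// Per-file input to the detector: the file's path, its normalized token
/// hashes, and the original token/line metadata from tokenization.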
struct FileData {
    path: PathBuf,
    hashed_tokens: Vec<HashedToken>,
    file_tokens: FileTokens,
}
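/// Token-based clone detector built on a suffix-array / LCP pipeline.
///
/// `min_tokens` and `min_lines` are the minimum clone size in tokens and in
/// source lines; `skip_local` is forwarded to the filtering step (presumably
/// to drop groups confined to a single file).
///
/// Usage sketch (hypothetical call site; producing the per-file inputs is
/// left to the tokenize/normalize stages):
///
/// ```ignore
/// let detector = CloneDetector::new(50, 5, true);
/// // file_inputs: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>
/// let report = detector.detect(file_inputs);
/// ```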
pub struct CloneDetector {
    min_tokens: usize,
    min_lines: usize,
    skip_local: bool,
}
impl CloneDetector {
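    /// Creates a detector with the given size thresholds and the `skip_local` filtering flag.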
    pub const fn new(min_tokens: usize, min_lines: usize, skip_local: bool) -> Self {
        Self {
            min_tokens,
            min_lines,
            skip_local,
        }
    }
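    /// Runs the full detection pipeline over pre-tokenized, pre-hashed files:
    /// rank-reduce token hashes, concatenate the files with sentinels, build
    /// the suffix array and LCP array, extract candidate clone groups, filter
    /// them against the configured thresholds, and compute summary statistics.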
    pub fn detect(
        &self,
        file_data: Vec<(PathBuf, Vec<HashedToken>, FileTokens)>,
    ) -> DuplicationReport {
        let _span = tracing::info_span!("clone_detect").entered();

        if file_data.is_empty() || self.min_tokens == 0 {
            return empty_report(0);
        }

        let files: Vec<FileData> = file_data
            .into_iter()
            .map(|(path, hashed_tokens, file_tokens)| FileData {
                path,
                hashed_tokens,
                file_tokens,
            })
            .collect();

        let total_files = files.len();
        let total_lines: usize = files.iter().map(|f| f.file_tokens.line_count).sum();
        let total_tokens: usize = files.iter().map(|f| f.hashed_tokens.len()).sum();

        tracing::debug!(
            total_files,
            total_tokens,
            total_lines,
            "clone detection input"
        );
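        // Step 1: map each distinct token hash to a compact rank (see
        // `ranking::rank_reduce`), so downstream steps work over a dense alphabet.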
        let t0 = std::time::Instant::now();
        let ranked_files = ranking::rank_reduce(&files);
        let rank_time = t0.elapsed();
        let unique_ranks: usize = ranked_files
            .iter()
            .flat_map(|f| f.iter())
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        tracing::debug!(
            elapsed_us = rank_time.as_micros(),
            unique_ranks,
            "step1_rank_reduce"
        );
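        // Step 2: concatenate the ranked files into one sequence, with sentinel
        // values between files so repeats cannot be matched across file boundaries.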
        let t0 = std::time::Instant::now();
        let (text, file_of, file_offsets) =
            concatenation::concatenate_with_sentinels(&ranked_files);
        let concat_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = concat_time.as_micros(),
            concat_len = text.len(),
            "step2_concatenate"
        );

        if text.is_empty() {
            return empty_report(total_files);
        }
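        // Step 3: build the suffix array over the concatenated sequence.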
        let t0 = std::time::Instant::now();
        let sa = suffix_array::build_suffix_array(&text);
        let sa_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = sa_time.as_micros(),
            n = text.len(),
            "step3_suffix_array"
        );
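        // Step 4: build the LCP (longest common prefix) array for the suffix array.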
        let t0 = std::time::Instant::now();
        let lcp_arr = lcp::build_lcp(&text, &sa);
        let lcp_time = t0.elapsed();
        tracing::debug!(elapsed_us = lcp_time.as_micros(), "step4_lcp_array");
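        // Step 5: scan the suffix/LCP arrays for repeated runs of at least
        // `min_tokens` tokens and collect them as raw clone groups.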
        let t0 = std::time::Instant::now();
        let raw_groups = extraction::extract_clone_groups(
            &sa,
            &lcp_arr,
            &file_of,
            &file_offsets,
            self.min_tokens,
            &files,
        );
        let extract_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = extract_time.as_micros(),
            raw_groups = raw_groups.len(),
            "step5_extract_groups"
        );
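        // Step 6: assemble and filter the final groups, applying the `min_lines`
        // threshold and the `skip_local` flag.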
        let t0 = std::time::Instant::now();
        let clone_groups =
            filtering::build_groups(raw_groups, &files, self.min_lines, self.skip_local);
        let build_time = t0.elapsed();
        tracing::debug!(
            elapsed_us = build_time.as_micros(),
            final_groups = clone_groups.len(),
            "step6_build_groups"
        );
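        // Step 7: compute aggregate duplication statistics for the report.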
        let t0 = std::time::Instant::now();
        let stats =
            statistics::compute_stats(&clone_groups, total_files, total_lines, total_tokens);
        let stats_time = t0.elapsed();
        tracing::debug!(elapsed_us = stats_time.as_micros(), "step7_compute_stats");

        tracing::info!(
            total_us = (rank_time
                + concat_time
                + sa_time
                + lcp_time
                + extract_time
                + build_time
                + stats_time)
                .as_micros(),
            rank_us = rank_time.as_micros(),
            sa_us = sa_time.as_micros(),
            lcp_us = lcp_time.as_micros(),
            extract_us = extract_time.as_micros(),
            build_us = build_time.as_micros(),
            stats_us = stats_time.as_micros(),
            total_tokens,
            clone_groups = clone_groups.len(),
            "clone detection complete"
        );

        DuplicationReport {
            clone_groups,
            clone_families: vec![],
            stats,
        }
    }
}
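/// Builds a report with zeroed statistics for when there is nothing to analyze.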
const fn empty_report(total_files: usize) -> DuplicationReport {
    DuplicationReport {
        clone_groups: Vec::new(),
        clone_families: Vec::new(),
        stats: DuplicationStats {
            total_files,
            files_with_clones: 0,
            total_lines: 0,
            duplicated_lines: 0,
            total_tokens: 0,
            duplicated_tokens: 0,
            clone_groups: 0,
            clone_instances: 0,
            duplication_percentage: 0.0,
        },
    }
}