1use std::collections::HashMap;
2use std::sync::Arc;
3
4use rayon::prelude::*;
5use rustc_hash::FxHashSet;
6
7use crate::cli::Options;
8use crate::files::SourceFile;
9
10mod matching;
11mod model;
12mod prepare;
13mod skip_local;
14mod statistics;
15mod store;
16#[cfg(test)]
17mod tests;
18
19#[cfg(test)]
20pub use model::FormatStatistic;
21pub(crate) use model::PreparedSourceDraft;
22pub use model::{
23 BlamedLine, BlamedLines, CloneMatch, DetectionResult, Fragment, SkippedClone, SourceSummary,
24 StatisticRow, Statistics,
25};
26pub use statistics::{Statistic, clone_lines};
27pub use store::{MemoryStore, MemoryStoreError};
28
29use matching::detect_format;
30use model::{FormatId, PreparedSource, SourceId, TokenStream};
31use prepare::{assign_formats, prepare_file_maps};
32use statistics::{finalize_percentages, update_clone_statistics, update_source_statistics};
33
34#[derive(Clone, Debug)]
40pub struct Detector {
41 options: Options,
42 sources: Vec<SourceFile>,
43}
44
45impl Detector {
46 pub fn new(options: Options) -> Self {
48 Self {
49 options,
50 sources: Vec::new(),
51 }
52 }
53
54 pub fn with_sources(options: Options, sources: Vec<SourceFile>) -> Self {
56 Self { options, sources }
57 }
58
59 pub fn options(&self) -> &Options {
61 &self.options
62 }
63
64 pub fn options_mut(&mut self) -> &mut Options {
66 &mut self.options
67 }
68
69 pub fn sources(&self) -> &[SourceFile] {
71 &self.sources
72 }
73
74 pub fn clear(&mut self) {
76 self.sources.clear();
77 }
78
79 pub fn detect(
81 &mut self,
82 source_id: impl Into<String>,
83 text: impl Into<String>,
84 format: impl Into<String>,
85 ) -> Vec<CloneMatch> {
86 self.detect_source_file(SourceFile {
87 source_id: source_id.into(),
88 format: format.into(),
89 content: text.into(),
90 })
91 }
92
93 pub fn detect_source_file(&mut self, source: SourceFile) -> Vec<CloneMatch> {
95 let source_id = source.source_id.clone();
96 self.sources.push(source);
97 let result = detect(self.sources.clone(), &self.options);
98 result
99 .clones
100 .into_iter()
101 .filter(|clone| {
102 clone.duplication_a.source_id == source_id
103 || clone.duplication_b.source_id == source_id
104 })
105 .collect()
106 }
107
108 pub fn detect_files(&self, files: Vec<SourceFile>) -> DetectionResult {
110 detect(files, &self.options)
111 }
112}
113
114pub fn detect(files: Vec<SourceFile>, options: &Options) -> DetectionResult {
115 detect_prepared_drafts(prepare_source_drafts(files, options), options)
116}
117
118pub(crate) fn prepare_source_drafts(
119 files: Vec<SourceFile>,
120 options: &Options,
121) -> Vec<PreparedSourceDraft> {
122 files
123 .into_par_iter()
124 .map(|file| prepare_file_maps(file, options))
125 .collect::<Vec<_>>()
126 .into_iter()
127 .flatten()
128 .collect::<Vec<_>>()
129}
130
131pub(crate) fn detect_prepared_drafts(
132 prepared_drafts: Vec<PreparedSourceDraft>,
133 options: &Options,
134) -> DetectionResult {
135 let include_source_contents = options
136 .reporters
137 .iter()
138 .any(|reporter| matches!(reporter.as_str(), "json" | "xml" | "html" | "consoleFull"));
139 let mut source_contents = HashMap::new();
140 let (format_ids, format_names) = assign_formats(&prepared_drafts);
141 let prepared_files = prepared_drafts
142 .into_iter()
143 .enumerate()
144 .map(|(idx, draft)| {
145 if include_source_contents && !draft.spans.is_empty() {
146 source_contents
147 .entry(draft.meta.source_id.clone())
148 .or_insert_with(|| draft.content.to_string());
149 }
150 PreparedSource {
151 meta: draft.meta,
152 stream: TokenStream {
153 source_id: SourceId(idx),
154 format_id: format_ids[idx],
155 hashes: unwrap_or_clone_arc_vec(draft.hashes),
156 spans: unwrap_or_clone_arc_vec(draft.spans),
157 },
158 }
159 })
160 .collect::<Vec<_>>();
161
162 let mut statistics = Statistics::default();
163 let mut sources = Vec::new();
164 let mut source_indices_by_format = vec![Vec::new(); format_names.len()];
165
166 for (idx, prepared) in prepared_files.iter().enumerate() {
167 if prepared.stream.spans.is_empty() {
168 continue;
169 }
170 update_source_statistics(
171 &mut statistics,
172 &prepared.meta.source_id,
173 &prepared.meta.format,
174 prepared.meta.lines,
175 prepared.meta.tokens,
176 );
177 sources.push(SourceSummary {
178 path: prepared.meta.source_id.clone(),
179 format: prepared.meta.format.clone(),
180 lines: prepared.meta.lines,
181 tokens: prepared.meta.tokens,
182 });
183 source_indices_by_format[prepared.stream.format_id.0].push(idx);
184 }
185
186 let format_results = source_indices_by_format
187 .par_iter()
188 .enumerate()
189 .map(|(format_id, source_indices)| {
190 detect_format(
191 FormatId(format_id),
192 source_indices,
193 &prepared_files,
194 &format_names,
195 options,
196 )
197 })
198 .collect::<Vec<_>>();
199
200 let mut clones = Vec::new();
201 let mut skipped_clones = Vec::new();
202 for format_result in format_results {
203 clones.extend(format_result.clones);
204 skipped_clones.extend(format_result.skipped_clones);
205 }
206 dedup_exact_clones(&mut clones);
207 for clone in &clones {
208 update_clone_statistics(&mut statistics, clone);
209 }
210
211 finalize_percentages(&mut statistics);
212
213 DetectionResult {
214 clones,
215 skipped_clones,
216 statistics,
217 sources,
218 source_contents,
219 }
220}
221
222fn dedup_exact_clones(clones: &mut Vec<CloneMatch>) {
223 let mut seen = FxHashSet::default();
224 clones.retain(|clone| seen.insert(CloneDedupKey::from(clone)));
225}
226
/// Returns the vector held by `value` as an owned `Vec`.
///
/// When `value` is the only strong reference the vector is moved out without
/// copying its elements; otherwise the shared vector is cloned.
fn unwrap_or_clone_arc_vec<T: Clone>(value: Arc<Vec<T>>) -> Vec<T> {
    // The standard library already provides this exact "take or clone" operation.
    Arc::unwrap_or_clone(value)
}
230
231#[derive(Hash, Eq, PartialEq)]
232struct CloneDedupKey {
233 format: String,
234 duplication_a: FragmentDedupKey,
235 duplication_b: FragmentDedupKey,
236 tokens: usize,
237}
238
239impl From<&CloneMatch> for CloneDedupKey {
240 fn from(clone: &CloneMatch) -> Self {
241 Self {
242 format: clone.format.clone(),
243 duplication_a: FragmentDedupKey::from(&clone.duplication_a),
244 duplication_b: FragmentDedupKey::from(&clone.duplication_b),
245 tokens: clone.tokens,
246 }
247 }
248}
249
/// Hashable identity of one side of a clone: the source id plus the
/// fragment's start/end positions and its range.
#[derive(PartialEq, Eq, Hash)]
struct FragmentDedupKey {
    /// Identifier of the source the fragment belongs to.
    source_id: String,
    /// Line of the fragment's start position.
    start_line: usize,
    /// Column of the fragment's start position.
    start_column: usize,
    /// Line of the fragment's end position.
    end_line: usize,
    /// Column of the fragment's end position.
    end_column: usize,
    /// The fragment's `range` pair, copied verbatim.
    range: [usize; 2],
}
259
260impl From<&Fragment> for FragmentDedupKey {
261 fn from(fragment: &Fragment) -> Self {
262 Self {
263 source_id: fragment.source_id.clone(),
264 start_line: fragment.start.line,
265 start_column: fragment.start.column,
266 end_line: fragment.end.line,
267 end_column: fragment.end.column,
268 range: fragment.range,
269 }
270 }
271}