Skip to main content

jscpd_rs/
detector.rs

1use std::collections::HashMap;
2use std::sync::Arc;
3
4use rayon::prelude::*;
5use rustc_hash::FxHashSet;
6
7use crate::cli::Options;
8use crate::files::SourceFile;
9
10mod matching;
11mod model;
12mod prepare;
13mod skip_local;
14mod statistics;
15mod store;
16#[cfg(test)]
17mod tests;
18
19#[cfg(test)]
20pub use model::FormatStatistic;
21pub(crate) use model::PreparedSourceDraft;
22pub use model::{
23    BlamedLine, BlamedLines, CloneMatch, DetectionResult, Fragment, SkippedClone, SourceSummary,
24    StatisticRow, Statistics,
25};
26pub use statistics::{Statistic, clone_lines};
27pub use store::{MemoryStore, MemoryStoreError};
28
29use matching::detect_format;
30use model::{FormatId, PreparedSource, SourceId, TokenStream};
31use prepare::{assign_formats, prepare_file_maps};
32use statistics::{finalize_percentages, update_clone_statistics, update_source_statistics};
33
34/// Incremental detector facade for native integrations.
35///
36/// For one-shot detection, prefer `detect_source_files` or
37/// `detect_clones_and_statistics`. Use this type when an integration wants to
38/// keep options and previously submitted in-memory sources together.
39#[derive(Clone, Debug)]
40pub struct Detector {
41    options: Options,
42    sources: Vec<SourceFile>,
43}
44
45impl Detector {
46    /// Create an empty detector with the provided options.
47    pub fn new(options: Options) -> Self {
48        Self {
49            options,
50            sources: Vec::new(),
51        }
52    }
53
54    /// Create a detector preloaded with in-memory sources.
55    pub fn with_sources(options: Options, sources: Vec<SourceFile>) -> Self {
56        Self { options, sources }
57    }
58
59    /// Return detector options.
60    pub fn options(&self) -> &Options {
61        &self.options
62    }
63
64    /// Mutably access detector options.
65    pub fn options_mut(&mut self) -> &mut Options {
66        &mut self.options
67    }
68
69    /// Return sources currently held by this detector.
70    pub fn sources(&self) -> &[SourceFile] {
71        &self.sources
72    }
73
74    /// Remove all remembered sources.
75    pub fn clear(&mut self) {
76        self.sources.clear();
77    }
78
79    /// Add one source and return clones involving that new source.
80    pub fn detect(
81        &mut self,
82        source_id: impl Into<String>,
83        text: impl Into<String>,
84        format: impl Into<String>,
85    ) -> Vec<CloneMatch> {
86        self.detect_source_file(SourceFile {
87            source_id: source_id.into(),
88            format: format.into(),
89            content: text.into(),
90        })
91    }
92
93    /// Add one prepared source and return clones involving that new source.
94    pub fn detect_source_file(&mut self, source: SourceFile) -> Vec<CloneMatch> {
95        let source_id = source.source_id.clone();
96        self.sources.push(source);
97        let result = detect(self.sources.clone(), &self.options);
98        result
99            .clones
100            .into_iter()
101            .filter(|clone| {
102                clone.duplication_a.source_id == source_id
103                    || clone.duplication_b.source_id == source_id
104            })
105            .collect()
106    }
107
108    /// Run one-shot detection against the provided prepared sources.
109    pub fn detect_files(&self, files: Vec<SourceFile>) -> DetectionResult {
110        detect(files, &self.options)
111    }
112}
113
114pub fn detect(files: Vec<SourceFile>, options: &Options) -> DetectionResult {
115    detect_prepared_drafts(prepare_source_drafts(files, options), options)
116}
117
118pub(crate) fn prepare_source_drafts(
119    files: Vec<SourceFile>,
120    options: &Options,
121) -> Vec<PreparedSourceDraft> {
122    files
123        .into_par_iter()
124        .map(|file| prepare_file_maps(file, options))
125        .collect::<Vec<_>>()
126        .into_iter()
127        .flatten()
128        .collect::<Vec<_>>()
129}
130
131pub(crate) fn detect_prepared_drafts(
132    prepared_drafts: Vec<PreparedSourceDraft>,
133    options: &Options,
134) -> DetectionResult {
135    let include_source_contents = options
136        .reporters
137        .iter()
138        .any(|reporter| matches!(reporter.as_str(), "json" | "xml" | "html" | "consoleFull"));
139    let mut source_contents = HashMap::new();
140    let (format_ids, format_names) = assign_formats(&prepared_drafts);
141    let prepared_files = prepared_drafts
142        .into_iter()
143        .enumerate()
144        .map(|(idx, draft)| {
145            if include_source_contents && !draft.spans.is_empty() {
146                source_contents
147                    .entry(draft.meta.source_id.clone())
148                    .or_insert_with(|| draft.content.to_string());
149            }
150            PreparedSource {
151                meta: draft.meta,
152                stream: TokenStream {
153                    source_id: SourceId(idx),
154                    format_id: format_ids[idx],
155                    hashes: unwrap_or_clone_arc_vec(draft.hashes),
156                    spans: unwrap_or_clone_arc_vec(draft.spans),
157                },
158            }
159        })
160        .collect::<Vec<_>>();
161
162    let mut statistics = Statistics::default();
163    let mut sources = Vec::new();
164    let mut source_indices_by_format = vec![Vec::new(); format_names.len()];
165
166    for (idx, prepared) in prepared_files.iter().enumerate() {
167        if prepared.stream.spans.is_empty() {
168            continue;
169        }
170        update_source_statistics(
171            &mut statistics,
172            &prepared.meta.source_id,
173            &prepared.meta.format,
174            prepared.meta.lines,
175            prepared.meta.tokens,
176        );
177        sources.push(SourceSummary {
178            path: prepared.meta.source_id.clone(),
179            format: prepared.meta.format.clone(),
180            lines: prepared.meta.lines,
181            tokens: prepared.meta.tokens,
182        });
183        source_indices_by_format[prepared.stream.format_id.0].push(idx);
184    }
185
186    let format_results = source_indices_by_format
187        .par_iter()
188        .enumerate()
189        .map(|(format_id, source_indices)| {
190            detect_format(
191                FormatId(format_id),
192                source_indices,
193                &prepared_files,
194                &format_names,
195                options,
196            )
197        })
198        .collect::<Vec<_>>();
199
200    let mut clones = Vec::new();
201    let mut skipped_clones = Vec::new();
202    for format_result in format_results {
203        clones.extend(format_result.clones);
204        skipped_clones.extend(format_result.skipped_clones);
205    }
206    dedup_exact_clones(&mut clones);
207    for clone in &clones {
208        update_clone_statistics(&mut statistics, clone);
209    }
210
211    finalize_percentages(&mut statistics);
212
213    DetectionResult {
214        clones,
215        skipped_clones,
216        statistics,
217        sources,
218        source_contents,
219    }
220}
221
222fn dedup_exact_clones(clones: &mut Vec<CloneMatch>) {
223    let mut seen = FxHashSet::default();
224    clones.retain(|clone| seen.insert(CloneDedupKey::from(clone)));
225}
226
/// Turn an `Arc<Vec<T>>` into an owned `Vec<T>`.
///
/// When `value` is the only strong reference the vector is moved out without
/// copying; otherwise the elements are cloned and the other holders keep
/// their shared copy.
fn unwrap_or_clone_arc_vec<T: Clone>(value: Arc<Vec<T>>) -> Vec<T> {
    match Arc::try_unwrap(value) {
        Ok(owned) => owned,
        Err(shared) => Vec::clone(&shared),
    }
}
230
231#[derive(Hash, Eq, PartialEq)]
232struct CloneDedupKey {
233    format: String,
234    duplication_a: FragmentDedupKey,
235    duplication_b: FragmentDedupKey,
236    tokens: usize,
237}
238
239impl From<&CloneMatch> for CloneDedupKey {
240    fn from(clone: &CloneMatch) -> Self {
241        Self {
242            format: clone.format.clone(),
243            duplication_a: FragmentDedupKey::from(&clone.duplication_a),
244            duplication_b: FragmentDedupKey::from(&clone.duplication_b),
245            tokens: clone.tokens,
246        }
247    }
248}
249
250#[derive(Hash, Eq, PartialEq)]
251struct FragmentDedupKey {
252    source_id: String,
253    start_line: usize,
254    start_column: usize,
255    end_line: usize,
256    end_column: usize,
257    range: [usize; 2],
258}
259
260impl From<&Fragment> for FragmentDedupKey {
261    fn from(fragment: &Fragment) -> Self {
262        Self {
263            source_id: fragment.source_id.clone(),
264            start_line: fragment.start.line,
265            start_column: fragment.start.column,
266            end_line: fragment.end.line,
267            end_column: fragment.end.column,
268            range: fragment.range,
269        }
270    }
271}