use rayon::prelude::*;
use rustc_hash::{FxHashMap, FxHashSet};
use std::path::Path;
use crate::{
hash::{base_pow, hash_window, roll, token_hash},
models::{CpdClone, DetectionToken, Fragment, Location, SourceFile, TokenKind},
};
type WindowStore = FxHashMap<u64, Occurrence>;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Occurrence {
source_id: usize,
token_start: usize,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct CloneDedupKey {
a_id: String,
a_start_line: u32,
b_id: String,
b_start_line: u32,
}
impl CloneDedupKey {
fn from_clone(c: &CpdClone) -> Self {
let a_key = (&c.fragment_a.source_id, c.fragment_a.start.line);
let b_key = (&c.fragment_b.source_id, c.fragment_b.start.line);
if a_key <= b_key {
Self {
a_id: c.fragment_a.source_id.clone(),
a_start_line: c.fragment_a.start.line,
b_id: c.fragment_b.source_id.clone(),
b_start_line: c.fragment_b.start.line,
}
} else {
Self {
a_id: c.fragment_b.source_id.clone(),
a_start_line: c.fragment_b.start.line,
b_id: c.fragment_a.source_id.clone(),
b_start_line: c.fragment_a.start.line,
}
}
}
}
pub fn detect(files: &[SourceFile], min_tokens: usize) -> Vec<CpdClone> {
detect_with_options(files, min_tokens, false, 0)
}
pub fn detect_with_options(
files: &[SourceFile],
min_tokens: usize,
skip_local: bool,
min_lines: usize,
) -> Vec<CpdClone> {
if files.is_empty() || min_tokens == 0 {
return vec![];
}
let mut by_format: FxHashMap<&str, Vec<usize>> = FxHashMap::default();
for (idx, file) in files.iter().enumerate() {
by_format.entry(file.format.as_str()).or_default().push(idx);
}
let mut format_groups: Vec<(&str, Vec<usize>)> = by_format.into_iter().collect();
format_groups.sort_unstable_by_key(|(fmt, _)| *fmt);
for (_, group) in &mut format_groups {
group.sort_unstable_by_key(|&idx| files[idx].id.as_str());
}
let all_clones: Vec<Vec<CpdClone>> = format_groups
.into_par_iter()
.map(|(_format, indices)| {
let prepared: Vec<PreparedSource> = indices
.iter()
.map(|&idx| {
let file = &files[idx];
let mut hashes = Vec::with_capacity(file.tokens.len());
let mut spans: Vec<(Location, Location)> =
Vec::with_capacity(file.tokens.len());
for t in &file.tokens {
if t.kind == TokenKind::Ignore {
continue;
}
hashes.push(token_hash(t.kind.discriminant(), &t.value));
spans.push((t.start.clone(), t.end.clone()));
}
PreparedSource {
id: file.id.clone(),
format: file.format.clone(),
hashes,
spans,
}
})
.collect();
detect_in_group(&prepared, min_tokens, skip_local, min_lines)
})
.collect();
let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
dedup_exact_clones(&mut clones);
suppress_subclones(&mut clones);
clones.sort_by(|a, b| {
(
&a.fragment_a.source_id,
a.fragment_a.start.line,
&a.fragment_b.source_id,
a.fragment_b.start.line,
)
.cmp(&(
&b.fragment_a.source_id,
b.fragment_a.start.line,
&b.fragment_b.source_id,
b.fragment_b.start.line,
))
});
clones
}
pub struct PreparedSource {
pub id: String,
pub format: String,
pub hashes: Vec<u64>,
pub spans: Vec<(Location, Location)>,
}
impl PreparedSource {
pub fn from_detection_tokens(id: String, format: String, tokens: &[DetectionToken]) -> Self {
let mut hashes = Vec::with_capacity(tokens.len());
let mut spans = Vec::with_capacity(tokens.len());
for t in tokens {
hashes.push(t.hash);
spans.push((t.start.clone(), t.end.clone()));
}
Self {
id,
format,
hashes,
spans,
}
}
}
pub fn detect_prepared(
format_groups: Vec<Vec<PreparedSource>>,
min_tokens: usize,
skip_local: bool,
min_lines: usize,
) -> Vec<CpdClone> {
if format_groups.is_empty() || min_tokens == 0 {
return vec![];
}
let all_clones: Vec<Vec<CpdClone>> = format_groups
.into_par_iter()
.map(|group| detect_in_group(&group, min_tokens, skip_local, min_lines))
.collect();
let mut clones: Vec<CpdClone> = all_clones.into_iter().flatten().collect();
dedup_exact_clones(&mut clones);
suppress_subclones(&mut clones);
clones.sort_by(|a, b| {
(
&a.fragment_a.source_id,
a.fragment_a.start.line,
&a.fragment_b.source_id,
a.fragment_b.start.line,
)
.cmp(&(
&b.fragment_a.source_id,
b.fragment_a.start.line,
&b.fragment_b.source_id,
b.fragment_b.start.line,
))
});
clones
}
fn detect_in_group(
prepared: &[PreparedSource],
min_tokens: usize,
skip_local: bool,
min_lines: usize,
) -> Vec<CpdClone> {
let window_power = base_pow(min_tokens.saturating_sub(1));
let total_windows: usize = prepared
.iter()
.map(|p| p.hashes.len().saturating_sub(min_tokens))
.sum();
let mut store: WindowStore =
FxHashMap::with_capacity_and_hasher(total_windows, Default::default());
let mut clones: Vec<CpdClone> = Vec::new();
const SECONDARY_OCCURRENCE_CAP: usize = 2;
let mut repeated_windows: FxHashMap<u64, Vec<Occurrence>> = FxHashMap::default();
for (file_idx, source) in prepared.iter().enumerate() {
let hashes = &source.hashes;
if hashes.len() < min_tokens {
continue;
}
let windows_len = hashes.len() - min_tokens + 1;
let mut open_clone: Option<OpenClone> = None;
let mut window_hash = hash_window(&hashes[..min_tokens]);
for token_start in 0..windows_len {
if token_start > 0 {
window_hash = roll(
window_hash,
hashes[token_start - 1],
hashes[token_start + min_tokens - 1],
window_power,
);
}
let current = Occurrence {
source_id: file_idx,
token_start,
};
match store.get(&window_hash).copied() {
Some(stored) if windows_match(stored, current, prepared) => {
if open_clone.is_none() {
open_clone = Some(OpenClone {
stored_occurrence: stored,
current_start: token_start,
match_len: min_tokens,
});
} else if let Some(ref mut oc) = open_clone {
oc.match_len += 1;
}
remember_repeated_window(
&mut repeated_windows,
window_hash,
stored,
SECONDARY_OCCURRENCE_CAP,
);
remember_repeated_window(
&mut repeated_windows,
window_hash,
current,
SECONDARY_OCCURRENCE_CAP,
);
}
_ => {
flush_clone(
open_clone.take(),
file_idx,
prepared,
skip_local,
min_lines,
&mut clones,
);
store.insert(window_hash, current);
}
}
}
flush_clone(
open_clone.take(),
file_idx,
prepared,
skip_local,
min_lines,
&mut clones,
);
}
add_secondary_clones(
repeated_windows,
prepared,
min_tokens,
skip_local,
min_lines,
&mut clones,
);
clones
}
struct OpenClone {
stored_occurrence: Occurrence,
current_start: usize,
match_len: usize,
}
fn windows_match(stored: Occurrence, current: Occurrence, _prepared: &[PreparedSource]) -> bool {
if stored.source_id == current.source_id && stored.token_start == current.token_start {
return false; }
true
}
fn flush_clone(
open: Option<OpenClone>,
current_file_idx: usize,
prepared: &[PreparedSource],
skip_local: bool,
min_lines: usize,
clones: &mut Vec<CpdClone>,
) {
let oc = match open {
Some(o) => o,
None => return,
};
let existing = &oc.stored_occurrence;
let cur_start = oc.current_start;
let match_len = oc.match_len;
let existing_file = &prepared[existing.source_id];
let current_file = &prepared[current_file_idx];
let ex_start = existing.token_start;
let ex_end = ex_start + match_len - 1;
let cur_end = cur_start + match_len - 1;
if existing.source_id == current_file_idx {
let overlap = if ex_start < cur_start {
ex_end > cur_start
} else {
cur_end > ex_start
};
if overlap {
return;
}
}
if skip_local {
let dir_a = Path::new(&existing_file.id).parent();
let dir_b = Path::new(¤t_file.id).parent();
if dir_a == dir_b {
return;
}
}
let fragment_a = match make_fragment(&existing_file.id, &existing_file.spans, ex_start, ex_end)
{
Some(f) => f,
None => return,
};
let fragment_b = match make_fragment(¤t_file.id, ¤t_file.spans, cur_start, cur_end)
{
Some(f) => f,
None => return,
};
if min_lines > 0 {
let span_a = fragment_a.end.line as i64 - fragment_a.start.line as i64;
let span_b = fragment_b.end.line as i64 - fragment_b.start.line as i64;
if span_a < min_lines as i64 && span_b < min_lines as i64 {
return;
}
}
clones.push(CpdClone {
format: current_file.format.clone(),
fragment_a,
fragment_b,
token_count: match_len as u32,
});
}
fn make_fragment(
source_id: &str,
spans: &[(Location, Location)],
start_idx: usize,
end_idx: usize,
) -> Option<Fragment> {
let (first_start, _) = spans.get(start_idx)?;
let (_, last_end) = spans.get(end_idx)?;
Some(Fragment {
source_id: source_id.to_string(),
start: first_start.clone(),
end: last_end.clone(),
range: [start_idx as u32, end_idx as u32],
blame: None,
})
}
fn dedup_exact_clones(clones: &mut Vec<CpdClone>) {
for clone in clones.iter_mut() {
let a_key = (&clone.fragment_a.source_id, clone.fragment_a.start.line);
let b_key = (&clone.fragment_b.source_id, clone.fragment_b.start.line);
if a_key > b_key {
std::mem::swap(&mut clone.fragment_a, &mut clone.fragment_b);
}
}
let mut seen: FxHashSet<CloneDedupKey> = FxHashSet::default();
clones.retain(|c| seen.insert(CloneDedupKey::from_clone(c)));
}
fn suppress_subclones(clones: &mut Vec<CpdClone>) {
use std::cmp::Reverse;
clones.sort_by_key(|c| Reverse(c.token_count));
let n = clones.len();
let mut keep = vec![true; n];
for i in 0..n {
if !keep[i] {
continue;
}
let big_a_id = clones[i].fragment_a.source_id.clone();
let big_b_id = clones[i].fragment_b.source_id.clone();
let big_a_range = clones[i].fragment_a.range;
let big_b_range = clones[i].fragment_b.range;
for j in (i + 1)..n {
if !keep[j] {
continue;
}
let small = &clones[j];
if small.fragment_a.source_id != big_a_id || small.fragment_b.source_id != big_b_id {
continue;
}
if big_a_range[0] <= small.fragment_a.range[0]
&& big_a_range[1] >= small.fragment_a.range[1]
&& big_b_range[0] <= small.fragment_b.range[0]
&& big_b_range[1] >= small.fragment_b.range[1]
{
keep[j] = false;
}
}
}
let mut i = 0;
clones.retain(|_| {
let k = keep[i];
i += 1;
k
});
}
fn remember_repeated_window(
repeated_windows: &mut FxHashMap<u64, Vec<Occurrence>>,
hash: u64,
occurrence: Occurrence,
cap: usize,
) {
let bucket = repeated_windows.entry(hash).or_default();
if bucket
.iter()
.any(|s| s.source_id == occurrence.source_id && s.token_start == occurrence.token_start)
{
return;
}
if bucket.len() < cap {
bucket.push(occurrence);
}
}
fn add_secondary_clones(
repeated_windows: FxHashMap<u64, Vec<Occurrence>>,
prepared: &[PreparedSource],
min_tokens: usize,
skip_local: bool,
min_lines: usize,
clones: &mut Vec<CpdClone>,
) {
if repeated_windows.is_empty() {
return;
}
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Candidate {
source_a: usize,
source_b: usize,
token_a: usize,
token_b: usize,
}
let mut candidates: Vec<Candidate> = Vec::new();
for occurrences in repeated_windows.values() {
if occurrences.len() < 2 {
continue;
}
for li in 0..occurrences.len() {
for ri in li + 1..occurrences.len() {
let left = &occurrences[li];
let right = &occurrences[ri];
if left.source_id == right.source_id && left.token_start == right.token_start {
continue;
}
let lh = &prepared[left.source_id].hashes;
let rh = &prepared[right.source_id].hashes;
let la = left.token_start;
let ra = right.token_start;
if la + min_tokens > lh.len() || ra + min_tokens > rh.len() {
continue;
}
if lh[la..la + min_tokens] != rh[ra..ra + min_tokens] {
continue;
}
let (sa, ta, sb, tb) =
if (left.source_id, left.token_start) <= (right.source_id, right.token_start) {
(
left.source_id,
left.token_start,
right.source_id,
right.token_start,
)
} else {
(
right.source_id,
right.token_start,
left.source_id,
left.token_start,
)
};
candidates.push(Candidate {
source_a: sa,
source_b: sb,
token_a: ta,
token_b: tb,
});
}
}
}
if candidates.is_empty() {
return;
}
candidates.sort_unstable();
candidates.dedup();
let n_files = prepared.len();
let mut covered: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n_files];
for c in clones.iter() {
let fa = prepared.iter().position(|p| p.id == c.fragment_a.source_id);
let fb = prepared.iter().position(|p| p.id == c.fragment_b.source_id);
if let Some(idx) = fa {
covered[idx].push((c.fragment_a.start.line, c.fragment_a.end.line));
}
if let Some(idx) = fb {
covered[idx].push((c.fragment_b.start.line, c.fragment_b.end.line));
}
}
for ranges in &mut covered {
ranges.sort_unstable();
}
let line_extends_coverage = |file_idx: usize, start: u32, end: u32| -> bool {
let ranges = &covered[file_idx];
let mut next = start;
for &(rs, re) in ranges {
if re < next {
continue;
}
if rs > next {
return true;
}
next = next.max(re.saturating_add(1));
if next > end {
return false;
}
}
next <= end
};
for cand in candidates {
let existing = Occurrence {
source_id: cand.source_a,
token_start: cand.token_a,
};
let current = Occurrence {
source_id: cand.source_b,
token_start: cand.token_b,
};
if skip_local {
let dir_a = Path::new(&prepared[existing.source_id].id).parent();
let dir_b = Path::new(&prepared[current.source_id].id).parent();
if dir_a == dir_b {
continue;
}
}
let ex_hashes = &prepared[existing.source_id].hashes;
let cur_hashes = &prepared[current.source_id].hashes;
let max_extend = (ex_hashes
.len()
.saturating_sub(existing.token_start + min_tokens))
.min(
cur_hashes
.len()
.saturating_sub(current.token_start + min_tokens),
);
let mut extra = 0usize;
while extra < max_extend
&& ex_hashes[existing.token_start + min_tokens + extra]
== cur_hashes[current.token_start + min_tokens + extra]
{
extra += 1;
}
let match_len = min_tokens + extra;
let ex_start = existing.token_start;
let ex_end = ex_start + match_len - 1;
let cur_start = current.token_start;
let cur_end = cur_start + match_len - 1;
let frag_a = match make_fragment(
&prepared[existing.source_id].id,
&prepared[existing.source_id].spans,
ex_start,
ex_end,
) {
Some(f) => f,
None => continue,
};
let frag_b = match make_fragment(
&prepared[current.source_id].id,
&prepared[current.source_id].spans,
cur_start,
cur_end,
) {
Some(f) => f,
None => continue,
};
let nc = CpdClone {
format: prepared[current.source_id].format.clone(),
fragment_a: frag_a,
fragment_b: frag_b,
token_count: match_len as u32,
};
if min_lines > 0 {
let span_a = nc.fragment_a.end.line as i64 - nc.fragment_a.start.line as i64;
let span_b = nc.fragment_b.end.line as i64 - nc.fragment_b.start.line as i64;
if span_a < min_lines as i64 && span_b < min_lines as i64 {
continue;
}
}
let extends_a = line_extends_coverage(
cand.source_a,
nc.fragment_a.start.line,
nc.fragment_a.end.line,
);
let extends_b = line_extends_coverage(
cand.source_b,
nc.fragment_b.start.line,
nc.fragment_b.end.line,
);
if extends_a || extends_b {
clones.push(nc);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::models::{Location, Token, TokenKind};
fn loc(line: u32, col: u32, offset: u32) -> Location {
Location {
line,
column: col,
offset,
}
}
fn make_token(kind: TokenKind, value: &str, line: u32, col: u32, offset: u32) -> Token {
let end_col = col + value.len() as u32;
let end_off = offset + value.len() as u32;
Token {
kind,
value: value.to_string(),
start: loc(line, col, offset),
end: loc(line, end_col, end_off),
}
}
fn make_file(id: &str, format: &str, tokens: Vec<Token>) -> SourceFile {
SourceFile {
id: id.to_string(),
format: format.to_string(),
tokens,
}
}
fn js_tokens_ab() -> Vec<Token> {
vec![
make_token(TokenKind::Keyword, "function", 1, 0, 0),
make_token(TokenKind::Other, "hello", 1, 9, 9),
make_token(TokenKind::Operator, "(", 1, 14, 14),
make_token(TokenKind::Operator, ")", 1, 15, 15),
make_token(TokenKind::Operator, "{", 1, 16, 16),
make_token(TokenKind::Keyword, "return", 2, 0, 18),
make_token(TokenKind::Literal, "42", 2, 7, 25),
make_token(TokenKind::Operator, ";", 2, 9, 27),
make_token(TokenKind::Operator, "}", 3, 0, 29),
]
}
#[test]
fn empty_input_returns_empty() {
let result = detect(&[], 10);
assert!(result.is_empty());
}
#[test]
fn identical_files_detected_as_clone() {
let tokens = js_tokens_ab();
let file_a = make_file("a.js", "javascript", tokens.clone());
let file_b = make_file("b.js", "javascript", tokens);
let clones = detect(&[file_a, file_b], 5);
assert!(
!clones.is_empty(),
"identical files must produce at least one clone"
);
}
#[test]
fn min_tokens_threshold_respected() {
let tokens = js_tokens_ab(); let file_a = make_file("a.js", "javascript", tokens.clone());
let file_b = make_file("b.js", "javascript", tokens);
let clones = detect(&[file_a, file_b], 100);
assert!(
clones.is_empty(),
"no clones when min_tokens exceeds file length"
);
}
#[test]
fn deduplication_ab_ba_collapse() {
let tokens = js_tokens_ab();
let file_a = make_file("a.js", "javascript", tokens.clone());
let file_b = make_file("b.js", "javascript", tokens);
let clones = detect(&[file_a, file_b], 5);
assert_eq!(clones.len(), 1, "symmetric pairs must collapse to 1");
}
#[test]
fn different_formats_not_cross_detected() {
let tokens = js_tokens_ab();
let file_js = make_file("a.js", "javascript", tokens.clone());
let file_py = make_file("a.py", "python", tokens);
let clones = detect(&[file_js, file_py], 5);
assert!(
clones.is_empty(),
"tokens from different formats must not match"
);
}
#[test]
fn identical_files_maximal_clone() {
let tokens = js_tokens_ab();
let file_a = make_file("a.js", "javascript", tokens.clone());
let file_b = make_file("b.js", "javascript", tokens);
let clones = detect(&[file_a, file_b], 5);
assert_eq!(
clones.len(),
1,
"open_clone SM must produce one maximal clone"
);
assert_eq!(
clones[0].token_count, 9,
"maximal clone must cover all 9 tokens"
);
}
#[test]
fn three_identical_files_secondary_pass_adds_missing_pair() {
let tokens = js_tokens_ab();
let file_a = make_file("a.js", "javascript", tokens.clone());
let file_b = make_file("b.js", "javascript", tokens.clone());
let file_c = make_file("c.js", "javascript", tokens);
let clones = detect(&[file_a, file_b, file_c], 5);
assert!(
clones.len() >= 2,
"three identical files must yield at least 2 clone pairs, got {}",
clones.len()
);
}
#[test]
fn clones_sorted_by_source_and_line() {
let tokens = js_tokens_ab();
let file_a = make_file("a.js", "javascript", tokens.clone());
let file_b = make_file("b.js", "javascript", tokens);
let clones = detect(&[file_a, file_b], 5);
for i in 1..clones.len() {
let prev = &clones[i - 1];
let curr = &clones[i];
assert!(
(
&prev.fragment_a.source_id,
prev.fragment_a.start.line,
&prev.fragment_b.source_id,
prev.fragment_b.start.line,
) <= (
&curr.fragment_a.source_id,
curr.fragment_a.start.line,
&curr.fragment_b.source_id,
curr.fragment_b.start.line,
),
"clones must be sorted"
);
}
}
}