use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use super::{
classify_clone_type, compute_dice_similarity, CloneFragment, ClonePair, CloneType,
ClonesOptions,
};
use super::extract::FragmentData;
/// Identity of a fragment pair used for de-duplication: the `(file, start_line, end_line)`
/// of one fragment followed by the same three fields of the other, in the order chosen by
/// `create_pair_key`.
pub type PairKey = (PathBuf, usize, usize, PathBuf, usize, usize);
/// Finds clone pairs among fragments that share a raw hash or a normalized hash.
///
/// Two passes run over `fragments`:
///
/// 1. Fragments with equal `raw_hash` are compared pairwise; a pair whose raw-token Dice
///    similarity is at least 0.99 is reported as [`CloneType::Type1`].
/// 2. Fragments with equal `normalized_hash` are compared pairwise; a pair is reported as
///    `Type1` (raw similarity >= 0.99) or `Type2` (raw similarity >= 0.9, or normalized
///    similarity >= 0.9), and dropped otherwise.
///
/// Pairs rejected by `should_skip_pair`, pairs already present in `found_pairs`, and pairs
/// whose type does not match `options.type_filter` are not reported. Every reported pair is
/// recorded in `found_pairs`. The first pass additionally records a pair *before* the type
/// filter is applied, so a filtered-out exact match still counts as "seen" afterwards.
///
/// At most `options.max_clones` pairs are returned, each created with id `0`.
///
/// Hash buckets are visited in the order of their first fragment in `fragments`, so the
/// result (including which pairs survive the `max_clones` cut-off) is reproducible for a
/// given input rather than depending on `HashMap` iteration order.
pub fn detect_type1_type2(
    fragments: &[FragmentData],
    options: &ClonesOptions,
    found_pairs: &mut HashSet<PairKey>,
) -> Vec<ClonePair> {
    // Raw-token similarity at or above this value is treated as an exact (Type-1) match.
    const TYPE1_MIN_SIMILARITY: f64 = 0.99;
    // Raw or normalized similarity at or above this value is treated as a Type-2 match.
    const TYPE2_MIN_SIMILARITY: f64 = 0.9;

    let mut clone_pairs: Vec<ClonePair> = Vec::new();

    // Pass 1: identical raw hashes.
    let raw_buckets = group_indices_by_hash(fragments, |frag| frag.raw_hash);
    'raw: for bucket in &raw_buckets {
        for (pos, &idx_a) in bucket.iter().enumerate() {
            for &idx_b in &bucket[pos + 1..] {
                if clone_pairs.len() >= options.max_clones {
                    break 'raw;
                }
                let frag_a = &fragments[idx_a];
                let frag_b = &fragments[idx_b];
                if should_skip_pair(frag_a, frag_b, options) {
                    continue;
                }
                // Equal hashes do not prove equal tokens; confirm on the tokens themselves.
                let similarity =
                    compute_dice_similarity(&frag_a.raw_tokens, &frag_b.raw_tokens);
                if similarity < TYPE1_MIN_SIMILARITY {
                    continue;
                }
                // `insert` returns false when the key was already present. The key is
                // recorded even if the type filter below rejects the pair.
                if !found_pairs.insert(create_pair_key(frag_a, frag_b)) {
                    continue;
                }
                let clone_type = CloneType::Type1;
                if let Some(filter) = options.type_filter {
                    if clone_type != filter {
                        continue;
                    }
                }
                clone_pairs.push(make_clone_pair(0, clone_type, similarity, frag_a, frag_b));
            }
        }
    }

    // Pass 2: identical normalized hashes.
    let norm_buckets = group_indices_by_hash(fragments, |frag| frag.normalized_hash);
    'norm: for bucket in &norm_buckets {
        for (pos, &idx_a) in bucket.iter().enumerate() {
            for &idx_b in &bucket[pos + 1..] {
                if clone_pairs.len() >= options.max_clones {
                    break 'norm;
                }
                let frag_a = &fragments[idx_a];
                let frag_b = &fragments[idx_b];
                if should_skip_pair(frag_a, frag_b, options) {
                    continue;
                }
                let pair_key = create_pair_key(frag_a, frag_b);
                if found_pairs.contains(&pair_key) {
                    continue;
                }
                let raw_similarity =
                    compute_dice_similarity(&frag_a.raw_tokens, &frag_b.raw_tokens);
                let (clone_type, similarity) = if raw_similarity >= TYPE1_MIN_SIMILARITY {
                    (CloneType::Type1, raw_similarity)
                } else if raw_similarity >= TYPE2_MIN_SIMILARITY {
                    (CloneType::Type2, raw_similarity)
                } else {
                    // Raw tokens differ too much; fall back to the normalized tokens.
                    let norm_sim = compute_dice_similarity(
                        &frag_a.normalized_tokens,
                        &frag_b.normalized_tokens,
                    );
                    if norm_sim < TYPE2_MIN_SIMILARITY {
                        continue;
                    }
                    (CloneType::Type2, raw_similarity.max(norm_sim))
                };
                if let Some(filter) = options.type_filter {
                    if clone_type != filter {
                        continue;
                    }
                }
                // Unlike pass 1, the key is only recorded for pairs that are reported.
                found_pairs.insert(pair_key);
                clone_pairs.push(make_clone_pair(0, clone_type, similarity, frag_a, frag_b));
            }
        }
    }

    clone_pairs
}

/// Groups fragment indices by the hash returned from `hash_of`.
///
/// Only groups with at least two members are returned. Groups are ordered by the position
/// of their first member in `fragments`, and indices inside a group are ascending.
fn group_indices_by_hash(
    fragments: &[FragmentData],
    hash_of: impl Fn(&FragmentData) -> u64,
) -> Vec<Vec<usize>> {
    let mut slot_by_hash: HashMap<u64, usize> = HashMap::new();
    let mut buckets: Vec<Vec<usize>> = Vec::new();
    for (idx, frag) in fragments.iter().enumerate() {
        // A hash seen for the first time opens a new bucket at the end of `buckets`.
        let slot = *slot_by_hash.entry(hash_of(frag)).or_insert_with(|| {
            buckets.push(Vec::new());
            buckets.len() - 1
        });
        buckets[slot].push(idx);
    }
    buckets.retain(|bucket| bucket.len() >= 2);
    buckets
}
/// Finds similar fragment pairs that do not need to share a hash, using an inverted index
/// over raw token values to pick candidates.
///
/// For each fragment, every later fragment that shares at least one indexed token value
/// becomes a candidate. A candidate survives the cheap pre-filter when the number of shared
/// distinct tokens is at least `ceil(threshold * (size1 + size2) / 2)`, where the sizes are
/// the distinct-token counts of the two fragments. Survivors are then checked with
/// `should_skip_pair`, `found_pairs`, the raw-token Dice similarity against
/// `options.threshold`, and `options.type_filter` (applied to the result of
/// `classify_clone_type`).
///
/// Reported pairs are recorded in `found_pairs`. At most `options.max_clones` pairs are
/// returned, each created with id `0`.
///
/// Candidates are visited in ascending fragment index, so the result (including which pairs
/// survive the `max_clones` cut-off) is reproducible for a given input.
pub fn detect_type3(
    fragments: &[FragmentData],
    options: &ClonesOptions,
    found_pairs: &mut HashSet<PairKey>,
) -> Vec<ClonePair> {
    // Upper bound on how many fragments are indexed per token value.
    // NOTE(review): fragments beyond this cap are not indexed for that token, so their
    // shared-token counts can be under-estimated and the pre-filter may drop real matches;
    // presumably an intentional speed/recall trade-off - confirm.
    const MAX_POSTINGS_PER_TOKEN: usize = 500;

    let mut clone_pairs: Vec<ClonePair> = Vec::new();

    // token value -> indices of fragments containing it (ascending, capped).
    // Keys borrow from `fragments`, so no per-token allocation is needed.
    let mut inverted: HashMap<&str, Vec<usize>> = HashMap::new();
    // Distinct-token count of every fragment, computed once instead of per candidate pair.
    let mut unique_sizes: Vec<usize> = Vec::with_capacity(fragments.len());
    for (frag_idx, frag) in fragments.iter().enumerate() {
        let unique_tokens: HashSet<&str> =
            frag.raw_tokens.iter().map(|t| t.value.as_str()).collect();
        unique_sizes.push(unique_tokens.len());
        for token in unique_tokens {
            let postings = inverted.entry(token).or_default();
            if postings.len() < MAX_POSTINGS_PER_TOKEN {
                postings.push(frag_idx);
            }
        }
    }

    'fragments: for (frag_idx, frag) in fragments.iter().enumerate() {
        if clone_pairs.len() >= options.max_clones {
            break;
        }

        let unique_tokens: HashSet<&str> =
            frag.raw_tokens.iter().map(|t| t.value.as_str()).collect();

        // Count shared distinct tokens with every later fragment; earlier fragments have
        // already been paired with this one when they were the outer fragment.
        let mut shared_counts: HashMap<usize, usize> = HashMap::new();
        for &token in &unique_tokens {
            if let Some(postings) = inverted.get(token) {
                for &other_idx in postings {
                    if other_idx > frag_idx {
                        *shared_counts.entry(other_idx).or_insert(0) += 1;
                    }
                }
            }
        }

        // Fix the visiting order; `HashMap` iteration order is arbitrary.
        let mut candidates: Vec<(usize, usize)> = shared_counts.into_iter().collect();
        candidates.sort_unstable();

        let size1 = unique_sizes[frag_idx];
        for (other_idx, shared) in candidates {
            if clone_pairs.len() >= options.max_clones {
                break 'fragments;
            }

            let size2 = unique_sizes[other_idx];
            let min_shared =
                ((options.threshold * (size1 + size2) as f64) / 2.0).ceil() as usize;
            if shared < min_shared {
                continue;
            }

            let frag_a = frag;
            let frag_b = &fragments[other_idx];
            if should_skip_pair(frag_a, frag_b, options) {
                continue;
            }
            let pair_key = create_pair_key(frag_a, frag_b);
            if found_pairs.contains(&pair_key) {
                continue;
            }

            let similarity = compute_dice_similarity(&frag_a.raw_tokens, &frag_b.raw_tokens);
            if similarity < options.threshold {
                continue;
            }
            let clone_type = classify_clone_type(similarity);
            if let Some(filter) = options.type_filter {
                if clone_type != filter {
                    continue;
                }
            }

            found_pairs.insert(pair_key);
            clone_pairs.push(make_clone_pair(0, clone_type, similarity, frag_a, frag_b));
        }
    }

    clone_pairs
}
/// Decides whether a fragment pair must be ignored.
///
/// Fragments with different `file_idx` are never skipped. Fragments with the same
/// `file_idx` are skipped when `options.include_within_file` is off, or when their line
/// ranges overlap.
fn should_skip_pair(frag_a: &FragmentData, frag_b: &FragmentData, options: &ClonesOptions) -> bool {
    // Cross-file pairs are always eligible.
    if frag_a.file_idx != frag_b.file_idx {
        return false;
    }
    // Same file from here on.
    if !options.include_within_file {
        return true;
    }
    ranges_overlap(
        frag_a.start_line,
        frag_a.end_line,
        frag_b.start_line,
        frag_b.end_line,
    )
}
/// Returns `true` when the inclusive ranges `[start1, end1]` and `[start2, end2]` have at
/// least one value in common.
fn ranges_overlap(start1: usize, end1: usize, start2: usize, end2: usize) -> bool {
    // Two ranges are disjoint exactly when one ends before the other begins.
    let first_ends_before_second = end1 < start2;
    let second_ends_before_first = end2 < start1;
    !(first_ends_before_second || second_ends_before_first)
}
/// Builds the de-duplication key for a fragment pair.
///
/// The key is the `(file, start_line, end_line)` triple of each fragment, with the
/// lexicographically smaller triple first. Comparing the full triple (rather than only file
/// and start line) makes the key independent of argument order even when both fragments
/// share a file and a start line: `create_pair_key(a, b) == create_pair_key(b, a)` always.
fn create_pair_key(frag_a: &FragmentData, frag_b: &FragmentData) -> PairKey {
    let span_a = (&frag_a.file, frag_a.start_line, frag_a.end_line);
    let span_b = (&frag_b.file, frag_b.start_line, frag_b.end_line);
    // Tuples compare element by element, so this is a total order on spans.
    let (first, second) = if span_a <= span_b {
        (frag_a, frag_b)
    } else {
        (frag_b, frag_a)
    };
    (
        first.file.clone(),
        first.start_line,
        first.end_line,
        second.file.clone(),
        second.start_line,
        second.end_line,
    )
}
/// Assembles a `ClonePair` from two extracted fragments.
///
/// Each fragment is converted to a `CloneFragment` carrying its file, line range, raw token
/// count and preview, plus the function name when one is present. The resulting pair is
/// passed through `ClonePair::canonical` before being returned.
fn make_clone_pair(
    id: usize,
    clone_type: CloneType,
    similarity: f64,
    frag_a: &FragmentData,
    frag_b: &FragmentData,
) -> ClonePair {
    // Shared conversion for both sides of the pair.
    let to_report_fragment = |data: &FragmentData| {
        let fragment = CloneFragment::new(
            data.file.clone(),
            data.start_line,
            data.end_line,
            data.raw_tokens.len(),
        )
        .with_preview(data.preview.clone());
        match data.function_name {
            Some(ref name) => fragment.with_function(name.clone()),
            None => fragment,
        }
    };

    let first = to_report_fragment(frag_a);
    let second = to_report_fragment(frag_b);
    ClonePair::new(id, clone_type, similarity, first, second).canonical()
}