use std::collections::{HashMap, HashSet};
use std::path::Path;
use super::types::{CloneGroup, CloneType};
/// One code fragment of the benchmark corpus.
///
/// Ground-truth pairs refer to fragments by `id`; evaluation matches a fragment
/// against detected clone instances through the key `file:start_line:end_line`.
#[derive(Debug, Clone)]
pub struct CodeFragment {
/// Identifier referenced by `GroundTruthPair::fragment_a` / `fragment_b`.
pub id: usize,
/// File the fragment belongs to; first component of the location key.
pub file: String,
/// Start line as recorded in the benchmark data; compared verbatim with the
/// `start_line` of detected instances.
pub start_line: usize,
/// End line as recorded in the benchmark data; compared verbatim with the
/// `end_line` of detected instances.
pub end_line: usize,
/// Source text of the fragment. The CSV loader in this file leaves it empty.
pub source: String,
/// Optional functionality id; the CSV loader fills it from the fifth column
/// when that column is present and parses as an integer.
pub functionality_id: Option<usize>,
}
/// A labelled pair of fragments from the benchmark's ground truth.
#[derive(Debug, Clone)]
pub struct GroundTruthPair {
/// `CodeFragment::id` of the first fragment of the pair.
pub fragment_a: usize,
/// `CodeFragment::id` of the second fragment of the pair.
pub fragment_b: usize,
/// Clone type the pair is labelled with; used as the key of the per-type breakdown.
pub clone_type: CloneType,
/// Whether the pair is a clone. Pairs with `false` are skipped during evaluation.
pub is_clone: bool,
}
/// Overall scores produced by `CloneBenchmark::evaluate`.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
/// Ground-truth clone pairs that were found among the detected pairs.
pub true_positives: usize,
/// Detected pairs that are not counted as true positives.
pub false_positives: usize,
/// Ground-truth clone pairs that were not found among the detected pairs.
pub false_negatives: usize,
/// `tp / (tp + fp)`, or `0.0` when that denominator is zero.
pub precision: f64,
/// `tp / (tp + fn)`, or `0.0` when that denominator is zero.
pub recall: f64,
/// Harmonic mean of precision and recall, or `0.0` when both are zero.
pub f1: f64,
/// Breakdown keyed by the clone type recorded in the ground truth.
pub per_type: HashMap<CloneType, TypeResult>,
}
/// Scores for the ground-truth pairs of a single clone type.
#[derive(Debug, Clone)]
pub struct TypeResult {
/// Ground-truth clone pairs of this type that were detected.
pub true_positives: usize,
/// False positives attributed to this type. `CloneBenchmark::evaluate`
/// always stores `0` here.
pub false_positives: usize,
/// Ground-truth clone pairs of this type that were not detected.
pub false_negatives: usize,
/// `tp / (tp + fp)`, or `0.0` when that denominator is zero.
pub precision: f64,
/// `tp / (tp + fn)`, or `0.0` when that denominator is zero.
pub recall: f64,
/// Harmonic mean of precision and recall, or `0.0` when both are zero.
pub f1: f64,
}
/// A clone-detection benchmark: a corpus of fragments plus labelled pairs.
///
/// `Debug` and `Clone` are derived so this type offers the same baseline
/// traits as the other public types in this module.
#[derive(Debug, Clone)]
pub struct CloneBenchmark {
    /// Fragments that ground-truth pairs refer to by id.
    pub fragments: Vec<CodeFragment>,
    /// Labelled fragment pairs that detections are scored against.
    pub ground_truth: Vec<GroundTruthPair>,
}
impl CloneBenchmark {
pub fn new(fragments: Vec<CodeFragment>, ground_truth: Vec<GroundTruthPair>) -> Self {
Self {
fragments,
ground_truth,
}
}
/// Loads a benchmark from a directory holding `functions.csv` and `clones.csv`.
///
/// `functions.csv` supplies the fragments and `clones.csv` the ground-truth
/// pairs. A file that does not exist contributes an empty list instead of an
/// error, so an empty directory yields an empty benchmark.
///
/// # Errors
///
/// Propagates the parser's error when an existing file cannot be read.
pub fn from_bigclonebench(dir: &Path) -> Result<Self, crate::core::Error> {
    let fragments = match dir.join("functions.csv") {
        csv if csv.exists() => Self::parse_functions_csv(&csv)?,
        _ => Vec::new(),
    };

    let ground_truth = match dir.join("clones.csv") {
        csv if csv.exists() => Self::parse_clones_csv(&csv)?,
        _ => Vec::new(),
    };

    Ok(Self {
        fragments,
        ground_truth,
    })
}
/// Maps every fragment id to its `file:start_line:end_line` location key.
///
/// When several fragments share an id, the one appearing last in
/// `self.fragments` wins.
fn fragment_location_map(&self) -> HashMap<usize, String> {
    let mut locations = HashMap::new();
    for fragment in &self.fragments {
        let location = format!(
            "{}:{}:{}",
            fragment.file, fragment.start_line, fragment.end_line
        );
        locations.insert(fragment.id, location);
    }
    locations
}
/// Scores detected clone groups against the ground truth.
///
/// Every group is expanded into unordered pairs of its instances, each
/// instance identified by the key `file:start_line:end_line`. Ground-truth
/// entries are matched through the same key, looked up from their fragment
/// ids. Entries with `is_clone == false`, or whose fragment ids are not in
/// `self.fragments`, are skipped.
///
/// * true positive: a ground-truth clone pair that was detected
/// * false negative: a ground-truth clone pair that was not detected
/// * false positive: a detected pair that is not a true positive
///
/// Ground-truth entries that resolve to the same pair of locations are scored
/// once (the first entry wins, including for the per-type breakdown), so the
/// true-positive count can never exceed the number of detected pairs.
///
/// The per-type breakdown is keyed by the ground-truth clone type. Detected
/// pairs that are absent from the ground truth have no ground-truth type, so
/// per-type `false_positives` is always `0`.
///
/// Every ratio is `0.0` when its denominator is zero.
pub fn evaluate(&self, detected_groups: &[CloneGroup]) -> BenchmarkResult {
    // Orders two location keys so that (a, b) and (b, a) become the same pair.
    fn ordered_pair(a: String, b: String) -> (String, String) {
        if a < b {
            (a, b)
        } else {
            (b, a)
        }
    }

    // Precision, recall and F1 for the given counts; 0.0 on empty denominators.
    fn scores(tp: usize, fp: usize, fn_: usize) -> (f64, f64, f64) {
        let ratio = |hits: usize, total: usize| {
            if total > 0 {
                hits as f64 / total as f64
            } else {
                0.0
            }
        };
        let precision = ratio(tp, tp + fp);
        let recall = ratio(tp, tp + fn_);
        let f1 = if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        };
        (precision, recall, f1)
    }

    // Expand each detected group into its unordered instance pairs.
    let mut detected_pairs: HashSet<(String, String)> = HashSet::new();
    for group in detected_groups {
        // Format each key once rather than once per pair.
        let keys: Vec<String> = group
            .instances
            .iter()
            .map(|inst| format!("{}:{}:{}", inst.file, inst.start_line, inst.end_line))
            .collect();
        for (i, key_a) in keys.iter().enumerate() {
            for key_b in &keys[i + 1..] {
                detected_pairs.insert(ordered_pair(key_a.clone(), key_b.clone()));
            }
        }
    }

    let location_map = self.fragment_location_map();
    // Location pairs already scored; guards against duplicate ground-truth rows.
    let mut scored: HashSet<(String, String)> = HashSet::new();
    let mut tp = 0usize;
    let mut fn_ = 0usize;
    // clone type -> (true positives, false negatives)
    let mut per_type_counts: HashMap<CloneType, (usize, usize)> = HashMap::new();

    for gt in self.ground_truth.iter().filter(|gt| gt.is_clone) {
        let lookup = (
            location_map.get(&gt.fragment_a),
            location_map.get(&gt.fragment_b),
        );
        let pair = match lookup {
            (Some(a), Some(b)) => ordered_pair(a.clone(), b.clone()),
            // The pair refers to a fragment this benchmark does not know.
            _ => continue,
        };

        let was_detected = detected_pairs.contains(&pair);
        if !scored.insert(pair) {
            // Same location pair listed again: counting it twice would push
            // `tp` past the number of distinct detected pairs.
            continue;
        }

        let counts = per_type_counts.entry(gt.clone_type).or_insert((0, 0));
        if was_detected {
            tp += 1;
            counts.0 += 1;
        } else {
            fn_ += 1;
            counts.1 += 1;
        }
    }

    // Each true positive is a distinct element of `detected_pairs`, so this
    // subtraction cannot underflow.
    let fp = detected_pairs.len() - tp;
    let (precision, recall, f1) = scores(tp, fp, fn_);

    let per_type: HashMap<CloneType, TypeResult> = per_type_counts
        .into_iter()
        .map(|(clone_type, (type_tp, type_fn))| {
            // Unmatched detections carry no ground-truth type to attribute to.
            let type_fp = 0usize;
            let (p, r, f) = scores(type_tp, type_fp, type_fn);
            (
                clone_type,
                TypeResult {
                    true_positives: type_tp,
                    false_positives: type_fp,
                    false_negatives: type_fn,
                    precision: p,
                    recall: r,
                    f1: f,
                },
            )
        })
        .collect();

    BenchmarkResult {
        true_positives: tp,
        false_positives: fp,
        false_negatives: fn_,
        precision,
        recall,
        f1,
        per_type,
    }
}
/// Runs a detector at each threshold and scores every run.
///
/// `detect_fn` is called once per entry of `thresholds`, in order, and its
/// output is passed to `evaluate`. The returned vector holds one
/// `(threshold, result)` pair per input threshold, in the same order.
pub fn sweep_thresholds(
    &self,
    thresholds: &[f64],
    detect_fn: impl Fn(f64) -> Vec<CloneGroup>,
) -> Vec<(f64, BenchmarkResult)> {
    let mut results = Vec::with_capacity(thresholds.len());
    for &threshold in thresholds {
        let groups = detect_fn(threshold);
        results.push((threshold, self.evaluate(&groups)));
    }
    results
}
/// Reads the fragment list from a comma-separated file.
///
/// The first line is treated as a header and skipped. Every other non-blank
/// line is split on `,` and needs at least four columns
/// (`id, file, start_line, end_line`); shorter rows are ignored. A fifth
/// column, when present and numeric, becomes `functionality_id`.
///
/// Unparseable numbers do not reject the row: `id` falls back to the
/// zero-based line index and the line numbers fall back to `0`. `source` is
/// always left empty.
///
/// # Errors
///
/// Returns a config error when the file cannot be read.
fn parse_functions_csv(path: &Path) -> Result<Vec<CodeFragment>, crate::core::Error> {
    let content = std::fs::read_to_string(path).map_err(|e| {
        crate::core::Error::config(format!("Cannot read {}: {e}", path.display()))
    })?;

    let fragments = content
        .lines()
        .enumerate()
        // Enumerate before skipping so `idx` stays the index within the file.
        .skip(1)
        .filter_map(|(idx, raw)| {
            let row = raw.trim();
            if row.is_empty() {
                return None;
            }
            let cells: Vec<&str> = row.split(',').collect();
            match cells.as_slice() {
                [id, file, start, end, rest @ ..] => Some(CodeFragment {
                    id: id.trim().parse().unwrap_or(idx),
                    file: file.trim().to_string(),
                    start_line: start.trim().parse().unwrap_or(0),
                    end_line: end.trim().parse().unwrap_or(0),
                    source: String::new(),
                    functionality_id: rest.first().and_then(|cell| cell.trim().parse().ok()),
                }),
                // Fewer than four columns: not a fragment row.
                _ => None,
            }
        })
        .collect();

    Ok(fragments)
}
/// Reads the ground-truth pairs from a comma-separated file.
///
/// The first line is treated as a header and skipped. Every other line is
/// split on `,` and needs at least four columns
/// (`id_a, id_b, clone_type, is_clone`); blank or shorter rows are ignored.
///
/// * Rows whose fragment ids do not parse as integers are skipped. Replacing
///   them with a placeholder id would create a pair that points at an
///   unrelated fragment.
/// * `clone_type` accepts `1`/`T1`, `2`/`T2`, `3`/`T3`/`VST3`/`ST3`/`MT3`/`WT3`
///   and `4`/`T4`; any other label is treated as `Type3`.
/// * `is_clone` is true only for the exact values `true`, `1` or `T`.
///
/// # Errors
///
/// Returns a config error when the file cannot be read.
fn parse_clones_csv(path: &Path) -> Result<Vec<GroundTruthPair>, crate::core::Error> {
    let content = std::fs::read_to_string(path).map_err(|e| {
        crate::core::Error::config(format!("Cannot read {}: {e}", path.display()))
    })?;

    let mut pairs = Vec::new();
    for line in content.lines().skip(1) {
        let cells: Vec<&str> = line.trim().split(',').map(str::trim).collect();
        let (id_a, id_b, type_label, clone_flag) = match cells.as_slice() {
            [a, b, label, flag, ..] => (*a, *b, *label, *flag),
            // Blank line or too few columns.
            _ => continue,
        };

        let (fragment_a, fragment_b) = match (id_a.parse::<usize>(), id_b.parse::<usize>()) {
            (Ok(a), Ok(b)) => (a, b),
            // Malformed id: skip the row instead of inventing fragment 0.
            _ => continue,
        };

        let clone_type = match type_label {
            "1" | "T1" => CloneType::Type1,
            "2" | "T2" => CloneType::Type2,
            "4" | "T4" => CloneType::Type4,
            // Covers "3", "T3", "VST3", "ST3", "MT3", "WT3" and, as the
            // lenient default, any unrecognised label.
            _ => CloneType::Type3,
        };

        pairs.push(GroundTruthPair {
            fragment_a,
            fragment_b,
            clone_type,
            is_clone: matches!(clone_flag, "true" | "1" | "T"),
        });
    }

    Ok(pairs)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::clones::types::CloneInstance;

    /// Detected clone instance at `file:start-end` with zeroed byte offsets.
    fn make_instance(file: &str, start: usize, end: usize) -> CloneInstance {
        CloneInstance {
            file: file.to_owned(),
            start_line: start,
            end_line: end,
            start_byte: 0,
            end_byte: 0,
            function_name: None,
        }
    }

    /// Benchmark fragment covering lines 1-10 of `file`.
    fn fragment(id: usize, file: &str) -> CodeFragment {
        CodeFragment {
            id,
            file: file.to_owned(),
            start_line: 1,
            end_line: 10,
            source: String::new(),
            functionality_id: None,
        }
    }

    /// Fragments 1 (`a.py`) and 2 (`b.py`), the fixture most tests share.
    fn fragments_ab() -> Vec<CodeFragment> {
        vec![fragment(1, "a.py"), fragment(2, "b.py")]
    }

    /// Ground-truth entry for the fragment pair `(a, b)`.
    fn truth(a: usize, b: usize, clone_type: CloneType, is_clone: bool) -> GroundTruthPair {
        GroundTruthPair {
            fragment_a: a,
            fragment_b: b,
            clone_type,
            is_clone,
        }
    }

    /// A single detected group containing `a.py:1-10` and `b.py:1-10`.
    fn detected_ab(clone_type: CloneType) -> Vec<CloneGroup> {
        vec![CloneGroup::new(
            clone_type,
            vec![make_instance("a.py", 1, 10), make_instance("b.py", 1, 10)],
        )]
    }

    #[test]
    fn test_empty_benchmark_zero_metrics() {
        let bench = CloneBenchmark::new(Vec::new(), Vec::new());

        let result = bench.evaluate(&[]);

        assert_eq!(result.true_positives, 0);
        assert_eq!(result.false_positives, 0);
        assert_eq!(result.false_negatives, 0);
        assert_eq!(result.precision, 0.0);
        assert_eq!(result.recall, 0.0);
        assert_eq!(result.f1, 0.0);
    }

    #[test]
    fn test_perfect_detection() {
        let bench = CloneBenchmark::new(
            fragments_ab(),
            vec![truth(1, 2, CloneType::Type1, true)],
        );

        let result = bench.evaluate(&detected_ab(CloneType::Type1));

        assert_eq!(result.true_positives, 1);
        assert_eq!(result.false_positives, 0);
        assert_eq!(result.false_negatives, 0);
        assert!((result.precision - 1.0).abs() < f64::EPSILON);
        assert!((result.recall - 1.0).abs() < f64::EPSILON);
        assert!((result.f1 - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_no_detection_zero_recall() {
        let bench = CloneBenchmark::new(
            fragments_ab(),
            vec![truth(1, 2, CloneType::Type3, true)],
        );

        let result = bench.evaluate(&[]);

        assert_eq!(result.true_positives, 0);
        assert_eq!(result.false_negatives, 1);
        assert_eq!(result.recall, 0.0);
        assert_eq!(result.f1, 0.0);
    }

    #[test]
    fn test_false_positive_detection() {
        let bench = CloneBenchmark::new(fragments_ab(), Vec::new());

        let result = bench.evaluate(&detected_ab(CloneType::Type3));

        assert_eq!(result.true_positives, 0);
        assert_eq!(result.false_positives, 1);
        assert_eq!(result.precision, 0.0);
    }

    #[test]
    fn test_csv_parsing_handles_missing_data() {
        let dir = tempfile::tempdir().unwrap();
        let functions_csv = "id,file,start_line,end_line\n\
            1,a.py,1,10\n\
            bad_row\n\
            3,c.py,5,20,42\n";
        let clones_csv = "id1,id2,type,is_clone\n\
            1,3,T1,true\n\
            1,3,VST3,1\n\
            short\n";
        std::fs::write(dir.path().join("functions.csv"), functions_csv).unwrap();
        std::fs::write(dir.path().join("clones.csv"), clones_csv).unwrap();

        let bench = CloneBenchmark::from_bigclonebench(dir.path()).unwrap();

        // `bad_row` has too few columns and must be dropped.
        assert_eq!(bench.fragments.len(), 2);
        assert_eq!(bench.fragments[0].id, 1);
        assert_eq!(bench.fragments[1].id, 3);
        assert_eq!(bench.fragments[1].functionality_id, Some(42));

        // `short` has too few columns and must be dropped.
        assert_eq!(bench.ground_truth.len(), 2);
        assert_eq!(bench.ground_truth[0].clone_type, CloneType::Type1);
        assert!(bench.ground_truth[0].is_clone);
        assert_eq!(bench.ground_truth[1].clone_type, CloneType::Type3);
        assert!(bench.ground_truth[1].is_clone);
    }

    #[test]
    fn test_from_bigclonebench_missing_files() {
        let dir = tempfile::tempdir().unwrap();

        let bench = CloneBenchmark::from_bigclonebench(dir.path()).unwrap();

        assert!(bench.fragments.is_empty());
        assert!(bench.ground_truth.is_empty());
    }

    #[test]
    fn test_non_clone_pairs_ignored() {
        let bench = CloneBenchmark::new(
            fragments_ab(),
            vec![truth(1, 2, CloneType::Type1, false)],
        );

        let result = bench.evaluate(&[]);

        assert_eq!(result.false_negatives, 0);
    }

    #[test]
    fn test_sweep_thresholds() {
        let bench = CloneBenchmark::new(Vec::new(), Vec::new());

        let results = bench.sweep_thresholds(&[0.3, 0.5, 0.7], |_threshold| Vec::new());

        assert_eq!(results.len(), 3);
        assert!((results[0].0 - 0.3).abs() < f64::EPSILON);
        assert!((results[1].0 - 0.5).abs() < f64::EPSILON);
        assert!((results[2].0 - 0.7).abs() < f64::EPSILON);
    }

    #[test]
    fn test_per_type_breakdown() {
        let bench = CloneBenchmark::new(
            vec![fragment(1, "a.py"), fragment(2, "b.py"), fragment(3, "c.py")],
            vec![
                truth(1, 2, CloneType::Type1, true),
                truth(1, 3, CloneType::Type3, true),
            ],
        );

        // Only the Type1 pair (a.py, b.py) is detected.
        let result = bench.evaluate(&detected_ab(CloneType::Type1));

        assert_eq!(result.true_positives, 1);
        assert_eq!(result.false_negatives, 1);

        let t1 = result.per_type.get(&CloneType::Type1).unwrap();
        assert_eq!(t1.true_positives, 1);
        assert_eq!(t1.false_negatives, 0);

        let t3 = result.per_type.get(&CloneType::Type3).unwrap();
        assert_eq!(t3.true_positives, 0);
        assert_eq!(t3.false_negatives, 1);
    }
}