use std::io::Write;
use crate::near_dup::{NearDupLimits, build_near_dup_report};
use proptest::prelude::*;
use tempfile::TempDir;
use tokmd_analysis_types::NearDupScope;
use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};
/// Builds a `FileRow` fixture for `path` with `code` code lines and a size of
/// `bytes`.
///
/// The row is always a `FileKind::Parent` entry with language `"Rust"` in
/// module `"root"`. Comments and blanks are zero, `lines` equals `code`, and
/// `tokens` is `code * 5`.
fn make_file_row(path: &str, code: usize, bytes: usize) -> FileRow {
    // With no comments or blank lines the total line count is just `code`.
    let lines = code;
    let tokens = code * 5;
    FileRow {
        path: String::from(path),
        module: String::from("root"),
        lang: String::from("Rust"),
        kind: FileKind::Parent,
        code,
        comments: 0,
        blanks: 0,
        lines,
        bytes,
        tokens,
    }
}
/// Wraps `rows` in an `ExportData` fixture.
///
/// The export has no module roots, a module depth of 1, and
/// `ChildIncludeMode::ParentsOnly`.
fn make_export(rows: Vec<FileRow>) -> ExportData {
    let module_roots = Vec::new();
    let children = ChildIncludeMode::ParentsOnly;
    ExportData {
        rows,
        module_roots,
        module_depth: 1,
        children,
    }
}
/// Writes `content` to `path`, resolved relative to the temporary directory
/// `dir`, creating any missing parent directories first.
///
/// Panics if a directory or the file cannot be created, or if the write fails.
fn write_file(dir: &TempDir, path: &str, content: &str) {
    let target = dir.path().join(path);
    if let Some(parent_dir) = target.parent() {
        std::fs::create_dir_all(parent_dir).unwrap();
    }
    // Create (or truncate) the file and write the whole payload in one chain.
    std::fs::File::create(&target)
        .and_then(|mut file| file.write_all(content.as_bytes()))
        .unwrap();
}
/// Produces `n` tokens of the form `tok_<seed>_<index>` joined by `" + "`.
///
/// Returns an empty string when `n` is zero.
fn source_text(n: usize, seed: usize) -> String {
    let mut text = String::new();
    for index in 0..n {
        // The separator goes between tokens only, never before the first one.
        if index != 0 {
            text.push_str(" + ");
        }
        text.push_str(&format!("tok_{}_{}", seed, index));
    }
    text
}
/// Strategy yielding thresholds in `[0.0, 1.0]` in steps of one hundredth.
///
/// An integer percentage in `0..=100` is generated and divided by 100.
fn arb_threshold() -> impl Strategy<Value = f64> {
    let percent = 0u32..=100;
    percent.prop_map(|pct| f64::from(pct) / 100.0)
}
/// Strategy yielding token counts in the half-open range `30..200`.
fn arb_token_count() -> impl Strategy<Value = usize> {
    let min_tokens: usize = 30;
    let max_tokens_exclusive: usize = 200;
    min_tokens..max_tokens_exclusive
}
fn arb_scope() -> impl Strategy<Value = NearDupScope> {
prop_oneof![
Just(NearDupScope::Global),
Just(NearDupScope::Module),
Just(NearDupScope::Lang),
]
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(50))]
// Property: for any threshold in [0, 1], any generated scope and any token
// count, building a report over two files with identical content returns `Ok`.
#[test]
fn report_never_errors(
    threshold in arb_threshold(),
    scope in arb_scope(),
    n_tokens in arb_token_count(),
) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    // Write each file and record its matching export row.
    let mut rows = Vec::with_capacity(2);
    for name in &["a.rs", "b.rs"] {
        write_file(&workspace, name, &body);
        rows.push(make_file_row(name, n_tokens, body.len()));
    }
    let export = make_export(rows);
    let limits = NearDupLimits::default();
    let outcome = build_near_dup_report(
        workspace.path(),
        &export,
        scope,
        threshold,
        1000,
        None,
        &limits,
        &[],
    );
    prop_assert!(outcome.is_ok(), "report should never fail: {:?}", outcome.err());
}
// Property: every pair reported for two identical files (threshold 0.0,
// global scope) has a similarity inside the closed interval [0.0, 1.0].
//
// NOTE: if the report contains no pairs the loop body never runs and the
// property passes vacuously.
#[test]
fn similarity_in_unit_range(n_tokens in arb_token_count()) {
    let dir = TempDir::new().unwrap();
    let content = source_text(n_tokens, 0);
    write_file(&dir, "a.rs", &content);
    write_file(&dir, "b.rs", &content);
    let rows = vec![
        make_file_row("a.rs", n_tokens, content.len()),
        make_file_row("b.rs", n_tokens, content.len()),
    ];
    let report = build_near_dup_report(
        dir.path(),
        &make_export(rows),
        NearDupScope::Global,
        0.0,
        1000,
        None,
        &NearDupLimits::default(),
        &[],
    ).unwrap();
    for pair in &report.pairs {
        // `RangeInclusive::contains` is the idiomatic bounds check; a NaN
        // similarity is outside the range and fails the assertion, exactly as
        // the explicit `>=` / `<=` comparison did.
        prop_assert!(
            (0.0..=1.0).contains(&pair.similarity),
            "similarity out of range: {}",
            pair.similarity
        );
    }
}
// Property: the pairs reported for three identical files (threshold 0.0,
// global scope) are ordered by non-increasing similarity.
#[test]
fn pairs_sorted_descending(n_tokens in arb_token_count()) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    let names = ["a.rs", "b.rs", "c.rs"];
    let mut rows: Vec<FileRow> = Vec::with_capacity(names.len());
    for name in names.iter() {
        write_file(&workspace, name, &body);
        rows.push(make_file_row(name, n_tokens, body.len()));
    }
    let export = make_export(rows);
    let limits = NearDupLimits::default();
    let report = build_near_dup_report(
        workspace.path(),
        &export,
        NearDupScope::Global,
        0.0,
        1000,
        None,
        &limits,
        &[],
    )
    .unwrap();
    // Compare each pair with its immediate successor.
    let successors = report.pairs.iter().skip(1);
    for (earlier, later) in report.pairs.iter().zip(successors) {
        prop_assert!(
            earlier.similarity >= later.similarity,
            "pairs not sorted: {} vs {}",
            earlier.similarity,
            later.similarity
        );
    }
}
// Property: whenever the report exposes `eligible_files`, it equals
// `files_analyzed + files_skipped`. `max_files` is passed as the fifth
// argument of `build_near_dup_report`; `file_count` identical files are
// written and exported.
#[test]
fn analyzed_plus_skipped_equals_eligible(
    max_files in 1usize..10,
    file_count in 1usize..15,
    n_tokens in arb_token_count(),
) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    // Each row is produced right after its backing file is written.
    let rows: Vec<FileRow> = (0..file_count)
        .map(|index| {
            let name = format!("f{}.rs", index);
            write_file(&workspace, &name, &body);
            make_file_row(&name, n_tokens, body.len())
        })
        .collect();
    let export = make_export(rows);
    let limits = NearDupLimits::default();
    let report = build_near_dup_report(
        workspace.path(),
        &export,
        NearDupScope::Global,
        0.5,
        max_files,
        None,
        &limits,
        &[],
    )
    .unwrap();
    if let Some(eligible) = report.eligible_files {
        let accounted = report.files_analyzed + report.files_skipped;
        prop_assert_eq!(
            accounted,
            eligible,
            "analyzed({}) + skipped({}) != eligible({})",
            report.files_analyzed,
            report.files_skipped,
            eligible
        );
    }
}
// Property: raising the similarity threshold never increases the number of
// reported pairs. The two files share `n_tokens` tokens and each carries 20
// tokens of its own.
#[test]
fn higher_threshold_fewer_or_equal_pairs(
    n_tokens in arb_token_count(),
    t_low in 0.0f64..0.5,
    t_delta in 0.0f64..0.5,
) {
    let t_high = f64::min(t_low + t_delta, 1.0);
    let workspace = TempDir::new().unwrap();
    let shared: Vec<String> = (0..n_tokens).map(|i| format!("shared_{}", i)).collect();
    // Appends a file-specific suffix to the shared tokens and joins them all.
    let with_suffix = |suffix: Vec<String>| -> String {
        shared
            .iter()
            .cloned()
            .chain(suffix)
            .collect::<Vec<String>>()
            .join(" + ")
    };
    let content_a = with_suffix((0..20).map(|i| format!("ua_{}", i)).collect());
    let content_b = with_suffix((0..20).map(|i| format!("ub_{}", i)).collect());
    write_file(&workspace, "a.rs", &content_a);
    write_file(&workspace, "b.rs", &content_b);
    let rows = vec![
        make_file_row("a.rs", n_tokens + 20, content_a.len()),
        make_file_row("b.rs", n_tokens + 20, content_b.len()),
    ];
    // Runs the same analysis, varying only the threshold.
    let run_with = |threshold: f64| {
        build_near_dup_report(
            workspace.path(),
            &make_export(rows.clone()),
            NearDupScope::Global,
            threshold,
            1000,
            None,
            &NearDupLimits::default(),
            &[],
        )
        .unwrap()
    };
    let report_low = run_with(t_low);
    let report_high = run_with(t_high);
    let low_count = report_low.pairs.len();
    let high_count = report_high.pairs.len();
    prop_assert!(
        high_count <= low_count,
        "higher threshold {} yielded {} pairs > lower threshold {} with {} pairs",
        t_high,
        high_count,
        t_low,
        low_count
    );
}
// Property: when the report carries clusters, each cluster lists at least two
// files. Nothing is checked when `clusters` is `None`.
#[test]
fn clusters_have_at_least_two_files(n_tokens in arb_token_count()) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    let names = ["a.rs", "b.rs", "c.rs"];
    let mut rows: Vec<FileRow> = Vec::with_capacity(names.len());
    for name in names.iter() {
        write_file(&workspace, name, &body);
        rows.push(make_file_row(name, n_tokens, body.len()));
    }
    let export = make_export(rows);
    let limits = NearDupLimits::default();
    let report = build_near_dup_report(
        workspace.path(),
        &export,
        NearDupScope::Global,
        0.0,
        1000,
        None,
        &limits,
        &[],
    )
    .unwrap();
    // `iter().flatten()` visits every cluster when present, and none otherwise.
    for cluster in report.clusters.iter().flatten() {
        let member_count = cluster.files.len();
        prop_assert!(
            member_count >= 2,
            "cluster has {} files, expected >= 2",
            member_count
        );
    }
}
// Property: when the report carries clusters, each cluster's `files` list is
// already in sorted order. Files are created in the unsorted order
// z, a, m. Nothing is checked when `clusters` is `None`.
#[test]
fn cluster_files_sorted(n_tokens in arb_token_count()) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    let names = ["z.rs", "a.rs", "m.rs"];
    let mut rows: Vec<FileRow> = Vec::with_capacity(names.len());
    for name in names.iter() {
        write_file(&workspace, name, &body);
        rows.push(make_file_row(name, n_tokens, body.len()));
    }
    let export = make_export(rows);
    let limits = NearDupLimits::default();
    let report = build_near_dup_report(
        workspace.path(),
        &export,
        NearDupScope::Global,
        0.0,
        1000,
        None,
        &limits,
        &[],
    )
    .unwrap();
    for cluster in report.clusters.iter().flatten() {
        // A sorted copy is the reference the reported order must match.
        let expected = {
            let mut copy = cluster.files.clone();
            copy.sort();
            copy
        };
        prop_assert_eq!(
            &cluster.files,
            &expected,
            "cluster files not sorted"
        );
    }
}
// Property: for every reported pair, `shared_fingerprints` does not exceed the
// smaller of the pair's `left_fingerprints` and `right_fingerprints`.
#[test]
fn shared_fingerprints_bounded(n_tokens in arb_token_count()) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    let mut rows = Vec::with_capacity(2);
    for name in &["a.rs", "b.rs"] {
        write_file(&workspace, name, &body);
        rows.push(make_file_row(name, n_tokens, body.len()));
    }
    let export = make_export(rows);
    let limits = NearDupLimits::default();
    let report = build_near_dup_report(
        workspace.path(),
        &export,
        NearDupScope::Global,
        0.0,
        1000,
        None,
        &limits,
        &[],
    )
    .unwrap();
    for pair in report.pairs.iter() {
        let left = pair.left_fingerprints;
        let right = pair.right_fingerprints;
        let ceiling = left.min(right);
        prop_assert!(
            pair.shared_fingerprints <= ceiling,
            "shared {} > min(left={}, right={})",
            pair.shared_fingerprints,
            left,
            right
        );
    }
}
// Property: running the analysis twice over the same directory and rows gives
// the same number of pairs, with matching `left`/`right` entries position by
// position and similarities that agree to within 1e-10.
#[test]
fn idempotent_report(n_tokens in arb_token_count()) {
    let workspace = TempDir::new().unwrap();
    let body = source_text(n_tokens, 0);
    let mut rows = Vec::with_capacity(2);
    for name in &["a.rs", "b.rs"] {
        write_file(&workspace, name, &body);
        rows.push(make_file_row(name, n_tokens, body.len()));
    }
    // One closure so that both runs use exactly the same arguments.
    let run = || {
        build_near_dup_report(
            workspace.path(),
            &make_export(rows.clone()),
            NearDupScope::Global,
            0.5,
            1000,
            None,
            &NearDupLimits::default(),
            &[],
        )
        .unwrap()
    };
    let report1 = run();
    let report2 = run();
    prop_assert_eq!(report1.pairs.len(), report2.pairs.len());
    let paired = report1.pairs.iter().zip(report2.pairs.iter());
    for (p1, p2) in paired {
        prop_assert_eq!(&p1.left, &p2.left);
        prop_assert_eq!(&p1.right, &p2.right);
        prop_assert!((p1.similarity - p2.similarity).abs() < 1e-10);
    }
}
}