use std::path::PathBuf;
use std::process::Command;
fn get_test_data_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/data")
}
fn get_binary_path() -> PathBuf {
let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
path.push("target");
path.push(if cfg!(debug_assertions) {
"debug"
} else {
"release"
});
path.push("csv-nose");
path
}
fn parse_accuracy_from_output(output: &str) -> Option<f64> {
for line in output.lines() {
if line.contains("Passed:") && line.contains('%') {
if let Some(start) = line.find('(')
&& let Some(end) = line.find('%')
&& start < end
{
let pct_str = &line[start + 1..end];
return pct_str.trim().parse().ok();
}
}
}
None
}
fn parse_total_from_output(output: &str) -> Option<usize> {
for line in output.lines() {
if line.contains("Total files:") {
let parts: Vec<&str> = line.split_whitespace().collect();
if let Some(last) = parts.last() {
return last.parse().ok();
}
}
}
None
}
fn run_dataset_benchmark(dataset: &str) -> (usize, f64, String) {
run_benchmark_with_annotations(dataset, dataset)
}
fn run_benchmark_with_annotations(
data_dir_name: &str,
annotations_name: &str,
) -> (usize, f64, String) {
let data_dir = get_test_data_dir();
let csv_dir = data_dir.join(data_dir_name);
let annotations_path = data_dir
.join("annotations")
.join(format!("{}.txt", annotations_name));
if !csv_dir.exists() {
panic!(
"Test data directory not found: {}. \
Please copy CSVsniffer test files to tests/data/{}",
csv_dir.display(),
data_dir_name
);
}
if !annotations_path.exists() {
panic!(
"Annotations file not found: {}. \
Please copy annotations to tests/data/annotations/{}.txt",
annotations_path.display(),
annotations_name
);
}
let binary = get_binary_path();
if !binary.exists() {
panic!(
"Binary not found at {}. Run `cargo build` first.",
binary.display()
);
}
let output = Command::new(&binary)
.arg("--benchmark")
.arg(&csv_dir)
.arg("--annotations")
.arg(&annotations_path)
.output()
.expect("Failed to execute csv-nose binary");
let stdout = String::from_utf8_lossy(&output.stdout).to_string();
let stderr = String::from_utf8_lossy(&output.stderr).to_string();
if !output.status.success() {
panic!(
"Benchmark command failed with status {:?}\nstdout: {}\nstderr: {}",
output.status, stdout, stderr
);
}
let total = parse_total_from_output(&stdout).unwrap_or(0);
let accuracy = parse_accuracy_from_output(&stdout).unwrap_or(0.0);
(total, accuracy, stdout)
}
#[test]
fn test_pollock_accuracy() {
let (total, accuracy, stdout) = run_dataset_benchmark("pollock");
println!("\n========== POLLOCK Dataset ==========");
println!("{}", stdout);
assert!(total > 0, "Should have test files");
println!("\nPOLLOCK Accuracy: {:.1}%", accuracy);
}
#[test]
fn test_w3c_csvw_accuracy() {
let (total, accuracy, stdout) = run_dataset_benchmark("w3c-csvw");
println!("\n========== W3C-CSVW Dataset ==========");
println!("{}", stdout);
assert!(total > 0, "Should have test files");
println!("\nW3C-CSVW Accuracy: {:.1}%", accuracy);
}
#[test]
fn test_csv_wrangling_accuracy() {
let (total, accuracy, stdout) = run_dataset_benchmark("csv-wrangling");
println!("\n========== CSV Wrangling Dataset ==========");
println!("{}", stdout);
assert!(total > 0, "Should have test files");
println!("\nCSV Wrangling Accuracy: {:.1}%", accuracy);
}
#[test]
fn test_csv_wrangling_codec_accuracy() {
let (total, accuracy, stdout) =
run_benchmark_with_annotations("csv-wrangling", "csv-wrangling-codec");
println!("\n========== CSV Wrangling filtered CODEC ==========");
println!("{}", stdout);
assert!(total > 0, "Should have test files");
println!("\nCSV Wrangling CODEC Accuracy: {:.1}%", accuracy);
}
#[test]
fn test_csv_wrangling_messy_accuracy() {
let (total, accuracy, stdout) =
run_benchmark_with_annotations("csv-wrangling", "csv-wrangling-messy");
println!("\n========== CSV Wrangling MESSY ==========");
println!("{}", stdout);
assert!(total > 0, "Should have test files");
println!("\nCSV Wrangling MESSY Accuracy: {:.1}%", accuracy);
}
#[test]
fn test_combined_accuracy_report() {
let (pollock_total, pollock_accuracy, _) = run_dataset_benchmark("pollock");
let (w3c_total, w3c_accuracy, _) = run_dataset_benchmark("w3c-csvw");
let (wrangling_total, wrangling_accuracy, _) = run_dataset_benchmark("csv-wrangling");
let (codec_total, codec_accuracy, _) =
run_benchmark_with_annotations("csv-wrangling", "csv-wrangling-codec");
let (messy_total, messy_accuracy, _) =
run_benchmark_with_annotations("csv-wrangling", "csv-wrangling-messy");
println!("\n========================================");
println!(" COMBINED ACCURACY REPORT ");
println!("========================================\n");
println!("Dataset | Total | Accuracy");
println!("-------------------|-------|----------");
println!(
"POLLOCK | {:>5} | {:>7.1}%",
pollock_total, pollock_accuracy
);
println!(
"W3C-CSVW | {:>5} | {:>7.1}%",
w3c_total, w3c_accuracy
);
println!(
"CSV Wrangling | {:>5} | {:>7.1}%",
wrangling_total, wrangling_accuracy
);
println!(
"CSV Wrangling CODEC| {:>5} | {:>7.1}%",
codec_total, codec_accuracy
);
println!(
"CSV Wrangling MESSY| {:>5} | {:>7.1}%",
messy_total, messy_accuracy
);
let total_files = pollock_total + w3c_total + wrangling_total;
let combined_accuracy = if total_files > 0 {
(pollock_accuracy * pollock_total as f64
+ w3c_accuracy * w3c_total as f64
+ wrangling_accuracy * wrangling_total as f64)
/ total_files as f64
} else {
0.0
};
println!("-------------------|-------|----------");
println!(
"COMBINED | {:>5} | {:>7.1}%",
total_files, combined_accuracy
);
}