use std::io::Write;
use dataprof::parsers::csv::{CsvParserConfig, analyze_csv_file};
use dataprof::types::{ColumnStats, DataType, MetricConfidence, QualityDimension};
use dataprof::{EngineType, Profiler};
use tempfile::NamedTempFile;
/// Builds a temporary CSV with a header row plus 100 deterministic data rows
/// (name, age, salary, score) shared by the cross-engine comparison tests.
fn create_test_csv() -> NamedTempFile {
    let mut file = NamedTempFile::new().unwrap();
    writeln!(file, "name,age,salary,score").unwrap();
    for row in 0..100 {
        let age = 20 + row % 50;
        let salary = 30000.0 + row as f64 * 500.0;
        let score = 50.0 + (row % 50) as f64;
        writeln!(file, "Person{row},{age},{salary:.2},{score:.1}").unwrap();
    }
    file.flush().unwrap();
    file
}
#[test]
fn test_standard_vs_arrow_csv_numeric_stats() {
    // Profiles the same file with the row-oriented parser and the Arrow
    // columnar engine, then checks that column typing, counts, and numeric
    // statistics agree within small tolerances.
    let csv = create_test_csv();
    let path = csv.path();
    let std_report = analyze_csv_file(path, &CsvParserConfig::default())
        .expect("standard CSV analysis should succeed");
    let arrow_report = Profiler::new()
        .engine(EngineType::Columnar)
        .analyze_file(path)
        .expect("Arrow CSV analysis should succeed");
    assert_eq!(
        std_report.column_profiles.len(),
        arrow_report.column_profiles.len(),
        "Both engines should detect the same number of columns"
    );
    // Asserts two floats agree within `tol`, labelling any failure with the
    // column and statistic names (same message shape as the inline asserts
    // this replaces).
    let assert_close = |col: &str, stat: &str, a: f64, b: f64, tol: f64| {
        assert!((a - b).abs() < tol, "'{}' {}: {} vs {}", col, stat, a, b);
    };
    for std_col in &std_report.column_profiles {
        let arrow_col = arrow_report
            .column_profiles
            .iter()
            .find(|c| c.name == std_col.name)
            .unwrap_or_else(|| panic!("Column '{}' missing from Arrow report", std_col.name));
        assert_eq!(
            std_col.data_type, arrow_col.data_type,
            "Type mismatch for column '{}'",
            std_col.name
        );
        assert_eq!(
            std_col.total_count, arrow_col.total_count,
            "total_count mismatch for '{}'",
            std_col.name
        );
        assert_eq!(
            std_col.null_count, arrow_col.null_count,
            "null_count mismatch for '{}'",
            std_col.name
        );
        if let (ColumnStats::Numeric(n1), ColumnStats::Numeric(n2)) =
            (&std_col.stats, &arrow_col.stats)
        {
            let name = &std_col.name;
            // Tight tolerance for first-order statistics.
            let tol = 0.01;
            assert_close(name, "min", n1.min, n2.min, tol);
            assert_close(name, "max", n1.max, n2.max, tol);
            assert_close(name, "mean", n1.mean, n2.mean, tol);
            assert_close(name, "std_dev", n1.std_dev, n2.std_dev, tol);
            // Variance, order statistics, and higher moments get a looser
            // bound since per-engine rounding compounds there.
            assert_close(name, "variance", n1.variance, n2.variance, 0.1);
            // Median/skewness/kurtosis are optional; compare only when both
            // engines produced them.
            if let (Some(m1), Some(m2)) = (n1.median, n2.median) {
                assert_close(name, "median", m1, m2, 0.1);
            }
            if let (Some(s1), Some(s2)) = (n1.skewness, n2.skewness) {
                assert_close(name, "skewness", s1, s2, 0.1);
            }
            if let (Some(k1), Some(k2)) = (n1.kurtosis, n2.kurtosis) {
                assert_close(name, "kurtosis", k1, k2, 0.1);
            }
        } else if matches!(std_col.data_type, DataType::Integer | DataType::Float) {
            // A numeric column must yield Numeric stats from both engines.
            panic!(
                "Column '{}' is {:?} but one engine produced non-Numeric stats: std={:?}, arrow={:?}",
                std_col.name, std_col.data_type, std_col.stats, arrow_col.stats
            );
        }
    }
}
#[test]
fn test_mixed_data_column_type_consistency() {
    // A CSV whose `value` column mixes integers with an "N/A" sentinel,
    // alongside an ISO-date column: both engines must infer the same type
    // for every column.
    let mut file = NamedTempFile::new().unwrap();
    writeln!(file, "id,value,date_col").unwrap();
    let rows = [
        "1,100,2024-01-01",
        "2,200,2024-01-02",
        "3,N/A,2024-01-03",
        "4,400,2024-01-04",
        "5,500,2024-01-05",
    ];
    for row in rows {
        writeln!(file, "{row}").unwrap();
    }
    file.flush().unwrap();
    let std_report = analyze_csv_file(file.path(), &CsvParserConfig::default())
        .expect("standard CSV should succeed");
    let arrow_report = Profiler::new()
        .engine(EngineType::Columnar)
        .analyze_file(file.path())
        .expect("Arrow CSV should succeed");
    for std_col in &std_report.column_profiles {
        let arrow_col = arrow_report
            .column_profiles
            .iter()
            .find(|c| c.name == std_col.name)
            .unwrap_or_else(|| panic!("Column '{}' missing from Arrow report", std_col.name));
        assert_eq!(
            std_col.data_type, arrow_col.data_type,
            "Type mismatch for column '{}': std={:?}, arrow={:?}",
            std_col.name, std_col.data_type, arrow_col.data_type
        );
    }
}
#[test]
fn test_streaming_vs_batch_quality_confidence() {
    // 200 rows where every 10th name and every 20th value are empty, giving
    // both engines real missing data to detect. Compares quality confidence
    // and key-uniqueness between the batch (columnar) and streaming
    // (incremental) engines.
    let mut f = NamedTempFile::new().unwrap();
    writeln!(f, "id,name,value").unwrap();
    for i in 0..200 {
        let name = if i % 10 == 0 { "" } else { "Alice" };
        // Build an owned String rather than borrowing a `format!` temporary
        // out of an if/else arm (one statement per line, too).
        let value = if i % 20 == 0 {
            String::new()
        } else {
            (i * 100).to_string()
        };
        writeln!(f, "{},{},{}", i, name, value).unwrap();
    }
    f.flush().unwrap();
    let batch_report = Profiler::new()
        .engine(EngineType::Columnar)
        .analyze_file(f.path())
        .expect("Batch analysis should succeed");
    let streaming_report = Profiler::new()
        .engine(EngineType::Incremental)
        .analyze_file(f.path())
        .expect("Streaming analysis should succeed");
    let batch_quality = batch_report
        .quality
        .as_ref()
        .expect("Batch report should have quality");
    let streaming_quality = streaming_report
        .quality
        .as_ref()
        .expect("Streaming report should have quality");
    // The batch engine sees the full dataset, so its metrics are exact.
    assert!(
        matches!(batch_quality.confidence, MetricConfidence::Exact),
        "Batch engine should produce Exact confidence, got {:?}",
        batch_quality.confidence
    );
    let streaming_m = &streaming_quality.metrics;
    assert!(
        streaming_m.missing_values_ratio() > 0.0,
        "Streaming engine should detect missing values from empty CSV fields"
    );
    // Key uniqueness should agree across engines within a few percent.
    let batch_m = &batch_quality.metrics;
    assert!(
        (batch_m.key_uniqueness() - streaming_m.key_uniqueness()).abs() < 5.0,
        "key_uniqueness: batch={:.2} vs streaming={:.2}",
        batch_m.key_uniqueness(),
        streaming_m.key_uniqueness()
    );
}
#[test]
fn test_streaming_bifurcation_with_large_dataset() {
    // 15k rows push the incremental engine into Mixed confidence: some
    // dimensions computed exactly, others from a sample smaller than the
    // row count.
    let mut f = NamedTempFile::new().unwrap();
    {
        // Buffer the 15k writeln! calls so row generation isn't one syscall
        // per line; the scope drops the writer before the file is read back.
        let mut w = std::io::BufWriter::new(f.as_file_mut());
        writeln!(w, "id,category,amount").unwrap();
        for i in 0..15_000 {
            let cat = match i % 3 {
                0 => "A",
                1 => "B",
                _ => "C",
            };
            // Every 100th amount is empty; build an owned String rather than
            // borrowing a `format!` temporary out of an if/else arm.
            let amount = if i % 100 == 0 {
                String::new()
            } else {
                format!("{:.2}", i as f64 * 1.5)
            };
            writeln!(w, "{},{},{}", i, cat, amount).unwrap();
        }
        w.flush().unwrap();
    }
    let report = Profiler::new()
        .engine(EngineType::Incremental)
        .analyze_file(f.path())
        .expect("Large streaming analysis should succeed");
    let quality = report
        .quality
        .as_ref()
        .expect("Should have quality assessment");
    match &quality.confidence {
        MetricConfidence::Mixed {
            exact_dimensions,
            sampled_dimensions,
            sample_size,
        } => {
            // Count-based dimensions stay exact even when streaming.
            assert!(
                exact_dimensions.contains(&"completeness".to_string()),
                "completeness should be exact"
            );
            assert!(
                exact_dimensions.contains(&"key_uniqueness".to_string()),
                "key_uniqueness should be exact"
            );
            // Content-inspection dimensions fall back to sampling.
            assert!(
                sampled_dimensions.contains(&"consistency".to_string()),
                "consistency should be sampled"
            );
            assert!(
                sampled_dimensions.contains(&"accuracy".to_string()),
                "accuracy should be sampled"
            );
            assert!(
                *sample_size < 15_000,
                "sample_size ({}) should be less than total rows (15000)",
                sample_size
            );
        }
        other => panic!(
            "Expected Mixed confidence for large streaming dataset, got {:?}",
            other
        ),
    }
    // 1-in-100 empty amounts => a small but nonzero missing ratio.
    let m = &quality.metrics;
    assert!(
        m.missing_values_ratio() > 0.0,
        "Should detect some missing values"
    );
    assert!(
        m.missing_values_ratio() < 2.0,
        "Missing ratio should be small (~0.33%)"
    );
}
#[test]
fn test_profiler_selective_dimensions_only_completeness() {
    // When only Completeness is requested, every other dimension must be
    // skipped and the overall score collapses to the completeness ratio.
    let csv = create_test_csv();
    let report = Profiler::new()
        .quality_dimensions(vec![QualityDimension::Completeness])
        .analyze_file(csv.path())
        .unwrap();
    let quality = report.quality.expect("quality should be present");
    let metrics = &quality.metrics;
    assert!(
        metrics.completeness.is_some(),
        "completeness should be computed"
    );
    // Table-driven check that the unselected dimensions were all skipped.
    for (skipped, label) in [
        (metrics.consistency.is_none(), "consistency"),
        (metrics.uniqueness.is_none(), "uniqueness"),
        (metrics.accuracy.is_none(), "accuracy"),
        (metrics.timeliness.is_none(), "timeliness"),
    ] {
        assert!(skipped, "{label} should be skipped");
    }
    let score = metrics.overall_score();
    let completeness_score = metrics.completeness.as_ref().unwrap().complete_records_ratio;
    assert!(
        (score - completeness_score).abs() < 0.01,
        "score {score} should equal completeness {completeness_score}"
    );
}
#[test]
fn test_profiler_selective_dimensions_subset() {
    // Requesting two dimensions computes exactly those two and no others.
    let csv = create_test_csv();
    let selected = vec![
        QualityDimension::Completeness,
        QualityDimension::Uniqueness,
    ];
    let report = Profiler::new()
        .quality_dimensions(selected)
        .analyze_file(csv.path())
        .unwrap();
    let metrics = report.quality.expect("quality should be present").metrics;
    assert!(metrics.completeness.is_some());
    assert!(metrics.consistency.is_none());
    assert!(metrics.uniqueness.is_some());
    assert!(metrics.accuracy.is_none());
    assert!(metrics.timeliness.is_none());
}
#[test]
fn test_profiler_all_dimensions_default() {
    // With no explicit selection, the profiler computes every quality
    // dimension by default.
    let csv = create_test_csv();
    let quality = Profiler::new()
        .analyze_file(csv.path())
        .unwrap()
        .quality
        .expect("quality should be present");
    let m = &quality.metrics;
    for computed in [
        m.completeness.is_some(),
        m.consistency.is_some(),
        m.uniqueness.is_some(),
        m.accuracy.is_some(),
        m.timeliness.is_some(),
    ] {
        assert!(computed);
    }
}