use std::collections::HashMap;
use crate::analysis::MetricsCalculator;
use crate::analysis::metrics::BifurcatedResult;
use crate::types::{
ColumnProfile, DataSource, ExecutionMetadata, MetricConfidence, ProfileReport,
QualityAssessment, QualityDimension,
};
/// Consuming builder that gathers the inputs of a [`ProfileReport`] and
/// produces the report in `build`.
///
/// `source` and `execution` are required up front in `new`; every other field
/// starts empty/disabled and is filled in through the chaining setters.
pub struct ReportAssembler {
/// Data source description, handed to `ProfileReport::new` unchanged.
source: DataSource,
/// Execution metadata; its `sampling_applied` and `rows_processed` fields
/// also decide which quality computation path is taken.
execution: ExecutionMetadata,
/// Column profiles included in the report and passed to the metrics calculator.
columns: Vec<ColumnProfile>,
/// Values per string key used for quality metrics; `None` means no quality
/// assessment is produced.
// NOTE(review): keys are presumably column names — confirm against callers.
quality_data: Option<HashMap<String, Vec<String>>>,
/// Explicit confidence override; when `None` a default is derived from the
/// computation path that was used.
confidence: Option<MetricConfidence>,
/// When `true`, no quality assessment is computed even if quality data is present.
skip_quality: bool,
/// Optional dimension list forwarded to the metrics calculator as a slice.
requested_dimensions: Option<Vec<QualityDimension>>,
}
impl ReportAssembler {
    /// Creates an assembler for the given source and execution metadata.
    ///
    /// The assembler starts with no column profiles, no quality data, no
    /// confidence override, no requested dimensions, and quality computation
    /// enabled.
    #[must_use]
    pub fn new(source: DataSource, execution: ExecutionMetadata) -> Self {
        Self {
            source,
            execution,
            columns: Vec::new(),
            quality_data: None,
            confidence: None,
            skip_quality: false,
            requested_dimensions: None,
        }
    }

    /// Replaces the column profiles that go into the report and into the
    /// quality calculation.
    #[must_use]
    pub fn columns(mut self, columns: Vec<ColumnProfile>) -> Self {
        self.columns = columns;
        self
    }

    /// Supplies the values used to compute quality metrics.
    ///
    /// Without this call, `build` produces a report with no quality
    /// assessment.
    #[must_use]
    pub fn with_quality_data(mut self, data: HashMap<String, Vec<String>>) -> Self {
        self.quality_data = Some(data);
        self
    }

    /// Overrides the confidence attached to the quality assessment.
    ///
    /// When set, this value is used as-is instead of the default chosen by the
    /// computation path (`Exact` for the uniform path, `Mixed` for the
    /// bifurcated path).
    #[must_use]
    pub fn with_confidence(mut self, confidence: MetricConfidence) -> Self {
        self.confidence = Some(confidence);
        self
    }

    /// Disables quality computation.
    ///
    /// This takes precedence over `with_quality_data`: the report's quality is
    /// `None` even when quality data was supplied.
    #[must_use]
    pub fn skip_quality(mut self) -> Self {
        self.skip_quality = true;
        self
    }

    /// Sets the quality dimensions forwarded to the metrics calculator.
    #[must_use]
    pub fn with_requested_dimensions(mut self, dims: Vec<QualityDimension>) -> Self {
        self.requested_dimensions = Some(dims);
        self
    }

    /// Consumes the assembler and produces the final [`ProfileReport`].
    ///
    /// Quality is computed only when it has not been skipped and quality data
    /// was supplied. A failed metrics calculation is logged as a warning and
    /// yields a report without a quality assessment rather than an error.
    #[must_use = "dropping the report discards the assembled result"]
    pub fn build(self) -> ProfileReport {
        let quality = if self.skip_quality {
            None
        } else {
            self.quality_data
                .as_ref()
                .and_then(|data| self.compute_quality(data))
        };
        ProfileReport::new(self.source, self.columns, self.execution, quality)
    }

    /// Chooses between the bifurcated and uniform quality computations.
    ///
    /// The sample size is the length of the longest value list in `data`
    /// (0 when `data` is empty).
    fn compute_quality(&self, data: &HashMap<String, Vec<String>>) -> Option<QualityAssessment> {
        let sample_size = data.values().map(Vec::len).max().unwrap_or(0);
        if self.is_streaming_context(sample_size) {
            self.compute_bifurcated_quality(data)
        } else {
            self.compute_uniform_quality(data)
        }
    }

    /// Returns `true` when the quality data should be treated as a sample.
    ///
    /// That is the case when the execution metadata reports that sampling was
    /// applied, or when a non-empty sample holds fewer rows than were
    /// processed overall.
    fn is_streaming_context(&self, sample_size: usize) -> bool {
        self.execution.sampling_applied
            || (sample_size > 0 && sample_size < self.execution.rows_processed)
    }

    /// Computes quality through `MetricsCalculator::calculate_bifurcated_metrics`.
    ///
    /// On success the assessment carries the explicit confidence override if
    /// one was set, otherwise a `MetricConfidence::Mixed` built from the
    /// calculator's result. On failure a warning is logged and `None` is
    /// returned.
    fn compute_bifurcated_quality(
        &self,
        data: &HashMap<String, Vec<String>>,
    ) -> Option<QualityAssessment> {
        let calculator = MetricsCalculator::new();
        match calculator.calculate_bifurcated_metrics(
            data,
            &self.columns,
            self.requested_dimensions.as_deref(),
        ) {
            Ok(result) => {
                // Derive the confidence before `result.metrics` is moved out below.
                let confidence = self
                    .confidence
                    .clone()
                    .unwrap_or_else(|| self.mixed_confidence(&result));
                Some(QualityAssessment {
                    metrics: result.metrics,
                    confidence,
                })
            }
            Err(e) => {
                log::warn!("Bifurcated quality metrics calculation failed: {e}");
                None
            }
        }
    }

    /// Computes quality through `MetricsCalculator::calculate_comprehensive_metrics`.
    ///
    /// On success the assessment carries the explicit confidence override if
    /// one was set, otherwise `MetricConfidence::Exact`. On failure a warning
    /// is logged and `None` is returned.
    fn compute_uniform_quality(
        &self,
        data: &HashMap<String, Vec<String>>,
    ) -> Option<QualityAssessment> {
        let calculator = MetricsCalculator::new();
        match calculator.calculate_comprehensive_metrics(
            data,
            &self.columns,
            self.requested_dimensions.as_deref(),
        ) {
            Ok(metrics) => {
                let confidence = self.confidence.clone().unwrap_or(MetricConfidence::Exact);
                Some(QualityAssessment {
                    metrics,
                    confidence,
                })
            }
            Err(e) => {
                log::warn!("Quality metrics calculation failed: {e}");
                None
            }
        }
    }

    /// Builds a `MetricConfidence::Mixed` by copying the exact/sampled
    /// dimension lists and the sample size out of a bifurcated result.
    fn mixed_confidence(&self, result: &BifurcatedResult) -> MetricConfidence {
        MetricConfidence::Mixed {
            exact_dimensions: result.exact_dimensions.clone(),
            sampled_dimensions: result.sampled_dimensions.clone(),
            sample_size: result.sample_size,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::FileFormat;

    /// CSV file source shared by every test in this module.
    fn test_source() -> DataSource {
        DataSource::File {
            path: String::from("test.csv"),
            format: FileFormat::Csv,
            size_bytes: 1024,
            modified_at: None,
            parquet_metadata: None,
        }
    }

    /// Quality input with a single key, `col`, holding the values `a` and `b`.
    fn two_value_column() -> HashMap<String, Vec<String>> {
        HashMap::from([(
            String::from("col"),
            vec![String::from("a"), String::from("b")],
        )])
    }

    /// Builds a report from `execution` plus the two-value quality input.
    fn report_with_quality(execution: ExecutionMetadata) -> ProfileReport {
        ReportAssembler::new(test_source(), execution)
            .with_quality_data(two_value_column())
            .build()
    }

    #[test]
    fn test_basic_report_assembly() {
        let execution = ExecutionMetadata::new(100, 3, 50);
        let report = ReportAssembler::new(test_source(), execution).build();

        assert_eq!(report.execution.rows_processed, 100);
        assert!(report.quality.is_none());
        assert!(report.column_profiles.is_empty());
    }

    #[test]
    fn test_skip_quality() {
        // Quality data is supplied, but the skip flag must win.
        let assembler = ReportAssembler::new(test_source(), ExecutionMetadata::new(2, 1, 10))
            .with_quality_data(two_value_column())
            .skip_quality();
        let report = assembler.build();

        assert!(report.quality.is_none());
    }

    #[test]
    fn test_batch_produces_exact_confidence() {
        // Two sampled values and two processed rows: not a streaming context.
        let report = report_with_quality(ExecutionMetadata::new(2, 1, 10));

        assert!(report.quality.is_some());
        let quality = report.quality.unwrap();
        assert!(matches!(quality.confidence, MetricConfidence::Exact));
    }

    #[test]
    fn test_streaming_produces_mixed_confidence() {
        // Two sampled values against 1000 processed rows: streaming context.
        let report = report_with_quality(ExecutionMetadata::new(1000, 1, 50));

        assert!(report.quality.is_some());
        let quality = report.quality.unwrap();
        match &quality.confidence {
            MetricConfidence::Mixed {
                exact_dimensions,
                sampled_dimensions,
                sample_size,
            } => {
                assert!(exact_dimensions.contains(&"completeness".to_string()));
                assert!(exact_dimensions.contains(&"key_uniqueness".to_string()));

                assert!(sampled_dimensions.contains(&"consistency".to_string()));
                assert!(sampled_dimensions.contains(&"accuracy".to_string()));
                assert!(sampled_dimensions.contains(&"timeliness".to_string()));
                assert!(sampled_dimensions.contains(&"duplicate_rows".to_string()));

                assert_eq!(*sample_size, 2);
            }
            other => panic!("Expected Mixed confidence, got {:?}", other),
        }
    }

    #[test]
    fn test_sampling_applied_triggers_bifurcation() {
        // Row counts match, so only the sampling flag can select the bifurcated path.
        let execution = ExecutionMetadata::new(2, 1, 10).with_sampling(0.1);
        let report = report_with_quality(execution);

        assert!(report.quality.is_some());
        let quality = report.quality.unwrap();
        assert!(matches!(quality.confidence, MetricConfidence::Mixed { .. }));
    }
}