1use std::collections::HashMap;
9
10use dataprof_core::{
11 ColumnProfile, DataSource, ExecutionMetadata, QualityDimension, SemanticHints,
12};
13use dataprof_metrics::{
14 MetricConfidence, MetricsCalculator, QualityAssessment, analysis::metrics::BifurcatedResult,
15};
16
17use crate::ProfileReport;
18
19pub struct ReportAssembler {
21 source: DataSource,
22 execution: ExecutionMetadata,
23 columns: Vec<ColumnProfile>,
24 quality_data: Option<HashMap<String, Vec<String>>>,
25 confidence: Option<MetricConfidence>,
26 skip_quality: bool,
27 requested_dimensions: Option<Vec<QualityDimension>>,
28 semantic_hints: SemanticHints,
29}
30
31impl ReportAssembler {
32 pub fn new(source: DataSource, execution: ExecutionMetadata) -> Self {
34 Self {
35 source,
36 execution,
37 columns: Vec::new(),
38 quality_data: None,
39 confidence: None,
40 skip_quality: false,
41 requested_dimensions: None,
42 semantic_hints: SemanticHints::default(),
43 }
44 }
45
46 pub fn columns(mut self, columns: Vec<ColumnProfile>) -> Self {
48 self.columns = columns;
49 self
50 }
51
52 pub fn with_quality_data(mut self, data: HashMap<String, Vec<String>>) -> Self {
54 self.quality_data = Some(data);
55 self
56 }
57
58 pub fn with_confidence(mut self, confidence: MetricConfidence) -> Self {
60 self.confidence = Some(confidence);
61 self
62 }
63
64 pub fn skip_quality(mut self) -> Self {
66 self.skip_quality = true;
67 self
68 }
69
70 pub fn with_requested_dimensions(mut self, dims: Vec<QualityDimension>) -> Self {
72 self.requested_dimensions = Some(dims);
73 self
74 }
75
76 pub fn with_semantic_hints(mut self, hints: SemanticHints) -> Self {
78 self.semantic_hints = hints;
79 self
80 }
81
82 pub fn build(self) -> ProfileReport {
84 let quality = if self.skip_quality {
85 None
86 } else if let Some(data) = &self.quality_data {
87 self.compute_quality(data)
88 } else {
89 None
90 };
91
92 ProfileReport::new(self.source, self.columns, self.execution, quality)
93 }
94
95 fn compute_quality(&self, data: &HashMap<String, Vec<String>>) -> Option<QualityAssessment> {
96 let sample_size = data.values().map(|v| v.len()).max().unwrap_or(0);
97 let is_streaming = self.is_streaming_context(sample_size);
98
99 if is_streaming {
100 self.compute_bifurcated_quality(data)
101 } else {
102 self.compute_uniform_quality(data)
103 }
104 }
105
106 fn is_streaming_context(&self, sample_size: usize) -> bool {
107 self.execution.sampling_applied
108 || (sample_size > 0 && sample_size < self.execution.rows_processed)
109 }
110
111 fn compute_bifurcated_quality(
112 &self,
113 data: &HashMap<String, Vec<String>>,
114 ) -> Option<QualityAssessment> {
115 let calculator = MetricsCalculator::new();
116 match calculator.calculate_bifurcated_metrics_with_positive_columns(
117 data,
118 &self.columns,
119 self.requested_dimensions.as_deref(),
120 &self.semantic_hints.positive_columns,
121 ) {
122 Ok(result) => {
123 let confidence = self
124 .confidence
125 .clone()
126 .unwrap_or_else(|| self.mixed_confidence(&result));
127 Some(QualityAssessment {
128 metrics: result.metrics,
129 confidence,
130 })
131 }
132 Err(error) => {
133 log::warn!("Bifurcated quality metrics calculation failed: {error}");
134 None
135 }
136 }
137 }
138
139 fn compute_uniform_quality(
140 &self,
141 data: &HashMap<String, Vec<String>>,
142 ) -> Option<QualityAssessment> {
143 let calculator = MetricsCalculator::new();
144 match calculator.calculate_comprehensive_metrics_with_positive_columns(
145 data,
146 &self.columns,
147 self.requested_dimensions.as_deref(),
148 &self.semantic_hints.positive_columns,
149 ) {
150 Ok(metrics) => {
151 let confidence = self.confidence.clone().unwrap_or(MetricConfidence::Exact);
152 Some(QualityAssessment {
153 metrics,
154 confidence,
155 })
156 }
157 Err(error) => {
158 log::warn!("Quality metrics calculation failed: {error}");
159 None
160 }
161 }
162 }
163
164 fn mixed_confidence(&self, result: &BifurcatedResult) -> MetricConfidence {
165 MetricConfidence::Mixed {
166 exact_dimensions: result.exact_dimensions.clone(),
167 sampled_dimensions: result.sampled_dimensions.clone(),
168 sample_size: result.sample_size,
169 }
170 }
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use dataprof_core::FileFormat;

    /// CSV file source shared by all tests.
    fn test_source() -> DataSource {
        DataSource::File {
            path: "test.csv".to_string(),
            format: FileFormat::Csv,
            size_bytes: 1024,
            modified_at: None,
            parquet_metadata: None,
        }
    }

    /// Quality data with a single entry `col` holding the two values `a` and `b`.
    fn two_value_column() -> HashMap<String, Vec<String>> {
        HashMap::from([(
            "col".to_string(),
            vec!["a".to_string(), "b".to_string()],
        )])
    }

    #[test]
    fn test_basic_report_assembly() {
        let execution = ExecutionMetadata::new(100, 3, 50);
        let report = ReportAssembler::new(test_source(), execution).build();

        assert_eq!(report.execution.rows_processed, 100);
        assert!(report.quality.is_none());
        assert!(report.column_profiles.is_empty());
    }

    #[test]
    fn test_skip_quality() {
        let execution = ExecutionMetadata::new(2, 1, 10);
        let report = ReportAssembler::new(test_source(), execution)
            .with_quality_data(two_value_column())
            .skip_quality()
            .build();

        assert!(report.quality.is_none());
    }

    #[test]
    fn test_batch_produces_exact_confidence() {
        // Two rows processed and two sampled values: not a streaming context.
        let execution = ExecutionMetadata::new(2, 1, 10);
        let report = ReportAssembler::new(test_source(), execution)
            .with_quality_data(two_value_column())
            .build();

        assert!(report.quality.is_some());
        let quality = report.quality.unwrap();
        assert!(matches!(quality.confidence, MetricConfidence::Exact));
    }

    #[test]
    fn test_streaming_produces_mixed_confidence() {
        // 1000 rows processed but only two sampled values: the sample is
        // partial, so the bifurcated path is taken.
        let execution = ExecutionMetadata::new(1000, 1, 50);
        let report = ReportAssembler::new(test_source(), execution)
            .with_quality_data(two_value_column())
            .build();

        assert!(report.quality.is_some());
        let quality = report.quality.unwrap();
        if let MetricConfidence::Mixed {
            exact_dimensions,
            sampled_dimensions,
            sample_size,
        } = &quality.confidence
        {
            assert!(exact_dimensions.contains(&String::from("completeness")));
            assert!(exact_dimensions.contains(&String::from("key_uniqueness")));
            assert!(sampled_dimensions.contains(&String::from("consistency")));
            assert!(sampled_dimensions.contains(&String::from("accuracy")));
            assert!(sampled_dimensions.contains(&String::from("timeliness")));
            assert!(sampled_dimensions.contains(&String::from("duplicate_rows")));
            assert_eq!(*sample_size, 2);
        } else {
            panic!("Expected Mixed confidence, got {:?}", quality.confidence);
        }
    }

    #[test]
    fn test_sampling_applied_triggers_bifurcation() {
        // The row count equals the sample size, so only the sampling metadata
        // (presumably set by `with_sampling`) can select the bifurcated path.
        let execution = ExecutionMetadata::new(2, 1, 10).with_sampling(0.1);
        let report = ReportAssembler::new(test_source(), execution)
            .with_quality_data(two_value_column())
            .build();

        assert!(report.quality.is_some());
        let quality = report.quality.unwrap();
        assert!(matches!(quality.confidence, MetricConfidence::Mixed { .. }));
    }
}