1use std::collections::HashMap;
2
3use dataprof_core::{ColumnProfile, QualityDimension};
4use serde::{Deserialize, Serialize};
5
6use crate::core::errors::DataProfilerError;
7
/// Completeness dimension of a data-quality report.
///
/// Within this file the ratio fields are treated as 0–100 values:
/// `QualityMetrics::empty` uses `100.0` for `complete_records_ratio` and
/// `QualityMetrics::overall_score` weights that field directly.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CompletenessMetrics {
    /// Share of values that are missing.
    ///
    /// Serialized through `crate::serde_helpers::round_2` (presumably rounds
    /// to two decimals; the helper is not visible here — confirm).
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub missing_values_ratio: f64,
    /// Share of records that are complete; this is the value that feeds the
    /// overall score. Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub complete_records_ratio: f64,
    /// Column names flagged as null (presumably columns with no usable
    /// values — verify against the metrics calculator).
    pub null_columns: Vec<String>,
}
17
/// Consistency dimension of a data-quality report.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ConsistencyMetrics {
    /// Data-type consistency on the 0–100 scale used in this file
    /// (`QualityMetrics::empty` sets it to `100.0`); this value feeds the
    /// overall score. Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub data_type_consistency: f64,
    /// Number of format violations counted.
    pub format_violations: usize,
    /// Number of encoding issues counted.
    pub encoding_issues: usize,
}
26
/// Uniqueness dimension of a data-quality report.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct UniquenessMetrics {
    /// Number of duplicate rows counted.
    pub duplicate_rows: usize,
    /// Key uniqueness on the 0–100 scale used in this file
    /// (`QualityMetrics::empty` sets it to `100.0`); this value feeds the
    /// overall score. Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub key_uniqueness: f64,
    /// Whether a high-cardinality warning was raised (the triggering rule
    /// lives outside this file).
    pub high_cardinality_warning: bool,
}
35
/// Accuracy dimension of a data-quality report.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AccuracyMetrics {
    /// Outlier ratio on a 0–100 scale: `QualityMetrics::overall_score`
    /// scores this dimension as `100.0 - outlier_ratio`.
    /// Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub outlier_ratio: f64,
    /// Number of range violations counted.
    pub range_violations: usize,
    /// Number of negative values counted where positive values were expected
    /// (reading taken from the field name — confirm against the calculator).
    pub negative_values_in_positive: usize,
}
44
/// Timeliness dimension of a data-quality report.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TimelinessMetrics {
    /// Number of future-dated values counted.
    pub future_dates_count: usize,
    /// Stale-data ratio on a 0–100 scale: `QualityMetrics::overall_score`
    /// scores this dimension as `100.0 - stale_data_ratio`.
    /// Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub stale_data_ratio: f64,
    /// Number of temporal violations counted.
    pub temporal_violations: usize,
}
53
/// Data-quality metrics grouped into five optional dimensions.
///
/// A dimension that is `None` is left out of the serialized output and is
/// ignored by `overall_score`. The derived `Default` leaves every dimension
/// `None`; use `QualityMetrics::empty` for a fully populated baseline.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Completeness dimension; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completeness: Option<CompletenessMetrics>,
    /// Consistency dimension; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub consistency: Option<ConsistencyMetrics>,
    /// Uniqueness dimension; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub uniqueness: Option<UniquenessMetrics>,
    /// Accuracy dimension; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accuracy: Option<AccuracyMetrics>,
    /// Timeliness dimension; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeliness: Option<TimelinessMetrics>,
    /// Low-sample warning flag. Omitted from serialized output when `false`
    /// and read back as `false` when the key is absent.
    #[serde(default, skip_serializing_if = "is_false")]
    pub low_sample_warning: bool,
}
74
/// Predicate for serde's `skip_serializing_if`: `true` when the flag is unset.
///
/// Takes `&bool` because serde hands the field to the predicate by reference.
fn is_false(flag: &bool) -> bool {
    !flag
}
78
79impl QualityMetrics {
80 pub fn empty() -> Self {
81 Self {
82 completeness: Some(CompletenessMetrics {
83 missing_values_ratio: 0.0,
84 complete_records_ratio: 100.0,
85 null_columns: vec![],
86 }),
87 consistency: Some(ConsistencyMetrics {
88 data_type_consistency: 100.0,
89 format_violations: 0,
90 encoding_issues: 0,
91 }),
92 uniqueness: Some(UniquenessMetrics {
93 duplicate_rows: 0,
94 key_uniqueness: 100.0,
95 high_cardinality_warning: false,
96 }),
97 accuracy: Some(AccuracyMetrics {
98 outlier_ratio: 0.0,
99 range_violations: 0,
100 negative_values_in_positive: 0,
101 }),
102 timeliness: Some(TimelinessMetrics {
103 future_dates_count: 0,
104 stale_data_ratio: 0.0,
105 temporal_violations: 0,
106 }),
107 low_sample_warning: false,
108 }
109 }
110
111 pub fn calculate_from_data(
112 data: &HashMap<String, Vec<String>>,
113 column_profiles: &[ColumnProfile],
114 ) -> Result<Self, DataProfilerError> {
115 let calculator = crate::analysis::MetricsCalculator::new();
116 calculator.calculate_comprehensive_metrics(data, column_profiles, None)
117 }
118
119 pub fn overall_score(&self) -> f64 {
120 let mut total_weight = 0.0;
121 let mut score = 0.0;
122
123 if let Some(c) = &self.completeness {
124 total_weight += 0.3;
125 score += c.complete_records_ratio * 0.3;
126 }
127 if let Some(c) = &self.consistency {
128 total_weight += 0.25;
129 score += c.data_type_consistency * 0.25;
130 }
131 if let Some(u) = &self.uniqueness {
132 total_weight += 0.2;
133 score += u.key_uniqueness * 0.2;
134 }
135 if let Some(a) = &self.accuracy {
136 total_weight += 0.15;
137 score += (100.0 - a.outlier_ratio) * 0.15;
138 }
139 if let Some(t) = &self.timeliness {
140 total_weight += 0.1;
141 score += (100.0 - t.stale_data_ratio) * 0.1;
142 }
143
144 if total_weight > 0.0 {
145 (score / total_weight).min(100.0)
146 } else {
147 0.0
148 }
149 }
150
151 pub fn missing_values_ratio(&self) -> f64 {
152 self.completeness
153 .as_ref()
154 .map_or(0.0, |c| c.missing_values_ratio)
155 }
156
157 pub fn complete_records_ratio(&self) -> f64 {
158 self.completeness
159 .as_ref()
160 .map_or(100.0, |c| c.complete_records_ratio)
161 }
162
163 pub fn null_columns(&self) -> &[String] {
164 self.completeness.as_ref().map_or(&[], |c| &c.null_columns)
165 }
166
167 pub fn data_type_consistency(&self) -> f64 {
168 self.consistency
169 .as_ref()
170 .map_or(100.0, |c| c.data_type_consistency)
171 }
172
173 pub fn format_violations(&self) -> usize {
174 self.consistency.as_ref().map_or(0, |c| c.format_violations)
175 }
176
177 pub fn encoding_issues(&self) -> usize {
178 self.consistency.as_ref().map_or(0, |c| c.encoding_issues)
179 }
180
181 pub fn duplicate_rows(&self) -> usize {
182 self.uniqueness.as_ref().map_or(0, |u| u.duplicate_rows)
183 }
184
185 pub fn key_uniqueness(&self) -> f64 {
186 self.uniqueness.as_ref().map_or(100.0, |u| u.key_uniqueness)
187 }
188
189 pub fn high_cardinality_warning(&self) -> bool {
190 self.uniqueness
191 .as_ref()
192 .is_some_and(|u| u.high_cardinality_warning)
193 }
194
195 pub fn outlier_ratio(&self) -> f64 {
196 self.accuracy.as_ref().map_or(0.0, |a| a.outlier_ratio)
197 }
198
199 pub fn range_violations(&self) -> usize {
200 self.accuracy.as_ref().map_or(0, |a| a.range_violations)
201 }
202
203 pub fn negative_values_in_positive(&self) -> usize {
204 self.accuracy
205 .as_ref()
206 .map_or(0, |a| a.negative_values_in_positive)
207 }
208
209 pub fn future_dates_count(&self) -> usize {
210 self.timeliness.as_ref().map_or(0, |t| t.future_dates_count)
211 }
212
213 pub fn stale_data_ratio(&self) -> f64 {
214 self.timeliness.as_ref().map_or(0.0, |t| t.stale_data_ratio)
215 }
216
217 pub fn temporal_violations(&self) -> usize {
218 self.timeliness
219 .as_ref()
220 .map_or(0, |t| t.temporal_violations)
221 }
222
223 pub fn supports_dimension(&self, dimension: QualityDimension) -> bool {
224 match dimension {
225 QualityDimension::Completeness => self.completeness.is_some(),
226 QualityDimension::Consistency => self.consistency.is_some(),
227 QualityDimension::Uniqueness => self.uniqueness.is_some(),
228 QualityDimension::Accuracy => self.accuracy.is_some(),
229 QualityDimension::Timeliness => self.timeliness.is_some(),
230 }
231 }
232}
233
/// How much trust to place in a set of quality metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MetricConfidence {
    /// Metrics carry no sampling information; this is the variant attached by
    /// `QualityAssessment::exact`.
    Exact,
    /// Metrics are tagged as approximate and carry sampling details.
    Approximate {
        /// Sample size recorded for the approximation.
        sample_size: usize,
        /// Population size, when it is known.
        population_size: Option<usize>,
    },
    /// Metrics where the dimensions are split into an exact group and a
    /// sampled group.
    Mixed {
        /// Names of the dimensions listed as exact.
        exact_dimensions: Vec<String>,
        /// Names of the dimensions listed as sampled.
        sampled_dimensions: Vec<String>,
        /// Sample size recorded for the sampled dimensions.
        sample_size: usize,
    },
}
248
/// Quality metrics paired with the confidence attached to them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityAssessment {
    /// The measured quality metrics.
    pub metrics: QualityMetrics,
    /// Confidence level attached to `metrics`.
    pub confidence: MetricConfidence,
}
255
256impl QualityAssessment {
257 pub fn exact(metrics: QualityMetrics) -> Self {
258 Self {
259 metrics,
260 confidence: MetricConfidence::Exact,
261 }
262 }
263
264 pub fn approximate(
265 metrics: QualityMetrics,
266 sample_size: usize,
267 population_size: Option<usize>,
268 ) -> Self {
269 Self {
270 metrics,
271 confidence: MetricConfidence::Approximate {
272 sample_size,
273 population_size,
274 },
275 }
276 }
277
278 pub fn score(&self) -> f64 {
279 self.metrics.overall_score()
280 }
281}
282
283impl From<QualityMetrics> for QualityAssessment {
284 fn from(metrics: QualityMetrics) -> Self {
285 Self::exact(metrics)
286 }
287}
288
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used when comparing floating-point scores.
    const EPS: f64 = 0.01;

    /// A fully populated, defect-free report scores 100.
    #[test]
    fn test_empty_metrics_perfect_score() {
        let metrics = QualityMetrics::empty();
        assert!((metrics.overall_score() - 100.0).abs() < EPS);
    }

    /// Driving one dimension at a time to its worst value costs exactly that
    /// dimension's weight; across all five dimensions the losses must add up
    /// to the full 100 points.
    #[test]
    fn test_quality_score_weights_sum_to_100() {
        let perfect = QualityMetrics::empty().overall_score();

        let worst_cases = [
            QualityMetrics {
                completeness: Some(CompletenessMetrics {
                    complete_records_ratio: 0.0,
                    ..CompletenessMetrics::default()
                }),
                ..QualityMetrics::empty()
            },
            QualityMetrics {
                consistency: Some(ConsistencyMetrics {
                    data_type_consistency: 0.0,
                    ..ConsistencyMetrics::default()
                }),
                ..QualityMetrics::empty()
            },
            QualityMetrics {
                uniqueness: Some(UniquenessMetrics {
                    key_uniqueness: 0.0,
                    ..UniquenessMetrics::default()
                }),
                ..QualityMetrics::empty()
            },
            QualityMetrics {
                accuracy: Some(AccuracyMetrics {
                    outlier_ratio: 100.0,
                    ..AccuracyMetrics::default()
                }),
                ..QualityMetrics::empty()
            },
            QualityMetrics {
                timeliness: Some(TimelinessMetrics {
                    stale_data_ratio: 100.0,
                    ..TimelinessMetrics::default()
                }),
                ..QualityMetrics::empty()
            },
        ];

        let total_loss: f64 = worst_cases
            .iter()
            .map(|metrics| perfect - metrics.overall_score())
            .sum();

        assert!((perfect - 100.0).abs() < EPS);
        assert!((total_loss - 100.0).abs() < EPS);
    }

    /// Completeness carries 30% of the score.
    #[test]
    fn test_quality_score_completeness_weight() {
        let mut metrics = QualityMetrics::empty();
        if let Some(c) = metrics.completeness.as_mut() {
            c.complete_records_ratio = 0.0;
        }
        assert!((metrics.overall_score() - 70.0).abs() < EPS);
    }

    /// Every dimension at its worst value scores 0.
    #[test]
    fn test_quality_score_all_bad() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics {
                complete_records_ratio: 0.0,
                ..CompletenessMetrics::default()
            }),
            consistency: Some(ConsistencyMetrics {
                data_type_consistency: 0.0,
                ..ConsistencyMetrics::default()
            }),
            uniqueness: Some(UniquenessMetrics {
                key_uniqueness: 0.0,
                ..UniquenessMetrics::default()
            }),
            accuracy: Some(AccuracyMetrics {
                outlier_ratio: 100.0,
                ..AccuracyMetrics::default()
            }),
            timeliness: Some(TimelinessMetrics {
                stale_data_ratio: 100.0,
                ..TimelinessMetrics::default()
            }),
            ..QualityMetrics::default()
        };

        assert!((metrics.overall_score() - 0.0).abs() < EPS);
    }

    /// A single present dimension is renormalized to the full 0–100 range.
    #[test]
    fn test_partial_dimensions_only_completeness() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics {
                complete_records_ratio: 100.0,
                missing_values_ratio: 0.0,
                null_columns: vec![],
            }),
            ..QualityMetrics::default()
        };

        assert!(metrics.completeness.is_some());
        assert!(metrics.consistency.is_none());
        assert!(metrics.uniqueness.is_none());
        assert!(metrics.accuracy.is_none());
        assert!(metrics.timeliness.is_none());
        assert!((metrics.overall_score() - 100.0).abs() < EPS);
    }

    /// With two dimensions the weights are renormalized:
    /// (50 * 0.3 + 80 * 0.2) / (0.3 + 0.2) = 62.
    #[test]
    fn test_partial_dimensions_two_dimensions() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics {
                complete_records_ratio: 50.0,
                ..CompletenessMetrics::default()
            }),
            uniqueness: Some(UniquenessMetrics {
                key_uniqueness: 80.0,
                ..UniquenessMetrics::default()
            }),
            ..QualityMetrics::default()
        };

        assert!((metrics.overall_score() - 62.0).abs() < EPS);
    }

    /// No dimensions at all scores 0 rather than dividing by zero.
    #[test]
    fn test_all_dimensions_none_score_zero() {
        let metrics = QualityMetrics::default();

        assert!((metrics.overall_score() - 0.0).abs() < EPS);
    }

    /// Absent dimensions and an unset warning flag are left out of the JSON.
    #[test]
    fn test_partial_dimensions_json_skips_none() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics::default()),
            ..QualityMetrics::default()
        };

        let json = serde_json::to_string(&metrics).unwrap();
        assert!(json.contains("completeness"));
        assert!(!json.contains("consistency"));
        assert!(!json.contains("uniqueness"));
        assert!(!json.contains("accuracy"));
        assert!(!json.contains("timeliness"));
        assert!(!json.contains("low_sample_warning"));
    }

    /// Every flat accessor falls back to its neutral default when the
    /// backing dimension is absent.
    #[test]
    fn test_partial_dimensions_flat_accessors_return_defaults() {
        let metrics = QualityMetrics::default();

        // Completeness
        assert!((metrics.complete_records_ratio() - 100.0).abs() < EPS);
        assert!((metrics.missing_values_ratio() - 0.0).abs() < EPS);
        assert!(metrics.null_columns().is_empty());

        // Consistency
        assert!((metrics.data_type_consistency() - 100.0).abs() < EPS);
        assert_eq!(metrics.format_violations(), 0);
        assert_eq!(metrics.encoding_issues(), 0);

        // Uniqueness
        assert!((metrics.key_uniqueness() - 100.0).abs() < EPS);
        assert_eq!(metrics.duplicate_rows(), 0);
        assert!(!metrics.high_cardinality_warning());

        // Accuracy
        assert!((metrics.outlier_ratio() - 0.0).abs() < EPS);
        assert_eq!(metrics.range_violations(), 0);
        assert_eq!(metrics.negative_values_in_positive(), 0);

        // Timeliness
        assert_eq!(metrics.future_dates_count(), 0);
        assert!((metrics.stale_data_ratio() - 0.0).abs() < EPS);
        assert_eq!(metrics.temporal_violations(), 0);
    }
}