// dataprof_metrics/quality.rs

1use std::collections::HashMap;
2
3use dataprof_core::{ColumnProfile, QualityDimension};
4use serde::{Deserialize, Serialize};
5
6use crate::core::errors::DataProfilerError;
7
/// Completeness metrics (ISO 8000-8).
///
/// The derived `Default` yields `0.0` for both ratios and an empty column
/// list; `QualityMetrics::empty()` builds the "nothing wrong" baseline
/// (`complete_records_ratio: 100.0`) instead.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CompletenessMetrics {
    /// Ratio of missing values. Serialized through
    /// `crate::serde_helpers::round_2`.
    /// NOTE(review): presumably a percentage on the same 0–100 scale as
    /// `complete_records_ratio` — confirm against the calculator.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub missing_values_ratio: f64,
    /// Ratio of complete records, on a percentage-style scale:
    /// `QualityMetrics::empty()` sets it to `100.0` and
    /// `QualityMetrics::overall_score` uses it directly as the completeness
    /// contribution. Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub complete_records_ratio: f64,
    /// Column names flagged as null columns.
    /// NOTE(review): presumably columns whose values are entirely (or mostly)
    /// null — the exact threshold is not visible here; confirm.
    pub null_columns: Vec<String>,
}
17
/// Consistency metrics (ISO 8000-61).
///
/// The derived `Default` yields `0.0` / `0` for every field;
/// `QualityMetrics::empty()` uses `data_type_consistency: 100.0` for its
/// "nothing wrong" baseline.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ConsistencyMetrics {
    /// Data-type consistency on a percentage-style scale:
    /// `QualityMetrics::empty()` sets it to `100.0` and
    /// `QualityMetrics::overall_score` uses it directly as the consistency
    /// contribution. Serialized through `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub data_type_consistency: f64,
    /// Number of format violations recorded.
    /// NOTE(review): what counts as a violation is decided by the calculator,
    /// which is not visible here.
    pub format_violations: usize,
    /// Number of encoding issues recorded (detection logic lives elsewhere).
    pub encoding_issues: usize,
}
26
/// Uniqueness metrics (ISO 8000-110).
///
/// The derived `Default` yields `0` / `0.0` / `false`;
/// `QualityMetrics::empty()` uses `key_uniqueness: 100.0` for its
/// "nothing wrong" baseline.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct UniquenessMetrics {
    /// Number of duplicate rows recorded.
    pub duplicate_rows: usize,
    /// Key uniqueness on a percentage-style scale: `QualityMetrics::empty()`
    /// sets it to `100.0` and `QualityMetrics::overall_score` uses it directly
    /// as the uniqueness contribution. Serialized through
    /// `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub key_uniqueness: f64,
    /// Set when a high-cardinality condition was flagged.
    /// NOTE(review): the triggering threshold is not visible here — confirm
    /// against the calculator.
    pub high_cardinality_warning: bool,
}
35
/// Accuracy metrics (ISO 25012).
///
/// The derived `Default` (all zeros) coincides with the values
/// `QualityMetrics::empty()` uses for this dimension.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AccuracyMetrics {
    /// Outlier ratio on a percentage-style scale: lower is better, and
    /// `QualityMetrics::overall_score` contributes `100.0 - outlier_ratio`
    /// for this dimension. Serialized through
    /// `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub outlier_ratio: f64,
    /// Number of range violations recorded.
    pub range_violations: usize,
    /// NOTE(review): presumably the count of negative values found in columns
    /// expected to hold only positive values — confirm against the calculator.
    pub negative_values_in_positive: usize,
}
44
/// Timeliness metrics (ISO 8000-8).
///
/// The derived `Default` (all zeros) coincides with the values
/// `QualityMetrics::empty()` uses for this dimension.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TimelinessMetrics {
    /// Number of future dates recorded.
    /// NOTE(review): presumably dates later than the time of analysis —
    /// confirm the reference clock with the calculator.
    pub future_dates_count: usize,
    /// Stale-data ratio on a percentage-style scale: lower is better, and
    /// `QualityMetrics::overall_score` contributes `100.0 - stale_data_ratio`
    /// for this dimension. Serialized through
    /// `crate::serde_helpers::round_2`.
    #[serde(serialize_with = "crate::serde_helpers::round_2")]
    pub stale_data_ratio: f64,
    /// Number of temporal violations recorded (detection logic lives
    /// elsewhere).
    pub temporal_violations: usize,
}
53
/// Comprehensive data quality metrics following industry standards.
///
/// Every dimension is optional. A dimension that is `None` is left out of the
/// serialized output and is ignored by `overall_score`, which re-normalises
/// over the dimensions that are present. The derived `Default` has every
/// dimension set to `None`; `QualityMetrics::empty()` populates all five with
/// ideal values.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Completeness dimension; `None` when unavailable (then not serialized).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completeness: Option<CompletenessMetrics>,
    /// Consistency dimension; `None` when unavailable (then not serialized).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub consistency: Option<ConsistencyMetrics>,
    /// Uniqueness dimension; `None` when unavailable (then not serialized).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub uniqueness: Option<UniquenessMetrics>,
    /// Accuracy dimension; `None` when unavailable (then not serialized).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accuracy: Option<AccuracyMetrics>,
    /// Timeliness dimension; `None` when unavailable (then not serialized).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeliness: Option<TimelinessMetrics>,
    /// True when the sample used to compute these metrics was below the
    /// minimum recommended size (10 rows). When set, the quality scores and
    /// per-dimension ratios should be treated as directional rather than
    /// reliable. Backwards-compatible: defaults to `false` when the field is
    /// missing on deserialization and is omitted from output while `false`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub low_sample_warning: bool,
}
74
/// Predicate for `#[serde(skip_serializing_if = "...")]`: returns `true`
/// exactly when the flag is unset, so a `false` field is left out of the
/// serialized output.
///
/// Takes `&bool` because serde hands the field to the predicate by reference.
fn is_false(b: &bool) -> bool {
    matches!(*b, false)
}
78
79impl QualityMetrics {
80    pub fn empty() -> Self {
81        Self {
82            completeness: Some(CompletenessMetrics {
83                missing_values_ratio: 0.0,
84                complete_records_ratio: 100.0,
85                null_columns: vec![],
86            }),
87            consistency: Some(ConsistencyMetrics {
88                data_type_consistency: 100.0,
89                format_violations: 0,
90                encoding_issues: 0,
91            }),
92            uniqueness: Some(UniquenessMetrics {
93                duplicate_rows: 0,
94                key_uniqueness: 100.0,
95                high_cardinality_warning: false,
96            }),
97            accuracy: Some(AccuracyMetrics {
98                outlier_ratio: 0.0,
99                range_violations: 0,
100                negative_values_in_positive: 0,
101            }),
102            timeliness: Some(TimelinessMetrics {
103                future_dates_count: 0,
104                stale_data_ratio: 0.0,
105                temporal_violations: 0,
106            }),
107            low_sample_warning: false,
108        }
109    }
110
111    pub fn calculate_from_data(
112        data: &HashMap<String, Vec<String>>,
113        column_profiles: &[ColumnProfile],
114    ) -> Result<Self, DataProfilerError> {
115        let calculator = crate::analysis::MetricsCalculator::new();
116        calculator.calculate_comprehensive_metrics(data, column_profiles, None)
117    }
118
119    pub fn overall_score(&self) -> f64 {
120        let mut total_weight = 0.0;
121        let mut score = 0.0;
122
123        if let Some(c) = &self.completeness {
124            total_weight += 0.3;
125            score += c.complete_records_ratio * 0.3;
126        }
127        if let Some(c) = &self.consistency {
128            total_weight += 0.25;
129            score += c.data_type_consistency * 0.25;
130        }
131        if let Some(u) = &self.uniqueness {
132            total_weight += 0.2;
133            score += u.key_uniqueness * 0.2;
134        }
135        if let Some(a) = &self.accuracy {
136            total_weight += 0.15;
137            score += (100.0 - a.outlier_ratio) * 0.15;
138        }
139        if let Some(t) = &self.timeliness {
140            total_weight += 0.1;
141            score += (100.0 - t.stale_data_ratio) * 0.1;
142        }
143
144        if total_weight > 0.0 {
145            (score / total_weight).min(100.0)
146        } else {
147            0.0
148        }
149    }
150
151    pub fn missing_values_ratio(&self) -> f64 {
152        self.completeness
153            .as_ref()
154            .map_or(0.0, |c| c.missing_values_ratio)
155    }
156
157    pub fn complete_records_ratio(&self) -> f64 {
158        self.completeness
159            .as_ref()
160            .map_or(100.0, |c| c.complete_records_ratio)
161    }
162
163    pub fn null_columns(&self) -> &[String] {
164        self.completeness.as_ref().map_or(&[], |c| &c.null_columns)
165    }
166
167    pub fn data_type_consistency(&self) -> f64 {
168        self.consistency
169            .as_ref()
170            .map_or(100.0, |c| c.data_type_consistency)
171    }
172
173    pub fn format_violations(&self) -> usize {
174        self.consistency.as_ref().map_or(0, |c| c.format_violations)
175    }
176
177    pub fn encoding_issues(&self) -> usize {
178        self.consistency.as_ref().map_or(0, |c| c.encoding_issues)
179    }
180
181    pub fn duplicate_rows(&self) -> usize {
182        self.uniqueness.as_ref().map_or(0, |u| u.duplicate_rows)
183    }
184
185    pub fn key_uniqueness(&self) -> f64 {
186        self.uniqueness.as_ref().map_or(100.0, |u| u.key_uniqueness)
187    }
188
189    pub fn high_cardinality_warning(&self) -> bool {
190        self.uniqueness
191            .as_ref()
192            .is_some_and(|u| u.high_cardinality_warning)
193    }
194
195    pub fn outlier_ratio(&self) -> f64 {
196        self.accuracy.as_ref().map_or(0.0, |a| a.outlier_ratio)
197    }
198
199    pub fn range_violations(&self) -> usize {
200        self.accuracy.as_ref().map_or(0, |a| a.range_violations)
201    }
202
203    pub fn negative_values_in_positive(&self) -> usize {
204        self.accuracy
205            .as_ref()
206            .map_or(0, |a| a.negative_values_in_positive)
207    }
208
209    pub fn future_dates_count(&self) -> usize {
210        self.timeliness.as_ref().map_or(0, |t| t.future_dates_count)
211    }
212
213    pub fn stale_data_ratio(&self) -> f64 {
214        self.timeliness.as_ref().map_or(0.0, |t| t.stale_data_ratio)
215    }
216
217    pub fn temporal_violations(&self) -> usize {
218        self.timeliness
219            .as_ref()
220            .map_or(0, |t| t.temporal_violations)
221    }
222
223    pub fn supports_dimension(&self, dimension: QualityDimension) -> bool {
224        match dimension {
225            QualityDimension::Completeness => self.completeness.is_some(),
226            QualityDimension::Consistency => self.consistency.is_some(),
227            QualityDimension::Uniqueness => self.uniqueness.is_some(),
228            QualityDimension::Accuracy => self.accuracy.is_some(),
229            QualityDimension::Timeliness => self.timeliness.is_some(),
230        }
231    }
232}
233
/// Confidence level for quality metrics.
///
/// Attached to a `QualityMetrics` value by `QualityAssessment`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MetricConfidence {
    /// No sampling caveat attached. This is what `QualityAssessment::exact`
    /// and `From<QualityMetrics> for QualityAssessment` produce.
    Exact,
    /// Metrics carry sampling information; produced by
    /// `QualityAssessment::approximate`.
    Approximate {
        /// NOTE(review): presumably the number of rows in the sample the
        /// metrics were computed from — confirm with the producer.
        sample_size: usize,
        /// NOTE(review): presumably the total number of rows in the full
        /// data set, `None` when it is not known — confirm with the producer.
        population_size: Option<usize>,
    },
    /// Some dimensions are listed as exact and others as sampled.
    /// No constructor for this variant is defined on `QualityAssessment` in
    /// this file; it has to be built directly.
    Mixed {
        /// Names of the dimensions listed as exact.
        exact_dimensions: Vec<String>,
        /// Names of the dimensions listed as sampled.
        sampled_dimensions: Vec<String>,
        /// NOTE(review): presumably the sample size used for the sampled
        /// dimensions — confirm with the producer.
        sample_size: usize,
    },
}
248
/// Wraps quality metrics with confidence information.
///
/// Build one with `QualityAssessment::exact`, `QualityAssessment::approximate`,
/// or via `From<QualityMetrics>` (which marks the metrics as exact).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityAssessment {
    /// The metrics being assessed.
    pub metrics: QualityMetrics,
    /// The confidence attached to `metrics`.
    pub confidence: MetricConfidence,
}
255
256impl QualityAssessment {
257    pub fn exact(metrics: QualityMetrics) -> Self {
258        Self {
259            metrics,
260            confidence: MetricConfidence::Exact,
261        }
262    }
263
264    pub fn approximate(
265        metrics: QualityMetrics,
266        sample_size: usize,
267        population_size: Option<usize>,
268    ) -> Self {
269        Self {
270            metrics,
271            confidence: MetricConfidence::Approximate {
272                sample_size,
273                population_size,
274            },
275        }
276    }
277
278    pub fn score(&self) -> f64 {
279        self.metrics.overall_score()
280    }
281}
282
283impl From<QualityMetrics> for QualityAssessment {
284    fn from(metrics: QualityMetrics) -> Self {
285        Self::exact(metrics)
286    }
287}
288
#[cfg(test)]
mod tests {
    use super::*;

    /// `empty()` populates every dimension with ideal values, so the score
    /// is a perfect 100.
    #[test]
    fn test_empty_metrics_perfect_score() {
        let metrics = QualityMetrics::empty();
        assert!((metrics.overall_score() - 100.0).abs() < 0.01);
    }

    /// Degrading one dimension at a time (from the perfect baseline) lowers
    /// the score by that dimension's weight expressed in points; the five
    /// drops together must account for exactly 100 points.
    #[test]
    fn test_quality_score_weights_sum_to_100() {
        // Score lost after applying `degrade` to an otherwise perfect baseline.
        fn drop_after(degrade: impl FnOnce(&mut QualityMetrics)) -> f64 {
            let mut metrics = QualityMetrics::empty();
            degrade(&mut metrics);
            100.0 - metrics.overall_score()
        }

        let drops = [
            drop_after(|m| {
                m.completeness
                    .as_mut()
                    .expect("empty() populates completeness")
                    .complete_records_ratio = 0.0
            }),
            drop_after(|m| {
                m.consistency
                    .as_mut()
                    .expect("empty() populates consistency")
                    .data_type_consistency = 0.0
            }),
            drop_after(|m| {
                m.uniqueness
                    .as_mut()
                    .expect("empty() populates uniqueness")
                    .key_uniqueness = 0.0
            }),
            drop_after(|m| {
                m.accuracy
                    .as_mut()
                    .expect("empty() populates accuracy")
                    .outlier_ratio = 100.0
            }),
            drop_after(|m| {
                m.timeliness
                    .as_mut()
                    .expect("empty() populates timeliness")
                    .stale_data_ratio = 100.0
            }),
        ];

        // Every dimension must carry some weight...
        assert!(drops.iter().all(|d| *d > 0.0));
        // ...and together the weights must cover the whole 0–100 range.
        let total_drop: f64 = drops.iter().sum();
        assert!((total_drop - 100.0).abs() < 0.01);
    }

    /// Completeness carries 30% of the weight: zeroing it leaves 70 points.
    #[test]
    fn test_quality_score_completeness_weight() {
        let mut metrics = QualityMetrics::empty();
        if let Some(ref mut c) = metrics.completeness {
            c.complete_records_ratio = 0.0;
        }
        assert!((metrics.overall_score() - 70.0).abs() < 0.01);
    }

    /// Worst possible value in every dimension yields a score of 0.
    #[test]
    fn test_quality_score_all_bad() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics {
                complete_records_ratio: 0.0,
                ..CompletenessMetrics::default()
            }),
            consistency: Some(ConsistencyMetrics {
                data_type_consistency: 0.0,
                ..ConsistencyMetrics::default()
            }),
            uniqueness: Some(UniquenessMetrics {
                key_uniqueness: 0.0,
                ..UniquenessMetrics::default()
            }),
            accuracy: Some(AccuracyMetrics {
                outlier_ratio: 100.0,
                ..AccuracyMetrics::default()
            }),
            timeliness: Some(TimelinessMetrics {
                stale_data_ratio: 100.0,
                ..TimelinessMetrics::default()
            }),
            ..QualityMetrics::default()
        };

        assert!((metrics.overall_score() - 0.0).abs() < 0.01);
    }

    /// With a single dimension present the score is re-normalised over that
    /// dimension alone.
    #[test]
    fn test_partial_dimensions_only_completeness() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics {
                complete_records_ratio: 100.0,
                missing_values_ratio: 0.0,
                null_columns: vec![],
            }),
            ..QualityMetrics::default()
        };

        assert!(metrics.completeness.is_some());
        assert!(metrics.consistency.is_none());
        assert!(metrics.uniqueness.is_none());
        assert!(metrics.accuracy.is_none());
        assert!(metrics.timeliness.is_none());
        assert!((metrics.overall_score() - 100.0).abs() < 0.01);
    }

    /// (50 * 0.3 + 80 * 0.2) / (0.3 + 0.2) = 62.
    #[test]
    fn test_partial_dimensions_two_dimensions() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics {
                complete_records_ratio: 50.0,
                ..CompletenessMetrics::default()
            }),
            uniqueness: Some(UniquenessMetrics {
                key_uniqueness: 80.0,
                ..UniquenessMetrics::default()
            }),
            ..QualityMetrics::default()
        };

        assert!((metrics.overall_score() - 62.0).abs() < 0.01);
    }

    /// No dimensions at all: the score falls back to 0 rather than dividing
    /// by a zero total weight.
    #[test]
    fn test_all_dimensions_none_score_zero() {
        let metrics = QualityMetrics::default();

        assert!((metrics.overall_score() - 0.0).abs() < 0.01);
    }

    /// Absent dimensions are omitted from the JSON output.
    #[test]
    fn test_partial_dimensions_json_skips_none() {
        let metrics = QualityMetrics {
            completeness: Some(CompletenessMetrics::default()),
            ..QualityMetrics::default()
        };

        let json = serde_json::to_string(&metrics).unwrap();
        assert!(json.contains("completeness"));
        assert!(!json.contains("consistency"));
        assert!(!json.contains("uniqueness"));
        assert!(!json.contains("accuracy"));
        assert!(!json.contains("timeliness"));
    }

    /// Flat accessors fall back to their documented defaults when the
    /// dimension they read from is absent.
    #[test]
    fn test_partial_dimensions_flat_accessors_return_defaults() {
        let metrics = QualityMetrics::default();

        assert!((metrics.complete_records_ratio() - 100.0).abs() < 0.01);
        assert!((metrics.data_type_consistency() - 100.0).abs() < 0.01);
        assert!((metrics.key_uniqueness() - 100.0).abs() < 0.01);
        assert!((metrics.missing_values_ratio() - 0.0).abs() < 0.01);
        assert_eq!(metrics.duplicate_rows(), 0);
        assert!(!metrics.high_cardinality_warning());
    }
}