Skip to main content

datasynth_eval/quality/
completeness.rs

1//! Completeness evaluation.
2//!
3//! Analyzes missing values, required field coverage, and missing patterns.
4
5use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9/// Results of completeness analysis.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct CompletenessAnalysis {
12    /// Total records analyzed.
13    pub total_records: usize,
14    /// Per-field completeness metrics.
15    pub field_completeness: Vec<FieldCompleteness>,
16    /// Overall completeness rate (0.0-1.0).
17    pub overall_completeness: f64,
18    /// Required field completeness rate.
19    pub required_completeness: f64,
20    /// Optional field completeness rate.
21    pub optional_completeness: f64,
22    /// Detected missing pattern.
23    pub missing_pattern: MissingPattern,
24    /// Fields with systematic missing values.
25    pub systematic_missing: Vec<String>,
26    /// Record-level completeness (% of records with all required fields).
27    pub record_completeness: f64,
28}
29
30/// Completeness metrics for a single field.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct FieldCompleteness {
33    /// Field name.
34    pub field_name: String,
35    /// Whether field is required.
36    pub is_required: bool,
37    /// Total values.
38    pub total_values: usize,
39    /// Non-null, non-empty values.
40    pub present_values: usize,
41    /// Null values.
42    pub null_values: usize,
43    /// Empty string values.
44    pub empty_values: usize,
45    /// Completeness rate (0.0-1.0).
46    pub completeness_rate: f64,
47}
48
49/// Detected missing value pattern.
50#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
51pub enum MissingPattern {
52    /// Missing Completely At Random - equal probability across all records.
53    MCAR,
54    /// Missing At Random - depends on other observed values.
55    MAR,
56    /// Missing Not At Random - depends on the missing value itself.
57    MNAR,
58    /// Systematic - entire field groups missing together.
59    Systematic,
60    /// No significant missing pattern detected.
61    None,
62}
63
/// Describes one field that the completeness analyzer should inspect.
#[derive(Clone, Debug)]
pub struct FieldDefinition {
    /// Key under which the field is looked up in each record.
    pub name: String,
    /// `true` if a record only counts as complete when this field is present.
    pub required: bool,
    /// Names of other fields to compare against when looking for values that
    /// go missing together (used for pattern detection).
    pub related_fields: Vec<String>,
}
74
/// State of a single field value, as far as completeness checking is concerned.
///
/// Only the presence state is tracked, never the value itself. The type is
/// comparable (`PartialEq`/`Eq`) so callers and tests can check states with
/// `==` / `assert_eq!` instead of pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// Present value (not null, not empty).
    Present,
    /// Null value. The analyzer also treats a field that is absent from a
    /// record as null.
    Null,
    /// Empty string.
    Empty,
}
85
86/// Analyzer for completeness.
87pub struct CompletenessAnalyzer {
88    /// Field definitions.
89    field_definitions: Vec<FieldDefinition>,
90}
91
92impl CompletenessAnalyzer {
93    /// Create a new analyzer with field definitions.
94    pub fn new(field_definitions: Vec<FieldDefinition>) -> Self {
95        Self { field_definitions }
96    }
97
98    /// Analyze completeness of records.
99    pub fn analyze(
100        &self,
101        records: &[HashMap<String, FieldValue>],
102    ) -> EvalResult<CompletenessAnalysis> {
103        let total_records = records.len();
104        if total_records == 0 {
105            return Ok(CompletenessAnalysis {
106                total_records: 0,
107                field_completeness: vec![],
108                overall_completeness: 1.0,
109                required_completeness: 1.0,
110                optional_completeness: 1.0,
111                missing_pattern: MissingPattern::None,
112                systematic_missing: vec![],
113                record_completeness: 1.0,
114            });
115        }
116
117        let mut field_completeness = Vec::new();
118        let mut required_total = 0;
119        let mut required_present = 0;
120        let mut optional_total = 0;
121        let mut optional_present = 0;
122        let mut all_total = 0;
123        let mut all_present = 0;
124
125        // Analyze each field
126        for field_def in &self.field_definitions {
127            let mut present = 0;
128            let mut null = 0;
129            let mut empty = 0;
130
131            for record in records {
132                match record.get(&field_def.name) {
133                    Some(FieldValue::Present) => present += 1,
134                    Some(FieldValue::Null) => null += 1,
135                    Some(FieldValue::Empty) => empty += 1,
136                    None => null += 1,
137                }
138            }
139
140            let total = present + null + empty;
141            let rate = if total > 0 {
142                present as f64 / total as f64
143            } else {
144                1.0
145            };
146
147            if field_def.required {
148                required_total += total;
149                required_present += present;
150            } else {
151                optional_total += total;
152                optional_present += present;
153            }
154
155            all_total += total;
156            all_present += present;
157
158            field_completeness.push(FieldCompleteness {
159                field_name: field_def.name.clone(),
160                is_required: field_def.required,
161                total_values: total,
162                present_values: present,
163                null_values: null,
164                empty_values: empty,
165                completeness_rate: rate,
166            });
167        }
168
169        let overall_completeness = if all_total > 0 {
170            all_present as f64 / all_total as f64
171        } else {
172            1.0
173        };
174
175        let required_completeness = if required_total > 0 {
176            required_present as f64 / required_total as f64
177        } else {
178            1.0
179        };
180
181        let optional_completeness = if optional_total > 0 {
182            optional_present as f64 / optional_total as f64
183        } else {
184            1.0
185        };
186
187        // Detect missing pattern
188        let (missing_pattern, systematic_missing) =
189            self.detect_missing_pattern(records, &field_completeness);
190
191        // Calculate record-level completeness
192        let required_fields: Vec<_> = self
193            .field_definitions
194            .iter()
195            .filter(|f| f.required)
196            .map(|f| &f.name)
197            .collect();
198
199        let complete_records = records
200            .iter()
201            .filter(|record| {
202                required_fields
203                    .iter()
204                    .all(|field| matches!(record.get(*field), Some(FieldValue::Present)))
205            })
206            .count();
207
208        let record_completeness = if total_records > 0 {
209            complete_records as f64 / total_records as f64
210        } else {
211            1.0
212        };
213
214        Ok(CompletenessAnalysis {
215            total_records,
216            field_completeness,
217            overall_completeness,
218            required_completeness,
219            optional_completeness,
220            missing_pattern,
221            systematic_missing,
222            record_completeness,
223        })
224    }
225
226    /// Detect missing value pattern.
227    fn detect_missing_pattern(
228        &self,
229        records: &[HashMap<String, FieldValue>],
230        field_completeness: &[FieldCompleteness],
231    ) -> (MissingPattern, Vec<String>) {
232        let mut systematic_missing = Vec::new();
233
234        // Check for systematic patterns (fields missing together)
235        for field_def in &self.field_definitions {
236            if !field_def.related_fields.is_empty() {
237                let field_missing: Vec<bool> = records
238                    .iter()
239                    .map(|r| !matches!(r.get(&field_def.name), Some(FieldValue::Present)))
240                    .collect();
241
242                for related in &field_def.related_fields {
243                    let related_missing: Vec<bool> = records
244                        .iter()
245                        .map(|r| !matches!(r.get(related), Some(FieldValue::Present)))
246                        .collect();
247
248                    // Check correlation
249                    let both_missing = field_missing
250                        .iter()
251                        .zip(&related_missing)
252                        .filter(|(a, b)| **a && **b)
253                        .count();
254                    let either_missing = field_missing
255                        .iter()
256                        .zip(&related_missing)
257                        .filter(|(a, b)| **a || **b)
258                        .count();
259
260                    if either_missing > 0 && both_missing as f64 / either_missing as f64 > 0.8 {
261                        systematic_missing.push(format!("{} + {}", field_def.name, related));
262                    }
263                }
264            }
265        }
266
267        if !systematic_missing.is_empty() {
268            return (MissingPattern::Systematic, systematic_missing);
269        }
270
271        // Check for MCAR (uniform missing rate across all fields)
272        let rates: Vec<f64> = field_completeness
273            .iter()
274            .map(|f| 1.0 - f.completeness_rate)
275            .filter(|r| *r > 0.0)
276            .collect();
277
278        if rates.is_empty() {
279            return (MissingPattern::None, vec![]);
280        }
281
282        let mean_rate = rates.iter().sum::<f64>() / rates.len() as f64;
283        let variance: f64 =
284            rates.iter().map(|r| (r - mean_rate).powi(2)).sum::<f64>() / rates.len() as f64;
285        let std_dev = variance.sqrt();
286
287        // Low variance suggests MCAR
288        if std_dev < 0.05 {
289            return (MissingPattern::MCAR, vec![]);
290        }
291
292        // Default to MAR if we can't determine pattern
293        (MissingPattern::MAR, vec![])
294    }
295}
296
297impl Default for CompletenessAnalyzer {
298    fn default() -> Self {
299        Self::new(vec![])
300    }
301}
302
#[cfg(test)]
mod tests {
    use super::*;

    /// Required field definition without related fields.
    fn required_field(name: &str) -> FieldDefinition {
        FieldDefinition {
            name: name.to_string(),
            required: true,
            related_fields: Vec::new(),
        }
    }

    /// Analyzer configured with the two required fields used by every test.
    fn id_name_analyzer() -> CompletenessAnalyzer {
        CompletenessAnalyzer::new(vec![required_field("id"), required_field("name")])
    }

    /// Record carrying the given states for the `id` and `name` fields.
    fn record(id: FieldValue, name: FieldValue) -> HashMap<String, FieldValue> {
        HashMap::from([("id".to_string(), id), ("name".to_string(), name)])
    }

    /// Fully populated records are 100% complete at both levels.
    #[test]
    fn test_complete_data() {
        let records = vec![
            record(FieldValue::Present, FieldValue::Present),
            record(FieldValue::Present, FieldValue::Present),
        ];

        let result = id_name_analyzer().analyze(&records).unwrap();

        assert_eq!(result.overall_completeness, 1.0);
        assert_eq!(result.record_completeness, 1.0);
    }

    /// One null required value lowers the overall rate and makes exactly one
    /// of the two records incomplete.
    #[test]
    fn test_missing_values() {
        let records = vec![
            record(FieldValue::Present, FieldValue::Null),
            record(FieldValue::Present, FieldValue::Present),
        ];

        let result = id_name_analyzer().analyze(&records).unwrap();

        assert!(result.overall_completeness < 1.0);
        assert_eq!(result.record_completeness, 0.5);
    }

    /// With no records at all, completeness is reported as 1.0.
    #[test]
    fn test_empty_records() {
        let result = CompletenessAnalyzer::default().analyze(&[]).unwrap();
        assert_eq!(result.overall_completeness, 1.0);
    }
}