1use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct CompletenessAnalysis {
12 pub total_records: usize,
14 pub field_completeness: Vec<FieldCompleteness>,
16 pub overall_completeness: f64,
18 pub required_completeness: f64,
20 pub optional_completeness: f64,
22 pub missing_pattern: MissingPattern,
24 pub systematic_missing: Vec<String>,
26 pub record_completeness: f64,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct FieldCompleteness {
33 pub field_name: String,
35 pub is_required: bool,
37 pub total_values: usize,
39 pub present_values: usize,
41 pub null_values: usize,
43 pub empty_values: usize,
45 pub completeness_rate: f64,
47}
48
49#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
51pub enum MissingPattern {
52 MCAR,
54 MAR,
56 MNAR,
58 Systematic,
60 None,
62}
63
/// Describes one field that the analyzer should evaluate in every record.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FieldDefinition {
    /// Key under which the field is looked up in each record.
    pub name: String,
    /// Whether the field must be present for a record to count as complete.
    pub required: bool,
    /// Names of other fields whose missingness is compared against this
    /// field when looking for systematic (co-occurring) gaps.
    pub related_fields: Vec<String>,
}
74
/// Presence state of a single field within a record.
///
/// The enum carries no payload, so it is `Copy` and comparable like the
/// other fieldless enum in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FieldValue {
    /// The field has a value.
    Present,
    /// The field is explicitly null.
    Null,
    /// The field exists but holds an empty value.
    Empty,
}
85
86pub struct CompletenessAnalyzer {
88 field_definitions: Vec<FieldDefinition>,
90}
91
92impl CompletenessAnalyzer {
93 pub fn new(field_definitions: Vec<FieldDefinition>) -> Self {
95 Self { field_definitions }
96 }
97
98 pub fn analyze(
100 &self,
101 records: &[HashMap<String, FieldValue>],
102 ) -> EvalResult<CompletenessAnalysis> {
103 let total_records = records.len();
104 if total_records == 0 {
105 return Ok(CompletenessAnalysis {
106 total_records: 0,
107 field_completeness: vec![],
108 overall_completeness: 1.0,
109 required_completeness: 1.0,
110 optional_completeness: 1.0,
111 missing_pattern: MissingPattern::None,
112 systematic_missing: vec![],
113 record_completeness: 1.0,
114 });
115 }
116
117 let mut field_completeness = Vec::new();
118 let mut required_total = 0;
119 let mut required_present = 0;
120 let mut optional_total = 0;
121 let mut optional_present = 0;
122 let mut all_total = 0;
123 let mut all_present = 0;
124
125 for field_def in &self.field_definitions {
127 let mut present = 0;
128 let mut null = 0;
129 let mut empty = 0;
130
131 for record in records {
132 match record.get(&field_def.name) {
133 Some(FieldValue::Present) => present += 1,
134 Some(FieldValue::Null) => null += 1,
135 Some(FieldValue::Empty) => empty += 1,
136 None => null += 1,
137 }
138 }
139
140 let total = present + null + empty;
141 let rate = if total > 0 {
142 present as f64 / total as f64
143 } else {
144 1.0
145 };
146
147 if field_def.required {
148 required_total += total;
149 required_present += present;
150 } else {
151 optional_total += total;
152 optional_present += present;
153 }
154
155 all_total += total;
156 all_present += present;
157
158 field_completeness.push(FieldCompleteness {
159 field_name: field_def.name.clone(),
160 is_required: field_def.required,
161 total_values: total,
162 present_values: present,
163 null_values: null,
164 empty_values: empty,
165 completeness_rate: rate,
166 });
167 }
168
169 let overall_completeness = if all_total > 0 {
170 all_present as f64 / all_total as f64
171 } else {
172 1.0
173 };
174
175 let required_completeness = if required_total > 0 {
176 required_present as f64 / required_total as f64
177 } else {
178 1.0
179 };
180
181 let optional_completeness = if optional_total > 0 {
182 optional_present as f64 / optional_total as f64
183 } else {
184 1.0
185 };
186
187 let (missing_pattern, systematic_missing) =
189 self.detect_missing_pattern(records, &field_completeness);
190
191 let required_fields: Vec<_> = self
193 .field_definitions
194 .iter()
195 .filter(|f| f.required)
196 .map(|f| &f.name)
197 .collect();
198
199 let complete_records = records
200 .iter()
201 .filter(|record| {
202 required_fields
203 .iter()
204 .all(|field| matches!(record.get(*field), Some(FieldValue::Present)))
205 })
206 .count();
207
208 let record_completeness = if total_records > 0 {
209 complete_records as f64 / total_records as f64
210 } else {
211 1.0
212 };
213
214 Ok(CompletenessAnalysis {
215 total_records,
216 field_completeness,
217 overall_completeness,
218 required_completeness,
219 optional_completeness,
220 missing_pattern,
221 systematic_missing,
222 record_completeness,
223 })
224 }
225
226 fn detect_missing_pattern(
228 &self,
229 records: &[HashMap<String, FieldValue>],
230 field_completeness: &[FieldCompleteness],
231 ) -> (MissingPattern, Vec<String>) {
232 let mut systematic_missing = Vec::new();
233
234 for field_def in &self.field_definitions {
236 if !field_def.related_fields.is_empty() {
237 let field_missing: Vec<bool> = records
238 .iter()
239 .map(|r| !matches!(r.get(&field_def.name), Some(FieldValue::Present)))
240 .collect();
241
242 for related in &field_def.related_fields {
243 let related_missing: Vec<bool> = records
244 .iter()
245 .map(|r| !matches!(r.get(related), Some(FieldValue::Present)))
246 .collect();
247
248 let both_missing = field_missing
250 .iter()
251 .zip(&related_missing)
252 .filter(|(a, b)| **a && **b)
253 .count();
254 let either_missing = field_missing
255 .iter()
256 .zip(&related_missing)
257 .filter(|(a, b)| **a || **b)
258 .count();
259
260 if either_missing > 0 && both_missing as f64 / either_missing as f64 > 0.8 {
261 systematic_missing.push(format!("{} + {}", field_def.name, related));
262 }
263 }
264 }
265 }
266
267 if !systematic_missing.is_empty() {
268 return (MissingPattern::Systematic, systematic_missing);
269 }
270
271 let rates: Vec<f64> = field_completeness
273 .iter()
274 .map(|f| 1.0 - f.completeness_rate)
275 .filter(|r| *r > 0.0)
276 .collect();
277
278 if rates.is_empty() {
279 return (MissingPattern::None, vec![]);
280 }
281
282 let mean_rate = rates.iter().sum::<f64>() / rates.len() as f64;
283 let variance: f64 =
284 rates.iter().map(|r| (r - mean_rate).powi(2)).sum::<f64>() / rates.len() as f64;
285 let std_dev = variance.sqrt();
286
287 if std_dev < 0.05 {
289 return (MissingPattern::MCAR, vec![]);
290 }
291
292 (MissingPattern::MAR, vec![])
294 }
295}
296
297impl Default for CompletenessAnalyzer {
298 fn default() -> Self {
299 Self::new(vec![])
300 }
301}
302
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a required field definition without related fields.
    fn required_field(name: &str) -> FieldDefinition {
        FieldDefinition {
            name: name.to_string(),
            required: true,
            related_fields: Vec::new(),
        }
    }

    /// Builds a record holding the given states for the `id` and `name` fields.
    fn record(id: FieldValue, name: FieldValue) -> HashMap<String, FieldValue> {
        HashMap::from([("id".to_string(), id), ("name".to_string(), name)])
    }

    /// Analyzer that requires both `id` and `name`.
    fn id_name_analyzer() -> CompletenessAnalyzer {
        CompletenessAnalyzer::new(vec![required_field("id"), required_field("name")])
    }

    #[test]
    fn test_complete_data() {
        let records = vec![
            record(FieldValue::Present, FieldValue::Present),
            record(FieldValue::Present, FieldValue::Present),
        ];

        let result = id_name_analyzer().analyze(&records).unwrap();

        assert_eq!(result.overall_completeness, 1.0);
        assert_eq!(result.record_completeness, 1.0);
    }

    #[test]
    fn test_missing_values() {
        let records = vec![
            record(FieldValue::Present, FieldValue::Null),
            record(FieldValue::Present, FieldValue::Present),
        ];

        let result = id_name_analyzer().analyze(&records).unwrap();

        assert!(result.overall_completeness < 1.0);
        assert_eq!(result.record_completeness, 0.5);
    }

    #[test]
    fn test_empty_records() {
        let result = CompletenessAnalyzer::default().analyze(&[]).unwrap();
        assert_eq!(result.overall_completeness, 1.0);
    }
}