Skip to main content

datasynth_eval/quality/
format.rs

1//! Format consistency evaluation.
2//!
3//! Analyzes format variations in dates, amounts, identifiers, and currency codes.
4
5use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9/// Results of format consistency analysis.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct FormatAnalysis {
12    /// Date format variations.
13    pub date_formats: Vec<FormatVariation>,
14    /// Amount format variations.
15    pub amount_formats: Vec<FormatVariation>,
16    /// Identifier format variations.
17    pub identifier_formats: Vec<FormatVariation>,
18    /// Currency code compliance.
19    pub currency_compliance: f64,
20    /// Overall format consistency score (0.0-1.0).
21    pub consistency_score: f64,
22    /// Format issues detected.
23    pub issues: Vec<FormatIssue>,
24}
25
26/// Variation in a specific format type.
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct FormatVariation {
29    /// Field name.
30    pub field_name: String,
31    /// Format type (e.g., "ISO", "US", "EU").
32    pub format_type: String,
33    /// Count of values in this format.
34    pub count: usize,
35    /// Percentage of total.
36    pub percentage: f64,
37    /// Example values.
38    pub examples: Vec<String>,
39}
40
41/// A format issue detected.
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct FormatIssue {
44    /// Field name.
45    pub field_name: String,
46    /// Issue type.
47    pub issue_type: FormatIssueType,
48    /// Description.
49    pub description: String,
50    /// Example problematic values.
51    pub examples: Vec<String>,
52}
53
54/// Type of format issue.
55#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
56pub enum FormatIssueType {
57    /// Multiple date formats in same field.
58    InconsistentDateFormat,
59    /// Multiple amount formats in same field.
60    InconsistentAmountFormat,
61    /// Case inconsistency in identifiers.
62    InconsistentCase,
63    /// Invalid currency code.
64    InvalidCurrencyCode,
65    /// Invalid decimal places.
66    InvalidDecimalPlaces,
67    /// Invalid separator usage.
68    InvalidSeparator,
69}
70
/// Date layout recognised by the analyzer.
///
/// The `Debug` name of each variant is what ends up in
/// `FormatVariation::format_type`, so the variant names are part of the
/// reported output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DateFormat {
    /// ISO 8601, e.g. `2024-01-15`.
    ISO,
    /// US style, e.g. `01/15/2024`.
    US,
    /// European style, e.g. `15.01.2024`.
    EU,
    /// Spelled-out month, e.g. `January 15, 2024`.
    Long,
    /// None of the layouts above.
    Unknown,
}
85
/// Amount layout recognised by the analyzer.
///
/// The `Debug` name of each variant is what ends up in
/// `FormatVariation::format_type`, so the variant names are part of the
/// reported output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AmountFormat {
    /// No grouping separator, e.g. `1234.56`.
    Plain,
    /// Comma thousands and dot decimal, e.g. `1,234.56`.
    USComma,
    /// Dot thousands and comma decimal, e.g. `1.234,56`.
    European,
    /// Leading currency symbol, e.g. `$1,234.56`.
    CurrencyPrefix,
    /// Trailing currency code, e.g. `1.234,56 EUR`.
    CurrencySuffix,
    /// None of the layouts above.
    Unknown,
}
102
/// Raw string values handed to the analyzer, grouped by kind.
///
/// `Default` yields empty maps and an empty currency list.
#[derive(Debug, Clone, Default)]
pub struct FormatData {
    /// Date columns, keyed by field name.
    pub date_fields: HashMap<String, Vec<String>>,
    /// Amount columns, keyed by field name.
    pub amount_fields: HashMap<String, Vec<String>>,
    /// Identifier columns, keyed by field name.
    pub identifier_fields: HashMap<String, Vec<String>>,
    /// Currency codes to check against the analyzer's accepted set.
    pub currency_codes: Vec<String>,
}
115
/// Analyzer for format consistency.
///
/// Holds the configuration used by the analysis methods: the accepted
/// currency codes and the per-field consistency threshold.
#[derive(Debug, Clone)]
pub struct FormatAnalyzer {
    /// ISO 4217 currency codes accepted as valid. The compliance check
    /// upper-cases each input code before looking it up in this set.
    valid_currencies: std::collections::HashSet<String>,
    /// Minimum share (0.0-1.0) of a field's values that must use the field's
    /// dominant format; below this the field is reported as inconsistent.
    min_field_consistency: f64,
}
123
124impl FormatAnalyzer {
125    /// Create a new analyzer.
126    pub fn new() -> Self {
127        let valid_currencies: std::collections::HashSet<String> = [
128            "USD", "EUR", "GBP", "JPY", "CHF", "CAD", "AUD", "CNY", "HKD", "SGD", "INR", "BRL",
129            "MXN", "KRW", "RUB", "ZAR", "SEK", "NOK", "DKK", "NZD", "THB", "MYR", "IDR", "PHP",
130        ]
131        .iter()
132        .map(|s| s.to_string())
133        .collect();
134
135        Self {
136            valid_currencies,
137            min_field_consistency: 0.95,
138        }
139    }
140
141    /// Analyze format consistency.
142    pub fn analyze(&self, data: &FormatData) -> EvalResult<FormatAnalysis> {
143        let mut date_formats = Vec::new();
144        let mut amount_formats = Vec::new();
145        let mut identifier_formats = Vec::new();
146        let mut issues = Vec::new();
147        let mut consistency_scores = Vec::new();
148
149        // Analyze date formats
150        for (field_name, values) in &data.date_fields {
151            let (formats, field_issues, consistency) = self.analyze_date_field(field_name, values);
152            date_formats.extend(formats);
153            issues.extend(field_issues);
154            consistency_scores.push(consistency);
155        }
156
157        // Analyze amount formats
158        for (field_name, values) in &data.amount_fields {
159            let (formats, field_issues, consistency) =
160                self.analyze_amount_field(field_name, values);
161            amount_formats.extend(formats);
162            issues.extend(field_issues);
163            consistency_scores.push(consistency);
164        }
165
166        // Analyze identifier formats
167        for (field_name, values) in &data.identifier_fields {
168            let (formats, field_issues, consistency) =
169                self.analyze_identifier_field(field_name, values);
170            identifier_formats.extend(formats);
171            issues.extend(field_issues);
172            consistency_scores.push(consistency);
173        }
174
175        // Check currency code compliance
176        let valid_count = data
177            .currency_codes
178            .iter()
179            .filter(|c| self.valid_currencies.contains(c.to_uppercase().as_str()))
180            .count();
181        let currency_compliance = if data.currency_codes.is_empty() {
182            1.0
183        } else {
184            valid_count as f64 / data.currency_codes.len() as f64
185        };
186
187        if currency_compliance < 1.0 {
188            let invalid: Vec<_> = data
189                .currency_codes
190                .iter()
191                .filter(|c| !self.valid_currencies.contains(c.to_uppercase().as_str()))
192                .take(5)
193                .cloned()
194                .collect();
195            issues.push(FormatIssue {
196                field_name: "currency_code".to_string(),
197                issue_type: FormatIssueType::InvalidCurrencyCode,
198                description: format!(
199                    "Found {} invalid currency codes",
200                    data.currency_codes.len() - valid_count
201                ),
202                examples: invalid,
203            });
204        }
205
206        consistency_scores.push(currency_compliance);
207
208        let consistency_score = if consistency_scores.is_empty() {
209            1.0
210        } else {
211            consistency_scores.iter().sum::<f64>() / consistency_scores.len() as f64
212        };
213
214        Ok(FormatAnalysis {
215            date_formats,
216            amount_formats,
217            identifier_formats,
218            currency_compliance,
219            consistency_score,
220            issues,
221        })
222    }
223
224    /// Analyze date field formats.
225    fn analyze_date_field(
226        &self,
227        field_name: &str,
228        values: &[String],
229    ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
230        let mut format_counts: HashMap<DateFormat, Vec<String>> = HashMap::new();
231
232        for value in values {
233            let format = self.detect_date_format(value);
234            format_counts.entry(format).or_default().push(value.clone());
235        }
236
237        let total = values.len();
238        let variations: Vec<FormatVariation> = format_counts
239            .iter()
240            .map(|(format, examples)| FormatVariation {
241                field_name: field_name.to_string(),
242                format_type: format!("{:?}", format),
243                count: examples.len(),
244                percentage: if total > 0 {
245                    examples.len() as f64 / total as f64
246                } else {
247                    0.0
248                },
249                examples: examples.iter().take(3).cloned().collect(),
250            })
251            .collect();
252
253        let mut issues = Vec::new();
254        let dominant_count = format_counts.values().map(|v| v.len()).max().unwrap_or(0);
255        let consistency = if total > 0 {
256            dominant_count as f64 / total as f64
257        } else {
258            1.0
259        };
260
261        if consistency < self.min_field_consistency && format_counts.len() > 1 {
262            issues.push(FormatIssue {
263                field_name: field_name.to_string(),
264                issue_type: FormatIssueType::InconsistentDateFormat,
265                description: format!(
266                    "Multiple date formats detected ({} variants)",
267                    format_counts.len()
268                ),
269                examples: values.iter().take(5).cloned().collect(),
270            });
271        }
272
273        (variations, issues, consistency)
274    }
275
276    /// Detect date format from a string.
277    fn detect_date_format(&self, value: &str) -> DateFormat {
278        let value = value.trim();
279
280        // ISO format: 2024-01-15
281        if value.len() == 10
282            && value.chars().nth(4) == Some('-')
283            && value.chars().nth(7) == Some('-')
284        {
285            return DateFormat::ISO;
286        }
287
288        // US format: 01/15/2024
289        if value.len() == 10
290            && value.chars().nth(2) == Some('/')
291            && value.chars().nth(5) == Some('/')
292        {
293            return DateFormat::US;
294        }
295
296        // EU format: 15.01.2024
297        if value.len() == 10
298            && value.chars().nth(2) == Some('.')
299            && value.chars().nth(5) == Some('.')
300        {
301            return DateFormat::EU;
302        }
303
304        // Long format contains month name
305        if value.contains("January")
306            || value.contains("February")
307            || value.contains("March")
308            || value.contains("April")
309            || value.contains("May")
310            || value.contains("June")
311            || value.contains("July")
312            || value.contains("August")
313            || value.contains("September")
314            || value.contains("October")
315            || value.contains("November")
316            || value.contains("December")
317        {
318            return DateFormat::Long;
319        }
320
321        DateFormat::Unknown
322    }
323
324    /// Analyze amount field formats.
325    fn analyze_amount_field(
326        &self,
327        field_name: &str,
328        values: &[String],
329    ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
330        let mut format_counts: HashMap<AmountFormat, Vec<String>> = HashMap::new();
331
332        for value in values {
333            let format = self.detect_amount_format(value);
334            format_counts.entry(format).or_default().push(value.clone());
335        }
336
337        let total = values.len();
338        let variations: Vec<FormatVariation> = format_counts
339            .iter()
340            .map(|(format, examples)| FormatVariation {
341                field_name: field_name.to_string(),
342                format_type: format!("{:?}", format),
343                count: examples.len(),
344                percentage: if total > 0 {
345                    examples.len() as f64 / total as f64
346                } else {
347                    0.0
348                },
349                examples: examples.iter().take(3).cloned().collect(),
350            })
351            .collect();
352
353        let mut issues = Vec::new();
354        let dominant_count = format_counts.values().map(|v| v.len()).max().unwrap_or(0);
355        let consistency = if total > 0 {
356            dominant_count as f64 / total as f64
357        } else {
358            1.0
359        };
360
361        if consistency < self.min_field_consistency && format_counts.len() > 1 {
362            issues.push(FormatIssue {
363                field_name: field_name.to_string(),
364                issue_type: FormatIssueType::InconsistentAmountFormat,
365                description: format!(
366                    "Multiple amount formats detected ({} variants)",
367                    format_counts.len()
368                ),
369                examples: values.iter().take(5).cloned().collect(),
370            });
371        }
372
373        (variations, issues, consistency)
374    }
375
376    /// Detect amount format from a string.
377    fn detect_amount_format(&self, value: &str) -> AmountFormat {
378        let value = value.trim();
379
380        // Currency prefix ($, €, £)
381        if value.starts_with('$') || value.starts_with('€') || value.starts_with('£') {
382            return AmountFormat::CurrencyPrefix;
383        }
384
385        // Currency suffix (EUR, USD at end)
386        if value.ends_with("EUR")
387            || value.ends_with("USD")
388            || value.ends_with("GBP")
389            || value.ends_with("JPY")
390        {
391            return AmountFormat::CurrencySuffix;
392        }
393
394        // European format (1.234,56)
395        if value.contains('.') && value.contains(',') {
396            let dot_pos = value.rfind('.').unwrap_or(0);
397            let comma_pos = value.rfind(',').unwrap_or(0);
398            if comma_pos > dot_pos {
399                return AmountFormat::European;
400            }
401        }
402
403        // US comma format (1,234.56)
404        if value.contains(',') && value.contains('.') {
405            return AmountFormat::USComma;
406        }
407
408        // Plain format (1234.56)
409        if value.contains('.') || value.chars().all(|c| c.is_ascii_digit() || c == '-') {
410            return AmountFormat::Plain;
411        }
412
413        AmountFormat::Unknown
414    }
415
416    /// Analyze identifier field formats.
417    fn analyze_identifier_field(
418        &self,
419        field_name: &str,
420        values: &[String],
421    ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
422        let mut upper_count = 0;
423        let mut lower_count = 0;
424        let mut mixed_count = 0;
425
426        for value in values {
427            if value
428                .chars()
429                .filter(|c| c.is_alphabetic())
430                .all(|c| c.is_uppercase())
431            {
432                upper_count += 1;
433            } else if value
434                .chars()
435                .filter(|c| c.is_alphabetic())
436                .all(|c| c.is_lowercase())
437            {
438                lower_count += 1;
439            } else {
440                mixed_count += 1;
441            }
442        }
443
444        let total = values.len();
445        let mut variations = Vec::new();
446
447        if upper_count > 0 {
448            variations.push(FormatVariation {
449                field_name: field_name.to_string(),
450                format_type: "UPPERCASE".to_string(),
451                count: upper_count,
452                percentage: upper_count as f64 / total.max(1) as f64,
453                examples: values
454                    .iter()
455                    .filter(|v| {
456                        v.chars()
457                            .filter(|c| c.is_alphabetic())
458                            .all(|c| c.is_uppercase())
459                    })
460                    .take(3)
461                    .cloned()
462                    .collect(),
463            });
464        }
465
466        if lower_count > 0 {
467            variations.push(FormatVariation {
468                field_name: field_name.to_string(),
469                format_type: "lowercase".to_string(),
470                count: lower_count,
471                percentage: lower_count as f64 / total.max(1) as f64,
472                examples: values
473                    .iter()
474                    .filter(|v| {
475                        v.chars()
476                            .filter(|c| c.is_alphabetic())
477                            .all(|c| c.is_lowercase())
478                    })
479                    .take(3)
480                    .cloned()
481                    .collect(),
482            });
483        }
484
485        if mixed_count > 0 {
486            variations.push(FormatVariation {
487                field_name: field_name.to_string(),
488                format_type: "MixedCase".to_string(),
489                count: mixed_count,
490                percentage: mixed_count as f64 / total.max(1) as f64,
491                examples: values.iter().take(3).cloned().collect(),
492            });
493        }
494
495        let dominant_count = upper_count.max(lower_count).max(mixed_count);
496        let consistency = if total > 0 {
497            dominant_count as f64 / total as f64
498        } else {
499            1.0
500        };
501
502        let mut issues = Vec::new();
503        if consistency < self.min_field_consistency && variations.len() > 1 {
504            issues.push(FormatIssue {
505                field_name: field_name.to_string(),
506                issue_type: FormatIssueType::InconsistentCase,
507                description: format!(
508                    "Mixed case formats detected ({} variants)",
509                    variations.len()
510                ),
511                examples: values.iter().take(5).cloned().collect(),
512            });
513        }
514
515        (variations, issues, consistency)
516    }
517}
518
519impl Default for FormatAnalyzer {
520    fn default() -> Self {
521        Self::new()
522    }
523}
524
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into owned `String`s.
    fn to_strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// A field whose values all share one date format yields a single
    /// variation and a high overall score.
    #[test]
    fn test_consistent_formats() {
        let mut data = FormatData::default();
        data.date_fields.insert(
            "posting_date".to_string(),
            to_strings(&["2024-01-15", "2024-01-16", "2024-01-17"]),
        );

        let result = FormatAnalyzer::new().analyze(&data).unwrap();

        assert_eq!(result.date_formats.len(), 1);
        assert!(result.consistency_score > 0.95);
    }

    /// Each supported date layout is recognised.
    #[test]
    fn test_date_format_detection() {
        let analyzer = FormatAnalyzer::new();
        let cases = [
            ("2024-01-15", DateFormat::ISO),
            ("01/15/2024", DateFormat::US),
            ("15.01.2024", DateFormat::EU),
            ("January 15, 2024", DateFormat::Long),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(analyzer.detect_date_format(input), *expected);
        }
    }

    /// Two valid codes out of three give a compliance strictly between
    /// 0.5 and 1.0.
    #[test]
    fn test_currency_compliance() {
        let data = FormatData {
            currency_codes: to_strings(&["USD", "EUR", "INVALID"]),
            ..FormatData::default()
        };

        let result = FormatAnalyzer::new().analyze(&data).unwrap();

        assert!(result.currency_compliance < 1.0);
        assert!(result.currency_compliance > 0.5);
    }
}