Skip to main content

datasynth_eval/quality/
format.rs

1//! Format consistency evaluation.
2//!
3//! Analyzes format variations in dates, amounts, identifiers, and currency codes.
4
5use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
/// Results of format consistency analysis.
///
/// Returned by `FormatAnalyzer::analyze`. The three `*_formats` vectors are flat
/// lists covering every analyzed field; use `FormatVariation::field_name` to group
/// entries that belong to the same field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormatAnalysis {
    /// Date format variations.
    pub date_formats: Vec<FormatVariation>,
    /// Amount format variations.
    pub amount_formats: Vec<FormatVariation>,
    /// Identifier format variations.
    pub identifier_formats: Vec<FormatVariation>,
    /// Currency code compliance: the fraction (0.0-1.0) of supplied currency codes
    /// that the analyzer recognises; 1.0 when no codes were supplied.
    pub currency_compliance: f64,
    /// Overall format consistency score (0.0-1.0).
    pub consistency_score: f64,
    /// Format issues detected.
    pub issues: Vec<FormatIssue>,
}
25
/// Variation in a specific format type.
///
/// One entry describes how many values of a single field were detected in a
/// single format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormatVariation {
    /// Field name.
    pub field_name: String,
    /// Format type (e.g., "ISO", "US", "EU").
    ///
    /// Date and amount entries use the `Debug` name of the detected enum variant;
    /// identifier entries use "UPPERCASE", "lowercase" or "MixedCase".
    pub format_type: String,
    /// Count of values in this format.
    pub count: usize,
    /// Share of the field's values in this format, expressed as a fraction in
    /// 0.0-1.0 (`count / total`), not multiplied by 100.
    pub percentage: f64,
    /// Example values (the analyzer stores at most three).
    pub examples: Vec<String>,
}
40
/// A format issue detected.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormatIssue {
    /// Field name. Currency code issues are reported under the fixed name
    /// "currency_code".
    pub field_name: String,
    /// Issue type.
    pub issue_type: FormatIssueType,
    /// Human-readable description of the issue.
    pub description: String,
    /// Example values for the issue (the analyzer stores at most five).
    pub examples: Vec<String>,
}
53
/// Type of format issue.
///
/// NOTE(review): the analysis code in view only emits `InconsistentDateFormat`,
/// `InconsistentAmountFormat`, `InconsistentCase` and `InvalidCurrencyCode`; the
/// remaining variants are presumably produced or reserved elsewhere - confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FormatIssueType {
    /// Multiple date formats in same field.
    InconsistentDateFormat,
    /// Multiple amount formats in same field.
    InconsistentAmountFormat,
    /// Case inconsistency in identifiers.
    InconsistentCase,
    /// Invalid currency code.
    InvalidCurrencyCode,
    /// Invalid decimal places.
    InvalidDecimalPlaces,
    /// Invalid separator usage.
    InvalidSeparator,
}
70
/// Date layout recognised for a single value.
///
/// The `Debug` name of each variant is what ends up in
/// `FormatVariation::format_type` for date fields.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum DateFormat {
    /// ISO 8601, year first with dashes: `2024-01-15`.
    ISO,
    /// US style, slash separated: `01/15/2024`.
    US,
    /// European style, dot separated: `15.01.2024`.
    EU,
    /// Written-out month name: `January 15, 2024`.
    Long,
    /// None of the layouts above matched.
    Unknown,
}
85
/// Amount layout recognised for a single value.
///
/// The `Debug` name of each variant is what ends up in
/// `FormatVariation::format_type` for amount fields.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum AmountFormat {
    /// No grouping separator: `1234.56`.
    Plain,
    /// Comma thousands separator, dot decimal: `1,234.56`.
    USComma,
    /// Dot thousands separator, comma decimal: `1.234,56`.
    European,
    /// Leading currency symbol: `$1,234.56`.
    CurrencyPrefix,
    /// Trailing currency code: `1.234,56 EUR`.
    CurrencySuffix,
    /// None of the layouts above matched.
    Unknown,
}
102
/// Raw string values handed to the format analyzer.
///
/// Each map is keyed by field name and holds that field's values as text.
/// `Default` yields a value with every collection empty.
#[derive(Clone, Debug, Default)]
pub struct FormatData {
    /// Values of date fields, keyed by field name.
    pub date_fields: HashMap<String, Vec<String>>,
    /// Values of amount fields, keyed by field name.
    pub amount_fields: HashMap<String, Vec<String>>,
    /// Values of identifier fields, keyed by field name.
    pub identifier_fields: HashMap<String, Vec<String>>,
    /// Currency codes encountered in the data.
    pub currency_codes: Vec<String>,
}
115
/// Analyzer for format consistency.
///
/// Holds the configuration used by the analysis: the set of currency codes that
/// count as valid and the per-field consistency threshold below which an
/// inconsistency issue is reported. Derives `Debug` and `Clone` so the analyzer can
/// be logged and shared like the other public types in this module.
#[derive(Debug, Clone)]
pub struct FormatAnalyzer {
    /// Valid ISO 4217 currency codes.
    valid_currencies: std::collections::HashSet<String>,
    /// Minimum consistency threshold for a single field.
    ///
    /// A field whose dominant format covers a smaller share of its values than
    /// this threshold is reported as inconsistent.
    min_field_consistency: f64,
}
123
124impl FormatAnalyzer {
125    /// Create a new analyzer.
126    pub fn new() -> Self {
127        let valid_currencies: std::collections::HashSet<String> = [
128            "USD", "EUR", "GBP", "JPY", "CHF", "CAD", "AUD", "CNY", "HKD", "SGD", "INR", "BRL",
129            "MXN", "KRW", "RUB", "ZAR", "SEK", "NOK", "DKK", "NZD", "THB", "MYR", "IDR", "PHP",
130        ]
131        .iter()
132        .map(std::string::ToString::to_string)
133        .collect();
134
135        Self {
136            valid_currencies,
137            min_field_consistency: 0.95,
138        }
139    }
140
141    /// Analyze format consistency.
142    pub fn analyze(&self, data: &FormatData) -> EvalResult<FormatAnalysis> {
143        let mut date_formats = Vec::new();
144        let mut amount_formats = Vec::new();
145        let mut identifier_formats = Vec::new();
146        let mut issues = Vec::new();
147        let mut consistency_scores = Vec::new();
148
149        // Analyze date formats
150        for (field_name, values) in &data.date_fields {
151            let (formats, field_issues, consistency) = self.analyze_date_field(field_name, values);
152            date_formats.extend(formats);
153            issues.extend(field_issues);
154            consistency_scores.push(consistency);
155        }
156
157        // Analyze amount formats
158        for (field_name, values) in &data.amount_fields {
159            let (formats, field_issues, consistency) =
160                self.analyze_amount_field(field_name, values);
161            amount_formats.extend(formats);
162            issues.extend(field_issues);
163            consistency_scores.push(consistency);
164        }
165
166        // Analyze identifier formats
167        for (field_name, values) in &data.identifier_fields {
168            let (formats, field_issues, consistency) =
169                self.analyze_identifier_field(field_name, values);
170            identifier_formats.extend(formats);
171            issues.extend(field_issues);
172            consistency_scores.push(consistency);
173        }
174
175        // Check currency code compliance
176        let valid_count = data
177            .currency_codes
178            .iter()
179            .filter(|c| self.valid_currencies.contains(c.to_uppercase().as_str()))
180            .count();
181        let currency_compliance = if data.currency_codes.is_empty() {
182            1.0
183        } else {
184            valid_count as f64 / data.currency_codes.len() as f64
185        };
186
187        if currency_compliance < 1.0 {
188            let invalid: Vec<_> = data
189                .currency_codes
190                .iter()
191                .filter(|c| !self.valid_currencies.contains(c.to_uppercase().as_str()))
192                .take(5)
193                .cloned()
194                .collect();
195            issues.push(FormatIssue {
196                field_name: "currency_code".to_string(),
197                issue_type: FormatIssueType::InvalidCurrencyCode,
198                description: format!(
199                    "Found {} invalid currency codes",
200                    data.currency_codes.len() - valid_count
201                ),
202                examples: invalid,
203            });
204        }
205
206        consistency_scores.push(currency_compliance);
207
208        let consistency_score = if consistency_scores.is_empty() {
209            1.0
210        } else {
211            consistency_scores.iter().sum::<f64>() / consistency_scores.len() as f64
212        };
213
214        Ok(FormatAnalysis {
215            date_formats,
216            amount_formats,
217            identifier_formats,
218            currency_compliance,
219            consistency_score,
220            issues,
221        })
222    }
223
224    /// Analyze date field formats.
225    fn analyze_date_field(
226        &self,
227        field_name: &str,
228        values: &[String],
229    ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
230        let mut format_counts: HashMap<DateFormat, Vec<String>> = HashMap::new();
231
232        for value in values {
233            let format = self.detect_date_format(value);
234            format_counts.entry(format).or_default().push(value.clone());
235        }
236
237        let total = values.len();
238        let variations: Vec<FormatVariation> = format_counts
239            .iter()
240            .map(|(format, examples)| FormatVariation {
241                field_name: field_name.to_string(),
242                format_type: format!("{format:?}"),
243                count: examples.len(),
244                percentage: if total > 0 {
245                    examples.len() as f64 / total as f64
246                } else {
247                    0.0
248                },
249                examples: examples.iter().take(3).cloned().collect(),
250            })
251            .collect();
252
253        let mut issues = Vec::new();
254        let dominant_count = format_counts
255            .values()
256            .map(std::vec::Vec::len)
257            .max()
258            .unwrap_or(0);
259        let consistency = if total > 0 {
260            dominant_count as f64 / total as f64
261        } else {
262            1.0
263        };
264
265        if consistency < self.min_field_consistency && format_counts.len() > 1 {
266            issues.push(FormatIssue {
267                field_name: field_name.to_string(),
268                issue_type: FormatIssueType::InconsistentDateFormat,
269                description: format!(
270                    "Multiple date formats detected ({} variants)",
271                    format_counts.len()
272                ),
273                examples: values.iter().take(5).cloned().collect(),
274            });
275        }
276
277        (variations, issues, consistency)
278    }
279
280    /// Detect date format from a string.
281    fn detect_date_format(&self, value: &str) -> DateFormat {
282        let value = value.trim();
283
284        // ISO format: 2024-01-15
285        if value.len() == 10
286            && value.chars().nth(4) == Some('-')
287            && value.chars().nth(7) == Some('-')
288        {
289            return DateFormat::ISO;
290        }
291
292        // US format: 01/15/2024
293        if value.len() == 10
294            && value.chars().nth(2) == Some('/')
295            && value.chars().nth(5) == Some('/')
296        {
297            return DateFormat::US;
298        }
299
300        // EU format: 15.01.2024
301        if value.len() == 10
302            && value.chars().nth(2) == Some('.')
303            && value.chars().nth(5) == Some('.')
304        {
305            return DateFormat::EU;
306        }
307
308        // Long format contains month name
309        if value.contains("January")
310            || value.contains("February")
311            || value.contains("March")
312            || value.contains("April")
313            || value.contains("May")
314            || value.contains("June")
315            || value.contains("July")
316            || value.contains("August")
317            || value.contains("September")
318            || value.contains("October")
319            || value.contains("November")
320            || value.contains("December")
321        {
322            return DateFormat::Long;
323        }
324
325        DateFormat::Unknown
326    }
327
328    /// Analyze amount field formats.
329    fn analyze_amount_field(
330        &self,
331        field_name: &str,
332        values: &[String],
333    ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
334        let mut format_counts: HashMap<AmountFormat, Vec<String>> = HashMap::new();
335
336        for value in values {
337            let format = self.detect_amount_format(value);
338            format_counts.entry(format).or_default().push(value.clone());
339        }
340
341        let total = values.len();
342        let variations: Vec<FormatVariation> = format_counts
343            .iter()
344            .map(|(format, examples)| FormatVariation {
345                field_name: field_name.to_string(),
346                format_type: format!("{format:?}"),
347                count: examples.len(),
348                percentage: if total > 0 {
349                    examples.len() as f64 / total as f64
350                } else {
351                    0.0
352                },
353                examples: examples.iter().take(3).cloned().collect(),
354            })
355            .collect();
356
357        let mut issues = Vec::new();
358        let dominant_count = format_counts
359            .values()
360            .map(std::vec::Vec::len)
361            .max()
362            .unwrap_or(0);
363        let consistency = if total > 0 {
364            dominant_count as f64 / total as f64
365        } else {
366            1.0
367        };
368
369        if consistency < self.min_field_consistency && format_counts.len() > 1 {
370            issues.push(FormatIssue {
371                field_name: field_name.to_string(),
372                issue_type: FormatIssueType::InconsistentAmountFormat,
373                description: format!(
374                    "Multiple amount formats detected ({} variants)",
375                    format_counts.len()
376                ),
377                examples: values.iter().take(5).cloned().collect(),
378            });
379        }
380
381        (variations, issues, consistency)
382    }
383
384    /// Detect amount format from a string.
385    fn detect_amount_format(&self, value: &str) -> AmountFormat {
386        let value = value.trim();
387
388        // Currency prefix ($, €, £)
389        if value.starts_with('$') || value.starts_with('€') || value.starts_with('£') {
390            return AmountFormat::CurrencyPrefix;
391        }
392
393        // Currency suffix (EUR, USD at end)
394        if value.ends_with("EUR")
395            || value.ends_with("USD")
396            || value.ends_with("GBP")
397            || value.ends_with("JPY")
398        {
399            return AmountFormat::CurrencySuffix;
400        }
401
402        // European format (1.234,56)
403        if value.contains('.') && value.contains(',') {
404            let dot_pos = value.rfind('.').unwrap_or(0);
405            let comma_pos = value.rfind(',').unwrap_or(0);
406            if comma_pos > dot_pos {
407                return AmountFormat::European;
408            }
409        }
410
411        // US comma format (1,234.56)
412        if value.contains(',') && value.contains('.') {
413            return AmountFormat::USComma;
414        }
415
416        // Plain format (1234.56)
417        if value.contains('.') || value.chars().all(|c| c.is_ascii_digit() || c == '-') {
418            return AmountFormat::Plain;
419        }
420
421        AmountFormat::Unknown
422    }
423
424    /// Analyze identifier field formats.
425    fn analyze_identifier_field(
426        &self,
427        field_name: &str,
428        values: &[String],
429    ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
430        let mut upper_count = 0;
431        let mut lower_count = 0;
432        let mut mixed_count = 0;
433
434        for value in values {
435            if value
436                .chars()
437                .filter(|c| c.is_alphabetic())
438                .all(char::is_uppercase)
439            {
440                upper_count += 1;
441            } else if value
442                .chars()
443                .filter(|c| c.is_alphabetic())
444                .all(char::is_lowercase)
445            {
446                lower_count += 1;
447            } else {
448                mixed_count += 1;
449            }
450        }
451
452        let total = values.len();
453        let mut variations = Vec::new();
454
455        if upper_count > 0 {
456            variations.push(FormatVariation {
457                field_name: field_name.to_string(),
458                format_type: "UPPERCASE".to_string(),
459                count: upper_count,
460                percentage: upper_count as f64 / total.max(1) as f64,
461                examples: values
462                    .iter()
463                    .filter(|v| {
464                        v.chars()
465                            .filter(|c| c.is_alphabetic())
466                            .all(char::is_uppercase)
467                    })
468                    .take(3)
469                    .cloned()
470                    .collect(),
471            });
472        }
473
474        if lower_count > 0 {
475            variations.push(FormatVariation {
476                field_name: field_name.to_string(),
477                format_type: "lowercase".to_string(),
478                count: lower_count,
479                percentage: lower_count as f64 / total.max(1) as f64,
480                examples: values
481                    .iter()
482                    .filter(|v| {
483                        v.chars()
484                            .filter(|c| c.is_alphabetic())
485                            .all(char::is_lowercase)
486                    })
487                    .take(3)
488                    .cloned()
489                    .collect(),
490            });
491        }
492
493        if mixed_count > 0 {
494            variations.push(FormatVariation {
495                field_name: field_name.to_string(),
496                format_type: "MixedCase".to_string(),
497                count: mixed_count,
498                percentage: mixed_count as f64 / total.max(1) as f64,
499                examples: values.iter().take(3).cloned().collect(),
500            });
501        }
502
503        let dominant_count = upper_count.max(lower_count).max(mixed_count);
504        let consistency = if total > 0 {
505            dominant_count as f64 / total as f64
506        } else {
507            1.0
508        };
509
510        let mut issues = Vec::new();
511        if consistency < self.min_field_consistency && variations.len() > 1 {
512            issues.push(FormatIssue {
513                field_name: field_name.to_string(),
514                issue_type: FormatIssueType::InconsistentCase,
515                description: format!(
516                    "Mixed case formats detected ({} variants)",
517                    variations.len()
518                ),
519                examples: values.iter().take(5).cloned().collect(),
520            });
521        }
522
523        (variations, issues, consistency)
524    }
525}
526
527impl Default for FormatAnalyzer {
528    fn default() -> Self {
529        Self::new()
530    }
531}
532
#[cfg(test)]
// Unwrapping is fine in tests: a panic is simply reported as a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// A field whose values all share one format yields a single variation and a
    /// high overall score.
    #[test]
    fn test_consistent_formats() {
        let mut data = FormatData::default();
        data.date_fields.insert(
            "posting_date".to_string(),
            vec![
                "2024-01-15".to_string(),
                "2024-01-16".to_string(),
                "2024-01-17".to_string(),
            ],
        );

        let analyzer = FormatAnalyzer::new();
        let result = analyzer.analyze(&data).unwrap();

        assert_eq!(result.date_formats.len(), 1);
        assert!(result.consistency_score > 0.95);
        assert!(result.issues.is_empty());
    }

    /// A field mixing several date formats is reported as inconsistent.
    #[test]
    fn test_inconsistent_date_formats() {
        let mut data = FormatData::default();
        data.date_fields.insert(
            "posting_date".to_string(),
            vec![
                "2024-01-15".to_string(),
                "01/15/2024".to_string(),
                "15.01.2024".to_string(),
                "2024-01-16".to_string(),
            ],
        );

        let analyzer = FormatAnalyzer::new();
        let result = analyzer.analyze(&data).unwrap();

        assert_eq!(result.date_formats.len(), 3);
        assert!(result.consistency_score < 0.95);
        assert!(result
            .issues
            .iter()
            .any(|issue| issue.issue_type == FormatIssueType::InconsistentDateFormat));
    }

    /// Each supported date layout is recognised; free text is `Unknown`.
    #[test]
    fn test_date_format_detection() {
        let analyzer = FormatAnalyzer::new();

        assert_eq!(analyzer.detect_date_format("2024-01-15"), DateFormat::ISO);
        assert_eq!(analyzer.detect_date_format("01/15/2024"), DateFormat::US);
        assert_eq!(analyzer.detect_date_format("15.01.2024"), DateFormat::EU);
        assert_eq!(
            analyzer.detect_date_format("January 15, 2024"),
            DateFormat::Long
        );
        assert_eq!(
            analyzer.detect_date_format("not a date"),
            DateFormat::Unknown
        );
    }

    /// Each supported amount layout is recognised; free text is `Unknown`.
    #[test]
    fn test_amount_format_detection() {
        let analyzer = FormatAnalyzer::new();

        assert_eq!(analyzer.detect_amount_format("1234.56"), AmountFormat::Plain);
        assert_eq!(
            analyzer.detect_amount_format("1,234.56"),
            AmountFormat::USComma
        );
        assert_eq!(
            analyzer.detect_amount_format("1.234,56"),
            AmountFormat::European
        );
        assert_eq!(
            analyzer.detect_amount_format("$1,234.56"),
            AmountFormat::CurrencyPrefix
        );
        assert_eq!(
            analyzer.detect_amount_format("1.234,56 EUR"),
            AmountFormat::CurrencySuffix
        );
        assert_eq!(analyzer.detect_amount_format("abc"), AmountFormat::Unknown);
    }

    /// Identifiers mixing upper and lower case are reported as inconsistent.
    #[test]
    fn test_inconsistent_identifier_case() {
        let mut data = FormatData::default();
        data.identifier_fields.insert(
            "account_id".to_string(),
            vec!["ACC-001".to_string(), "acc-002".to_string()],
        );

        let analyzer = FormatAnalyzer::new();
        let result = analyzer.analyze(&data).unwrap();

        assert_eq!(result.identifier_formats.len(), 2);
        assert!(result
            .issues
            .iter()
            .any(|issue| issue.issue_type == FormatIssueType::InconsistentCase));
    }

    /// Two valid codes out of three give a compliance strictly between 0.5 and 1.0
    /// and raise an invalid-currency issue.
    #[test]
    fn test_currency_compliance() {
        let data = FormatData {
            currency_codes: vec!["USD".to_string(), "EUR".to_string(), "INVALID".to_string()],
            ..FormatData::default()
        };

        let analyzer = FormatAnalyzer::new();
        let result = analyzer.analyze(&data).unwrap();

        assert!(result.currency_compliance < 1.0);
        assert!(result.currency_compliance > 0.5);
        assert!(result
            .issues
            .iter()
            .any(|issue| issue.issue_type == FormatIssueType::InvalidCurrencyCode));
    }
}