1use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct FormatAnalysis {
12 pub date_formats: Vec<FormatVariation>,
14 pub amount_formats: Vec<FormatVariation>,
16 pub identifier_formats: Vec<FormatVariation>,
18 pub currency_compliance: f64,
20 pub consistency_score: f64,
22 pub issues: Vec<FormatIssue>,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct FormatVariation {
29 pub field_name: String,
31 pub format_type: String,
33 pub count: usize,
35 pub percentage: f64,
37 pub examples: Vec<String>,
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct FormatIssue {
44 pub field_name: String,
46 pub issue_type: FormatIssueType,
48 pub description: String,
50 pub examples: Vec<String>,
52}
53
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
56pub enum FormatIssueType {
57 InconsistentDateFormat,
59 InconsistentAmountFormat,
61 InconsistentCase,
63 InvalidCurrencyCode,
65 InvalidDecimalPlaces,
67 InvalidSeparator,
69}
70
/// Textual shape of a date value as recognised by the analyzer.
///
/// The `Debug` name of each variant is used as the `format_type` label in reports.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DateFormat {
    /// Dash-separated, year first, e.g. `2024-01-15`.
    ISO,
    /// Slash-separated, e.g. `01/15/2024`.
    US,
    /// Dot-separated, e.g. `15.01.2024`.
    EU,
    /// Contains a spelled-out English month name, e.g. `January 15, 2024`.
    Long,
    /// None of the recognised shapes.
    Unknown,
}
85
/// Textual shape of a monetary amount as recognised by the analyzer.
///
/// The `Debug` name of each variant is used as the `format_type` label in reports.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AmountFormat {
    /// Digits with an optional `.` and/or `-`, e.g. `1234.56`.
    Plain,
    /// Comma before the last dot, e.g. `1,234.56`.
    USComma,
    /// Dot before the last comma, e.g. `1.234,56`.
    European,
    /// Starts with a currency symbol, e.g. `$100`.
    CurrencyPrefix,
    /// Ends with a currency code, e.g. `100 EUR`.
    CurrencySuffix,
    /// None of the recognised shapes.
    Unknown,
}
102
/// Raw input for the analyzer: the values to inspect, grouped by kind.
///
/// Each map is keyed by field name and holds that field's raw string values.
#[derive(Debug, Clone, Default)]
pub struct FormatData {
    /// Date fields: field name -> raw values.
    pub date_fields: HashMap<String, Vec<String>>,
    /// Amount fields: field name -> raw values.
    pub amount_fields: HashMap<String, Vec<String>>,
    /// Identifier fields: field name -> raw values.
    pub identifier_fields: HashMap<String, Vec<String>>,
    /// Currency codes to check against the analyzer's accepted set.
    pub currency_codes: Vec<String>,
}
115
/// Checks date, amount, identifier and currency-code values for format consistency.
pub struct FormatAnalyzer {
    /// Upper-case currency codes that count as valid.
    valid_currencies: std::collections::HashSet<String>,
    /// A field whose dominant-format share falls below this fraction (and which has
    /// more than one format) is reported as inconsistent.
    min_field_consistency: f64,
}
123
124impl FormatAnalyzer {
125 pub fn new() -> Self {
127 let valid_currencies: std::collections::HashSet<String> = [
128 "USD", "EUR", "GBP", "JPY", "CHF", "CAD", "AUD", "CNY", "HKD", "SGD", "INR", "BRL",
129 "MXN", "KRW", "RUB", "ZAR", "SEK", "NOK", "DKK", "NZD", "THB", "MYR", "IDR", "PHP",
130 ]
131 .iter()
132 .map(|s| s.to_string())
133 .collect();
134
135 Self {
136 valid_currencies,
137 min_field_consistency: 0.95,
138 }
139 }
140
141 pub fn analyze(&self, data: &FormatData) -> EvalResult<FormatAnalysis> {
143 let mut date_formats = Vec::new();
144 let mut amount_formats = Vec::new();
145 let mut identifier_formats = Vec::new();
146 let mut issues = Vec::new();
147 let mut consistency_scores = Vec::new();
148
149 for (field_name, values) in &data.date_fields {
151 let (formats, field_issues, consistency) = self.analyze_date_field(field_name, values);
152 date_formats.extend(formats);
153 issues.extend(field_issues);
154 consistency_scores.push(consistency);
155 }
156
157 for (field_name, values) in &data.amount_fields {
159 let (formats, field_issues, consistency) =
160 self.analyze_amount_field(field_name, values);
161 amount_formats.extend(formats);
162 issues.extend(field_issues);
163 consistency_scores.push(consistency);
164 }
165
166 for (field_name, values) in &data.identifier_fields {
168 let (formats, field_issues, consistency) =
169 self.analyze_identifier_field(field_name, values);
170 identifier_formats.extend(formats);
171 issues.extend(field_issues);
172 consistency_scores.push(consistency);
173 }
174
175 let valid_count = data
177 .currency_codes
178 .iter()
179 .filter(|c| self.valid_currencies.contains(c.to_uppercase().as_str()))
180 .count();
181 let currency_compliance = if data.currency_codes.is_empty() {
182 1.0
183 } else {
184 valid_count as f64 / data.currency_codes.len() as f64
185 };
186
187 if currency_compliance < 1.0 {
188 let invalid: Vec<_> = data
189 .currency_codes
190 .iter()
191 .filter(|c| !self.valid_currencies.contains(c.to_uppercase().as_str()))
192 .take(5)
193 .cloned()
194 .collect();
195 issues.push(FormatIssue {
196 field_name: "currency_code".to_string(),
197 issue_type: FormatIssueType::InvalidCurrencyCode,
198 description: format!(
199 "Found {} invalid currency codes",
200 data.currency_codes.len() - valid_count
201 ),
202 examples: invalid,
203 });
204 }
205
206 consistency_scores.push(currency_compliance);
207
208 let consistency_score = if consistency_scores.is_empty() {
209 1.0
210 } else {
211 consistency_scores.iter().sum::<f64>() / consistency_scores.len() as f64
212 };
213
214 Ok(FormatAnalysis {
215 date_formats,
216 amount_formats,
217 identifier_formats,
218 currency_compliance,
219 consistency_score,
220 issues,
221 })
222 }
223
224 fn analyze_date_field(
226 &self,
227 field_name: &str,
228 values: &[String],
229 ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
230 let mut format_counts: HashMap<DateFormat, Vec<String>> = HashMap::new();
231
232 for value in values {
233 let format = self.detect_date_format(value);
234 format_counts.entry(format).or_default().push(value.clone());
235 }
236
237 let total = values.len();
238 let variations: Vec<FormatVariation> = format_counts
239 .iter()
240 .map(|(format, examples)| FormatVariation {
241 field_name: field_name.to_string(),
242 format_type: format!("{:?}", format),
243 count: examples.len(),
244 percentage: if total > 0 {
245 examples.len() as f64 / total as f64
246 } else {
247 0.0
248 },
249 examples: examples.iter().take(3).cloned().collect(),
250 })
251 .collect();
252
253 let mut issues = Vec::new();
254 let dominant_count = format_counts.values().map(|v| v.len()).max().unwrap_or(0);
255 let consistency = if total > 0 {
256 dominant_count as f64 / total as f64
257 } else {
258 1.0
259 };
260
261 if consistency < self.min_field_consistency && format_counts.len() > 1 {
262 issues.push(FormatIssue {
263 field_name: field_name.to_string(),
264 issue_type: FormatIssueType::InconsistentDateFormat,
265 description: format!(
266 "Multiple date formats detected ({} variants)",
267 format_counts.len()
268 ),
269 examples: values.iter().take(5).cloned().collect(),
270 });
271 }
272
273 (variations, issues, consistency)
274 }
275
276 fn detect_date_format(&self, value: &str) -> DateFormat {
278 let value = value.trim();
279
280 if value.len() == 10
282 && value.chars().nth(4) == Some('-')
283 && value.chars().nth(7) == Some('-')
284 {
285 return DateFormat::ISO;
286 }
287
288 if value.len() == 10
290 && value.chars().nth(2) == Some('/')
291 && value.chars().nth(5) == Some('/')
292 {
293 return DateFormat::US;
294 }
295
296 if value.len() == 10
298 && value.chars().nth(2) == Some('.')
299 && value.chars().nth(5) == Some('.')
300 {
301 return DateFormat::EU;
302 }
303
304 if value.contains("January")
306 || value.contains("February")
307 || value.contains("March")
308 || value.contains("April")
309 || value.contains("May")
310 || value.contains("June")
311 || value.contains("July")
312 || value.contains("August")
313 || value.contains("September")
314 || value.contains("October")
315 || value.contains("November")
316 || value.contains("December")
317 {
318 return DateFormat::Long;
319 }
320
321 DateFormat::Unknown
322 }
323
324 fn analyze_amount_field(
326 &self,
327 field_name: &str,
328 values: &[String],
329 ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
330 let mut format_counts: HashMap<AmountFormat, Vec<String>> = HashMap::new();
331
332 for value in values {
333 let format = self.detect_amount_format(value);
334 format_counts.entry(format).or_default().push(value.clone());
335 }
336
337 let total = values.len();
338 let variations: Vec<FormatVariation> = format_counts
339 .iter()
340 .map(|(format, examples)| FormatVariation {
341 field_name: field_name.to_string(),
342 format_type: format!("{:?}", format),
343 count: examples.len(),
344 percentage: if total > 0 {
345 examples.len() as f64 / total as f64
346 } else {
347 0.0
348 },
349 examples: examples.iter().take(3).cloned().collect(),
350 })
351 .collect();
352
353 let mut issues = Vec::new();
354 let dominant_count = format_counts.values().map(|v| v.len()).max().unwrap_or(0);
355 let consistency = if total > 0 {
356 dominant_count as f64 / total as f64
357 } else {
358 1.0
359 };
360
361 if consistency < self.min_field_consistency && format_counts.len() > 1 {
362 issues.push(FormatIssue {
363 field_name: field_name.to_string(),
364 issue_type: FormatIssueType::InconsistentAmountFormat,
365 description: format!(
366 "Multiple amount formats detected ({} variants)",
367 format_counts.len()
368 ),
369 examples: values.iter().take(5).cloned().collect(),
370 });
371 }
372
373 (variations, issues, consistency)
374 }
375
376 fn detect_amount_format(&self, value: &str) -> AmountFormat {
378 let value = value.trim();
379
380 if value.starts_with('$') || value.starts_with('€') || value.starts_with('£') {
382 return AmountFormat::CurrencyPrefix;
383 }
384
385 if value.ends_with("EUR")
387 || value.ends_with("USD")
388 || value.ends_with("GBP")
389 || value.ends_with("JPY")
390 {
391 return AmountFormat::CurrencySuffix;
392 }
393
394 if value.contains('.') && value.contains(',') {
396 let dot_pos = value.rfind('.').unwrap_or(0);
397 let comma_pos = value.rfind(',').unwrap_or(0);
398 if comma_pos > dot_pos {
399 return AmountFormat::European;
400 }
401 }
402
403 if value.contains(',') && value.contains('.') {
405 return AmountFormat::USComma;
406 }
407
408 if value.contains('.') || value.chars().all(|c| c.is_ascii_digit() || c == '-') {
410 return AmountFormat::Plain;
411 }
412
413 AmountFormat::Unknown
414 }
415
416 fn analyze_identifier_field(
418 &self,
419 field_name: &str,
420 values: &[String],
421 ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
422 let mut upper_count = 0;
423 let mut lower_count = 0;
424 let mut mixed_count = 0;
425
426 for value in values {
427 if value
428 .chars()
429 .filter(|c| c.is_alphabetic())
430 .all(|c| c.is_uppercase())
431 {
432 upper_count += 1;
433 } else if value
434 .chars()
435 .filter(|c| c.is_alphabetic())
436 .all(|c| c.is_lowercase())
437 {
438 lower_count += 1;
439 } else {
440 mixed_count += 1;
441 }
442 }
443
444 let total = values.len();
445 let mut variations = Vec::new();
446
447 if upper_count > 0 {
448 variations.push(FormatVariation {
449 field_name: field_name.to_string(),
450 format_type: "UPPERCASE".to_string(),
451 count: upper_count,
452 percentage: upper_count as f64 / total.max(1) as f64,
453 examples: values
454 .iter()
455 .filter(|v| {
456 v.chars()
457 .filter(|c| c.is_alphabetic())
458 .all(|c| c.is_uppercase())
459 })
460 .take(3)
461 .cloned()
462 .collect(),
463 });
464 }
465
466 if lower_count > 0 {
467 variations.push(FormatVariation {
468 field_name: field_name.to_string(),
469 format_type: "lowercase".to_string(),
470 count: lower_count,
471 percentage: lower_count as f64 / total.max(1) as f64,
472 examples: values
473 .iter()
474 .filter(|v| {
475 v.chars()
476 .filter(|c| c.is_alphabetic())
477 .all(|c| c.is_lowercase())
478 })
479 .take(3)
480 .cloned()
481 .collect(),
482 });
483 }
484
485 if mixed_count > 0 {
486 variations.push(FormatVariation {
487 field_name: field_name.to_string(),
488 format_type: "MixedCase".to_string(),
489 count: mixed_count,
490 percentage: mixed_count as f64 / total.max(1) as f64,
491 examples: values.iter().take(3).cloned().collect(),
492 });
493 }
494
495 let dominant_count = upper_count.max(lower_count).max(mixed_count);
496 let consistency = if total > 0 {
497 dominant_count as f64 / total as f64
498 } else {
499 1.0
500 };
501
502 let mut issues = Vec::new();
503 if consistency < self.min_field_consistency && variations.len() > 1 {
504 issues.push(FormatIssue {
505 field_name: field_name.to_string(),
506 issue_type: FormatIssueType::InconsistentCase,
507 description: format!(
508 "Mixed case formats detected ({} variants)",
509 variations.len()
510 ),
511 examples: values.iter().take(5).cloned().collect(),
512 });
513 }
514
515 (variations, issues, consistency)
516 }
517}
518
519impl Default for FormatAnalyzer {
520 fn default() -> Self {
521 Self::new()
522 }
523}
524
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts string literals into the owned strings the input types expect.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// A date field whose values all share one format yields a single variation and
    /// a high overall score.
    #[test]
    fn test_consistent_formats() {
        let mut data = FormatData::default();
        data.date_fields.insert(
            "posting_date".to_string(),
            owned(&["2024-01-15", "2024-01-16", "2024-01-17"]),
        );

        let result = FormatAnalyzer::new().analyze(&data).unwrap();

        assert_eq!(result.date_formats.len(), 1);
        assert!(result.consistency_score > 0.95);
    }

    /// Each supported date shape is recognised.
    #[test]
    fn test_date_format_detection() {
        let analyzer = FormatAnalyzer::new();
        let cases = [
            ("2024-01-15", DateFormat::ISO),
            ("01/15/2024", DateFormat::US),
            ("15.01.2024", DateFormat::EU),
            ("January 15, 2024", DateFormat::Long),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(analyzer.detect_date_format(input), *expected);
        }
    }

    /// One unknown code out of three lowers compliance to two thirds.
    #[test]
    fn test_currency_compliance() {
        let data = FormatData {
            currency_codes: owned(&["USD", "EUR", "INVALID"]),
            ..FormatData::default()
        };

        let result = FormatAnalyzer::new().analyze(&data).unwrap();

        assert!(result.currency_compliance < 1.0);
        assert!(result.currency_compliance > 0.5);
    }
}