1use crate::error::EvalResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct FormatAnalysis {
12 pub date_formats: Vec<FormatVariation>,
14 pub amount_formats: Vec<FormatVariation>,
16 pub identifier_formats: Vec<FormatVariation>,
18 pub currency_compliance: f64,
20 pub consistency_score: f64,
22 pub issues: Vec<FormatIssue>,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct FormatVariation {
29 pub field_name: String,
31 pub format_type: String,
33 pub count: usize,
35 pub percentage: f64,
37 pub examples: Vec<String>,
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct FormatIssue {
44 pub field_name: String,
46 pub issue_type: FormatIssueType,
48 pub description: String,
50 pub examples: Vec<String>,
52}
53
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
56pub enum FormatIssueType {
57 InconsistentDateFormat,
59 InconsistentAmountFormat,
61 InconsistentCase,
63 InvalidCurrencyCode,
65 InvalidDecimalPlaces,
67 InvalidSeparator,
69}
70
/// Date layouts the analyzer can tell apart.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DateFormat {
    /// Dash-separated, year first, such as `2024-01-15`.
    ISO,
    /// Slash-separated, such as `01/15/2024`.
    US,
    /// Dot-separated, such as `15.01.2024`.
    EU,
    /// Text containing a full English month name, such as `January 15, 2024`.
    Long,
    /// Anything that matches none of the layouts above.
    Unknown,
}
85
/// Monetary amount layouts the analyzer can tell apart.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AmountFormat {
    /// Bare number without grouping, such as `1234.56`.
    Plain,
    /// Comma grouping with a dot as the last separator, such as `1,234.56`.
    USComma,
    /// Dot grouping with a comma as the last separator, such as `1.234,56`.
    European,
    /// Value that starts with a currency symbol, such as `$1,234.56`.
    CurrencyPrefix,
    /// Value that ends with a currency code, such as `1,234.56 USD`.
    CurrencySuffix,
    /// Anything that matches none of the layouts above.
    Unknown,
}
102
/// Raw input for the analyzer: the values to inspect, grouped by kind.
///
/// Each map is keyed by field name and holds that field's values as strings.
#[derive(Debug, Clone, Default)]
pub struct FormatData {
    /// Date values per field name.
    pub date_fields: HashMap<String, Vec<String>>,
    /// Amount values per field name.
    pub amount_fields: HashMap<String, Vec<String>>,
    /// Identifier values per field name.
    pub identifier_fields: HashMap<String, Vec<String>>,
    /// Currency codes to validate against the analyzer's valid set.
    pub currency_codes: Vec<String>,
}
115
/// Checks string-encoded dates, amounts, identifiers and currency codes for
/// consistent formatting.
///
/// Derives `Debug` and `Clone` so the analyzer can be logged and shared by
/// value like the other public types in this module.
#[derive(Debug, Clone)]
pub struct FormatAnalyzer {
    /// Upper-case currency codes that are accepted as valid.
    valid_currencies: std::collections::HashSet<String>,
    /// Minimum share of a field's values that must use the dominant format;
    /// a field below this share that has more than one format is reported.
    min_field_consistency: f64,
}
123
124impl FormatAnalyzer {
125 pub fn new() -> Self {
127 let valid_currencies: std::collections::HashSet<String> = [
128 "USD", "EUR", "GBP", "JPY", "CHF", "CAD", "AUD", "CNY", "HKD", "SGD", "INR", "BRL",
129 "MXN", "KRW", "RUB", "ZAR", "SEK", "NOK", "DKK", "NZD", "THB", "MYR", "IDR", "PHP",
130 ]
131 .iter()
132 .map(std::string::ToString::to_string)
133 .collect();
134
135 Self {
136 valid_currencies,
137 min_field_consistency: 0.95,
138 }
139 }
140
141 pub fn analyze(&self, data: &FormatData) -> EvalResult<FormatAnalysis> {
143 let mut date_formats = Vec::new();
144 let mut amount_formats = Vec::new();
145 let mut identifier_formats = Vec::new();
146 let mut issues = Vec::new();
147 let mut consistency_scores = Vec::new();
148
149 for (field_name, values) in &data.date_fields {
151 let (formats, field_issues, consistency) = self.analyze_date_field(field_name, values);
152 date_formats.extend(formats);
153 issues.extend(field_issues);
154 consistency_scores.push(consistency);
155 }
156
157 for (field_name, values) in &data.amount_fields {
159 let (formats, field_issues, consistency) =
160 self.analyze_amount_field(field_name, values);
161 amount_formats.extend(formats);
162 issues.extend(field_issues);
163 consistency_scores.push(consistency);
164 }
165
166 for (field_name, values) in &data.identifier_fields {
168 let (formats, field_issues, consistency) =
169 self.analyze_identifier_field(field_name, values);
170 identifier_formats.extend(formats);
171 issues.extend(field_issues);
172 consistency_scores.push(consistency);
173 }
174
175 let valid_count = data
177 .currency_codes
178 .iter()
179 .filter(|c| self.valid_currencies.contains(c.to_uppercase().as_str()))
180 .count();
181 let currency_compliance = if data.currency_codes.is_empty() {
182 1.0
183 } else {
184 valid_count as f64 / data.currency_codes.len() as f64
185 };
186
187 if currency_compliance < 1.0 {
188 let invalid: Vec<_> = data
189 .currency_codes
190 .iter()
191 .filter(|c| !self.valid_currencies.contains(c.to_uppercase().as_str()))
192 .take(5)
193 .cloned()
194 .collect();
195 issues.push(FormatIssue {
196 field_name: "currency_code".to_string(),
197 issue_type: FormatIssueType::InvalidCurrencyCode,
198 description: format!(
199 "Found {} invalid currency codes",
200 data.currency_codes.len() - valid_count
201 ),
202 examples: invalid,
203 });
204 }
205
206 consistency_scores.push(currency_compliance);
207
208 let consistency_score = if consistency_scores.is_empty() {
209 1.0
210 } else {
211 consistency_scores.iter().sum::<f64>() / consistency_scores.len() as f64
212 };
213
214 Ok(FormatAnalysis {
215 date_formats,
216 amount_formats,
217 identifier_formats,
218 currency_compliance,
219 consistency_score,
220 issues,
221 })
222 }
223
224 fn analyze_date_field(
226 &self,
227 field_name: &str,
228 values: &[String],
229 ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
230 let mut format_counts: HashMap<DateFormat, Vec<String>> = HashMap::new();
231
232 for value in values {
233 let format = self.detect_date_format(value);
234 format_counts.entry(format).or_default().push(value.clone());
235 }
236
237 let total = values.len();
238 let variations: Vec<FormatVariation> = format_counts
239 .iter()
240 .map(|(format, examples)| FormatVariation {
241 field_name: field_name.to_string(),
242 format_type: format!("{format:?}"),
243 count: examples.len(),
244 percentage: if total > 0 {
245 examples.len() as f64 / total as f64
246 } else {
247 0.0
248 },
249 examples: examples.iter().take(3).cloned().collect(),
250 })
251 .collect();
252
253 let mut issues = Vec::new();
254 let dominant_count = format_counts
255 .values()
256 .map(std::vec::Vec::len)
257 .max()
258 .unwrap_or(0);
259 let consistency = if total > 0 {
260 dominant_count as f64 / total as f64
261 } else {
262 1.0
263 };
264
265 if consistency < self.min_field_consistency && format_counts.len() > 1 {
266 issues.push(FormatIssue {
267 field_name: field_name.to_string(),
268 issue_type: FormatIssueType::InconsistentDateFormat,
269 description: format!(
270 "Multiple date formats detected ({} variants)",
271 format_counts.len()
272 ),
273 examples: values.iter().take(5).cloned().collect(),
274 });
275 }
276
277 (variations, issues, consistency)
278 }
279
280 fn detect_date_format(&self, value: &str) -> DateFormat {
282 let value = value.trim();
283
284 if value.len() == 10
286 && value.chars().nth(4) == Some('-')
287 && value.chars().nth(7) == Some('-')
288 {
289 return DateFormat::ISO;
290 }
291
292 if value.len() == 10
294 && value.chars().nth(2) == Some('/')
295 && value.chars().nth(5) == Some('/')
296 {
297 return DateFormat::US;
298 }
299
300 if value.len() == 10
302 && value.chars().nth(2) == Some('.')
303 && value.chars().nth(5) == Some('.')
304 {
305 return DateFormat::EU;
306 }
307
308 if value.contains("January")
310 || value.contains("February")
311 || value.contains("March")
312 || value.contains("April")
313 || value.contains("May")
314 || value.contains("June")
315 || value.contains("July")
316 || value.contains("August")
317 || value.contains("September")
318 || value.contains("October")
319 || value.contains("November")
320 || value.contains("December")
321 {
322 return DateFormat::Long;
323 }
324
325 DateFormat::Unknown
326 }
327
328 fn analyze_amount_field(
330 &self,
331 field_name: &str,
332 values: &[String],
333 ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
334 let mut format_counts: HashMap<AmountFormat, Vec<String>> = HashMap::new();
335
336 for value in values {
337 let format = self.detect_amount_format(value);
338 format_counts.entry(format).or_default().push(value.clone());
339 }
340
341 let total = values.len();
342 let variations: Vec<FormatVariation> = format_counts
343 .iter()
344 .map(|(format, examples)| FormatVariation {
345 field_name: field_name.to_string(),
346 format_type: format!("{format:?}"),
347 count: examples.len(),
348 percentage: if total > 0 {
349 examples.len() as f64 / total as f64
350 } else {
351 0.0
352 },
353 examples: examples.iter().take(3).cloned().collect(),
354 })
355 .collect();
356
357 let mut issues = Vec::new();
358 let dominant_count = format_counts
359 .values()
360 .map(std::vec::Vec::len)
361 .max()
362 .unwrap_or(0);
363 let consistency = if total > 0 {
364 dominant_count as f64 / total as f64
365 } else {
366 1.0
367 };
368
369 if consistency < self.min_field_consistency && format_counts.len() > 1 {
370 issues.push(FormatIssue {
371 field_name: field_name.to_string(),
372 issue_type: FormatIssueType::InconsistentAmountFormat,
373 description: format!(
374 "Multiple amount formats detected ({} variants)",
375 format_counts.len()
376 ),
377 examples: values.iter().take(5).cloned().collect(),
378 });
379 }
380
381 (variations, issues, consistency)
382 }
383
384 fn detect_amount_format(&self, value: &str) -> AmountFormat {
386 let value = value.trim();
387
388 if value.starts_with('$') || value.starts_with('€') || value.starts_with('£') {
390 return AmountFormat::CurrencyPrefix;
391 }
392
393 if value.ends_with("EUR")
395 || value.ends_with("USD")
396 || value.ends_with("GBP")
397 || value.ends_with("JPY")
398 {
399 return AmountFormat::CurrencySuffix;
400 }
401
402 if value.contains('.') && value.contains(',') {
404 let dot_pos = value.rfind('.').unwrap_or(0);
405 let comma_pos = value.rfind(',').unwrap_or(0);
406 if comma_pos > dot_pos {
407 return AmountFormat::European;
408 }
409 }
410
411 if value.contains(',') && value.contains('.') {
413 return AmountFormat::USComma;
414 }
415
416 if value.contains('.') || value.chars().all(|c| c.is_ascii_digit() || c == '-') {
418 return AmountFormat::Plain;
419 }
420
421 AmountFormat::Unknown
422 }
423
424 fn analyze_identifier_field(
426 &self,
427 field_name: &str,
428 values: &[String],
429 ) -> (Vec<FormatVariation>, Vec<FormatIssue>, f64) {
430 let mut upper_count = 0;
431 let mut lower_count = 0;
432 let mut mixed_count = 0;
433
434 for value in values {
435 if value
436 .chars()
437 .filter(|c| c.is_alphabetic())
438 .all(char::is_uppercase)
439 {
440 upper_count += 1;
441 } else if value
442 .chars()
443 .filter(|c| c.is_alphabetic())
444 .all(char::is_lowercase)
445 {
446 lower_count += 1;
447 } else {
448 mixed_count += 1;
449 }
450 }
451
452 let total = values.len();
453 let mut variations = Vec::new();
454
455 if upper_count > 0 {
456 variations.push(FormatVariation {
457 field_name: field_name.to_string(),
458 format_type: "UPPERCASE".to_string(),
459 count: upper_count,
460 percentage: upper_count as f64 / total.max(1) as f64,
461 examples: values
462 .iter()
463 .filter(|v| {
464 v.chars()
465 .filter(|c| c.is_alphabetic())
466 .all(char::is_uppercase)
467 })
468 .take(3)
469 .cloned()
470 .collect(),
471 });
472 }
473
474 if lower_count > 0 {
475 variations.push(FormatVariation {
476 field_name: field_name.to_string(),
477 format_type: "lowercase".to_string(),
478 count: lower_count,
479 percentage: lower_count as f64 / total.max(1) as f64,
480 examples: values
481 .iter()
482 .filter(|v| {
483 v.chars()
484 .filter(|c| c.is_alphabetic())
485 .all(char::is_lowercase)
486 })
487 .take(3)
488 .cloned()
489 .collect(),
490 });
491 }
492
493 if mixed_count > 0 {
494 variations.push(FormatVariation {
495 field_name: field_name.to_string(),
496 format_type: "MixedCase".to_string(),
497 count: mixed_count,
498 percentage: mixed_count as f64 / total.max(1) as f64,
499 examples: values.iter().take(3).cloned().collect(),
500 });
501 }
502
503 let dominant_count = upper_count.max(lower_count).max(mixed_count);
504 let consistency = if total > 0 {
505 dominant_count as f64 / total as f64
506 } else {
507 1.0
508 };
509
510 let mut issues = Vec::new();
511 if consistency < self.min_field_consistency && variations.len() > 1 {
512 issues.push(FormatIssue {
513 field_name: field_name.to_string(),
514 issue_type: FormatIssueType::InconsistentCase,
515 description: format!(
516 "Mixed case formats detected ({} variants)",
517 variations.len()
518 ),
519 examples: values.iter().take(5).cloned().collect(),
520 });
521 }
522
523 (variations, issues, consistency)
524 }
525}
526
527impl Default for FormatAnalyzer {
528 fn default() -> Self {
529 Self::new()
530 }
531}
532
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Turns string literals into the owned values stored in `FormatData`.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_string()).collect()
    }

    /// A date field with a single format yields one variation and a high score.
    #[test]
    fn test_consistent_formats() {
        let mut data = FormatData::default();
        data.date_fields.insert(
            String::from("posting_date"),
            owned(&["2024-01-15", "2024-01-16", "2024-01-17"]),
        );

        let result = FormatAnalyzer::new().analyze(&data).unwrap();

        assert_eq!(result.date_formats.len(), 1);
        assert!(result.consistency_score > 0.95);
    }

    /// Each supported date layout is recognized.
    #[test]
    fn test_date_format_detection() {
        let analyzer = FormatAnalyzer::new();
        let cases = [
            ("2024-01-15", DateFormat::ISO),
            ("01/15/2024", DateFormat::US),
            ("15.01.2024", DateFormat::EU),
            ("January 15, 2024", DateFormat::Long),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(
                analyzer.detect_date_format(input),
                *expected,
                "input: {}",
                input
            );
        }
    }

    /// One invalid code out of three lowers compliance below 1.0 but not below 0.5.
    #[test]
    fn test_currency_compliance() {
        let data = FormatData {
            currency_codes: owned(&["USD", "EUR", "INVALID"]),
            ..FormatData::default()
        };

        let result = FormatAnalyzer::new().analyze(&data).unwrap();

        assert!(result.currency_compliance < 1.0);
        assert!(result.currency_compliance > 0.5);
    }
}