1use chrono::{Datelike, NaiveDate};
7use datasynth_core::utils::seeded_rng;
8use rand::Rng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use rust_decimal_macros::dec;
12use serde::{Deserialize, Serialize};
13
14use datasynth_core::models::{
15 FalsePositiveTrigger, LegitimatePatternType, NearMissLabel, NearMissPattern,
16};
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct NearMissConfig {
21 pub proportion: f64,
23 pub near_duplicate_enabled: bool,
25 pub threshold_proximity_enabled: bool,
27 pub unusual_legitimate_enabled: bool,
29 pub corrected_errors_enabled: bool,
31 pub near_duplicate_days: (u32, u32),
33 pub proximity_range: (f64, f64),
35 pub correction_lag_days: (u32, u32),
37 pub seed: u64,
39}
40
41impl Default for NearMissConfig {
42 fn default() -> Self {
43 Self {
44 proportion: 0.30,
45 near_duplicate_enabled: true,
46 threshold_proximity_enabled: true,
47 unusual_legitimate_enabled: true,
48 corrected_errors_enabled: true,
49 near_duplicate_days: (1, 3),
50 proximity_range: (0.90, 0.99),
51 correction_lag_days: (1, 5),
52 seed: 42,
53 }
54 }
55}
56
57pub struct NearMissGenerator {
59 config: NearMissConfig,
60 rng: ChaCha8Rng,
61 labels: Vec<NearMissLabel>,
63 recent_transactions: Vec<RecentTransaction>,
65 max_recent: usize,
67}
68
69#[derive(Debug, Clone)]
71struct RecentTransaction {
72 document_id: String,
73 date: NaiveDate,
74 amount: Decimal,
75 account: String,
76 counterparty: Option<String>,
77}
78
79impl NearMissGenerator {
80 pub fn new(config: NearMissConfig) -> Self {
82 let rng = seeded_rng(config.seed, 0);
83 Self {
84 config,
85 rng,
86 labels: Vec::new(),
87 recent_transactions: Vec::new(),
88 max_recent: 100,
89 }
90 }
91
92 pub fn record_transaction(
94 &mut self,
95 document_id: impl Into<String>,
96 date: NaiveDate,
97 amount: Decimal,
98 account: impl Into<String>,
99 counterparty: Option<String>,
100 ) {
101 let tx = RecentTransaction {
102 document_id: document_id.into(),
103 date,
104 amount,
105 account: account.into(),
106 counterparty,
107 };
108
109 self.recent_transactions.push(tx);
110
111 if self.recent_transactions.len() > self.max_recent {
113 self.recent_transactions.remove(0);
114 }
115 }
116
117 pub fn check_near_miss(
119 &mut self,
120 document_id: impl Into<String>,
121 date: NaiveDate,
122 amount: Decimal,
123 account: impl Into<String>,
124 counterparty: Option<String>,
125 thresholds: &[Decimal],
126 ) -> Option<NearMissLabel> {
127 if self.rng.random::<f64>() >= self.config.proportion {
129 return None;
130 }
131
132 let doc_id = document_id.into();
133 let acct = account.into();
134
135 let patterns = self.get_applicable_patterns(date, amount, &acct, &counterparty, thresholds);
137
138 if patterns.is_empty() {
139 return None;
140 }
141
142 let idx = self.rng.random_range(0..patterns.len());
144 let (pattern, trigger, explanation) =
145 patterns.into_iter().nth(idx).expect("idx < patterns.len()");
146
147 let suspicion_score = match &pattern {
149 NearMissPattern::NearDuplicate { .. } => 0.70,
150 NearMissPattern::ThresholdProximity { proximity, .. } => 0.50 + proximity * 0.4,
151 NearMissPattern::UnusualLegitimate { .. } => 0.55,
152 NearMissPattern::CorrectedError { .. } => 0.60,
153 };
154
155 let label = NearMissLabel::new(doc_id, pattern, suspicion_score, trigger, explanation);
156
157 self.labels.push(label.clone());
158 Some(label)
159 }
160
161 fn get_applicable_patterns(
163 &mut self,
164 date: NaiveDate,
165 amount: Decimal,
166 account: &str,
167 counterparty: &Option<String>,
168 thresholds: &[Decimal],
169 ) -> Vec<(NearMissPattern, FalsePositiveTrigger, String)> {
170 let mut patterns = Vec::new();
171
172 if self.config.near_duplicate_enabled {
174 if let Some(similar) =
175 self.find_similar_transaction(date, amount, account, counterparty)
176 {
177 let days_diff = (date - similar.date).num_days().unsigned_abs() as u32;
178 if days_diff >= self.config.near_duplicate_days.0
179 && days_diff <= self.config.near_duplicate_days.1
180 {
181 patterns.push((
182 NearMissPattern::NearDuplicate {
183 date_difference_days: days_diff,
184 similar_transaction_id: similar.document_id.clone(),
185 },
186 FalsePositiveTrigger::SimilarTransaction,
187 format!(
188 "Similar transaction {days_diff} days apart - different business event"
189 ),
190 ));
191 }
192 }
193 }
194
195 if self.config.threshold_proximity_enabled {
197 for threshold in thresholds {
198 let proximity = self.calculate_proximity(amount, *threshold);
199 if proximity >= self.config.proximity_range.0
200 && proximity <= self.config.proximity_range.1
201 {
202 patterns.push((
203 NearMissPattern::ThresholdProximity {
204 threshold: *threshold,
205 proximity,
206 },
207 FalsePositiveTrigger::AmountNearThreshold,
208 format!(
209 "Amount is {:.1}% of threshold {} - coincidental",
210 proximity * 100.0,
211 threshold
212 ),
213 ));
214 }
215 }
216 }
217
218 if self.config.unusual_legitimate_enabled {
220 if let Some((pattern_type, justification)) =
221 self.check_unusual_legitimate(date, amount, account)
222 {
223 patterns.push((
224 NearMissPattern::UnusualLegitimate {
225 pattern_type,
226 justification: justification.clone(),
227 },
228 FalsePositiveTrigger::UnusualTiming,
229 justification,
230 ));
231 }
232 }
233
234 patterns
235 }
236
237 fn find_similar_transaction(
239 &self,
240 date: NaiveDate,
241 amount: Decimal,
242 account: &str,
243 counterparty: &Option<String>,
244 ) -> Option<&RecentTransaction> {
245 self.recent_transactions.iter().find(|tx| {
246 let amount_diff = (tx.amount - amount).abs();
248 let amount_similar = amount_diff <= tx.amount * dec!(0.05);
249
250 let account_match = tx.account == account;
252
253 let counterparty_match = match (&tx.counterparty, counterparty) {
255 (Some(a), Some(b)) => a == b,
256 _ => true, };
258
259 let days_diff = (date - tx.date).num_days().abs();
261 let date_in_range =
262 days_diff > 0 && days_diff <= self.config.near_duplicate_days.1 as i64;
263
264 amount_similar && account_match && counterparty_match && date_in_range
265 })
266 }
267
268 fn calculate_proximity(&self, amount: Decimal, threshold: Decimal) -> f64 {
270 if threshold == Decimal::ZERO {
271 return 0.0;
272 }
273 let amount_f64: f64 = amount.try_into().unwrap_or(0.0);
274 let threshold_f64: f64 = threshold.try_into().unwrap_or(1.0);
275 (amount_f64 / threshold_f64).min(1.0)
276 }
277
278 fn check_unusual_legitimate(
280 &mut self,
281 date: NaiveDate,
282 amount: Decimal,
283 _account: &str,
284 ) -> Option<(LegitimatePatternType, String)> {
285 if date.month() == 12 && amount >= dec!(10000) && self.rng.random::<f64>() < 0.3 {
287 return Some((
288 LegitimatePatternType::YearEndBonus,
289 "Year-end bonus payment per compensation plan".to_string(),
290 ));
291 }
292
293 if date.month() <= 3 && amount >= dec!(5000) && self.rng.random::<f64>() < 0.2 {
295 return Some((
296 LegitimatePatternType::ContractPrepayment,
297 "Annual contract prepayment per terms".to_string(),
298 ));
299 }
300
301 if date.month() >= 10 && amount >= dec!(25000) && self.rng.random::<f64>() < 0.2 {
303 return Some((
304 LegitimatePatternType::PromotionalSpending,
305 "Holiday promotional campaign spending".to_string(),
306 ));
307 }
308
309 if date.month() >= 8
311 && date.month() <= 11
312 && amount >= dec!(50000)
313 && self.rng.random::<f64>() < 0.15
314 {
315 return Some((
316 LegitimatePatternType::SeasonalInventory,
317 "Seasonal inventory buildup for holiday sales".to_string(),
318 ));
319 }
320
321 if amount >= dec!(100000) && self.rng.random::<f64>() < 0.1 {
323 return Some((
324 LegitimatePatternType::OneTimePayment,
325 "One-time strategic vendor payment".to_string(),
326 ));
327 }
328
329 None
330 }
331
332 pub fn create_corrected_error(
334 &mut self,
335 document_id: impl Into<String>,
336 original_error_id: impl Into<String>,
337 correction_lag_days: u32,
338 ) -> NearMissLabel {
339 let pattern = NearMissPattern::CorrectedError {
340 correction_lag_days,
341 correction_document_id: original_error_id.into(),
342 };
343
344 let label = NearMissLabel::new(
345 document_id,
346 pattern,
347 0.60,
348 FalsePositiveTrigger::SimilarTransaction,
349 format!("Error caught and corrected within {correction_lag_days} days"),
350 );
351
352 self.labels.push(label.clone());
353 label
354 }
355
356 pub fn get_labels(&self) -> &[NearMissLabel] {
358 &self.labels
359 }
360
361 pub fn reset(&mut self) {
363 self.labels.clear();
364 self.recent_transactions.clear();
365 self.rng = seeded_rng(self.config.seed, 0);
366 }
367
368 pub fn get_statistics(&self) -> NearMissStatistics {
370 let mut by_pattern = std::collections::HashMap::new();
371 let mut by_trigger = std::collections::HashMap::new();
372
373 for label in &self.labels {
374 let pattern_name = match &label.pattern {
375 NearMissPattern::NearDuplicate { .. } => "near_duplicate",
376 NearMissPattern::ThresholdProximity { .. } => "threshold_proximity",
377 NearMissPattern::UnusualLegitimate { .. } => "unusual_legitimate",
378 NearMissPattern::CorrectedError { .. } => "corrected_error",
379 };
380
381 *by_pattern.entry(pattern_name.to_string()).or_insert(0) += 1;
382
383 let trigger_name = match label.false_positive_trigger {
384 FalsePositiveTrigger::AmountNearThreshold => "amount_near_threshold",
385 FalsePositiveTrigger::UnusualTiming => "unusual_timing",
386 FalsePositiveTrigger::SimilarTransaction => "similar_transaction",
387 FalsePositiveTrigger::NewCounterparty => "new_counterparty",
388 FalsePositiveTrigger::UnusualAccountCombination => "unusual_account",
389 FalsePositiveTrigger::VolumeSpike => "volume_spike",
390 FalsePositiveTrigger::RoundAmount => "round_amount",
391 };
392
393 *by_trigger.entry(trigger_name.to_string()).or_insert(0) += 1;
394 }
395
396 let avg_suspicion = if self.labels.is_empty() {
397 0.0
398 } else {
399 self.labels.iter().map(|l| l.suspicion_score).sum::<f64>() / self.labels.len() as f64
400 };
401
402 NearMissStatistics {
403 total_count: self.labels.len(),
404 by_pattern,
405 by_trigger,
406 average_suspicion_score: avg_suspicion,
407 }
408 }
409}
410
411#[derive(Debug, Clone, Serialize, Deserialize)]
413pub struct NearMissStatistics {
414 pub total_count: usize,
416 pub by_pattern: std::collections::HashMap<String, usize>,
418 pub by_trigger: std::collections::HashMap<String, usize>,
420 pub average_suspicion_score: f64,
422}
423
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// A mid-year date that falls outside every month window used by the
    /// unusual-but-legitimate rules, so those rules never fire for it.
    fn mid_june(day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(2024, 6, day).unwrap()
    }

    #[test]
    fn test_near_miss_config() {
        let config = NearMissConfig::default();
        assert!((config.proportion - 0.30).abs() < 0.01);
        assert!(config.near_duplicate_enabled);
    }

    #[test]
    fn test_near_miss_generator_creation() {
        let generator = NearMissGenerator::new(NearMissConfig::default());
        assert!(generator.labels.is_empty());
        assert!(generator.recent_transactions.is_empty());
    }

    #[test]
    fn test_record_transaction() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction(
            "JE001",
            mid_june(15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        assert_eq!(generator.recent_transactions.len(), 1);
    }

    #[test]
    fn test_record_transaction_evicts_oldest() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());
        let cap = generator.max_recent;

        for i in 0..cap + 5 {
            generator.record_transaction(format!("JE{i}"), mid_june(15), dec!(100), "5000", None);
        }

        // The window never grows past its cap and drops entries oldest-first.
        assert_eq!(generator.recent_transactions.len(), cap);
        assert_eq!(generator.recent_transactions[0].document_id, "JE5");
    }

    #[test]
    fn test_zero_proportion_never_labels() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 0.0,
            ..Default::default()
        });

        let label = generator.check_near_miss(
            "JE001",
            mid_june(15),
            dec!(9500),
            "5000",
            None,
            &[dec!(10000)],
        );

        assert!(label.is_none());
        assert!(generator.get_labels().is_empty());
    }

    #[test]
    fn test_threshold_proximity() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            threshold_proximity_enabled: true,
            ..Default::default()
        });

        let thresholds = [dec!(10000), dec!(50000)];

        // 9500 is 95% of the first threshold and nothing else applies, so a
        // threshold-proximity label is the only possible outcome.
        let label = generator
            .check_near_miss("JE001", mid_june(15), dec!(9500), "5000", None, &thresholds)
            .expect("proportion 1.0 with an amount at 95% of a threshold must yield a label");

        assert!(matches!(
            label.pattern,
            NearMissPattern::ThresholdProximity { .. }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::AmountNearThreshold
        );
        assert_eq!(generator.get_labels().len(), 1);
    }

    #[test]
    fn test_near_duplicate() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            ..Default::default()
        });

        generator.record_transaction(
            "JE001",
            mid_june(15),
            dec!(1000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        let label = generator
            .check_near_miss(
                "JE002",
                mid_june(17),
                dec!(1000),
                "5000",
                Some("VENDOR001".to_string()),
                &[],
            )
            .expect("an identical transaction two days later must be labelled");

        match &label.pattern {
            NearMissPattern::NearDuplicate {
                date_difference_days,
                similar_transaction_id,
                ..
            } => {
                assert_eq!(*date_difference_days, 2);
                assert_eq!(similar_transaction_id.as_str(), "JE001");
            }
            _ => panic!("expected a NearDuplicate pattern"),
        }
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::SimilarTransaction
        );
    }

    #[test]
    fn test_corrected_error() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        let label = generator.create_corrected_error("JE002", "JE001", 3);

        assert!(matches!(
            label.pattern,
            NearMissPattern::CorrectedError {
                correction_lag_days: 3,
                ..
            }
        ));
        assert_eq!(generator.labels.len(), 1);
    }

    #[test]
    fn test_statistics() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.create_corrected_error("JE001", "JE000", 2);
        generator.create_corrected_error("JE002", "JE000", 3);

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 2);
        assert_eq!(stats.by_pattern.get("corrected_error"), Some(&2));
        assert_eq!(stats.by_trigger.get("similar_transaction"), Some(&2));
    }
}