1use chrono::{Datelike, NaiveDate};
7use datasynth_core::utils::seeded_rng;
8use rand::Rng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use rust_decimal_macros::dec;
12use serde::{Deserialize, Serialize};
13
14use datasynth_core::models::{
15 FalsePositiveTrigger, LegitimatePatternType, NearMissLabel, NearMissPattern,
16};
17
/// Settings for [`NearMissGenerator`].
///
/// A "near miss" is a transaction that looks suspicious but is labelled as
/// legitimate; these options control how often and in which shapes such labels
/// are produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearMissConfig {
    /// Fraction of checked transactions considered for labelling. It is compared
    /// against a uniform `f64` draw, so `1.0` considers every transaction.
    pub proportion: f64,
    /// Enables the near-duplicate pattern (similar transaction a few days apart).
    pub near_duplicate_enabled: bool,
    /// Enables the threshold-proximity pattern (amount close to a threshold).
    pub threshold_proximity_enabled: bool,
    /// Enables the unusual-but-legitimate pattern (seasonal or one-off payments).
    pub unusual_legitimate_enabled: bool,
    /// Toggle for corrected-error labels.
    /// NOTE(review): no code visible in this file reads this flag — confirm usage.
    pub corrected_errors_enabled: bool,
    /// Inclusive `(min, max)` gap in days accepted for a near-duplicate.
    pub near_duplicate_days: (u32, u32),
    /// Inclusive `(min, max)` amount/threshold ratio treated as "near" a threshold.
    pub proximity_range: (f64, f64),
    /// `(min, max)` correction lag in days.
    /// NOTE(review): no code visible in this file reads this range — confirm usage.
    pub correction_lag_days: (u32, u32),
    /// Seed passed to `seeded_rng` when the generator is created or reset.
    pub seed: u64,
}
40
41impl Default for NearMissConfig {
42 fn default() -> Self {
43 Self {
44 proportion: 0.30,
45 near_duplicate_enabled: true,
46 threshold_proximity_enabled: true,
47 unusual_legitimate_enabled: true,
48 corrected_errors_enabled: true,
49 near_duplicate_days: (1, 3),
50 proximity_range: (0.90, 0.99),
51 correction_lag_days: (1, 5),
52 seed: 42,
53 }
54 }
55}
56
/// Produces [`NearMissLabel`]s for transactions that resemble anomalies but
/// are legitimate, and keeps every label it has issued.
pub struct NearMissGenerator {
    /// Settings controlling which patterns are produced and how often.
    config: NearMissConfig,
    /// Random source, created from `config.seed` via `seeded_rng`.
    rng: ChaCha8Rng,
    /// Every label issued since construction or the last `reset`.
    labels: Vec<NearMissLabel>,
    /// Window of recently recorded transactions, oldest first; searched when
    /// looking for near-duplicates.
    recent_transactions: Vec<RecentTransaction>,
    /// Maximum number of entries retained in `recent_transactions`.
    max_recent: usize,
}
68
/// Snapshot of a recorded transaction, holding only the attributes that the
/// near-duplicate search compares.
#[derive(Debug, Clone)]
struct RecentTransaction {
    /// Identifier of the source document; reported as the "similar" transaction.
    document_id: String,
    /// Transaction date, used to compute the day gap.
    date: NaiveDate,
    /// Transaction amount, compared within a relative tolerance.
    amount: Decimal,
    /// Account identifier; must match exactly.
    account: String,
    /// Counterparty, if known; only compared when both sides have one.
    counterparty: Option<String>,
}
78
79impl NearMissGenerator {
80 pub fn new(config: NearMissConfig) -> Self {
82 let rng = seeded_rng(config.seed, 0);
83 Self {
84 config,
85 rng,
86 labels: Vec::new(),
87 recent_transactions: Vec::new(),
88 max_recent: 100,
89 }
90 }
91
92 pub fn record_transaction(
94 &mut self,
95 document_id: impl Into<String>,
96 date: NaiveDate,
97 amount: Decimal,
98 account: impl Into<String>,
99 counterparty: Option<String>,
100 ) {
101 let tx = RecentTransaction {
102 document_id: document_id.into(),
103 date,
104 amount,
105 account: account.into(),
106 counterparty,
107 };
108
109 self.recent_transactions.push(tx);
110
111 if self.recent_transactions.len() > self.max_recent {
113 self.recent_transactions.remove(0);
114 }
115 }
116
117 pub fn check_near_miss(
119 &mut self,
120 document_id: impl Into<String>,
121 date: NaiveDate,
122 amount: Decimal,
123 account: impl Into<String>,
124 counterparty: Option<String>,
125 thresholds: &[Decimal],
126 ) -> Option<NearMissLabel> {
127 if self.rng.random::<f64>() >= self.config.proportion {
129 return None;
130 }
131
132 let doc_id = document_id.into();
133 let acct = account.into();
134
135 let patterns = self.get_applicable_patterns(date, amount, &acct, &counterparty, thresholds);
137
138 if patterns.is_empty() {
139 return None;
140 }
141
142 let idx = self.rng.random_range(0..patterns.len());
144 let (pattern, trigger, explanation) =
145 patterns.into_iter().nth(idx).expect("idx < patterns.len()");
146
147 let suspicion_score = match &pattern {
149 NearMissPattern::NearDuplicate { .. } => 0.70,
150 NearMissPattern::ThresholdProximity { proximity, .. } => 0.50 + proximity * 0.4,
151 NearMissPattern::UnusualLegitimate { .. } => 0.55,
152 NearMissPattern::CorrectedError { .. } => 0.60,
153 };
154
155 let label = NearMissLabel::new(doc_id, pattern, suspicion_score, trigger, explanation);
156
157 self.labels.push(label.clone());
158 Some(label)
159 }
160
161 fn get_applicable_patterns(
163 &mut self,
164 date: NaiveDate,
165 amount: Decimal,
166 account: &str,
167 counterparty: &Option<String>,
168 thresholds: &[Decimal],
169 ) -> Vec<(NearMissPattern, FalsePositiveTrigger, String)> {
170 let mut patterns = Vec::new();
171
172 if self.config.near_duplicate_enabled {
174 if let Some(similar) =
175 self.find_similar_transaction(date, amount, account, counterparty)
176 {
177 let days_diff = (date - similar.date).num_days().unsigned_abs() as u32;
178 if days_diff >= self.config.near_duplicate_days.0
179 && days_diff <= self.config.near_duplicate_days.1
180 {
181 patterns.push((
182 NearMissPattern::NearDuplicate {
183 date_difference_days: days_diff,
184 similar_transaction_id: similar.document_id.clone(),
185 },
186 FalsePositiveTrigger::SimilarTransaction,
187 format!(
188 "Similar transaction {} days apart - different business event",
189 days_diff
190 ),
191 ));
192 }
193 }
194 }
195
196 if self.config.threshold_proximity_enabled {
198 for threshold in thresholds {
199 let proximity = self.calculate_proximity(amount, *threshold);
200 if proximity >= self.config.proximity_range.0
201 && proximity <= self.config.proximity_range.1
202 {
203 patterns.push((
204 NearMissPattern::ThresholdProximity {
205 threshold: *threshold,
206 proximity,
207 },
208 FalsePositiveTrigger::AmountNearThreshold,
209 format!(
210 "Amount is {:.1}% of threshold {} - coincidental",
211 proximity * 100.0,
212 threshold
213 ),
214 ));
215 }
216 }
217 }
218
219 if self.config.unusual_legitimate_enabled {
221 if let Some((pattern_type, justification)) =
222 self.check_unusual_legitimate(date, amount, account)
223 {
224 patterns.push((
225 NearMissPattern::UnusualLegitimate {
226 pattern_type,
227 justification: justification.clone(),
228 },
229 FalsePositiveTrigger::UnusualTiming,
230 justification,
231 ));
232 }
233 }
234
235 patterns
236 }
237
238 fn find_similar_transaction(
240 &self,
241 date: NaiveDate,
242 amount: Decimal,
243 account: &str,
244 counterparty: &Option<String>,
245 ) -> Option<&RecentTransaction> {
246 self.recent_transactions.iter().find(|tx| {
247 let amount_diff = (tx.amount - amount).abs();
249 let amount_similar = amount_diff <= tx.amount * dec!(0.05);
250
251 let account_match = tx.account == account;
253
254 let counterparty_match = match (&tx.counterparty, counterparty) {
256 (Some(a), Some(b)) => a == b,
257 _ => true, };
259
260 let days_diff = (date - tx.date).num_days().abs();
262 let date_in_range =
263 days_diff > 0 && days_diff <= self.config.near_duplicate_days.1 as i64;
264
265 amount_similar && account_match && counterparty_match && date_in_range
266 })
267 }
268
269 fn calculate_proximity(&self, amount: Decimal, threshold: Decimal) -> f64 {
271 if threshold == Decimal::ZERO {
272 return 0.0;
273 }
274 let amount_f64: f64 = amount.try_into().unwrap_or(0.0);
275 let threshold_f64: f64 = threshold.try_into().unwrap_or(1.0);
276 (amount_f64 / threshold_f64).min(1.0)
277 }
278
279 fn check_unusual_legitimate(
281 &mut self,
282 date: NaiveDate,
283 amount: Decimal,
284 _account: &str,
285 ) -> Option<(LegitimatePatternType, String)> {
286 if date.month() == 12 && amount >= dec!(10000) && self.rng.random::<f64>() < 0.3 {
288 return Some((
289 LegitimatePatternType::YearEndBonus,
290 "Year-end bonus payment per compensation plan".to_string(),
291 ));
292 }
293
294 if date.month() <= 3 && amount >= dec!(5000) && self.rng.random::<f64>() < 0.2 {
296 return Some((
297 LegitimatePatternType::ContractPrepayment,
298 "Annual contract prepayment per terms".to_string(),
299 ));
300 }
301
302 if date.month() >= 10 && amount >= dec!(25000) && self.rng.random::<f64>() < 0.2 {
304 return Some((
305 LegitimatePatternType::PromotionalSpending,
306 "Holiday promotional campaign spending".to_string(),
307 ));
308 }
309
310 if date.month() >= 8
312 && date.month() <= 11
313 && amount >= dec!(50000)
314 && self.rng.random::<f64>() < 0.15
315 {
316 return Some((
317 LegitimatePatternType::SeasonalInventory,
318 "Seasonal inventory buildup for holiday sales".to_string(),
319 ));
320 }
321
322 if amount >= dec!(100000) && self.rng.random::<f64>() < 0.1 {
324 return Some((
325 LegitimatePatternType::OneTimePayment,
326 "One-time strategic vendor payment".to_string(),
327 ));
328 }
329
330 None
331 }
332
333 pub fn create_corrected_error(
335 &mut self,
336 document_id: impl Into<String>,
337 original_error_id: impl Into<String>,
338 correction_lag_days: u32,
339 ) -> NearMissLabel {
340 let pattern = NearMissPattern::CorrectedError {
341 correction_lag_days,
342 correction_document_id: original_error_id.into(),
343 };
344
345 let label = NearMissLabel::new(
346 document_id,
347 pattern,
348 0.60,
349 FalsePositiveTrigger::SimilarTransaction,
350 format!(
351 "Error caught and corrected within {} days",
352 correction_lag_days
353 ),
354 );
355
356 self.labels.push(label.clone());
357 label
358 }
359
360 pub fn get_labels(&self) -> &[NearMissLabel] {
362 &self.labels
363 }
364
365 pub fn reset(&mut self) {
367 self.labels.clear();
368 self.recent_transactions.clear();
369 self.rng = seeded_rng(self.config.seed, 0);
370 }
371
372 pub fn get_statistics(&self) -> NearMissStatistics {
374 let mut by_pattern = std::collections::HashMap::new();
375 let mut by_trigger = std::collections::HashMap::new();
376
377 for label in &self.labels {
378 let pattern_name = match &label.pattern {
379 NearMissPattern::NearDuplicate { .. } => "near_duplicate",
380 NearMissPattern::ThresholdProximity { .. } => "threshold_proximity",
381 NearMissPattern::UnusualLegitimate { .. } => "unusual_legitimate",
382 NearMissPattern::CorrectedError { .. } => "corrected_error",
383 };
384
385 *by_pattern.entry(pattern_name.to_string()).or_insert(0) += 1;
386
387 let trigger_name = match label.false_positive_trigger {
388 FalsePositiveTrigger::AmountNearThreshold => "amount_near_threshold",
389 FalsePositiveTrigger::UnusualTiming => "unusual_timing",
390 FalsePositiveTrigger::SimilarTransaction => "similar_transaction",
391 FalsePositiveTrigger::NewCounterparty => "new_counterparty",
392 FalsePositiveTrigger::UnusualAccountCombination => "unusual_account",
393 FalsePositiveTrigger::VolumeSpike => "volume_spike",
394 FalsePositiveTrigger::RoundAmount => "round_amount",
395 };
396
397 *by_trigger.entry(trigger_name.to_string()).or_insert(0) += 1;
398 }
399
400 let avg_suspicion = if self.labels.is_empty() {
401 0.0
402 } else {
403 self.labels.iter().map(|l| l.suspicion_score).sum::<f64>() / self.labels.len() as f64
404 };
405
406 NearMissStatistics {
407 total_count: self.labels.len(),
408 by_pattern,
409 by_trigger,
410 average_suspicion_score: avg_suspicion,
411 }
412 }
413}
414
/// Aggregate figures over the labels issued by a [`NearMissGenerator`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearMissStatistics {
    /// Number of labels issued.
    pub total_count: usize,
    /// Label count keyed by pattern name (for example `"near_duplicate"`).
    pub by_pattern: std::collections::HashMap<String, usize>,
    /// Label count keyed by trigger name (for example `"unusual_timing"`).
    pub by_trigger: std::collections::HashMap<String, usize>,
    /// Mean suspicion score over all labels; `0.0` when there are none.
    pub average_suspicion_score: f64,
}
427
#[cfg(test)]
// Tests unwrap freely: a failed unwrap is a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Shorthand for building a valid calendar date.
    fn date(year: i32, month: u32, day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(year, month, day).unwrap()
    }

    /// Generator that considers every checked transaction.
    fn always_on_generator() -> NearMissGenerator {
        NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            ..Default::default()
        })
    }

    #[test]
    fn test_near_miss_config() {
        let config = NearMissConfig::default();
        assert!((config.proportion - 0.30).abs() < 0.01);
        assert!(config.near_duplicate_enabled);
    }

    #[test]
    fn test_near_miss_generator_creation() {
        let generator = NearMissGenerator::new(NearMissConfig::default());
        assert!(generator.labels.is_empty());
    }

    #[test]
    fn test_record_transaction() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction(
            "JE001",
            date(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        assert_eq!(generator.recent_transactions.len(), 1);
    }

    #[test]
    fn test_record_transaction_caps_window() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        for i in 0..105 {
            generator.record_transaction(
                format!("JE{i}"),
                date(2024, 6, 15),
                dec!(100),
                "5000",
                None,
            );
        }

        // The five oldest entries were evicted, so the window starts at JE5.
        assert_eq!(generator.recent_transactions.len(), 100);
        assert_eq!(generator.recent_transactions[0].document_id, "JE5");
    }

    #[test]
    fn test_threshold_proximity() {
        let mut generator = always_on_generator();
        let thresholds = [dec!(10000), dec!(50000)];

        // 9500 is 95% of the 10000 threshold and far below 50000. A June date
        // with this amount matches no unusual-legitimate rule and nothing has
        // been recorded, so threshold proximity is the only candidate and the
        // outcome does not depend on the random draws.
        let label = generator
            .check_near_miss(
                "JE001",
                date(2024, 6, 15),
                dec!(9500),
                "5000",
                None,
                &thresholds,
            )
            .unwrap();

        assert!(matches!(
            label.pattern,
            NearMissPattern::ThresholdProximity { .. }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::AmountNearThreshold
        );
        assert_eq!(generator.labels.len(), 1);
    }

    #[test]
    fn test_near_duplicate() {
        let mut generator = always_on_generator();

        generator.record_transaction(
            "JE001",
            date(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        // One day later, same account and vendor, amount within 5%: the only
        // applicable pattern is the near-duplicate.
        let label = generator
            .check_near_miss(
                "JE002",
                date(2024, 6, 16),
                dec!(10200),
                "5000",
                Some("VENDOR001".to_string()),
                &[],
            )
            .unwrap();

        assert!(matches!(
            label.pattern,
            NearMissPattern::NearDuplicate {
                date_difference_days: 1,
                ..
            }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::SimilarTransaction
        );
    }

    #[test]
    fn test_corrected_error() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        let label = generator.create_corrected_error("JE002", "JE001", 3);

        assert!(matches!(
            label.pattern,
            NearMissPattern::CorrectedError {
                correction_lag_days: 3,
                ..
            }
        ));
        assert_eq!(generator.labels.len(), 1);
    }

    #[test]
    fn test_statistics() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.create_corrected_error("JE001", "JE000", 2);
        generator.create_corrected_error("JE002", "JE000", 3);

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 2);
        assert!(stats.by_pattern.contains_key("corrected_error"));
    }
}