1use chrono::{Datelike, NaiveDate};
7use rand::Rng;
8use rand::SeedableRng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use rust_decimal_macros::dec;
12use serde::{Deserialize, Serialize};
13
14use datasynth_core::models::{
15 FalsePositiveTrigger, LegitimatePatternType, NearMissLabel, NearMissPattern,
16};
17
/// Settings that control how `NearMissGenerator` produces near-miss labels.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearMissConfig {
    /// Probability, per `check_near_miss` call, that the transaction is considered
    /// for a label at all (a uniform draw `>= proportion` short-circuits to `None`).
    pub proportion: f64,
    /// Enables the near-duplicate pattern (similar recent transaction a few days apart).
    pub near_duplicate_enabled: bool,
    /// Enables the threshold-proximity pattern (amount just below a supplied threshold).
    pub threshold_proximity_enabled: bool,
    /// Enables the unusual-but-legitimate pattern (seasonal / one-off payments).
    pub unusual_legitimate_enabled: bool,
    /// Flag for the corrected-error pattern.
    /// NOTE(review): not read by any code visible in this file — presumably
    /// consulted by the caller before `create_corrected_error`; confirm.
    pub corrected_errors_enabled: bool,
    /// Inclusive `(min, max)` gap in days for two transactions to count as near duplicates.
    pub near_duplicate_days: (u32, u32),
    /// Inclusive `(min, max)` amount-to-threshold ratio that counts as "near" a threshold.
    pub proximity_range: (f64, f64),
    /// `(min, max)` correction lag in days.
    /// NOTE(review): not read by any code visible in this file — confirm where it is used.
    pub correction_lag_days: (u32, u32),
    /// Seed for the generator's ChaCha8 random number generator.
    pub seed: u64,
}
40
41impl Default for NearMissConfig {
42 fn default() -> Self {
43 Self {
44 proportion: 0.30,
45 near_duplicate_enabled: true,
46 threshold_proximity_enabled: true,
47 unusual_legitimate_enabled: true,
48 corrected_errors_enabled: true,
49 near_duplicate_days: (1, 3),
50 proximity_range: (0.90, 0.99),
51 correction_lag_days: (1, 5),
52 seed: 42,
53 }
54 }
55}
56
/// Stateful generator that labels transactions as near misses (suspicious-looking
/// but legitimate) and keeps every label it has produced.
pub struct NearMissGenerator {
    /// Settings controlling which patterns are active and how they are tuned.
    config: NearMissConfig,
    /// ChaCha8 random number generator seeded from `config.seed`.
    rng: ChaCha8Rng,
    /// Every label produced so far by `check_near_miss` and `create_corrected_error`.
    labels: Vec<NearMissLabel>,
    /// Rolling window of recorded transactions, oldest first, searched for near duplicates.
    recent_transactions: Vec<RecentTransaction>,
    /// Maximum number of entries kept in `recent_transactions`.
    max_recent: usize,
}
68
/// Snapshot of a previously recorded transaction, kept for near-duplicate detection.
#[derive(Debug, Clone)]
struct RecentTransaction {
    /// Identifier of the recorded document.
    document_id: String,
    /// Date of the recorded transaction.
    date: NaiveDate,
    /// Amount of the recorded transaction.
    amount: Decimal,
    /// Account the transaction was recorded against.
    account: String,
    /// Counterparty, when one was supplied.
    counterparty: Option<String>,
}
78
79impl NearMissGenerator {
80 pub fn new(config: NearMissConfig) -> Self {
82 let rng = ChaCha8Rng::seed_from_u64(config.seed);
83 Self {
84 config,
85 rng,
86 labels: Vec::new(),
87 recent_transactions: Vec::new(),
88 max_recent: 100,
89 }
90 }
91
92 pub fn record_transaction(
94 &mut self,
95 document_id: impl Into<String>,
96 date: NaiveDate,
97 amount: Decimal,
98 account: impl Into<String>,
99 counterparty: Option<String>,
100 ) {
101 let tx = RecentTransaction {
102 document_id: document_id.into(),
103 date,
104 amount,
105 account: account.into(),
106 counterparty,
107 };
108
109 self.recent_transactions.push(tx);
110
111 if self.recent_transactions.len() > self.max_recent {
113 self.recent_transactions.remove(0);
114 }
115 }
116
117 pub fn check_near_miss(
119 &mut self,
120 document_id: impl Into<String>,
121 date: NaiveDate,
122 amount: Decimal,
123 account: impl Into<String>,
124 counterparty: Option<String>,
125 thresholds: &[Decimal],
126 ) -> Option<NearMissLabel> {
127 if self.rng.gen::<f64>() >= self.config.proportion {
129 return None;
130 }
131
132 let doc_id = document_id.into();
133 let acct = account.into();
134
135 let patterns = self.get_applicable_patterns(date, amount, &acct, &counterparty, thresholds);
137
138 if patterns.is_empty() {
139 return None;
140 }
141
142 let idx = self.rng.gen_range(0..patterns.len());
144 let (pattern, trigger, explanation) = patterns.into_iter().nth(idx).unwrap();
145
146 let suspicion_score = match &pattern {
148 NearMissPattern::NearDuplicate { .. } => 0.70,
149 NearMissPattern::ThresholdProximity { proximity, .. } => 0.50 + proximity * 0.4,
150 NearMissPattern::UnusualLegitimate { .. } => 0.55,
151 NearMissPattern::CorrectedError { .. } => 0.60,
152 };
153
154 let label = NearMissLabel::new(doc_id, pattern, suspicion_score, trigger, explanation);
155
156 self.labels.push(label.clone());
157 Some(label)
158 }
159
160 fn get_applicable_patterns(
162 &mut self,
163 date: NaiveDate,
164 amount: Decimal,
165 account: &str,
166 counterparty: &Option<String>,
167 thresholds: &[Decimal],
168 ) -> Vec<(NearMissPattern, FalsePositiveTrigger, String)> {
169 let mut patterns = Vec::new();
170
171 if self.config.near_duplicate_enabled {
173 if let Some(similar) =
174 self.find_similar_transaction(date, amount, account, counterparty)
175 {
176 let days_diff = (date - similar.date).num_days().unsigned_abs() as u32;
177 if days_diff >= self.config.near_duplicate_days.0
178 && days_diff <= self.config.near_duplicate_days.1
179 {
180 patterns.push((
181 NearMissPattern::NearDuplicate {
182 date_difference_days: days_diff,
183 similar_transaction_id: similar.document_id.clone(),
184 },
185 FalsePositiveTrigger::SimilarTransaction,
186 format!(
187 "Similar transaction {} days apart - different business event",
188 days_diff
189 ),
190 ));
191 }
192 }
193 }
194
195 if self.config.threshold_proximity_enabled {
197 for threshold in thresholds {
198 let proximity = self.calculate_proximity(amount, *threshold);
199 if proximity >= self.config.proximity_range.0
200 && proximity <= self.config.proximity_range.1
201 {
202 patterns.push((
203 NearMissPattern::ThresholdProximity {
204 threshold: *threshold,
205 proximity,
206 },
207 FalsePositiveTrigger::AmountNearThreshold,
208 format!(
209 "Amount is {:.1}% of threshold {} - coincidental",
210 proximity * 100.0,
211 threshold
212 ),
213 ));
214 }
215 }
216 }
217
218 if self.config.unusual_legitimate_enabled {
220 if let Some((pattern_type, justification)) =
221 self.check_unusual_legitimate(date, amount, account)
222 {
223 patterns.push((
224 NearMissPattern::UnusualLegitimate {
225 pattern_type,
226 justification: justification.clone(),
227 },
228 FalsePositiveTrigger::UnusualTiming,
229 justification,
230 ));
231 }
232 }
233
234 patterns
235 }
236
237 fn find_similar_transaction(
239 &self,
240 date: NaiveDate,
241 amount: Decimal,
242 account: &str,
243 counterparty: &Option<String>,
244 ) -> Option<&RecentTransaction> {
245 self.recent_transactions.iter().find(|tx| {
246 let amount_diff = (tx.amount - amount).abs();
248 let amount_similar = amount_diff <= tx.amount * dec!(0.05);
249
250 let account_match = tx.account == account;
252
253 let counterparty_match = match (&tx.counterparty, counterparty) {
255 (Some(a), Some(b)) => a == b,
256 _ => true, };
258
259 let days_diff = (date - tx.date).num_days().abs();
261 let date_in_range =
262 days_diff > 0 && days_diff <= self.config.near_duplicate_days.1 as i64;
263
264 amount_similar && account_match && counterparty_match && date_in_range
265 })
266 }
267
268 fn calculate_proximity(&self, amount: Decimal, threshold: Decimal) -> f64 {
270 if threshold == Decimal::ZERO {
271 return 0.0;
272 }
273 let amount_f64: f64 = amount.try_into().unwrap_or(0.0);
274 let threshold_f64: f64 = threshold.try_into().unwrap_or(1.0);
275 (amount_f64 / threshold_f64).min(1.0)
276 }
277
278 fn check_unusual_legitimate(
280 &mut self,
281 date: NaiveDate,
282 amount: Decimal,
283 _account: &str,
284 ) -> Option<(LegitimatePatternType, String)> {
285 if date.month() == 12 && amount >= dec!(10000) && self.rng.gen::<f64>() < 0.3 {
287 return Some((
288 LegitimatePatternType::YearEndBonus,
289 "Year-end bonus payment per compensation plan".to_string(),
290 ));
291 }
292
293 if date.month() <= 3 && amount >= dec!(5000) && self.rng.gen::<f64>() < 0.2 {
295 return Some((
296 LegitimatePatternType::ContractPrepayment,
297 "Annual contract prepayment per terms".to_string(),
298 ));
299 }
300
301 if date.month() >= 10 && amount >= dec!(25000) && self.rng.gen::<f64>() < 0.2 {
303 return Some((
304 LegitimatePatternType::PromotionalSpending,
305 "Holiday promotional campaign spending".to_string(),
306 ));
307 }
308
309 if date.month() >= 8
311 && date.month() <= 11
312 && amount >= dec!(50000)
313 && self.rng.gen::<f64>() < 0.15
314 {
315 return Some((
316 LegitimatePatternType::SeasonalInventory,
317 "Seasonal inventory buildup for holiday sales".to_string(),
318 ));
319 }
320
321 if amount >= dec!(100000) && self.rng.gen::<f64>() < 0.1 {
323 return Some((
324 LegitimatePatternType::OneTimePayment,
325 "One-time strategic vendor payment".to_string(),
326 ));
327 }
328
329 None
330 }
331
332 pub fn create_corrected_error(
334 &mut self,
335 document_id: impl Into<String>,
336 original_error_id: impl Into<String>,
337 correction_lag_days: u32,
338 ) -> NearMissLabel {
339 let pattern = NearMissPattern::CorrectedError {
340 correction_lag_days,
341 correction_document_id: original_error_id.into(),
342 };
343
344 let label = NearMissLabel::new(
345 document_id,
346 pattern,
347 0.60,
348 FalsePositiveTrigger::SimilarTransaction,
349 format!(
350 "Error caught and corrected within {} days",
351 correction_lag_days
352 ),
353 );
354
355 self.labels.push(label.clone());
356 label
357 }
358
359 pub fn get_labels(&self) -> &[NearMissLabel] {
361 &self.labels
362 }
363
364 pub fn reset(&mut self) {
366 self.labels.clear();
367 self.recent_transactions.clear();
368 self.rng = ChaCha8Rng::seed_from_u64(self.config.seed);
369 }
370
371 pub fn get_statistics(&self) -> NearMissStatistics {
373 let mut by_pattern = std::collections::HashMap::new();
374 let mut by_trigger = std::collections::HashMap::new();
375
376 for label in &self.labels {
377 let pattern_name = match &label.pattern {
378 NearMissPattern::NearDuplicate { .. } => "near_duplicate",
379 NearMissPattern::ThresholdProximity { .. } => "threshold_proximity",
380 NearMissPattern::UnusualLegitimate { .. } => "unusual_legitimate",
381 NearMissPattern::CorrectedError { .. } => "corrected_error",
382 };
383
384 *by_pattern.entry(pattern_name.to_string()).or_insert(0) += 1;
385
386 let trigger_name = match label.false_positive_trigger {
387 FalsePositiveTrigger::AmountNearThreshold => "amount_near_threshold",
388 FalsePositiveTrigger::UnusualTiming => "unusual_timing",
389 FalsePositiveTrigger::SimilarTransaction => "similar_transaction",
390 FalsePositiveTrigger::NewCounterparty => "new_counterparty",
391 FalsePositiveTrigger::UnusualAccountCombination => "unusual_account",
392 FalsePositiveTrigger::VolumeSpike => "volume_spike",
393 FalsePositiveTrigger::RoundAmount => "round_amount",
394 };
395
396 *by_trigger.entry(trigger_name.to_string()).or_insert(0) += 1;
397 }
398
399 let avg_suspicion = if self.labels.is_empty() {
400 0.0
401 } else {
402 self.labels.iter().map(|l| l.suspicion_score).sum::<f64>() / self.labels.len() as f64
403 };
404
405 NearMissStatistics {
406 total_count: self.labels.len(),
407 by_pattern,
408 by_trigger,
409 average_suspicion_score: avg_suspicion,
410 }
411 }
412}
413
/// Aggregate view of the labels held by a `NearMissGenerator`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearMissStatistics {
    /// Total number of labels.
    pub total_count: usize,
    /// Label count per pattern name (e.g. `"near_duplicate"`, `"corrected_error"`).
    pub by_pattern: std::collections::HashMap<String, usize>,
    /// Label count per false-positive trigger name (e.g. `"similar_transaction"`).
    pub by_trigger: std::collections::HashMap<String, usize>,
    /// Mean suspicion score over all labels; `0.0` when there are none.
    pub average_suspicion_score: f64,
}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a date that is known to be valid.
    fn ymd(year: i32, month: u32, day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(year, month, day).expect("valid calendar date")
    }

    #[test]
    fn test_near_miss_config() {
        let config = NearMissConfig::default();
        assert!((config.proportion - 0.30).abs() < 0.01);
        assert!(config.near_duplicate_enabled);
    }

    #[test]
    fn test_near_miss_generator_creation() {
        let generator = NearMissGenerator::new(NearMissConfig::default());
        assert!(generator.labels.is_empty());
    }

    #[test]
    fn test_record_transaction() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction(
            "JE001",
            ymd(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        assert_eq!(generator.recent_transactions.len(), 1);
    }

    #[test]
    fn test_recent_transactions_are_bounded() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());
        let limit = generator.max_recent;

        for i in 0..limit + 5 {
            generator.record_transaction(
                format!("JE{:04}", i),
                ymd(2024, 6, 15),
                dec!(100),
                "5000",
                None,
            );
        }

        // The window never grows past its limit and drops the oldest entries first.
        assert_eq!(generator.recent_transactions.len(), limit);
        assert_eq!(generator.recent_transactions[0].document_id, "JE0005");
    }

    #[test]
    fn test_threshold_proximity() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            threshold_proximity_enabled: true,
            ..Default::default()
        });

        let thresholds = vec![dec!(10000), dec!(50000)];

        // With proportion 1.0 the sampling gate always passes; 9500 is 95% of
        // the 10000 threshold, there are no recorded transactions, and a
        // mid-June amount this small triggers no seasonal rule. Threshold
        // proximity is therefore the only candidate.
        let label = generator
            .check_near_miss(
                "JE001",
                ymd(2024, 6, 15),
                dec!(9500),
                "5000",
                None,
                &thresholds,
            )
            .expect("an amount at 95% of a threshold must be labelled");

        assert!(matches!(
            label.pattern,
            NearMissPattern::ThresholdProximity { .. }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::AmountNearThreshold
        );
        assert_eq!(generator.get_labels().len(), 1);
    }

    #[test]
    fn test_near_duplicate_detected() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            threshold_proximity_enabled: false,
            unusual_legitimate_enabled: false,
            ..Default::default()
        });

        generator.record_transaction(
            "JE001",
            ymd(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        // Same account and counterparty, 1% amount difference, two days later.
        let label = generator
            .check_near_miss(
                "JE002",
                ymd(2024, 6, 17),
                dec!(10100),
                "5000",
                Some("VENDOR001".to_string()),
                &[],
            )
            .expect("a similar transaction two days later must be labelled");

        assert!(matches!(
            label.pattern,
            NearMissPattern::NearDuplicate {
                date_difference_days: 2,
                ..
            }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::SimilarTransaction
        );
    }

    #[test]
    fn test_zero_proportion_never_labels() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 0.0,
            ..Default::default()
        });

        let label = generator.check_near_miss(
            "JE001",
            ymd(2024, 6, 15),
            dec!(9500),
            "5000",
            None,
            &[dec!(10000)],
        );

        assert!(label.is_none());
        assert!(generator.get_labels().is_empty());
    }

    #[test]
    fn test_corrected_error() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        let label = generator.create_corrected_error("JE002", "JE001", 3);

        assert!(matches!(
            label.pattern,
            NearMissPattern::CorrectedError {
                correction_lag_days: 3,
                ..
            }
        ));
        assert_eq!(generator.labels.len(), 1);
    }

    #[test]
    fn test_statistics() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.create_corrected_error("JE001", "JE000", 2);
        generator.create_corrected_error("JE002", "JE000", 3);

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 2);
        assert!(stats.by_pattern.contains_key("corrected_error"));
        assert_eq!(stats.by_pattern.get("corrected_error"), Some(&2));
        assert_eq!(stats.by_trigger.get("similar_transaction"), Some(&2));
    }

    #[test]
    fn test_statistics_empty() {
        let generator = NearMissGenerator::new(NearMissConfig::default());

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 0);
        assert!(stats.by_pattern.is_empty());
        assert!(stats.by_trigger.is_empty());
        assert_eq!(stats.average_suspicion_score, 0.0);
    }

    #[test]
    fn test_reset_clears_state() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction("JE001", ymd(2024, 6, 15), dec!(10000), "5000", None);
        generator.create_corrected_error("JE002", "JE001", 3);

        generator.reset();

        assert!(generator.get_labels().is_empty());
        assert!(generator.recent_transactions.is_empty());
    }
}