Skip to main content

datasynth_generators/anomaly/
near_miss.rs

1//! Near-miss pattern generator.
2//!
3//! Generates near-miss cases that appear suspicious but are actually
4//! legitimate, useful for training models to reduce false positives.
5
6use chrono::{Datelike, NaiveDate};
7use rand::Rng;
8use rand::SeedableRng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use rust_decimal_macros::dec;
12use serde::{Deserialize, Serialize};
13
14use datasynth_core::models::{
15    FalsePositiveTrigger, LegitimatePatternType, NearMissLabel, NearMissPattern,
16};
17
/// Configuration for near-miss generation.
///
/// Controls how often a checked transaction is considered for labelling and
/// which detectors contribute candidate patterns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearMissConfig {
    /// Proportion of "suspicious" entries that are near-misses (0.0-1.0).
    ///
    /// `check_near_miss` draws a random `f64` per call and gives up
    /// immediately when the draw is `>= proportion`.
    pub proportion: f64,
    /// Enable near-duplicate detection against previously recorded transactions.
    pub near_duplicate_enabled: bool,
    /// Enable threshold proximity detection against caller-supplied thresholds.
    pub threshold_proximity_enabled: bool,
    /// Enable unusual-but-legitimate pattern detection.
    pub unusual_legitimate_enabled: bool,
    /// Enable corrected errors.
    ///
    /// NOTE(review): no code visible in this file reads this flag; in
    /// particular `create_corrected_error` does not consult it - confirm
    /// whether callers are expected to check it.
    pub corrected_errors_enabled: bool,
    /// Inclusive `(min, max)` date difference, in days, for near-duplicates.
    pub near_duplicate_days: (u32, u32),
    /// Inclusive `(min, max)` range of the amount/threshold ratio that counts
    /// as threshold proximity (0.90-0.99 by default).
    pub proximity_range: (f64, f64),
    /// Correction lag range in days.
    ///
    /// NOTE(review): not read by any code visible in this file;
    /// `create_corrected_error` takes the lag as an explicit argument.
    pub correction_lag_days: (u32, u32),
    /// Random seed used to initialise (and, on reset, re-initialise) the generator's RNG.
    pub seed: u64,
}
40
41impl Default for NearMissConfig {
42    fn default() -> Self {
43        Self {
44            proportion: 0.30,
45            near_duplicate_enabled: true,
46            threshold_proximity_enabled: true,
47            unusual_legitimate_enabled: true,
48            corrected_errors_enabled: true,
49            near_duplicate_days: (1, 3),
50            proximity_range: (0.90, 0.99),
51            correction_lag_days: (1, 5),
52            seed: 42,
53        }
54    }
55}
56
/// Generator for near-miss patterns.
///
/// Holds the configuration, a seeded RNG, the labels produced so far, and a
/// bounded window of recently recorded transactions used by the
/// near-duplicate check.
pub struct NearMissGenerator {
    /// Settings supplied at construction time.
    config: NearMissConfig,
    /// RNG seeded from `config.seed`; re-seeded by `reset`.
    rng: ChaCha8Rng,
    /// Generated near-miss labels, in the order they were produced.
    labels: Vec<NearMissLabel>,
    /// Tracking recent transactions for near-duplicate detection (oldest first).
    recent_transactions: Vec<RecentTransaction>,
    /// Maximum recent transactions to track; `new` sets this to 100.
    max_recent: usize,
}
68
/// Tracked transaction for near-duplicate detection.
///
/// Stores the attributes that `find_similar_transaction` compares against an
/// incoming transaction.
#[derive(Debug, Clone)]
struct RecentTransaction {
    /// Identifier reported as `similar_transaction_id` when a near-duplicate is found.
    document_id: String,
    /// Date used for the day-difference check.
    date: NaiveDate,
    /// Amount used for the 5% similarity check.
    amount: Decimal,
    /// Account; must equal the incoming transaction's account to match.
    account: String,
    /// Optional counterparty; a missing value never excludes a match.
    counterparty: Option<String>,
}
78
79impl NearMissGenerator {
80    /// Creates a new near-miss generator.
81    pub fn new(config: NearMissConfig) -> Self {
82        let rng = ChaCha8Rng::seed_from_u64(config.seed);
83        Self {
84            config,
85            rng,
86            labels: Vec::new(),
87            recent_transactions: Vec::new(),
88            max_recent: 100,
89        }
90    }
91
92    /// Records a transaction for near-duplicate tracking.
93    pub fn record_transaction(
94        &mut self,
95        document_id: impl Into<String>,
96        date: NaiveDate,
97        amount: Decimal,
98        account: impl Into<String>,
99        counterparty: Option<String>,
100    ) {
101        let tx = RecentTransaction {
102            document_id: document_id.into(),
103            date,
104            amount,
105            account: account.into(),
106            counterparty,
107        };
108
109        self.recent_transactions.push(tx);
110
111        // Prune old transactions
112        if self.recent_transactions.len() > self.max_recent {
113            self.recent_transactions.remove(0);
114        }
115    }
116
117    /// Checks if a transaction should be marked as a near-miss.
118    pub fn check_near_miss(
119        &mut self,
120        document_id: impl Into<String>,
121        date: NaiveDate,
122        amount: Decimal,
123        account: impl Into<String>,
124        counterparty: Option<String>,
125        thresholds: &[Decimal],
126    ) -> Option<NearMissLabel> {
127        // Check proportion
128        if self.rng.gen::<f64>() >= self.config.proportion {
129            return None;
130        }
131
132        let doc_id = document_id.into();
133        let acct = account.into();
134
135        // Try different near-miss patterns
136        let patterns = self.get_applicable_patterns(date, amount, &acct, &counterparty, thresholds);
137
138        if patterns.is_empty() {
139            return None;
140        }
141
142        // Select random pattern
143        let idx = self.rng.gen_range(0..patterns.len());
144        let (pattern, trigger, explanation) = patterns.into_iter().nth(idx).unwrap();
145
146        // Calculate suspicion score based on pattern
147        let suspicion_score = match &pattern {
148            NearMissPattern::NearDuplicate { .. } => 0.70,
149            NearMissPattern::ThresholdProximity { proximity, .. } => 0.50 + proximity * 0.4,
150            NearMissPattern::UnusualLegitimate { .. } => 0.55,
151            NearMissPattern::CorrectedError { .. } => 0.60,
152        };
153
154        let label = NearMissLabel::new(doc_id, pattern, suspicion_score, trigger, explanation);
155
156        self.labels.push(label.clone());
157        Some(label)
158    }
159
160    /// Gets applicable near-miss patterns for a transaction.
161    fn get_applicable_patterns(
162        &mut self,
163        date: NaiveDate,
164        amount: Decimal,
165        account: &str,
166        counterparty: &Option<String>,
167        thresholds: &[Decimal],
168    ) -> Vec<(NearMissPattern, FalsePositiveTrigger, String)> {
169        let mut patterns = Vec::new();
170
171        // Check for near-duplicate
172        if self.config.near_duplicate_enabled {
173            if let Some(similar) =
174                self.find_similar_transaction(date, amount, account, counterparty)
175            {
176                let days_diff = (date - similar.date).num_days().unsigned_abs() as u32;
177                if days_diff >= self.config.near_duplicate_days.0
178                    && days_diff <= self.config.near_duplicate_days.1
179                {
180                    patterns.push((
181                        NearMissPattern::NearDuplicate {
182                            date_difference_days: days_diff,
183                            similar_transaction_id: similar.document_id.clone(),
184                        },
185                        FalsePositiveTrigger::SimilarTransaction,
186                        format!(
187                            "Similar transaction {} days apart - different business event",
188                            days_diff
189                        ),
190                    ));
191                }
192            }
193        }
194
195        // Check for threshold proximity
196        if self.config.threshold_proximity_enabled {
197            for threshold in thresholds {
198                let proximity = self.calculate_proximity(amount, *threshold);
199                if proximity >= self.config.proximity_range.0
200                    && proximity <= self.config.proximity_range.1
201                {
202                    patterns.push((
203                        NearMissPattern::ThresholdProximity {
204                            threshold: *threshold,
205                            proximity,
206                        },
207                        FalsePositiveTrigger::AmountNearThreshold,
208                        format!(
209                            "Amount is {:.1}% of threshold {} - coincidental",
210                            proximity * 100.0,
211                            threshold
212                        ),
213                    ));
214                }
215            }
216        }
217
218        // Check for unusual legitimate patterns
219        if self.config.unusual_legitimate_enabled {
220            if let Some((pattern_type, justification)) =
221                self.check_unusual_legitimate(date, amount, account)
222            {
223                patterns.push((
224                    NearMissPattern::UnusualLegitimate {
225                        pattern_type,
226                        justification: justification.clone(),
227                    },
228                    FalsePositiveTrigger::UnusualTiming,
229                    justification,
230                ));
231            }
232        }
233
234        patterns
235    }
236
237    /// Finds a similar recent transaction.
238    fn find_similar_transaction(
239        &self,
240        date: NaiveDate,
241        amount: Decimal,
242        account: &str,
243        counterparty: &Option<String>,
244    ) -> Option<&RecentTransaction> {
245        self.recent_transactions.iter().find(|tx| {
246            // Check amount similarity (within 5%)
247            let amount_diff = (tx.amount - amount).abs();
248            let amount_similar = amount_diff <= tx.amount * dec!(0.05);
249
250            // Check account match
251            let account_match = tx.account == account;
252
253            // Check counterparty match
254            let counterparty_match = match (&tx.counterparty, counterparty) {
255                (Some(a), Some(b)) => a == b,
256                _ => true, // If either is missing, don't exclude
257            };
258
259            // Check date range (not same day, but within range)
260            let days_diff = (date - tx.date).num_days().abs();
261            let date_in_range =
262                days_diff > 0 && days_diff <= self.config.near_duplicate_days.1 as i64;
263
264            amount_similar && account_match && counterparty_match && date_in_range
265        })
266    }
267
268    /// Calculates proximity to a threshold.
269    fn calculate_proximity(&self, amount: Decimal, threshold: Decimal) -> f64 {
270        if threshold == Decimal::ZERO {
271            return 0.0;
272        }
273        let amount_f64: f64 = amount.try_into().unwrap_or(0.0);
274        let threshold_f64: f64 = threshold.try_into().unwrap_or(1.0);
275        (amount_f64 / threshold_f64).min(1.0)
276    }
277
278    /// Checks for unusual but legitimate patterns.
279    fn check_unusual_legitimate(
280        &mut self,
281        date: NaiveDate,
282        amount: Decimal,
283        _account: &str,
284    ) -> Option<(LegitimatePatternType, String)> {
285        // Year-end bonuses (December, large amounts)
286        if date.month() == 12 && amount >= dec!(10000) && self.rng.gen::<f64>() < 0.3 {
287            return Some((
288                LegitimatePatternType::YearEndBonus,
289                "Year-end bonus payment per compensation plan".to_string(),
290            ));
291        }
292
293        // Contract prepayments (Q1, moderate amounts)
294        if date.month() <= 3 && amount >= dec!(5000) && self.rng.gen::<f64>() < 0.2 {
295            return Some((
296                LegitimatePatternType::ContractPrepayment,
297                "Annual contract prepayment per terms".to_string(),
298            ));
299        }
300
301        // Promotional spending (Q4)
302        if date.month() >= 10 && amount >= dec!(25000) && self.rng.gen::<f64>() < 0.2 {
303            return Some((
304                LegitimatePatternType::PromotionalSpending,
305                "Holiday promotional campaign spending".to_string(),
306            ));
307        }
308
309        // Seasonal inventory (Q3-Q4)
310        if date.month() >= 8
311            && date.month() <= 11
312            && amount >= dec!(50000)
313            && self.rng.gen::<f64>() < 0.15
314        {
315            return Some((
316                LegitimatePatternType::SeasonalInventory,
317                "Seasonal inventory buildup for holiday sales".to_string(),
318            ));
319        }
320
321        // One-time payments (any time, large amounts)
322        if amount >= dec!(100000) && self.rng.gen::<f64>() < 0.1 {
323            return Some((
324                LegitimatePatternType::OneTimePayment,
325                "One-time strategic vendor payment".to_string(),
326            ));
327        }
328
329        None
330    }
331
332    /// Creates a corrected error near-miss.
333    pub fn create_corrected_error(
334        &mut self,
335        document_id: impl Into<String>,
336        original_error_id: impl Into<String>,
337        correction_lag_days: u32,
338    ) -> NearMissLabel {
339        let pattern = NearMissPattern::CorrectedError {
340            correction_lag_days,
341            correction_document_id: original_error_id.into(),
342        };
343
344        let label = NearMissLabel::new(
345            document_id,
346            pattern,
347            0.60,
348            FalsePositiveTrigger::SimilarTransaction,
349            format!(
350                "Error caught and corrected within {} days",
351                correction_lag_days
352            ),
353        );
354
355        self.labels.push(label.clone());
356        label
357    }
358
359    /// Returns all generated labels.
360    pub fn get_labels(&self) -> &[NearMissLabel] {
361        &self.labels
362    }
363
364    /// Resets the generator.
365    pub fn reset(&mut self) {
366        self.labels.clear();
367        self.recent_transactions.clear();
368        self.rng = ChaCha8Rng::seed_from_u64(self.config.seed);
369    }
370
371    /// Returns statistics about generated near-misses.
372    pub fn get_statistics(&self) -> NearMissStatistics {
373        let mut by_pattern = std::collections::HashMap::new();
374        let mut by_trigger = std::collections::HashMap::new();
375
376        for label in &self.labels {
377            let pattern_name = match &label.pattern {
378                NearMissPattern::NearDuplicate { .. } => "near_duplicate",
379                NearMissPattern::ThresholdProximity { .. } => "threshold_proximity",
380                NearMissPattern::UnusualLegitimate { .. } => "unusual_legitimate",
381                NearMissPattern::CorrectedError { .. } => "corrected_error",
382            };
383
384            *by_pattern.entry(pattern_name.to_string()).or_insert(0) += 1;
385
386            let trigger_name = match label.false_positive_trigger {
387                FalsePositiveTrigger::AmountNearThreshold => "amount_near_threshold",
388                FalsePositiveTrigger::UnusualTiming => "unusual_timing",
389                FalsePositiveTrigger::SimilarTransaction => "similar_transaction",
390                FalsePositiveTrigger::NewCounterparty => "new_counterparty",
391                FalsePositiveTrigger::UnusualAccountCombination => "unusual_account",
392                FalsePositiveTrigger::VolumeSpike => "volume_spike",
393                FalsePositiveTrigger::RoundAmount => "round_amount",
394            };
395
396            *by_trigger.entry(trigger_name.to_string()).or_insert(0) += 1;
397        }
398
399        let avg_suspicion = if self.labels.is_empty() {
400            0.0
401        } else {
402            self.labels.iter().map(|l| l.suspicion_score).sum::<f64>() / self.labels.len() as f64
403        };
404
405        NearMissStatistics {
406            total_count: self.labels.len(),
407            by_pattern,
408            by_trigger,
409            average_suspicion_score: avg_suspicion,
410        }
411    }
412}
413
/// Statistics about near-miss generation.
///
/// Produced by `NearMissGenerator::get_statistics` from the labels held by
/// the generator at the time of the call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NearMissStatistics {
    /// Total near-miss count.
    pub total_count: usize,
    /// Count by pattern type, keyed by `"near_duplicate"`,
    /// `"threshold_proximity"`, `"unusual_legitimate"`, or `"corrected_error"`.
    /// Only patterns that occurred at least once have an entry.
    pub by_pattern: std::collections::HashMap<String, usize>,
    /// Count by trigger type, keyed by a snake_case trigger name such as
    /// `"amount_near_threshold"` or `"similar_transaction"`. Only triggers
    /// that occurred at least once have an entry.
    pub by_trigger: std::collections::HashMap<String, usize>,
    /// Average suspicion score over all labels; `0.0` when there are none.
    pub average_suspicion_score: f64,
}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a valid calendar date in tests.
    fn day(year: i32, month: u32, day_of_month: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(year, month, day_of_month).expect("valid test date")
    }

    /// Generator whose proportion gate always passes: the RNG draw lies in
    /// `[0, 1)` and is therefore never `>= 1.0`.
    fn always_checking_generator() -> NearMissGenerator {
        NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            ..Default::default()
        })
    }

    #[test]
    fn test_near_miss_config() {
        let config = NearMissConfig::default();
        assert!((config.proportion - 0.30).abs() < 0.01);
        assert!(config.near_duplicate_enabled);
    }

    #[test]
    fn test_near_miss_generator_creation() {
        let generator = NearMissGenerator::new(NearMissConfig::default());
        assert!(generator.labels.is_empty());
        assert!(generator.recent_transactions.is_empty());
    }

    #[test]
    fn test_record_transaction() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction(
            "JE001",
            day(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        assert_eq!(generator.recent_transactions.len(), 1);
    }

    #[test]
    fn test_record_transaction_prunes_oldest() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        // One more than the tracking window holds.
        for i in 0..101 {
            generator.record_transaction(
                format!("JE{}", i),
                day(2024, 6, 15),
                dec!(100),
                "5000",
                None,
            );
        }

        assert_eq!(generator.recent_transactions.len(), 100);
        assert_eq!(generator.recent_transactions[0].document_id, "JE1");
    }

    #[test]
    fn test_threshold_proximity() {
        let mut generator = always_checking_generator();
        let thresholds = vec![dec!(10000), dec!(50000)];

        // 9,500 is 95% of the first threshold and far from the second. No
        // transactions are tracked and June / 9,500 matches no
        // unusual-legitimate rule, so threshold proximity is the only candidate.
        let label = generator
            .check_near_miss(
                "JE001",
                day(2024, 6, 15),
                dec!(9500),
                "5000",
                None,
                &thresholds,
            )
            .expect("an amount at 95% of a threshold must produce a label");

        assert!(matches!(
            label.pattern,
            NearMissPattern::ThresholdProximity { .. }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::AmountNearThreshold
        );
        assert_eq!(generator.get_labels().len(), 1);
    }

    #[test]
    fn test_near_duplicate() {
        let mut generator = always_checking_generator();

        generator.record_transaction(
            "JE001",
            day(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        // Two days later, 2% higher, same account and counterparty.
        let label = generator
            .check_near_miss(
                "JE002",
                day(2024, 6, 17),
                dec!(10200),
                "5000",
                Some("VENDOR001".to_string()),
                &[],
            )
            .expect("a similar transaction two days apart must produce a label");

        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::SimilarTransaction
        );
        match label.pattern {
            NearMissPattern::NearDuplicate {
                date_difference_days,
                similar_transaction_id,
                ..
            } => {
                assert_eq!(date_difference_days, 2);
                assert_eq!(similar_transaction_id, "JE001");
            }
            _ => panic!("expected a near-duplicate pattern"),
        }
    }

    #[test]
    fn test_no_label_when_all_detectors_disabled() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0,
            near_duplicate_enabled: false,
            threshold_proximity_enabled: false,
            unusual_legitimate_enabled: false,
            ..Default::default()
        });

        let label = generator.check_near_miss(
            "JE001",
            day(2024, 6, 15),
            dec!(9500),
            "5000",
            None,
            &[dec!(10000)],
        );

        assert!(label.is_none());
        assert!(generator.get_labels().is_empty());
    }

    #[test]
    fn test_corrected_error() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        let label = generator.create_corrected_error("JE002", "JE001", 3);

        assert!(matches!(
            label.pattern,
            NearMissPattern::CorrectedError {
                correction_lag_days: 3,
                ..
            }
        ));
        assert_eq!(generator.labels.len(), 1);
    }

    #[test]
    fn test_reset() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction("JE001", day(2024, 6, 15), dec!(100), "5000", None);
        generator.create_corrected_error("JE002", "JE001", 1);

        generator.reset();

        assert!(generator.get_labels().is_empty());
        assert!(generator.recent_transactions.is_empty());
    }

    #[test]
    fn test_statistics() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.create_corrected_error("JE001", "JE000", 2);
        generator.create_corrected_error("JE002", "JE000", 3);

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 2);
        assert_eq!(stats.by_pattern.get("corrected_error"), Some(&2));
        assert_eq!(stats.by_trigger.get("similar_transaction"), Some(&2));
        assert!((stats.average_suspicion_score - 0.60).abs() < 1e-9);
    }
}