Skip to main content

datasynth_generators/anomaly/
near_miss.rs

//! Near-miss pattern generator.
//!
//! Generates near-miss cases that appear suspicious but are actually
//! legitimate, useful for training models to reduce false positives.
5
6use chrono::{Datelike, NaiveDate};
7use datasynth_core::utils::seeded_rng;
8use rand::Rng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use rust_decimal_macros::dec;
12use serde::{Deserialize, Serialize};
13
14use datasynth_core::models::{
15    FalsePositiveTrigger, LegitimatePatternType, NearMissLabel, NearMissPattern,
16};
17
18/// Configuration for near-miss generation.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct NearMissConfig {
21    /// Proportion of "suspicious" entries that are near-misses (0.0-1.0).
22    pub proportion: f64,
23    /// Enable near-duplicate detection.
24    pub near_duplicate_enabled: bool,
25    /// Enable threshold proximity.
26    pub threshold_proximity_enabled: bool,
27    /// Enable unusual legitimate patterns.
28    pub unusual_legitimate_enabled: bool,
29    /// Enable corrected errors.
30    pub corrected_errors_enabled: bool,
31    /// Date difference range for near-duplicates.
32    pub near_duplicate_days: (u32, u32),
33    /// Proximity range for threshold proximity (0.90-0.99).
34    pub proximity_range: (f64, f64),
35    /// Correction lag range in days.
36    pub correction_lag_days: (u32, u32),
37    /// Random seed.
38    pub seed: u64,
39}
40
41impl Default for NearMissConfig {
42    fn default() -> Self {
43        Self {
44            proportion: 0.30,
45            near_duplicate_enabled: true,
46            threshold_proximity_enabled: true,
47            unusual_legitimate_enabled: true,
48            corrected_errors_enabled: true,
49            near_duplicate_days: (1, 3),
50            proximity_range: (0.90, 0.99),
51            correction_lag_days: (1, 5),
52            seed: 42,
53        }
54    }
55}
56
57/// Generator for near-miss patterns.
58pub struct NearMissGenerator {
59    config: NearMissConfig,
60    rng: ChaCha8Rng,
61    /// Generated near-miss labels.
62    labels: Vec<NearMissLabel>,
63    /// Tracking recent transactions for near-duplicate detection.
64    recent_transactions: Vec<RecentTransaction>,
65    /// Maximum recent transactions to track.
66    max_recent: usize,
67}
68
69/// Tracked transaction for near-duplicate detection.
70#[derive(Debug, Clone)]
71struct RecentTransaction {
72    document_id: String,
73    date: NaiveDate,
74    amount: Decimal,
75    account: String,
76    counterparty: Option<String>,
77}
78
79impl NearMissGenerator {
80    /// Creates a new near-miss generator.
81    pub fn new(config: NearMissConfig) -> Self {
82        let rng = seeded_rng(config.seed, 0);
83        Self {
84            config,
85            rng,
86            labels: Vec::new(),
87            recent_transactions: Vec::new(),
88            max_recent: 100,
89        }
90    }
91
92    /// Records a transaction for near-duplicate tracking.
93    pub fn record_transaction(
94        &mut self,
95        document_id: impl Into<String>,
96        date: NaiveDate,
97        amount: Decimal,
98        account: impl Into<String>,
99        counterparty: Option<String>,
100    ) {
101        let tx = RecentTransaction {
102            document_id: document_id.into(),
103            date,
104            amount,
105            account: account.into(),
106            counterparty,
107        };
108
109        self.recent_transactions.push(tx);
110
111        // Prune old transactions
112        if self.recent_transactions.len() > self.max_recent {
113            self.recent_transactions.remove(0);
114        }
115    }
116
117    /// Checks if a transaction should be marked as a near-miss.
118    pub fn check_near_miss(
119        &mut self,
120        document_id: impl Into<String>,
121        date: NaiveDate,
122        amount: Decimal,
123        account: impl Into<String>,
124        counterparty: Option<String>,
125        thresholds: &[Decimal],
126    ) -> Option<NearMissLabel> {
127        // Check proportion
128        if self.rng.random::<f64>() >= self.config.proportion {
129            return None;
130        }
131
132        let doc_id = document_id.into();
133        let acct = account.into();
134
135        // Try different near-miss patterns
136        let patterns = self.get_applicable_patterns(date, amount, &acct, &counterparty, thresholds);
137
138        if patterns.is_empty() {
139            return None;
140        }
141
142        // Select random pattern
143        let idx = self.rng.random_range(0..patterns.len());
144        let (pattern, trigger, explanation) =
145            patterns.into_iter().nth(idx).expect("idx < patterns.len()");
146
147        // Calculate suspicion score based on pattern
148        let suspicion_score = match &pattern {
149            NearMissPattern::NearDuplicate { .. } => 0.70,
150            NearMissPattern::ThresholdProximity { proximity, .. } => 0.50 + proximity * 0.4,
151            NearMissPattern::UnusualLegitimate { .. } => 0.55,
152            NearMissPattern::CorrectedError { .. } => 0.60,
153        };
154
155        let label = NearMissLabel::new(doc_id, pattern, suspicion_score, trigger, explanation);
156
157        self.labels.push(label.clone());
158        Some(label)
159    }
160
161    /// Gets applicable near-miss patterns for a transaction.
162    fn get_applicable_patterns(
163        &mut self,
164        date: NaiveDate,
165        amount: Decimal,
166        account: &str,
167        counterparty: &Option<String>,
168        thresholds: &[Decimal],
169    ) -> Vec<(NearMissPattern, FalsePositiveTrigger, String)> {
170        let mut patterns = Vec::new();
171
172        // Check for near-duplicate
173        if self.config.near_duplicate_enabled {
174            if let Some(similar) =
175                self.find_similar_transaction(date, amount, account, counterparty)
176            {
177                let days_diff = (date - similar.date).num_days().unsigned_abs() as u32;
178                if days_diff >= self.config.near_duplicate_days.0
179                    && days_diff <= self.config.near_duplicate_days.1
180                {
181                    patterns.push((
182                        NearMissPattern::NearDuplicate {
183                            date_difference_days: days_diff,
184                            similar_transaction_id: similar.document_id.clone(),
185                        },
186                        FalsePositiveTrigger::SimilarTransaction,
187                        format!(
188                            "Similar transaction {days_diff} days apart - different business event"
189                        ),
190                    ));
191                }
192            }
193        }
194
195        // Check for threshold proximity
196        if self.config.threshold_proximity_enabled {
197            for threshold in thresholds {
198                let proximity = self.calculate_proximity(amount, *threshold);
199                if proximity >= self.config.proximity_range.0
200                    && proximity <= self.config.proximity_range.1
201                {
202                    patterns.push((
203                        NearMissPattern::ThresholdProximity {
204                            threshold: *threshold,
205                            proximity,
206                        },
207                        FalsePositiveTrigger::AmountNearThreshold,
208                        format!(
209                            "Amount is {:.1}% of threshold {} - coincidental",
210                            proximity * 100.0,
211                            threshold
212                        ),
213                    ));
214                }
215            }
216        }
217
218        // Check for unusual legitimate patterns
219        if self.config.unusual_legitimate_enabled {
220            if let Some((pattern_type, justification)) =
221                self.check_unusual_legitimate(date, amount, account)
222            {
223                patterns.push((
224                    NearMissPattern::UnusualLegitimate {
225                        pattern_type,
226                        justification: justification.clone(),
227                    },
228                    FalsePositiveTrigger::UnusualTiming,
229                    justification,
230                ));
231            }
232        }
233
234        patterns
235    }
236
237    /// Finds a similar recent transaction.
238    fn find_similar_transaction(
239        &self,
240        date: NaiveDate,
241        amount: Decimal,
242        account: &str,
243        counterparty: &Option<String>,
244    ) -> Option<&RecentTransaction> {
245        self.recent_transactions.iter().find(|tx| {
246            // Check amount similarity (within 5%)
247            let amount_diff = (tx.amount - amount).abs();
248            let amount_similar = amount_diff <= tx.amount * dec!(0.05);
249
250            // Check account match
251            let account_match = tx.account == account;
252
253            // Check counterparty match
254            let counterparty_match = match (&tx.counterparty, counterparty) {
255                (Some(a), Some(b)) => a == b,
256                _ => true, // If either is missing, don't exclude
257            };
258
259            // Check date range (not same day, but within range)
260            let days_diff = (date - tx.date).num_days().abs();
261            let date_in_range =
262                days_diff > 0 && days_diff <= self.config.near_duplicate_days.1 as i64;
263
264            amount_similar && account_match && counterparty_match && date_in_range
265        })
266    }
267
268    /// Calculates proximity to a threshold.
269    fn calculate_proximity(&self, amount: Decimal, threshold: Decimal) -> f64 {
270        if threshold == Decimal::ZERO {
271            return 0.0;
272        }
273        let amount_f64: f64 = amount.try_into().unwrap_or(0.0);
274        let threshold_f64: f64 = threshold.try_into().unwrap_or(1.0);
275        (amount_f64 / threshold_f64).min(1.0)
276    }
277
278    /// Checks for unusual but legitimate patterns.
279    fn check_unusual_legitimate(
280        &mut self,
281        date: NaiveDate,
282        amount: Decimal,
283        _account: &str,
284    ) -> Option<(LegitimatePatternType, String)> {
285        // Year-end bonuses (December, large amounts)
286        if date.month() == 12 && amount >= dec!(10000) && self.rng.random::<f64>() < 0.3 {
287            return Some((
288                LegitimatePatternType::YearEndBonus,
289                "Year-end bonus payment per compensation plan".to_string(),
290            ));
291        }
292
293        // Contract prepayments (Q1, moderate amounts)
294        if date.month() <= 3 && amount >= dec!(5000) && self.rng.random::<f64>() < 0.2 {
295            return Some((
296                LegitimatePatternType::ContractPrepayment,
297                "Annual contract prepayment per terms".to_string(),
298            ));
299        }
300
301        // Promotional spending (Q4)
302        if date.month() >= 10 && amount >= dec!(25000) && self.rng.random::<f64>() < 0.2 {
303            return Some((
304                LegitimatePatternType::PromotionalSpending,
305                "Holiday promotional campaign spending".to_string(),
306            ));
307        }
308
309        // Seasonal inventory (Q3-Q4)
310        if date.month() >= 8
311            && date.month() <= 11
312            && amount >= dec!(50000)
313            && self.rng.random::<f64>() < 0.15
314        {
315            return Some((
316                LegitimatePatternType::SeasonalInventory,
317                "Seasonal inventory buildup for holiday sales".to_string(),
318            ));
319        }
320
321        // One-time payments (any time, large amounts)
322        if amount >= dec!(100000) && self.rng.random::<f64>() < 0.1 {
323            return Some((
324                LegitimatePatternType::OneTimePayment,
325                "One-time strategic vendor payment".to_string(),
326            ));
327        }
328
329        None
330    }
331
332    /// Creates a corrected error near-miss.
333    pub fn create_corrected_error(
334        &mut self,
335        document_id: impl Into<String>,
336        original_error_id: impl Into<String>,
337        correction_lag_days: u32,
338    ) -> NearMissLabel {
339        let pattern = NearMissPattern::CorrectedError {
340            correction_lag_days,
341            correction_document_id: original_error_id.into(),
342        };
343
344        let label = NearMissLabel::new(
345            document_id,
346            pattern,
347            0.60,
348            FalsePositiveTrigger::SimilarTransaction,
349            format!("Error caught and corrected within {correction_lag_days} days"),
350        );
351
352        self.labels.push(label.clone());
353        label
354    }
355
356    /// Returns all generated labels.
357    pub fn get_labels(&self) -> &[NearMissLabel] {
358        &self.labels
359    }
360
361    /// Resets the generator.
362    pub fn reset(&mut self) {
363        self.labels.clear();
364        self.recent_transactions.clear();
365        self.rng = seeded_rng(self.config.seed, 0);
366    }
367
368    /// Returns statistics about generated near-misses.
369    pub fn get_statistics(&self) -> NearMissStatistics {
370        let mut by_pattern = std::collections::HashMap::new();
371        let mut by_trigger = std::collections::HashMap::new();
372
373        for label in &self.labels {
374            let pattern_name = match &label.pattern {
375                NearMissPattern::NearDuplicate { .. } => "near_duplicate",
376                NearMissPattern::ThresholdProximity { .. } => "threshold_proximity",
377                NearMissPattern::UnusualLegitimate { .. } => "unusual_legitimate",
378                NearMissPattern::CorrectedError { .. } => "corrected_error",
379            };
380
381            *by_pattern.entry(pattern_name.to_string()).or_insert(0) += 1;
382
383            let trigger_name = match label.false_positive_trigger {
384                FalsePositiveTrigger::AmountNearThreshold => "amount_near_threshold",
385                FalsePositiveTrigger::UnusualTiming => "unusual_timing",
386                FalsePositiveTrigger::SimilarTransaction => "similar_transaction",
387                FalsePositiveTrigger::NewCounterparty => "new_counterparty",
388                FalsePositiveTrigger::UnusualAccountCombination => "unusual_account",
389                FalsePositiveTrigger::VolumeSpike => "volume_spike",
390                FalsePositiveTrigger::RoundAmount => "round_amount",
391            };
392
393            *by_trigger.entry(trigger_name.to_string()).or_insert(0) += 1;
394        }
395
396        let avg_suspicion = if self.labels.is_empty() {
397            0.0
398        } else {
399            self.labels.iter().map(|l| l.suspicion_score).sum::<f64>() / self.labels.len() as f64
400        };
401
402        NearMissStatistics {
403            total_count: self.labels.len(),
404            by_pattern,
405            by_trigger,
406            average_suspicion_score: avg_suspicion,
407        }
408    }
409}
410
411/// Statistics about near-miss generation.
412#[derive(Debug, Clone, Serialize, Deserialize)]
413pub struct NearMissStatistics {
414    /// Total near-miss count.
415    pub total_count: usize,
416    /// Count by pattern type.
417    pub by_pattern: std::collections::HashMap<String, usize>,
418    /// Count by trigger type.
419    pub by_trigger: std::collections::HashMap<String, usize>,
420    /// Average suspicion score.
421    pub average_suspicion_score: f64,
422}
423
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The default configuration enables near-duplicates at a 30% proportion.
    #[test]
    fn test_near_miss_config() {
        let config = NearMissConfig::default();
        assert!((config.proportion - 0.30).abs() < 0.01);
        assert!(config.near_duplicate_enabled);
    }

    /// A fresh generator starts without labels.
    #[test]
    fn test_near_miss_generator_creation() {
        let generator = NearMissGenerator::new(NearMissConfig::default());
        assert!(generator.labels.is_empty());
    }

    /// Recording a transaction adds it to the history.
    #[test]
    fn test_record_transaction() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction(
            "JE001",
            NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        assert_eq!(generator.recent_transactions.len(), 1);
    }

    /// The history never grows past `max_recent`; the oldest entries go first.
    #[test]
    fn test_record_transaction_prunes_oldest() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
        let extra = 5;

        for i in 0..generator.max_recent + extra {
            generator.record_transaction(format!("JE{i}"), date, dec!(100), "5000", None);
        }

        assert_eq!(generator.recent_transactions.len(), generator.max_recent);
        assert_eq!(
            generator.recent_transactions[0].document_id,
            format!("JE{extra}")
        );
    }

    /// With `proportion = 1.0` the gate always passes, and for a June amount
    /// of 9,500 the only applicable pattern is proximity to the 10,000
    /// threshold, so the outcome is fully deterministic.
    #[test]
    fn test_threshold_proximity() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0, // Always check
            threshold_proximity_enabled: true,
            ..Default::default()
        });

        let thresholds = [dec!(10000), dec!(50000)];

        // Amount is 95% of the first threshold and 19% of the second.
        let label = generator
            .check_near_miss(
                "JE001",
                NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(),
                dec!(9500),
                "5000",
                None,
                &thresholds,
            )
            .expect("a 95% threshold hit must produce a label when proportion is 1.0");

        match &label.pattern {
            NearMissPattern::ThresholdProximity {
                threshold,
                proximity,
            } => {
                assert_eq!(*threshold, dec!(10000));
                assert!((*proximity - 0.95).abs() < 1e-9);
            }
            _ => panic!("expected a ThresholdProximity pattern"),
        }
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::AmountNearThreshold
        );
        assert_eq!(generator.labels.len(), 1);
    }

    /// With `proportion = 0.0` no draw can fall below the proportion, so no
    /// label is ever produced.
    #[test]
    fn test_zero_proportion_never_labels() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 0.0,
            ..Default::default()
        });
        let thresholds = [dec!(10000)];

        for i in 0..20 {
            let label = generator.check_near_miss(
                format!("JE{i}"),
                NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(),
                dec!(9500),
                "5000",
                None,
                &thresholds,
            );
            assert!(label.is_none());
        }
        assert!(generator.labels.is_empty());
    }

    /// A corrected-error label keeps the lag it was created with and is
    /// stored in the generator.
    #[test]
    fn test_corrected_error() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        let label = generator.create_corrected_error("JE002", "JE001", 3);

        assert!(matches!(
            label.pattern,
            NearMissPattern::CorrectedError {
                correction_lag_days: 3,
                ..
            }
        ));
        assert_eq!(generator.labels.len(), 1);
    }

    /// Statistics count every stored label under its pattern name.
    #[test]
    fn test_statistics() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.create_corrected_error("JE001", "JE000", 2);
        generator.create_corrected_error("JE002", "JE000", 3);

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 2);
        assert!(stats.by_pattern.contains_key("corrected_error"));
    }
}
516}