Skip to main content

datasynth_generators/anomaly/
near_miss.rs

1//! Near-miss pattern generator.
2//!
3//! Generates near-miss cases that appear suspicious but are actually
4//! legitimate, useful for training models to reduce false positives.
5
6use chrono::{Datelike, NaiveDate};
7use datasynth_core::utils::seeded_rng;
8use rand::Rng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use rust_decimal_macros::dec;
12use serde::{Deserialize, Serialize};
13
14use datasynth_core::models::{
15    FalsePositiveTrigger, LegitimatePatternType, NearMissLabel, NearMissPattern,
16};
17
18/// Configuration for near-miss generation.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct NearMissConfig {
21    /// Proportion of "suspicious" entries that are near-misses (0.0-1.0).
22    pub proportion: f64,
23    /// Enable near-duplicate detection.
24    pub near_duplicate_enabled: bool,
25    /// Enable threshold proximity.
26    pub threshold_proximity_enabled: bool,
27    /// Enable unusual legitimate patterns.
28    pub unusual_legitimate_enabled: bool,
29    /// Enable corrected errors.
30    pub corrected_errors_enabled: bool,
31    /// Date difference range for near-duplicates.
32    pub near_duplicate_days: (u32, u32),
33    /// Proximity range for threshold proximity (0.90-0.99).
34    pub proximity_range: (f64, f64),
35    /// Correction lag range in days.
36    pub correction_lag_days: (u32, u32),
37    /// Random seed.
38    pub seed: u64,
39}
40
41impl Default for NearMissConfig {
42    fn default() -> Self {
43        Self {
44            proportion: 0.30,
45            near_duplicate_enabled: true,
46            threshold_proximity_enabled: true,
47            unusual_legitimate_enabled: true,
48            corrected_errors_enabled: true,
49            near_duplicate_days: (1, 3),
50            proximity_range: (0.90, 0.99),
51            correction_lag_days: (1, 5),
52            seed: 42,
53        }
54    }
55}
56
57/// Generator for near-miss patterns.
58pub struct NearMissGenerator {
59    config: NearMissConfig,
60    rng: ChaCha8Rng,
61    /// Generated near-miss labels.
62    labels: Vec<NearMissLabel>,
63    /// Tracking recent transactions for near-duplicate detection.
64    recent_transactions: Vec<RecentTransaction>,
65    /// Maximum recent transactions to track.
66    max_recent: usize,
67}
68
69/// Tracked transaction for near-duplicate detection.
70#[derive(Debug, Clone)]
71struct RecentTransaction {
72    document_id: String,
73    date: NaiveDate,
74    amount: Decimal,
75    account: String,
76    counterparty: Option<String>,
77}
78
79impl NearMissGenerator {
80    /// Creates a new near-miss generator.
81    pub fn new(config: NearMissConfig) -> Self {
82        let rng = seeded_rng(config.seed, 0);
83        Self {
84            config,
85            rng,
86            labels: Vec::new(),
87            recent_transactions: Vec::new(),
88            max_recent: 100,
89        }
90    }
91
92    /// Records a transaction for near-duplicate tracking.
93    pub fn record_transaction(
94        &mut self,
95        document_id: impl Into<String>,
96        date: NaiveDate,
97        amount: Decimal,
98        account: impl Into<String>,
99        counterparty: Option<String>,
100    ) {
101        let tx = RecentTransaction {
102            document_id: document_id.into(),
103            date,
104            amount,
105            account: account.into(),
106            counterparty,
107        };
108
109        self.recent_transactions.push(tx);
110
111        // Prune old transactions
112        if self.recent_transactions.len() > self.max_recent {
113            self.recent_transactions.remove(0);
114        }
115    }
116
117    /// Checks if a transaction should be marked as a near-miss.
118    pub fn check_near_miss(
119        &mut self,
120        document_id: impl Into<String>,
121        date: NaiveDate,
122        amount: Decimal,
123        account: impl Into<String>,
124        counterparty: Option<String>,
125        thresholds: &[Decimal],
126    ) -> Option<NearMissLabel> {
127        // Check proportion
128        if self.rng.gen::<f64>() >= self.config.proportion {
129            return None;
130        }
131
132        let doc_id = document_id.into();
133        let acct = account.into();
134
135        // Try different near-miss patterns
136        let patterns = self.get_applicable_patterns(date, amount, &acct, &counterparty, thresholds);
137
138        if patterns.is_empty() {
139            return None;
140        }
141
142        // Select random pattern
143        let idx = self.rng.gen_range(0..patterns.len());
144        let (pattern, trigger, explanation) =
145            patterns.into_iter().nth(idx).expect("idx < patterns.len()");
146
147        // Calculate suspicion score based on pattern
148        let suspicion_score = match &pattern {
149            NearMissPattern::NearDuplicate { .. } => 0.70,
150            NearMissPattern::ThresholdProximity { proximity, .. } => 0.50 + proximity * 0.4,
151            NearMissPattern::UnusualLegitimate { .. } => 0.55,
152            NearMissPattern::CorrectedError { .. } => 0.60,
153        };
154
155        let label = NearMissLabel::new(doc_id, pattern, suspicion_score, trigger, explanation);
156
157        self.labels.push(label.clone());
158        Some(label)
159    }
160
161    /// Gets applicable near-miss patterns for a transaction.
162    fn get_applicable_patterns(
163        &mut self,
164        date: NaiveDate,
165        amount: Decimal,
166        account: &str,
167        counterparty: &Option<String>,
168        thresholds: &[Decimal],
169    ) -> Vec<(NearMissPattern, FalsePositiveTrigger, String)> {
170        let mut patterns = Vec::new();
171
172        // Check for near-duplicate
173        if self.config.near_duplicate_enabled {
174            if let Some(similar) =
175                self.find_similar_transaction(date, amount, account, counterparty)
176            {
177                let days_diff = (date - similar.date).num_days().unsigned_abs() as u32;
178                if days_diff >= self.config.near_duplicate_days.0
179                    && days_diff <= self.config.near_duplicate_days.1
180                {
181                    patterns.push((
182                        NearMissPattern::NearDuplicate {
183                            date_difference_days: days_diff,
184                            similar_transaction_id: similar.document_id.clone(),
185                        },
186                        FalsePositiveTrigger::SimilarTransaction,
187                        format!(
188                            "Similar transaction {} days apart - different business event",
189                            days_diff
190                        ),
191                    ));
192                }
193            }
194        }
195
196        // Check for threshold proximity
197        if self.config.threshold_proximity_enabled {
198            for threshold in thresholds {
199                let proximity = self.calculate_proximity(amount, *threshold);
200                if proximity >= self.config.proximity_range.0
201                    && proximity <= self.config.proximity_range.1
202                {
203                    patterns.push((
204                        NearMissPattern::ThresholdProximity {
205                            threshold: *threshold,
206                            proximity,
207                        },
208                        FalsePositiveTrigger::AmountNearThreshold,
209                        format!(
210                            "Amount is {:.1}% of threshold {} - coincidental",
211                            proximity * 100.0,
212                            threshold
213                        ),
214                    ));
215                }
216            }
217        }
218
219        // Check for unusual legitimate patterns
220        if self.config.unusual_legitimate_enabled {
221            if let Some((pattern_type, justification)) =
222                self.check_unusual_legitimate(date, amount, account)
223            {
224                patterns.push((
225                    NearMissPattern::UnusualLegitimate {
226                        pattern_type,
227                        justification: justification.clone(),
228                    },
229                    FalsePositiveTrigger::UnusualTiming,
230                    justification,
231                ));
232            }
233        }
234
235        patterns
236    }
237
238    /// Finds a similar recent transaction.
239    fn find_similar_transaction(
240        &self,
241        date: NaiveDate,
242        amount: Decimal,
243        account: &str,
244        counterparty: &Option<String>,
245    ) -> Option<&RecentTransaction> {
246        self.recent_transactions.iter().find(|tx| {
247            // Check amount similarity (within 5%)
248            let amount_diff = (tx.amount - amount).abs();
249            let amount_similar = amount_diff <= tx.amount * dec!(0.05);
250
251            // Check account match
252            let account_match = tx.account == account;
253
254            // Check counterparty match
255            let counterparty_match = match (&tx.counterparty, counterparty) {
256                (Some(a), Some(b)) => a == b,
257                _ => true, // If either is missing, don't exclude
258            };
259
260            // Check date range (not same day, but within range)
261            let days_diff = (date - tx.date).num_days().abs();
262            let date_in_range =
263                days_diff > 0 && days_diff <= self.config.near_duplicate_days.1 as i64;
264
265            amount_similar && account_match && counterparty_match && date_in_range
266        })
267    }
268
269    /// Calculates proximity to a threshold.
270    fn calculate_proximity(&self, amount: Decimal, threshold: Decimal) -> f64 {
271        if threshold == Decimal::ZERO {
272            return 0.0;
273        }
274        let amount_f64: f64 = amount.try_into().unwrap_or(0.0);
275        let threshold_f64: f64 = threshold.try_into().unwrap_or(1.0);
276        (amount_f64 / threshold_f64).min(1.0)
277    }
278
279    /// Checks for unusual but legitimate patterns.
280    fn check_unusual_legitimate(
281        &mut self,
282        date: NaiveDate,
283        amount: Decimal,
284        _account: &str,
285    ) -> Option<(LegitimatePatternType, String)> {
286        // Year-end bonuses (December, large amounts)
287        if date.month() == 12 && amount >= dec!(10000) && self.rng.gen::<f64>() < 0.3 {
288            return Some((
289                LegitimatePatternType::YearEndBonus,
290                "Year-end bonus payment per compensation plan".to_string(),
291            ));
292        }
293
294        // Contract prepayments (Q1, moderate amounts)
295        if date.month() <= 3 && amount >= dec!(5000) && self.rng.gen::<f64>() < 0.2 {
296            return Some((
297                LegitimatePatternType::ContractPrepayment,
298                "Annual contract prepayment per terms".to_string(),
299            ));
300        }
301
302        // Promotional spending (Q4)
303        if date.month() >= 10 && amount >= dec!(25000) && self.rng.gen::<f64>() < 0.2 {
304            return Some((
305                LegitimatePatternType::PromotionalSpending,
306                "Holiday promotional campaign spending".to_string(),
307            ));
308        }
309
310        // Seasonal inventory (Q3-Q4)
311        if date.month() >= 8
312            && date.month() <= 11
313            && amount >= dec!(50000)
314            && self.rng.gen::<f64>() < 0.15
315        {
316            return Some((
317                LegitimatePatternType::SeasonalInventory,
318                "Seasonal inventory buildup for holiday sales".to_string(),
319            ));
320        }
321
322        // One-time payments (any time, large amounts)
323        if amount >= dec!(100000) && self.rng.gen::<f64>() < 0.1 {
324            return Some((
325                LegitimatePatternType::OneTimePayment,
326                "One-time strategic vendor payment".to_string(),
327            ));
328        }
329
330        None
331    }
332
333    /// Creates a corrected error near-miss.
334    pub fn create_corrected_error(
335        &mut self,
336        document_id: impl Into<String>,
337        original_error_id: impl Into<String>,
338        correction_lag_days: u32,
339    ) -> NearMissLabel {
340        let pattern = NearMissPattern::CorrectedError {
341            correction_lag_days,
342            correction_document_id: original_error_id.into(),
343        };
344
345        let label = NearMissLabel::new(
346            document_id,
347            pattern,
348            0.60,
349            FalsePositiveTrigger::SimilarTransaction,
350            format!(
351                "Error caught and corrected within {} days",
352                correction_lag_days
353            ),
354        );
355
356        self.labels.push(label.clone());
357        label
358    }
359
360    /// Returns all generated labels.
361    pub fn get_labels(&self) -> &[NearMissLabel] {
362        &self.labels
363    }
364
365    /// Resets the generator.
366    pub fn reset(&mut self) {
367        self.labels.clear();
368        self.recent_transactions.clear();
369        self.rng = seeded_rng(self.config.seed, 0);
370    }
371
372    /// Returns statistics about generated near-misses.
373    pub fn get_statistics(&self) -> NearMissStatistics {
374        let mut by_pattern = std::collections::HashMap::new();
375        let mut by_trigger = std::collections::HashMap::new();
376
377        for label in &self.labels {
378            let pattern_name = match &label.pattern {
379                NearMissPattern::NearDuplicate { .. } => "near_duplicate",
380                NearMissPattern::ThresholdProximity { .. } => "threshold_proximity",
381                NearMissPattern::UnusualLegitimate { .. } => "unusual_legitimate",
382                NearMissPattern::CorrectedError { .. } => "corrected_error",
383            };
384
385            *by_pattern.entry(pattern_name.to_string()).or_insert(0) += 1;
386
387            let trigger_name = match label.false_positive_trigger {
388                FalsePositiveTrigger::AmountNearThreshold => "amount_near_threshold",
389                FalsePositiveTrigger::UnusualTiming => "unusual_timing",
390                FalsePositiveTrigger::SimilarTransaction => "similar_transaction",
391                FalsePositiveTrigger::NewCounterparty => "new_counterparty",
392                FalsePositiveTrigger::UnusualAccountCombination => "unusual_account",
393                FalsePositiveTrigger::VolumeSpike => "volume_spike",
394                FalsePositiveTrigger::RoundAmount => "round_amount",
395            };
396
397            *by_trigger.entry(trigger_name.to_string()).or_insert(0) += 1;
398        }
399
400        let avg_suspicion = if self.labels.is_empty() {
401            0.0
402        } else {
403            self.labels.iter().map(|l| l.suspicion_score).sum::<f64>() / self.labels.len() as f64
404        };
405
406        NearMissStatistics {
407            total_count: self.labels.len(),
408            by_pattern,
409            by_trigger,
410            average_suspicion_score: avg_suspicion,
411        }
412    }
413}
414
415/// Statistics about near-miss generation.
416#[derive(Debug, Clone, Serialize, Deserialize)]
417pub struct NearMissStatistics {
418    /// Total near-miss count.
419    pub total_count: usize,
420    /// Count by pattern type.
421    pub by_pattern: std::collections::HashMap<String, usize>,
422    /// Count by trigger type.
423    pub by_trigger: std::collections::HashMap<String, usize>,
424    /// Average suspicion score.
425    pub average_suspicion_score: f64,
426}
427
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Shorthand for building a calendar date in the tests below.
    fn ymd(year: i32, month: u32, day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(year, month, day).unwrap()
    }

    #[test]
    fn test_near_miss_config() {
        let config = NearMissConfig::default();
        assert!((config.proportion - 0.30).abs() < 0.01);
        assert!(config.near_duplicate_enabled);
    }

    #[test]
    fn test_near_miss_generator_creation() {
        let generator = NearMissGenerator::new(NearMissConfig::default());
        assert!(generator.labels.is_empty());
    }

    #[test]
    fn test_record_transaction() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.record_transaction(
            "JE001",
            ymd(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        assert_eq!(generator.recent_transactions.len(), 1);
    }

    #[test]
    fn test_record_transaction_prunes_oldest() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        // Five more than the history can hold.
        for i in 0..105 {
            generator.record_transaction(
                format!("JE{:03}", i),
                ymd(2024, 6, 15),
                dec!(100),
                "5000",
                None,
            );
        }

        assert_eq!(generator.recent_transactions.len(), 100);
        // The five oldest entries were dropped from the front.
        assert_eq!(generator.recent_transactions[0].document_id, "JE005");
    }

    #[test]
    fn test_threshold_proximity() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0, // The sampling gate always passes.
            threshold_proximity_enabled: true,
            ..Default::default()
        });

        let thresholds = vec![dec!(10000), dec!(50000)];

        // 9500 is 95% of the first threshold and far below the second. In
        // June with this amount no unusual-legitimate rule applies and no
        // transaction was recorded, so threshold proximity is the only
        // candidate and the outcome is deterministic.
        let label = generator
            .check_near_miss(
                "JE001",
                ymd(2024, 6, 15),
                dec!(9500),
                "5000",
                None,
                &thresholds,
            )
            .expect("threshold proximity must produce a label");

        assert!(matches!(
            label.pattern,
            NearMissPattern::ThresholdProximity { .. }
        ));
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::AmountNearThreshold
        );
        assert_eq!(generator.get_labels().len(), 1);
    }

    #[test]
    fn test_near_duplicate() {
        let mut generator = NearMissGenerator::new(NearMissConfig {
            proportion: 1.0, // The sampling gate always passes.
            ..Default::default()
        });

        generator.record_transaction(
            "JE001",
            ymd(2024, 6, 15),
            dec!(10000),
            "5000",
            Some("VENDOR001".to_string()),
        );

        // Two days later, same account and counterparty, amount within 5%.
        // No thresholds are supplied and no unusual-legitimate rule applies,
        // so the near-duplicate is the only candidate.
        let label = generator
            .check_near_miss(
                "JE002",
                ymd(2024, 6, 17),
                dec!(10200),
                "5000",
                Some("VENDOR001".to_string()),
                &[],
            )
            .expect("near-duplicate must produce a label");

        match label.pattern {
            NearMissPattern::NearDuplicate {
                date_difference_days,
                similar_transaction_id,
            } => {
                assert_eq!(date_difference_days, 2);
                assert_eq!(similar_transaction_id, "JE001");
            }
            _ => panic!("expected a NearDuplicate pattern"),
        }
        assert_eq!(
            label.false_positive_trigger,
            FalsePositiveTrigger::SimilarTransaction
        );
    }

    #[test]
    fn test_corrected_error() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        let label = generator.create_corrected_error("JE002", "JE001", 3);

        assert!(matches!(
            label.pattern,
            NearMissPattern::CorrectedError {
                correction_lag_days: 3,
                ..
            }
        ));
        assert_eq!(generator.labels.len(), 1);
    }

    #[test]
    fn test_statistics() {
        let mut generator = NearMissGenerator::new(NearMissConfig::default());

        generator.create_corrected_error("JE001", "JE000", 2);
        generator.create_corrected_error("JE002", "JE000", 3);

        let stats = generator.get_statistics();
        assert_eq!(stats.total_count, 2);
        assert_eq!(stats.by_pattern.get("corrected_error"), Some(&2));
        assert_eq!(stats.by_trigger.get("similar_transaction"), Some(&2));
    }
}