datasynth_generators/anomaly/
patterns.rs

1//! Anomaly patterns for realistic distribution.
2//!
3//! Patterns control how anomalies are distributed across time and entities,
4//! including clustering behavior and temporal patterns.
5
6use chrono::{Datelike, NaiveDate, Weekday};
7use rand::Rng;
8use std::collections::HashMap;
9
/// Temporal pattern for anomaly injection.
///
/// Each variant describes how the base anomaly rate is scaled for a calendar
/// date; the scaling itself is computed by
/// `TemporalPattern::probability_multiplier`.
#[derive(Debug, Clone)]
pub enum TemporalPattern {
    /// Uniform distribution across all periods (the multiplier is always 1.0).
    Uniform,
    /// Higher probability at period/year end.
    ///
    /// `probability_multiplier` treats day 28 of a month and later as the
    /// "period end" and applies the most specific multiplier that matches.
    PeriodEndSpike {
        /// Multiplier for month-end days.
        month_end_multiplier: f64,
        /// Multiplier for quarter-end (March, June, September).
        quarter_end_multiplier: f64,
        /// Multiplier for year-end (December).
        year_end_multiplier: f64,
    },
    /// Higher probability at specific times.
    TimeBased {
        /// Multiplier for after-hours.
        ///
        /// NOTE(review): `probability_multiplier` only receives a date, so
        /// this value is currently not applied there.
        after_hours_multiplier: f64,
        /// Multiplier for weekends (Saturday and Sunday).
        weekend_multiplier: f64,
    },
    /// Seasonal pattern.
    Seasonal {
        /// Multipliers by month (1-12); index 0 holds January's multiplier
        /// and index 11 holds December's.
        month_multipliers: [f64; 12],
    },
    /// Custom pattern function.
    ///
    /// `probability_multiplier` returns 1.0 for this variant; only the name
    /// is stored here.
    Custom {
        /// Name of the pattern.
        name: String,
    },
}
42
43impl Default for TemporalPattern {
44    fn default() -> Self {
45        TemporalPattern::PeriodEndSpike {
46            month_end_multiplier: 2.0,
47            quarter_end_multiplier: 3.0,
48            year_end_multiplier: 5.0,
49        }
50    }
51}
52
53impl TemporalPattern {
54    /// Calculates the probability multiplier for a given date.
55    pub fn probability_multiplier(&self, date: NaiveDate) -> f64 {
56        match self {
57            TemporalPattern::Uniform => 1.0,
58            TemporalPattern::PeriodEndSpike {
59                month_end_multiplier,
60                quarter_end_multiplier,
61                year_end_multiplier,
62            } => {
63                let day = date.day();
64                let month = date.month();
65
66                // Year end (December 28-31)
67                if month == 12 && day >= 28 {
68                    return *year_end_multiplier;
69                }
70
71                // Quarter end (Mar, Jun, Sep, Dec last 3 days)
72                if matches!(month, 3 | 6 | 9 | 12) && day >= 28 {
73                    return *quarter_end_multiplier;
74                }
75
76                // Month end (last 3 days)
77                if day >= 28 {
78                    return *month_end_multiplier;
79                }
80
81                1.0
82            }
83            TemporalPattern::TimeBased {
84                after_hours_multiplier: _,
85                weekend_multiplier,
86            } => {
87                let weekday = date.weekday();
88                if weekday == Weekday::Sat || weekday == Weekday::Sun {
89                    return *weekend_multiplier;
90                }
91                // Assume all entries have potential for after-hours
92                // In practice, this would check timestamp
93                1.0
94            }
95            TemporalPattern::Seasonal { month_multipliers } => {
96                let month_idx = (date.month() - 1) as usize;
97                month_multipliers[month_idx]
98            }
99            TemporalPattern::Custom { .. } => 1.0,
100        }
101    }
102
103    /// Creates a standard audit season pattern (higher in Q1).
104    pub fn audit_season() -> Self {
105        TemporalPattern::Seasonal {
106            month_multipliers: [
107                2.0, 2.0, 1.5, // Q1 - audit busy season
108                1.0, 1.0, 1.2, // Q2 - quarter end
109                1.0, 1.0, 1.2, // Q3 - quarter end
110                1.0, 1.0, 3.0, // Q4 - year end
111            ],
112        }
113    }
114}
115
/// Fraud category for cluster time window selection.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FraudCategory {
    /// Accounts Receivable fraud (invoice aging: 30-45 days)
    AccountsReceivable,
    /// Accounts Payable fraud (payment cycles: 14-30 days)
    AccountsPayable,
    /// Payroll fraud (monthly: 28-35 days)
    Payroll,
    /// Expense fraud (submission cycles: 7-14 days)
    Expense,
    /// Revenue manipulation (quarterly: 85-95 days)
    Revenue,
    /// Asset fraud (periodic: 30-60 days)
    Asset,
    /// General fraud (default short window: 5-10 days)
    General,
}

impl FraudCategory {
    /// Get the time window range (min, max days) for this fraud category.
    pub fn time_window_days(&self) -> (i64, i64) {
        match self {
            FraudCategory::AccountsReceivable => (30, 45), // Invoice aging cycles
            FraudCategory::AccountsPayable => (14, 30),    // Payment terms
            FraudCategory::Payroll => (28, 35),            // Monthly pay cycles
            FraudCategory::Expense => (7, 14),             // Expense report cycles
            FraudCategory::Revenue => (85, 95),            // Quarterly close periods
            FraudCategory::Asset => (30, 60),              // Asset reconciliation
            FraudCategory::General => (5, 10),             // Default short window
        }
    }

    /// Infer fraud category from anomaly type string.
    ///
    /// Matching is case-insensitive and the first matching category wins, in
    /// this order: accounts receivable, accounts payable, payroll, expense,
    /// revenue, asset. Anything else is `General`.
    ///
    /// Long keywords ("receivable", "vendor", "salary", ...) match anywhere in
    /// the string. The two-letter abbreviations "ar" and "ap" only match as a
    /// whole word (for example `"AR_Aging"`, `"ap-duplicate"`, `"ARWriteOff"`),
    /// because as plain substrings they occur inside unrelated words such as
    /// "salary", "dollar", "year" or "capital" and would shadow the categories
    /// checked later.
    pub fn from_anomaly_type(anomaly_type: &str) -> Self {
        let lower = anomaly_type.to_lowercase();
        let words = Self::split_words(anomaly_type);

        let mentions = |keywords: &[&str]| keywords.iter().any(|k| lower.contains(*k));
        let has_word = |word: &str| words.iter().any(|w| w.as_str() == word);

        if mentions(&["receivable", "invoice", "customer"]) || has_word("ar") {
            FraudCategory::AccountsReceivable
        } else if mentions(&["payable", "vendor", "payment"]) || has_word("ap") {
            FraudCategory::AccountsPayable
        } else if mentions(&["payroll", "ghost", "employee", "salary"]) {
            FraudCategory::Payroll
        } else if mentions(&["expense", "reimbursement"]) {
            FraudCategory::Expense
        } else if mentions(&["revenue", "sales", "channel", "premature"]) {
            FraudCategory::Revenue
        } else if mentions(&["asset", "inventory", "fixed", "depreciation"]) {
            FraudCategory::Asset
        } else {
            FraudCategory::General
        }
    }

    /// Splits an identifier-like string into lowercase words.
    ///
    /// Words are separated by any non-alphanumeric character and by camelCase
    /// boundaries, including the end of an acronym ("ARFraud" -> "ar", "fraud").
    fn split_words(text: &str) -> Vec<String> {
        let chars: Vec<char> = text.chars().collect();
        let mut words = Vec::new();
        let mut current = String::new();

        for (i, &c) in chars.iter().enumerate() {
            if !c.is_alphanumeric() {
                if !current.is_empty() {
                    words.push(std::mem::take(&mut current));
                }
                continue;
            }

            if c.is_uppercase() && !current.is_empty() {
                // `current` is non-empty, so the previous char exists and is alphanumeric.
                let prev = chars[i - 1];
                let next_is_lower = chars.get(i + 1).map_or(false, |n| n.is_lowercase());
                let starts_word = prev.is_lowercase()
                    || prev.is_numeric()
                    || (prev.is_uppercase() && next_is_lower);
                if starts_word {
                    words.push(std::mem::take(&mut current));
                }
            }

            current.extend(c.to_lowercase());
        }

        if !current.is_empty() {
            words.push(current);
        }
        words
    }
}
189
/// Settings that control how anomalies are grouped into clusters.
#[derive(Debug, Clone)]
pub struct ClusteringConfig {
    /// Master switch; when `false` no anomaly is assigned to a cluster.
    pub enabled: bool,
    /// Chance that an anomaly which does not join an open cluster opens a new one.
    pub cluster_start_probability: f64,
    /// Chance that an anomaly joins the open cluster of its category.
    pub cluster_continuation_probability: f64,
    /// Size an open cluster must have reached before it is closed when an
    /// anomaly does not join it.
    pub min_cluster_size: usize,
    /// Size at which an open cluster stops accepting anomalies.
    pub max_cluster_size: usize,
    /// Cluster time window in days, used when fraud-specific windows are off.
    pub cluster_time_window_days: i64,
    /// Draw the window from the fraud category's range instead of using
    /// `cluster_time_window_days`.
    pub use_fraud_specific_windows: bool,
    /// Take the accounts/entities already in a cluster into account when
    /// deciding whether an anomaly may join it.
    pub preserve_account_relationships: bool,
}

impl Default for ClusteringConfig {
    /// Clustering enabled with 30% start / 70% continuation probability,
    /// clusters of 2 to 10 anomalies, a 7-day fallback window, and both
    /// fraud-specific windows and relationship preservation switched on.
    fn default() -> Self {
        ClusteringConfig {
            enabled: true,
            use_fraud_specific_windows: true,
            preserve_account_relationships: true,
            cluster_start_probability: 0.3,
            cluster_continuation_probability: 0.7,
            min_cluster_size: 2,
            max_cluster_size: 10,
            cluster_time_window_days: 7,
        }
    }
}
225
/// A directed relationship between two entities inside a fraud cluster.
#[derive(Debug, Clone)]
pub struct CausalLink {
    /// Identifier of the entity the link starts from (e.g., a payment document ID).
    pub source_entity: String,
    /// Kind of the source entity.
    pub source_type: String,
    /// Identifier of the entity the link points to (e.g., a vendor ID).
    pub target_entity: String,
    /// Kind of the target entity.
    pub target_type: String,
    /// Label describing how source and target are related.
    pub relationship: String,
}

impl CausalLink {
    /// Builds a link from anything convertible into `String`.
    ///
    /// Arguments are, in order: source identifier, source kind, target
    /// identifier, target kind, relationship label.
    pub fn new(
        source_entity: impl Into<String>,
        source_type: impl Into<String>,
        target_entity: impl Into<String>,
        target_type: impl Into<String>,
        relationship: impl Into<String>,
    ) -> Self {
        let source_entity = source_entity.into();
        let source_type = source_type.into();
        let target_entity = target_entity.into();
        let target_type = target_type.into();
        let relationship = relationship.into();

        CausalLink {
            source_entity,
            source_type,
            target_entity,
            target_type,
            relationship,
        }
    }
}
259
/// Manages anomaly clustering.
///
/// Holds at most one open ("active") cluster per [`FraudCategory`] and keeps
/// the statistics of every cluster created so far.
pub struct ClusterManager {
    /// Clustering settings supplied at construction.
    config: ClusteringConfig,
    /// Current active clusters by fraud category.
    active_clusters: HashMap<FraudCategory, ActiveCluster>,
    /// Next cluster ID to assign (starts at 1; rendered as `CLU` followed by
    /// the zero-padded number).
    next_cluster_id: u64,
    /// Cluster statistics, keyed by cluster ID.
    cluster_stats: HashMap<String, ClusterStats>,
}
270
/// Active cluster state.
///
/// Snapshot of a cluster that can still accept anomalies; stored in
/// `ClusterManager::active_clusters` under its fraud category.
#[derive(Debug, Clone)]
struct ActiveCluster {
    /// Cluster ID.
    cluster_id: String,
    /// Number of anomalies in cluster.
    size: usize,
    /// Start date (date of the anomaly that opened the cluster).
    start_date: NaiveDate,
    /// Fraud category.
    // NOTE(review): written when the cluster is created but not read anywhere
    // in the visible code; the map key carries the same information.
    category: FraudCategory,
    /// Time window for this cluster, in days, fixed when the cluster is
    /// created and compared against the days elapsed since `start_date`.
    time_window_days: i64,
    /// Accounts involved (for relationship preservation); no duplicates.
    accounts: Vec<String>,
    /// Entities involved (vendors, customers, employees); no duplicates.
    entities: Vec<String>,
}
289
/// Statistics for a cluster.
///
/// One record exists per cluster ever created by `ClusterManager`; it keeps
/// being updated while the cluster is active and is retained afterwards.
#[derive(Debug, Clone, Default)]
pub struct ClusterStats {
    /// Number of anomalies in cluster.
    pub size: usize,
    /// Start date (date of the anomaly that opened the cluster).
    pub start_date: Option<NaiveDate>,
    /// End date (date of the anomaly most recently added to the cluster).
    pub end_date: Option<NaiveDate>,
    /// Anomaly types in cluster, one entry per anomaly in insertion order
    /// (repeated types are kept).
    pub anomaly_types: Vec<String>,
    /// Fraud category of this cluster.
    pub fraud_category: Option<FraudCategory>,
    /// Time window used (days).
    pub time_window_days: i64,
    /// Accounts involved in this cluster (each account listed once).
    pub accounts: Vec<String>,
    /// Entities involved in this cluster (each entity listed once).
    pub entities: Vec<String>,
    /// Causal links within this cluster, added through
    /// `ClusterManager::add_causal_link`.
    pub causal_links: Vec<CausalLink>,
}
312
313impl ClusterManager {
314    /// Creates a new cluster manager.
315    pub fn new(config: ClusteringConfig) -> Self {
316        Self {
317            config,
318            active_clusters: HashMap::new(),
319            next_cluster_id: 1,
320            cluster_stats: HashMap::new(),
321        }
322    }
323
324    /// Determines the cluster ID for a new anomaly.
325    pub fn assign_cluster<R: Rng>(
326        &mut self,
327        date: NaiveDate,
328        anomaly_type: &str,
329        rng: &mut R,
330    ) -> Option<String> {
331        self.assign_cluster_with_context(date, anomaly_type, None, None, rng)
332    }
333
334    /// Determines the cluster ID with additional context for relationship preservation.
335    pub fn assign_cluster_with_context<R: Rng>(
336        &mut self,
337        date: NaiveDate,
338        anomaly_type: &str,
339        account: Option<&str>,
340        entity: Option<&str>,
341        rng: &mut R,
342    ) -> Option<String> {
343        if !self.config.enabled {
344            return None;
345        }
346
347        // Determine fraud category from anomaly type
348        let category = FraudCategory::from_anomaly_type(anomaly_type);
349
350        // Get time window for this category
351        let time_window = if self.config.use_fraud_specific_windows {
352            let (min, max) = category.time_window_days();
353            rng.gen_range(min..=max)
354        } else {
355            self.config.cluster_time_window_days
356        };
357
358        // Check if we should continue an existing cluster for this category
359        if let Some(active) = self.active_clusters.get(&category).cloned() {
360            let days_elapsed = (date - active.start_date).num_days();
361
362            // Check if within time window and not at max size
363            if days_elapsed <= active.time_window_days
364                && active.size < self.config.max_cluster_size
365                && rng.gen::<f64>() < self.config.cluster_continuation_probability
366            {
367                // If preserving relationships, prefer matching accounts/entities
368                let relationship_match = if self.config.preserve_account_relationships {
369                    let account_match =
370                        account.map_or(true, |a| active.accounts.contains(&a.to_string()));
371                    let entity_match =
372                        entity.map_or(true, |e| active.entities.contains(&e.to_string()));
373                    account_match || entity_match
374                } else {
375                    true
376                };
377
378                if relationship_match {
379                    // Continue the cluster
380                    let cluster_id = active.cluster_id.clone();
381
382                    // Update active cluster
383                    if let Some(active_mut) = self.active_clusters.get_mut(&category) {
384                        active_mut.size += 1;
385                        if let Some(acct) = account {
386                            if !active_mut.accounts.contains(&acct.to_string()) {
387                                active_mut.accounts.push(acct.to_string());
388                            }
389                        }
390                        if let Some(ent) = entity {
391                            if !active_mut.entities.contains(&ent.to_string()) {
392                                active_mut.entities.push(ent.to_string());
393                            }
394                        }
395                    }
396
397                    // Update cluster stats
398                    if let Some(stats) = self.cluster_stats.get_mut(&cluster_id) {
399                        stats.size += 1;
400                        stats.end_date = Some(date);
401                        stats.anomaly_types.push(anomaly_type.to_string());
402                        if let Some(acct) = account {
403                            if !stats.accounts.contains(&acct.to_string()) {
404                                stats.accounts.push(acct.to_string());
405                            }
406                        }
407                        if let Some(ent) = entity {
408                            if !stats.entities.contains(&ent.to_string()) {
409                                stats.entities.push(ent.to_string());
410                            }
411                        }
412                    }
413
414                    return Some(cluster_id);
415                }
416            }
417
418            // End current cluster if at min size
419            if active.size >= self.config.min_cluster_size {
420                self.active_clusters.remove(&category);
421            }
422        }
423
424        // Decide whether to start a new cluster
425        if rng.gen::<f64>() < self.config.cluster_start_probability {
426            let cluster_id = format!("CLU{:06}", self.next_cluster_id);
427            self.next_cluster_id += 1;
428
429            let mut accounts = Vec::new();
430            let mut entities = Vec::new();
431            if let Some(acct) = account {
432                accounts.push(acct.to_string());
433            }
434            if let Some(ent) = entity {
435                entities.push(ent.to_string());
436            }
437
438            // Create new active cluster
439            self.active_clusters.insert(
440                category,
441                ActiveCluster {
442                    cluster_id: cluster_id.clone(),
443                    size: 1,
444                    start_date: date,
445                    category,
446                    time_window_days: time_window,
447                    accounts: accounts.clone(),
448                    entities: entities.clone(),
449                },
450            );
451
452            // Initialize cluster stats
453            self.cluster_stats.insert(
454                cluster_id.clone(),
455                ClusterStats {
456                    size: 1,
457                    start_date: Some(date),
458                    end_date: Some(date),
459                    anomaly_types: vec![anomaly_type.to_string()],
460                    fraud_category: Some(category),
461                    time_window_days: time_window,
462                    accounts,
463                    entities,
464                    causal_links: Vec::new(),
465                },
466            );
467
468            return Some(cluster_id);
469        }
470
471        None
472    }
473
474    /// Add a causal link to a cluster.
475    pub fn add_causal_link(&mut self, cluster_id: &str, link: CausalLink) {
476        if let Some(stats) = self.cluster_stats.get_mut(cluster_id) {
477            stats.causal_links.push(link);
478        }
479    }
480
481    /// Get suggested account for relationship preservation within a cluster.
482    pub fn get_related_account(&self, cluster_id: &str) -> Option<&str> {
483        self.cluster_stats
484            .get(cluster_id)
485            .and_then(|s| s.accounts.first().map(|a| a.as_str()))
486    }
487
488    /// Get suggested entity for relationship preservation within a cluster.
489    pub fn get_related_entity(&self, cluster_id: &str) -> Option<&str> {
490        self.cluster_stats
491            .get(cluster_id)
492            .and_then(|s| s.entities.first().map(|e| e.as_str()))
493    }
494
495    /// Gets cluster statistics.
496    pub fn get_cluster_stats(&self, cluster_id: &str) -> Option<&ClusterStats> {
497        self.cluster_stats.get(cluster_id)
498    }
499
500    /// Gets all cluster statistics.
501    pub fn all_cluster_stats(&self) -> &HashMap<String, ClusterStats> {
502        &self.cluster_stats
503    }
504
505    /// Returns the number of clusters created.
506    pub fn cluster_count(&self) -> usize {
507        self.cluster_stats.len()
508    }
509
510    /// Get cluster statistics by fraud category.
511    pub fn clusters_by_category(&self) -> HashMap<FraudCategory, Vec<&ClusterStats>> {
512        let mut by_category: HashMap<FraudCategory, Vec<&ClusterStats>> = HashMap::new();
513        for stats in self.cluster_stats.values() {
514            if let Some(cat) = stats.fraud_category {
515                by_category.entry(cat).or_default().push(stats);
516            }
517        }
518        by_category
519    }
520}
521
/// Entity targeting pattern.
///
/// Selects the strategy `EntityTargetingManager::select_entity` uses to pick
/// one entity out of a candidate list. `Random` is the default.
#[derive(Debug, Clone, Default)]
pub enum EntityTargetingPattern {
    /// Random entity selection (uniform over the candidates).
    #[default]
    Random,
    /// Weighted by transaction volume.
    ///
    /// NOTE(review): `select_entity` has no volume data and currently treats
    /// this like `Random`.
    VolumeWeighted,
    /// Focus on specific entity types.
    TypeFocused {
        /// Target entity types with weights.
        ///
        /// `select_entity` looks each candidate string up in this map.
        /// Candidates without an entry are left out of the weighted draw, and
        /// if no candidate has an entry the selection is uniformly random.
        type_weights: HashMap<String, f64>,
    },
    /// Repeat offender pattern (same entities).
    RepeatOffender {
        /// Probability of targeting same entity, i.e. of re-selecting a
        /// recently targeted entity instead of drawing a fresh candidate.
        repeat_probability: f64,
    },
}
541
/// Manages entity targeting for anomalies.
///
/// Applies an [`EntityTargetingPattern`] and remembers what was selected.
pub struct EntityTargetingManager {
    /// Strategy used to pick an entity.
    pattern: EntityTargetingPattern,
    /// Recently targeted entities, oldest first; the same entity may appear
    /// more than once.
    recent_targets: Vec<String>,
    /// Maximum recent targets to track (set to 20 by `new`).
    max_recent: usize,
    /// Entity hit counts (number of recorded selections per entity).
    hit_counts: HashMap<String, usize>,
}
552
553impl EntityTargetingManager {
554    /// Creates a new entity targeting manager.
555    pub fn new(pattern: EntityTargetingPattern) -> Self {
556        Self {
557            pattern,
558            recent_targets: Vec::new(),
559            max_recent: 20,
560            hit_counts: HashMap::new(),
561        }
562    }
563
564    /// Selects an entity to target.
565    pub fn select_entity<R: Rng>(&mut self, candidates: &[String], rng: &mut R) -> Option<String> {
566        if candidates.is_empty() {
567            return None;
568        }
569
570        let selected = match &self.pattern {
571            EntityTargetingPattern::Random => {
572                candidates[rng.gen_range(0..candidates.len())].clone()
573            }
574            EntityTargetingPattern::VolumeWeighted => {
575                // In practice, would weight by actual volume
576                // For now, use random
577                candidates[rng.gen_range(0..candidates.len())].clone()
578            }
579            EntityTargetingPattern::TypeFocused { type_weights } => {
580                // Filter by type weights
581                let weighted: Vec<_> = candidates
582                    .iter()
583                    .filter_map(|c| type_weights.get(c).map(|&w| (c.clone(), w)))
584                    .collect();
585
586                if weighted.is_empty() {
587                    candidates[rng.gen_range(0..candidates.len())].clone()
588                } else {
589                    let total: f64 = weighted.iter().map(|(_, w)| w).sum();
590                    let mut r = rng.gen::<f64>() * total;
591                    for (entity, weight) in &weighted {
592                        r -= weight;
593                        if r <= 0.0 {
594                            return Some(entity.clone());
595                        }
596                    }
597                    weighted[0].0.clone()
598                }
599            }
600            EntityTargetingPattern::RepeatOffender { repeat_probability } => {
601                // Check if we should repeat a recent target
602                if !self.recent_targets.is_empty() && rng.gen::<f64>() < *repeat_probability {
603                    let idx = rng.gen_range(0..self.recent_targets.len());
604                    self.recent_targets[idx].clone()
605                } else {
606                    candidates[rng.gen_range(0..candidates.len())].clone()
607                }
608            }
609        };
610
611        // Track the selection
612        self.recent_targets.push(selected.clone());
613        if self.recent_targets.len() > self.max_recent {
614            self.recent_targets.remove(0);
615        }
616
617        *self.hit_counts.entry(selected.clone()).or_insert(0) += 1;
618
619        Some(selected)
620    }
621
622    /// Gets hit count for an entity.
623    pub fn hit_count(&self, entity: &str) -> usize {
624        *self.hit_counts.get(entity).unwrap_or(&0)
625    }
626}
627
628/// Combined pattern configuration.
629#[derive(Debug, Clone)]
630pub struct AnomalyPatternConfig {
631    /// Temporal pattern.
632    pub temporal_pattern: TemporalPattern,
633    /// Clustering configuration.
634    pub clustering: ClusteringConfig,
635    /// Entity targeting pattern.
636    pub entity_targeting: EntityTargetingPattern,
637    /// Whether to inject anomalies in batches.
638    pub batch_injection: bool,
639    /// Batch size range.
640    pub batch_size_range: (usize, usize),
641}
642
643impl Default for AnomalyPatternConfig {
644    fn default() -> Self {
645        Self {
646            temporal_pattern: TemporalPattern::default(),
647            clustering: ClusteringConfig::default(),
648            entity_targeting: EntityTargetingPattern::default(),
649            batch_injection: false,
650            batch_size_range: (2, 5),
651        }
652    }
653}
654
655/// Determines if an anomaly should be injected at this point.
656pub fn should_inject_anomaly<R: Rng>(
657    base_rate: f64,
658    date: NaiveDate,
659    pattern: &TemporalPattern,
660    rng: &mut R,
661) -> bool {
662    let multiplier = pattern.probability_multiplier(date);
663    let adjusted_rate = (base_rate * multiplier).min(1.0);
664    rng.gen::<f64>() < adjusted_rate
665}
666
667// ============================================================================
668// Fraud Actor System - User-Based Fraud Targeting
669// ============================================================================
670
/// How a fraudster's amounts develop from one fraud to the next.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EscalationPattern {
    /// Amounts stay constant.
    Stable,
    /// Amounts grow slowly (typical embezzlement).
    Gradual,
    /// Amounts grow quickly (getting bolder).
    Aggressive,
    /// Amounts trend upward at a rate between `Gradual` and `Aggressive`.
    Erratic,
    /// A few small test amounts, one large strike, then nothing.
    TestThenStrike,
}

impl EscalationPattern {
    /// Returns the factor applied to the next fraud amount, given how many
    /// frauds were committed before it.
    ///
    /// * `Stable`: always 1.0.
    /// * `Gradual`: +10% per prior fraud, capped at 3.0.
    /// * `Aggressive`: +25% per prior fraud, capped at 5.0.
    /// * `Erratic`: +15% per prior fraud, capped at 4.0.
    /// * `TestThenStrike`: 0.3 for the first three frauds, 5.0 for the
    ///   fourth, 0.0 afterwards.
    pub fn escalation_multiplier(&self, prior_fraud_count: usize) -> f64 {
        let prior = prior_fraud_count as f64;
        // Linear growth from 1.0 by `step` per prior fraud, limited to `cap`.
        let linear = |step: f64, cap: f64| (1.0 + step * prior).min(cap);

        match self {
            Self::Stable => 1.0,
            Self::Gradual => linear(0.1, 3.0),
            Self::Aggressive => linear(0.25, 5.0),
            Self::Erratic => linear(0.15, 4.0),
            Self::TestThenStrike => match prior_fraud_count {
                0..=2 => 0.3, // testing the waters
                3 => 5.0,     // the strike
                _ => 0.0,     // nothing after the strike
            },
        }
    }
}
717
/// A fraud actor represents a user who commits fraud over time.
///
/// The history, total, start date, detection risk and preference lists are
/// maintained by `FraudActor::record_fraud`.
#[derive(Debug, Clone)]
pub struct FraudActor {
    /// User ID of the fraudster.
    pub user_id: String,
    /// User's name for display purposes.
    pub user_name: String,
    /// Fraud history (document IDs and dates), in the order incidents were recorded.
    pub fraud_history: Vec<FraudIncident>,
    /// Escalation pattern for this actor.
    pub escalation_pattern: EscalationPattern,
    /// Preferred GL accounts for fraud; accounts of recorded incidents are
    /// added here if not already present.
    pub preferred_accounts: Vec<String>,
    /// Preferred vendors (for AP fraud).
    ///
    /// NOTE(review): `record_fraud` adds the entity of every recorded
    /// incident to this list, whatever kind of entity it is.
    pub preferred_vendors: Vec<String>,
    /// Total amount of fraud committed (sum of all recorded incident amounts).
    pub total_amount: rust_decimal::Decimal,
    /// Start date of fraud activity (date of the first recorded incident).
    pub start_date: Option<NaiveDate>,
    /// Detection likelihood (0.0-1.0) - increases with activity; the
    /// computed value is capped at 0.95.
    pub detection_risk: f64,
    /// Is this actor currently active? (`new` sets this to `true`.)
    pub is_active: bool,
}
742
/// A single fraud incident committed by an actor.
///
/// Created by `FraudActor::record_fraud` and stored in the actor's
/// `fraud_history`.
#[derive(Debug, Clone)]
pub struct FraudIncident {
    /// Document ID of the fraudulent entry.
    pub document_id: String,
    /// Date of the fraud.
    pub date: NaiveDate,
    /// Amount of the fraud.
    pub amount: rust_decimal::Decimal,
    /// Fraud type.
    pub fraud_type: String,
    /// Account used, if one was given when the incident was recorded.
    pub account: Option<String>,
    /// Related entity (vendor, customer, etc.), if one was given.
    pub entity: Option<String>,
}
759
760impl FraudActor {
761    /// Create a new fraud actor.
762    pub fn new(
763        user_id: impl Into<String>,
764        user_name: impl Into<String>,
765        escalation_pattern: EscalationPattern,
766    ) -> Self {
767        Self {
768            user_id: user_id.into(),
769            user_name: user_name.into(),
770            fraud_history: Vec::new(),
771            escalation_pattern,
772            preferred_accounts: Vec::new(),
773            preferred_vendors: Vec::new(),
774            total_amount: rust_decimal::Decimal::ZERO,
775            start_date: None,
776            detection_risk: 0.0,
777            is_active: true,
778        }
779    }
780
781    /// Add a preferred account for fraud.
782    pub fn with_account(mut self, account: impl Into<String>) -> Self {
783        self.preferred_accounts.push(account.into());
784        self
785    }
786
787    /// Add a preferred vendor for fraud.
788    pub fn with_vendor(mut self, vendor: impl Into<String>) -> Self {
789        self.preferred_vendors.push(vendor.into());
790        self
791    }
792
793    /// Record a fraud incident.
794    pub fn record_fraud(
795        &mut self,
796        document_id: impl Into<String>,
797        date: NaiveDate,
798        amount: rust_decimal::Decimal,
799        fraud_type: impl Into<String>,
800        account: Option<String>,
801        entity: Option<String>,
802    ) {
803        let incident = FraudIncident {
804            document_id: document_id.into(),
805            date,
806            amount,
807            fraud_type: fraud_type.into(),
808            account: account.clone(),
809            entity: entity.clone(),
810        };
811
812        self.fraud_history.push(incident);
813        self.total_amount += amount;
814
815        if self.start_date.is_none() {
816            self.start_date = Some(date);
817        }
818
819        // Update detection risk based on activity
820        self.update_detection_risk();
821
822        // Add account/entity to preferences if not already present
823        if let Some(acct) = account {
824            if !self.preferred_accounts.contains(&acct) {
825                self.preferred_accounts.push(acct);
826            }
827        }
828        if let Some(ent) = entity {
829            if !self.preferred_vendors.contains(&ent) {
830                self.preferred_vendors.push(ent);
831            }
832        }
833    }
834
835    /// Update detection risk based on fraud activity.
836    fn update_detection_risk(&mut self) {
837        // Detection risk increases with:
838        // 1. Number of frauds committed
839        // 2. Total amount
840        // 3. How bold the escalation pattern is
841        let count_factor = (self.fraud_history.len() as f64 * 0.05).min(0.3);
842        let amount_factor = if self.total_amount > rust_decimal::Decimal::from(100_000) {
843            0.3
844        } else if self.total_amount > rust_decimal::Decimal::from(10_000) {
845            0.2
846        } else {
847            0.1
848        };
849        let pattern_factor = match self.escalation_pattern {
850            EscalationPattern::Stable => 0.1,
851            EscalationPattern::Gradual => 0.15,
852            EscalationPattern::Erratic => 0.2,
853            EscalationPattern::Aggressive => 0.25,
854            EscalationPattern::TestThenStrike => 0.3,
855        };
856
857        self.detection_risk = (count_factor + amount_factor + pattern_factor).min(0.95);
858    }
859
860    /// Get the escalation multiplier for the next fraud.
861    pub fn next_escalation_multiplier(&self) -> f64 {
862        self.escalation_pattern
863            .escalation_multiplier(self.fraud_history.len())
864    }
865
866    /// Get a preferred account, or None if no preferences.
867    pub fn get_preferred_account<R: Rng>(&self, rng: &mut R) -> Option<&str> {
868        if self.preferred_accounts.is_empty() {
869            None
870        } else {
871            Some(&self.preferred_accounts[rng.gen_range(0..self.preferred_accounts.len())])
872        }
873    }
874
875    /// Get a preferred vendor, or None if no preferences.
876    pub fn get_preferred_vendor<R: Rng>(&self, rng: &mut R) -> Option<&str> {
877        if self.preferred_vendors.is_empty() {
878            None
879        } else {
880            Some(&self.preferred_vendors[rng.gen_range(0..self.preferred_vendors.len())])
881        }
882    }
883}
884
/// Manages fraud actors for user-based fraud targeting.
///
/// Actors are kept in a vector and located by user ID through an index map,
/// so a user ID resolves to at most one actor. Detection only flips an
/// actor's `is_active` flag; actors are never removed from the manager.
pub struct FraudActorManager {
    /// All fraud actors, active or not, in insertion order.
    actors: Vec<FraudActor>,
    /// Map from user_id to the actor's position in `actors`.
    user_index: HashMap<String, usize>,
    /// Probability of reusing an existing active actor vs creating a new one.
    repeat_actor_probability: f64,
    /// Maximum active actors at any time.
    max_active_actors: usize,
}
896
897impl FraudActorManager {
898    /// Create a new fraud actor manager.
899    pub fn new(repeat_actor_probability: f64, max_active_actors: usize) -> Self {
900        Self {
901            actors: Vec::new(),
902            user_index: HashMap::new(),
903            repeat_actor_probability,
904            max_active_actors,
905        }
906    }
907
908    /// Add a fraud actor.
909    pub fn add_actor(&mut self, actor: FraudActor) {
910        let idx = self.actors.len();
911        self.user_index.insert(actor.user_id.clone(), idx);
912        self.actors.push(actor);
913    }
914
915    /// Get or create a fraud actor for the next fraud.
916    pub fn get_or_create_actor<R: Rng>(
917        &mut self,
918        available_users: &[String],
919        rng: &mut R,
920    ) -> Option<&mut FraudActor> {
921        if available_users.is_empty() {
922            return None;
923        }
924
925        // Check if we should use an existing active actor
926        let active_actors: Vec<usize> = self
927            .actors
928            .iter()
929            .enumerate()
930            .filter(|(_, a)| a.is_active)
931            .map(|(i, _)| i)
932            .collect();
933
934        if !active_actors.is_empty() && rng.gen::<f64>() < self.repeat_actor_probability {
935            // Use existing actor
936            let idx = active_actors[rng.gen_range(0..active_actors.len())];
937            return Some(&mut self.actors[idx]);
938        }
939
940        // Create new actor if under max
941        if self.actors.len() < self.max_active_actors {
942            // Pick a random user
943            let user_id = &available_users[rng.gen_range(0..available_users.len())];
944
945            // Check if user already has an actor
946            if let Some(&idx) = self.user_index.get(user_id) {
947                return Some(&mut self.actors[idx]);
948            }
949
950            // Create new actor with random escalation pattern
951            let pattern = match rng.gen_range(0..5) {
952                0 => EscalationPattern::Stable,
953                1 => EscalationPattern::Gradual,
954                2 => EscalationPattern::Aggressive,
955                3 => EscalationPattern::Erratic,
956                _ => EscalationPattern::TestThenStrike,
957            };
958
959            let actor = FraudActor::new(user_id.clone(), format!("Fraudster {}", user_id), pattern);
960            let idx = self.actors.len();
961            self.user_index.insert(user_id.clone(), idx);
962            self.actors.push(actor);
963            return Some(&mut self.actors[idx]);
964        }
965
966        // Use random existing actor
967        if !self.actors.is_empty() {
968            let idx = rng.gen_range(0..self.actors.len());
969            return Some(&mut self.actors[idx]);
970        }
971
972        None
973    }
974
975    /// Get an actor by user ID.
976    pub fn get_actor(&self, user_id: &str) -> Option<&FraudActor> {
977        self.user_index.get(user_id).map(|&i| &self.actors[i])
978    }
979
980    /// Get a mutable actor by user ID.
981    pub fn get_actor_mut(&mut self, user_id: &str) -> Option<&mut FraudActor> {
982        if let Some(&idx) = self.user_index.get(user_id) {
983            Some(&mut self.actors[idx])
984        } else {
985            None
986        }
987    }
988
989    /// Deactivate actors who have high detection risk.
990    pub fn apply_detection<R: Rng>(&mut self, rng: &mut R) {
991        for actor in &mut self.actors {
992            if actor.is_active && rng.gen::<f64>() < actor.detection_risk {
993                actor.is_active = false;
994            }
995        }
996    }
997
998    /// Get all actors.
999    pub fn all_actors(&self) -> &[FraudActor] {
1000        &self.actors
1001    }
1002
1003    /// Get summary statistics.
1004    pub fn get_statistics(&self) -> FraudActorStatistics {
1005        let total_actors = self.actors.len();
1006        let active_actors = self.actors.iter().filter(|a| a.is_active).count();
1007        let total_incidents: usize = self.actors.iter().map(|a| a.fraud_history.len()).sum();
1008        let total_amount: rust_decimal::Decimal = self.actors.iter().map(|a| a.total_amount).sum();
1009
1010        FraudActorStatistics {
1011            total_actors,
1012            active_actors,
1013            total_incidents,
1014            total_amount,
1015        }
1016    }
1017}
1018
/// Statistics about fraud actors.
///
/// A point-in-time summary as returned by `FraudActorManager::get_statistics`;
/// totals cover every actor held by the manager, active or not.
#[derive(Debug, Clone)]
pub struct FraudActorStatistics {
    /// Total number of fraud actors.
    pub total_actors: usize,
    /// Number of currently active actors.
    pub active_actors: usize,
    /// Total fraud incidents across all actors (sum of their history lengths).
    pub total_incidents: usize,
    /// Total fraud amount across all actors.
    pub total_amount: rust_decimal::Decimal,
}
1031
#[cfg(test)]
mod tests {
    //! Unit tests for temporal patterns, clustering, escalation patterns and
    //! fraud actors. Randomized tests use a fixed ChaCha8 seed so that their
    //! outcomes are reproducible.

    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// The default pattern leaves ordinary days alone and boosts period ends.
    #[test]
    fn test_temporal_pattern_multiplier() {
        let pattern = TemporalPattern::default();

        // Regular day
        let regular = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
        assert_eq!(pattern.probability_multiplier(regular), 1.0);

        // Month end
        let month_end = NaiveDate::from_ymd_opt(2024, 6, 30).unwrap();
        assert!(pattern.probability_multiplier(month_end) > 1.0);

        // Year end
        let year_end = NaiveDate::from_ymd_opt(2024, 12, 31).unwrap();
        assert!(
            pattern.probability_multiplier(year_end) > pattern.probability_multiplier(month_end)
        );
    }

    /// Anomalies that fall close together in time end up in clusters.
    #[test]
    fn test_cluster_manager() {
        let mut manager = ClusterManager::new(ClusteringConfig::default());
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();

        // Generate several anomalies and check clustering
        let mut clustered = 0;
        for i in 0..20 {
            let d = date + chrono::Duration::days(i % 7); // Within time window
            if manager.assign_cluster(d, "TestType", &mut rng).is_some() {
                clustered += 1;
            }
        }

        // Some should be clustered
        assert!(clustered > 0);
        assert!(manager.cluster_count() > 0);
    }

    /// Category-specific time windows differ from the general one.
    #[test]
    fn test_fraud_category_time_windows() {
        // AR fraud should have longer window than general
        let ar = FraudCategory::AccountsReceivable;
        let general = FraudCategory::General;

        let (ar_min, ar_max) = ar.time_window_days();
        let (gen_min, gen_max) = general.time_window_days();

        assert!(ar_min > gen_min);
        assert!(ar_max > gen_max);
    }

    /// Anomaly type names map to the expected fraud category.
    #[test]
    fn test_fraud_category_inference() {
        assert_eq!(
            FraudCategory::from_anomaly_type("AccountsReceivable"),
            FraudCategory::AccountsReceivable
        );
        assert_eq!(
            FraudCategory::from_anomaly_type("VendorPayment"),
            FraudCategory::AccountsPayable
        );
        assert_eq!(
            FraudCategory::from_anomaly_type("GhostEmployee"),
            FraudCategory::Payroll
        );
        // Unrecognized names fall back to the general category.
        assert_eq!(
            FraudCategory::from_anomaly_type("RandomType"),
            FraudCategory::General
        );
    }

    /// Anomalies sharing an account join the same cluster and accumulate
    /// their accounts and entities in the cluster statistics.
    #[test]
    fn test_cluster_with_context() {
        let mut manager = ClusterManager::new(ClusteringConfig {
            cluster_start_probability: 1.0,        // Always start
            cluster_continuation_probability: 1.0, // Always continue
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();

        // First anomaly starts a cluster
        let cluster1 = manager.assign_cluster_with_context(
            date,
            "VendorPayment",
            Some("200000"),
            Some("V001"),
            &mut rng,
        );
        assert!(cluster1.is_some());

        // Second anomaly with same account should join same cluster
        let cluster2 = manager.assign_cluster_with_context(
            date + chrono::Duration::days(5),
            "VendorPayment",
            Some("200000"),
            Some("V002"),
            &mut rng,
        );

        assert_eq!(cluster1, cluster2);

        // Check stats have both entities
        let stats = manager.get_cluster_stats(&cluster1.unwrap()).unwrap();
        assert_eq!(stats.accounts.len(), 1); // Same account
        assert_eq!(stats.entities.len(), 2); // Two vendors
    }

    /// Causal links added to a cluster show up in its statistics.
    #[test]
    fn test_causal_links() {
        let mut manager = ClusterManager::new(ClusteringConfig {
            cluster_start_probability: 1.0,
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();

        let cluster_id = manager
            .assign_cluster(date, "VendorPayment", &mut rng)
            .unwrap();

        // Add causal link
        manager.add_causal_link(
            &cluster_id,
            CausalLink::new("PAY-001", "Payment", "V001", "Vendor", "references"),
        );
        manager.add_causal_link(
            &cluster_id,
            CausalLink::new("V001", "Vendor", "EMP-001", "Employee", "owned_by"),
        );

        let stats = manager.get_cluster_stats(&cluster_id).unwrap();
        assert_eq!(stats.causal_links.len(), 2);
    }

    /// The temporal multiplier raises the injection rate at year end.
    #[test]
    fn test_should_inject_anomaly() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let pattern = TemporalPattern::default();

        let regular_date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
        let year_end = NaiveDate::from_ymd_opt(2024, 12, 31).unwrap();

        // Count injections over many trials
        let mut regular_count = 0;
        let mut year_end_count = 0;

        for _ in 0..1000 {
            if should_inject_anomaly(0.1, regular_date, &pattern, &mut rng) {
                regular_count += 1;
            }
            if should_inject_anomaly(0.1, year_end, &pattern, &mut rng) {
                year_end_count += 1;
            }
        }

        // Year end should have more injections due to multiplier
        assert!(year_end_count > regular_count);
    }

    /// Each escalation pattern produces its characteristic multiplier curve.
    #[test]
    fn test_escalation_patterns() {
        // Stable should always return 1.0
        assert_eq!(EscalationPattern::Stable.escalation_multiplier(0), 1.0);
        assert_eq!(EscalationPattern::Stable.escalation_multiplier(10), 1.0);

        // Gradual should increase over time
        let gradual = EscalationPattern::Gradual;
        assert!(gradual.escalation_multiplier(5) > gradual.escalation_multiplier(0));
        assert!(gradual.escalation_multiplier(5) <= 3.0); // Max is 3x

        // Aggressive should increase faster
        let aggressive = EscalationPattern::Aggressive;
        assert!(aggressive.escalation_multiplier(5) > gradual.escalation_multiplier(5));

        // TestThenStrike has specific pattern
        let tts = EscalationPattern::TestThenStrike;
        assert!(tts.escalation_multiplier(0) < 1.0); // Small test amounts
        assert!(tts.escalation_multiplier(3) > 1.0); // Big strike
        assert_eq!(tts.escalation_multiplier(4), 0.0); // Stop after strike
    }

    /// Recording a fraud updates history, totals, start date, risk and
    /// the actor's preferences.
    #[test]
    fn test_fraud_actor() {
        use rust_decimal_macros::dec;

        let mut actor = FraudActor::new("USER001", "John Fraudster", EscalationPattern::Gradual)
            .with_account("600000")
            .with_vendor("V001");

        assert_eq!(actor.preferred_accounts.len(), 1);
        assert_eq!(actor.preferred_vendors.len(), 1);
        assert!(actor.is_active);

        // Record some fraud
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
        actor.record_fraud(
            "JE-001",
            date,
            dec!(1000),
            "DuplicatePayment",
            Some("600000".to_string()),
            Some("V002".to_string()),
        );

        assert_eq!(actor.fraud_history.len(), 1);
        assert_eq!(actor.total_amount, dec!(1000));
        assert_eq!(actor.start_date, Some(date));
        assert!(actor.detection_risk > 0.0);

        // V002 should be added to preferences
        assert!(actor.preferred_vendors.contains(&"V002".to_string()));
    }

    /// The manager creates an actor on demand, indexes it by user ID and
    /// reflects its activity in the statistics.
    #[test]
    fn test_fraud_actor_manager() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut manager = FraudActorManager::new(0.7, 5);

        let users = vec![
            "USER001".to_string(),
            "USER002".to_string(),
            "USER003".to_string(),
        ];

        // Get or create actor
        let actor = manager.get_or_create_actor(&users, &mut rng);
        assert!(actor.is_some());

        // Record fraud
        let actor = actor.unwrap();
        let user_id = actor.user_id.clone();
        actor.record_fraud(
            "JE-001",
            NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(),
            rust_decimal::Decimal::from(1000),
            "FictitiousEntry",
            None,
            None,
        );

        // Should be able to retrieve actor
        let retrieved = manager.get_actor(&user_id);
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().fraud_history.len(), 1);

        // Get statistics
        let stats = manager.get_statistics();
        assert_eq!(stats.total_actors, 1);
        assert_eq!(stats.active_actors, 1);
        assert_eq!(stats.total_incidents, 1);
    }

    /// A high-risk actor is deactivated by repeated detection rounds.
    #[test]
    fn test_fraud_actor_detection() {
        use rust_decimal_macros::dec;

        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut manager = FraudActorManager::new(1.0, 10);

        // Add actor with high activity
        let mut actor =
            FraudActor::new("USER001", "Heavy Fraudster", EscalationPattern::Aggressive);
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();

        // Record many frauds to increase detection risk
        for i in 0..10 {
            actor.record_fraud(
                format!("JE-{:03}", i),
                date + chrono::Duration::days(i as i64),
                dec!(10000),
                "FictitiousEntry",
                None,
                None,
            );
        }

        manager.add_actor(actor);

        // Detection risk should be high
        let actor = manager.get_actor("USER001").unwrap();
        assert!(actor.detection_risk > 0.5);

        // Apply detection repeatedly
        for _ in 0..20 {
            manager.apply_detection(&mut rng);
        }

        // With a per-round risk above 0.5, the chance of surviving all 20
        // rounds is below 2^-20, and the fixed seed makes the outcome
        // reproducible, so the actor must have been caught.
        let stats = manager.get_statistics();
        assert_eq!(stats.total_actors, 1);
        assert_eq!(stats.active_actors, 0);
        assert!(!manager.get_actor("USER001").unwrap().is_active);
    }
}
datasynth_generators/anomaly/patterns.rs

datasynth_generators/anomaly/
patterns.rs