datasynth_generators/anomaly/
injector.rs

1//! Main anomaly injection engine.
2//!
3//! The injector coordinates anomaly generation across all data types,
4//! managing rates, patterns, clustering, and label generation.
5//!
6//! ## Enhanced Features (v0.3.0+)
7//!
8//! - **Multi-stage fraud schemes**: Embezzlement, revenue manipulation, kickbacks
9//! - **Correlated injection**: Co-occurrence patterns and error cascades
10//! - **Near-miss generation**: Suspicious but legitimate transactions
11//! - **Detection difficulty classification**: Trivial to expert levels
12//! - **Context-aware injection**: Entity-specific anomaly patterns
13
14use chrono::NaiveDate;
15use rand::Rng;
16use rand::SeedableRng;
17use rand_chacha::ChaCha8Rng;
18use rust_decimal::Decimal;
19use std::collections::HashMap;
20
21use datasynth_core::models::{
22    AnomalyCausalReason, AnomalyDetectionDifficulty, AnomalyRateConfig, AnomalySummary,
23    AnomalyType, ErrorType, FraudType, JournalEntry, LabeledAnomaly, NearMissLabel,
24    RelationalAnomalyType,
25};
26
27use super::context::{BehavioralBaseline, BehavioralBaselineConfig, EntityAwareInjector};
28use super::correlation::{AnomalyCoOccurrence, TemporalClusterGenerator};
29use super::difficulty::DifficultyCalculator;
30use super::near_miss::{NearMissConfig, NearMissGenerator};
31use super::patterns::{
32    should_inject_anomaly, AnomalyPatternConfig, ClusterManager, EntityTargetingManager,
33    TemporalPattern,
34};
35use super::scheme_advancer::{SchemeAdvancer, SchemeAdvancerConfig};
36use super::schemes::{SchemeAction, SchemeContext};
37use super::strategies::{DuplicationStrategy, StrategyCollection};
38use super::types::AnomalyTypeSelector;
39
40/// Configuration for the anomaly injector.
41#[derive(Debug, Clone)]
42pub struct AnomalyInjectorConfig {
43    /// Rate configuration.
44    pub rates: AnomalyRateConfig,
45    /// Pattern configuration.
46    pub patterns: AnomalyPatternConfig,
47    /// Random seed for reproducibility.
48    pub seed: u64,
49    /// Whether to generate labels.
50    pub generate_labels: bool,
51    /// Whether to allow duplicate injection.
52    pub allow_duplicates: bool,
53    /// Maximum anomalies per document.
54    pub max_anomalies_per_document: usize,
55    /// Company codes to target (empty = all).
56    pub target_companies: Vec<String>,
57    /// Date range for injection.
58    pub date_range: Option<(NaiveDate, NaiveDate)>,
59    /// Enhanced features configuration.
60    pub enhanced: EnhancedInjectionConfig,
61}
62
63/// Enhanced injection configuration for v0.3.0+ features.
64#[derive(Debug, Clone, Default)]
65pub struct EnhancedInjectionConfig {
66    /// Enable multi-stage fraud scheme generation.
67    pub multi_stage_schemes_enabled: bool,
68    /// Probability of starting a new scheme per perpetrator per year.
69    pub scheme_probability: f64,
70    /// Enable correlated anomaly injection.
71    pub correlated_injection_enabled: bool,
72    /// Enable temporal clustering (period-end spikes).
73    pub temporal_clustering_enabled: bool,
74    /// Period-end anomaly rate multiplier.
75    pub period_end_multiplier: f64,
76    /// Enable near-miss generation.
77    pub near_miss_enabled: bool,
78    /// Proportion of anomalies that are near-misses.
79    pub near_miss_proportion: f64,
80    /// Approval thresholds for threshold-proximity near-misses.
81    pub approval_thresholds: Vec<Decimal>,
82    /// Enable detection difficulty classification.
83    pub difficulty_classification_enabled: bool,
84    /// Enable context-aware injection.
85    pub context_aware_enabled: bool,
86    /// Behavioral baseline configuration.
87    pub behavioral_baseline_config: BehavioralBaselineConfig,
88}
89
90impl Default for AnomalyInjectorConfig {
91    fn default() -> Self {
92        Self {
93            rates: AnomalyRateConfig::default(),
94            patterns: AnomalyPatternConfig::default(),
95            seed: 42,
96            generate_labels: true,
97            allow_duplicates: true,
98            max_anomalies_per_document: 2,
99            target_companies: Vec::new(),
100            date_range: None,
101            enhanced: EnhancedInjectionConfig::default(),
102        }
103    }
104}
105
106/// Result of an injection batch.
107#[derive(Debug, Clone)]
108pub struct InjectionBatchResult {
109    /// Number of entries processed.
110    pub entries_processed: usize,
111    /// Number of anomalies injected.
112    pub anomalies_injected: usize,
113    /// Number of duplicates created.
114    pub duplicates_created: usize,
115    /// Labels generated.
116    pub labels: Vec<LabeledAnomaly>,
117    /// Summary of anomalies.
118    pub summary: AnomalySummary,
119    /// Entries that were modified (document numbers).
120    pub modified_documents: Vec<String>,
121    /// Near-miss labels (suspicious but legitimate transactions).
122    pub near_miss_labels: Vec<NearMissLabel>,
123    /// Multi-stage scheme actions generated.
124    pub scheme_actions: Vec<SchemeAction>,
125    /// Difficulty distribution summary.
126    pub difficulty_distribution: HashMap<AnomalyDetectionDifficulty, usize>,
127}
128
129/// Main anomaly injection engine.
130#[allow(dead_code)]
131pub struct AnomalyInjector {
132    config: AnomalyInjectorConfig,
133    rng: ChaCha8Rng,
134    type_selector: AnomalyTypeSelector,
135    strategies: StrategyCollection,
136    cluster_manager: ClusterManager,
137    entity_targeting: EntityTargetingManager,
138    /// Tracking which documents already have anomalies.
139    document_anomaly_counts: HashMap<String, usize>,
140    /// All generated labels.
141    labels: Vec<LabeledAnomaly>,
142    /// Statistics.
143    stats: InjectorStats,
144    // Enhanced components (v0.3.0+)
145    /// Multi-stage fraud scheme advancer.
146    scheme_advancer: Option<SchemeAdvancer>,
147    /// Near-miss generator.
148    near_miss_generator: Option<NearMissGenerator>,
149    /// Near-miss labels generated.
150    near_miss_labels: Vec<NearMissLabel>,
151    /// Co-occurrence pattern handler.
152    co_occurrence_handler: Option<AnomalyCoOccurrence>,
153    /// Temporal cluster generator.
154    temporal_cluster_generator: Option<TemporalClusterGenerator>,
155    /// Difficulty calculator.
156    difficulty_calculator: Option<DifficultyCalculator>,
157    /// Entity-aware injector.
158    entity_aware_injector: Option<EntityAwareInjector>,
159    /// Behavioral baseline tracker.
160    behavioral_baseline: Option<BehavioralBaseline>,
161    /// Scheme actions generated.
162    scheme_actions: Vec<SchemeAction>,
163    /// Difficulty distribution.
164    difficulty_distribution: HashMap<AnomalyDetectionDifficulty, usize>,
165}
166
/// Running counters kept by the injector. All fields are private; the type is exposed
/// read-only through [`AnomalyInjector::get_stats`].
#[derive(Debug, Clone, Default)]
// The counters are written by the injector but have no accessors in the visible code.
#[allow(dead_code)]
pub struct InjectorStats {
    /// Entries seen by `process_entries`.
    total_processed: usize,
    /// Anomalies injected and labelled.
    total_injected: usize,
    /// Applied anomalies keyed by category name.
    by_category: HashMap<String, usize>,
    /// Applied anomalies keyed by anomaly type name.
    by_type: HashMap<String, usize>,
    /// Applied anomalies keyed by company code.
    by_company: HashMap<String, usize>,
    /// Counter reserved for entries passed over by the rate check.
    skipped_rate: usize,
    /// Entries skipped because their posting date fell outside the configured range.
    skipped_date: usize,
    /// Entries skipped because their company is not among the targeted companies.
    skipped_company: usize,
    /// Entries skipped because their document already reached the per-document maximum.
    skipped_max_per_doc: usize,
}
181
182impl AnomalyInjector {
183    /// Creates a new anomaly injector.
184    pub fn new(config: AnomalyInjectorConfig) -> Self {
185        let mut rng = ChaCha8Rng::seed_from_u64(config.seed);
186        let cluster_manager = ClusterManager::new(config.patterns.clustering.clone());
187        let entity_targeting =
188            EntityTargetingManager::new(config.patterns.entity_targeting.clone());
189
190        // Initialize enhanced components based on configuration
191        let scheme_advancer = if config.enhanced.multi_stage_schemes_enabled {
192            let scheme_config = SchemeAdvancerConfig {
193                embezzlement_probability: config.enhanced.scheme_probability,
194                revenue_manipulation_probability: config.enhanced.scheme_probability * 0.5,
195                kickback_probability: config.enhanced.scheme_probability * 0.5,
196                seed: rng.gen(),
197                ..Default::default()
198            };
199            Some(SchemeAdvancer::new(scheme_config))
200        } else {
201            None
202        };
203
204        let near_miss_generator = if config.enhanced.near_miss_enabled {
205            let near_miss_config = NearMissConfig {
206                proportion: config.enhanced.near_miss_proportion,
207                seed: rng.gen(),
208                ..Default::default()
209            };
210            Some(NearMissGenerator::new(near_miss_config))
211        } else {
212            None
213        };
214
215        let co_occurrence_handler = if config.enhanced.correlated_injection_enabled {
216            Some(AnomalyCoOccurrence::new())
217        } else {
218            None
219        };
220
221        let temporal_cluster_generator = if config.enhanced.temporal_clustering_enabled {
222            Some(TemporalClusterGenerator::new())
223        } else {
224            None
225        };
226
227        let difficulty_calculator = if config.enhanced.difficulty_classification_enabled {
228            Some(DifficultyCalculator::new())
229        } else {
230            None
231        };
232
233        let entity_aware_injector = if config.enhanced.context_aware_enabled {
234            Some(EntityAwareInjector::default())
235        } else {
236            None
237        };
238
239        let behavioral_baseline = if config.enhanced.context_aware_enabled
240            && config.enhanced.behavioral_baseline_config.enabled
241        {
242            Some(BehavioralBaseline::new(
243                config.enhanced.behavioral_baseline_config.clone(),
244            ))
245        } else {
246            None
247        };
248
249        Self {
250            config,
251            rng,
252            type_selector: AnomalyTypeSelector::new(),
253            strategies: StrategyCollection::default(),
254            cluster_manager,
255            entity_targeting,
256            document_anomaly_counts: HashMap::new(),
257            labels: Vec::new(),
258            stats: InjectorStats::default(),
259            scheme_advancer,
260            near_miss_generator,
261            near_miss_labels: Vec::new(),
262            co_occurrence_handler,
263            temporal_cluster_generator,
264            difficulty_calculator,
265            entity_aware_injector,
266            behavioral_baseline,
267            scheme_actions: Vec::new(),
268            difficulty_distribution: HashMap::new(),
269        }
270    }
271
272    /// Processes a batch of journal entries, potentially injecting anomalies.
273    pub fn process_entries(&mut self, entries: &mut [JournalEntry]) -> InjectionBatchResult {
274        let mut modified_documents = Vec::new();
275        let mut duplicates = Vec::new();
276
277        for entry in entries.iter_mut() {
278            self.stats.total_processed += 1;
279
280            // Update behavioral baseline if enabled
281            if let Some(ref mut baseline) = self.behavioral_baseline {
282                use super::context::Observation;
283                // Record the observation for baseline building
284                let entity_id = entry.header.created_by.clone();
285                let observation =
286                    Observation::new(entry.posting_date()).with_amount(entry.total_debit());
287                baseline.record_observation(&entity_id, observation);
288            }
289
290            // Check if we should process this entry
291            if !self.should_process(entry) {
292                continue;
293            }
294
295            // Calculate effective rate (temporal clustering is applied later per-type)
296            let effective_rate = self.config.rates.total_rate;
297
298            // Calculate entity-aware rate adjustment
299            if let Some(ref injector) = self.entity_aware_injector {
300                // TODO: Would need entity context to adjust rate here
301                // For now, use default rate
302                let _ = injector;
303            }
304
305            // Determine if we inject an anomaly
306            if should_inject_anomaly(
307                effective_rate,
308                entry.posting_date(),
309                &self.config.patterns.temporal_pattern,
310                &mut self.rng,
311            ) {
312                // Check if this should be a near-miss instead
313                if let Some(ref mut near_miss_gen) = self.near_miss_generator {
314                    // Record the transaction for near-duplicate detection
315                    let account = entry
316                        .lines
317                        .first()
318                        .map(|l| l.gl_account.clone())
319                        .unwrap_or_default();
320                    near_miss_gen.record_transaction(
321                        entry.document_number().clone(),
322                        entry.posting_date(),
323                        entry.total_debit(),
324                        &account,
325                        None,
326                    );
327
328                    // Check if this could be a near-miss
329                    if let Some(near_miss_label) = near_miss_gen.check_near_miss(
330                        entry.document_number().clone(),
331                        entry.posting_date(),
332                        entry.total_debit(),
333                        &account,
334                        None,
335                        &self.config.enhanced.approval_thresholds,
336                    ) {
337                        self.near_miss_labels.push(near_miss_label);
338                        continue; // Skip actual anomaly injection
339                    }
340                }
341
342                // Select anomaly category based on rates
343                let anomaly_type = self.select_anomaly_category();
344
345                // Apply the anomaly
346                if let Some(mut label) = self.inject_anomaly(entry, anomaly_type) {
347                    // Calculate detection difficulty if enabled
348                    if let Some(ref calculator) = self.difficulty_calculator {
349                        let difficulty = calculator.calculate(&label);
350
351                        // Store difficulty in metadata
352                        label = label
353                            .with_metadata("detection_difficulty", &format!("{:?}", difficulty));
354                        label = label.with_metadata(
355                            "difficulty_score",
356                            &difficulty.difficulty_score().to_string(),
357                        );
358
359                        // Update difficulty distribution
360                        *self.difficulty_distribution.entry(difficulty).or_insert(0) += 1;
361                    }
362
363                    modified_documents.push(entry.document_number().clone());
364                    self.labels.push(label);
365                    self.stats.total_injected += 1;
366                }
367
368                // Check for duplicate injection
369                if self.config.allow_duplicates
370                    && matches!(
371                        self.labels.last().map(|l| &l.anomaly_type),
372                        Some(AnomalyType::Error(ErrorType::DuplicateEntry))
373                            | Some(AnomalyType::Fraud(FraudType::DuplicatePayment))
374                    )
375                {
376                    let dup_strategy = DuplicationStrategy::default();
377                    let duplicate = dup_strategy.duplicate(entry, &mut self.rng);
378                    duplicates.push(duplicate);
379                }
380            }
381        }
382
383        // Count duplicates
384        let duplicates_created = duplicates.len();
385
386        // Build summary
387        let summary = AnomalySummary::from_anomalies(&self.labels);
388
389        InjectionBatchResult {
390            entries_processed: self.stats.total_processed,
391            anomalies_injected: self.stats.total_injected,
392            duplicates_created,
393            labels: self.labels.clone(),
394            summary,
395            modified_documents,
396            near_miss_labels: self.near_miss_labels.clone(),
397            scheme_actions: self.scheme_actions.clone(),
398            difficulty_distribution: self.difficulty_distribution.clone(),
399        }
400    }
401
402    /// Checks if an entry should be processed.
403    fn should_process(&mut self, entry: &JournalEntry) -> bool {
404        // Check company filter
405        if !self.config.target_companies.is_empty()
406            && !self
407                .config
408                .target_companies
409                .iter()
410                .any(|c| c == entry.company_code())
411        {
412            self.stats.skipped_company += 1;
413            return false;
414        }
415
416        // Check date range
417        if let Some((start, end)) = self.config.date_range {
418            if entry.posting_date() < start || entry.posting_date() > end {
419                self.stats.skipped_date += 1;
420                return false;
421            }
422        }
423
424        // Check max anomalies per document
425        let current_count = self
426            .document_anomaly_counts
427            .get(&entry.document_number())
428            .copied()
429            .unwrap_or(0);
430        if current_count >= self.config.max_anomalies_per_document {
431            self.stats.skipped_max_per_doc += 1;
432            return false;
433        }
434
435        true
436    }
437
438    /// Selects an anomaly category based on configured rates.
439    fn select_anomaly_category(&mut self) -> AnomalyType {
440        let r = self.rng.gen::<f64>();
441        let rates = &self.config.rates;
442
443        let mut cumulative = 0.0;
444
445        cumulative += rates.fraud_rate;
446        if r < cumulative {
447            return self.type_selector.select_fraud(&mut self.rng);
448        }
449
450        cumulative += rates.error_rate;
451        if r < cumulative {
452            return self.type_selector.select_error(&mut self.rng);
453        }
454
455        cumulative += rates.process_issue_rate;
456        if r < cumulative {
457            return self.type_selector.select_process_issue(&mut self.rng);
458        }
459
460        cumulative += rates.statistical_rate;
461        if r < cumulative {
462            return self.type_selector.select_statistical(&mut self.rng);
463        }
464
465        self.type_selector.select_relational(&mut self.rng)
466    }
467
468    /// Injects an anomaly into an entry.
469    fn inject_anomaly(
470        &mut self,
471        entry: &mut JournalEntry,
472        anomaly_type: AnomalyType,
473    ) -> Option<LabeledAnomaly> {
474        // Check if strategy can be applied
475        if !self.strategies.can_apply(entry, &anomaly_type) {
476            return None;
477        }
478
479        // Apply the strategy
480        let result = self
481            .strategies
482            .apply_strategy(entry, &anomaly_type, &mut self.rng);
483
484        if !result.success {
485            return None;
486        }
487
488        // Update document anomaly count
489        *self
490            .document_anomaly_counts
491            .entry(entry.document_number().clone())
492            .or_insert(0) += 1;
493
494        // Update statistics
495        let category = anomaly_type.category().to_string();
496        let type_name = anomaly_type.type_name();
497
498        *self.stats.by_category.entry(category).or_insert(0) += 1;
499        *self.stats.by_type.entry(type_name.clone()).or_insert(0) += 1;
500        *self
501            .stats
502            .by_company
503            .entry(entry.company_code().to_string())
504            .or_insert(0) += 1;
505
506        // Generate label
507        if self.config.generate_labels {
508            let anomaly_id = format!("ANO{:08}", self.labels.len() + 1);
509
510            // Update entry header with anomaly tracking fields
511            entry.header.is_anomaly = true;
512            entry.header.anomaly_id = Some(anomaly_id.clone());
513            entry.header.anomaly_type = Some(type_name.clone());
514
515            // Also set fraud flag if this is a fraud anomaly
516            if matches!(anomaly_type, AnomalyType::Fraud(_)) {
517                entry.header.is_fraud = true;
518                if let AnomalyType::Fraud(ref ft) = anomaly_type {
519                    entry.header.fraud_type = Some(*ft);
520                }
521            }
522
523            let mut label = LabeledAnomaly::new(
524                anomaly_id,
525                anomaly_type.clone(),
526                entry.document_number().clone(),
527                "JE".to_string(),
528                entry.company_code().to_string(),
529                entry.posting_date(),
530            )
531            .with_description(&result.description)
532            .with_injection_strategy(&type_name);
533
534            // Add causal reason with injection context (provenance tracking)
535            let causal_reason = AnomalyCausalReason::RandomRate {
536                base_rate: self.config.rates.total_rate,
537            };
538            label = label.with_causal_reason(causal_reason);
539
540            // Add monetary impact
541            if let Some(impact) = result.monetary_impact {
542                label = label.with_monetary_impact(impact);
543            }
544
545            // Add related entities
546            for entity in &result.related_entities {
547                label = label.with_related_entity(entity);
548            }
549
550            // Add metadata
551            for (key, value) in &result.metadata {
552                label = label.with_metadata(key, value);
553            }
554
555            // Assign cluster and update causal reason if in cluster
556            if let Some(cluster_id) =
557                self.cluster_manager
558                    .assign_cluster(entry.posting_date(), &type_name, &mut self.rng)
559            {
560                label = label.with_cluster(&cluster_id);
561                // Update causal reason to reflect cluster membership
562                label = label.with_causal_reason(AnomalyCausalReason::ClusterMembership {
563                    cluster_id: cluster_id.clone(),
564                });
565            }
566
567            return Some(label);
568        }
569
570        None
571    }
572
573    /// Injects a specific anomaly type into an entry.
574    pub fn inject_specific(
575        &mut self,
576        entry: &mut JournalEntry,
577        anomaly_type: AnomalyType,
578    ) -> Option<LabeledAnomaly> {
579        self.inject_anomaly(entry, anomaly_type)
580    }
581
582    /// Creates a self-approval anomaly.
583    pub fn create_self_approval(
584        &mut self,
585        entry: &mut JournalEntry,
586        user_id: &str,
587    ) -> Option<LabeledAnomaly> {
588        let anomaly_type = AnomalyType::Fraud(FraudType::SelfApproval);
589
590        let label = LabeledAnomaly::new(
591            format!("ANO{:08}", self.labels.len() + 1),
592            anomaly_type,
593            entry.document_number().clone(),
594            "JE".to_string(),
595            entry.company_code().to_string(),
596            entry.posting_date(),
597        )
598        .with_description(&format!("User {} approved their own transaction", user_id))
599        .with_related_entity(user_id)
600        .with_injection_strategy("ManualSelfApproval")
601        .with_causal_reason(AnomalyCausalReason::EntityTargeting {
602            target_type: "User".to_string(),
603            target_id: user_id.to_string(),
604        });
605
606        // Set entry header anomaly tracking fields
607        entry.header.is_anomaly = true;
608        entry.header.is_fraud = true;
609        entry.header.anomaly_id = Some(label.anomaly_id.clone());
610        entry.header.anomaly_type = Some("SelfApproval".to_string());
611        entry.header.fraud_type = Some(FraudType::SelfApproval);
612
613        // Set approver = requester
614        entry.header.created_by = user_id.to_string();
615
616        self.labels.push(label.clone());
617        Some(label)
618    }
619
620    /// Creates a segregation of duties violation.
621    pub fn create_sod_violation(
622        &mut self,
623        entry: &mut JournalEntry,
624        user_id: &str,
625        conflicting_duties: (&str, &str),
626    ) -> Option<LabeledAnomaly> {
627        let anomaly_type = AnomalyType::Fraud(FraudType::SegregationOfDutiesViolation);
628
629        let label = LabeledAnomaly::new(
630            format!("ANO{:08}", self.labels.len() + 1),
631            anomaly_type,
632            entry.document_number().clone(),
633            "JE".to_string(),
634            entry.company_code().to_string(),
635            entry.posting_date(),
636        )
637        .with_description(&format!(
638            "User {} performed conflicting duties: {} and {}",
639            user_id, conflicting_duties.0, conflicting_duties.1
640        ))
641        .with_related_entity(user_id)
642        .with_metadata("duty1", conflicting_duties.0)
643        .with_metadata("duty2", conflicting_duties.1)
644        .with_injection_strategy("ManualSoDViolation")
645        .with_causal_reason(AnomalyCausalReason::EntityTargeting {
646            target_type: "User".to_string(),
647            target_id: user_id.to_string(),
648        });
649
650        // Set entry header anomaly tracking fields
651        entry.header.is_anomaly = true;
652        entry.header.is_fraud = true;
653        entry.header.anomaly_id = Some(label.anomaly_id.clone());
654        entry.header.anomaly_type = Some("SegregationOfDutiesViolation".to_string());
655        entry.header.fraud_type = Some(FraudType::SegregationOfDutiesViolation);
656
657        self.labels.push(label.clone());
658        Some(label)
659    }
660
661    /// Creates an intercompany mismatch anomaly.
662    pub fn create_ic_mismatch(
663        &mut self,
664        entry: &mut JournalEntry,
665        matching_company: &str,
666        expected_amount: Decimal,
667        actual_amount: Decimal,
668    ) -> Option<LabeledAnomaly> {
669        let anomaly_type = AnomalyType::Relational(RelationalAnomalyType::UnmatchedIntercompany);
670
671        let label = LabeledAnomaly::new(
672            format!("ANO{:08}", self.labels.len() + 1),
673            anomaly_type,
674            entry.document_number().clone(),
675            "JE".to_string(),
676            entry.company_code().to_string(),
677            entry.posting_date(),
678        )
679        .with_description(&format!(
680            "Intercompany mismatch with {}: expected {} but got {}",
681            matching_company, expected_amount, actual_amount
682        ))
683        .with_related_entity(matching_company)
684        .with_monetary_impact(actual_amount - expected_amount)
685        .with_metadata("expected_amount", &expected_amount.to_string())
686        .with_metadata("actual_amount", &actual_amount.to_string())
687        .with_injection_strategy("ManualICMismatch")
688        .with_causal_reason(AnomalyCausalReason::EntityTargeting {
689            target_type: "Intercompany".to_string(),
690            target_id: matching_company.to_string(),
691        });
692
693        // Set entry header anomaly tracking fields
694        entry.header.is_anomaly = true;
695        entry.header.anomaly_id = Some(label.anomaly_id.clone());
696        entry.header.anomaly_type = Some("UnmatchedIntercompany".to_string());
697
698        self.labels.push(label.clone());
699        Some(label)
700    }
701
702    /// Returns all generated labels.
703    pub fn get_labels(&self) -> &[LabeledAnomaly] {
704        &self.labels
705    }
706
707    /// Returns the anomaly summary.
708    pub fn get_summary(&self) -> AnomalySummary {
709        AnomalySummary::from_anomalies(&self.labels)
710    }
711
712    /// Returns injection statistics.
713    pub fn get_stats(&self) -> &InjectorStats {
714        &self.stats
715    }
716
717    /// Clears all labels and resets statistics.
718    pub fn reset(&mut self) {
719        self.labels.clear();
720        self.document_anomaly_counts.clear();
721        self.stats = InjectorStats::default();
722        self.cluster_manager = ClusterManager::new(self.config.patterns.clustering.clone());
723
724        // Reset enhanced components
725        self.near_miss_labels.clear();
726        self.scheme_actions.clear();
727        self.difficulty_distribution.clear();
728
729        if let Some(ref mut baseline) = self.behavioral_baseline {
730            *baseline =
731                BehavioralBaseline::new(self.config.enhanced.behavioral_baseline_config.clone());
732        }
733    }
734
735    /// Returns the number of clusters created.
736    pub fn cluster_count(&self) -> usize {
737        self.cluster_manager.cluster_count()
738    }
739
740    // =========================================================================
741    // Enhanced Features API (v0.3.0+)
742    // =========================================================================
743
744    /// Advances all active fraud schemes by one time step.
745    ///
746    /// Call this method once per simulated day to generate scheme actions.
747    /// Returns the scheme actions generated for this date.
748    pub fn advance_schemes(&mut self, date: NaiveDate, company_code: &str) -> Vec<SchemeAction> {
749        if let Some(ref mut advancer) = self.scheme_advancer {
750            let context = SchemeContext::new(date, company_code);
751            let actions = advancer.advance_all(&context);
752            self.scheme_actions.extend(actions.clone());
753            actions
754        } else {
755            Vec::new()
756        }
757    }
758
759    /// Potentially starts a new fraud scheme based on probabilities.
760    ///
761    /// Call this method periodically (e.g., once per period) to allow new
762    /// schemes to start based on configured probabilities.
763    /// Returns the scheme ID if a scheme was started.
764    pub fn maybe_start_scheme(
765        &mut self,
766        date: NaiveDate,
767        company_code: &str,
768        available_users: Vec<String>,
769        available_accounts: Vec<String>,
770        available_counterparties: Vec<String>,
771    ) -> Option<uuid::Uuid> {
772        if let Some(ref mut advancer) = self.scheme_advancer {
773            let mut context = SchemeContext::new(date, company_code);
774            context.available_users = available_users;
775            context.available_accounts = available_accounts;
776            context.available_counterparties = available_counterparties;
777
778            advancer.maybe_start_scheme(&context)
779        } else {
780            None
781        }
782    }
783
784    /// Returns all near-miss labels generated.
785    pub fn get_near_miss_labels(&self) -> &[NearMissLabel] {
786        &self.near_miss_labels
787    }
788
789    /// Returns all scheme actions generated.
790    pub fn get_scheme_actions(&self) -> &[SchemeAction] {
791        &self.scheme_actions
792    }
793
794    /// Returns the detection difficulty distribution.
795    pub fn get_difficulty_distribution(&self) -> &HashMap<AnomalyDetectionDifficulty, usize> {
796        &self.difficulty_distribution
797    }
798
799    /// Checks for behavioral deviations for an entity with an observation.
800    pub fn check_behavioral_deviations(
801        &self,
802        entity_id: &str,
803        observation: &super::context::Observation,
804    ) -> Vec<super::context::BehavioralDeviation> {
805        if let Some(ref baseline) = self.behavioral_baseline {
806            baseline.check_deviation(entity_id, observation)
807        } else {
808            Vec::new()
809        }
810    }
811
812    /// Gets the baseline for an entity.
813    pub fn get_entity_baseline(&self, entity_id: &str) -> Option<&super::context::EntityBaseline> {
814        if let Some(ref baseline) = self.behavioral_baseline {
815            baseline.get_baseline(entity_id)
816        } else {
817            None
818        }
819    }
820
821    /// Returns the number of active schemes.
822    pub fn active_scheme_count(&self) -> usize {
823        if let Some(ref advancer) = self.scheme_advancer {
824            advancer.active_scheme_count()
825        } else {
826            0
827        }
828    }
829
830    /// Returns whether enhanced features are enabled.
831    pub fn has_enhanced_features(&self) -> bool {
832        self.scheme_advancer.is_some()
833            || self.near_miss_generator.is_some()
834            || self.difficulty_calculator.is_some()
835            || self.entity_aware_injector.is_some()
836    }
837}
838
839/// Builder for AnomalyInjectorConfig.
840pub struct AnomalyInjectorConfigBuilder {
841    config: AnomalyInjectorConfig,
842}
843
844impl AnomalyInjectorConfigBuilder {
845    /// Creates a new builder with default configuration.
846    pub fn new() -> Self {
847        Self {
848            config: AnomalyInjectorConfig::default(),
849        }
850    }
851
852    /// Sets the total anomaly rate.
853    pub fn with_total_rate(mut self, rate: f64) -> Self {
854        self.config.rates.total_rate = rate;
855        self
856    }
857
858    /// Sets the fraud rate (proportion of anomalies).
859    pub fn with_fraud_rate(mut self, rate: f64) -> Self {
860        self.config.rates.fraud_rate = rate;
861        self
862    }
863
864    /// Sets the error rate (proportion of anomalies).
865    pub fn with_error_rate(mut self, rate: f64) -> Self {
866        self.config.rates.error_rate = rate;
867        self
868    }
869
870    /// Sets the random seed.
871    pub fn with_seed(mut self, seed: u64) -> Self {
872        self.config.seed = seed;
873        self
874    }
875
876    /// Sets the temporal pattern.
877    pub fn with_temporal_pattern(mut self, pattern: TemporalPattern) -> Self {
878        self.config.patterns.temporal_pattern = pattern;
879        self
880    }
881
882    /// Enables or disables label generation.
883    pub fn with_labels(mut self, generate: bool) -> Self {
884        self.config.generate_labels = generate;
885        self
886    }
887
888    /// Sets target companies.
889    pub fn with_target_companies(mut self, companies: Vec<String>) -> Self {
890        self.config.target_companies = companies;
891        self
892    }
893
894    /// Sets the date range.
895    pub fn with_date_range(mut self, start: NaiveDate, end: NaiveDate) -> Self {
896        self.config.date_range = Some((start, end));
897        self
898    }
899
900    // =========================================================================
901    // Enhanced Features Configuration (v0.3.0+)
902    // =========================================================================
903
904    /// Enables multi-stage fraud scheme generation.
905    pub fn with_multi_stage_schemes(mut self, enabled: bool, probability: f64) -> Self {
906        self.config.enhanced.multi_stage_schemes_enabled = enabled;
907        self.config.enhanced.scheme_probability = probability;
908        self
909    }
910
911    /// Enables near-miss generation.
912    pub fn with_near_misses(mut self, enabled: bool, proportion: f64) -> Self {
913        self.config.enhanced.near_miss_enabled = enabled;
914        self.config.enhanced.near_miss_proportion = proportion;
915        self
916    }
917
918    /// Sets approval thresholds for threshold-proximity near-misses.
919    pub fn with_approval_thresholds(mut self, thresholds: Vec<Decimal>) -> Self {
920        self.config.enhanced.approval_thresholds = thresholds;
921        self
922    }
923
924    /// Enables correlated anomaly injection.
925    pub fn with_correlated_injection(mut self, enabled: bool) -> Self {
926        self.config.enhanced.correlated_injection_enabled = enabled;
927        self
928    }
929
930    /// Enables temporal clustering (period-end spikes).
931    pub fn with_temporal_clustering(mut self, enabled: bool, multiplier: f64) -> Self {
932        self.config.enhanced.temporal_clustering_enabled = enabled;
933        self.config.enhanced.period_end_multiplier = multiplier;
934        self
935    }
936
937    /// Enables detection difficulty classification.
938    pub fn with_difficulty_classification(mut self, enabled: bool) -> Self {
939        self.config.enhanced.difficulty_classification_enabled = enabled;
940        self
941    }
942
943    /// Enables context-aware injection.
944    pub fn with_context_aware_injection(mut self, enabled: bool) -> Self {
945        self.config.enhanced.context_aware_enabled = enabled;
946        self
947    }
948
949    /// Sets behavioral baseline configuration.
950    pub fn with_behavioral_baseline(mut self, config: BehavioralBaselineConfig) -> Self {
951        self.config.enhanced.behavioral_baseline_config = config;
952        self
953    }
954
955    /// Enables all enhanced features with default settings.
956    pub fn with_all_enhanced_features(mut self) -> Self {
957        self.config.enhanced.multi_stage_schemes_enabled = true;
958        self.config.enhanced.scheme_probability = 0.02;
959        self.config.enhanced.correlated_injection_enabled = true;
960        self.config.enhanced.temporal_clustering_enabled = true;
961        self.config.enhanced.period_end_multiplier = 2.5;
962        self.config.enhanced.near_miss_enabled = true;
963        self.config.enhanced.near_miss_proportion = 0.30;
964        self.config.enhanced.difficulty_classification_enabled = true;
965        self.config.enhanced.context_aware_enabled = true;
966        self.config.enhanced.behavioral_baseline_config.enabled = true;
967        self
968    }
969
970    /// Builds the configuration.
971    pub fn build(self) -> AnomalyInjectorConfig {
972        self.config
973    }
974}
975
976impl Default for AnomalyInjectorConfigBuilder {
977    fn default() -> Self {
978        Self::new()
979    }
980}
981
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{JournalEntryLine, StatisticalAnomalyType};
    use rust_decimal_macros::dec;

    /// Builds a two-line journal entry dated 2024-06-15 whose second
    /// `new_simple` argument is "1000" (treated as the company code by the
    /// filtering test below): a 1000 debit on GL account "5000" and a 1000
    /// credit on GL account "1000".
    fn create_test_entry(doc_num: &str) -> JournalEntry {
        let entry_date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
        let mut entry = JournalEntry::new_simple(
            doc_num.to_string(),
            "1000".to_string(),
            entry_date,
            "Test Entry".to_string(),
        );

        let debit_line = JournalEntryLine {
            line_number: 1,
            gl_account: "5000".to_string(),
            debit_amount: dec!(1000),
            ..Default::default()
        };
        let credit_line = JournalEntryLine {
            line_number: 2,
            gl_account: "1000".to_string(),
            credit_amount: dec!(1000),
            ..Default::default()
        };

        entry.add_line(debit_line);
        entry.add_line(credit_line);

        entry
    }

    /// A high injection rate over 100 entries must yield anomalies, each with
    /// exactly one label.
    #[test]
    fn test_anomaly_injector_basic() {
        // 50% rate keeps the test meaningful without depending on exact counts.
        let config = AnomalyInjectorConfigBuilder::new()
            .with_total_rate(0.5)
            .with_seed(42)
            .build();
        let mut injector = AnomalyInjector::new(config);

        let mut entries = Vec::with_capacity(100);
        for i in 0..100 {
            entries.push(create_test_entry(&format!("JE{:04}", i)));
        }

        let result = injector.process_entries(&mut entries);

        assert!(result.anomalies_injected > 0);
        assert!(!result.labels.is_empty());
        assert_eq!(result.labels.len(), result.anomalies_injected);
    }

    /// Injecting a specific anomaly type returns a label that references the
    /// entry it was injected into.
    #[test]
    fn test_specific_injection() {
        let mut injector = AnomalyInjector::new(AnomalyInjectorConfig::default());
        let mut entry = create_test_entry("JE001");

        let requested = AnomalyType::Statistical(StatisticalAnomalyType::UnusuallyHighAmount);
        let label = injector
            .inject_specific(&mut entry, requested)
            .expect("inject_specific should produce a label");

        // The label's document_id must be non-empty and match the entry.
        assert!(!label.document_id.is_empty());
        assert_eq!(label.document_id, entry.document_number());
    }

    /// A self-approval injection is labeled as `FraudType::SelfApproval` and
    /// lists the approving user among the related entities.
    #[test]
    fn test_self_approval_injection() {
        let mut injector = AnomalyInjector::new(AnomalyInjectorConfig::default());
        let mut entry = create_test_entry("JE001");

        let label = injector
            .create_self_approval(&mut entry, "USER001")
            .expect("create_self_approval should produce a label");

        assert!(matches!(
            label.anomaly_type,
            AnomalyType::Fraud(FraudType::SelfApproval)
        ));
        assert!(label
            .related_entities
            .iter()
            .any(|entity| entity == "USER001"));
    }

    /// Entries outside the targeted companies receive no anomalies, even at a
    /// 100% injection rate.
    #[test]
    fn test_company_filtering() {
        let config = AnomalyInjectorConfigBuilder::new()
            .with_total_rate(1.0)
            .with_target_companies(vec!["2000".to_string()])
            .build();
        let mut injector = AnomalyInjector::new(config);

        // Both entries are created for company 1000, which is not targeted.
        let mut entries = vec![create_test_entry("JE001"), create_test_entry("JE002")];

        let result = injector.process_entries(&mut entries);

        assert_eq!(result.anomalies_injected, 0);
    }
}
datasynth_generators/anomaly/injector.rs

datasynth_generators/anomaly/
injector.rs