datasynth_core/models/
anomaly.rs

1//! Anomaly types and labels for synthetic data generation.
2//!
3//! This module provides comprehensive anomaly classification for:
4//! - Fraud detection training
5//! - Error detection systems
6//! - Process compliance monitoring
7//! - Statistical anomaly detection
8//! - Graph-based anomaly detection
9
10use chrono::{NaiveDate, NaiveDateTime};
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
15/// Causal reason explaining why an anomaly was injected.
16///
17/// This enables provenance tracking for understanding the "why" behind each anomaly.
18#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
19pub enum AnomalyCausalReason {
20    /// Injected due to random rate selection.
21    RandomRate {
22        /// Base rate used for selection.
23        base_rate: f64,
24    },
25    /// Injected due to temporal pattern matching.
26    TemporalPattern {
27        /// Name of the temporal pattern (e.g., "year_end_spike", "month_end").
28        pattern_name: String,
29    },
30    /// Injected based on entity targeting rules.
31    EntityTargeting {
32        /// Type of entity targeted (e.g., "vendor", "user", "account").
33        target_type: String,
34        /// ID of the targeted entity.
35        target_id: String,
36    },
37    /// Part of an anomaly cluster.
38    ClusterMembership {
39        /// ID of the cluster this anomaly belongs to.
40        cluster_id: String,
41    },
42    /// Part of a multi-step scenario.
43    ScenarioStep {
44        /// Type of scenario (e.g., "kickback_scheme", "round_tripping").
45        scenario_type: String,
46        /// Step number within the scenario.
47        step_number: u32,
48    },
49    /// Injected based on data quality profile.
50    DataQualityProfile {
51        /// Profile name (e.g., "noisy", "legacy", "clean").
52        profile: String,
53    },
54    /// Injected for ML training balance.
55    MLTrainingBalance {
56        /// Target class being balanced.
57        target_class: String,
58    },
59}
60
61/// Structured injection strategy with captured parameters.
62///
63/// Unlike the string-based `injection_strategy` field, this enum captures
64/// the exact parameters used during injection for full reproducibility.
65#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
66pub enum InjectionStrategy {
67    /// Amount was manipulated by a factor.
68    AmountManipulation {
69        /// Original amount before manipulation.
70        original: Decimal,
71        /// Multiplication factor applied.
72        factor: f64,
73    },
74    /// Amount adjusted to avoid a threshold.
75    ThresholdAvoidance {
76        /// Threshold being avoided.
77        threshold: Decimal,
78        /// Final amount after adjustment.
79        adjusted_amount: Decimal,
80    },
81    /// Date was backdated or forward-dated.
82    DateShift {
83        /// Number of days shifted (negative = backdated).
84        days_shifted: i32,
85        /// Original date before shift.
86        original_date: NaiveDate,
87    },
88    /// User approved their own transaction.
89    SelfApproval {
90        /// User who created and approved.
91        user_id: String,
92    },
93    /// Segregation of duties violation.
94    SoDViolation {
95        /// First duty involved.
96        duty1: String,
97        /// Second duty involved.
98        duty2: String,
99        /// User who performed both duties.
100        violating_user: String,
101    },
102    /// Exact duplicate of another document.
103    ExactDuplicate {
104        /// ID of the original document.
105        original_doc_id: String,
106    },
107    /// Near-duplicate with small variations.
108    NearDuplicate {
109        /// ID of the original document.
110        original_doc_id: String,
111        /// Fields that were varied.
112        varied_fields: Vec<String>,
113    },
114    /// Circular flow of funds/goods.
115    CircularFlow {
116        /// Chain of entities involved.
117        entity_chain: Vec<String>,
118    },
119    /// Split transaction to avoid threshold.
120    SplitTransaction {
121        /// Original total amount.
122        original_amount: Decimal,
123        /// Number of splits.
124        split_count: u32,
125        /// IDs of the split documents.
126        split_doc_ids: Vec<String>,
127    },
128    /// Round number manipulation.
129    RoundNumbering {
130        /// Original precise amount.
131        original_amount: Decimal,
132        /// Rounded amount.
133        rounded_amount: Decimal,
134    },
135    /// Timing manipulation (weekend, after-hours, etc.).
136    TimingManipulation {
137        /// Type of timing issue.
138        timing_type: String,
139        /// Original timestamp.
140        original_time: Option<NaiveDateTime>,
141    },
142    /// Account misclassification.
143    AccountMisclassification {
144        /// Correct account.
145        correct_account: String,
146        /// Incorrect account used.
147        incorrect_account: String,
148    },
149    /// Missing required field.
150    MissingField {
151        /// Name of the missing field.
152        field_name: String,
153    },
154    /// Custom injection strategy.
155    Custom {
156        /// Strategy name.
157        name: String,
158        /// Additional parameters.
159        parameters: HashMap<String, String>,
160    },
161}
162
163impl InjectionStrategy {
164    /// Returns a human-readable description of the strategy.
165    pub fn description(&self) -> String {
166        match self {
167            InjectionStrategy::AmountManipulation { factor, .. } => {
168                format!("Amount multiplied by {:.2}", factor)
169            }
170            InjectionStrategy::ThresholdAvoidance { threshold, .. } => {
171                format!("Amount adjusted to avoid {} threshold", threshold)
172            }
173            InjectionStrategy::DateShift { days_shifted, .. } => {
174                if *days_shifted < 0 {
175                    format!("Date backdated by {} days", days_shifted.abs())
176                } else {
177                    format!("Date forward-dated by {} days", days_shifted)
178                }
179            }
180            InjectionStrategy::SelfApproval { user_id } => {
181                format!("Self-approval by user {}", user_id)
182            }
183            InjectionStrategy::SoDViolation { duty1, duty2, .. } => {
184                format!("SoD violation: {} and {}", duty1, duty2)
185            }
186            InjectionStrategy::ExactDuplicate { original_doc_id } => {
187                format!("Exact duplicate of {}", original_doc_id)
188            }
189            InjectionStrategy::NearDuplicate {
190                original_doc_id,
191                varied_fields,
192            } => {
193                format!(
194                    "Near-duplicate of {} (varied: {:?})",
195                    original_doc_id, varied_fields
196                )
197            }
198            InjectionStrategy::CircularFlow { entity_chain } => {
199                format!("Circular flow through {} entities", entity_chain.len())
200            }
201            InjectionStrategy::SplitTransaction { split_count, .. } => {
202                format!("Split into {} transactions", split_count)
203            }
204            InjectionStrategy::RoundNumbering { .. } => "Amount rounded to even number".to_string(),
205            InjectionStrategy::TimingManipulation { timing_type, .. } => {
206                format!("Timing manipulation: {}", timing_type)
207            }
208            InjectionStrategy::AccountMisclassification {
209                correct_account,
210                incorrect_account,
211            } => {
212                format!(
213                    "Misclassified from {} to {}",
214                    correct_account, incorrect_account
215                )
216            }
217            InjectionStrategy::MissingField { field_name } => {
218                format!("Missing required field: {}", field_name)
219            }
220            InjectionStrategy::Custom { name, .. } => format!("Custom: {}", name),
221        }
222    }
223
224    /// Returns the strategy type name.
225    pub fn strategy_type(&self) -> &'static str {
226        match self {
227            InjectionStrategy::AmountManipulation { .. } => "AmountManipulation",
228            InjectionStrategy::ThresholdAvoidance { .. } => "ThresholdAvoidance",
229            InjectionStrategy::DateShift { .. } => "DateShift",
230            InjectionStrategy::SelfApproval { .. } => "SelfApproval",
231            InjectionStrategy::SoDViolation { .. } => "SoDViolation",
232            InjectionStrategy::ExactDuplicate { .. } => "ExactDuplicate",
233            InjectionStrategy::NearDuplicate { .. } => "NearDuplicate",
234            InjectionStrategy::CircularFlow { .. } => "CircularFlow",
235            InjectionStrategy::SplitTransaction { .. } => "SplitTransaction",
236            InjectionStrategy::RoundNumbering { .. } => "RoundNumbering",
237            InjectionStrategy::TimingManipulation { .. } => "TimingManipulation",
238            InjectionStrategy::AccountMisclassification { .. } => "AccountMisclassification",
239            InjectionStrategy::MissingField { .. } => "MissingField",
240            InjectionStrategy::Custom { .. } => "Custom",
241        }
242    }
243}
244
245/// Primary anomaly classification.
246#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
247pub enum AnomalyType {
248    /// Fraudulent activity.
249    Fraud(FraudType),
250    /// Data entry or processing error.
251    Error(ErrorType),
252    /// Process or control issue.
253    ProcessIssue(ProcessIssueType),
254    /// Statistical anomaly.
255    Statistical(StatisticalAnomalyType),
256    /// Relational/graph anomaly.
257    Relational(RelationalAnomalyType),
258    /// Custom anomaly type.
259    Custom(String),
260}
261
262impl AnomalyType {
263    /// Returns the category name.
264    pub fn category(&self) -> &'static str {
265        match self {
266            AnomalyType::Fraud(_) => "Fraud",
267            AnomalyType::Error(_) => "Error",
268            AnomalyType::ProcessIssue(_) => "ProcessIssue",
269            AnomalyType::Statistical(_) => "Statistical",
270            AnomalyType::Relational(_) => "Relational",
271            AnomalyType::Custom(_) => "Custom",
272        }
273    }
274
275    /// Returns the specific type name.
276    pub fn type_name(&self) -> String {
277        match self {
278            AnomalyType::Fraud(t) => format!("{:?}", t),
279            AnomalyType::Error(t) => format!("{:?}", t),
280            AnomalyType::ProcessIssue(t) => format!("{:?}", t),
281            AnomalyType::Statistical(t) => format!("{:?}", t),
282            AnomalyType::Relational(t) => format!("{:?}", t),
283            AnomalyType::Custom(s) => s.clone(),
284        }
285    }
286
287    /// Returns the severity level (1-5, 5 being most severe).
288    pub fn severity(&self) -> u8 {
289        match self {
290            AnomalyType::Fraud(t) => t.severity(),
291            AnomalyType::Error(t) => t.severity(),
292            AnomalyType::ProcessIssue(t) => t.severity(),
293            AnomalyType::Statistical(t) => t.severity(),
294            AnomalyType::Relational(t) => t.severity(),
295            AnomalyType::Custom(_) => 3,
296        }
297    }
298
299    /// Returns whether this anomaly is typically intentional.
300    pub fn is_intentional(&self) -> bool {
301        matches!(self, AnomalyType::Fraud(_))
302    }
303}
304
305/// Fraud types for detection training.
306#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
307pub enum FraudType {
308    // Journal Entry Fraud
309    /// Fictitious journal entry with no business purpose.
310    FictitiousEntry,
311    /// Fictitious transaction (alias for FictitiousEntry).
312    FictitiousTransaction,
313    /// Round-dollar amounts suggesting manual manipulation.
314    RoundDollarManipulation,
315    /// Entry posted just below approval threshold.
316    JustBelowThreshold,
317    /// Revenue recognition manipulation.
318    RevenueManipulation,
319    /// Expense capitalization fraud.
320    ImproperCapitalization,
321    /// Improperly capitalizing expenses as assets.
322    ExpenseCapitalization,
323    /// Cookie jar reserves manipulation.
324    ReserveManipulation,
325    /// Round-tripping funds through suspense/clearing accounts.
326    SuspenseAccountAbuse,
327    /// Splitting transactions to stay below approval thresholds.
328    SplitTransaction,
329    /// Unusual timing (weekend, holiday, after-hours postings).
330    TimingAnomaly,
331    /// Posting to unauthorized accounts.
332    UnauthorizedAccess,
333
334    // Approval Fraud
335    /// User approving their own request.
336    SelfApproval,
337    /// Approval beyond authorized limit.
338    ExceededApprovalLimit,
339    /// Segregation of duties violation.
340    SegregationOfDutiesViolation,
341    /// Approval by unauthorized user.
342    UnauthorizedApproval,
343    /// Collusion between approver and requester.
344    CollusiveApproval,
345
346    // Vendor/Payment Fraud
347    /// Fictitious vendor.
348    FictitiousVendor,
349    /// Duplicate payment to vendor.
350    DuplicatePayment,
351    /// Payment to shell company.
352    ShellCompanyPayment,
353    /// Kickback scheme.
354    Kickback,
355    /// Kickback scheme (alias).
356    KickbackScheme,
357    /// Invoice manipulation.
358    InvoiceManipulation,
359
360    // Asset Fraud
361    /// Misappropriation of assets.
362    AssetMisappropriation,
363    /// Inventory theft.
364    InventoryTheft,
365    /// Ghost employee.
366    GhostEmployee,
367
368    // Financial Statement Fraud
369    /// Premature revenue recognition.
370    PrematureRevenue,
371    /// Understated liabilities.
372    UnderstatedLiabilities,
373    /// Overstated assets.
374    OverstatedAssets,
375    /// Channel stuffing.
376    ChannelStuffing,
377}
378
379impl FraudType {
380    /// Returns severity level (1-5).
381    pub fn severity(&self) -> u8 {
382        match self {
383            FraudType::RoundDollarManipulation => 2,
384            FraudType::JustBelowThreshold => 3,
385            FraudType::SelfApproval => 3,
386            FraudType::ExceededApprovalLimit => 3,
387            FraudType::DuplicatePayment => 3,
388            FraudType::FictitiousEntry => 4,
389            FraudType::RevenueManipulation => 5,
390            FraudType::FictitiousVendor => 5,
391            FraudType::ShellCompanyPayment => 5,
392            FraudType::AssetMisappropriation => 5,
393            FraudType::SegregationOfDutiesViolation => 4,
394            FraudType::CollusiveApproval => 5,
395            _ => 4,
396        }
397    }
398}
399
400/// Error types for error detection.
401#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
402pub enum ErrorType {
403    // Data Entry Errors
404    /// Duplicate document entry.
405    DuplicateEntry,
406    /// Reversed debit/credit amounts.
407    ReversedAmount,
408    /// Transposed digits in amount.
409    TransposedDigits,
410    /// Wrong decimal placement.
411    DecimalError,
412    /// Missing required field.
413    MissingField,
414    /// Invalid account code.
415    InvalidAccount,
416
417    // Timing Errors
418    /// Posted to wrong period.
419    WrongPeriod,
420    /// Backdated entry.
421    BackdatedEntry,
422    /// Future-dated entry.
423    FutureDatedEntry,
424    /// Cutoff error.
425    CutoffError,
426
427    // Classification Errors
428    /// Wrong account classification.
429    MisclassifiedAccount,
430    /// Wrong cost center.
431    WrongCostCenter,
432    /// Wrong company code.
433    WrongCompanyCode,
434
435    // Calculation Errors
436    /// Unbalanced journal entry.
437    UnbalancedEntry,
438    /// Rounding error.
439    RoundingError,
440    /// Currency conversion error.
441    CurrencyError,
442    /// Tax calculation error.
443    TaxCalculationError,
444}
445
446impl ErrorType {
447    /// Returns severity level (1-5).
448    pub fn severity(&self) -> u8 {
449        match self {
450            ErrorType::RoundingError => 1,
451            ErrorType::MissingField => 2,
452            ErrorType::TransposedDigits => 2,
453            ErrorType::DecimalError => 3,
454            ErrorType::DuplicateEntry => 3,
455            ErrorType::ReversedAmount => 3,
456            ErrorType::WrongPeriod => 4,
457            ErrorType::UnbalancedEntry => 5,
458            ErrorType::CurrencyError => 4,
459            _ => 3,
460        }
461    }
462}
463
464/// Process issue types.
465#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
466pub enum ProcessIssueType {
467    // Approval Issues
468    /// Approval skipped entirely.
469    SkippedApproval,
470    /// Late approval (after posting).
471    LateApproval,
472    /// Missing supporting documentation.
473    MissingDocumentation,
474    /// Incomplete approval chain.
475    IncompleteApprovalChain,
476
477    // Timing Issues
478    /// Late posting.
479    LatePosting,
480    /// Posting outside business hours.
481    AfterHoursPosting,
482    /// Weekend/holiday posting.
483    WeekendPosting,
484    /// Rushed period-end posting.
485    RushedPeriodEnd,
486
487    // Control Issues
488    /// Manual override of system control.
489    ManualOverride,
490    /// Unusual user access pattern.
491    UnusualAccess,
492    /// System bypass.
493    SystemBypass,
494    /// Batch processing anomaly.
495    BatchAnomaly,
496
497    // Documentation Issues
498    /// Vague or missing description.
499    VagueDescription,
500    /// Changed after posting.
501    PostFactoChange,
502    /// Incomplete audit trail.
503    IncompleteAuditTrail,
504}
505
506impl ProcessIssueType {
507    /// Returns severity level (1-5).
508    pub fn severity(&self) -> u8 {
509        match self {
510            ProcessIssueType::VagueDescription => 1,
511            ProcessIssueType::LatePosting => 2,
512            ProcessIssueType::AfterHoursPosting => 2,
513            ProcessIssueType::WeekendPosting => 2,
514            ProcessIssueType::SkippedApproval => 4,
515            ProcessIssueType::ManualOverride => 4,
516            ProcessIssueType::SystemBypass => 5,
517            ProcessIssueType::IncompleteAuditTrail => 4,
518            _ => 3,
519        }
520    }
521}
522
523/// Statistical anomaly types.
524#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
525pub enum StatisticalAnomalyType {
526    // Amount Anomalies
527    /// Amount significantly above normal.
528    UnusuallyHighAmount,
529    /// Amount significantly below normal.
530    UnusuallyLowAmount,
531    /// Violates Benford's Law distribution.
532    BenfordViolation,
533    /// Exact duplicate amount (suspicious).
534    ExactDuplicateAmount,
535    /// Repeating pattern in amounts.
536    RepeatingAmount,
537
538    // Frequency Anomalies
539    /// Unusual transaction frequency.
540    UnusualFrequency,
541    /// Burst of transactions.
542    TransactionBurst,
543    /// Unusual time of day.
544    UnusualTiming,
545
546    // Trend Anomalies
547    /// Break in historical trend.
548    TrendBreak,
549    /// Sudden level shift.
550    LevelShift,
551    /// Seasonal pattern violation.
552    SeasonalAnomaly,
553
554    // Distribution Anomalies
555    /// Outlier in distribution.
556    StatisticalOutlier,
557    /// Change in variance.
558    VarianceChange,
559    /// Distribution shift.
560    DistributionShift,
561}
562
563impl StatisticalAnomalyType {
564    /// Returns severity level (1-5).
565    pub fn severity(&self) -> u8 {
566        match self {
567            StatisticalAnomalyType::UnusualTiming => 1,
568            StatisticalAnomalyType::UnusualFrequency => 2,
569            StatisticalAnomalyType::BenfordViolation => 2,
570            StatisticalAnomalyType::UnusuallyHighAmount => 3,
571            StatisticalAnomalyType::TrendBreak => 3,
572            StatisticalAnomalyType::TransactionBurst => 4,
573            StatisticalAnomalyType::ExactDuplicateAmount => 3,
574            _ => 3,
575        }
576    }
577}
578
579/// Relational/graph anomaly types.
580#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
581pub enum RelationalAnomalyType {
582    // Transaction Pattern Anomalies
583    /// Circular transaction pattern.
584    CircularTransaction,
585    /// Unusual account combination.
586    UnusualAccountPair,
587    /// New trading partner.
588    NewCounterparty,
589    /// Dormant account suddenly active.
590    DormantAccountActivity,
591
592    // Network Anomalies
593    /// Unusual network centrality.
594    CentralityAnomaly,
595    /// Isolated transaction cluster.
596    IsolatedCluster,
597    /// Bridge node anomaly.
598    BridgeNodeAnomaly,
599    /// Community structure change.
600    CommunityAnomaly,
601
602    // Relationship Anomalies
603    /// Missing expected relationship.
604    MissingRelationship,
605    /// Unexpected relationship.
606    UnexpectedRelationship,
607    /// Relationship strength change.
608    RelationshipStrengthChange,
609
610    // Intercompany Anomalies
611    /// Unmatched intercompany transaction.
612    UnmatchedIntercompany,
613    /// Circular intercompany flow.
614    CircularIntercompany,
615    /// Transfer pricing anomaly.
616    TransferPricingAnomaly,
617}
618
619impl RelationalAnomalyType {
620    /// Returns severity level (1-5).
621    pub fn severity(&self) -> u8 {
622        match self {
623            RelationalAnomalyType::NewCounterparty => 1,
624            RelationalAnomalyType::DormantAccountActivity => 2,
625            RelationalAnomalyType::UnusualAccountPair => 2,
626            RelationalAnomalyType::CircularTransaction => 4,
627            RelationalAnomalyType::CircularIntercompany => 4,
628            RelationalAnomalyType::TransferPricingAnomaly => 4,
629            RelationalAnomalyType::UnmatchedIntercompany => 3,
630            _ => 3,
631        }
632    }
633}
634
635/// A labeled anomaly for supervised learning.
636#[derive(Debug, Clone, Serialize, Deserialize)]
637pub struct LabeledAnomaly {
638    /// Unique anomaly identifier.
639    pub anomaly_id: String,
640    /// Type of anomaly.
641    pub anomaly_type: AnomalyType,
642    /// Document or entity that contains the anomaly.
643    pub document_id: String,
644    /// Document type (JE, PO, Invoice, etc.).
645    pub document_type: String,
646    /// Company code.
647    pub company_code: String,
648    /// Date the anomaly occurred.
649    pub anomaly_date: NaiveDate,
650    /// Timestamp when detected/injected.
651    pub detection_timestamp: NaiveDateTime,
652    /// Confidence score (0.0 - 1.0) for injected anomalies.
653    pub confidence: f64,
654    /// Severity (1-5).
655    pub severity: u8,
656    /// Description of the anomaly.
657    pub description: String,
658    /// Related entities (user IDs, account codes, etc.).
659    pub related_entities: Vec<String>,
660    /// Monetary impact if applicable.
661    pub monetary_impact: Option<Decimal>,
662    /// Additional metadata.
663    pub metadata: HashMap<String, String>,
664    /// Whether this was injected (true) or naturally occurring (false).
665    pub is_injected: bool,
666    /// Injection strategy used (if injected) - legacy string field.
667    pub injection_strategy: Option<String>,
668    /// Cluster ID if part of an anomaly cluster.
669    pub cluster_id: Option<String>,
670
671    // ========================================
672    // PROVENANCE TRACKING FIELDS (Phase 1.2)
673    // ========================================
674    /// Hash of the original document before modification.
675    /// Enables tracking what the document looked like pre-injection.
676    #[serde(default, skip_serializing_if = "Option::is_none")]
677    pub original_document_hash: Option<String>,
678
679    /// Causal reason explaining why this anomaly was injected.
680    /// Provides "why" tracking for each anomaly.
681    #[serde(default, skip_serializing_if = "Option::is_none")]
682    pub causal_reason: Option<AnomalyCausalReason>,
683
684    /// Structured injection strategy with parameters.
685    /// More detailed than the legacy string-based injection_strategy field.
686    #[serde(default, skip_serializing_if = "Option::is_none")]
687    pub structured_strategy: Option<InjectionStrategy>,
688
689    /// Parent anomaly ID if this was derived from another anomaly.
690    /// Enables anomaly transformation chains.
691    #[serde(default, skip_serializing_if = "Option::is_none")]
692    pub parent_anomaly_id: Option<String>,
693
694    /// Child anomaly IDs that were derived from this anomaly.
695    #[serde(default, skip_serializing_if = "Vec::is_empty")]
696    pub child_anomaly_ids: Vec<String>,
697
698    /// Scenario ID if this anomaly is part of a multi-step scenario.
699    #[serde(default, skip_serializing_if = "Option::is_none")]
700    pub scenario_id: Option<String>,
701
702    /// Generation run ID that produced this anomaly.
703    /// Enables tracing anomalies back to their generation run.
704    #[serde(default, skip_serializing_if = "Option::is_none")]
705    pub run_id: Option<String>,
706
707    /// Seed used for RNG during generation.
708    /// Enables reproducibility.
709    #[serde(default, skip_serializing_if = "Option::is_none")]
710    pub generation_seed: Option<u64>,
711}
712
713impl LabeledAnomaly {
714    /// Creates a new labeled anomaly.
715    pub fn new(
716        anomaly_id: String,
717        anomaly_type: AnomalyType,
718        document_id: String,
719        document_type: String,
720        company_code: String,
721        anomaly_date: NaiveDate,
722    ) -> Self {
723        let severity = anomaly_type.severity();
724        let description = format!(
725            "{} - {} in document {}",
726            anomaly_type.category(),
727            anomaly_type.type_name(),
728            document_id
729        );
730
731        Self {
732            anomaly_id,
733            anomaly_type,
734            document_id,
735            document_type,
736            company_code,
737            anomaly_date,
738            detection_timestamp: chrono::Local::now().naive_local(),
739            confidence: 1.0,
740            severity,
741            description,
742            related_entities: Vec::new(),
743            monetary_impact: None,
744            metadata: HashMap::new(),
745            is_injected: true,
746            injection_strategy: None,
747            cluster_id: None,
748            // Provenance fields
749            original_document_hash: None,
750            causal_reason: None,
751            structured_strategy: None,
752            parent_anomaly_id: None,
753            child_anomaly_ids: Vec::new(),
754            scenario_id: None,
755            run_id: None,
756            generation_seed: None,
757        }
758    }
759
760    /// Sets the description.
761    pub fn with_description(mut self, description: &str) -> Self {
762        self.description = description.to_string();
763        self
764    }
765
766    /// Sets the monetary impact.
767    pub fn with_monetary_impact(mut self, impact: Decimal) -> Self {
768        self.monetary_impact = Some(impact);
769        self
770    }
771
772    /// Adds a related entity.
773    pub fn with_related_entity(mut self, entity: &str) -> Self {
774        self.related_entities.push(entity.to_string());
775        self
776    }
777
778    /// Adds metadata.
779    pub fn with_metadata(mut self, key: &str, value: &str) -> Self {
780        self.metadata.insert(key.to_string(), value.to_string());
781        self
782    }
783
784    /// Sets the injection strategy (legacy string).
785    pub fn with_injection_strategy(mut self, strategy: &str) -> Self {
786        self.injection_strategy = Some(strategy.to_string());
787        self
788    }
789
790    /// Sets the cluster ID.
791    pub fn with_cluster(mut self, cluster_id: &str) -> Self {
792        self.cluster_id = Some(cluster_id.to_string());
793        self
794    }
795
796    // ========================================
797    // PROVENANCE BUILDER METHODS (Phase 1.2)
798    // ========================================
799
800    /// Sets the original document hash for provenance tracking.
801    pub fn with_original_document_hash(mut self, hash: &str) -> Self {
802        self.original_document_hash = Some(hash.to_string());
803        self
804    }
805
806    /// Sets the causal reason for this anomaly.
807    pub fn with_causal_reason(mut self, reason: AnomalyCausalReason) -> Self {
808        self.causal_reason = Some(reason);
809        self
810    }
811
812    /// Sets the structured injection strategy.
813    pub fn with_structured_strategy(mut self, strategy: InjectionStrategy) -> Self {
814        // Also set the legacy string field for backward compatibility
815        self.injection_strategy = Some(strategy.strategy_type().to_string());
816        self.structured_strategy = Some(strategy);
817        self
818    }
819
820    /// Sets the parent anomaly ID (for anomaly derivation chains).
821    pub fn with_parent_anomaly(mut self, parent_id: &str) -> Self {
822        self.parent_anomaly_id = Some(parent_id.to_string());
823        self
824    }
825
826    /// Adds a child anomaly ID.
827    pub fn with_child_anomaly(mut self, child_id: &str) -> Self {
828        self.child_anomaly_ids.push(child_id.to_string());
829        self
830    }
831
832    /// Sets the scenario ID for multi-step scenario tracking.
833    pub fn with_scenario(mut self, scenario_id: &str) -> Self {
834        self.scenario_id = Some(scenario_id.to_string());
835        self
836    }
837
838    /// Sets the generation run ID.
839    pub fn with_run_id(mut self, run_id: &str) -> Self {
840        self.run_id = Some(run_id.to_string());
841        self
842    }
843
844    /// Sets the generation seed for reproducibility.
845    pub fn with_generation_seed(mut self, seed: u64) -> Self {
846        self.generation_seed = Some(seed);
847        self
848    }
849
850    /// Sets multiple provenance fields at once for convenience.
851    pub fn with_provenance(
852        mut self,
853        run_id: Option<&str>,
854        seed: Option<u64>,
855        causal_reason: Option<AnomalyCausalReason>,
856    ) -> Self {
857        if let Some(id) = run_id {
858            self.run_id = Some(id.to_string());
859        }
860        self.generation_seed = seed;
861        self.causal_reason = causal_reason;
862        self
863    }
864
865    /// Converts to a feature vector for ML.
866    ///
867    /// Returns a vector of 15 features:
868    /// - 6 features: Category one-hot encoding (Fraud, Error, ProcessIssue, Statistical, Relational, Custom)
869    /// - 1 feature: Severity (normalized 0-1)
870    /// - 1 feature: Confidence
871    /// - 1 feature: Has monetary impact (0/1)
872    /// - 1 feature: Monetary impact (log-scaled)
873    /// - 1 feature: Is intentional (0/1)
874    /// - 1 feature: Number of related entities
875    /// - 1 feature: Is part of cluster (0/1)
876    /// - 1 feature: Is part of scenario (0/1)
877    /// - 1 feature: Has parent anomaly (0/1) - indicates derivation
878    pub fn to_features(&self) -> Vec<f64> {
879        let mut features = Vec::new();
880
881        // Category one-hot encoding
882        let categories = [
883            "Fraud",
884            "Error",
885            "ProcessIssue",
886            "Statistical",
887            "Relational",
888            "Custom",
889        ];
890        for cat in &categories {
891            features.push(if self.anomaly_type.category() == *cat {
892                1.0
893            } else {
894                0.0
895            });
896        }
897
898        // Severity (normalized)
899        features.push(self.severity as f64 / 5.0);
900
901        // Confidence
902        features.push(self.confidence);
903
904        // Has monetary impact
905        features.push(if self.monetary_impact.is_some() {
906            1.0
907        } else {
908            0.0
909        });
910
911        // Monetary impact (log-scaled)
912        if let Some(impact) = self.monetary_impact {
913            let impact_f64: f64 = impact.try_into().unwrap_or(0.0);
914            features.push((impact_f64.abs() + 1.0).ln());
915        } else {
916            features.push(0.0);
917        }
918
919        // Is intentional
920        features.push(if self.anomaly_type.is_intentional() {
921            1.0
922        } else {
923            0.0
924        });
925
926        // Number of related entities
927        features.push(self.related_entities.len() as f64);
928
929        // Is part of cluster
930        features.push(if self.cluster_id.is_some() { 1.0 } else { 0.0 });
931
932        // Provenance features
933        // Is part of scenario
934        features.push(if self.scenario_id.is_some() { 1.0 } else { 0.0 });
935
936        // Has parent anomaly (indicates this is a derived anomaly)
937        features.push(if self.parent_anomaly_id.is_some() {
938            1.0
939        } else {
940            0.0
941        });
942
943        features
944    }
945
/// Reports how many entries the ML feature vector contains.
///
/// The total is the 6 one-hot category slots plus 9 further features.
pub fn feature_count() -> usize {
    const CATEGORY_SLOTS: usize = 6;
    const OTHER_FEATURES: usize = 9;
    CATEGORY_SLOTS + OTHER_FEATURES
}
950
/// Lists a name for each feature vector entry, for documentation and
/// ML metadata.
///
/// The first six names are the category one-hot slots; the remaining nine
/// name the scalar and indicator features.
pub fn feature_names() -> Vec<&'static str> {
    const NAMES: [&str; 15] = [
        // Category one-hot slots.
        "category_fraud",
        "category_error",
        "category_process_issue",
        "category_statistical",
        "category_relational",
        "category_custom",
        // Scalar and indicator features.
        "severity_normalized",
        "confidence",
        "has_monetary_impact",
        "monetary_impact_log",
        "is_intentional",
        "related_entity_count",
        "is_clustered",
        "is_scenario_part",
        "is_derived",
    ];
    NAMES.to_vec()
}
971}
972
973/// Summary of anomalies for reporting.
974#[derive(Debug, Clone, Default, Serialize, Deserialize)]
975pub struct AnomalySummary {
976    /// Total anomaly count.
977    pub total_count: usize,
978    /// Count by category.
979    pub by_category: HashMap<String, usize>,
980    /// Count by specific type.
981    pub by_type: HashMap<String, usize>,
982    /// Count by severity.
983    pub by_severity: HashMap<u8, usize>,
984    /// Count by company.
985    pub by_company: HashMap<String, usize>,
986    /// Total monetary impact.
987    pub total_monetary_impact: Decimal,
988    /// Date range.
989    pub date_range: Option<(NaiveDate, NaiveDate)>,
990    /// Number of clusters.
991    pub cluster_count: usize,
992}
993
994impl AnomalySummary {
995    /// Creates a summary from a list of anomalies.
996    pub fn from_anomalies(anomalies: &[LabeledAnomaly]) -> Self {
997        let mut summary = AnomalySummary {
998            total_count: anomalies.len(),
999            ..Default::default()
1000        };
1001
1002        let mut min_date: Option<NaiveDate> = None;
1003        let mut max_date: Option<NaiveDate> = None;
1004        let mut clusters = std::collections::HashSet::new();
1005
1006        for anomaly in anomalies {
1007            // By category
1008            *summary
1009                .by_category
1010                .entry(anomaly.anomaly_type.category().to_string())
1011                .or_insert(0) += 1;
1012
1013            // By type
1014            *summary
1015                .by_type
1016                .entry(anomaly.anomaly_type.type_name())
1017                .or_insert(0) += 1;
1018
1019            // By severity
1020            *summary.by_severity.entry(anomaly.severity).or_insert(0) += 1;
1021
1022            // By company
1023            *summary
1024                .by_company
1025                .entry(anomaly.company_code.clone())
1026                .or_insert(0) += 1;
1027
1028            // Monetary impact
1029            if let Some(impact) = anomaly.monetary_impact {
1030                summary.total_monetary_impact += impact;
1031            }
1032
1033            // Date range
1034            match min_date {
1035                None => min_date = Some(anomaly.anomaly_date),
1036                Some(d) if anomaly.anomaly_date < d => min_date = Some(anomaly.anomaly_date),
1037                _ => {}
1038            }
1039            match max_date {
1040                None => max_date = Some(anomaly.anomaly_date),
1041                Some(d) if anomaly.anomaly_date > d => max_date = Some(anomaly.anomaly_date),
1042                _ => {}
1043            }
1044
1045            // Clusters
1046            if let Some(cluster_id) = &anomaly.cluster_id {
1047                clusters.insert(cluster_id.clone());
1048            }
1049        }
1050
1051        summary.date_range = min_date.zip(max_date);
1052        summary.cluster_count = clusters.len();
1053
1054        summary
1055    }
1056}
1057
1058/// Configuration for anomaly rates.
1059#[derive(Debug, Clone, Serialize, Deserialize)]
1060pub struct AnomalyRateConfig {
1061    /// Overall anomaly rate (0.0 - 1.0).
1062    pub total_rate: f64,
1063    /// Fraud rate as proportion of anomalies.
1064    pub fraud_rate: f64,
1065    /// Error rate as proportion of anomalies.
1066    pub error_rate: f64,
1067    /// Process issue rate as proportion of anomalies.
1068    pub process_issue_rate: f64,
1069    /// Statistical anomaly rate as proportion of anomalies.
1070    pub statistical_rate: f64,
1071    /// Relational anomaly rate as proportion of anomalies.
1072    pub relational_rate: f64,
1073}
1074
1075impl Default for AnomalyRateConfig {
1076    fn default() -> Self {
1077        Self {
1078            total_rate: 0.02,         // 2% of transactions are anomalous
1079            fraud_rate: 0.25,         // 25% of anomalies are fraud
1080            error_rate: 0.35,         // 35% of anomalies are errors
1081            process_issue_rate: 0.20, // 20% are process issues
1082            statistical_rate: 0.15,   // 15% are statistical
1083            relational_rate: 0.05,    // 5% are relational
1084        }
1085    }
1086}
1087
1088impl AnomalyRateConfig {
1089    /// Validates that rates sum to approximately 1.0.
1090    pub fn validate(&self) -> Result<(), String> {
1091        let sum = self.fraud_rate
1092            + self.error_rate
1093            + self.process_issue_rate
1094            + self.statistical_rate
1095            + self.relational_rate;
1096
1097        if (sum - 1.0).abs() > 0.01 {
1098            return Err(format!(
1099                "Anomaly category rates must sum to 1.0, got {}",
1100                sum
1101            ));
1102        }
1103
1104        if self.total_rate < 0.0 || self.total_rate > 1.0 {
1105            return Err(format!(
1106                "Total rate must be between 0.0 and 1.0, got {}",
1107                self.total_rate
1108            ));
1109        }
1110
1111        Ok(())
1112    }
1113}
1114
#[cfg(test)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    /// Returns the given day of January 2024.
    fn january_2024(day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(2024, 1, day).unwrap()
    }

    /// Builds an anomaly through `LabeledAnomaly::new`, passing "JE" and
    /// "1000" for the two string arguments that every test keeps fixed.
    fn journal_entry_anomaly(
        anomaly_id: &str,
        anomaly_type: AnomalyType,
        document_id: &str,
        day: u32,
    ) -> LabeledAnomaly {
        LabeledAnomaly::new(
            anomaly_id.to_string(),
            anomaly_type,
            document_id.to_string(),
            "JE".to_string(),
            "1000".to_string(),
            january_2024(day),
        )
    }

    /// The fixture most tests start from: self-approval fraud `ANO001` on
    /// `JE001`, dated 2024-01-15.
    fn self_approval_anomaly() -> LabeledAnomaly {
        journal_entry_anomaly(
            "ANO001",
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001",
            15,
        )
    }

    /// Fraud is reported as intentional, errors are not.
    #[test]
    fn test_anomaly_type_category() {
        let fraud = AnomalyType::Fraud(FraudType::SelfApproval);
        assert_eq!(fraud.category(), "Fraud");
        assert!(fraud.is_intentional());

        let error = AnomalyType::Error(ErrorType::DuplicateEntry);
        assert_eq!(error.category(), "Error");
        assert!(!error.is_intentional());
    }

    /// Construction plus the description/related-entity builders.
    #[test]
    fn test_labeled_anomaly() {
        let anomaly = self_approval_anomaly()
            .with_description("User approved their own expense report")
            .with_related_entity("USER001");

        assert_eq!(anomaly.severity, 3);
        assert!(anomaly.is_injected);
        assert_eq!(anomaly.related_entities.len(), 1);
    }

    /// Every provenance builder stores its value on the anomaly.
    #[test]
    fn test_labeled_anomaly_with_provenance() {
        let strategy = InjectionStrategy::SelfApproval {
            user_id: "USER001".to_string(),
        };
        let anomaly = self_approval_anomaly()
            .with_run_id("run-123")
            .with_generation_seed(42)
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 })
            .with_structured_strategy(strategy)
            .with_scenario("scenario-001")
            .with_original_document_hash("abc123");

        assert_eq!(anomaly.run_id.as_deref(), Some("run-123"));
        assert_eq!(anomaly.generation_seed, Some(42));
        assert!(anomaly.causal_reason.is_some());
        assert!(anomaly.structured_strategy.is_some());
        assert_eq!(anomaly.scenario_id.as_deref(), Some("scenario-001"));
        assert_eq!(anomaly.original_document_hash.as_deref(), Some("abc123"));

        // The legacy string field must be populated alongside the structured one.
        assert_eq!(anomaly.injection_strategy.as_deref(), Some("SelfApproval"));
    }

    /// A child anomaly records the ID of the anomaly it derives from.
    #[test]
    fn test_labeled_anomaly_derivation_chain() {
        let parent = journal_entry_anomaly(
            "ANO001",
            AnomalyType::Fraud(FraudType::DuplicatePayment),
            "JE001",
            15,
        );

        let child = journal_entry_anomaly(
            "ANO002",
            AnomalyType::Error(ErrorType::DuplicateEntry),
            "JE002",
            15,
        )
        .with_parent_anomaly(&parent.anomaly_id);

        assert_eq!(child.parent_anomaly_id.as_deref(), Some("ANO001"));
    }

    /// Human-readable descriptions and type names of injection strategies.
    #[test]
    fn test_injection_strategy_description() {
        let manipulation = InjectionStrategy::AmountManipulation {
            original: dec!(1000),
            factor: 2.5,
        };
        assert_eq!(manipulation.description(), "Amount multiplied by 2.50");
        assert_eq!(manipulation.strategy_type(), "AmountManipulation");

        // The remaining strategies are only checked for their description.
        let description_cases = [
            (
                InjectionStrategy::ThresholdAvoidance {
                    threshold: dec!(10000),
                    adjusted_amount: dec!(9999),
                },
                "Amount adjusted to avoid 10000 threshold",
            ),
            (
                InjectionStrategy::DateShift {
                    days_shifted: -5,
                    original_date: january_2024(15),
                },
                "Date backdated by 5 days",
            ),
            (
                InjectionStrategy::DateShift {
                    days_shifted: 3,
                    original_date: january_2024(15),
                },
                "Date forward-dated by 3 days",
            ),
        ];
        for (strategy, expected) in &description_cases {
            assert_eq!(strategy.description(), *expected);
        }
    }

    /// Causal-reason variants keep the payload they were built with.
    #[test]
    fn test_causal_reason_variants() {
        // Floating-point payload: compared with a tolerance.
        let random_rate = AnomalyCausalReason::RandomRate { base_rate: 0.02 };
        assert!(matches!(
            random_rate,
            AnomalyCausalReason::RandomRate { base_rate } if (base_rate - 0.02).abs() < 0.001
        ));

        let temporal = AnomalyCausalReason::TemporalPattern {
            pattern_name: "year_end_spike".to_string(),
        };
        assert_eq!(
            temporal,
            AnomalyCausalReason::TemporalPattern {
                pattern_name: "year_end_spike".to_string(),
            }
        );

        let scenario_step = AnomalyCausalReason::ScenarioStep {
            scenario_type: "kickback".to_string(),
            step_number: 3,
        };
        assert_eq!(
            scenario_step,
            AnomalyCausalReason::ScenarioStep {
                scenario_type: "kickback".to_string(),
                step_number: 3,
            }
        );
    }

    /// The vector length agrees with both metadata accessors.
    #[test]
    fn test_feature_vector_length() {
        let features = self_approval_anomaly().to_features();

        assert_eq!(features.len(), LabeledAnomaly::feature_count());
        assert_eq!(features.len(), LabeledAnomaly::feature_names().len());
    }

    /// Scenario and parent links show up as the two trailing indicators.
    #[test]
    fn test_feature_vector_with_provenance() {
        let features = self_approval_anomaly()
            .with_scenario("scenario-001")
            .with_parent_anomaly("ANO000")
            .to_features();

        // The last two features are is_scenario_part and is_derived.
        match features.as_slice() {
            [.., is_scenario_part, is_derived] => {
                assert_eq!(*is_scenario_part, 1.0);
                assert_eq!(*is_derived, 1.0);
            }
            _ => panic!("feature vector has fewer than two entries"),
        }
    }

    /// Summaries count anomalies overall and per category.
    #[test]
    fn test_anomaly_summary() {
        let anomalies = vec![
            self_approval_anomaly(),
            journal_entry_anomaly(
                "ANO002",
                AnomalyType::Error(ErrorType::DuplicateEntry),
                "JE002",
                16,
            ),
        ];

        let summary = AnomalySummary::from_anomalies(&anomalies);

        assert_eq!(summary.total_count, 2);
        assert_eq!(summary.by_category.get("Fraud"), Some(&1));
        assert_eq!(summary.by_category.get("Error"), Some(&1));
    }

    /// The default config validates; category rates summing above 1.0 do not.
    #[test]
    fn test_rate_config_validation() {
        assert!(AnomalyRateConfig::default().validate().is_ok());

        let oversubscribed = AnomalyRateConfig {
            fraud_rate: 0.5,
            error_rate: 0.5,
            process_issue_rate: 0.5, // Sum > 1.0
            ..Default::default()
        };
        assert!(oversubscribed.validate().is_err());
    }

    /// Injection strategies survive a JSON round trip unchanged.
    #[test]
    fn test_injection_strategy_serialization() {
        let strategy = InjectionStrategy::SoDViolation {
            duty1: "CreatePO".to_string(),
            duty2: "ApprovePO".to_string(),
            violating_user: "USER001".to_string(),
        };

        let json = serde_json::to_string(&strategy).expect("strategy should serialize");
        let round_tripped: InjectionStrategy =
            serde_json::from_str(&json).expect("strategy should deserialize");

        assert_eq!(strategy, round_tripped);
    }

    /// Provenance fields survive a JSON round trip of the whole anomaly.
    #[test]
    fn test_labeled_anomaly_serialization_with_provenance() {
        let anomaly = self_approval_anomaly()
            .with_run_id("run-123")
            .with_generation_seed(42)
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let json = serde_json::to_string(&anomaly).expect("anomaly should serialize");
        let round_tripped: LabeledAnomaly =
            serde_json::from_str(&json).expect("anomaly should deserialize");

        assert_eq!(anomaly.run_id, round_tripped.run_id);
        assert_eq!(anomaly.generation_seed, round_tripped.generation_seed);
    }
}
datasynth_core/models/anomaly.rs

datasynth_core/models/
anomaly.rs