datasynth_core/models/
anomaly.rs

1//! Anomaly types and labels for synthetic data generation.
2//!
3//! This module provides comprehensive anomaly classification for:
4//! - Fraud detection training
5//! - Error detection systems
6//! - Process compliance monitoring
7//! - Statistical anomaly detection
8//! - Graph-based anomaly detection
9
10use chrono::{NaiveDate, NaiveDateTime};
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
15/// Causal reason explaining why an anomaly was injected.
16///
17/// This enables provenance tracking for understanding the "why" behind each anomaly.
18#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
19pub enum AnomalyCausalReason {
20    /// Injected due to random rate selection.
21    RandomRate {
22        /// Base rate used for selection.
23        base_rate: f64,
24    },
25    /// Injected due to temporal pattern matching.
26    TemporalPattern {
27        /// Name of the temporal pattern (e.g., "year_end_spike", "month_end").
28        pattern_name: String,
29    },
30    /// Injected based on entity targeting rules.
31    EntityTargeting {
32        /// Type of entity targeted (e.g., "vendor", "user", "account").
33        target_type: String,
34        /// ID of the targeted entity.
35        target_id: String,
36    },
37    /// Part of an anomaly cluster.
38    ClusterMembership {
39        /// ID of the cluster this anomaly belongs to.
40        cluster_id: String,
41    },
42    /// Part of a multi-step scenario.
43    ScenarioStep {
44        /// Type of scenario (e.g., "kickback_scheme", "round_tripping").
45        scenario_type: String,
46        /// Step number within the scenario.
47        step_number: u32,
48    },
49    /// Injected based on data quality profile.
50    DataQualityProfile {
51        /// Profile name (e.g., "noisy", "legacy", "clean").
52        profile: String,
53    },
54    /// Injected for ML training balance.
55    MLTrainingBalance {
56        /// Target class being balanced.
57        target_class: String,
58    },
59}
60
61/// Structured injection strategy with captured parameters.
62///
63/// Unlike the string-based `injection_strategy` field, this enum captures
64/// the exact parameters used during injection for full reproducibility.
65#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
66pub enum InjectionStrategy {
67    /// Amount was manipulated by a factor.
68    AmountManipulation {
69        /// Original amount before manipulation.
70        original: Decimal,
71        /// Multiplication factor applied.
72        factor: f64,
73    },
74    /// Amount adjusted to avoid a threshold.
75    ThresholdAvoidance {
76        /// Threshold being avoided.
77        threshold: Decimal,
78        /// Final amount after adjustment.
79        adjusted_amount: Decimal,
80    },
81    /// Date was backdated or forward-dated.
82    DateShift {
83        /// Number of days shifted (negative = backdated).
84        days_shifted: i32,
85        /// Original date before shift.
86        original_date: NaiveDate,
87    },
88    /// User approved their own transaction.
89    SelfApproval {
90        /// User who created and approved.
91        user_id: String,
92    },
93    /// Segregation of duties violation.
94    SoDViolation {
95        /// First duty involved.
96        duty1: String,
97        /// Second duty involved.
98        duty2: String,
99        /// User who performed both duties.
100        violating_user: String,
101    },
102    /// Exact duplicate of another document.
103    ExactDuplicate {
104        /// ID of the original document.
105        original_doc_id: String,
106    },
107    /// Near-duplicate with small variations.
108    NearDuplicate {
109        /// ID of the original document.
110        original_doc_id: String,
111        /// Fields that were varied.
112        varied_fields: Vec<String>,
113    },
114    /// Circular flow of funds/goods.
115    CircularFlow {
116        /// Chain of entities involved.
117        entity_chain: Vec<String>,
118    },
119    /// Split transaction to avoid threshold.
120    SplitTransaction {
121        /// Original total amount.
122        original_amount: Decimal,
123        /// Number of splits.
124        split_count: u32,
125        /// IDs of the split documents.
126        split_doc_ids: Vec<String>,
127    },
128    /// Round number manipulation.
129    RoundNumbering {
130        /// Original precise amount.
131        original_amount: Decimal,
132        /// Rounded amount.
133        rounded_amount: Decimal,
134    },
135    /// Timing manipulation (weekend, after-hours, etc.).
136    TimingManipulation {
137        /// Type of timing issue.
138        timing_type: String,
139        /// Original timestamp.
140        original_time: Option<NaiveDateTime>,
141    },
142    /// Account misclassification.
143    AccountMisclassification {
144        /// Correct account.
145        correct_account: String,
146        /// Incorrect account used.
147        incorrect_account: String,
148    },
149    /// Missing required field.
150    MissingField {
151        /// Name of the missing field.
152        field_name: String,
153    },
154    /// Custom injection strategy.
155    Custom {
156        /// Strategy name.
157        name: String,
158        /// Additional parameters.
159        parameters: HashMap<String, String>,
160    },
161}
162
163impl InjectionStrategy {
164    /// Returns a human-readable description of the strategy.
165    pub fn description(&self) -> String {
166        match self {
167            InjectionStrategy::AmountManipulation { factor, .. } => {
168                format!("Amount multiplied by {:.2}", factor)
169            }
170            InjectionStrategy::ThresholdAvoidance { threshold, .. } => {
171                format!("Amount adjusted to avoid {} threshold", threshold)
172            }
173            InjectionStrategy::DateShift { days_shifted, .. } => {
174                if *days_shifted < 0 {
175                    format!("Date backdated by {} days", days_shifted.abs())
176                } else {
177                    format!("Date forward-dated by {} days", days_shifted)
178                }
179            }
180            InjectionStrategy::SelfApproval { user_id } => {
181                format!("Self-approval by user {}", user_id)
182            }
183            InjectionStrategy::SoDViolation { duty1, duty2, .. } => {
184                format!("SoD violation: {} and {}", duty1, duty2)
185            }
186            InjectionStrategy::ExactDuplicate { original_doc_id } => {
187                format!("Exact duplicate of {}", original_doc_id)
188            }
189            InjectionStrategy::NearDuplicate {
190                original_doc_id,
191                varied_fields,
192            } => {
193                format!(
194                    "Near-duplicate of {} (varied: {:?})",
195                    original_doc_id, varied_fields
196                )
197            }
198            InjectionStrategy::CircularFlow { entity_chain } => {
199                format!("Circular flow through {} entities", entity_chain.len())
200            }
201            InjectionStrategy::SplitTransaction { split_count, .. } => {
202                format!("Split into {} transactions", split_count)
203            }
204            InjectionStrategy::RoundNumbering { .. } => "Amount rounded to even number".to_string(),
205            InjectionStrategy::TimingManipulation { timing_type, .. } => {
206                format!("Timing manipulation: {}", timing_type)
207            }
208            InjectionStrategy::AccountMisclassification {
209                correct_account,
210                incorrect_account,
211            } => {
212                format!(
213                    "Misclassified from {} to {}",
214                    correct_account, incorrect_account
215                )
216            }
217            InjectionStrategy::MissingField { field_name } => {
218                format!("Missing required field: {}", field_name)
219            }
220            InjectionStrategy::Custom { name, .. } => format!("Custom: {}", name),
221        }
222    }
223
224    /// Returns the strategy type name.
225    pub fn strategy_type(&self) -> &'static str {
226        match self {
227            InjectionStrategy::AmountManipulation { .. } => "AmountManipulation",
228            InjectionStrategy::ThresholdAvoidance { .. } => "ThresholdAvoidance",
229            InjectionStrategy::DateShift { .. } => "DateShift",
230            InjectionStrategy::SelfApproval { .. } => "SelfApproval",
231            InjectionStrategy::SoDViolation { .. } => "SoDViolation",
232            InjectionStrategy::ExactDuplicate { .. } => "ExactDuplicate",
233            InjectionStrategy::NearDuplicate { .. } => "NearDuplicate",
234            InjectionStrategy::CircularFlow { .. } => "CircularFlow",
235            InjectionStrategy::SplitTransaction { .. } => "SplitTransaction",
236            InjectionStrategy::RoundNumbering { .. } => "RoundNumbering",
237            InjectionStrategy::TimingManipulation { .. } => "TimingManipulation",
238            InjectionStrategy::AccountMisclassification { .. } => "AccountMisclassification",
239            InjectionStrategy::MissingField { .. } => "MissingField",
240            InjectionStrategy::Custom { .. } => "Custom",
241        }
242    }
243}
244
245/// Primary anomaly classification.
246#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
247pub enum AnomalyType {
248    /// Fraudulent activity.
249    Fraud(FraudType),
250    /// Data entry or processing error.
251    Error(ErrorType),
252    /// Process or control issue.
253    ProcessIssue(ProcessIssueType),
254    /// Statistical anomaly.
255    Statistical(StatisticalAnomalyType),
256    /// Relational/graph anomaly.
257    Relational(RelationalAnomalyType),
258    /// Custom anomaly type.
259    Custom(String),
260}
261
262impl AnomalyType {
263    /// Returns the category name.
264    pub fn category(&self) -> &'static str {
265        match self {
266            AnomalyType::Fraud(_) => "Fraud",
267            AnomalyType::Error(_) => "Error",
268            AnomalyType::ProcessIssue(_) => "ProcessIssue",
269            AnomalyType::Statistical(_) => "Statistical",
270            AnomalyType::Relational(_) => "Relational",
271            AnomalyType::Custom(_) => "Custom",
272        }
273    }
274
275    /// Returns the specific type name.
276    pub fn type_name(&self) -> String {
277        match self {
278            AnomalyType::Fraud(t) => format!("{:?}", t),
279            AnomalyType::Error(t) => format!("{:?}", t),
280            AnomalyType::ProcessIssue(t) => format!("{:?}", t),
281            AnomalyType::Statistical(t) => format!("{:?}", t),
282            AnomalyType::Relational(t) => format!("{:?}", t),
283            AnomalyType::Custom(s) => s.clone(),
284        }
285    }
286
287    /// Returns the severity level (1-5, 5 being most severe).
288    pub fn severity(&self) -> u8 {
289        match self {
290            AnomalyType::Fraud(t) => t.severity(),
291            AnomalyType::Error(t) => t.severity(),
292            AnomalyType::ProcessIssue(t) => t.severity(),
293            AnomalyType::Statistical(t) => t.severity(),
294            AnomalyType::Relational(t) => t.severity(),
295            AnomalyType::Custom(_) => 3,
296        }
297    }
298
299    /// Returns whether this anomaly is typically intentional.
300    pub fn is_intentional(&self) -> bool {
301        matches!(self, AnomalyType::Fraud(_))
302    }
303}
304
305/// Fraud types for detection training.
306#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
307pub enum FraudType {
308    // Journal Entry Fraud
309    /// Fictitious journal entry with no business purpose.
310    FictitiousEntry,
311    /// Fictitious transaction (alias for FictitiousEntry).
312    FictitiousTransaction,
313    /// Round-dollar amounts suggesting manual manipulation.
314    RoundDollarManipulation,
315    /// Entry posted just below approval threshold.
316    JustBelowThreshold,
317    /// Revenue recognition manipulation.
318    RevenueManipulation,
319    /// Expense capitalization fraud.
320    ImproperCapitalization,
321    /// Improperly capitalizing expenses as assets.
322    ExpenseCapitalization,
323    /// Cookie jar reserves manipulation.
324    ReserveManipulation,
325    /// Round-tripping funds through suspense/clearing accounts.
326    SuspenseAccountAbuse,
327    /// Splitting transactions to stay below approval thresholds.
328    SplitTransaction,
329    /// Unusual timing (weekend, holiday, after-hours postings).
330    TimingAnomaly,
331    /// Posting to unauthorized accounts.
332    UnauthorizedAccess,
333
334    // Approval Fraud
335    /// User approving their own request.
336    SelfApproval,
337    /// Approval beyond authorized limit.
338    ExceededApprovalLimit,
339    /// Segregation of duties violation.
340    SegregationOfDutiesViolation,
341    /// Approval by unauthorized user.
342    UnauthorizedApproval,
343    /// Collusion between approver and requester.
344    CollusiveApproval,
345
346    // Vendor/Payment Fraud
347    /// Fictitious vendor.
348    FictitiousVendor,
349    /// Duplicate payment to vendor.
350    DuplicatePayment,
351    /// Payment to shell company.
352    ShellCompanyPayment,
353    /// Kickback scheme.
354    Kickback,
355    /// Kickback scheme (alias).
356    KickbackScheme,
357    /// Invoice manipulation.
358    InvoiceManipulation,
359
360    // Asset Fraud
361    /// Misappropriation of assets.
362    AssetMisappropriation,
363    /// Inventory theft.
364    InventoryTheft,
365    /// Ghost employee.
366    GhostEmployee,
367
368    // Financial Statement Fraud
369    /// Premature revenue recognition.
370    PrematureRevenue,
371    /// Understated liabilities.
372    UnderstatedLiabilities,
373    /// Overstated assets.
374    OverstatedAssets,
375    /// Channel stuffing.
376    ChannelStuffing,
377}
378
379impl FraudType {
380    /// Returns severity level (1-5).
381    pub fn severity(&self) -> u8 {
382        match self {
383            FraudType::RoundDollarManipulation => 2,
384            FraudType::JustBelowThreshold => 3,
385            FraudType::SelfApproval => 3,
386            FraudType::ExceededApprovalLimit => 3,
387            FraudType::DuplicatePayment => 3,
388            FraudType::FictitiousEntry => 4,
389            FraudType::RevenueManipulation => 5,
390            FraudType::FictitiousVendor => 5,
391            FraudType::ShellCompanyPayment => 5,
392            FraudType::AssetMisappropriation => 5,
393            FraudType::SegregationOfDutiesViolation => 4,
394            FraudType::CollusiveApproval => 5,
395            _ => 4,
396        }
397    }
398}
399
400/// Error types for error detection.
401#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
402pub enum ErrorType {
403    // Data Entry Errors
404    /// Duplicate document entry.
405    DuplicateEntry,
406    /// Reversed debit/credit amounts.
407    ReversedAmount,
408    /// Transposed digits in amount.
409    TransposedDigits,
410    /// Wrong decimal placement.
411    DecimalError,
412    /// Missing required field.
413    MissingField,
414    /// Invalid account code.
415    InvalidAccount,
416
417    // Timing Errors
418    /// Posted to wrong period.
419    WrongPeriod,
420    /// Backdated entry.
421    BackdatedEntry,
422    /// Future-dated entry.
423    FutureDatedEntry,
424    /// Cutoff error.
425    CutoffError,
426
427    // Classification Errors
428    /// Wrong account classification.
429    MisclassifiedAccount,
430    /// Wrong cost center.
431    WrongCostCenter,
432    /// Wrong company code.
433    WrongCompanyCode,
434
435    // Calculation Errors
436    /// Unbalanced journal entry.
437    UnbalancedEntry,
438    /// Rounding error.
439    RoundingError,
440    /// Currency conversion error.
441    CurrencyError,
442    /// Tax calculation error.
443    TaxCalculationError,
444}
445
446impl ErrorType {
447    /// Returns severity level (1-5).
448    pub fn severity(&self) -> u8 {
449        match self {
450            ErrorType::RoundingError => 1,
451            ErrorType::MissingField => 2,
452            ErrorType::TransposedDigits => 2,
453            ErrorType::DecimalError => 3,
454            ErrorType::DuplicateEntry => 3,
455            ErrorType::ReversedAmount => 3,
456            ErrorType::WrongPeriod => 4,
457            ErrorType::UnbalancedEntry => 5,
458            ErrorType::CurrencyError => 4,
459            _ => 3,
460        }
461    }
462}
463
464/// Process issue types.
465#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
466pub enum ProcessIssueType {
467    // Approval Issues
468    /// Approval skipped entirely.
469    SkippedApproval,
470    /// Late approval (after posting).
471    LateApproval,
472    /// Missing supporting documentation.
473    MissingDocumentation,
474    /// Incomplete approval chain.
475    IncompleteApprovalChain,
476
477    // Timing Issues
478    /// Late posting.
479    LatePosting,
480    /// Posting outside business hours.
481    AfterHoursPosting,
482    /// Weekend/holiday posting.
483    WeekendPosting,
484    /// Rushed period-end posting.
485    RushedPeriodEnd,
486
487    // Control Issues
488    /// Manual override of system control.
489    ManualOverride,
490    /// Unusual user access pattern.
491    UnusualAccess,
492    /// System bypass.
493    SystemBypass,
494    /// Batch processing anomaly.
495    BatchAnomaly,
496
497    // Documentation Issues
498    /// Vague or missing description.
499    VagueDescription,
500    /// Changed after posting.
501    PostFactoChange,
502    /// Incomplete audit trail.
503    IncompleteAuditTrail,
504}
505
506impl ProcessIssueType {
507    /// Returns severity level (1-5).
508    pub fn severity(&self) -> u8 {
509        match self {
510            ProcessIssueType::VagueDescription => 1,
511            ProcessIssueType::LatePosting => 2,
512            ProcessIssueType::AfterHoursPosting => 2,
513            ProcessIssueType::WeekendPosting => 2,
514            ProcessIssueType::SkippedApproval => 4,
515            ProcessIssueType::ManualOverride => 4,
516            ProcessIssueType::SystemBypass => 5,
517            ProcessIssueType::IncompleteAuditTrail => 4,
518            _ => 3,
519        }
520    }
521}
522
523/// Statistical anomaly types.
524#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
525pub enum StatisticalAnomalyType {
526    // Amount Anomalies
527    /// Amount significantly above normal.
528    UnusuallyHighAmount,
529    /// Amount significantly below normal.
530    UnusuallyLowAmount,
531    /// Violates Benford's Law distribution.
532    BenfordViolation,
533    /// Exact duplicate amount (suspicious).
534    ExactDuplicateAmount,
535    /// Repeating pattern in amounts.
536    RepeatingAmount,
537
538    // Frequency Anomalies
539    /// Unusual transaction frequency.
540    UnusualFrequency,
541    /// Burst of transactions.
542    TransactionBurst,
543    /// Unusual time of day.
544    UnusualTiming,
545
546    // Trend Anomalies
547    /// Break in historical trend.
548    TrendBreak,
549    /// Sudden level shift.
550    LevelShift,
551    /// Seasonal pattern violation.
552    SeasonalAnomaly,
553
554    // Distribution Anomalies
555    /// Outlier in distribution.
556    StatisticalOutlier,
557    /// Change in variance.
558    VarianceChange,
559    /// Distribution shift.
560    DistributionShift,
561}
562
563impl StatisticalAnomalyType {
564    /// Returns severity level (1-5).
565    pub fn severity(&self) -> u8 {
566        match self {
567            StatisticalAnomalyType::UnusualTiming => 1,
568            StatisticalAnomalyType::UnusualFrequency => 2,
569            StatisticalAnomalyType::BenfordViolation => 2,
570            StatisticalAnomalyType::UnusuallyHighAmount => 3,
571            StatisticalAnomalyType::TrendBreak => 3,
572            StatisticalAnomalyType::TransactionBurst => 4,
573            StatisticalAnomalyType::ExactDuplicateAmount => 3,
574            _ => 3,
575        }
576    }
577}
578
579/// Relational/graph anomaly types.
580#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
581pub enum RelationalAnomalyType {
582    // Transaction Pattern Anomalies
583    /// Circular transaction pattern.
584    CircularTransaction,
585    /// Unusual account combination.
586    UnusualAccountPair,
587    /// New trading partner.
588    NewCounterparty,
589    /// Dormant account suddenly active.
590    DormantAccountActivity,
591
592    // Network Anomalies
593    /// Unusual network centrality.
594    CentralityAnomaly,
595    /// Isolated transaction cluster.
596    IsolatedCluster,
597    /// Bridge node anomaly.
598    BridgeNodeAnomaly,
599    /// Community structure change.
600    CommunityAnomaly,
601
602    // Relationship Anomalies
603    /// Missing expected relationship.
604    MissingRelationship,
605    /// Unexpected relationship.
606    UnexpectedRelationship,
607    /// Relationship strength change.
608    RelationshipStrengthChange,
609
610    // Intercompany Anomalies
611    /// Unmatched intercompany transaction.
612    UnmatchedIntercompany,
613    /// Circular intercompany flow.
614    CircularIntercompany,
615    /// Transfer pricing anomaly.
616    TransferPricingAnomaly,
617}
618
619impl RelationalAnomalyType {
620    /// Returns severity level (1-5).
621    pub fn severity(&self) -> u8 {
622        match self {
623            RelationalAnomalyType::NewCounterparty => 1,
624            RelationalAnomalyType::DormantAccountActivity => 2,
625            RelationalAnomalyType::UnusualAccountPair => 2,
626            RelationalAnomalyType::CircularTransaction => 4,
627            RelationalAnomalyType::CircularIntercompany => 4,
628            RelationalAnomalyType::TransferPricingAnomaly => 4,
629            RelationalAnomalyType::UnmatchedIntercompany => 3,
630            _ => 3,
631        }
632    }
633}
634
635/// A labeled anomaly for supervised learning.
636#[derive(Debug, Clone, Serialize, Deserialize)]
637pub struct LabeledAnomaly {
638    /// Unique anomaly identifier.
639    pub anomaly_id: String,
640    /// Type of anomaly.
641    pub anomaly_type: AnomalyType,
642    /// Document or entity that contains the anomaly.
643    pub document_id: String,
644    /// Document type (JE, PO, Invoice, etc.).
645    pub document_type: String,
646    /// Company code.
647    pub company_code: String,
648    /// Date the anomaly occurred.
649    pub anomaly_date: NaiveDate,
650    /// Timestamp when detected/injected.
651    pub detection_timestamp: NaiveDateTime,
652    /// Confidence score (0.0 - 1.0) for injected anomalies.
653    pub confidence: f64,
654    /// Severity (1-5).
655    pub severity: u8,
656    /// Description of the anomaly.
657    pub description: String,
658    /// Related entities (user IDs, account codes, etc.).
659    pub related_entities: Vec<String>,
660    /// Monetary impact if applicable.
661    pub monetary_impact: Option<Decimal>,
662    /// Additional metadata.
663    pub metadata: HashMap<String, String>,
664    /// Whether this was injected (true) or naturally occurring (false).
665    pub is_injected: bool,
666    /// Injection strategy used (if injected) - legacy string field.
667    pub injection_strategy: Option<String>,
668    /// Cluster ID if part of an anomaly cluster.
669    pub cluster_id: Option<String>,
670
671    // ========================================
672    // PROVENANCE TRACKING FIELDS (Phase 1.2)
673    // ========================================
674    /// Hash of the original document before modification.
675    /// Enables tracking what the document looked like pre-injection.
676    #[serde(default, skip_serializing_if = "Option::is_none")]
677    pub original_document_hash: Option<String>,
678
679    /// Causal reason explaining why this anomaly was injected.
680    /// Provides "why" tracking for each anomaly.
681    #[serde(default, skip_serializing_if = "Option::is_none")]
682    pub causal_reason: Option<AnomalyCausalReason>,
683
684    /// Structured injection strategy with parameters.
685    /// More detailed than the legacy string-based injection_strategy field.
686    #[serde(default, skip_serializing_if = "Option::is_none")]
687    pub structured_strategy: Option<InjectionStrategy>,
688
689    /// Parent anomaly ID if this was derived from another anomaly.
690    /// Enables anomaly transformation chains.
691    #[serde(default, skip_serializing_if = "Option::is_none")]
692    pub parent_anomaly_id: Option<String>,
693
694    /// Child anomaly IDs that were derived from this anomaly.
695    #[serde(default, skip_serializing_if = "Vec::is_empty")]
696    pub child_anomaly_ids: Vec<String>,
697
698    /// Scenario ID if this anomaly is part of a multi-step scenario.
699    #[serde(default, skip_serializing_if = "Option::is_none")]
700    pub scenario_id: Option<String>,
701
702    /// Generation run ID that produced this anomaly.
703    /// Enables tracing anomalies back to their generation run.
704    #[serde(default, skip_serializing_if = "Option::is_none")]
705    pub run_id: Option<String>,
706
707    /// Seed used for RNG during generation.
708    /// Enables reproducibility.
709    #[serde(default, skip_serializing_if = "Option::is_none")]
710    pub generation_seed: Option<u64>,
711}
712
713impl LabeledAnomaly {
714    /// Creates a new labeled anomaly.
715    pub fn new(
716        anomaly_id: String,
717        anomaly_type: AnomalyType,
718        document_id: String,
719        document_type: String,
720        company_code: String,
721        anomaly_date: NaiveDate,
722    ) -> Self {
723        let severity = anomaly_type.severity();
724        let description = format!(
725            "{} - {} in document {}",
726            anomaly_type.category(),
727            anomaly_type.type_name(),
728            document_id
729        );
730
731        Self {
732            anomaly_id,
733            anomaly_type,
734            document_id,
735            document_type,
736            company_code,
737            anomaly_date,
738            detection_timestamp: chrono::Local::now().naive_local(),
739            confidence: 1.0,
740            severity,
741            description,
742            related_entities: Vec::new(),
743            monetary_impact: None,
744            metadata: HashMap::new(),
745            is_injected: true,
746            injection_strategy: None,
747            cluster_id: None,
748            // Provenance fields
749            original_document_hash: None,
750            causal_reason: None,
751            structured_strategy: None,
752            parent_anomaly_id: None,
753            child_anomaly_ids: Vec::new(),
754            scenario_id: None,
755            run_id: None,
756            generation_seed: None,
757        }
758    }
759
760    /// Sets the description.
761    pub fn with_description(mut self, description: &str) -> Self {
762        self.description = description.to_string();
763        self
764    }
765
766    /// Sets the monetary impact.
767    pub fn with_monetary_impact(mut self, impact: Decimal) -> Self {
768        self.monetary_impact = Some(impact);
769        self
770    }
771
772    /// Adds a related entity.
773    pub fn with_related_entity(mut self, entity: &str) -> Self {
774        self.related_entities.push(entity.to_string());
775        self
776    }
777
778    /// Adds metadata.
779    pub fn with_metadata(mut self, key: &str, value: &str) -> Self {
780        self.metadata.insert(key.to_string(), value.to_string());
781        self
782    }
783
784    /// Sets the injection strategy (legacy string).
785    pub fn with_injection_strategy(mut self, strategy: &str) -> Self {
786        self.injection_strategy = Some(strategy.to_string());
787        self
788    }
789
790    /// Sets the cluster ID.
791    pub fn with_cluster(mut self, cluster_id: &str) -> Self {
792        self.cluster_id = Some(cluster_id.to_string());
793        self
794    }
795
796    // ========================================
797    // PROVENANCE BUILDER METHODS (Phase 1.2)
798    // ========================================
799
800    /// Sets the original document hash for provenance tracking.
801    pub fn with_original_document_hash(mut self, hash: &str) -> Self {
802        self.original_document_hash = Some(hash.to_string());
803        self
804    }
805
806    /// Sets the causal reason for this anomaly.
807    pub fn with_causal_reason(mut self, reason: AnomalyCausalReason) -> Self {
808        self.causal_reason = Some(reason);
809        self
810    }
811
812    /// Sets the structured injection strategy.
813    pub fn with_structured_strategy(mut self, strategy: InjectionStrategy) -> Self {
814        // Also set the legacy string field for backward compatibility
815        self.injection_strategy = Some(strategy.strategy_type().to_string());
816        self.structured_strategy = Some(strategy);
817        self
818    }
819
820    /// Sets the parent anomaly ID (for anomaly derivation chains).
821    pub fn with_parent_anomaly(mut self, parent_id: &str) -> Self {
822        self.parent_anomaly_id = Some(parent_id.to_string());
823        self
824    }
825
826    /// Adds a child anomaly ID.
827    pub fn with_child_anomaly(mut self, child_id: &str) -> Self {
828        self.child_anomaly_ids.push(child_id.to_string());
829        self
830    }
831
832    /// Sets the scenario ID for multi-step scenario tracking.
833    pub fn with_scenario(mut self, scenario_id: &str) -> Self {
834        self.scenario_id = Some(scenario_id.to_string());
835        self
836    }
837
838    /// Sets the generation run ID.
839    pub fn with_run_id(mut self, run_id: &str) -> Self {
840        self.run_id = Some(run_id.to_string());
841        self
842    }
843
844    /// Sets the generation seed for reproducibility.
845    pub fn with_generation_seed(mut self, seed: u64) -> Self {
846        self.generation_seed = Some(seed);
847        self
848    }
849
850    /// Sets multiple provenance fields at once for convenience.
851    pub fn with_provenance(
852        mut self,
853        run_id: Option<&str>,
854        seed: Option<u64>,
855        causal_reason: Option<AnomalyCausalReason>,
856    ) -> Self {
857        if let Some(id) = run_id {
858            self.run_id = Some(id.to_string());
859        }
860        self.generation_seed = seed;
861        self.causal_reason = causal_reason;
862        self
863    }
864
865    /// Converts to a feature vector for ML.
866    ///
867    /// Returns a vector of 15 features:
868    /// - 6 features: Category one-hot encoding (Fraud, Error, ProcessIssue, Statistical, Relational, Custom)
869    /// - 1 feature: Severity (normalized 0-1)
870    /// - 1 feature: Confidence
871    /// - 1 feature: Has monetary impact (0/1)
872    /// - 1 feature: Monetary impact (log-scaled)
873    /// - 1 feature: Is intentional (0/1)
874    /// - 1 feature: Number of related entities
875    /// - 1 feature: Is part of cluster (0/1)
876    /// - 1 feature: Is part of scenario (0/1)
877    /// - 1 feature: Has parent anomaly (0/1) - indicates derivation
878    pub fn to_features(&self) -> Vec<f64> {
879        let mut features = Vec::new();
880
881        // Category one-hot encoding
882        let categories = [
883            "Fraud",
884            "Error",
885            "ProcessIssue",
886            "Statistical",
887            "Relational",
888            "Custom",
889        ];
890        for cat in &categories {
891            features.push(if self.anomaly_type.category() == *cat {
892                1.0
893            } else {
894                0.0
895            });
896        }
897
898        // Severity (normalized)
899        features.push(self.severity as f64 / 5.0);
900
901        // Confidence
902        features.push(self.confidence);
903
904        // Has monetary impact
905        features.push(if self.monetary_impact.is_some() {
906            1.0
907        } else {
908            0.0
909        });
910
911        // Monetary impact (log-scaled)
912        if let Some(impact) = self.monetary_impact {
913            let impact_f64: f64 = impact.try_into().unwrap_or(0.0);
914            features.push((impact_f64.abs() + 1.0).ln());
915        } else {
916            features.push(0.0);
917        }
918
919        // Is intentional
920        features.push(if self.anomaly_type.is_intentional() {
921            1.0
922        } else {
923            0.0
924        });
925
926        // Number of related entities
927        features.push(self.related_entities.len() as f64);
928
929        // Is part of cluster
930        features.push(if self.cluster_id.is_some() { 1.0 } else { 0.0 });
931
932        // Provenance features
933        // Is part of scenario
934        features.push(if self.scenario_id.is_some() { 1.0 } else { 0.0 });
935
936        // Has parent anomaly (indicates this is a derived anomaly)
937        features.push(if self.parent_anomaly_id.is_some() {
938            1.0
939        } else {
940            0.0
941        });
942
943        features
944    }
945
946    /// Returns the number of features in the feature vector.
947    pub fn feature_count() -> usize {
948        15 // 6 category + 9 other features
949    }
950
951    /// Returns feature names for documentation/ML metadata.
952    pub fn feature_names() -> Vec<&'static str> {
953        vec![
954            "category_fraud",
955            "category_error",
956            "category_process_issue",
957            "category_statistical",
958            "category_relational",
959            "category_custom",
960            "severity_normalized",
961            "confidence",
962            "has_monetary_impact",
963            "monetary_impact_log",
964            "is_intentional",
965            "related_entity_count",
966            "is_clustered",
967            "is_scenario_part",
968            "is_derived",
969        ]
970    }
971}
972
/// Summary of anomalies for reporting.
///
/// Holds aggregate counts and totals over a set of `LabeledAnomaly` values.
/// The field descriptions below state how `AnomalySummary::from_anomalies`
/// fills them in; the `Default` value is an empty summary (zero counts, empty
/// maps, no date range).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AnomalySummary {
    /// Total anomaly count (length of the summarized slice).
    pub total_count: usize,
    /// Count by category, keyed by the string returned by
    /// `anomaly_type.category()`.
    pub by_category: HashMap<String, usize>,
    /// Count by specific type, keyed by `anomaly_type.type_name()`.
    pub by_type: HashMap<String, usize>,
    /// Count by severity, keyed by the anomaly's `severity` value.
    pub by_severity: HashMap<u8, usize>,
    /// Count by company, keyed by the anomaly's `company_code`.
    pub by_company: HashMap<String, usize>,
    /// Total monetary impact: the sum over anomalies that carry a
    /// `monetary_impact`; anomalies without one contribute nothing.
    pub total_monetary_impact: Decimal,
    /// Date range as `(earliest, latest)` anomaly date; `None` when no
    /// anomalies were summarized.
    pub date_range: Option<(NaiveDate, NaiveDate)>,
    /// Number of clusters, counted as distinct `cluster_id` values.
    pub cluster_count: usize,
}
993
994impl AnomalySummary {
995    /// Creates a summary from a list of anomalies.
996    pub fn from_anomalies(anomalies: &[LabeledAnomaly]) -> Self {
997        let mut summary = AnomalySummary {
998            total_count: anomalies.len(),
999            ..Default::default()
1000        };
1001
1002        let mut min_date: Option<NaiveDate> = None;
1003        let mut max_date: Option<NaiveDate> = None;
1004        let mut clusters = std::collections::HashSet::new();
1005
1006        for anomaly in anomalies {
1007            // By category
1008            *summary
1009                .by_category
1010                .entry(anomaly.anomaly_type.category().to_string())
1011                .or_insert(0) += 1;
1012
1013            // By type
1014            *summary
1015                .by_type
1016                .entry(anomaly.anomaly_type.type_name())
1017                .or_insert(0) += 1;
1018
1019            // By severity
1020            *summary.by_severity.entry(anomaly.severity).or_insert(0) += 1;
1021
1022            // By company
1023            *summary
1024                .by_company
1025                .entry(anomaly.company_code.clone())
1026                .or_insert(0) += 1;
1027
1028            // Monetary impact
1029            if let Some(impact) = anomaly.monetary_impact {
1030                summary.total_monetary_impact += impact;
1031            }
1032
1033            // Date range
1034            match min_date {
1035                None => min_date = Some(anomaly.anomaly_date),
1036                Some(d) if anomaly.anomaly_date < d => min_date = Some(anomaly.anomaly_date),
1037                _ => {}
1038            }
1039            match max_date {
1040                None => max_date = Some(anomaly.anomaly_date),
1041                Some(d) if anomaly.anomaly_date > d => max_date = Some(anomaly.anomaly_date),
1042                _ => {}
1043            }
1044
1045            // Clusters
1046            if let Some(cluster_id) = &anomaly.cluster_id {
1047                clusters.insert(cluster_id.clone());
1048            }
1049        }
1050
1051        summary.date_range = min_date.zip(max_date);
1052        summary.cluster_count = clusters.len();
1053
1054        summary
1055    }
1056}
1057
1058// ============================================================================
1059// ENHANCED ANOMALY TAXONOMY (FR-003)
1060// ============================================================================
1061
/// High-level anomaly category for multi-class classification.
///
/// These categories provide a more granular classification than the base
/// AnomalyType enum, enabling better ML model training and audit reporting.
///
/// A category can be derived from an `AnomalyType` with
/// `AnomalyCategory::from_anomaly_type`. Each variant also has a snake_case
/// name (`name()`) and a fixed ordinal (`ordinal()`) that follows the
/// declaration order below; every `Custom` value shares the last ordinal.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum AnomalyCategory {
    // Vendor-related anomalies
    /// Fictitious or shell vendor.
    FictitiousVendor,
    /// Kickback or collusion with vendor.
    VendorKickback,
    /// Related party vendor transactions.
    ///
    /// NOTE(review): `from_anomaly_type` never produces this variant, so it
    /// is presumably only assigned explicitly by callers; confirm.
    RelatedPartyVendor,

    // Transaction-related anomalies
    /// Duplicate payment or invoice.
    DuplicatePayment,
    /// Unauthorized transaction.
    UnauthorizedTransaction,
    /// Structured transactions to avoid thresholds.
    StructuredTransaction,

    // Pattern-based anomalies
    /// Circular flow of funds.
    CircularFlow,
    /// Behavioral anomaly (deviation from normal patterns).
    ///
    /// Also the fallback category in `from_anomaly_type` for fraud,
    /// process-issue and relational types without a more specific mapping.
    BehavioralAnomaly,
    /// Timing-based anomaly.
    TimingAnomaly,

    // Journal entry anomalies
    /// Manual journal entry anomaly.
    JournalAnomaly,
    /// Manual override of controls.
    ManualOverride,
    /// Missing approval in chain.
    MissingApproval,

    // Statistical anomalies
    /// Statistical outlier.
    StatisticalOutlier,
    /// Distribution anomaly (Benford, etc.).
    DistributionAnomaly,

    // Custom category
    /// User-defined category; the payload is the category name returned by
    /// `name()`.
    Custom(String),
}
1110
1111impl AnomalyCategory {
1112    /// Derives an AnomalyCategory from an AnomalyType.
1113    pub fn from_anomaly_type(anomaly_type: &AnomalyType) -> Self {
1114        match anomaly_type {
1115            AnomalyType::Fraud(fraud_type) => match fraud_type {
1116                FraudType::FictitiousVendor | FraudType::ShellCompanyPayment => {
1117                    AnomalyCategory::FictitiousVendor
1118                }
1119                FraudType::Kickback | FraudType::KickbackScheme => AnomalyCategory::VendorKickback,
1120                FraudType::DuplicatePayment => AnomalyCategory::DuplicatePayment,
1121                FraudType::SplitTransaction | FraudType::JustBelowThreshold => {
1122                    AnomalyCategory::StructuredTransaction
1123                }
1124                FraudType::SelfApproval
1125                | FraudType::UnauthorizedApproval
1126                | FraudType::CollusiveApproval => AnomalyCategory::UnauthorizedTransaction,
1127                FraudType::TimingAnomaly
1128                | FraudType::RoundDollarManipulation
1129                | FraudType::SuspenseAccountAbuse => AnomalyCategory::JournalAnomaly,
1130                _ => AnomalyCategory::BehavioralAnomaly,
1131            },
1132            AnomalyType::Error(error_type) => match error_type {
1133                ErrorType::DuplicateEntry => AnomalyCategory::DuplicatePayment,
1134                ErrorType::WrongPeriod
1135                | ErrorType::BackdatedEntry
1136                | ErrorType::FutureDatedEntry => AnomalyCategory::TimingAnomaly,
1137                _ => AnomalyCategory::JournalAnomaly,
1138            },
1139            AnomalyType::ProcessIssue(process_type) => match process_type {
1140                ProcessIssueType::SkippedApproval | ProcessIssueType::IncompleteApprovalChain => {
1141                    AnomalyCategory::MissingApproval
1142                }
1143                ProcessIssueType::ManualOverride | ProcessIssueType::SystemBypass => {
1144                    AnomalyCategory::ManualOverride
1145                }
1146                ProcessIssueType::AfterHoursPosting | ProcessIssueType::WeekendPosting => {
1147                    AnomalyCategory::TimingAnomaly
1148                }
1149                _ => AnomalyCategory::BehavioralAnomaly,
1150            },
1151            AnomalyType::Statistical(stat_type) => match stat_type {
1152                StatisticalAnomalyType::BenfordViolation
1153                | StatisticalAnomalyType::DistributionShift => AnomalyCategory::DistributionAnomaly,
1154                _ => AnomalyCategory::StatisticalOutlier,
1155            },
1156            AnomalyType::Relational(rel_type) => match rel_type {
1157                RelationalAnomalyType::CircularTransaction
1158                | RelationalAnomalyType::CircularIntercompany => AnomalyCategory::CircularFlow,
1159                _ => AnomalyCategory::BehavioralAnomaly,
1160            },
1161            AnomalyType::Custom(s) => AnomalyCategory::Custom(s.clone()),
1162        }
1163    }
1164
1165    /// Returns the category name as a string.
1166    pub fn name(&self) -> &str {
1167        match self {
1168            AnomalyCategory::FictitiousVendor => "fictitious_vendor",
1169            AnomalyCategory::VendorKickback => "vendor_kickback",
1170            AnomalyCategory::RelatedPartyVendor => "related_party_vendor",
1171            AnomalyCategory::DuplicatePayment => "duplicate_payment",
1172            AnomalyCategory::UnauthorizedTransaction => "unauthorized_transaction",
1173            AnomalyCategory::StructuredTransaction => "structured_transaction",
1174            AnomalyCategory::CircularFlow => "circular_flow",
1175            AnomalyCategory::BehavioralAnomaly => "behavioral_anomaly",
1176            AnomalyCategory::TimingAnomaly => "timing_anomaly",
1177            AnomalyCategory::JournalAnomaly => "journal_anomaly",
1178            AnomalyCategory::ManualOverride => "manual_override",
1179            AnomalyCategory::MissingApproval => "missing_approval",
1180            AnomalyCategory::StatisticalOutlier => "statistical_outlier",
1181            AnomalyCategory::DistributionAnomaly => "distribution_anomaly",
1182            AnomalyCategory::Custom(s) => s.as_str(),
1183        }
1184    }
1185
1186    /// Returns the ordinal value for ML encoding.
1187    pub fn ordinal(&self) -> u8 {
1188        match self {
1189            AnomalyCategory::FictitiousVendor => 0,
1190            AnomalyCategory::VendorKickback => 1,
1191            AnomalyCategory::RelatedPartyVendor => 2,
1192            AnomalyCategory::DuplicatePayment => 3,
1193            AnomalyCategory::UnauthorizedTransaction => 4,
1194            AnomalyCategory::StructuredTransaction => 5,
1195            AnomalyCategory::CircularFlow => 6,
1196            AnomalyCategory::BehavioralAnomaly => 7,
1197            AnomalyCategory::TimingAnomaly => 8,
1198            AnomalyCategory::JournalAnomaly => 9,
1199            AnomalyCategory::ManualOverride => 10,
1200            AnomalyCategory::MissingApproval => 11,
1201            AnomalyCategory::StatisticalOutlier => 12,
1202            AnomalyCategory::DistributionAnomaly => 13,
1203            AnomalyCategory::Custom(_) => 14,
1204        }
1205    }
1206
1207    /// Returns the total number of categories (excluding Custom).
1208    pub fn category_count() -> usize {
1209        15 // 14 fixed categories + Custom
1210    }
1211}
1212
/// Type of contributing factor for anomaly confidence/severity calculation.
///
/// Each variant is a plain tag with no payload; `FactorType::name` returns
/// its snake_case identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FactorType {
    /// Amount deviation from expected value.
    AmountDeviation,
    /// Proximity to approval/reporting threshold.
    ThresholdProximity,
    /// Timing-related anomaly indicator.
    TimingAnomaly,
    /// Entity risk score contribution.
    EntityRisk,
    /// Pattern match confidence.
    PatternMatch,
    /// Frequency deviation from normal.
    FrequencyDeviation,
    /// Relationship-based anomaly indicator.
    RelationshipAnomaly,
    /// Control bypass indicator.
    ControlBypass,
    /// Benford's Law violation.
    BenfordViolation,
    /// Duplicate indicator.
    DuplicateIndicator,
    /// Approval chain issue.
    ApprovalChainIssue,
    /// Documentation gap.
    DocumentationGap,
    /// Custom factor type (carries no further detail; its name is "custom").
    Custom,
}
1243
1244impl FactorType {
1245    /// Returns the factor type name.
1246    pub fn name(&self) -> &'static str {
1247        match self {
1248            FactorType::AmountDeviation => "amount_deviation",
1249            FactorType::ThresholdProximity => "threshold_proximity",
1250            FactorType::TimingAnomaly => "timing_anomaly",
1251            FactorType::EntityRisk => "entity_risk",
1252            FactorType::PatternMatch => "pattern_match",
1253            FactorType::FrequencyDeviation => "frequency_deviation",
1254            FactorType::RelationshipAnomaly => "relationship_anomaly",
1255            FactorType::ControlBypass => "control_bypass",
1256            FactorType::BenfordViolation => "benford_violation",
1257            FactorType::DuplicateIndicator => "duplicate_indicator",
1258            FactorType::ApprovalChainIssue => "approval_chain_issue",
1259            FactorType::DocumentationGap => "documentation_gap",
1260            FactorType::Custom => "custom",
1261        }
1262    }
1263}
1264
/// Evidence supporting a contributing factor.
///
/// Attached to a `ContributingFactor` through
/// `ContributingFactor::with_evidence`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FactorEvidence {
    /// Source of the evidence (e.g., "transaction_history", "entity_registry").
    pub source: String,
    /// Raw evidence data as string key/value pairs.
    pub data: HashMap<String, String>,
}
1273
/// A contributing factor to anomaly confidence/severity.
///
/// `ContributingFactor::contribution` turns `value`, `threshold`,
/// `direction_greater` and `weight` into a single score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContributingFactor {
    /// Type of factor.
    pub factor_type: FactorType,
    /// Observed value.
    pub value: f64,
    /// Threshold or expected value. `contribution` divides the deviation by
    /// its absolute value unless that absolute value is at most 0.001.
    pub threshold: f64,
    /// Direction of comparison (true = value > threshold is anomalous,
    /// false = value < threshold is anomalous).
    pub direction_greater: bool,
    /// Weight of this factor in overall calculation (0.0 - 1.0).
    /// The range is not enforced by this type.
    pub weight: f64,
    /// Human-readable description.
    pub description: String,
    /// Optional supporting evidence; `None` until `with_evidence` is called.
    pub evidence: Option<FactorEvidence>,
}
1292
1293impl ContributingFactor {
1294    /// Creates a new contributing factor.
1295    pub fn new(
1296        factor_type: FactorType,
1297        value: f64,
1298        threshold: f64,
1299        direction_greater: bool,
1300        weight: f64,
1301        description: &str,
1302    ) -> Self {
1303        Self {
1304            factor_type,
1305            value,
1306            threshold,
1307            direction_greater,
1308            weight,
1309            description: description.to_string(),
1310            evidence: None,
1311        }
1312    }
1313
1314    /// Adds evidence to the factor.
1315    pub fn with_evidence(mut self, source: &str, data: HashMap<String, String>) -> Self {
1316        self.evidence = Some(FactorEvidence {
1317            source: source.to_string(),
1318            data,
1319        });
1320        self
1321    }
1322
1323    /// Calculates the factor's contribution to anomaly score.
1324    pub fn contribution(&self) -> f64 {
1325        let deviation = if self.direction_greater {
1326            (self.value - self.threshold).max(0.0)
1327        } else {
1328            (self.threshold - self.value).max(0.0)
1329        };
1330
1331        // Normalize by threshold to get relative deviation
1332        let relative_deviation = if self.threshold.abs() > 0.001 {
1333            deviation / self.threshold.abs()
1334        } else {
1335            deviation
1336        };
1337
1338        // Apply weight and cap at 1.0
1339        (relative_deviation * self.weight).min(1.0)
1340    }
1341}
1342
/// Enhanced anomaly label with dynamic confidence and severity.
///
/// Wraps a `LabeledAnomaly` and adds a finer-grained category, rescaled
/// confidence/severity values and the factors behind them. Create one with
/// `EnhancedAnomalyLabel::from_base` and refine it with the `with_*` builders.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedAnomalyLabel {
    /// Base labeled anomaly (backward compatible).
    pub base: LabeledAnomaly,
    /// Enhanced category classification; `from_base` derives it from the base
    /// anomaly type.
    pub category: AnomalyCategory,
    /// Dynamically calculated confidence (0.0 - 1.0). `from_base` copies the
    /// base confidence; `with_confidence` clamps its argument to this range.
    pub enhanced_confidence: f64,
    /// Contextually calculated severity (0.0 - 1.0). `from_base` uses the
    /// base severity divided by 5; `with_severity` clamps its argument to
    /// this range.
    pub enhanced_severity: f64,
    /// Factors contributing to confidence/severity.
    pub contributing_factors: Vec<ContributingFactor>,
    /// Secondary categories (for multi-label classification).
    /// `with_secondary_category` keeps this list free of duplicates and of
    /// the primary `category`.
    pub secondary_categories: Vec<AnomalyCategory>,
}
1359
1360impl EnhancedAnomalyLabel {
1361    /// Creates an enhanced label from a base labeled anomaly.
1362    pub fn from_base(base: LabeledAnomaly) -> Self {
1363        let category = AnomalyCategory::from_anomaly_type(&base.anomaly_type);
1364        let enhanced_confidence = base.confidence;
1365        let enhanced_severity = base.severity as f64 / 5.0;
1366
1367        Self {
1368            base,
1369            category,
1370            enhanced_confidence,
1371            enhanced_severity,
1372            contributing_factors: Vec::new(),
1373            secondary_categories: Vec::new(),
1374        }
1375    }
1376
1377    /// Sets the enhanced confidence.
1378    pub fn with_confidence(mut self, confidence: f64) -> Self {
1379        self.enhanced_confidence = confidence.clamp(0.0, 1.0);
1380        self
1381    }
1382
1383    /// Sets the enhanced severity.
1384    pub fn with_severity(mut self, severity: f64) -> Self {
1385        self.enhanced_severity = severity.clamp(0.0, 1.0);
1386        self
1387    }
1388
1389    /// Adds a contributing factor.
1390    pub fn with_factor(mut self, factor: ContributingFactor) -> Self {
1391        self.contributing_factors.push(factor);
1392        self
1393    }
1394
1395    /// Adds a secondary category.
1396    pub fn with_secondary_category(mut self, category: AnomalyCategory) -> Self {
1397        if !self.secondary_categories.contains(&category) && category != self.category {
1398            self.secondary_categories.push(category);
1399        }
1400        self
1401    }
1402
1403    /// Converts to an extended feature vector.
1404    ///
1405    /// Returns base features (15) + enhanced features (10) = 25 features.
1406    pub fn to_features(&self) -> Vec<f64> {
1407        let mut features = self.base.to_features();
1408
1409        // Enhanced features
1410        features.push(self.enhanced_confidence);
1411        features.push(self.enhanced_severity);
1412        features.push(self.category.ordinal() as f64 / AnomalyCategory::category_count() as f64);
1413        features.push(self.secondary_categories.len() as f64);
1414        features.push(self.contributing_factors.len() as f64);
1415
1416        // Max factor weight
1417        let max_weight = self
1418            .contributing_factors
1419            .iter()
1420            .map(|f| f.weight)
1421            .fold(0.0, f64::max);
1422        features.push(max_weight);
1423
1424        // Factor type indicators (binary flags for key factor types)
1425        let has_control_bypass = self
1426            .contributing_factors
1427            .iter()
1428            .any(|f| f.factor_type == FactorType::ControlBypass);
1429        features.push(if has_control_bypass { 1.0 } else { 0.0 });
1430
1431        let has_amount_deviation = self
1432            .contributing_factors
1433            .iter()
1434            .any(|f| f.factor_type == FactorType::AmountDeviation);
1435        features.push(if has_amount_deviation { 1.0 } else { 0.0 });
1436
1437        let has_timing = self
1438            .contributing_factors
1439            .iter()
1440            .any(|f| f.factor_type == FactorType::TimingAnomaly);
1441        features.push(if has_timing { 1.0 } else { 0.0 });
1442
1443        let has_pattern_match = self
1444            .contributing_factors
1445            .iter()
1446            .any(|f| f.factor_type == FactorType::PatternMatch);
1447        features.push(if has_pattern_match { 1.0 } else { 0.0 });
1448
1449        features
1450    }
1451
1452    /// Returns the number of features in the enhanced feature vector.
1453    pub fn feature_count() -> usize {
1454        25 // 15 base + 10 enhanced
1455    }
1456
1457    /// Returns feature names for the enhanced feature vector.
1458    pub fn feature_names() -> Vec<&'static str> {
1459        let mut names = LabeledAnomaly::feature_names();
1460        names.extend(vec![
1461            "enhanced_confidence",
1462            "enhanced_severity",
1463            "category_ordinal",
1464            "secondary_category_count",
1465            "contributing_factor_count",
1466            "max_factor_weight",
1467            "has_control_bypass",
1468            "has_amount_deviation",
1469            "has_timing_factor",
1470            "has_pattern_match",
1471        ]);
1472        names
1473    }
1474}
1475
/// Configuration for anomaly rates.
///
/// `total_rate` is the share of all records that are anomalous; the five
/// category rates describe how those anomalies are split. `validate`
/// requires the category rates to sum to 1.0 (within 0.01) and `total_rate`
/// to lie in `[0.0, 1.0]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnomalyRateConfig {
    /// Overall anomaly rate (0.0 - 1.0).
    pub total_rate: f64,
    /// Fraud rate as proportion of anomalies.
    pub fraud_rate: f64,
    /// Error rate as proportion of anomalies.
    pub error_rate: f64,
    /// Process issue rate as proportion of anomalies.
    pub process_issue_rate: f64,
    /// Statistical anomaly rate as proportion of anomalies.
    pub statistical_rate: f64,
    /// Relational anomaly rate as proportion of anomalies.
    pub relational_rate: f64,
}
1492
1493impl Default for AnomalyRateConfig {
1494    fn default() -> Self {
1495        Self {
1496            total_rate: 0.02,         // 2% of transactions are anomalous
1497            fraud_rate: 0.25,         // 25% of anomalies are fraud
1498            error_rate: 0.35,         // 35% of anomalies are errors
1499            process_issue_rate: 0.20, // 20% are process issues
1500            statistical_rate: 0.15,   // 15% are statistical
1501            relational_rate: 0.05,    // 5% are relational
1502        }
1503    }
1504}
1505
1506impl AnomalyRateConfig {
1507    /// Validates that rates sum to approximately 1.0.
1508    pub fn validate(&self) -> Result<(), String> {
1509        let sum = self.fraud_rate
1510            + self.error_rate
1511            + self.process_issue_rate
1512            + self.statistical_rate
1513            + self.relational_rate;
1514
1515        if (sum - 1.0).abs() > 0.01 {
1516            return Err(format!(
1517                "Anomaly category rates must sum to 1.0, got {}",
1518                sum
1519            ));
1520        }
1521
1522        if self.total_rate < 0.0 || self.total_rate > 1.0 {
1523            return Err(format!(
1524                "Total rate must be between 0.0 and 1.0, got {}",
1525                self.total_rate
1526            ));
1527        }
1528
1529        Ok(())
1530    }
1531}
1532
1533#[cfg(test)]
1534mod tests {
1535    use super::*;
1536    use rust_decimal_macros::dec;
1537
1538    #[test]
1539    fn test_anomaly_type_category() {
1540        let fraud = AnomalyType::Fraud(FraudType::SelfApproval);
1541        assert_eq!(fraud.category(), "Fraud");
1542        assert!(fraud.is_intentional());
1543
1544        let error = AnomalyType::Error(ErrorType::DuplicateEntry);
1545        assert_eq!(error.category(), "Error");
1546        assert!(!error.is_intentional());
1547    }
1548
1549    #[test]
1550    fn test_labeled_anomaly() {
1551        let anomaly = LabeledAnomaly::new(
1552            "ANO001".to_string(),
1553            AnomalyType::Fraud(FraudType::SelfApproval),
1554            "JE001".to_string(),
1555            "JE".to_string(),
1556            "1000".to_string(),
1557            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1558        )
1559        .with_description("User approved their own expense report")
1560        .with_related_entity("USER001");
1561
1562        assert_eq!(anomaly.severity, 3);
1563        assert!(anomaly.is_injected);
1564        assert_eq!(anomaly.related_entities.len(), 1);
1565    }
1566
1567    #[test]
1568    fn test_labeled_anomaly_with_provenance() {
1569        let anomaly = LabeledAnomaly::new(
1570            "ANO001".to_string(),
1571            AnomalyType::Fraud(FraudType::SelfApproval),
1572            "JE001".to_string(),
1573            "JE".to_string(),
1574            "1000".to_string(),
1575            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1576        )
1577        .with_run_id("run-123")
1578        .with_generation_seed(42)
1579        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 })
1580        .with_structured_strategy(InjectionStrategy::SelfApproval {
1581            user_id: "USER001".to_string(),
1582        })
1583        .with_scenario("scenario-001")
1584        .with_original_document_hash("abc123");
1585
1586        assert_eq!(anomaly.run_id, Some("run-123".to_string()));
1587        assert_eq!(anomaly.generation_seed, Some(42));
1588        assert!(anomaly.causal_reason.is_some());
1589        assert!(anomaly.structured_strategy.is_some());
1590        assert_eq!(anomaly.scenario_id, Some("scenario-001".to_string()));
1591        assert_eq!(anomaly.original_document_hash, Some("abc123".to_string()));
1592
1593        // Check that legacy injection_strategy is also set
1594        assert_eq!(anomaly.injection_strategy, Some("SelfApproval".to_string()));
1595    }
1596
1597    #[test]
1598    fn test_labeled_anomaly_derivation_chain() {
1599        let parent = LabeledAnomaly::new(
1600            "ANO001".to_string(),
1601            AnomalyType::Fraud(FraudType::DuplicatePayment),
1602            "JE001".to_string(),
1603            "JE".to_string(),
1604            "1000".to_string(),
1605            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1606        );
1607
1608        let child = LabeledAnomaly::new(
1609            "ANO002".to_string(),
1610            AnomalyType::Error(ErrorType::DuplicateEntry),
1611            "JE002".to_string(),
1612            "JE".to_string(),
1613            "1000".to_string(),
1614            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1615        )
1616        .with_parent_anomaly(&parent.anomaly_id);
1617
1618        assert_eq!(child.parent_anomaly_id, Some("ANO001".to_string()));
1619    }
1620
1621    #[test]
1622    fn test_injection_strategy_description() {
1623        let strategy = InjectionStrategy::AmountManipulation {
1624            original: dec!(1000),
1625            factor: 2.5,
1626        };
1627        assert_eq!(strategy.description(), "Amount multiplied by 2.50");
1628        assert_eq!(strategy.strategy_type(), "AmountManipulation");
1629
1630        let strategy = InjectionStrategy::ThresholdAvoidance {
1631            threshold: dec!(10000),
1632            adjusted_amount: dec!(9999),
1633        };
1634        assert_eq!(
1635            strategy.description(),
1636            "Amount adjusted to avoid 10000 threshold"
1637        );
1638
1639        let strategy = InjectionStrategy::DateShift {
1640            days_shifted: -5,
1641            original_date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1642        };
1643        assert_eq!(strategy.description(), "Date backdated by 5 days");
1644
1645        let strategy = InjectionStrategy::DateShift {
1646            days_shifted: 3,
1647            original_date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1648        };
1649        assert_eq!(strategy.description(), "Date forward-dated by 3 days");
1650    }
1651
1652    #[test]
1653    fn test_causal_reason_variants() {
1654        let reason = AnomalyCausalReason::RandomRate { base_rate: 0.02 };
1655        if let AnomalyCausalReason::RandomRate { base_rate } = reason {
1656            assert!((base_rate - 0.02).abs() < 0.001);
1657        }
1658
1659        let reason = AnomalyCausalReason::TemporalPattern {
1660            pattern_name: "year_end_spike".to_string(),
1661        };
1662        if let AnomalyCausalReason::TemporalPattern { pattern_name } = reason {
1663            assert_eq!(pattern_name, "year_end_spike");
1664        }
1665
1666        let reason = AnomalyCausalReason::ScenarioStep {
1667            scenario_type: "kickback".to_string(),
1668            step_number: 3,
1669        };
1670        if let AnomalyCausalReason::ScenarioStep {
1671            scenario_type,
1672            step_number,
1673        } = reason
1674        {
1675            assert_eq!(scenario_type, "kickback");
1676            assert_eq!(step_number, 3);
1677        }
1678    }
1679
1680    #[test]
1681    fn test_feature_vector_length() {
1682        let anomaly = LabeledAnomaly::new(
1683            "ANO001".to_string(),
1684            AnomalyType::Fraud(FraudType::SelfApproval),
1685            "JE001".to_string(),
1686            "JE".to_string(),
1687            "1000".to_string(),
1688            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1689        );
1690
1691        let features = anomaly.to_features();
1692        assert_eq!(features.len(), LabeledAnomaly::feature_count());
1693        assert_eq!(features.len(), LabeledAnomaly::feature_names().len());
1694    }
1695
1696    #[test]
1697    fn test_feature_vector_with_provenance() {
1698        let anomaly = LabeledAnomaly::new(
1699            "ANO001".to_string(),
1700            AnomalyType::Fraud(FraudType::SelfApproval),
1701            "JE001".to_string(),
1702            "JE".to_string(),
1703            "1000".to_string(),
1704            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1705        )
1706        .with_scenario("scenario-001")
1707        .with_parent_anomaly("ANO000");
1708
1709        let features = anomaly.to_features();
1710
1711        // Last two features should be 1.0 (has scenario, has parent)
1712        assert_eq!(features[features.len() - 2], 1.0); // is_scenario_part
1713        assert_eq!(features[features.len() - 1], 1.0); // is_derived
1714    }
1715
1716    #[test]
1717    fn test_anomaly_summary() {
1718        let anomalies = vec![
1719            LabeledAnomaly::new(
1720                "ANO001".to_string(),
1721                AnomalyType::Fraud(FraudType::SelfApproval),
1722                "JE001".to_string(),
1723                "JE".to_string(),
1724                "1000".to_string(),
1725                NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1726            ),
1727            LabeledAnomaly::new(
1728                "ANO002".to_string(),
1729                AnomalyType::Error(ErrorType::DuplicateEntry),
1730                "JE002".to_string(),
1731                "JE".to_string(),
1732                "1000".to_string(),
1733                NaiveDate::from_ymd_opt(2024, 1, 16).unwrap(),
1734            ),
1735        ];
1736
1737        let summary = AnomalySummary::from_anomalies(&anomalies);
1738
1739        assert_eq!(summary.total_count, 2);
1740        assert_eq!(summary.by_category.get("Fraud"), Some(&1));
1741        assert_eq!(summary.by_category.get("Error"), Some(&1));
1742    }
1743
1744    #[test]
1745    fn test_rate_config_validation() {
1746        let config = AnomalyRateConfig::default();
1747        assert!(config.validate().is_ok());
1748
1749        let bad_config = AnomalyRateConfig {
1750            fraud_rate: 0.5,
1751            error_rate: 0.5,
1752            process_issue_rate: 0.5, // Sum > 1.0
1753            ..Default::default()
1754        };
1755        assert!(bad_config.validate().is_err());
1756    }
1757
1758    #[test]
1759    fn test_injection_strategy_serialization() {
1760        let strategy = InjectionStrategy::SoDViolation {
1761            duty1: "CreatePO".to_string(),
1762            duty2: "ApprovePO".to_string(),
1763            violating_user: "USER001".to_string(),
1764        };
1765
1766        let json = serde_json::to_string(&strategy).unwrap();
1767        let deserialized: InjectionStrategy = serde_json::from_str(&json).unwrap();
1768
1769        assert_eq!(strategy, deserialized);
1770    }
1771
1772    #[test]
1773    fn test_labeled_anomaly_serialization_with_provenance() {
1774        let anomaly = LabeledAnomaly::new(
1775            "ANO001".to_string(),
1776            AnomalyType::Fraud(FraudType::SelfApproval),
1777            "JE001".to_string(),
1778            "JE".to_string(),
1779            "1000".to_string(),
1780            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1781        )
1782        .with_run_id("run-123")
1783        .with_generation_seed(42)
1784        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });
1785
1786        let json = serde_json::to_string(&anomaly).unwrap();
1787        let deserialized: LabeledAnomaly = serde_json::from_str(&json).unwrap();
1788
1789        assert_eq!(anomaly.run_id, deserialized.run_id);
1790        assert_eq!(anomaly.generation_seed, deserialized.generation_seed);
1791    }
1792
1793    // ========================================
1794    // FR-003 ENHANCED TAXONOMY TESTS
1795    // ========================================
1796
1797    #[test]
1798    fn test_anomaly_category_from_anomaly_type() {
1799        // Fraud mappings
1800        let fraud_vendor = AnomalyType::Fraud(FraudType::FictitiousVendor);
1801        assert_eq!(
1802            AnomalyCategory::from_anomaly_type(&fraud_vendor),
1803            AnomalyCategory::FictitiousVendor
1804        );
1805
1806        let fraud_kickback = AnomalyType::Fraud(FraudType::KickbackScheme);
1807        assert_eq!(
1808            AnomalyCategory::from_anomaly_type(&fraud_kickback),
1809            AnomalyCategory::VendorKickback
1810        );
1811
1812        let fraud_structured = AnomalyType::Fraud(FraudType::SplitTransaction);
1813        assert_eq!(
1814            AnomalyCategory::from_anomaly_type(&fraud_structured),
1815            AnomalyCategory::StructuredTransaction
1816        );
1817
1818        // Error mappings
1819        let error_duplicate = AnomalyType::Error(ErrorType::DuplicateEntry);
1820        assert_eq!(
1821            AnomalyCategory::from_anomaly_type(&error_duplicate),
1822            AnomalyCategory::DuplicatePayment
1823        );
1824
1825        // Process issue mappings
1826        let process_skip = AnomalyType::ProcessIssue(ProcessIssueType::SkippedApproval);
1827        assert_eq!(
1828            AnomalyCategory::from_anomaly_type(&process_skip),
1829            AnomalyCategory::MissingApproval
1830        );
1831
1832        // Relational mappings
1833        let relational_circular =
1834            AnomalyType::Relational(RelationalAnomalyType::CircularTransaction);
1835        assert_eq!(
1836            AnomalyCategory::from_anomaly_type(&relational_circular),
1837            AnomalyCategory::CircularFlow
1838        );
1839    }
1840
1841    #[test]
1842    fn test_anomaly_category_ordinal() {
1843        assert_eq!(AnomalyCategory::FictitiousVendor.ordinal(), 0);
1844        assert_eq!(AnomalyCategory::VendorKickback.ordinal(), 1);
1845        assert_eq!(AnomalyCategory::Custom("test".to_string()).ordinal(), 14);
1846    }
1847
1848    #[test]
1849    fn test_contributing_factor() {
1850        let factor = ContributingFactor::new(
1851            FactorType::AmountDeviation,
1852            15000.0,
1853            10000.0,
1854            true,
1855            0.5,
1856            "Amount exceeds threshold",
1857        );
1858
1859        assert_eq!(factor.factor_type, FactorType::AmountDeviation);
1860        assert_eq!(factor.value, 15000.0);
1861        assert_eq!(factor.threshold, 10000.0);
1862        assert!(factor.direction_greater);
1863
1864        // Contribution: (15000 - 10000) / 10000 * 0.5 = 0.25
1865        let contribution = factor.contribution();
1866        assert!((contribution - 0.25).abs() < 0.01);
1867    }
1868
1869    #[test]
1870    fn test_contributing_factor_with_evidence() {
1871        let mut data = HashMap::new();
1872        data.insert("expected".to_string(), "10000".to_string());
1873        data.insert("actual".to_string(), "15000".to_string());
1874
1875        let factor = ContributingFactor::new(
1876            FactorType::AmountDeviation,
1877            15000.0,
1878            10000.0,
1879            true,
1880            0.5,
1881            "Amount deviation detected",
1882        )
1883        .with_evidence("transaction_history", data);
1884
1885        assert!(factor.evidence.is_some());
1886        let evidence = factor.evidence.unwrap();
1887        assert_eq!(evidence.source, "transaction_history");
1888        assert_eq!(evidence.data.get("expected"), Some(&"10000".to_string()));
1889    }
1890
1891    #[test]
1892    fn test_enhanced_anomaly_label() {
1893        let base = LabeledAnomaly::new(
1894            "ANO001".to_string(),
1895            AnomalyType::Fraud(FraudType::DuplicatePayment),
1896            "JE001".to_string(),
1897            "JE".to_string(),
1898            "1000".to_string(),
1899            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1900        );
1901
1902        let enhanced = EnhancedAnomalyLabel::from_base(base)
1903            .with_confidence(0.85)
1904            .with_severity(0.7)
1905            .with_factor(ContributingFactor::new(
1906                FactorType::DuplicateIndicator,
1907                1.0,
1908                0.5,
1909                true,
1910                0.4,
1911                "Duplicate payment detected",
1912            ))
1913            .with_secondary_category(AnomalyCategory::StructuredTransaction);
1914
1915        assert_eq!(enhanced.category, AnomalyCategory::DuplicatePayment);
1916        assert_eq!(enhanced.enhanced_confidence, 0.85);
1917        assert_eq!(enhanced.enhanced_severity, 0.7);
1918        assert_eq!(enhanced.contributing_factors.len(), 1);
1919        assert_eq!(enhanced.secondary_categories.len(), 1);
1920    }
1921
1922    #[test]
1923    fn test_enhanced_anomaly_label_features() {
1924        let base = LabeledAnomaly::new(
1925            "ANO001".to_string(),
1926            AnomalyType::Fraud(FraudType::SelfApproval),
1927            "JE001".to_string(),
1928            "JE".to_string(),
1929            "1000".to_string(),
1930            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1931        );
1932
1933        let enhanced = EnhancedAnomalyLabel::from_base(base)
1934            .with_confidence(0.9)
1935            .with_severity(0.8)
1936            .with_factor(ContributingFactor::new(
1937                FactorType::ControlBypass,
1938                1.0,
1939                0.0,
1940                true,
1941                0.5,
1942                "Control bypass detected",
1943            ));
1944
1945        let features = enhanced.to_features();
1946
1947        // Should have 25 features (15 base + 10 enhanced)
1948        assert_eq!(features.len(), EnhancedAnomalyLabel::feature_count());
1949        assert_eq!(features.len(), 25);
1950
1951        // Check enhanced confidence is in features
1952        assert_eq!(features[15], 0.9); // enhanced_confidence
1953
1954        // Check has_control_bypass flag
1955        assert_eq!(features[21], 1.0); // has_control_bypass
1956    }
1957
1958    #[test]
1959    fn test_enhanced_anomaly_label_feature_names() {
1960        let names = EnhancedAnomalyLabel::feature_names();
1961        assert_eq!(names.len(), 25);
1962        assert!(names.contains(&"enhanced_confidence"));
1963        assert!(names.contains(&"enhanced_severity"));
1964        assert!(names.contains(&"has_control_bypass"));
1965    }
1966
1967    #[test]
1968    fn test_factor_type_names() {
1969        assert_eq!(FactorType::AmountDeviation.name(), "amount_deviation");
1970        assert_eq!(FactorType::ThresholdProximity.name(), "threshold_proximity");
1971        assert_eq!(FactorType::ControlBypass.name(), "control_bypass");
1972    }
1973
1974    #[test]
1975    fn test_anomaly_category_serialization() {
1976        let category = AnomalyCategory::CircularFlow;
1977        let json = serde_json::to_string(&category).unwrap();
1978        let deserialized: AnomalyCategory = serde_json::from_str(&json).unwrap();
1979        assert_eq!(category, deserialized);
1980
1981        let custom = AnomalyCategory::Custom("custom_type".to_string());
1982        let json = serde_json::to_string(&custom).unwrap();
1983        let deserialized: AnomalyCategory = serde_json::from_str(&json).unwrap();
1984        assert_eq!(custom, deserialized);
1985    }
1986
1987    #[test]
1988    fn test_enhanced_label_secondary_category_dedup() {
1989        let base = LabeledAnomaly::new(
1990            "ANO001".to_string(),
1991            AnomalyType::Fraud(FraudType::DuplicatePayment),
1992            "JE001".to_string(),
1993            "JE".to_string(),
1994            "1000".to_string(),
1995            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
1996        );
1997
1998        let enhanced = EnhancedAnomalyLabel::from_base(base)
1999            // Try to add the primary category as secondary (should be ignored)
2000            .with_secondary_category(AnomalyCategory::DuplicatePayment)
2001            // Add a valid secondary
2002            .with_secondary_category(AnomalyCategory::TimingAnomaly)
2003            // Try to add duplicate secondary (should be ignored)
2004            .with_secondary_category(AnomalyCategory::TimingAnomaly);
2005
2006        // Should only have 1 secondary category (TimingAnomaly)
2007        assert_eq!(enhanced.secondary_categories.len(), 1);
2008        assert_eq!(
2009            enhanced.secondary_categories[0],
2010            AnomalyCategory::TimingAnomaly
2011        );
2012    }
2013}
datasynth_core/models/anomaly.rs

datasynth_core/models/
anomaly.rs