datasynth-eval 2.3.1

Evaluation framework for synthetic financial data quality and coherence
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
//! Drift Detection Evaluation Module.
//!
//! Provides tools for evaluating drift detection ground truth labels and
//! validating that generated drift events are detectable and properly labeled.
//!
//! # Overview
//!
//! This module evaluates the quality and detectability of drift events in
//! synthetic data by analyzing:
//!
//! - Statistical distribution shifts (mean, variance changes)
//! - Categorical shifts (proportion changes, new categories)
//! - Temporal pattern changes (seasonality, trend)
//! - Regulatory and organizational event impacts
//!
//! # Example
//!
//! ```ignore
//! use datasynth_eval::statistical::{DriftDetectionAnalyzer, DriftDetectionEntry};
//!
//! let analyzer = DriftDetectionAnalyzer::new(0.05);
//! let entries = vec![
//!     DriftDetectionEntry::new(1, 100.0, Some(true)),
//!     DriftDetectionEntry::new(2, 102.0, Some(false)),
//!     // ...
//! ];
//!
//! let analysis = analyzer.analyze(&entries)?;
//! println!("Drift detected: {}", analysis.drift_detected);
//! ```

use crate::error::{EvalError, EvalResult};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

// =============================================================================
// Drift Detection Entry
// =============================================================================

/// A single data point for drift detection analysis.
///
/// `DriftDetectionAnalyzer::analyze` reads only `value` and
/// `ground_truth_drift`, and processes entries in slice order; `period` and
/// the remaining optional fields are descriptive metadata carried alongside
/// the observation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriftDetectionEntry {
    /// Period number (e.g., month number from start).
    pub period: u32,
    /// Observed value at this period.
    pub value: f64,
    /// Ground truth label: `Some(true)` if this period has drift,
    /// `Some(false)` if it does not, `None` if the period is unlabeled
    /// (unlabeled periods are skipped when detection metrics are counted).
    pub ground_truth_drift: Option<bool>,
    /// Drift event type if labeled.
    pub drift_type: Option<String>,
    /// Magnitude of drift if known.
    pub drift_magnitude: Option<f64>,
    /// Detection difficulty (0.0 = easy, 1.0 = hard).
    pub detection_difficulty: Option<f64>,
}

impl DriftDetectionEntry {
    /// Build an entry that carries only the observation and its optional label.
    ///
    /// `drift_type`, `drift_magnitude` and `detection_difficulty` are left as
    /// `None`.
    pub fn new(period: u32, value: f64, ground_truth_drift: Option<bool>) -> Self {
        let (drift_type, drift_magnitude, detection_difficulty) = (None, None, None);
        Self {
            period,
            value,
            ground_truth_drift,
            drift_type,
            drift_magnitude,
            detection_difficulty,
        }
    }

    /// Build a fully described entry.
    ///
    /// Every optional field is populated: the label is wrapped in `Some`, and
    /// `drift_type` is converted into an owned `String`.
    pub fn with_metadata(
        period: u32,
        value: f64,
        ground_truth_drift: bool,
        drift_type: impl Into<String>,
        drift_magnitude: f64,
        detection_difficulty: f64,
    ) -> Self {
        let mut entry = Self::new(period, value, Some(ground_truth_drift));
        entry.drift_type = Some(drift_type.into());
        entry.drift_magnitude = Some(drift_magnitude);
        entry.detection_difficulty = Some(detection_difficulty);
        entry
    }
}

// =============================================================================
// Labeled Drift Event
// =============================================================================

/// A labeled drift event from ground truth data.
///
/// Consumed by `DriftDetectionAnalyzer::analyze_labeled_events`, which
/// aggregates `event_type`, `detection_difficulty`, `magnitude` and the
/// period fields; `event_id` and `affected_fields` are not read there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabeledDriftEvent {
    /// Unique event identifier.
    pub event_id: String,
    /// Event type classification.
    pub event_type: DriftEventCategory,
    /// Start period of the drift.
    pub start_period: u32,
    /// End period of the drift (None if ongoing).
    pub end_period: Option<u32>,
    /// Affected fields/metrics.
    pub affected_fields: Vec<String>,
    /// Magnitude of the drift effect (averaged across events and compared
    /// against the analyzer's minimum magnitude threshold).
    pub magnitude: f64,
    /// Detection difficulty level.
    pub detection_difficulty: DetectionDifficulty,
}

/// Categories of drift events.
///
/// `is_statistical` covers `MeanShift`, `VarianceChange`, `TrendChange` and
/// `SeasonalityChange`; `is_business_event` covers `OrganizationalEvent`,
/// `RegulatoryChange`, `TechnologyTransition` and `ProcessEvolution`.
/// `ProportionShift`, `NewCategory` and `EconomicCycle` belong to neither
/// group.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DriftEventCategory {
    /// Mean shift in distribution.
    MeanShift,
    /// Variance change in distribution.
    VarianceChange,
    /// Trend change (slope).
    TrendChange,
    /// Seasonality pattern change.
    SeasonalityChange,
    /// Categorical proportion shift.
    ProportionShift,
    /// New category emergence.
    NewCategory,
    /// Organizational event (acquisition, merger, etc.).
    OrganizationalEvent,
    /// Regulatory change impact.
    RegulatoryChange,
    /// Technology transition impact.
    TechnologyTransition,
    /// Economic cycle effect.
    EconomicCycle,
    /// Process evolution.
    ProcessEvolution,
}

impl DriftEventCategory {
    /// Human-readable, title-cased label for the category.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::MeanShift => "Mean Shift",
            Self::VarianceChange => "Variance Change",
            Self::TrendChange => "Trend Change",
            Self::SeasonalityChange => "Seasonality Change",
            Self::ProportionShift => "Proportion Shift",
            Self::NewCategory => "New Category",
            Self::OrganizationalEvent => "Organizational Event",
            Self::RegulatoryChange => "Regulatory Change",
            Self::TechnologyTransition => "Technology Transition",
            Self::EconomicCycle => "Economic Cycle",
            Self::ProcessEvolution => "Process Evolution",
        }
    }

    /// Whether the category describes a statistical drift (mean, variance,
    /// trend or seasonality).
    pub fn is_statistical(&self) -> bool {
        // Exhaustive on purpose: adding a variant forces a decision here.
        match *self {
            Self::MeanShift
            | Self::VarianceChange
            | Self::TrendChange
            | Self::SeasonalityChange => true,
            Self::ProportionShift
            | Self::NewCategory
            | Self::OrganizationalEvent
            | Self::RegulatoryChange
            | Self::TechnologyTransition
            | Self::EconomicCycle
            | Self::ProcessEvolution => false,
        }
    }

    /// Whether the category describes a business-event drift
    /// (organizational, regulatory, technology or process).
    pub fn is_business_event(&self) -> bool {
        // Exhaustive on purpose: adding a variant forces a decision here.
        match *self {
            Self::OrganizationalEvent
            | Self::RegulatoryChange
            | Self::TechnologyTransition
            | Self::ProcessEvolution => true,
            Self::MeanShift
            | Self::VarianceChange
            | Self::TrendChange
            | Self::SeasonalityChange
            | Self::ProportionShift
            | Self::NewCategory
            | Self::EconomicCycle => false,
        }
    }
}

/// Detection difficulty levels.
///
/// `to_score` maps the levels to 0.0 / 0.5 / 1.0; `from_score` maps scores
/// below 0.33 to `Easy`, scores below 0.67 to `Medium`, and everything else
/// to `Hard`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DetectionDifficulty {
    /// Easy to detect (large magnitude, clear signal).
    Easy,
    /// Medium difficulty.
    Medium,
    /// Hard to detect (small magnitude, noisy signal).
    Hard,
}

impl DetectionDifficulty {
    /// Numeric score for the level: `Easy` = 0.0, `Medium` = 0.5, `Hard` = 1.0.
    pub fn to_score(&self) -> f64 {
        match *self {
            Self::Easy => 0.0,
            Self::Medium => 0.5,
            Self::Hard => 1.0,
        }
    }

    /// Bucket a numeric score into a level.
    ///
    /// Scores below 0.33 are `Easy`, scores below 0.67 are `Medium`, and
    /// anything else (including NaN, which fails both comparisons) is `Hard`.
    pub fn from_score(score: f64) -> Self {
        match score {
            s if s < 0.33 => Self::Easy,
            s if s < 0.67 => Self::Medium,
            _ => Self::Hard,
        }
    }
}

// =============================================================================
// Drift Detection Analyzer
// =============================================================================

/// Analyzer for drift detection evaluation.
///
/// Built with `new` (window size 10, minimum magnitude 0.05, Hellinger and
/// PSI enabled) and tuned through the `with_*` builder methods. `analyze`
/// requires at least `2 * window_size` entries.
#[derive(Debug, Clone)]
pub struct DriftDetectionAnalyzer {
    /// Significance level for statistical tests; the mean-shift threshold is
    /// derived from it as `1.96 / sqrt(significance_level)`.
    significance_level: f64,
    /// Window size for rolling statistics.
    window_size: usize,
    /// Minimum magnitude threshold to consider as drift.
    min_magnitude_threshold: f64,
    /// Enable Hellinger distance calculation.
    use_hellinger: bool,
    /// Enable Population Stability Index (PSI) calculation.
    use_psi: bool,
}

impl DriftDetectionAnalyzer {
    /// Create an analyzer with the given significance level and the default
    /// tuning: rolling window of 10, minimum drift magnitude of 0.05, and both
    /// the Hellinger distance and PSI calculations enabled.
    pub fn new(significance_level: f64) -> Self {
        let window_size = 10;
        let min_magnitude_threshold = 0.05;
        let (use_hellinger, use_psi) = (true, true);
        Self {
            significance_level,
            window_size,
            min_magnitude_threshold,
            use_hellinger,
            use_psi,
        }
    }

    /// Set the rolling window size.
    ///
    /// A size of `0` is raised to `1`: an empty window would make every
    /// rolling mean and standard deviation `0.0 / 0.0` (NaN), which silently
    /// disables detection instead of reporting an error.
    pub fn with_window_size(mut self, size: usize) -> Self {
        self.window_size = size.max(1);
        self
    }

    /// Set the minimum magnitude threshold.
    pub fn with_min_magnitude(mut self, threshold: f64) -> Self {
        self.min_magnitude_threshold = threshold;
        self
    }

    /// Enable or disable Hellinger distance calculation.
    pub fn with_hellinger(mut self, enabled: bool) -> Self {
        self.use_hellinger = enabled;
        self
    }

    /// Enable or disable PSI calculation.
    pub fn with_psi(mut self, enabled: bool) -> Self {
        self.use_psi = enabled;
        self
    }

    /// Run the full drift analysis over a series of entries.
    ///
    /// Entries are processed in slice order. Rolling means and standard
    /// deviations are computed over `window_size`-wide windows, windows in the
    /// second half are flagged against a baseline taken from the first half,
    /// and the flags are scored against the ground-truth labels.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::InsufficientData` when fewer than
    /// `2 * window_size` entries are supplied.
    pub fn analyze(&self, entries: &[DriftDetectionEntry]) -> EvalResult<DriftDetectionAnalysis> {
        let required = self.window_size * 2;
        let actual = entries.len();
        if actual < required {
            return Err(EvalError::InsufficientData { required, actual });
        }

        // Split the entries into the observed series and its labels.
        let (values, ground_truth): (Vec<f64>, Vec<Option<bool>>) = entries
            .iter()
            .map(|entry| (entry.value, entry.ground_truth_drift))
            .unzip();

        // Rolling statistics feed both the detector and the summary figures.
        let rolling_means = self.calculate_rolling_means(&values);
        let rolling_stds = self.calculate_rolling_stds(&values);

        // Flag windows whose mean or spread departs from the baseline.
        let flags = self.detect_drift_points(&rolling_means, &rolling_stds);
        let drift_count = flags.iter().filter(|&&flagged| flagged).count();
        let drift_detected = drift_count > 0;

        // Score the flags against whatever labels are present.
        let detection_metrics = self.calculate_detection_metrics(&flags, &ground_truth);

        // Optional distribution-distance measures between the two halves.
        let hellinger_distance = self
            .use_hellinger
            .then(|| self.calculate_hellinger_distance(&values));
        let psi = self.use_psi.then(|| self.calculate_psi(&values));

        let drift_magnitude = self.calculate_drift_magnitude(&rolling_means);
        let rolling_mean_change = self.calculate_mean_change(&rolling_means);
        let rolling_std_change = self.calculate_std_change(&rolling_stds);

        let passes = self.evaluate_pass_status(&detection_metrics, drift_magnitude);
        let issues = self.collect_issues(&detection_metrics, drift_magnitude, drift_count);

        Ok(DriftDetectionAnalysis {
            sample_size: actual,
            drift_detected,
            drift_count,
            drift_magnitude,
            detection_metrics,
            hellinger_distance,
            psi,
            rolling_mean_change,
            rolling_std_change,
            passes,
            issues,
        })
    }

    /// Analyze labeled drift events for quality.
    ///
    /// Summarizes how many events fall into each category and difficulty
    /// level, how many are statistical or business events, their average
    /// magnitude and difficulty score, and the span of periods they cover.
    ///
    /// The period coverage runs from the earliest `start_period` to the latest
    /// period any event is known to reach: its `end_period` when set, otherwise
    /// its `start_period`. Ongoing events (no `end_period`) therefore still
    /// extend the coverage up to their start, and an `end_period` that
    /// precedes its own `start_period` never shrinks it.
    ///
    /// An empty slice yields `LabeledEventAnalysis::empty()`. Otherwise the
    /// result passes when the average magnitude reaches the analyzer's minimum
    /// magnitude threshold.
    pub fn analyze_labeled_events(
        &self,
        events: &[LabeledDriftEvent],
    ) -> EvalResult<LabeledEventAnalysis> {
        if events.is_empty() {
            return Ok(LabeledEventAnalysis::empty());
        }

        let total_events = events.len();

        // Tally categories and difficulty levels in a single pass.
        let mut category_distribution: HashMap<DriftEventCategory, usize> = HashMap::new();
        let mut difficulty_distribution: HashMap<DetectionDifficulty, usize> = HashMap::new();
        for event in events {
            *category_distribution.entry(event.event_type).or_insert(0) += 1;
            *difficulty_distribution
                .entry(event.detection_difficulty)
                .or_insert(0) += 1;
        }

        let statistical_events = events
            .iter()
            .filter(|e| e.event_type.is_statistical())
            .count();
        let business_events = events
            .iter()
            .filter(|e| e.event_type.is_business_event())
            .count();

        let avg_magnitude = events.iter().map(|e| e.magnitude).sum::<f64>() / total_events as f64;
        let avg_difficulty = events
            .iter()
            .map(|e| e.detection_difficulty.to_score())
            .sum::<f64>()
            / total_events as f64;

        // `events` is non-empty here, so the fallbacks are never taken.
        let min_period = events.iter().map(|e| e.start_period).min().unwrap_or(0);
        let max_period = events
            .iter()
            .map(|e| e.end_period.unwrap_or(e.start_period).max(e.start_period))
            .max()
            .unwrap_or(min_period);

        let passes = avg_magnitude >= self.min_magnitude_threshold;
        let issues = if passes {
            Vec::new()
        } else {
            vec!["Insufficient drift events or magnitude too low".to_string()]
        };

        Ok(LabeledEventAnalysis {
            total_events,
            statistical_events,
            business_events,
            category_distribution,
            difficulty_distribution,
            avg_magnitude,
            avg_difficulty,
            period_coverage: (min_period, max_period),
            passes,
            issues,
        })
    }

    // Helper methods

    /// Arithmetic mean of every contiguous `window_size`-wide window of
    /// `values`, in order of window start.
    ///
    /// Returns an empty vector (and logs at debug level) when there are fewer
    /// values than one window.
    fn calculate_rolling_means(&self, values: &[f64]) -> Vec<f64> {
        let width = self.window_size;
        if values.len() < width {
            tracing::debug!(
                "Drift detection: not enough values ({}) for window size ({}), returning empty",
                values.len(),
                self.window_size
            );
            return Vec::new();
        }

        let last_start = values.len() - width;
        (0..=last_start)
            .map(|start| values[start..start + width].iter().sum::<f64>() / width as f64)
            .collect()
    }

    /// Population standard deviation (divisor `window_size`) of every
    /// contiguous `window_size`-wide window of `values`, in order of window
    /// start.
    ///
    /// Returns an empty vector (and logs at debug level) when there are fewer
    /// values than one window.
    fn calculate_rolling_stds(&self, values: &[f64]) -> Vec<f64> {
        let width = self.window_size;
        if values.len() < width {
            tracing::debug!(
                "Drift detection: not enough values ({}) for window size ({}), returning empty",
                values.len(),
                self.window_size
            );
            return Vec::new();
        }

        let count = width as f64;
        (0..=values.len() - width)
            .map(|start| {
                let window = &values[start..start + width];
                let mean = window.iter().sum::<f64>() / count;
                let squared_deviations = window.iter().map(|x| (x - mean).powi(2)).sum::<f64>();
                (squared_deviations / count).sqrt()
            })
            .collect()
    }

    /// Flag rolling windows whose mean or spread departs from the baseline.
    ///
    /// The first half of `means` is the baseline; only windows in the second
    /// half can be flagged. A window is flagged when either
    ///
    /// - the z-score of its rolling mean against the baseline mean/std exceeds
    ///   `1.96 / sqrt(significance_level)`, or
    /// - its rolling standard deviation is less than half or more than twice
    ///   the average baseline standard deviation (checked only when that
    ///   average exceeds 0.001 and `stds` has a value for the window).
    ///
    /// Returns one flag per element of `means`; all `false` when `means` has
    /// fewer than two elements.
    fn detect_drift_points(&self, means: &[f64], stds: &[f64]) -> Vec<bool> {
        let mut detected = vec![false; means.len()];
        if means.len() < 2 {
            return detected;
        }

        // Baseline statistics from the first half (at least one element here).
        let baseline_end = means.len() / 2;
        let baseline = &means[..baseline_end];
        let baseline_len = baseline_end as f64;
        let baseline_mean = baseline.iter().sum::<f64>() / baseline_len;
        let baseline_std = if baseline_end > 1 {
            let variance = baseline
                .iter()
                .map(|x| (x - baseline_mean).powi(2))
                .sum::<f64>()
                / baseline_len;
            variance.sqrt().max(0.001) // Avoid division by zero
        } else {
            0.001
        };

        // Loop invariants, computed once instead of once per window.
        let threshold = 1.96 / self.significance_level.sqrt();
        // Average rolling std over the baseline; `None` when `stds` is too
        // short to cover the baseline or the level is too small to form a
        // meaningful ratio.
        let baseline_std_level = stds
            .get(..baseline_end)
            .map(|head| head.iter().sum::<f64>() / baseline_len)
            .filter(|&level| level > 0.001);

        for (i, flag) in detected.iter_mut().enumerate().skip(baseline_end) {
            let z_score = (means[i] - baseline_mean).abs() / baseline_std;
            let mean_shifted = z_score > threshold;

            let spread_changed = match (baseline_std_level, stds.get(i)) {
                (Some(level), Some(&current)) => !(0.5..=2.0).contains(&(current / level)),
                _ => false,
            };

            *flag = mean_shifted || spread_changed;
        }

        detected
    }

    /// Score detection flags against ground-truth labels.
    ///
    /// `detected` holds one flag per rolling window and `ground_truth` one
    /// label per period. The two slices are aligned at their tails, so each
    /// flag is compared with the label of the last period its window covers.
    /// When `detected` is the shorter slice, the leading periods, where no
    /// complete window has ended yet, have no prediction and are skipped, as
    /// are unlabeled (`None`) periods.
    ///
    /// Precision, recall and F1 are `0.0` when their denominator is zero.
    ///
    /// The detection delay is recorded once per drift episode (a run of
    /// `Some(true)` labels, ended by a `Some(false)`): the number of periods
    /// from the first drifting period to the first flagged period within the
    /// episode. `mean_detection_delay` is `None` when no episode was detected.
    fn calculate_detection_metrics(
        &self,
        detected: &[bool],
        ground_truth: &[Option<bool>],
    ) -> DriftDetectionMetrics {
        // Tail alignment: period `p` maps to window
        // `p + detected.len() - ground_truth.len()`; a negative index means no
        // window ends at that period.
        let prediction_for = |period: usize| -> Option<bool> {
            (period + detected.len())
                .checked_sub(ground_truth.len())
                .and_then(|window| detected.get(window).copied())
        };

        let mut true_positives = 0usize;
        let mut false_positives = 0usize;
        let mut true_negatives = 0usize;
        let mut false_negatives = 0usize;

        for (period, &label) in ground_truth.iter().enumerate() {
            match (label, prediction_for(period)) {
                (Some(true), Some(true)) => true_positives += 1,
                (Some(true), Some(false)) => false_negatives += 1,
                (Some(false), Some(true)) => false_positives += 1,
                (Some(false), Some(false)) => true_negatives += 1,
                // Unlabeled period, or no complete window ends here.
                (None, _) | (_, None) => {}
            }
        }

        // One delay per drift episode: once an episode has been detected,
        // later flags inside the same episode must not add further (zero)
        // delays.
        let mut detection_delays = Vec::new();
        let mut episode_start: Option<usize> = None;
        let mut episode_detected = false;
        for (period, &label) in ground_truth.iter().enumerate() {
            match label {
                Some(true) => {
                    if episode_start.is_none() {
                        episode_start = Some(period);
                        episode_detected = false;
                    }
                }
                Some(false) => episode_start = None,
                // Unlabeled periods neither start nor end an episode.
                None => {}
            }

            if !episode_detected && prediction_for(period) == Some(true) {
                if let Some(start) = episode_start {
                    detection_delays.push((period - start) as f64);
                    episode_detected = true;
                }
            }
        }

        let precision = if true_positives + false_positives > 0 {
            true_positives as f64 / (true_positives + false_positives) as f64
        } else {
            0.0
        };

        let recall = if true_positives + false_negatives > 0 {
            true_positives as f64 / (true_positives + false_negatives) as f64
        } else {
            0.0
        };

        let f1_score = if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        };

        let mean_detection_delay = if detection_delays.is_empty() {
            None
        } else {
            Some(detection_delays.iter().sum::<f64>() / detection_delays.len() as f64)
        };

        DriftDetectionMetrics {
            true_positives,
            false_positives,
            true_negatives,
            false_negatives,
            precision,
            recall,
            f1_score,
            mean_detection_delay,
        }
    }

    /// Hellinger distance between the value distributions of the first and
    /// second half of `values`.
    ///
    /// Both halves are binned into ten equal-width bins spanning the overall
    /// min..max range and normalized to proportions. Returns `0.0` when there
    /// are fewer than 20 values, when all values are (nearly) identical, or
    /// when either half is empty.
    fn calculate_hellinger_distance(&self, values: &[f64]) -> f64 {
        const NUM_BINS: usize = 10;

        if values.len() < 20 {
            return 0.0;
        }

        // Overall value range shared by both histograms.
        let mut lowest = f64::MAX;
        let mut highest = f64::MIN;
        for &v in values {
            lowest = lowest.min(v);
            highest = highest.max(v);
        }
        let spread = highest - lowest;
        if spread.abs() < f64::EPSILON {
            return 0.0;
        }
        let bin_width = spread / NUM_BINS as f64;

        // Raw bin counts for one sample; the maximum lands in the last bin.
        let bin_counts = |sample: &[f64]| -> Vec<f64> {
            let mut counts = vec![0.0_f64; NUM_BINS];
            for &v in sample {
                let bin = (((v - lowest) / bin_width).floor() as usize).min(NUM_BINS - 1);
                counts[bin] += 1.0;
            }
            counts
        };

        let (first_half, second_half) = values.split_at(values.len() / 2);
        let mut first_hist = bin_counts(first_half);
        let mut second_hist = bin_counts(second_half);

        let first_total: f64 = first_hist.iter().sum();
        let second_total: f64 = second_hist.iter().sum();
        if first_total == 0.0 || second_total == 0.0 {
            return 0.0;
        }

        // Turn counts into proportions.
        first_hist.iter_mut().for_each(|h| *h /= first_total);
        second_hist.iter_mut().for_each(|h| *h /= second_total);

        let sum_sq_diff = first_hist
            .iter()
            .zip(&second_hist)
            .fold(0.0_f64, |acc, (p, q)| {
                let diff = p.sqrt() - q.sqrt();
                acc + diff * diff
            });

        (sum_sq_diff / 2.0).sqrt()
    }

    /// Population Stability Index of the second half of `values` (current)
    /// against the first half (baseline).
    ///
    /// Both halves are binned into ten equal-width bins spanning the overall
    /// min..max range; each bin proportion is floored at 0.0001 so the
    /// logarithm is always defined. Returns `0.0` when there are fewer than 20
    /// values, when all values are (nearly) identical, or when either half is
    /// empty.
    fn calculate_psi(&self, values: &[f64]) -> f64 {
        const NUM_BINS: usize = 10;
        // Lower bound on a bin proportion, to avoid log(0) and division by zero.
        const SHARE_FLOOR: f64 = 0.0001;

        if values.len() < 20 {
            return 0.0;
        }

        let (lowest, highest) = values
            .iter()
            .fold((f64::MAX, f64::MIN), |(lo, hi), &v| (lo.min(v), hi.max(v)));
        let spread = highest - lowest;
        if spread.abs() < f64::EPSILON {
            return 0.0;
        }
        let bin_width = spread / NUM_BINS as f64;

        // Raw bin counts for one sample; the maximum lands in the last bin.
        let bin_counts = |sample: &[f64]| -> Vec<f64> {
            let mut counts = vec![0.0_f64; NUM_BINS];
            for &v in sample {
                let bin = (((v - lowest) / bin_width).floor() as usize).min(NUM_BINS - 1);
                counts[bin] += 1.0;
            }
            counts
        };

        let (baseline, current) = values.split_at(values.len() / 2);
        let baseline_counts = bin_counts(baseline);
        let current_counts = bin_counts(current);

        let baseline_total: f64 = baseline_counts.iter().sum();
        let current_total: f64 = current_counts.iter().sum();
        if baseline_total == 0.0 || current_total == 0.0 {
            return 0.0;
        }

        // PSI = sum over bins of (current - baseline) * ln(current / baseline).
        baseline_counts
            .iter()
            .zip(&current_counts)
            .fold(0.0_f64, |psi, (&base_count, &cur_count)| {
                let expected = (base_count / baseline_total).max(SHARE_FLOOR);
                let observed = (cur_count / current_total).max(SHARE_FLOOR);
                psi + (observed - expected) * (observed / expected).ln()
            })
    }

    /// Magnitude of the shift between the first and second half of `means`.
    ///
    /// Returns the absolute relative change of the second-half average against
    /// the first-half average, or the absolute difference when the first-half
    /// average is (nearly) zero. Returns `0.0` for fewer than two means.
    fn calculate_drift_magnitude(&self, means: &[f64]) -> f64 {
        if means.len() < 2 {
            return 0.0;
        }

        let (head, tail) = means.split_at(means.len() / 2);
        let average = |part: &[f64]| part.iter().sum::<f64>() / part.len() as f64;
        let first_mean = average(head);
        let second_mean = average(tail);
        let delta = second_mean - first_mean;

        if first_mean.abs() < f64::EPSILON {
            delta.abs()
        } else {
            (delta / first_mean).abs()
        }
    }

    /// Signed relative change from the first rolling mean to the last.
    ///
    /// Returns `0.0` for fewer than two means or when the first mean is
    /// (nearly) zero.
    fn calculate_mean_change(&self, means: &[f64]) -> f64 {
        match means {
            // The pattern only matches slices with at least two elements.
            [first, .., last] => {
                if first.abs() < f64::EPSILON {
                    0.0
                } else {
                    (last - first) / first
                }
            }
            _ => 0.0,
        }
    }

    /// Signed relative change from the first rolling standard deviation to
    /// the last.
    ///
    /// Returns `0.0` for fewer than two values or when the first value is
    /// (nearly) zero.
    fn calculate_std_change(&self, stds: &[f64]) -> f64 {
        // The pattern only matches slices with at least two elements.
        if let [initial, .., latest] = stds {
            if initial.abs() < f64::EPSILON {
                return 0.0;
            }
            return (latest - initial) / initial;
        }
        0.0
    }

    /// Decide whether the analysis passes.
    ///
    /// Passes when the drift magnitude is below the minimum threshold (nothing
    /// significant to detect), or otherwise when detection is decent:
    /// F1 of at least 0.5, or precision or recall of at least 0.6.
    fn evaluate_pass_status(&self, metrics: &DriftDetectionMetrics, drift_magnitude: f64) -> bool {
        let nothing_to_detect = drift_magnitude < self.min_magnitude_threshold;
        nothing_to_detect
            || metrics.f1_score >= 0.5
            || metrics.precision >= 0.6
            || metrics.recall >= 0.6
    }

    /// Describe detection problems for a series with significant drift.
    ///
    /// Nothing is reported when the drift magnitude is below the minimum
    /// threshold. Otherwise, in order: low precision (< 0.5), low recall
    /// (< 0.5), a mean detection delay above 3 periods, and the absence of any
    /// flagged window.
    fn collect_issues(
        &self,
        metrics: &DriftDetectionMetrics,
        drift_magnitude: f64,
        drift_count: usize,
    ) -> Vec<String> {
        let significant = drift_magnitude >= self.min_magnitude_threshold;
        if !significant {
            return Vec::new();
        }

        let mut issues = Vec::new();

        if metrics.precision < 0.5 {
            issues.push(format!(
                "Low precision ({:.2}): many false positives",
                metrics.precision
            ));
        }
        if metrics.recall < 0.5 {
            issues.push(format!(
                "Low recall ({:.2}): many drift events missed",
                metrics.recall
            ));
        }
        match metrics.mean_detection_delay {
            Some(delay) if delay > 3.0 => {
                issues.push(format!("High detection delay ({delay:.1} periods)"));
            }
            _ => {}
        }
        if drift_count == 0 {
            issues.push("No drift detected despite significant magnitude change".to_string());
        }

        issues
    }
}

impl Default for DriftDetectionAnalyzer {
    /// Analyzer with a 0.05 significance level and the defaults set by `new`.
    fn default() -> Self {
        const DEFAULT_SIGNIFICANCE_LEVEL: f64 = 0.05;
        Self::new(DEFAULT_SIGNIFICANCE_LEVEL)
    }
}

// =============================================================================
// Analysis Results
// =============================================================================

/// Results from drift detection analysis.
///
/// Produced by `DriftDetectionAnalyzer::analyze`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriftDetectionAnalysis {
    /// Number of data points analyzed.
    pub sample_size: usize,
    /// Whether any drift was detected.
    pub drift_detected: bool,
    /// Number of drift points detected (flagged rolling windows).
    pub drift_count: usize,
    /// Overall magnitude of detected drift: absolute relative change between
    /// the average of the first and second half of the rolling means
    /// (absolute difference when the first-half average is near zero).
    pub drift_magnitude: f64,
    /// Detection metrics (precision, recall, F1).
    pub detection_metrics: DriftDetectionMetrics,
    /// Hellinger distance between first and second half (`None` when the
    /// calculation is disabled on the analyzer).
    pub hellinger_distance: Option<f64>,
    /// Population Stability Index (`None` when the calculation is disabled on
    /// the analyzer).
    pub psi: Option<f64>,
    /// Relative change in rolling mean, first window to last (`0.0` when the
    /// first rolling mean is near zero).
    pub rolling_mean_change: f64,
    /// Relative change in rolling standard deviation, first window to last
    /// (`0.0` when the first rolling standard deviation is near zero).
    pub rolling_std_change: f64,
    /// Whether the analysis passes quality thresholds.
    pub passes: bool,
    /// Issues identified during analysis.
    pub issues: Vec<String>,
}

/// Drift detection performance metrics.
///
/// Counts cover only periods that carry a ground-truth label; unlabeled
/// periods are skipped.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DriftDetectionMetrics {
    /// True positive count.
    pub true_positives: usize,
    /// False positive count.
    pub false_positives: usize,
    /// True negative count.
    pub true_negatives: usize,
    /// False negative count.
    pub false_negatives: usize,
    /// Precision (TP / (TP + FP)); `0.0` when TP + FP is zero.
    pub precision: f64,
    /// Recall (TP / (TP + FN)); `0.0` when TP + FN is zero.
    pub recall: f64,
    /// F1 score (harmonic mean of precision and recall); `0.0` when both are
    /// zero.
    pub f1_score: f64,
    /// Mean delay in detecting drift (in periods); `None` when no delay was
    /// recorded.
    pub mean_detection_delay: Option<f64>,
}

/// Analysis of labeled drift events.
///
/// A serializable summary of a set of labeled events: how many there are, how
/// they split by category and difficulty, their average magnitude and
/// difficulty, and the range of periods they span.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabeledEventAnalysis {
    /// Total number of labeled events.
    pub total_events: usize,
    /// Number of statistical drift events.
    pub statistical_events: usize,
    /// Number of business event drifts.
    pub business_events: usize,
    /// Distribution by event category (category -> number of events).
    pub category_distribution: HashMap<DriftEventCategory, usize>,
    /// Distribution by detection difficulty (difficulty -> number of events).
    pub difficulty_distribution: HashMap<DetectionDifficulty, usize>,
    /// Average drift magnitude.
    pub avg_magnitude: f64,
    /// Average detection difficulty score.
    ///
    /// NOTE(review): presumably the mean of the per-event difficulty scores
    /// — confirm against the analyzer.
    pub avg_difficulty: f64,
    /// Period coverage (min_period, max_period).
    pub period_coverage: (u32, u32),
    /// Whether the analysis passes quality thresholds.
    pub passes: bool,
    /// Issues identified, as human-readable messages.
    pub issues: Vec<String>,
}

impl LabeledEventAnalysis {
    /// Build an analysis result that describes zero events.
    ///
    /// Every count and average is zero, both distributions are empty, the
    /// period coverage is `(0, 0)`, the issue list is empty and `passes` is
    /// `true`.
    pub fn empty() -> Self {
        // No events observed, so neither histogram has any entries.
        let category_distribution = HashMap::new();
        let difficulty_distribution = HashMap::new();

        Self {
            passes: true,
            issues: Vec::new(),
            total_events: 0,
            statistical_events: 0,
            business_events: 0,
            avg_magnitude: 0.0,
            avg_difficulty: 0.0,
            period_coverage: (0, 0),
            category_distribution,
            difficulty_distribution,
        }
    }
}

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
// `unwrap` is acceptable in tests: a panic is exactly the failure signal we want.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    // `DriftDetectionEntry::new` stores period, value and ground truth as given.
    #[test]
    fn test_drift_detection_entry_creation() {
        let entry = DriftDetectionEntry::new(1, 100.0, Some(true));
        assert_eq!(entry.period, 1);
        assert_eq!(entry.value, 100.0);
        assert_eq!(entry.ground_truth_drift, Some(true));
    }

    // `with_metadata` wraps the drift type, magnitude and difficulty in `Some`.
    #[test]
    fn test_drift_detection_entry_with_metadata() {
        let entry = DriftDetectionEntry::with_metadata(5, 150.0, true, "MeanShift", 0.15, 0.3);
        assert_eq!(entry.period, 5);
        assert_eq!(entry.drift_type, Some("MeanShift".to_string()));
        assert_eq!(entry.drift_magnitude, Some(0.15));
        assert_eq!(entry.detection_difficulty, Some(0.3));
    }

    // Categories expose a human-readable, space-separated display name.
    #[test]
    fn test_drift_event_category_names() {
        assert_eq!(DriftEventCategory::MeanShift.name(), "Mean Shift");
        assert_eq!(
            DriftEventCategory::OrganizationalEvent.name(),
            "Organizational Event"
        );
    }

    // A category is either statistical or a business event, never both.
    #[test]
    fn test_drift_event_category_classification() {
        assert!(DriftEventCategory::MeanShift.is_statistical());
        assert!(!DriftEventCategory::MeanShift.is_business_event());
        assert!(DriftEventCategory::OrganizationalEvent.is_business_event());
        assert!(!DriftEventCategory::OrganizationalEvent.is_statistical());
    }

    // Difficulty levels map to fixed scores, and scores map back to levels.
    #[test]
    fn test_detection_difficulty_conversion() {
        assert_eq!(DetectionDifficulty::Easy.to_score(), 0.0);
        assert_eq!(DetectionDifficulty::Medium.to_score(), 0.5);
        assert_eq!(DetectionDifficulty::Hard.to_score(), 1.0);

        assert_eq!(
            DetectionDifficulty::from_score(0.1),
            DetectionDifficulty::Easy
        );
        assert_eq!(
            DetectionDifficulty::from_score(0.5),
            DetectionDifficulty::Medium
        );
        assert_eq!(
            DetectionDifficulty::from_score(0.8),
            DetectionDifficulty::Hard
        );
    }

    // Builder methods record the configured values on the analyzer.
    #[test]
    fn test_analyzer_creation() {
        let analyzer = DriftDetectionAnalyzer::new(0.05)
            .with_window_size(15)
            .with_min_magnitude(0.1)
            .with_hellinger(true)
            .with_psi(true);

        assert_eq!(analyzer.significance_level, 0.05);
        assert_eq!(analyzer.window_size, 15);
        assert_eq!(analyzer.min_magnitude_threshold, 0.1);
    }

    // A nearly flat series must not be reported as drifting.
    #[test]
    fn test_analyze_no_drift() {
        let analyzer = DriftDetectionAnalyzer::new(0.05).with_window_size(5);

        // Nearly flat data: the value rises by only 0.01 per period.
        let entries: Vec<DriftDetectionEntry> = (0..30)
            .map(|i| DriftDetectionEntry::new(i, 100.0 + (i as f64 * 0.01), Some(false)))
            .collect();

        let result = analyzer.analyze(&entries).unwrap();
        assert!(!result.drift_detected || result.drift_count < 5);
        assert!(result.drift_magnitude < 0.1);
    }

    // A 50% level shift half-way through the series must be detected.
    #[test]
    fn test_analyze_with_drift() {
        let analyzer = DriftDetectionAnalyzer::new(0.05).with_window_size(5);

        // Periods 0..15 sit at the baseline level.
        let mut entries: Vec<DriftDetectionEntry> = (0..15)
            .map(|i| DriftDetectionEntry::new(i, 100.0, Some(false)))
            .collect();

        // From period 15 onward the level jumps to 150.
        for i in 15..30 {
            entries.push(DriftDetectionEntry::new(i, 150.0, Some(true)));
        }

        let result = analyzer.analyze(&entries).unwrap();
        assert!(result.drift_detected);
        assert!(result.drift_magnitude > 0.3);
    }

    // Five points with a window size of 10 must be rejected with an error.
    #[test]
    fn test_analyze_insufficient_data() {
        let analyzer = DriftDetectionAnalyzer::new(0.05).with_window_size(10);

        let entries: Vec<DriftDetectionEntry> = (0..5)
            .map(|i| DriftDetectionEntry::new(i, 100.0, None))
            .collect();

        let result = analyzer.analyze(&entries);
        assert!(result.is_err());
    }

    // One statistical and one business event are counted and averaged.
    #[test]
    fn test_analyze_labeled_events() {
        let analyzer = DriftDetectionAnalyzer::new(0.05);

        let events = vec![
            LabeledDriftEvent {
                event_id: "E1".to_string(),
                event_type: DriftEventCategory::MeanShift,
                start_period: 10,
                end_period: Some(15),
                affected_fields: vec!["amount".to_string()],
                magnitude: 0.15,
                detection_difficulty: DetectionDifficulty::Easy,
            },
            LabeledDriftEvent {
                event_id: "E2".to_string(),
                event_type: DriftEventCategory::OrganizationalEvent,
                start_period: 20,
                end_period: Some(25),
                affected_fields: vec!["volume".to_string()],
                magnitude: 0.30,
                detection_difficulty: DetectionDifficulty::Medium,
            },
        ];

        let result = analyzer.analyze_labeled_events(&events).unwrap();
        assert_eq!(result.total_events, 2);
        assert_eq!(result.statistical_events, 1);
        assert_eq!(result.business_events, 1);
        // Mean of 0.15 and 0.30 is 0.225.
        assert!(result.avg_magnitude > 0.2);
        assert!(result.passes);
    }

    // An empty event list yields a passing result with zero events.
    #[test]
    fn test_empty_labeled_events() {
        let analyzer = DriftDetectionAnalyzer::new(0.05);
        let result = analyzer.analyze_labeled_events(&[]).unwrap();
        assert_eq!(result.total_events, 0);
        assert!(result.passes);
    }

    // Both halves of a repeating pattern should be close in Hellinger distance.
    #[test]
    fn test_hellinger_distance_no_drift() {
        let analyzer = DriftDetectionAnalyzer::new(0.05);

        // Stable data: values cycle through 100..=104 with no level change.
        let entries: Vec<DriftDetectionEntry> = (0..40)
            .map(|i| DriftDetectionEntry::new(i, 100.0 + (i as f64 % 5.0), None))
            .collect();

        let result = analyzer.analyze(&entries).unwrap();
        assert!(result.hellinger_distance.unwrap() < 0.3);
    }

    // PSI is reported and positive when the two halves differ.
    #[test]
    fn test_psi_calculation() {
        let analyzer = DriftDetectionAnalyzer::new(0.05);

        // Data with drift: the level doubles at period 20.
        let mut entries: Vec<DriftDetectionEntry> = (0..20)
            .map(|i| DriftDetectionEntry::new(i, 100.0, None))
            .collect();
        for i in 20..40 {
            entries.push(DriftDetectionEntry::new(i, 200.0, None));
        }

        let result = analyzer.analyze(&entries).unwrap();
        assert!(result.psi.is_some());
        // Only strict positivity is asserted here: the halves differ, so the
        // index must be non-zero. (By the usual rule of thumb, PSI above
        // roughly 0.1 is read as a noticeable shift.)
        assert!(result.psi.unwrap() > 0.0);
    }

    // Precision and recall must be valid ratios when ground truth is supplied.
    #[test]
    fn test_detection_metrics_calculation() {
        let analyzer = DriftDetectionAnalyzer::new(0.05).with_window_size(3);

        // Create data where we know the ground truth
        let mut entries = Vec::new();
        for i in 0..10 {
            entries.push(DriftDetectionEntry::new(i, 100.0, Some(false)));
        }
        for i in 10..20 {
            entries.push(DriftDetectionEntry::new(i, 200.0, Some(true)));
        }

        let result = analyzer.analyze(&entries).unwrap();

        // Both metrics are ratios of counts (TP / (TP + FP) and TP / (TP + FN)),
        // so they must lie in [0, 1]; a bare `>= 0.0` check would accept
        // out-of-range values.
        assert!((0.0..=1.0).contains(&result.detection_metrics.precision));
        assert!((0.0..=1.0).contains(&result.detection_metrics.recall));
    }
}