Skip to main content

ftui_runtime/
flake_detector.rs

1//! Anytime-Valid Flake Detector (bd-1plj).
2//!
3//! Detects flaky timing regressions in E2E tests without inflating false positives,
4//! using anytime-valid e-process statistics.
5//!
6//! # Mathematical Model
7//!
8//! For sub-Gaussian residuals `r_t` (mean 0 under null), the e-value at time `t` is:
9//!
10//! ```text
11//! e_t = exp(λ × r_t − (λ² × σ²) / 2)
12//! E_t = ∏_{i=1}^t e_i
13//! ```
14//!
15//! We reject H₀ (system is stable) when `E_t > 1/α`, providing anytime-valid
16//! Type I error control.
17//!
18//! # Key Properties
19//!
20//! - **Anytime-valid**: Can stop testing early without invalid inference
//! - **Controlled false positives in stable runs**: E[E_t] ≤ 1 under H₀, so the chance of ever crossing 1/α is at most α
22//! - **Early detection**: Strong evidence triggers early failure
23//! - **Variable-length support**: Works with different test run lengths
24//!
25//! # Failure Modes
26//!
27//! | Condition | Behavior | Rationale |
28//! |-----------|----------|-----------|
//! | σ = 0 | Clamp to σ_MIN | Avoids a degenerate zero variance estimate |
30//! | E_t underflow | Clamp to E_MIN | Prevents permanent zero-lock |
31//! | E_t overflow | Clamp to E_MAX | Numerical stability |
32//! | No observations | E_t = 1 | Identity element |
33//!
34//! # Example
35//!
36//! ```rust,ignore
37//! use ftui_runtime::flake_detector::{FlakeDetector, FlakeConfig};
38//!
39//! let mut detector = FlakeDetector::new(FlakeConfig::default());
40//!
41//! // Observe latency deviations
42//! let decision = detector.observe(latency_deviation);
43//! if decision.is_flaky {
44//!     eprintln!("Flaky test detected");
45//! }
46//! ```
47
48#![forbid(unsafe_code)]
49
50use std::collections::VecDeque;
51
/// Smallest standard deviation the detector will ever work with; both the
/// configured prior and the rolling estimate are floored at this value.
const SIGMA_MIN: f64 = 1.0e-9;

/// Lower clamp applied to incremental and cumulative e-values so the running
/// product can never get stuck at zero.
const E_MIN: f64 = 1.0e-100;

/// Upper clamp applied to incremental and cumulative e-values so the running
/// product stays finite.
const E_MAX: f64 = 1.0e100;

/// Significance level used by the default configuration.
const DEFAULT_ALPHA: f64 = 0.05;

/// Betting intensity (lambda) used by the default configuration.
const DEFAULT_LAMBDA: f64 = 0.5;
66
67// =============================================================================
68// Configuration
69// =============================================================================
70
/// Tunable parameters of the flake detector.
///
/// All fields are public; the `with_*` builder methods additionally clamp
/// their inputs, which direct field assignment does not.
#[derive(Debug, Clone)]
pub struct FlakeConfig {
    /// Significance level `α`; the detector flags once `E_t > 1/α`.
    /// A smaller `α` raises the bar and yields fewer alarms. Default: 0.05.
    pub alpha: f64,

    /// Betting intensity `λ` used in the e-value exponent. Larger values
    /// react faster to deviations but are noisier. Default: 0.5.
    pub lambda: f64,

    /// Prior standard deviation of the latency residuals, used whenever no
    /// rolling estimate is available. Default: 1.0 (normalized units).
    pub sigma: f64,

    /// Number of residuals kept for the rolling variance estimate.
    /// `0` disables the estimate and always uses `sigma`. Default: 50.
    pub variance_window: usize,

    /// Observations required before a decision counts as warmed up.
    /// Default: 3.
    pub min_observations: usize,

    /// Whether each observation is recorded in the JSONL-compatible
    /// evidence log. Default: false.
    pub enable_logging: bool,

    /// Explicit flakiness threshold. When `None`, `1/alpha` is used.
    pub threshold: Option<f64>,
}
101
102impl Default for FlakeConfig {
103    fn default() -> Self {
104        Self {
105            alpha: DEFAULT_ALPHA,
106            lambda: DEFAULT_LAMBDA,
107            sigma: 1.0,
108            variance_window: 50,
109            min_observations: 3,
110            enable_logging: false,
111            threshold: None,
112        }
113    }
114}
115
116impl FlakeConfig {
117    /// Create a new configuration with the given alpha.
118    #[must_use]
119    pub fn new(alpha: f64) -> Self {
120        Self {
121            alpha: alpha.clamp(1e-10, 0.5),
122            ..Default::default()
123        }
124    }
125
126    /// Set the betting intensity lambda.
127    #[must_use]
128    pub fn with_lambda(mut self, lambda: f64) -> Self {
129        self.lambda = lambda.clamp(0.01, 2.0);
130        self
131    }
132
133    /// Set the prior sigma.
134    #[must_use]
135    pub fn with_sigma(mut self, sigma: f64) -> Self {
136        self.sigma = sigma.max(SIGMA_MIN);
137        self
138    }
139
140    /// Set the variance window size.
141    #[must_use]
142    pub fn with_variance_window(mut self, window: usize) -> Self {
143        self.variance_window = window;
144        self
145    }
146
147    /// Set minimum observations.
148    #[must_use]
149    pub fn with_min_observations(mut self, min: usize) -> Self {
150        self.min_observations = min.max(1);
151        self
152    }
153
154    /// Enable logging.
155    #[must_use]
156    pub fn with_logging(mut self, enabled: bool) -> Self {
157        self.enable_logging = enabled;
158        self
159    }
160
161    /// Get the threshold (1/alpha).
162    #[must_use]
163    pub fn threshold(&self) -> f64 {
164        self.threshold.unwrap_or(1.0 / self.alpha)
165    }
166}
167
168// =============================================================================
169// Decision Types
170// =============================================================================
171
/// Snapshot of the detector's verdict after an observation.
#[derive(Debug, Clone, PartialEq)]
pub struct FlakeDecision {
    /// `true` when the cumulative e-value is above the threshold.
    pub is_flaky: bool,
    /// Cumulative e-value at the time of the decision.
    pub e_value: f64,
    /// Threshold the e-value was compared against (1/alpha unless overridden).
    pub threshold: f64,
    /// How many observations had been counted.
    pub observation_count: usize,
    /// Variance estimate (sigma squared) in effect for this decision.
    pub variance_estimate: f64,
    /// `true` once the configured minimum number of observations was reached.
    pub warmed_up: bool,
}
188
189impl FlakeDecision {
190    /// Check if we should fail the test.
191    #[must_use]
192    pub fn should_fail(&self) -> bool {
193        self.is_flaky && self.warmed_up
194    }
195}
196
/// One record of the evidence trail, captured per observation when logging
/// is enabled.
#[derive(Debug, Clone)]
pub struct EvidenceLog {
    /// 1-based index of the observation this record belongs to.
    pub observation_idx: usize,
    /// Residual that was observed.
    pub residual: f64,
    /// E-value contributed by this observation alone.
    pub e_increment: f64,
    /// Running e-value after folding in this observation.
    pub e_cumulative: f64,
    /// Variance estimate used for this observation.
    pub variance: f64,
    /// Whether the detector was both flaky and warmed up at this point.
    pub decision: bool,
}
213
214impl EvidenceLog {
215    /// Serialize to JSONL format.
216    #[must_use]
217    pub fn to_jsonl(&self) -> String {
218        format!(
219            r#"{{"idx":{},"residual":{:.6},"e_inc":{:.6},"e_cum":{:.6},"var":{:.6},"decision":{}}}"#,
220            self.observation_idx,
221            self.residual,
222            self.e_increment,
223            self.e_cumulative,
224            self.variance,
225            self.decision
226        )
227    }
228}
229
230// =============================================================================
231// Flake Detector
232// =============================================================================
233
234/// Anytime-valid flake detector using e-process statistics.
235#[derive(Debug, Clone)]
236pub struct FlakeDetector {
237    /// Configuration.
238    config: FlakeConfig,
239    /// Cumulative e-value (product of incremental e-values).
240    e_cumulative: f64,
241    /// Observation count.
242    observation_count: usize,
243    /// Rolling window for variance estimation.
244    variance_window: VecDeque<f64>,
245    /// Evidence log (if logging enabled).
246    evidence_log: Vec<EvidenceLog>,
247    /// Observation index where a flaky decision was first reached.
248    first_flaky_at: Option<usize>,
249    /// Maximum cumulative e-value observed over the detector lifetime.
250    max_e_value: f64,
251}
252
253impl FlakeDetector {
254    /// Create a new flake detector.
255    #[must_use]
256    pub fn new(config: FlakeConfig) -> Self {
257        let capacity = if config.variance_window > 0 {
258            config.variance_window
259        } else {
260            1
261        };
262        Self {
263            config,
264            e_cumulative: 1.0, // Identity element
265            observation_count: 0,
266            variance_window: VecDeque::with_capacity(capacity),
267            evidence_log: Vec::new(),
268            first_flaky_at: None,
269            max_e_value: 1.0,
270        }
271    }
272
273    /// Observe a latency deviation (residual).
274    ///
275    /// The residual should be the difference between observed latency
276    /// and expected latency, ideally normalized.
277    pub fn observe(&mut self, residual: f64) -> FlakeDecision {
278        if residual.is_nan() {
279            return FlakeDecision {
280                is_flaky: false,
281                e_value: self.e_cumulative,
282                threshold: self.config.threshold(),
283                observation_count: self.observation_count,
284                variance_estimate: self.current_sigma().powi(2),
285                warmed_up: self.observation_count >= self.config.min_observations,
286            };
287        }
288
289        self.observation_count += 1;
290
291        // Update variance estimate
292        self.update_variance(residual);
293        let sigma = self.current_sigma();
294
295        // Compute incremental e-value: e_t = exp(λ × r_t − (λ² × σ²) / 2)
296        let lambda = self.config.lambda;
297        let exponent = lambda * residual - (lambda * lambda * sigma * sigma) / 2.0;
298        let e_increment = exponent.exp().clamp(E_MIN, E_MAX);
299
300        // Update cumulative e-value
301        self.e_cumulative = (self.e_cumulative * e_increment).clamp(E_MIN, E_MAX);
302
303        // Check threshold
304        let threshold = self.config.threshold();
305        let is_flaky = self.e_cumulative > threshold;
306        let warmed_up = self.observation_count >= self.config.min_observations;
307        let decision = is_flaky && warmed_up;
308
309        if decision && self.first_flaky_at.is_none() {
310            self.first_flaky_at = Some(self.observation_count);
311        }
312        self.max_e_value = self.max_e_value.max(self.e_cumulative);
313
314        // Log if enabled
315        if self.config.enable_logging {
316            self.evidence_log.push(EvidenceLog {
317                observation_idx: self.observation_count,
318                residual,
319                e_increment,
320                e_cumulative: self.e_cumulative,
321                variance: sigma * sigma,
322                decision,
323            });
324        }
325
326        FlakeDecision {
327            is_flaky,
328            e_value: self.e_cumulative,
329            threshold,
330            observation_count: self.observation_count,
331            variance_estimate: sigma * sigma,
332            warmed_up,
333        }
334    }
335
336    /// Observe multiple residuals and return the final decision.
337    pub fn observe_batch(&mut self, residuals: &[f64]) -> FlakeDecision {
338        let mut decision = FlakeDecision {
339            is_flaky: false,
340            e_value: self.e_cumulative,
341            threshold: self.config.threshold(),
342            observation_count: self.observation_count,
343            variance_estimate: self.current_sigma().powi(2),
344            warmed_up: false,
345        };
346
347        for &r in residuals {
348            decision = self.observe(r);
349            if decision.should_fail() {
350                break; // Early stopping with anytime-valid guarantee
351            }
352        }
353
354        decision
355    }
356
357    /// Reset the detector state.
358    pub fn reset(&mut self) {
359        self.e_cumulative = 1.0;
360        self.observation_count = 0;
361        self.variance_window.clear();
362        self.evidence_log.clear();
363        self.first_flaky_at = None;
364        self.max_e_value = 1.0;
365    }
366
367    /// Get the current e-value.
368    #[must_use]
369    pub fn e_value(&self) -> f64 {
370        self.e_cumulative
371    }
372
373    /// Get the observation count.
374    #[must_use]
375    pub fn observation_count(&self) -> usize {
376        self.observation_count
377    }
378
379    /// Check if the detector has warmed up.
380    #[must_use]
381    pub fn is_warmed_up(&self) -> bool {
382        self.observation_count >= self.config.min_observations
383    }
384
385    /// Get the evidence log.
386    #[must_use]
387    pub fn evidence_log(&self) -> &[EvidenceLog] {
388        &self.evidence_log
389    }
390
391    /// Export evidence log as JSONL.
392    #[must_use]
393    pub fn evidence_to_jsonl(&self) -> String {
394        self.evidence_log
395            .iter()
396            .map(|e| e.to_jsonl())
397            .collect::<Vec<_>>()
398            .join("\n")
399    }
400
401    /// Get the current sigma estimate.
402    #[must_use]
403    pub fn current_sigma(&self) -> f64 {
404        if self.config.variance_window == 0 || self.variance_window.len() < 2 {
405            return self.config.sigma.max(SIGMA_MIN);
406        }
407
408        let n = self.variance_window.len() as f64;
409        let mean = self.variance_window.iter().sum::<f64>() / n;
410        let variance = self
411            .variance_window
412            .iter()
413            .map(|&x| {
414                let diff = x - mean;
415                diff * diff
416            })
417            .sum::<f64>()
418            / (n - 1.0);
419
420        variance.sqrt().max(SIGMA_MIN)
421    }
422
423    /// Update variance estimate with new observation.
424    fn update_variance(&mut self, residual: f64) {
425        if self.config.variance_window == 0 {
426            return;
427        }
428
429        // Maintain rolling window
430        if self.variance_window.len() >= self.config.variance_window {
431            self.variance_window.pop_front();
432        }
433        self.variance_window.push_back(residual);
434    }
435
436    /// Get configuration.
437    #[must_use]
438    pub fn config(&self) -> &FlakeConfig {
439        &self.config
440    }
441}
442
443impl Default for FlakeDetector {
444    fn default() -> Self {
445        Self::new(FlakeConfig::default())
446    }
447}
448
449// =============================================================================
450// Summary Statistics
451// =============================================================================
452
/// Aggregate view of a detection run.
#[derive(Debug, Clone)]
pub struct FlakeSummary {
    /// Number of observations counted.
    pub total_observations: usize,
    /// Cumulative e-value at the time the summary was taken.
    pub final_e_value: f64,
    /// Whether the final e-value is above the threshold.
    pub is_flaky: bool,
    /// Observation count at which a flaky decision was first reached, if any.
    pub first_flaky_at: Option<usize>,
    /// Largest cumulative e-value seen during the run.
    pub max_e_value: f64,
    /// Threshold the e-value was compared against.
    pub threshold: f64,
}
469
470impl FlakeDetector {
471    /// Generate summary statistics.
472    #[must_use]
473    pub fn summary(&self) -> FlakeSummary {
474        FlakeSummary {
475            total_observations: self.observation_count,
476            final_e_value: self.e_cumulative,
477            is_flaky: self.e_cumulative > self.config.threshold(),
478            first_flaky_at: self.first_flaky_at,
479            max_e_value: self.max_e_value,
480            threshold: self.config.threshold(),
481        }
482    }
483}
484
485// =============================================================================
486// Tests
487// =============================================================================
488
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unit_eprocess_threshold() {
        // Sustained large positive residuals must push E_t above 1/alpha.
        let config = FlakeConfig::new(0.05).with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        let failing = (0..20)
            .map(|_| detector.observe(3.0))
            .find(FlakeDecision::should_fail)
            .expect("sustained 3.0 residuals should trigger within 20 observations");

        assert!(failing.e_value > failing.threshold);
    }

    #[test]
    fn unit_eprocess_nonnegative() {
        // E-values should never be negative
        let mut detector = FlakeDetector::default();

        // Test with various residuals including negative
        let residuals = [-5.0, -2.0, 0.0, 2.0, 5.0, -10.0, 10.0];
        for r in residuals {
            let decision = detector.observe(r);
            assert!(
                decision.e_value > 0.0,
                "E-value must be positive, got {}",
                decision.e_value
            );
        }
    }

    #[test]
    fn unit_optional_stopping() {
        // A stable run must be processed to the end without being flagged.
        let config = FlakeConfig::new(0.05)
            .with_lambda(0.3)
            .with_min_observations(1)
            .with_logging(true);
        let mut detector = FlakeDetector::new(config);

        // Simulate stable run (small, deterministic residuals around 0)
        let stable_residuals: Vec<f64> = (0..100).map(|i| (i as f64 * 0.1).sin() * 0.1).collect();

        let decision = detector.observe_batch(&stable_residuals);

        assert!(
            !decision.should_fail(),
            "Stable run should not trigger flakiness"
        );
        assert!(decision.e_value < decision.threshold);
        assert_eq!(decision.observation_count, stable_residuals.len());
    }

    #[test]
    fn unit_stable_run_no_false_positives() {
        // A truly stable run should not trigger false positives
        let config = FlakeConfig::new(0.05)
            .with_sigma(1.0)
            .with_lambda(0.5)
            .with_min_observations(3);
        let mut detector = FlakeDetector::new(config);

        // Zero residuals (perfectly stable)
        for _ in 0..50 {
            let decision = detector.observe(0.0);
            // With zero residuals the exponent is −λ²σ²/2 ≤ 0, so every
            // increment is at most 1 and the e-value can never grow.
            assert!(
                !decision.should_fail(),
                "Zero residuals should never trigger flakiness"
            );
            assert!(decision.e_value <= 1.0);
        }
    }

    #[test]
    fn unit_spike_detection() {
        // Inject latency spikes and verify detection
        let config = FlakeConfig::new(0.05)
            .with_sigma(1.0)
            .with_lambda(0.5)
            .with_min_observations(3)
            .with_logging(true);
        let mut detector = FlakeDetector::new(config);

        // Start with some normal observations
        for _ in 0..5 {
            detector.observe(0.1);
        }

        // Inject spike
        let mut detected = false;
        for _ in 0..20 {
            let decision = detector.observe(5.0); // Large spike
            if decision.should_fail() {
                detected = true;
                break;
            }
        }

        assert!(detected, "Should detect sustained spike");
    }

    #[test]
    fn unit_reset() {
        let mut detector = FlakeDetector::default();
        detector.observe(1.0);
        detector.observe(2.0);

        assert_eq!(detector.observation_count(), 2);

        detector.reset();

        assert_eq!(detector.observation_count(), 0);
        assert!((detector.e_value() - 1.0).abs() < 1e-10);
    }

    #[test]
    fn unit_variance_estimation() {
        let config = FlakeConfig::default().with_variance_window(10);
        let mut detector = FlakeDetector::new(config);

        // Feed constant residuals
        for _ in 0..20 {
            detector.observe(1.0);
        }

        // With constant input the sample variance is zero, so the estimate
        // must collapse towards the sigma floor.
        let sigma = detector.current_sigma();
        assert!(sigma >= SIGMA_MIN);
        assert!(sigma < 0.1, "Variance should converge, got sigma={sigma}");
    }

    #[test]
    fn unit_evidence_log() {
        let config = FlakeConfig::default()
            .with_logging(true)
            .with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        detector.observe(0.5);
        detector.observe(1.0);
        detector.observe(-0.5);

        assert_eq!(detector.evidence_log().len(), 3);

        let jsonl = detector.evidence_to_jsonl();
        assert!(jsonl.contains("\"idx\":1"));
        assert!(jsonl.contains("\"idx\":2"));
        assert!(jsonl.contains("\"idx\":3"));
    }

    #[test]
    fn unit_summary() {
        let config = FlakeConfig::default()
            .with_logging(true)
            .with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        for _ in 0..10 {
            detector.observe(0.1);
        }

        let summary = detector.summary();
        assert_eq!(summary.total_observations, 10);
        assert!(summary.final_e_value > 0.0);
        assert!(summary.threshold > 0.0);
    }

    #[test]
    fn unit_batch_observe() {
        let config = FlakeConfig::default().with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        let residuals = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let decision = detector.observe_batch(&residuals);

        assert_eq!(decision.observation_count, 5);
    }

    #[test]
    fn unit_config_builder() {
        let config = FlakeConfig::new(0.01)
            .with_lambda(0.3)
            .with_sigma(2.0)
            .with_variance_window(100)
            .with_min_observations(5)
            .with_logging(true);

        assert!((config.alpha - 0.01).abs() < 1e-10);
        assert!((config.lambda - 0.3).abs() < 1e-10);
        assert!((config.sigma - 2.0).abs() < 1e-10);
        assert_eq!(config.variance_window, 100);
        assert_eq!(config.min_observations, 5);
        assert!(config.enable_logging);
        assert!((config.threshold() - 100.0).abs() < 1e-10);
    }

    #[test]
    fn unit_numerical_stability() {
        let mut detector = FlakeDetector::default();

        // Very large positive residuals
        for _ in 0..10 {
            let decision = detector.observe(1000.0);
            assert!(decision.e_value.is_finite());
            assert!(decision.e_value > 0.0);
        }

        detector.reset();

        // Very large negative residuals
        for _ in 0..10 {
            let decision = detector.observe(-1000.0);
            assert!(decision.e_value.is_finite());
            assert!(decision.e_value > 0.0);
        }
    }

    // ── FlakeConfig defaults ─────────────────────────────────────

    #[test]
    fn config_default_values() {
        let config = FlakeConfig::default();
        assert!((config.alpha - DEFAULT_ALPHA).abs() < f64::EPSILON);
        assert!((config.lambda - DEFAULT_LAMBDA).abs() < f64::EPSILON);
        assert!((config.sigma - 1.0).abs() < f64::EPSILON);
        assert_eq!(config.variance_window, 50);
        assert_eq!(config.min_observations, 3);
        assert!(!config.enable_logging);
        assert!(config.threshold.is_none());
    }

    #[test]
    fn config_threshold_computed_from_alpha() {
        let config = FlakeConfig::new(0.05);
        assert!((config.threshold() - 20.0).abs() < 1e-10);
    }

    #[test]
    fn config_threshold_override() {
        let mut config = FlakeConfig::new(0.05);
        config.threshold = Some(42.0);
        assert!((config.threshold() - 42.0).abs() < f64::EPSILON);
    }

    // ── FlakeConfig clamping ─────────────────────────────────────

    #[test]
    fn config_new_clamps_alpha_low() {
        let config = FlakeConfig::new(0.0);
        assert!(config.alpha >= 1e-10);
    }

    #[test]
    fn config_new_clamps_alpha_high() {
        let config = FlakeConfig::new(1.0);
        assert!(config.alpha <= 0.5);
    }

    #[test]
    fn config_with_lambda_clamps_low() {
        let config = FlakeConfig::default().with_lambda(0.0);
        assert!(config.lambda >= 0.01);
    }

    #[test]
    fn config_with_lambda_clamps_high() {
        let config = FlakeConfig::default().with_lambda(100.0);
        assert!(config.lambda <= 2.0);
    }

    #[test]
    fn config_with_sigma_clamps_to_min() {
        let config = FlakeConfig::default().with_sigma(0.0);
        assert!(config.sigma >= SIGMA_MIN);
    }

    #[test]
    fn config_with_min_observations_clamps_to_one() {
        let config = FlakeConfig::default().with_min_observations(0);
        assert!(config.min_observations >= 1);
    }

    // ── FlakeDecision ────────────────────────────────────────────

    #[test]
    fn decision_should_fail_requires_both_flaky_and_warmed_up() {
        let d1 = FlakeDecision {
            is_flaky: true,
            warmed_up: false,
            e_value: 100.0,
            threshold: 20.0,
            observation_count: 1,
            variance_estimate: 1.0,
        };
        assert!(!d1.should_fail());

        let d2 = FlakeDecision {
            is_flaky: false,
            warmed_up: true,
            e_value: 1.0,
            threshold: 20.0,
            observation_count: 5,
            variance_estimate: 1.0,
        };
        assert!(!d2.should_fail());

        let d3 = FlakeDecision {
            is_flaky: true,
            warmed_up: true,
            e_value: 100.0,
            threshold: 20.0,
            observation_count: 5,
            variance_estimate: 1.0,
        };
        assert!(d3.should_fail());
    }

    // ── EvidenceLog JSONL ────────────────────────────────────────

    #[test]
    fn evidence_log_to_jsonl_format() {
        let log = EvidenceLog {
            observation_idx: 3,
            residual: 1.5,
            e_increment: 2.1,
            e_cumulative: 4.2,
            variance: 0.9,
            decision: true,
        };
        let jsonl = log.to_jsonl();
        assert!(jsonl.contains("\"idx\":3"));
        assert!(jsonl.contains("\"residual\":"));
        assert!(jsonl.contains("\"e_inc\":"));
        assert!(jsonl.contains("\"e_cum\":"));
        assert!(jsonl.contains("\"var\":"));
        assert!(jsonl.contains("\"decision\":true"));
    }

    #[test]
    fn evidence_log_to_jsonl_false_decision() {
        let log = EvidenceLog {
            observation_idx: 1,
            residual: 0.0,
            e_increment: 1.0,
            e_cumulative: 1.0,
            variance: 1.0,
            decision: false,
        };
        let jsonl = log.to_jsonl();
        assert!(jsonl.contains("\"decision\":false"));
    }

    // ── FlakeDetector accessors ──────────────────────────────────

    #[test]
    fn detector_default_initial_state() {
        let detector = FlakeDetector::default();
        assert_eq!(detector.observation_count(), 0);
        assert!((detector.e_value() - 1.0).abs() < f64::EPSILON);
        assert!(!detector.is_warmed_up());
        assert!(detector.evidence_log().is_empty());
    }

    #[test]
    fn detector_config_accessor() {
        let config = FlakeConfig::new(0.01).with_lambda(0.3);
        let detector = FlakeDetector::new(config);
        assert!((detector.config().alpha - 0.01).abs() < 1e-10);
        assert!((detector.config().lambda - 0.3).abs() < 1e-10);
    }

    #[test]
    fn detector_is_warmed_up_after_min_observations() {
        let config = FlakeConfig::default().with_min_observations(3);
        let mut detector = FlakeDetector::new(config);
        assert!(!detector.is_warmed_up());
        detector.observe(0.0);
        detector.observe(0.0);
        assert!(!detector.is_warmed_up());
        detector.observe(0.0);
        assert!(detector.is_warmed_up());
    }

    // ── Variance window = 0 (fixed sigma) ────────────────────────

    #[test]
    fn fixed_sigma_when_variance_window_zero() {
        let config = FlakeConfig::default()
            .with_sigma(3.0)
            .with_variance_window(0);
        let mut detector = FlakeDetector::new(config);
        detector.observe(10.0);
        detector.observe(20.0);
        assert!((detector.current_sigma() - 3.0).abs() < f64::EPSILON);
    }

    // ── Summary edge cases ───────────────────────────────────────

    #[test]
    fn summary_empty_detector() {
        let detector = FlakeDetector::new(FlakeConfig::default().with_logging(true));
        let summary = detector.summary();
        assert_eq!(summary.total_observations, 0);
        assert!((summary.final_e_value - 1.0).abs() < f64::EPSILON);
        assert!(!summary.is_flaky);
        assert!(summary.first_flaky_at.is_none());
        assert!((summary.max_e_value - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn summary_first_flaky_at_recorded() {
        let config = FlakeConfig::new(0.05)
            .with_min_observations(1)
            .with_logging(true);
        let mut detector = FlakeDetector::new(config);
        for _ in 0..50 {
            detector.observe(5.0);
        }
        let summary = detector.summary();
        assert!(summary.is_flaky, "sustained 5.0 residuals must be flagged");
        let first = summary
            .first_flaky_at
            .expect("should record first flaky index");
        assert!(first > 0);
        assert!(first <= summary.total_observations);
    }

    // ── Determinism ──────────────────────────────────────────────

    #[test]
    fn deterministic_same_inputs() {
        let config = FlakeConfig::new(0.05).with_lambda(0.5).with_sigma(1.0);
        let residuals = [0.1, -0.2, 0.5, -0.1, 3.0, 0.0, -1.0, 2.0];
        let mut d1 = FlakeDetector::new(config.clone());
        let mut d2 = FlakeDetector::new(config);
        for &r in &residuals {
            d1.observe(r);
            d2.observe(r);
        }
        assert!((d1.e_value() - d2.e_value()).abs() < 1e-10);
        assert_eq!(d1.observation_count(), d2.observation_count());
    }

    // ── Batch early stopping ─────────────────────────────────────

    #[test]
    fn batch_early_stops_on_flaky() {
        let config = FlakeConfig::new(0.05)
            .with_min_observations(1)
            .with_lambda(0.5);
        let mut detector = FlakeDetector::new(config);
        let mut residuals = vec![10.0; 20];
        residuals.extend(vec![0.0; 80]);
        let decision = detector.observe_batch(&residuals);
        assert!(
            decision.should_fail(),
            "large residuals should trigger a failure"
        );
        assert!(
            decision.observation_count < 100,
            "should stop early, count={}",
            decision.observation_count
        );
    }

    // ── E-value monotone under positive residuals ────────────────

    #[test]
    fn e_value_increases_under_consistent_positive_residuals() {
        let config = FlakeConfig::default()
            .with_variance_window(0)
            .with_sigma(1.0);
        let mut detector = FlakeDetector::new(config);
        let mut prev_e = 1.0;
        for _ in 0..5 {
            let decision = detector.observe(2.0);
            assert!(
                decision.e_value >= prev_e,
                "e-value should increase: prev={prev_e}, cur={}",
                decision.e_value
            );
            prev_e = decision.e_value;
        }
    }

    // ── Evidence log only when enabled ───────────────────────────

    #[test]
    fn no_evidence_log_when_disabled() {
        let config = FlakeConfig::default();
        let mut detector = FlakeDetector::new(config);
        detector.observe(1.0);
        detector.observe(2.0);
        assert!(detector.evidence_log().is_empty());
        assert!(detector.evidence_to_jsonl().is_empty());
    }

    #[test]
    fn summary_tracks_metrics_when_logging_disabled() {
        let config = FlakeConfig::new(0.05).with_min_observations(1);
        let mut detector = FlakeDetector::new(config);
        detector.observe(5.0);
        detector.observe(5.0);
        detector.observe(-1.0);

        let summary = detector.summary();
        assert_eq!(summary.first_flaky_at, Some(2));
        assert!(summary.max_e_value > summary.threshold);
        assert!(summary.max_e_value + f64::EPSILON >= summary.final_e_value);
    }

    // ── Reset clears everything ──────────────────────────────────

    #[test]
    fn reset_clears_evidence_log() {
        let config = FlakeConfig::default().with_logging(true);
        let mut detector = FlakeDetector::new(config);
        detector.observe(1.0);
        assert_eq!(detector.evidence_log().len(), 1);
        detector.reset();
        assert!(detector.evidence_log().is_empty());
    }
}