Skip to main content

ftui_runtime/
flake_detector.rs

//! Anytime-Valid Flake Detector (bd-1plj).
//!
//! Detects flaky timing regressions in E2E tests without inflating false positives,
//! using anytime-valid e-process statistics.
//!
//! # Mathematical Model
//!
//! For sub-Gaussian residuals `r_t` (mean 0 under null), the e-value at time `t` is:
//!
//! ```text
//! e_t = exp(λ × r_t − (λ² × σ²) / 2)
//! E_t = ∏_{i=1}^t e_i
//! ```
//!
//! We reject H₀ (system is stable) when `E_t > 1/α`, providing anytime-valid
//! Type I error control.
//!
//! # Key Properties
//!
//! - **Anytime-valid**: Can stop testing early without invalid inference
//! - **Bounded false positives in stable runs**: E[E_t] ≤ 1 under H₀, so by
//!   Ville's inequality the probability of ever crossing `1/α` is at most α
//! - **Early detection**: Strong evidence triggers early failure
//! - **Variable-length support**: Works with different test run lengths
//!
//! # Failure Modes
//!
//! | Condition | Behavior | Rationale |
//! |-----------|----------|-----------|
//! | σ = 0 | Clamp to σ_MIN | Degenerate variance guard |
//! | E_t underflow | Clamp to E_MIN | Prevents permanent zero-lock |
//! | E_t overflow | Clamp to E_MAX | Numerical stability |
//! | No observations | E_t = 1 | Identity element |
//!
//! # Example
//!
//! ```rust,ignore
//! use ftui_runtime::flake_detector::{FlakeDetector, FlakeConfig};
//!
//! let mut detector = FlakeDetector::new(FlakeConfig::default());
//!
//! // Observe latency deviations
//! let decision = detector.observe(latency_deviation);
//! if decision.is_flaky {
//!     eprintln!("Flaky test detected");
//! }
//! ```
47
48#![forbid(unsafe_code)]
49
50use std::collections::VecDeque;
51
/// Floor applied to every sigma value so that a zero, negative, or NaN
/// estimate never reaches the e-value formula.
const SIGMA_MIN: f64 = 1e-9;

/// Lower clamp for e-values; keeps the running product from collapsing to an
/// exact zero that no later evidence could lift.
const E_MIN: f64 = 1e-100;

/// Upper clamp for e-values; keeps the running product finite.
const E_MAX: f64 = 1e100;

/// Default significance level.
const DEFAULT_ALPHA: f64 = 0.05;

/// Default lambda (betting intensity).
const DEFAULT_LAMBDA: f64 = 0.5;

// =============================================================================
// Configuration
// =============================================================================

/// Configuration for the flake detector.
///
/// The builder methods sanitize their inputs (clamping, NaN rejection). The
/// fields are public, so values assigned directly bypass that sanitization.
#[derive(Debug, Clone)]
pub struct FlakeConfig {
    /// Significance level `α`. Fail when `E_t > 1/α`.
    /// Lower α → more conservative (fewer false alarms). Default: 0.05.
    pub alpha: f64,

    /// Betting intensity `λ`. Higher values detect deviations faster
    /// but are more sensitive to noise. Default: 0.5.
    pub lambda: f64,

    /// Prior estimate of standard deviation for latency residuals.
    /// Used in the e-value formula. Default: 1.0 (normalized units).
    pub sigma: f64,

    /// Size of the residual buffer kept for variance estimation.
    /// Set to 0 to use the fixed prior `sigma` instead of an empirical
    /// estimate. Default: 50.
    pub variance_window: usize,

    /// Minimum observations before making decisions.
    /// Helps with warm-up. Default: 3.
    pub min_observations: usize,

    /// Enable JSONL-compatible evidence logging. Default: false.
    pub enable_logging: bool,

    /// Optional override for the flakiness threshold. When `None`,
    /// [`FlakeConfig::threshold`] falls back to `1/alpha`.
    pub threshold: Option<f64>,
}

impl Default for FlakeConfig {
    fn default() -> Self {
        Self {
            alpha: DEFAULT_ALPHA,
            lambda: DEFAULT_LAMBDA,
            sigma: 1.0,
            variance_window: 50,
            min_observations: 3,
            enable_logging: false,
            threshold: None,
        }
    }
}

impl FlakeConfig {
    /// Create a new configuration with the given alpha.
    ///
    /// `alpha` is clamped to `[1e-10, 0.5]`. A NaN `alpha` falls back to the
    /// default significance level: `f64::clamp` would pass NaN through, and a
    /// NaN threshold can never be exceeded, silently disabling detection.
    #[must_use]
    pub fn new(alpha: f64) -> Self {
        let alpha = if alpha.is_nan() {
            DEFAULT_ALPHA
        } else {
            alpha.clamp(1e-10, 0.5)
        };
        Self {
            alpha,
            ..Default::default()
        }
    }

    /// Set the betting intensity lambda, clamped to `[0.01, 2.0]`.
    ///
    /// A NaN `lambda` is ignored and the current value is kept, because a NaN
    /// lambda would turn every e-value into NaN.
    #[must_use]
    pub fn with_lambda(mut self, lambda: f64) -> Self {
        if !lambda.is_nan() {
            self.lambda = lambda.clamp(0.01, 2.0);
        }
        self
    }

    /// Set the prior sigma, floored at `SIGMA_MIN`.
    ///
    /// Non-finite values (NaN, ±∞) are ignored and the current sigma is kept.
    #[must_use]
    pub fn with_sigma(mut self, sigma: f64) -> Self {
        if sigma.is_finite() {
            self.sigma = sigma.max(SIGMA_MIN);
        }
        self
    }

    /// Set the variance window size (0 selects the fixed prior sigma).
    #[must_use]
    pub fn with_variance_window(mut self, window: usize) -> Self {
        self.variance_window = window;
        self
    }

    /// Set minimum observations; values below 1 are raised to 1.
    #[must_use]
    pub fn with_min_observations(mut self, min: usize) -> Self {
        self.min_observations = min.max(1);
        self
    }

    /// Enable or disable evidence logging.
    #[must_use]
    pub fn with_logging(mut self, enabled: bool) -> Self {
        self.enable_logging = enabled;
        self
    }

    /// Get the decision threshold: the explicit override if one is set,
    /// otherwise `1/alpha`.
    #[must_use]
    pub fn threshold(&self) -> f64 {
        self.threshold.unwrap_or(1.0 / self.alpha)
    }
}
167
168// =============================================================================
169// Decision Types
170// =============================================================================
171
/// Snapshot of the detector's verdict after an observation.
#[derive(Debug, Clone, PartialEq)]
pub struct FlakeDecision {
    /// `true` when the cumulative e-value is above the threshold
    /// (regardless of warm-up).
    pub is_flaky: bool,
    /// Cumulative e-value at the time of the decision.
    pub e_value: f64,
    /// Threshold the e-value was compared against.
    pub threshold: f64,
    /// How many observations had been processed.
    pub observation_count: usize,
    /// Variance estimate in effect for this decision.
    pub variance_estimate: f64,
    /// `true` once the minimum number of observations has been reached.
    pub warmed_up: bool,
}

impl FlakeDecision {
    /// Whether the test should be failed: the flaky flag only counts once
    /// the detector has warmed up.
    #[must_use]
    pub fn should_fail(&self) -> bool {
        if !self.warmed_up {
            return false;
        }
        self.is_flaky
    }
}
196
/// Log entry for evidence tracking.
#[derive(Debug, Clone)]
pub struct EvidenceLog {
    /// Observation index.
    pub observation_idx: usize,
    /// The residual value observed.
    pub residual: f64,
    /// The incremental e-value for this observation.
    pub e_increment: f64,
    /// Cumulative e-value after this observation.
    pub e_cumulative: f64,
    /// Variance estimate at this point.
    pub variance: f64,
    /// Decision at this point.
    pub decision: bool,
}

impl EvidenceLog {
    /// Serialize to a single JSON object suitable for one JSONL line.
    ///
    /// Finite floats are written with six decimal places. Non-finite floats
    /// (NaN, ±∞) are written as `null`, because JSON has no literal for them
    /// and emitting `NaN`/`inf` would make the line unparseable.
    #[must_use]
    pub fn to_jsonl(&self) -> String {
        format!(
            r#"{{"idx":{},"residual":{},"e_inc":{},"e_cum":{},"var":{},"decision":{}}}"#,
            self.observation_idx,
            Self::json_number(self.residual),
            Self::json_number(self.e_increment),
            Self::json_number(self.e_cumulative),
            Self::json_number(self.variance),
            self.decision
        )
    }

    /// Render a float as a JSON number, or `null` when it is not finite.
    fn json_number(value: f64) -> String {
        if value.is_finite() {
            format!("{:.6}", value)
        } else {
            String::from("null")
        }
    }
}
229
230// =============================================================================
231// Flake Detector
232// =============================================================================
233
234/// Anytime-valid flake detector using e-process statistics.
235#[derive(Debug, Clone)]
236pub struct FlakeDetector {
237    /// Configuration.
238    config: FlakeConfig,
239    /// Cumulative e-value (product of incremental e-values).
240    e_cumulative: f64,
241    /// Observation count.
242    observation_count: usize,
243    /// Rolling window for variance estimation.
244    variance_window: VecDeque<f64>,
245    /// Online mean for variance calculation.
246    online_mean: f64,
247    /// Online M2 for variance calculation (Welford's algorithm).
248    online_m2: f64,
249    /// Evidence log (if logging enabled).
250    evidence_log: Vec<EvidenceLog>,
251}
252
253impl FlakeDetector {
254    /// Create a new flake detector.
255    #[must_use]
256    pub fn new(config: FlakeConfig) -> Self {
257        let capacity = if config.variance_window > 0 {
258            config.variance_window
259        } else {
260            1
261        };
262        Self {
263            config,
264            e_cumulative: 1.0, // Identity element
265            observation_count: 0,
266            variance_window: VecDeque::with_capacity(capacity),
267            online_mean: 0.0,
268            online_m2: 0.0,
269            evidence_log: Vec::new(),
270        }
271    }
272
273    /// Observe a latency deviation (residual).
274    ///
275    /// The residual should be the difference between observed latency
276    /// and expected latency, ideally normalized.
277    pub fn observe(&mut self, residual: f64) -> FlakeDecision {
278        self.observation_count += 1;
279
280        // Update variance estimate
281        self.update_variance(residual);
282        let sigma = self.current_sigma();
283
284        // Compute incremental e-value: e_t = exp(λ × r_t − (λ² × σ²) / 2)
285        let lambda = self.config.lambda;
286        let exponent = lambda * residual - (lambda * lambda * sigma * sigma) / 2.0;
287        let e_increment = exponent.exp().clamp(E_MIN, E_MAX);
288
289        // Update cumulative e-value
290        self.e_cumulative = (self.e_cumulative * e_increment).clamp(E_MIN, E_MAX);
291
292        // Check threshold
293        let threshold = self.config.threshold();
294        let is_flaky = self.e_cumulative > threshold;
295        let warmed_up = self.observation_count >= self.config.min_observations;
296
297        // Log if enabled
298        if self.config.enable_logging {
299            self.evidence_log.push(EvidenceLog {
300                observation_idx: self.observation_count,
301                residual,
302                e_increment,
303                e_cumulative: self.e_cumulative,
304                variance: sigma * sigma,
305                decision: is_flaky && warmed_up,
306            });
307        }
308
309        FlakeDecision {
310            is_flaky,
311            e_value: self.e_cumulative,
312            threshold,
313            observation_count: self.observation_count,
314            variance_estimate: sigma * sigma,
315            warmed_up,
316        }
317    }
318
319    /// Observe multiple residuals and return the final decision.
320    pub fn observe_batch(&mut self, residuals: &[f64]) -> FlakeDecision {
321        let mut decision = FlakeDecision {
322            is_flaky: false,
323            e_value: self.e_cumulative,
324            threshold: self.config.threshold(),
325            observation_count: self.observation_count,
326            variance_estimate: self.current_sigma().powi(2),
327            warmed_up: false,
328        };
329
330        for &r in residuals {
331            decision = self.observe(r);
332            if decision.should_fail() {
333                break; // Early stopping with anytime-valid guarantee
334            }
335        }
336
337        decision
338    }
339
340    /// Reset the detector state.
341    pub fn reset(&mut self) {
342        self.e_cumulative = 1.0;
343        self.observation_count = 0;
344        self.variance_window.clear();
345        self.online_mean = 0.0;
346        self.online_m2 = 0.0;
347        self.evidence_log.clear();
348    }
349
350    /// Get the current e-value.
351    #[must_use]
352    pub fn e_value(&self) -> f64 {
353        self.e_cumulative
354    }
355
356    /// Get the observation count.
357    #[must_use]
358    pub fn observation_count(&self) -> usize {
359        self.observation_count
360    }
361
362    /// Check if the detector has warmed up.
363    #[must_use]
364    pub fn is_warmed_up(&self) -> bool {
365        self.observation_count >= self.config.min_observations
366    }
367
368    /// Get the evidence log.
369    #[must_use]
370    pub fn evidence_log(&self) -> &[EvidenceLog] {
371        &self.evidence_log
372    }
373
374    /// Export evidence log as JSONL.
375    #[must_use]
376    pub fn evidence_to_jsonl(&self) -> String {
377        self.evidence_log
378            .iter()
379            .map(|e| e.to_jsonl())
380            .collect::<Vec<_>>()
381            .join("\n")
382    }
383
384    /// Get the current sigma estimate.
385    #[must_use]
386    pub fn current_sigma(&self) -> f64 {
387        if self.config.variance_window == 0 || self.observation_count < 2 {
388            return self.config.sigma.max(SIGMA_MIN);
389        }
390
391        // Use Welford's variance estimate
392        let variance = if self.observation_count > 1 {
393            self.online_m2 / (self.observation_count - 1) as f64
394        } else {
395            self.config.sigma * self.config.sigma
396        };
397
398        variance.sqrt().max(SIGMA_MIN)
399    }
400
401    /// Update variance estimate with new observation.
402    fn update_variance(&mut self, residual: f64) {
403        if self.config.variance_window == 0 {
404            return;
405        }
406
407        // Welford's online algorithm
408        let n = self.observation_count as f64;
409        let delta = residual - self.online_mean;
410        self.online_mean += delta / n;
411        let delta2 = residual - self.online_mean;
412        self.online_m2 += delta * delta2;
413
414        // Also maintain rolling window for optional use
415        if self.variance_window.len() >= self.config.variance_window {
416            self.variance_window.pop_front();
417        }
418        self.variance_window.push_back(residual);
419    }
420
421    /// Get configuration.
422    #[must_use]
423    pub fn config(&self) -> &FlakeConfig {
424        &self.config
425    }
426}
427
428impl Default for FlakeDetector {
429    fn default() -> Self {
430        Self::new(FlakeConfig::default())
431    }
432}
433
434// =============================================================================
435// Summary Statistics
436// =============================================================================
437
438/// Summary of flake detection run.
439#[derive(Debug, Clone)]
440pub struct FlakeSummary {
441    /// Total observations.
442    pub total_observations: usize,
443    /// Final e-value.
444    pub final_e_value: f64,
445    /// Whether flagged as flaky.
446    pub is_flaky: bool,
447    /// Observation index where flakiness was first detected (if any).
448    pub first_flaky_at: Option<usize>,
449    /// Maximum e-value observed.
450    pub max_e_value: f64,
451    /// Threshold used.
452    pub threshold: f64,
453}
454
455impl FlakeDetector {
456    /// Generate summary statistics.
457    #[must_use]
458    pub fn summary(&self) -> FlakeSummary {
459        let first_flaky_at = self
460            .evidence_log
461            .iter()
462            .find(|e| e.decision)
463            .map(|e| e.observation_idx);
464
465        let max_e_value = self
466            .evidence_log
467            .iter()
468            .map(|e| e.e_cumulative)
469            .fold(1.0_f64, f64::max);
470
471        FlakeSummary {
472            total_observations: self.observation_count,
473            final_e_value: self.e_cumulative,
474            is_flaky: self.e_cumulative > self.config.threshold(),
475            first_flaky_at,
476            max_e_value,
477            threshold: self.config.threshold(),
478        }
479    }
480}
481
482// =============================================================================
483// Tests
484// =============================================================================
485
#[cfg(test)]
mod tests {
    use super::*;

    /// Sustained large residuals must push the e-value past 1/alpha.
    #[test]
    fn unit_eprocess_threshold() {
        let config = FlakeConfig::new(0.05).with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        // Feed large positive residuals to drive the e-value up.
        let mut triggered = None;
        for _ in 0..20 {
            let decision = detector.observe(3.0); // Large deviation
            if decision.should_fail() {
                triggered = Some(decision);
                break;
            }
        }

        let decision = triggered.expect("sustained 3.0 residuals should trigger within 20 steps");
        assert!(decision.is_flaky);
        assert!(decision.e_value > decision.threshold);
        assert!(decision.observation_count <= 20);
    }

    /// E-values are clamped into a positive, finite range.
    #[test]
    fn unit_eprocess_nonnegative() {
        let mut detector = FlakeDetector::default();

        // Test with various residuals including negative ones.
        let residuals = [-5.0, -2.0, 0.0, 2.0, 5.0, -10.0, 10.0];
        for r in residuals {
            let decision = detector.observe(r);
            assert!(
                decision.e_value > 0.0,
                "E-value must be positive, got {}",
                decision.e_value
            );
            assert!(decision.e_value.is_finite());
        }
    }

    /// A small, bounded, deterministic signal must run to completion
    /// without the batch stopping early.
    #[test]
    fn unit_optional_stopping() {
        let config = FlakeConfig::new(0.05)
            .with_lambda(0.3)
            .with_min_observations(1)
            .with_logging(true);
        let mut detector = FlakeDetector::new(config);

        // Simulate stable run (small residuals around 0). The partial sums of
        // this sequence never exceed ~2.0, so λ·Σr stays far below ln(20).
        let stable_residuals: Vec<f64> = (0..100).map(|i| (i as f64 * 0.1).sin() * 0.1).collect();

        let decision = detector.observe_batch(&stable_residuals);

        assert!(
            !decision.should_fail(),
            "Stable run should not trigger flakiness"
        );
        assert!(decision.e_value < decision.threshold);
        assert_eq!(decision.observation_count, stable_residuals.len());
        assert_eq!(detector.evidence_log().len(), stable_residuals.len());
    }

    /// All-zero residuals never produce evidence against the null.
    #[test]
    fn unit_stable_run_no_false_positives() {
        let config = FlakeConfig::new(0.05)
            .with_sigma(1.0)
            .with_lambda(0.5)
            .with_min_observations(3);
        let mut detector = FlakeDetector::new(config);

        // Zero residuals (perfectly stable): every increment is exp(-λ²σ²/2) <= 1,
        // so the cumulative e-value can never rise above its starting value.
        for _ in 0..50 {
            let decision = detector.observe(0.0);
            assert!(
                !decision.should_fail(),
                "Zero residuals should never trigger flakiness"
            );
            assert!(decision.e_value <= 1.0);
        }
    }

    /// A sustained spike after a calm prefix is detected.
    #[test]
    fn unit_spike_detection() {
        let config = FlakeConfig::new(0.05)
            .with_sigma(1.0)
            .with_lambda(0.5)
            .with_min_observations(3)
            .with_logging(true);
        let mut detector = FlakeDetector::new(config);

        // Start with some normal observations.
        for _ in 0..5 {
            detector.observe(0.1);
        }

        // Inject spike.
        let mut detected = false;
        for _ in 0..20 {
            let decision = detector.observe(5.0); // Large spike
            if decision.should_fail() {
                detected = true;
                break;
            }
        }

        assert!(detected, "Should detect sustained spike");
    }

    /// `reset` returns the detector to its initial state.
    #[test]
    fn unit_reset() {
        let mut detector = FlakeDetector::default();
        detector.observe(1.0);
        detector.observe(2.0);

        assert_eq!(detector.observation_count(), 2);

        detector.reset();

        assert_eq!(detector.observation_count(), 0);
        assert!((detector.e_value() - 1.0).abs() < 1e-10);
        assert!(!detector.is_warmed_up());
        assert!(detector.evidence_log().is_empty());
    }

    /// Constant input has zero sample variance, so sigma collapses to its floor.
    #[test]
    fn unit_variance_estimation() {
        let config = FlakeConfig::default().with_variance_window(10);
        let mut detector = FlakeDetector::new(config);

        // Feed constant residuals.
        for _ in 0..20 {
            detector.observe(1.0);
        }

        let sigma = detector.current_sigma();
        assert!(sigma > 0.0, "sigma must stay positive");
        assert!(
            sigma < 1e-6,
            "Constant input should drive sigma to its floor, got {}",
            sigma
        );
    }

    /// With logging enabled, one entry is recorded per observation.
    #[test]
    fn unit_evidence_log() {
        let config = FlakeConfig::default()
            .with_logging(true)
            .with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        detector.observe(0.5);
        detector.observe(1.0);
        detector.observe(-0.5);

        assert_eq!(detector.evidence_log().len(), 3);

        let jsonl = detector.evidence_to_jsonl();
        assert_eq!(jsonl.lines().count(), 3);
        assert!(jsonl.contains("\"idx\":1"));
        assert!(jsonl.contains("\"idx\":2"));
        assert!(jsonl.contains("\"idx\":3"));
    }

    /// The summary mirrors the detector's final state.
    #[test]
    fn unit_summary() {
        let config = FlakeConfig::default()
            .with_logging(true)
            .with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        for _ in 0..10 {
            detector.observe(0.1);
        }

        let summary = detector.summary();
        assert_eq!(summary.total_observations, 10);
        assert!(summary.final_e_value > 0.0);
        assert!((summary.final_e_value - detector.e_value()).abs() < 1e-12);
        assert!((summary.threshold - 20.0).abs() < 1e-9);
        assert!(summary.max_e_value >= summary.final_e_value);
        // Ten residuals of 0.1 are far too little evidence to cross 1/alpha.
        assert!(!summary.is_flaky);
        assert!(summary.first_flaky_at.is_none());
    }

    /// A batch that never fails consumes every residual.
    #[test]
    fn unit_batch_observe() {
        let config = FlakeConfig::default().with_min_observations(1);
        let mut detector = FlakeDetector::new(config);

        let residuals = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let decision = detector.observe_batch(&residuals);

        assert_eq!(decision.observation_count, 5);
        assert_eq!(detector.observation_count(), 5);
        assert!(!decision.should_fail());
    }

    /// Builder methods store (sanitized) values.
    #[test]
    fn unit_config_builder() {
        let config = FlakeConfig::new(0.01)
            .with_lambda(0.3)
            .with_sigma(2.0)
            .with_variance_window(100)
            .with_min_observations(5)
            .with_logging(true);

        assert!((config.alpha - 0.01).abs() < 1e-10);
        assert!((config.lambda - 0.3).abs() < 1e-10);
        assert!((config.sigma - 2.0).abs() < 1e-10);
        assert_eq!(config.variance_window, 100);
        assert_eq!(config.min_observations, 5);
        assert!(config.enable_logging);
        assert!((config.threshold() - 100.0).abs() < 1e-10);
    }

    /// Extreme finite residuals keep the e-value finite and positive.
    #[test]
    fn unit_numerical_stability() {
        let mut detector = FlakeDetector::default();

        // Very large residuals
        for _ in 0..10 {
            let decision = detector.observe(1000.0);
            assert!(decision.e_value.is_finite());
            assert!(decision.e_value > 0.0);
        }

        detector.reset();

        // Very large negative residuals
        for _ in 0..10 {
            let decision = detector.observe(-1000.0);
            assert!(decision.e_value.is_finite());
            assert!(decision.e_value > 0.0);
        }
    }
}