// tacet_core/result.rs

1//! Result types for adaptive Bayesian timing analysis.
2//!
3//! See spec Section 4.1 (Result Types) for the full specification.
4
5extern crate alloc;
6
7use alloc::string::String;
8use alloc::vec::Vec;
9use core::fmt;
10use serde::{Deserialize, Serialize};
11
12// ============================================================================
13// Outcome - The top-level result type
14// ============================================================================
15
16/// Top-level outcome of a timing test.
17///
18/// The adaptive Bayesian oracle returns one of four outcomes:
19/// - `Pass`: No timing leak detected (leak_probability < pass_threshold)
20/// - `Fail`: Timing leak confirmed (leak_probability > fail_threshold)
21/// - `Inconclusive`: Cannot reach a definitive conclusion
22/// - `Unmeasurable`: Operation too fast to measure on this platform
23///
24/// See spec Section 4.1 (Result Types).
25#[derive(Clone, Serialize, Deserialize)]
26#[allow(clippy::large_enum_variant)]
27pub enum Outcome {
28    /// No timing leak detected.
29    ///
30    /// The posterior probability of a timing leak is below the pass threshold
31    /// (default 0.05), meaning we're confident there is no exploitable leak.
32    Pass {
33        /// Posterior probability of timing leak: P(effect > theta | data).
34        /// Will be < pass_threshold (default 0.05).
35        leak_probability: f64,
36
37        /// Effect size estimate (shift and tail components).
38        effect: EffectEstimate,
39
40        /// Number of samples used in the analysis.
41        samples_used: usize,
42
43        /// Measurement quality assessment.
44        quality: MeasurementQuality,
45
46        /// Diagnostic information for debugging.
47        diagnostics: Diagnostics,
48
49        /// User's requested threshold in nanoseconds.
50        theta_user: f64,
51
52        /// Effective threshold used for inference (may be elevated due to measurement floor).
53        theta_eff: f64,
54
55        /// Measurement floor at final sample count.
56        theta_floor: f64,
57    },
58
59    /// Timing leak confirmed.
60    ///
61    /// The posterior probability of a timing leak exceeds the fail threshold
62    /// (default 0.95), meaning we're confident there is an exploitable leak.
63    Fail {
64        /// Posterior probability of timing leak: P(effect > theta | data).
65        /// Will be > fail_threshold (default 0.95).
66        leak_probability: f64,
67
68        /// Effect size estimate (shift and tail components).
69        effect: EffectEstimate,
70
71        /// Exploitability assessment based on effect magnitude.
72        exploitability: Exploitability,
73
74        /// Number of samples used in the analysis.
75        samples_used: usize,
76
77        /// Measurement quality assessment.
78        quality: MeasurementQuality,
79
80        /// Diagnostic information for debugging.
81        diagnostics: Diagnostics,
82
83        /// User's requested threshold in nanoseconds.
84        theta_user: f64,
85
86        /// Effective threshold used for inference (may be elevated due to measurement floor).
87        theta_eff: f64,
88
89        /// Measurement floor at final sample count.
90        theta_floor: f64,
91    },
92
93    /// Cannot reach a definitive conclusion.
94    ///
95    /// The posterior probability is between pass_threshold and fail_threshold,
96    /// or the analysis hit a limit (timeout, sample budget, noise).
97    Inconclusive {
98        /// Reason why the result is inconclusive.
99        reason: InconclusiveReason,
100
101        /// Current posterior probability of timing leak.
102        leak_probability: f64,
103
104        /// Effect size estimate (may have wide credible intervals).
105        effect: EffectEstimate,
106
107        /// Number of samples used in the analysis.
108        samples_used: usize,
109
110        /// Measurement quality assessment.
111        quality: MeasurementQuality,
112
113        /// Diagnostic information for debugging.
114        diagnostics: Diagnostics,
115
116        /// User's requested threshold in nanoseconds.
117        theta_user: f64,
118
119        /// Effective threshold used for inference (may be elevated due to measurement floor).
120        theta_eff: f64,
121
122        /// Measurement floor at final sample count.
123        theta_floor: f64,
124    },
125
126    /// Operation too fast to measure reliably on this platform.
127    ///
128    /// The operation completes faster than the timer's resolution allows
129    /// for meaningful measurement, even with adaptive batching.
130    Unmeasurable {
131        /// Estimated operation duration in nanoseconds.
132        operation_ns: f64,
133
134        /// Minimum measurable duration on this platform.
135        threshold_ns: f64,
136
137        /// Platform description (e.g., "Apple Silicon (cntvct)").
138        platform: String,
139
140        /// Suggested actions to make the operation measurable.
141        recommendation: String,
142    },
143
144    /// Research mode result.
145    ///
146    /// Returned when using `AttackerModel::Research`. Unlike Pass/Fail/Inconclusive
147    /// which make threshold-based decisions, research mode characterizes the
148    /// timing behavior relative to the measurement floor using CI-based semantics.
149    ///
150    /// See `ResearchOutcome` for details on the stopping conditions.
151    Research(ResearchOutcome),
152}
153
154// ============================================================================
155// InconclusiveReason - Why we couldn't reach a conclusion
156// ============================================================================
157
158/// Reason why a timing test result is inconclusive.
159///
160/// See spec Section 4.1 (Result Types).
161#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
162pub enum InconclusiveReason {
163    /// Data is too noisy to reach a conclusion.
164    ///
165    /// The measurement noise is high enough that we cannot distinguish
166    /// between "no leak" and "small leak" with the available samples.
167    DataTooNoisy {
168        /// Human-readable explanation.
169        message: String,
170        /// Suggested actions to improve measurement quality.
171        guidance: String,
172    },
173
174    /// Posterior is not converging toward either threshold.
175    ///
176    /// After collecting samples, the leak probability remains in the
177    /// inconclusive range and isn't trending toward pass or fail.
178    NotLearning {
179        /// Human-readable explanation.
180        message: String,
181        /// Suggested actions.
182        guidance: String,
183    },
184
185    /// Reaching a conclusion would take too long.
186    ///
187    /// Based on current convergence rate, reaching the pass or fail
188    /// threshold would exceed the configured time budget.
189    WouldTakeTooLong {
190        /// Estimated time in seconds to reach a conclusion.
191        estimated_time_secs: f64,
192        /// Estimated samples needed to reach a conclusion.
193        samples_needed: usize,
194        /// Suggested actions.
195        guidance: String,
196    },
197
198    /// Time budget exhausted.
199    ///
200    /// The configured time limit was reached before the posterior
201    /// converged to a conclusive result.
202    TimeBudgetExceeded {
203        /// Posterior probability when budget was exhausted.
204        current_probability: f64,
205        /// Number of samples collected.
206        samples_collected: usize,
207    },
208
209    /// Sample budget exhausted.
210    ///
211    /// The maximum number of samples was collected without reaching
212    /// a conclusive result.
213    SampleBudgetExceeded {
214        /// Posterior probability when budget was exhausted.
215        current_probability: f64,
216        /// Number of samples collected.
217        samples_collected: usize,
218    },
219
220    /// Measurement conditions changed during the test.
221    ///
222    /// Detected by comparing calibration statistics with post-test statistics.
223    /// This can indicate environmental interference (CPU frequency scaling,
224    /// concurrent processes, etc.) that invalidates the covariance estimate.
225    /// See spec Section 2.6, Gate 6.
226    ConditionsChanged {
227        /// Human-readable explanation.
228        message: String,
229        /// Suggested actions.
230        guidance: String,
231    },
232
233    /// Threshold was elevated and pass criterion was met at effective threshold.
234    ///
235    /// The measurement floor exceeded the user's requested threshold, so inference
236    /// was performed at an elevated effective threshold. The posterior probability
237    /// dropped below pass_threshold at θ_eff, but since θ_eff > θ_user + ε, we
238    /// cannot guarantee the user's original requirement is met.
239    ///
240    /// This is NOT a quality gate failure - it's a semantic constraint: Pass requires
241    /// both P < pass_threshold AND θ_eff ≤ θ_user + ε.
242    ///
243    /// See spec Section 3.5.3 (v5.5 Threshold Elevation Decision Rule).
244    ThresholdElevated {
245        /// User's requested threshold in nanoseconds (θ_user).
246        theta_user: f64,
247        /// Effective threshold used for inference (θ_eff = max(θ_user, θ_floor)).
248        theta_eff: f64,
249        /// Posterior probability at θ_eff (was < pass_threshold).
250        leak_probability_at_eff: f64,
251        /// True: P(leak > θ_eff) < pass_threshold (pass criterion met at elevated threshold).
252        meets_pass_criterion_at_eff: bool,
253        /// True: θ_floor at max_samples would be ≤ θ_user + ε (more samples could achieve user threshold).
254        achievable_at_max: bool,
255        /// Human-readable explanation.
256        message: String,
257        /// Suggested actions.
258        guidance: String,
259    },
260}
261
262// ============================================================================
263// EffectEstimate - Decomposed timing effect
264// ============================================================================
265
266/// Estimated timing effect decomposed into shift and tail components.
267///
268/// The effect is decomposed using a 2-component linear model:
269/// - **Shift**: Uniform timing difference across all quantiles (e.g., different code path)
270/// - **Tail**: Upper quantiles shift more than lower (e.g., cache misses)
271///
272/// See spec Section 2.5 (Bayesian Inference).
273#[derive(Debug, Clone, Serialize, Deserialize)]
274pub struct EffectEstimate {
275    /// Uniform shift in nanoseconds.
276    ///
277    /// Positive value means the sample class is slower (timing leak detected).
278    /// Negative value means the sample class is faster (no leak, or unusual).
279    /// This captures effects like branch timing where all operations
280    /// take a fixed additional time.
281    pub shift_ns: f64,
282
283    /// Tail effect in nanoseconds.
284    ///
285    /// Positive value means the sample class has a heavier upper tail.
286    /// This captures effects like cache misses that occur probabilistically.
287    pub tail_ns: f64,
288
289    /// 95% credible interval for the total effect magnitude in nanoseconds.
290    ///
291    /// This is a Bayesian credible interval, not a frequentist confidence interval.
292    /// There is a 95% posterior probability that the true effect lies within this range.
293    pub credible_interval_ns: (f64, f64),
294
295    /// Classification of the dominant effect pattern.
296    pub pattern: EffectPattern,
297
298    /// When Some, the (μ, τ) decomposition may be unreliable.
299    ///
300    /// This is set when model fit is poor (Q > q_thresh), indicating that the
301    /// observed quantile pattern is not well-explained by the shift+tail basis.
302    /// The shift_ns and tail_ns values should be interpreted with caution.
303    #[serde(skip_serializing_if = "Option::is_none")]
304    pub interpretation_caveat: Option<String>,
305}
306
307impl EffectEstimate {
308    /// Compute the total effect magnitude (L2 norm of shift and tail).
309    #[cfg(feature = "std")]
310    pub fn total_effect_ns(&self) -> f64 {
311        (self.shift_ns.powi(2) + self.tail_ns.powi(2)).sqrt()
312    }
313
314    /// Compute the total effect magnitude (L2 norm of shift and tail).
315    #[cfg(not(feature = "std"))]
316    pub fn total_effect_ns(&self) -> f64 {
317        libm::sqrt(self.shift_ns * self.shift_ns + self.tail_ns * self.tail_ns)
318    }
319
320    /// Check if the effect is negligible (both components near zero).
321    pub fn is_negligible(&self, threshold_ns: f64) -> bool {
322        self.shift_ns.abs() < threshold_ns && self.tail_ns.abs() < threshold_ns
323    }
324}
325
326impl Default for EffectEstimate {
327    fn default() -> Self {
328        Self {
329            shift_ns: 0.0,
330            tail_ns: 0.0,
331            credible_interval_ns: (0.0, 0.0),
332            pattern: EffectPattern::Indeterminate,
333            interpretation_caveat: None,
334        }
335    }
336}
337
338// ============================================================================
339// EffectPattern - Classification of timing effect type
340// ============================================================================
341
342/// Pattern of timing difference.
343///
344/// Classifies the dominant type of timing difference based on the
345/// relative magnitudes of shift and tail components.
346#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
347pub enum EffectPattern {
348    /// Uniform shift across all quantiles.
349    ///
350    /// All quantiles shift by approximately the same amount.
351    /// Typical cause: branch on secret data, different code path.
352    UniformShift,
353
354    /// Primarily affects upper tail.
355    ///
356    /// Upper quantiles (e.g., 80th, 90th percentile) shift more than
357    /// lower quantiles. Typical cause: cache misses, memory access patterns.
358    TailEffect,
359
360    /// Mixed pattern with both shift and tail components.
361    ///
362    /// Both uniform shift and tail effect are significant.
363    Mixed,
364
365    /// Neither shift nor tail is statistically significant.
366    ///
367    /// The effect magnitude is below the detection threshold or
368    /// uncertainty is too high to classify.
369    #[default]
370    Indeterminate,
371}
372
373// ============================================================================
374// Exploitability - Risk assessment
375// ============================================================================
376
377/// Exploitability assessment based on effect magnitude.
378///
379/// Based on Crosby et al. (2009) thresholds for timing attack feasibility.
380/// These thresholds are heuristics based on academic research for risk
381/// prioritization, not guarantees. The thresholds reflect modern attack
382/// techniques including HTTP/2 multiplexing (Timeless Timing Attacks) and
383/// shared-hardware attacks (KyberSlash, Flush+Reload).
384///
385/// See spec Section 5.4 (Exploitability).
386#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
387pub enum Exploitability {
388    /// Effect < 10 ns: Requires shared hardware to exploit.
389    ///
390    /// Only exploitable by attackers with physical co-location: SGX enclaves,
391    /// hyperthreading on same core, containers on same host, or cross-VM on
392    /// shared cache. Remote exploitation is impractical.
393    ///
394    /// References: KyberSlash (2024), Flush+Reload, Prime+Probe literature
395    SharedHardwareOnly,
396
397    /// 10-100 ns: Exploitable via HTTP/2 request multiplexing.
398    ///
399    /// Requires ~100k concurrent HTTP/2 requests to exploit. The "Timeless
400    /// Timing Attacks" technique eliminates network jitter by sending requests
401    /// that arrive simultaneously, making response order reveal timing differences.
402    ///
403    /// Reference: Van Goethem et al., "Timeless Timing Attacks" (USENIX Security 2020)
404    Http2Multiplexing,
405
406    /// 100 ns - 10 μs: Exploitable with standard remote timing.
407    ///
408    /// Requires ~1k-10k requests using traditional timing techniques.
409    /// Exploitable on LAN with any protocol, or over internet with HTTP/2.
410    ///
411    /// References: Crosby et al. (2009), Brumley & Boneh (2005)
412    StandardRemote,
413
414    /// > 10 μs: Obvious timing leak, trivially exploitable.
415    ///
416    /// Detectable with < 100 requests. Exploitable over the internet even
417    /// with high-jitter connections using traditional timing techniques.
418    ObviousLeak,
419}
420
421impl Exploitability {
422    /// Determine exploitability from effect size in nanoseconds.
423    ///
424    /// Thresholds are based on:
425    /// - < 10 ns: Below HTTP/2 timing precision, requires shared hardware
426    /// - 10-100 ns: Within HTTP/2 "Timeless Timing Attacks" range
427    /// - 100 ns - 10 μs: Standard remote timing attack range
428    /// - > 10 μs: Trivially observable
429    pub fn from_effect_ns(effect_ns: f64) -> Self {
430        let effect_ns = effect_ns.abs();
431        if effect_ns < 10.0 {
432            Exploitability::SharedHardwareOnly
433        } else if effect_ns < 100.0 {
434            Exploitability::Http2Multiplexing
435        } else if effect_ns < 10_000.0 {
436            Exploitability::StandardRemote
437        } else {
438            Exploitability::ObviousLeak
439        }
440    }
441}
442
443// ============================================================================
444// MeasurementQuality - Assessment of measurement reliability
445// ============================================================================
446
447/// Measurement quality assessment based on noise level.
448///
449/// Quality is determined primarily by the minimum detectable effect (MDE)
450/// relative to the configured threshold.
451///
452/// See spec Section 5.5 (Quality Assessment).
453#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
454pub enum MeasurementQuality {
455    /// Low noise, high confidence (MDE < 5 ns).
456    Excellent,
457
458    /// Normal noise levels (MDE 5-20 ns).
459    Good,
460
461    /// High noise, results less reliable (MDE 20-100 ns).
462    Poor,
463
464    /// Cannot produce meaningful results (MDE > 100 ns).
465    TooNoisy,
466}
467
468impl MeasurementQuality {
469    /// Determine quality from minimum detectable effect.
470    ///
471    /// Invalid MDE values (less than or equal to 0 or non-finite) indicate a measurement problem
472    /// and are classified as `TooNoisy`.
473    ///
474    /// Very small MDE (< 0.01 ns) also indicates timer resolution issues
475    /// where most samples have identical values.
476    pub fn from_mde_ns(mde_ns: f64) -> Self {
477        // Invalid MDE indicates measurement failure
478        if mde_ns <= 0.01 || !mde_ns.is_finite() {
479            return MeasurementQuality::TooNoisy;
480        }
481
482        if mde_ns < 5.0 {
483            MeasurementQuality::Excellent
484        } else if mde_ns < 20.0 {
485            MeasurementQuality::Good
486        } else if mde_ns < 100.0 {
487            MeasurementQuality::Poor
488        } else {
489            MeasurementQuality::TooNoisy
490        }
491    }
492}
493
494// ============================================================================
495// ResearchOutcome - Result type for research mode
496// ============================================================================
497
498/// Status of a research mode run.
499///
500/// Research mode (AttackerModel::Research) doesn't make Pass/Fail decisions.
501/// Instead, it characterizes the timing behavior with respect to the measurement floor.
502#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
503pub enum ResearchStatus {
504    /// CI clearly above θ_floor — timing difference detected.
505    ///
506    /// The 95% credible interval lower bound is clearly above the measurement
507    /// floor (> 1.1 × θ_floor), indicating a confidently detectable effect.
508    EffectDetected,
509
510    /// CI clearly below θ_floor — no timing difference above noise.
511    ///
512    /// The 95% credible interval upper bound is clearly below the measurement
513    /// floor (< 0.9 × θ_floor), indicating no detectable effect.
514    NoEffectDetected,
515
516    /// Hit timer resolution limit; θ_floor is as good as it gets.
517    ///
518    /// Further sampling won't improve the measurement floor because we've
519    /// hit the fundamental timer tick resolution.
520    ResolutionLimitReached,
521
522    /// Data quality issue detected.
523    ///
524    /// A quality gate triggered during research mode. Unlike standard mode,
525    /// this doesn't block the result but is reported for transparency.
526    QualityIssue(InconclusiveReason),
527
528    /// Ran out of time/samples before reaching conclusion.
529    ///
530    /// The budget was exhausted before the CI could confidently settle
531    /// above or below the measurement floor.
532    BudgetExhausted,
533}
534
535/// Research mode outcome (spec v4.1 research mode).
536///
537/// This struct is returned when using `AttackerModel::Research`. Unlike the
538/// standard `Outcome` which makes Pass/Fail decisions, research mode characterizes
539/// the timing behavior relative to the measurement floor.
540///
541/// Key differences from standard mode:
542/// - No Pass/Fail verdict (no threshold comparison)
543/// - Reports measurement floor (`theta_floor`) at final sample size
544/// - `detectable` field indicates if CI lower bound > floor
545/// - `model_mismatch` is non-blocking (tracked but doesn't stop analysis)
546#[derive(Debug, Clone, Serialize, Deserialize)]
547pub struct ResearchOutcome {
548    /// Research outcome status.
549    pub status: ResearchStatus,
550
551    /// Maximum effect across quantiles: max_k |(Xβ)_k| in nanoseconds.
552    /// This is the posterior mean of the maximum absolute predicted effect.
553    pub max_effect_ns: f64,
554
555    /// 95% credible interval for maximum effect: (2.5th, 97.5th percentile).
556    pub max_effect_ci: (f64, f64),
557
558    /// Measurement floor at final sample size.
559    /// This is the minimum detectable effect given measurement noise.
560    pub theta_floor: f64,
561
562    /// True if the effect is detectable: CI lower bound > theta_floor.
563    pub detectable: bool,
564
565    /// True if model mismatch was detected (Q > q_thresh).
566    /// In research mode, this is non-blocking but adds a caveat to interpretation.
567    pub model_mismatch: bool,
568
569    /// Effect size estimate with decomposition.
570    /// If `model_mismatch` is true, `interpretation_caveat` will be set.
571    pub effect: EffectEstimate,
572
573    /// Number of samples used.
574    pub samples_used: usize,
575
576    /// Measurement quality assessment.
577    pub quality: MeasurementQuality,
578
579    /// Diagnostic information.
580    pub diagnostics: Diagnostics,
581}
582
583impl ResearchOutcome {
584    /// Check if a timing effect was confidently detected.
585    pub fn is_effect_detected(&self) -> bool {
586        matches!(self.status, ResearchStatus::EffectDetected)
587    }
588
589    /// Check if no effect was confidently detected.
590    pub fn is_no_effect_detected(&self) -> bool {
591        matches!(self.status, ResearchStatus::NoEffectDetected)
592    }
593
594    /// Check if the resolution limit was reached.
595    pub fn is_resolution_limit_reached(&self) -> bool {
596        matches!(self.status, ResearchStatus::ResolutionLimitReached)
597    }
598
599    /// Check if there was a quality issue.
600    pub fn has_quality_issue(&self) -> bool {
601        matches!(self.status, ResearchStatus::QualityIssue(_))
602    }
603
604    /// Get the effect estimate.
605    pub fn effect(&self) -> &EffectEstimate {
606        &self.effect
607    }
608
609    /// Get the measurement quality.
610    pub fn quality(&self) -> MeasurementQuality {
611        self.quality
612    }
613
614    /// Get the diagnostics.
615    pub fn diagnostics(&self) -> &Diagnostics {
616        &self.diagnostics
617    }
618}
619
620impl fmt::Display for ResearchStatus {
621    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
622        match self {
623            ResearchStatus::EffectDetected => write!(f, "effect detected"),
624            ResearchStatus::NoEffectDetected => write!(f, "no effect detected"),
625            ResearchStatus::ResolutionLimitReached => write!(f, "resolution limit reached"),
626            ResearchStatus::QualityIssue(reason) => write!(f, "quality issue: {}", reason),
627            ResearchStatus::BudgetExhausted => write!(f, "budget exhausted"),
628        }
629    }
630}
631
632impl fmt::Display for ResearchOutcome {
633    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
634        writeln!(f, "Research Mode: {}", self.status)?;
635        writeln!(
636            f,
637            "  Max effect: {:.2}ns (CI: {:.2}-{:.2}ns)",
638            self.max_effect_ns, self.max_effect_ci.0, self.max_effect_ci.1
639        )?;
640        writeln!(f, "  Measurement floor: {:.2}ns", self.theta_floor)?;
641        writeln!(
642            f,
643            "  Detectable: {}",
644            if self.detectable { "yes" } else { "no" }
645        )?;
646        if self.model_mismatch {
647            writeln!(f, "  Warning: model mismatch detected")?;
648        }
649        writeln!(f, "  Samples: {}", self.samples_used)?;
650        writeln!(f, "  Quality: {}", self.quality)?;
651        Ok(())
652    }
653}
654
655// ============================================================================
656// TopQuantile - Information about significant quantiles
657// ============================================================================
658
659/// Information about a significant quantile (for projection mismatch reporting).
660///
661/// When the 2D (shift, tail) projection doesn't fit the data well, this struct
662/// provides information about which individual quantiles are driving the leak
663/// detection. This helps diagnose effects that don't fit the shift+tail model
664/// (e.g., effects concentrated at a single quantile).
665///
666/// See spec Section 7.5 (Per-Quantile Exceedance).
667#[derive(Debug, Clone, Serialize, Deserialize)]
668pub struct TopQuantile {
669    /// Quantile probability (e.g., 0.9 for 90th percentile).
670    pub quantile_p: f64,
671
672    /// Posterior mean δ_k in nanoseconds.
673    pub mean_ns: f64,
674
675    /// 95% marginal credible interval (lower, upper) in nanoseconds.
676    pub ci95_ns: (f64, f64),
677
678    /// P(|δ_k| > θ_eff | Δ) - per-quantile exceedance probability.
679    ///
680    /// This is the probability that this individual quantile's effect
681    /// exceeds the threshold, computed from the marginal posterior.
682    pub exceed_prob: f64,
683}
684
685impl TopQuantile {
686    /// Create a new TopQuantile entry.
687    pub fn new(quantile_p: f64, mean_ns: f64, ci95_ns: (f64, f64), exceed_prob: f64) -> Self {
688        Self {
689            quantile_p,
690            mean_ns,
691            ci95_ns,
692            exceed_prob,
693        }
694    }
695}
696
697// ============================================================================
698// Diagnostics - Detailed diagnostic information
699// ============================================================================
700
701/// Diagnostic information for debugging and analysis.
702///
703/// See spec Section 4.1 (Result Types).
704#[derive(Debug, Clone, Serialize, Deserialize)]
705pub struct Diagnostics {
706    /// Block size used for bootstrap (Politis-White automatic selection).
707    pub dependence_length: usize,
708
709    /// Effective sample size accounting for autocorrelation (ESS approx n / dependence_length).
710    pub effective_sample_size: usize,
711
712    /// Non-stationarity: ratio of inference to calibration variance.
713    /// Values 0.5-2.0 are normal; >5.0 indicates non-stationarity.
714    pub stationarity_ratio: f64,
715
716    /// True if stationarity ratio is within acceptable bounds (0.5-2.0).
717    pub stationarity_ok: bool,
718
719    /// Projection mismatch Q statistic.
720    ///
721    /// Measures how well the 2D (shift, tail) model fits the 9D quantile differences.
722    /// A high value indicates the effect is concentrated at specific quantiles
723    /// rather than following the shift+tail pattern.
724    pub projection_mismatch_q: f64,
725
726    /// Bootstrap-calibrated threshold for projection mismatch Q statistic.
727    /// Q > threshold indicates the 2D projection may be unreliable.
728    pub projection_mismatch_threshold: f64,
729
730    /// True if projection fits well (Q <= threshold).
731    ///
732    /// When false, the shift_ns and tail_ns estimates should be interpreted
733    /// with caution; use top_quantiles for more detailed information.
734    pub projection_mismatch_ok: bool,
735
736    /// Top quantiles by exceedance probability (when projection mismatch detected).
737    ///
738    /// When projection_mismatch_ok is false, this field contains detailed
739    /// information about which quantiles drive the leak detection. This helps
740    /// diagnose effects that don't fit the shift+tail model (e.g., effects
741    /// concentrated at a single quantile).
742    #[serde(skip_serializing_if = "Option::is_none")]
743    pub top_quantiles: Option<Vec<TopQuantile>>,
744
745    /// Outlier rate for baseline class (fraction trimmed).
746    pub outlier_rate_baseline: f64,
747
748    /// Outlier rate for sample class (fraction trimmed).
749    pub outlier_rate_sample: f64,
750
751    /// True if outlier rates are symmetric (both <1%, ratio <3x, diff <2%).
752    pub outlier_asymmetry_ok: bool,
753
754    /// Whether discrete timer mode was used (low timer resolution).
755    pub discrete_mode: bool,
756
757    /// Timer resolution in nanoseconds.
758    pub timer_resolution_ns: f64,
759
760    /// Fraction of samples with duplicate timing values (0.0-1.0).
761    pub duplicate_fraction: f64,
762
763    /// True if preflight checks passed (sanity, generator, system).
764    pub preflight_ok: bool,
765
766    /// Number of samples used for calibration (covariance estimation).
767    pub calibration_samples: usize,
768
769    /// Total time spent on the analysis in seconds.
770    pub total_time_secs: f64,
771
772    /// Human-readable warnings (empty if all checks pass).
773    pub warnings: Vec<String>,
774
775    /// Quality issues detected during measurement.
776    pub quality_issues: Vec<QualityIssue>,
777
778    /// Preflight warnings from calibration phase.
779    ///
780    /// These warnings are categorized by severity:
781    /// - `Informational`: Sampling efficiency issues (results still valid)
782    /// - `ResultUndermining`: Statistical assumption violations (results may be unreliable)
783    pub preflight_warnings: Vec<PreflightWarningInfo>,
784
785    // =========================================================================
786    // Reproduction info (for verbose/debug output)
787    // =========================================================================
788    /// Measurement seed used for reproducibility.
789    pub seed: Option<u64>,
790
791    /// Attacker model name (e.g., "AdjacentNetwork", "SharedHardware").
792    pub attacker_model: Option<String>,
793
794    /// Effect threshold (theta) in nanoseconds.
795    pub threshold_ns: f64,
796
797    /// Timer implementation name (e.g., "rdtsc", "cntvct_el0", "kperf").
798    pub timer_name: String,
799
800    /// Platform description (e.g., "macos-aarch64").
801    pub platform: String,
802
803    /// Reason the timer fell back from high-precision PMU (if applicable).
804    ///
805    /// Used to generate context-aware recommendations in output.
806    /// - "concurrent access": kperf locked by another process
807    /// - "no sudo": not running with elevated privileges
808    /// - "unavailable": PMU init failed despite privileges
809    /// - None: using high-precision timer or x86_64 (rdtsc is already ~0.3ns)
810    #[serde(skip_serializing_if = "Option::is_none")]
811    pub timer_fallback_reason: Option<String>,
812
813    // =========================================================================
814    // v5.4 Gibbs sampler diagnostics
815    // =========================================================================
816
817    /// v5.4: Total number of Gibbs iterations.
818    pub gibbs_iters_total: usize,
819
820    /// v5.4: Number of burn-in iterations.
821    pub gibbs_burnin: usize,
822
823    /// v5.4: Number of retained samples.
824    pub gibbs_retained: usize,
825
826    /// v5.4: Posterior mean of latent scale λ.
827    pub lambda_mean: f64,
828
829    /// v5.4: Posterior standard deviation of λ.
830    pub lambda_sd: f64,
831
832    /// v5.4: Coefficient of variation of λ (λ_sd / λ_mean).
833    pub lambda_cv: f64,
834
835    /// v5.4: Effective sample size of λ chain.
836    pub lambda_ess: f64,
837
838    /// v5.4: Whether λ chain mixed well (CV ≥ 0.1 AND ESS ≥ 20).
839    pub lambda_mixing_ok: bool,
840
841    // =========================================================================
842    // v5.6 Gibbs sampler κ (kappa) diagnostics - robust t-likelihood
843    // =========================================================================
844
845    /// v5.6: Posterior mean of likelihood precision κ.
846    pub kappa_mean: f64,
847
848    /// v5.6: Posterior standard deviation of κ.
849    pub kappa_sd: f64,
850
851    /// v5.6: Coefficient of variation of κ (kappa_sd / kappa_mean).
852    pub kappa_cv: f64,
853
854    /// v5.6: Effective sample size of κ chain.
855    pub kappa_ess: f64,
856
857    /// v5.6: Whether κ chain mixed well (CV ≥ 0.1 AND ESS ≥ 20).
858    pub kappa_mixing_ok: bool,
859}
860
861impl Diagnostics {
862    /// Create diagnostics indicating all checks passed.
863    ///
864    /// Uses placeholder values for numeric fields; prefer constructing
865    /// explicitly with actual measured values.
866    pub fn all_ok() -> Self {
867        Self {
868            dependence_length: 1,
869            effective_sample_size: 0,
870            stationarity_ratio: 1.0,
871            stationarity_ok: true,
872            projection_mismatch_q: 0.0,
873            projection_mismatch_threshold: 18.48, // chi-squared(7, 0.99) as default
874            projection_mismatch_ok: true,
875            top_quantiles: None,
876            outlier_rate_baseline: 0.0,
877            outlier_rate_sample: 0.0,
878            outlier_asymmetry_ok: true,
879            discrete_mode: false,
880            timer_resolution_ns: 1.0,
881            duplicate_fraction: 0.0,
882            preflight_ok: true,
883            calibration_samples: 0,
884            total_time_secs: 0.0,
885            warnings: Vec::new(),
886            quality_issues: Vec::new(),
887            preflight_warnings: Vec::new(),
888            seed: None,
889            attacker_model: None,
890            threshold_ns: 0.0,
891            timer_name: String::new(),
892            platform: String::new(),
893            timer_fallback_reason: None,
894            // v5.4 Gibbs sampler diagnostics
895            gibbs_iters_total: 256,
896            gibbs_burnin: 64,
897            gibbs_retained: 192,
898            lambda_mean: 1.0,
899            lambda_sd: 0.0,
900            lambda_cv: 0.0,
901            lambda_ess: 0.0,
902            lambda_mixing_ok: true,
903            // v5.6 kappa diagnostics
904            kappa_mean: 1.0,
905            kappa_sd: 0.0,
906            kappa_cv: 0.0,
907            kappa_ess: 0.0,
908            kappa_mixing_ok: true,
909        }
910    }
911
912    /// Check if all diagnostics are OK.
913    pub fn all_checks_passed(&self) -> bool {
914        self.stationarity_ok
915            && self.projection_mismatch_ok
916            && self.outlier_asymmetry_ok
917            && self.preflight_ok
918    }
919}
920
921impl Default for Diagnostics {
922    fn default() -> Self {
923        Self::all_ok()
924    }
925}
926
927// ============================================================================
928// QualityIssue - Specific quality problems
929// ============================================================================
930
931/// A specific quality issue detected during measurement.
932#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
933pub struct QualityIssue {
934    /// Issue code for programmatic handling.
935    pub code: IssueCode,
936
937    /// Human-readable description of the issue.
938    pub message: String,
939
940    /// Suggested actions to address the issue.
941    pub guidance: String,
942}
943
944/// Issue codes for programmatic handling of quality problems.
945#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
946pub enum IssueCode {
947    /// High autocorrelation reduces effective sample size.
948    HighDependence,
949
950    /// Effective sample size is too low for reliable inference.
951    LowEffectiveSamples,
952
953    /// Timing distribution appears to drift during measurement.
954    StationaritySuspect,
955
956    /// Timer has low resolution, using discrete mode.
957    DiscreteTimer,
958
959    /// Sample count is small for discrete mode bootstrap.
960    SmallSampleDiscrete,
961
962    /// Generator cost differs between classes.
963    HighGeneratorCost,
964
965    /// Low entropy in random inputs (possible API misuse).
966    LowUniqueInputs,
967
968    /// Some quantiles were filtered from analysis.
969    QuantilesFiltered,
970
971    /// Threshold was clamped to timer resolution.
972    ThresholdClamped,
973
974    /// High fraction of samples were winsorized.
975    HighWinsorRate,
976
977    /// User's threshold was elevated due to measurement floor.
978    ThresholdElevated,
979
980    /// Slab component dominates posterior (v5.2 mixture prior).
981    ///
982    /// The wide "slab" prior component has higher posterior weight than the
983    /// narrow component, indicating evidence strongly favors a large effect
984    /// (well above the threshold). This is informational, not a problem.
985    ///
986    /// DEPRECATED: v5.4 uses Student's t prior with Gibbs sampling instead
987    /// of mixture prior. This code is kept for backwards compatibility.
988    SlabDominant,
989
990    /// v5.4: Gibbs sampler's lambda chain did not mix well.
991    ///
992    /// The latent scale variable λ showed poor mixing (CV < 0.1 or ESS < 20),
993    /// indicating the posterior may be unreliable. This typically occurs with
994    /// very small or very large effects where the posterior is concentrated.
995    LambdaMixingPoor,
996
997    /// v5.6: Gibbs sampler's kappa chain did not mix well.
998    ///
999    /// The likelihood precision variable κ showed poor mixing (CV < 0.1 or ESS < 20),
1000    /// indicating the posterior may be unreliable.
1001    KappaMixingPoor,
1002
1003    /// v5.6: Likelihood covariance was inflated (kappa_mean < 0.3).
1004    ///
1005    /// The robust t-likelihood inflated covariance by ~1/κ_mean to accommodate
1006    /// data that doesn't match the estimated Σₙ. Effect estimates remain valid
1007    /// but uncertainty was increased for robustness.
1008    LikelihoodInflated,
1009}
1010
1011// ============================================================================
1012// PreflightWarning - Preflight check results
1013// ============================================================================
1014
1015/// Severity of a preflight warning.
1016///
1017/// This distinction is critical for interpreting results:
1018///
1019/// - **Informational**: Affects sampling efficiency but not result validity.
1020///   The Bayesian posterior is still trustworthy; you just needed more samples
1021///   to reach the same confidence level. Examples: high autocorrelation,
1022///   coarse timer resolution, suboptimal CPU governor.
1023///
1024/// - **ResultUndermining**: Violates statistical assumptions the Bayesian model
1025///   relies on. The posterior confidence may be misplaced because the model's
1026///   assumptions don't hold. Examples: non-monotonic timer (measurements are
1027///   garbage), severe non-stationarity (distribution changed during measurement),
1028///   broken harness with mutable state (Fixed-vs-Fixed inconsistency).
1029#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
1030pub enum PreflightSeverity {
1031    /// Sampling efficiency issue - doesn't invalidate results.
1032    ///
1033    /// These warnings indicate that the measurement setup is suboptimal and
1034    /// required more samples to reach a conclusion, but the Bayesian posterior
1035    /// is still valid. The result can be trusted.
1036    ///
1037    /// Examples:
1038    /// - High autocorrelation (reduces effective sample size)
1039    /// - Coarse timer resolution (requires more samples)
1040    /// - Suboptimal CPU governor (adds variance)
1041    /// - Generator cost asymmetry (may inflate differences but doesn't invalidate)
1042    Informational,
1043
1044    /// Statistical assumption violation - undermines result confidence.
1045    ///
1046    /// These warnings indicate that fundamental assumptions of the Bayesian
1047    /// model may be violated. Even if the posterior appears confident, that
1048    /// confidence may be misplaced.
1049    ///
1050    /// Examples:
1051    /// - Non-monotonic timer (measurements are meaningless)
1052    /// - Severe non-stationarity (distribution changed during measurement)
1053    /// - Fixed-vs-Fixed inconsistency with randomization (likely mutable state bug)
1054    ResultUndermining,
1055}
1056
1057/// Category of preflight check.
1058///
1059/// Used for organizing warnings in output and for programmatic filtering.
1060#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
1061pub enum PreflightCategory {
1062    /// Timer sanity checks (monotonicity, basic functionality).
1063    ///
1064    /// **Severity**: ResultUndermining if failed - measurements are unreliable.
1065    TimerSanity,
1066
1067    /// Fixed-vs-Fixed internal consistency check.
1068    ///
1069    /// **Severity**: ResultUndermining if triggered - may indicate mutable state
1070    /// captured in test closure, or severe environmental interference.
1071    /// Note: May be intentional for FPR validation testing.
1072    Sanity,
1073
1074    /// Autocorrelation in timing samples.
1075    ///
1076    /// **Severity**: Informational - reduces effective sample size but the
1077    /// block bootstrap accounts for this.
1078    Autocorrelation,
1079
1080    /// System configuration (CPU governor, turbo boost, etc.).
1081    ///
1082    /// **Severity**: Informational - suboptimal config adds variance but
1083    /// doesn't invalidate results.
1084    System,
1085
1086    /// Timer resolution and precision.
1087    ///
1088    /// **Severity**: Informational - coarse timers require more samples but
1089    /// adaptive batching compensates for this.
1090    Resolution,
1091
1092    /// Stationarity of timing distribution.
1093    ///
1094    /// **Severity**: ResultUndermining if severely violated - indicates the
1095    /// timing distribution changed during measurement.
1096    Stationarity,
1097}
1098
1099/// Information about a preflight warning.
1100///
1101/// Preflight warnings are collected during the calibration phase and reported
1102/// to help users understand measurement quality and potential issues.
1103#[derive(Debug, Clone, Serialize, Deserialize)]
1104pub struct PreflightWarningInfo {
1105    /// Category of the check that generated this warning.
1106    pub category: PreflightCategory,
1107
1108    /// Severity of this warning.
1109    ///
1110    /// - `Informational`: Sampling efficiency issue, results still valid.
1111    /// - `ResultUndermining`: Statistical assumption violation, results may be unreliable.
1112    pub severity: PreflightSeverity,
1113
1114    /// Human-readable description of the warning.
1115    pub message: String,
1116
1117    /// Optional guidance for addressing the issue.
1118    pub guidance: Option<String>,
1119}
1120
1121impl PreflightWarningInfo {
1122    /// Create a new preflight warning.
1123    pub fn new(
1124        category: PreflightCategory,
1125        severity: PreflightSeverity,
1126        message: impl Into<String>,
1127    ) -> Self {
1128        Self {
1129            category,
1130            severity,
1131            message: message.into(),
1132            guidance: None,
1133        }
1134    }
1135
1136    /// Create a new preflight warning with guidance.
1137    pub fn with_guidance(
1138        category: PreflightCategory,
1139        severity: PreflightSeverity,
1140        message: impl Into<String>,
1141        guidance: impl Into<String>,
1142    ) -> Self {
1143        Self {
1144            category,
1145            severity,
1146            message: message.into(),
1147            guidance: Some(guidance.into()),
1148        }
1149    }
1150
1151    /// Check if this warning undermines result confidence.
1152    pub fn is_result_undermining(&self) -> bool {
1153        self.severity == PreflightSeverity::ResultUndermining
1154    }
1155}
1156
1157// ============================================================================
1158// MinDetectableEffect - Sensitivity information
1159// ============================================================================
1160
1161/// Minimum detectable effect at current noise level.
1162///
1163/// The MDE tells you the smallest effect that could be reliably detected
1164/// given the measurement noise. If MDE > threshold, a "pass" result means
1165/// insufficient sensitivity, not necessarily safety.
1166///
1167/// See spec Section 2.7 (Minimum Detectable Effect).
1168#[derive(Debug, Clone, Serialize, Deserialize)]
1169pub struct MinDetectableEffect {
1170    /// Minimum detectable uniform shift in nanoseconds.
1171    pub shift_ns: f64,
1172
1173    /// Minimum detectable tail effect in nanoseconds.
1174    pub tail_ns: f64,
1175}
1176
1177impl Default for MinDetectableEffect {
1178    fn default() -> Self {
1179        Self {
1180            shift_ns: f64::INFINITY,
1181            tail_ns: f64::INFINITY,
1182        }
1183    }
1184}
1185
1186// ============================================================================
1187// BatchingInfo - Metadata about batching
1188// ============================================================================
1189
1190/// Information about batching configuration used during collection.
1191#[derive(Debug, Clone, Serialize, Deserialize)]
1192pub struct BatchingInfo {
1193    /// Whether batching was enabled.
1194    pub enabled: bool,
1195
1196    /// Iterations per batch (1 if batching disabled).
1197    pub k: u32,
1198
1199    /// Effective ticks per batch measurement.
1200    pub ticks_per_batch: f64,
1201
1202    /// Explanation of why batching was enabled/disabled.
1203    pub rationale: String,
1204
1205    /// Whether the operation was too fast to measure reliably.
1206    pub unmeasurable: Option<UnmeasurableInfo>,
1207}
1208
1209/// Information about why an operation is unmeasurable.
1210#[derive(Debug, Clone, Serialize, Deserialize)]
1211pub struct UnmeasurableInfo {
1212    /// Estimated operation duration in nanoseconds.
1213    pub operation_ns: f64,
1214
1215    /// Minimum measurable threshold in nanoseconds.
1216    pub threshold_ns: f64,
1217
1218    /// Ticks per call (below MIN_TICKS_SINGLE_CALL).
1219    pub ticks_per_call: f64,
1220}
1221
1222// ============================================================================
1223// Metadata - Runtime information
1224// ============================================================================
1225
1226/// Metadata for debugging and analysis.
1227#[derive(Debug, Clone, Serialize, Deserialize)]
1228pub struct Metadata {
1229    /// Samples per class after outlier filtering.
1230    pub samples_per_class: usize,
1231
1232    /// Cycles per nanosecond (for conversion).
1233    pub cycles_per_ns: f64,
1234
1235    /// Timer type used.
1236    pub timer: String,
1237
1238    /// Timer resolution in nanoseconds.
1239    pub timer_resolution_ns: f64,
1240
1241    /// Batching configuration and rationale.
1242    pub batching: BatchingInfo,
1243
1244    /// Total runtime in seconds.
1245    pub runtime_secs: f64,
1246}
1247
1248// ============================================================================
1249// UnreliablePolicy - How to handle unreliable results
1250// ============================================================================
1251
/// What a test assertion should do when a measurement is unreliable.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum UnreliablePolicy {
    /// Log a warning and skip the assertions, so the test passes.
    /// Use when: noisy CI, parallel tests, "some coverage is better than none".
    #[default]
    FailOpen,

    /// Panic, so the test fails.
    /// Use when: security-critical code, dedicated quiet CI runners.
    FailClosed,
}

impl UnreliablePolicy {
    /// Read the policy from the environment, falling back to `default`.
    ///
    /// The variable consulted is `TIMING_ORACLE_UNRELIABLE_POLICY`:
    /// - "fail_open" or "skip" → FailOpen
    /// - "fail_closed" or "panic" → FailClosed
    /// - unset, unreadable, or any other value → `default`
    #[cfg(feature = "std")]
    pub fn from_env_or(default: Self) -> Self {
        let raw = std::env::var("TIMING_ORACLE_UNRELIABLE_POLICY");
        match raw.as_deref() {
            Ok("fail_open" | "skip") => Self::FailOpen,
            Ok("fail_closed" | "panic") => Self::FailClosed,
            _ => default,
        }
    }

    /// Read the policy from the environment, falling back to `default`.
    ///
    /// Without the `std` feature there is no environment to consult, so this
    /// always returns `default`.
    #[cfg(not(feature = "std"))]
    pub fn from_env_or(default: Self) -> Self {
        default
    }
}
1292
1293// ============================================================================
1294// Outcome implementation
1295// ============================================================================
1296
1297impl Outcome {
1298    /// Check if the test passed (no timing leak detected).
1299    pub fn passed(&self) -> bool {
1300        matches!(self, Outcome::Pass { .. })
1301    }
1302
1303    /// Check if the test failed (timing leak detected).
1304    pub fn failed(&self) -> bool {
1305        matches!(self, Outcome::Fail { .. })
1306    }
1307
1308    /// Check if the result is conclusive (either Pass or Fail).
1309    pub fn is_conclusive(&self) -> bool {
1310        matches!(self, Outcome::Pass { .. } | Outcome::Fail { .. })
1311    }
1312
1313    /// Check if the operation was measurable.
1314    pub fn is_measurable(&self) -> bool {
1315        !matches!(self, Outcome::Unmeasurable { .. })
1316    }
1317
1318    /// Get the leak probability if available.
1319    ///
1320    /// Returns `None` for `Unmeasurable` and `Research` (research mode uses CI, not probability).
1321    pub fn leak_probability(&self) -> Option<f64> {
1322        match self {
1323            Outcome::Pass {
1324                leak_probability, ..
1325            } => Some(*leak_probability),
1326            Outcome::Fail {
1327                leak_probability, ..
1328            } => Some(*leak_probability),
1329            Outcome::Inconclusive {
1330                leak_probability, ..
1331            } => Some(*leak_probability),
1332            Outcome::Unmeasurable { .. } => None,
1333            Outcome::Research(_) => None, // Research mode uses CI-based semantics
1334        }
1335    }
1336
1337    /// Get the effect estimate if available.
1338    pub fn effect(&self) -> Option<&EffectEstimate> {
1339        match self {
1340            Outcome::Pass { effect, .. } => Some(effect),
1341            Outcome::Fail { effect, .. } => Some(effect),
1342            Outcome::Inconclusive { effect, .. } => Some(effect),
1343            Outcome::Unmeasurable { .. } => None,
1344            Outcome::Research(res) => Some(&res.effect),
1345        }
1346    }
1347
1348    /// Get the measurement quality if available.
1349    pub fn quality(&self) -> Option<MeasurementQuality> {
1350        match self {
1351            Outcome::Pass { quality, .. } => Some(*quality),
1352            Outcome::Fail { quality, .. } => Some(*quality),
1353            Outcome::Inconclusive { quality, .. } => Some(*quality),
1354            Outcome::Unmeasurable { .. } => None,
1355            Outcome::Research(res) => Some(res.quality),
1356        }
1357    }
1358
1359    /// Get the diagnostics if available.
1360    pub fn diagnostics(&self) -> Option<&Diagnostics> {
1361        match self {
1362            Outcome::Pass { diagnostics, .. } => Some(diagnostics),
1363            Outcome::Fail { diagnostics, .. } => Some(diagnostics),
1364            Outcome::Inconclusive { diagnostics, .. } => Some(diagnostics),
1365            Outcome::Unmeasurable { .. } => None,
1366            Outcome::Research(res) => Some(&res.diagnostics),
1367        }
1368    }
1369
1370    /// Get the number of samples used if available.
1371    pub fn samples_used(&self) -> Option<usize> {
1372        match self {
1373            Outcome::Pass { samples_used, .. } => Some(*samples_used),
1374            Outcome::Fail { samples_used, .. } => Some(*samples_used),
1375            Outcome::Inconclusive { samples_used, .. } => Some(*samples_used),
1376            Outcome::Unmeasurable { .. } => None,
1377            Outcome::Research(res) => Some(res.samples_used),
1378        }
1379    }
1380
1381    /// Check if the measurement is reliable enough for assertions.
1382    ///
1383    /// Returns `true` if:
1384    /// - Test is conclusive (Pass or Fail), AND
1385    /// - Quality is not TooNoisy, OR posterior is very conclusive (< 0.1 or > 0.9)
1386    ///
1387    /// The key insight: a very conclusive posterior is trustworthy even with noisy
1388    /// measurements - the signal overcame the noise.
1389    ///
1390    /// For Research mode, reliability is based on whether the CI is clearly above
1391    /// or below the measurement floor.
1392    pub fn is_reliable(&self) -> bool {
1393        match self {
1394            Outcome::Unmeasurable { .. } => false,
1395            Outcome::Inconclusive { .. } => false,
1396            Outcome::Pass {
1397                quality,
1398                leak_probability,
1399                ..
1400            } => *quality != MeasurementQuality::TooNoisy || *leak_probability < 0.01,
1401            Outcome::Fail {
1402                quality,
1403                leak_probability,
1404                ..
1405            } => *quality != MeasurementQuality::TooNoisy || *leak_probability > 0.99,
1406            Outcome::Research(res) => {
1407                // Research mode is reliable if we reached a confident conclusion
1408                matches!(
1409                    res.status,
1410                    ResearchStatus::EffectDetected | ResearchStatus::NoEffectDetected
1411                )
1412            }
1413        }
1414    }
1415
1416    /// Unwrap a Pass result, panicking otherwise.
1417    pub fn unwrap_pass(self) -> (f64, EffectEstimate, MeasurementQuality, Diagnostics) {
1418        match self {
1419            Outcome::Pass {
1420                leak_probability,
1421                effect,
1422                quality,
1423                diagnostics,
1424                ..
1425            } => (leak_probability, effect, quality, diagnostics),
1426            _ => panic!("Expected Pass outcome, got {:?}", self),
1427        }
1428    }
1429
1430    /// Unwrap a Fail result, panicking otherwise.
1431    pub fn unwrap_fail(
1432        self,
1433    ) -> (
1434        f64,
1435        EffectEstimate,
1436        Exploitability,
1437        MeasurementQuality,
1438        Diagnostics,
1439    ) {
1440        match self {
1441            Outcome::Fail {
1442                leak_probability,
1443                effect,
1444                exploitability,
1445                quality,
1446                diagnostics,
1447                ..
1448            } => (
1449                leak_probability,
1450                effect,
1451                exploitability,
1452                quality,
1453                diagnostics,
1454            ),
1455            _ => panic!("Expected Fail outcome, got {:?}", self),
1456        }
1457    }
1458
1459    /// Handle unreliable results according to policy.
1460    ///
1461    /// Returns `Some(self)` if the result is reliable.
1462    /// For unreliable results:
1463    /// - `FailOpen`: prints warning, returns `None`
1464    /// - `FailClosed`: panics
1465    ///
1466    /// # Example
1467    ///
1468    /// ```ignore
1469    /// let outcome = oracle.test(...);
1470    /// if let Some(result) = outcome.handle_unreliable("test_name", UnreliablePolicy::FailOpen) {
1471    ///     assert!(result.passed());
1472    /// }
1473    /// ```
1474    #[cfg(feature = "std")]
1475    pub fn handle_unreliable(self, test_name: &str, policy: UnreliablePolicy) -> Option<Self> {
1476        if self.is_reliable() {
1477            return Some(self);
1478        }
1479
1480        let reason = match &self {
1481            Outcome::Unmeasurable { recommendation, .. } => {
1482                format!("unmeasurable: {}", recommendation)
1483            }
1484            Outcome::Inconclusive { reason, .. } => {
1485                format!("inconclusive: {:?}", reason)
1486            }
1487            Outcome::Pass { quality, .. } | Outcome::Fail { quality, .. } => {
1488                format!("unreliable quality: {:?}", quality)
1489            }
1490            Outcome::Research(research) => {
1491                format!("research mode: {:?}", research.status)
1492            }
1493        };
1494
1495        match policy {
1496            UnreliablePolicy::FailOpen => {
1497                eprintln!("[SKIPPED] {}: {} (fail-open policy)", test_name, reason);
1498                None
1499            }
1500            UnreliablePolicy::FailClosed => {
1501                panic!("[FAILED] {}: {} (fail-closed policy)", test_name, reason);
1502            }
1503        }
1504    }
1505
1506    /// Handle unreliable results according to policy (no_std version).
1507    ///
1508    /// In no_std mode, this always panics on unreliable results with FailClosed,
1509    /// and returns None with FailOpen (no printing).
1510    #[cfg(not(feature = "std"))]
1511    pub fn handle_unreliable(self, _test_name: &str, policy: UnreliablePolicy) -> Option<Self> {
1512        if self.is_reliable() {
1513            return Some(self);
1514        }
1515
1516        match policy {
1517            UnreliablePolicy::FailOpen => None,
1518            UnreliablePolicy::FailClosed => {
1519                panic!("Unreliable result with fail-closed policy");
1520            }
1521        }
1522    }
1523}
1524
1525// ============================================================================
1526// Display implementations
1527// ============================================================================
1528
1529impl fmt::Display for Outcome {
1530    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1531        write!(f, "{}", crate::formatting::format_outcome_plain(self))
1532    }
1533}
1534
1535impl fmt::Display for EffectPattern {
1536    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1537        match self {
1538            EffectPattern::UniformShift => write!(f, "uniform shift"),
1539            EffectPattern::TailEffect => write!(f, "tail effect"),
1540            EffectPattern::Mixed => write!(f, "mixed"),
1541            EffectPattern::Indeterminate => write!(f, "indeterminate"),
1542        }
1543    }
1544}
1545
1546impl fmt::Display for Exploitability {
1547    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1548        match self {
1549            Exploitability::SharedHardwareOnly => write!(f, "shared hardware only"),
1550            Exploitability::Http2Multiplexing => write!(f, "HTTP/2 multiplexing"),
1551            Exploitability::StandardRemote => write!(f, "standard remote"),
1552            Exploitability::ObviousLeak => write!(f, "obvious leak"),
1553        }
1554    }
1555}
1556
1557impl fmt::Display for MeasurementQuality {
1558    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1559        match self {
1560            MeasurementQuality::Excellent => write!(f, "excellent"),
1561            MeasurementQuality::Good => write!(f, "good"),
1562            MeasurementQuality::Poor => write!(f, "poor"),
1563            MeasurementQuality::TooNoisy => write!(f, "too noisy"),
1564        }
1565    }
1566}
1567
1568impl fmt::Display for InconclusiveReason {
1569    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1570        match self {
1571            InconclusiveReason::DataTooNoisy { message, guidance } => {
1572                write!(f, "Data too noisy: {}\n  \u{2192} {}", message, guidance)
1573            }
1574            InconclusiveReason::NotLearning { message, guidance } => {
1575                write!(f, "Not learning: {}\n  \u{2192} {}", message, guidance)
1576            }
1577            InconclusiveReason::WouldTakeTooLong {
1578                estimated_time_secs,
1579                samples_needed,
1580                guidance,
1581            } => {
1582                write!(
1583                    f,
1584                    "Would take too long: ~{:.0}s / {} samples needed\n  \u{2192} {}",
1585                    estimated_time_secs, samples_needed, guidance
1586                )
1587            }
1588            InconclusiveReason::TimeBudgetExceeded { .. } => {
1589                write!(f, "Time budget exceeded")
1590            }
1591            InconclusiveReason::SampleBudgetExceeded { .. } => {
1592                write!(f, "Sample budget exceeded")
1593            }
1594            InconclusiveReason::ConditionsChanged { message, guidance } => {
1595                write!(
1596                    f,
1597                    "Conditions changed: {}\n  \u{2192} {}",
1598                    message, guidance
1599                )
1600            }
1601            InconclusiveReason::ThresholdElevated {
1602                theta_user,
1603                theta_eff,
1604                leak_probability_at_eff,
1605                achievable_at_max,
1606                guidance,
1607                ..
1608            } => {
1609                let achievability = if *achievable_at_max {
1610                    "achievable with more samples"
1611                } else {
1612                    "not achievable at max samples"
1613                };
1614                write!(
1615                    f,
1616                    "Threshold elevated: requested {:.1}ns, used {:.1}ns (P={:.1}% at θ_eff, {})\n  \u{2192} {}",
1617                    theta_user, theta_eff, leak_probability_at_eff * 100.0, achievability, guidance
1618                )
1619            }
1620        }
1621    }
1622}
1623
1624// ============================================================================
1625// Debug implementation for Outcome
1626// ============================================================================
1627
1628impl fmt::Debug for Outcome {
1629    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1630        write!(f, "{}", crate::formatting::format_debug_summary_plain(self))
1631    }
1632}