// aprender-simulate 0.30.0

//! Popperian falsification framework.
//!
//! Implements Karl Popper's philosophy of science:
//! - Every hypothesis must be falsifiable
//! - Null hypothesis significance testing (NHST)
//! - Robustness metrics via Signal Temporal Logic
//!
//! # Demarcation Criterion
//!
//! A theory T is scientific iff there exists some observation O
//! that could refute T.

use serde::{Deserialize, Serialize};

/// Result of null hypothesis significance testing.
///
/// Both variants carry the p-value of the test. A rejection additionally
/// reports an effect size and the test statistic; a non-rejection reports
/// the statistical power instead.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum NHSTResult {
    /// H₀ rejected: evidence supports falsification.
    Rejected {
        /// Statistical p-value.
        p_value: f64,
        /// Effect size (Cohen's d or similar).
        effect_size: f64,
        /// Test statistic.
        test_statistic: f64,
    },
    /// Failed to reject H₀: model corroborated (not proven).
    NotRejected {
        /// Statistical p-value.
        p_value: f64,
        /// Statistical power.
        power: f64,
    },
}

impl NHSTResult {
    /// Returns `true` when this result is the [`NHSTResult::Rejected`] variant.
    #[must_use]
    pub const fn is_rejected(&self) -> bool {
        match self {
            Self::Rejected { .. } => true,
            Self::NotRejected { .. } => false,
        }
    }

    /// Returns the p-value, which every variant carries.
    #[must_use]
    pub const fn p_value(&self) -> f64 {
        // Both variants bind `p_value`, so the or-pattern is irrefutable.
        let (Self::Rejected { p_value, .. } | Self::NotRejected { p_value, .. }) = self;
        *p_value
    }
}

/// Values predicted by a hypothesis, each paired with an uncertainty.
///
/// Entries appended through [`Predictions::add`] keep `values` and
/// `uncertainties` index-aligned. Both fields are public, so callers that
/// mutate them directly are responsible for keeping them in step.
#[derive(Debug, Clone, Default)]
pub struct Predictions {
    /// Predicted values.
    pub values: Vec<f64>,
    /// Uncertainty attached to each predicted value.
    pub uncertainties: Vec<f64>,
}

impl Predictions {
    /// Creates an empty set of predictions.
    #[must_use]
    pub fn new() -> Self {
        Self {
            values: Vec::new(),
            uncertainties: Vec::new(),
        }
    }

    /// Appends one predicted `value` together with its `uncertainty`.
    pub fn add(&mut self, value: f64, uncertainty: f64) {
        let Self {
            values,
            uncertainties,
        } = self;
        values.push(value);
        uncertainties.push(uncertainty);
    }

    /// Returns the number of predictions (the length of `values`).
    #[must_use]
    pub fn len(&self) -> usize {
        let Self { values, .. } = self;
        values.len()
    }

    /// Returns `true` when no prediction has been recorded.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Observed values to compare against predictions, each with an uncertainty.
///
/// Entries appended through [`Observations::add`] keep `values` and
/// `uncertainties` index-aligned. Both fields are public, so callers that
/// mutate them directly are responsible for keeping them in step.
#[derive(Debug, Clone, Default)]
pub struct Observations {
    /// Observed values.
    pub values: Vec<f64>,
    /// Measurement uncertainty attached to each observed value.
    pub uncertainties: Vec<f64>,
}

impl Observations {
    /// Creates an empty set of observations.
    #[must_use]
    pub fn new() -> Self {
        Self {
            values: Vec::new(),
            uncertainties: Vec::new(),
        }
    }

    /// Appends one observed `value` together with its measurement `uncertainty`.
    pub fn add(&mut self, value: f64, uncertainty: f64) {
        let Self {
            values,
            uncertainties,
        } = self;
        values.push(value);
        uncertainties.push(uncertainty);
    }

    /// Returns the number of observations (the length of `values`).
    #[must_use]
    pub fn len(&self) -> usize {
        let Self { values, .. } = self;
        values.len()
    }

    /// Returns `true` when no observation has been recorded.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Falsification criteria defining what would refute a hypothesis.
///
/// A criterion names a metric and compares a measured value of that metric
/// against a fixed `threshold` using `operator`; see
/// [`FalsificationCriteria::check`].
#[derive(Debug, Clone)]
pub struct FalsificationCriteria {
    /// Metric name.
    pub metric: String,
    /// Comparison operator.
    pub operator: ComparisonOp,
    /// Threshold value.
    pub threshold: f64,
}

/// Comparison operator for falsification criteria.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComparisonOp {
    /// Less than.
    Lt,
    /// Less than or equal.
    Le,
    /// Greater than.
    Gt,
    /// Greater than or equal.
    Ge,
    /// Equal (within tolerance).
    Eq,
    /// Not equal.
    Ne,
}

impl FalsificationCriteria {
    /// Check if a value satisfies the criterion.
    ///
    /// The ordering operators use plain `f64` comparisons, so a NaN `value`
    /// (or NaN threshold) never satisfies `Lt`, `Le`, `Gt` or `Ge`.
    ///
    /// `Eq` holds when `value` equals the threshold exactly (this covers
    /// matching infinities) or differs from it by less than `f64::EPSILON`.
    /// `Ne` is the strict logical negation of `Eq`, so every input satisfies
    /// exactly one of the two; in particular a NaN `value` is "not equal".
    #[must_use]
    pub fn check(&self, value: f64) -> bool {
        match self.operator {
            ComparisonOp::Lt => value < self.threshold,
            ComparisonOp::Le => value <= self.threshold,
            ComparisonOp::Gt => value > self.threshold,
            ComparisonOp::Ge => value >= self.threshold,
            ComparisonOp::Eq => self.approx_eq(value),
            ComparisonOp::Ne => !self.approx_eq(value),
        }
    }

    /// Returns `true` when `value` matches the threshold exactly or within
    /// an absolute tolerance of `f64::EPSILON`.
    // Exact comparison is intentional: `inf - inf` is NaN, so the tolerance
    // test alone would report equal infinities as different.
    #[allow(clippy::float_cmp)]
    fn approx_eq(&self, value: f64) -> bool {
        value == self.threshold || (value - self.threshold).abs() < f64::EPSILON
    }

    /// Create a "less than" criterion.
    #[must_use]
    pub fn less_than(metric: impl Into<String>, threshold: f64) -> Self {
        Self {
            metric: metric.into(),
            operator: ComparisonOp::Lt,
            threshold,
        }
    }

    /// Create a "greater than" criterion.
    #[must_use]
    pub fn greater_than(metric: impl Into<String>, threshold: f64) -> Self {
        Self {
            metric: metric.into(),
            operator: ComparisonOp::Gt,
            threshold,
        }
    }
}

/// Trait for falsifiable hypotheses.
///
/// Implements Popper's demarcation criterion: a hypothesis is scientific
/// only if it can be falsified by observation.
///
/// An implementor supplies four things: predictions derived from a state,
/// the criteria that would refute it, a signed robustness margin, and a
/// significance test comparing predictions with observations.
pub trait FalsifiableHypothesis {
    /// Type of state this hypothesis operates on.
    type State;

    /// Generate testable predictions from state.
    fn predict(&self, state: &Self::State) -> Predictions;

    /// Define what would falsify this hypothesis.
    fn falsification_criteria(&self) -> Vec<FalsificationCriteria>;

    /// Compute robustness degree (distance to falsification).
    ///
    /// Positive values indicate satisfaction with margin.
    /// Negative values indicate violation.
    /// Zero indicates boundary (maximally falsifiable).
    fn robustness(&self, state: &Self::State) -> f64;

    /// Perform null hypothesis significance test.
    ///
    /// Compares `predictions` with `observations` at the given
    /// `significance` level and reports the outcome as an [`NHSTResult`].
    fn null_hypothesis_test(
        &self,
        predictions: &Predictions,
        observations: &Observations,
        significance: f64,
    ) -> NHSTResult;
}

/// Statistical test functions.
pub mod stats {
    use super::NHSTResult;

    /// Arithmetic mean of `values`.
    ///
    /// Returns `0.0` for an empty slice rather than dividing by zero.
    #[must_use]
    pub fn mean(values: &[f64]) -> f64 {
        match values.len() {
            0 => 0.0,
            count => {
                let total: f64 = values.iter().sum();
                total / count as f64
            }
        }
    }

    /// Sample variance of `values`, using the `n - 1` denominator.
    ///
    /// Returns `0.0` when fewer than two values are supplied.
    #[must_use]
    pub fn variance(values: &[f64]) -> f64 {
        let count = values.len();
        if count < 2 {
            return 0.0;
        }

        let center = mean(values);
        let squared_deviations: f64 = values.iter().map(|v| (v - center).powi(2)).sum();
        squared_deviations / (count - 1) as f64
    }

    /// Sample standard deviation: the square root of [`variance`].
    #[must_use]
    pub fn std_dev(values: &[f64]) -> f64 {
        f64::sqrt(variance(values))
    }

    /// Standard error of the mean: [`std_dev`] divided by `sqrt(n)`.
    ///
    /// Returns `0.0` for an empty slice.
    #[must_use]
    pub fn std_error(values: &[f64]) -> f64 {
        match values.len() {
            0 => 0.0,
            count => std_dev(values) / (count as f64).sqrt(),
        }
    }

    /// Perform one-sample t-test.
    ///
    /// Tests H₀: μ = μ₀ against H₁: μ ≠ μ₀.
    ///
    /// The two-tailed p-value is obtained from the standard normal
    /// distribution rather than the exact t-distribution, so it is an
    /// approximation that is best for large samples. H₀ is rejected when the
    /// p-value is below `significance`.
    ///
    /// With fewer than two values, or when the sample standard deviation is
    /// below `f64::EPSILON`, the test is not carried out and
    /// `NotRejected { p_value: 1.0, power: 0.0 }` is returned.
    ///
    /// The reported power is a simplified approximation that uses the fixed
    /// critical value 1.96 regardless of `significance`.
    #[must_use]
    pub fn one_sample_t_test(values: &[f64], mu_0: f64, significance: f64) -> NHSTResult {
        // Reported whenever the test cannot be carried out.
        let inconclusive = NHSTResult::NotRejected {
            p_value: 1.0,
            power: 0.0,
        };

        if values.len() < 2 {
            return inconclusive;
        }

        let count = values.len() as f64;
        let deviation = mean(values) - mu_0;
        let spread = std_dev(values);

        // No variance - can't perform test
        if spread < f64::EPSILON {
            return inconclusive;
        }

        let t_stat = deviation / (spread / count.sqrt());

        // Approximate p-value using normal distribution for large n
        let p_value = 2.0 * (1.0 - normal_cdf(t_stat.abs()));

        // Magnitude of Cohen's d
        let effect_magnitude = (deviation / spread).abs();

        if p_value < significance {
            return NHSTResult::Rejected {
                p_value,
                effect_size: effect_magnitude,
                test_statistic: t_stat,
            };
        }

        // Approximate power (simplified)
        let power = 1.0 - normal_cdf(1.96 - effect_magnitude * count.sqrt());
        NHSTResult::NotRejected {
            p_value,
            power: power.clamp(0.0, 1.0),
        }
    }

    /// Perform two-sample t-test.
    ///
    /// Tests H₀: μ₁ = μ₂ against H₁: μ₁ ≠ μ₂.
    ///
    /// The standard error is unpooled: each sample variance is divided by
    /// its own sample size. The two-tailed p-value comes from the standard
    /// normal distribution, so it is an approximation. H₀ is rejected when
    /// the p-value is below `significance`.
    ///
    /// If either sample has fewer than two values, or the standard error is
    /// below `f64::EPSILON`, the result is
    /// `NotRejected { p_value: 1.0, power: 0.0 }`. For other non-rejections
    /// the reported power is the fixed placeholder `0.5`.
    #[must_use]
    pub fn two_sample_t_test(sample1: &[f64], sample2: &[f64], significance: f64) -> NHSTResult {
        // Reported whenever the test cannot be carried out.
        let inconclusive = NHSTResult::NotRejected {
            p_value: 1.0,
            power: 0.0,
        };

        if sample1.len() < 2 || sample2.len() < 2 {
            return inconclusive;
        }

        let (n1, n2) = (sample1.len() as f64, sample2.len() as f64);
        let mean_gap = mean(sample1) - mean(sample2);
        let (var1, var2) = (variance(sample1), variance(sample2));

        // Unpooled standard error of the difference in means
        let se = (var1 / n1 + var2 / n2).sqrt();
        if se < f64::EPSILON {
            return inconclusive;
        }

        let t_stat = mean_gap / se;

        // Approximate p-value
        let p_value = 2.0 * (1.0 - normal_cdf(t_stat.abs()));

        if p_value >= significance {
            return NHSTResult::NotRejected {
                p_value,
                power: 0.5, // Simplified
            };
        }

        // Effect size, standardised by the root of the averaged variances
        let pooled_std = ((var1 + var2) / 2.0).sqrt();
        let effect_size = if pooled_std > f64::EPSILON {
            mean_gap / pooled_std
        } else {
            0.0
        };

        NHSTResult::Rejected {
            p_value,
            effect_size: effect_size.abs(),
            test_statistic: t_stat,
        }
    }

    /// Chi-square goodness of fit test.
    ///
    /// Compares `observed` counts with `expected` counts category by
    /// category. Categories whose expected count is not above `f64::EPSILON`
    /// are left out of the statistic to avoid dividing by (near) zero. The
    /// degrees of freedom are `observed.len() - 1`, and the p-value comes
    /// from an approximate chi-square CDF. H₀ is rejected when the p-value
    /// is below `significance`.
    ///
    /// Returns `NotRejected { p_value: 1.0, power: 0.0 }` when the slices
    /// differ in length or hold fewer than two categories. A single category
    /// leaves zero degrees of freedom, for which the CDF approximation is
    /// undefined and would otherwise produce a NaN p-value. For other
    /// non-rejections the reported power is the fixed placeholder `0.5`.
    #[must_use]
    pub fn chi_square_test(observed: &[f64], expected: &[f64], significance: f64) -> NHSTResult {
        if observed.len() != expected.len() || observed.len() < 2 {
            return NHSTResult::NotRejected {
                p_value: 1.0,
                power: 0.0,
            };
        }

        let chi_sq: f64 = observed
            .iter()
            .zip(expected)
            .filter(|(_, &e)| e > f64::EPSILON)
            .map(|(&o, &e)| (o - e).powi(2) / e)
            .sum();

        // At least two categories are guaranteed above, so df >= 1.
        let df = (observed.len() - 1) as f64;

        // Approximate p-value (simplified)
        let p_value = 1.0 - chi_square_cdf(chi_sq, df);

        if p_value < significance {
            NHSTResult::Rejected {
                p_value,
                effect_size: (chi_sq / df).sqrt(), // Cramér's V approximation
                test_statistic: chi_sq,
            }
        } else {
            NHSTResult::NotRejected {
                p_value,
                power: 0.5,
            }
        }
    }

    /// Standard normal CDF Φ(x), computed from the module's `erf`
    /// approximation (private).
    fn normal_cdf(x: f64) -> f64 {
        let scaled = x / std::f64::consts::SQRT_2;
        (1.0 + erf(scaled)) * 0.5
    }

    /// Standard normal CDF approximation (public).
    ///
    /// Forwards to the private `normal_cdf` so that code outside this module
    /// can evaluate the same approximation.
    #[must_use]
    pub fn normal_cdf_pub(x: f64) -> f64 {
        normal_cdf(x)
    }

    /// Chi-square CDF approximation.
    ///
    /// Returns `0.0` for `x <= 0.0`. Otherwise the cube root of `x / df` is
    /// treated as approximately normal with mean `1 - 2 / (9 df)` and
    /// variance `2 / (9 df)` (Wilson-Hilferty), and the normal CDF of the
    /// standardised value is returned. `df` is not validated here.
    fn chi_square_cdf(x: f64, df: f64) -> f64 {
        if x <= 0.0 {
            return 0.0;
        }

        // Wilson-Hilferty approximation
        let spread = 2.0 / (9.0 * df);
        let z = (x / df).powf(1.0 / 3.0) - (1.0 - spread);
        normal_cdf(z / spread.sqrt())
    }

    /// Error function approximation.
    ///
    /// Evaluates a five-term polynomial in `t = 1 / (1 + p·|x|)` multiplied
    /// by `exp(-x²)`, then restores the sign of `x` (the function is odd).
    fn erf(x: f64) -> f64 {
        // Approximation with max error 1.5e-7
        // Polynomial coefficients, highest order first for Horner evaluation.
        const COEFFICIENTS: [f64; 5] = [
            1.061_405_429,
            -1.453_152_027,
            1.421_413_741,
            -0.284_496_736,
            0.254_829_592,
        ];
        const P: f64 = 0.327_591_1;

        let sign = if x < 0.0 { -1.0 } else { 1.0 };
        let magnitude = x.abs();

        let t = 1.0 / (1.0 + P * magnitude);
        let polynomial = COEFFICIENTS.iter().fold(0.0, |acc, &c| acc * t + c);
        let y = 1.0 - polynomial * t * (-magnitude * magnitude).exp();

        sign * y
    }
}

// ============================================================================
// Nullification Test Framework (Appendix F)
// ============================================================================

/// Result of a nullification test.
///
/// Bundles the decision (`rejected`) with the statistics that led to it and
/// the raw observations they were computed from.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NullificationResult {
    /// Hypothesis identifier.
    pub hypothesis_id: String,
    /// Whether H₀ was rejected.
    pub rejected: bool,
    /// Statistical p-value.
    pub p_value: f64,
    /// 95% confidence interval as `(lower, upper)`.
    pub confidence_interval: (f64, f64),
    /// Effect size (Cohen's d).
    pub effect_size: f64,
    /// Observations collected.
    pub observations: Vec<f64>,
}

impl NullificationResult {
    /// Format result following Princeton reporting standards.
    ///
    /// Produces a single line of the form
    /// `ID: VERDICT (p=…, 95% CI [lower, upper], d=…)`, where the verdict is
    /// `REJECTED` or `NOT REJECTED`. The p-value and interval bounds are
    /// printed with four decimals, the effect size with two.
    #[must_use]
    pub fn report(&self) -> String {
        let verdict = if self.rejected {
            "REJECTED"
        } else {
            "NOT REJECTED"
        };
        let (ci_lower, ci_upper) = self.confidence_interval;

        format!(
            "{}: {} (p={:.4}, 95% CI [{:.4}, {:.4}], d={:.2})",
            self.hypothesis_id, verdict, self.p_value, ci_lower, ci_upper, self.effect_size,
        )
    }
}

/// Popperian nullification test for ML simulation.
///
/// Following Princeton methodology: minimum 5 runs, bootstrap CI.
///
/// The builder methods enforce the documented bounds (`with_runs` raises the
/// run count to at least 5, `with_alpha` clamps to `[0.001, 0.1]`). The
/// fields are public, so values assigned directly bypass those checks.
#[derive(Debug, Clone)]
pub struct NullificationTest {
    /// Null hypothesis identifier.
    pub hypothesis_id: String,
    /// Number of independent runs (minimum 5 per Princeton methodology).
    pub n_runs: usize,
    /// Significance level (α = 0.05 standard).
    pub alpha: f64,
    /// Number of bootstrap samples for CI.
    ///
    /// The bootstrap routine draws at most 1000 resamples, so larger values
    /// (including the default of 10 000) are capped at 1000.
    pub bootstrap_samples: usize,
    /// Expected value under H₀ (for one-sample tests).
    pub expected_value: f64,
}

impl NullificationTest {
    /// Upper bound on the number of bootstrap resamples actually drawn,
    /// whatever `bootstrap_samples` requests.
    const MAX_BOOTSTRAP_RESAMPLES: usize = 1000;

    /// Create a new nullification test.
    ///
    /// Defaults: 5 runs, α = 0.05, 10 000 requested bootstrap samples and an
    /// expected value of 0.0 under H₀.
    #[must_use]
    pub fn new(hypothesis_id: impl Into<String>) -> Self {
        Self {
            hypothesis_id: hypothesis_id.into(),
            n_runs: 5,
            alpha: 0.05,
            bootstrap_samples: 10_000,
            expected_value: 0.0,
        }
    }

    /// Set number of runs (minimum 5).
    ///
    /// Values below 5 are raised to 5.
    #[must_use]
    pub fn with_runs(mut self, n_runs: usize) -> Self {
        self.n_runs = n_runs.max(5);
        self
    }

    /// Set significance level.
    ///
    /// The value is clamped to the range `[0.001, 0.1]`.
    #[must_use]
    pub fn with_alpha(mut self, alpha: f64) -> Self {
        self.alpha = alpha.clamp(0.001, 0.1);
        self
    }

    /// Set expected value under H₀.
    #[must_use]
    pub fn with_expected(mut self, expected: f64) -> Self {
        self.expected_value = expected;
        self
    }

    /// Execute nullification test with a test function.
    ///
    /// The `test_fn` is called `n_runs` times and should return
    /// the test statistic for each run. Any `FnMut` closure is accepted, so
    /// the function may keep state between runs (for example a random number
    /// generator or a run counter).
    pub fn execute<F>(&self, mut test_fn: F) -> NullificationResult
    where
        F: FnMut() -> f64,
    {
        let observations: Vec<f64> = (0..self.n_runs).map(|_| test_fn()).collect();
        self.summarize(observations)
    }

    /// Execute nullification test with provided observations.
    ///
    /// The observations are copied into the returned result.
    #[must_use]
    pub fn evaluate(&self, observations: &[f64]) -> NullificationResult {
        self.summarize(observations.to_vec())
    }

    /// Builds the result for `observations`: bootstrap CI, p-value, effect
    /// size, and the rejection decision `p_value < alpha`.
    fn summarize(&self, observations: Vec<f64>) -> NullificationResult {
        let confidence_interval = self.bootstrap_ci(&observations);
        let (p_value, effect_size) = self.compute_stats(&observations);

        NullificationResult {
            hypothesis_id: self.hypothesis_id.clone(),
            rejected: p_value < self.alpha,
            p_value,
            confidence_interval,
            effect_size,
            observations,
        }
    }

    /// Compute bootstrap confidence interval.
    ///
    /// Draws `min(bootstrap_samples, 1000)` resamples (with replacement) of
    /// the observations, computes the mean of each, and returns the 2.5th
    /// and 97.5th percentiles of those means as `(lower, upper)`.
    ///
    /// Resampling is driven by a fixed-seed linear congruential generator,
    /// so the interval is deterministic for a given input. Returns
    /// `(0.0, 0.0)` when there are no observations or no resamples are
    /// requested.
    fn bootstrap_ci(&self, observations: &[f64]) -> (f64, f64) {
        let n = observations.len();
        let resamples = self
            .bootstrap_samples
            .min(Self::MAX_BOOTSTRAP_RESAMPLES);

        // Nothing to resample, or nothing to take percentiles of.
        if n == 0 || resamples == 0 {
            return (0.0, 0.0);
        }

        let mut bootstrap_means = Vec::with_capacity(resamples);

        // Use a simple LCG for bootstrap sampling (deterministic)
        let mut lcg_state = 12345_u64;
        for _ in 0..resamples {
            let mut sum = 0.0;
            for _ in 0..n {
                // LCG: next = (a * state + c) mod 2^64
                lcg_state = lcg_state
                    .wrapping_mul(6_364_136_223_846_793_005)
                    .wrapping_add(1);
                // Index with the HIGH bits. The low k bits of a
                // power-of-two-modulus LCG repeat with period 2^k, so for a
                // power-of-two `n` every resample would be a plain
                // permutation of the data and the interval would collapse to
                // a single point. The 31-bit value also fits `usize` on
                // 32-bit targets without truncation.
                let idx = ((lcg_state >> 33) as usize) % n;
                sum += observations[idx];
            }
            bootstrap_means.push(sum / n as f64);
        }

        bootstrap_means.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        // 95% CI: 2.5th and 97.5th percentiles
        let last = resamples - 1; // resamples >= 1 is guaranteed above
        let lower_idx = ((resamples as f64 * 0.025) as usize).min(last);
        let upper_idx = ((resamples as f64 * 0.975) as usize).min(last);

        (bootstrap_means[lower_idx], bootstrap_means[upper_idx])
    }

    /// Compute p-value and effect size.
    ///
    /// Runs a one-sample test of the observation mean against
    /// `expected_value` and returns `(p_value, effect_size)`. The two-tailed
    /// p-value uses the standard normal distribution as an approximation;
    /// the effect size is the absolute Cohen's d.
    ///
    /// With fewer than two observations the result is `(1.0, 0.0)`. When the
    /// observations have no variance, the p-value is `1.0` if their mean
    /// equals the expected value and `0.0` otherwise, with effect size `0.0`.
    fn compute_stats(&self, observations: &[f64]) -> (f64, f64) {
        if observations.len() < 2 {
            return (1.0, 0.0);
        }

        let mean = stats::mean(observations);
        let std = stats::std_dev(observations);
        let n = observations.len() as f64;

        if std < f64::EPSILON {
            // No variance
            let p_value = if (mean - self.expected_value).abs() < f64::EPSILON {
                1.0 // Exactly at expected
            } else {
                0.0 // Different from expected
            };
            return (p_value, 0.0);
        }

        // t-statistic
        let t_stat = (mean - self.expected_value) / (std / n.sqrt());

        // Approximate two-tailed p-value
        let p_value = 2.0 * (1.0 - stats::normal_cdf_pub(t_stat.abs()));

        // Cohen's d
        let effect_size = (mean - self.expected_value).abs() / std;

        (p_value.clamp(0.0, 1.0), effect_size)
    }
}

/// Report containing multiple nullification test results.
///
/// The `summary` counters are maintained by `NullificationReport::add`.
/// Pushing to `results` directly leaves the summary out of date.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct NullificationReport {
    /// Individual test results.
    pub results: Vec<NullificationResult>,
    /// Summary statistics.
    pub summary: ReportSummary,
}

/// Summary of nullification report.
///
/// A default-constructed summary has all counters at zero and an empty
/// `status` string.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ReportSummary {
    /// Total number of tests.
    pub total_tests: usize,
    /// Number of rejected hypotheses.
    pub rejected: usize,
    /// Number of not-rejected hypotheses.
    pub not_rejected: usize,
    /// Overall status.
    pub status: String,
}

impl NullificationReport {
    /// Create new empty report.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Add a test result.
    pub fn add(&mut self, result: NullificationResult) {
        if result.rejected {
            self.summary.rejected += 1;
        } else {
            self.summary.not_rejected += 1;
        }
        self.summary.total_tests += 1;
        self.results.push(result);

        // Update status
        self.summary.status = if self.summary.rejected == 0 {
            "PASS".to_string()
        } else if self.summary.rejected == self.summary.total_tests {
            "FAIL".to_string()
        } else {
            "PARTIAL_PASS".to_string()
        };
    }

    /// Get full report as string.
    #[must_use]
    pub fn full_report(&self) -> String {
        use std::fmt::Write;

        let mut report = String::new();
        report.push_str("=== Nullification Report ===\n\n");

        for result in &self.results {
            report.push_str(&result.report());
            report.push('\n');
        }

        let _ = write!(
            report,
            "\nSummary: {} tests, {} rejected, {} not rejected\n",
            self.summary.total_tests, self.summary.rejected, self.summary.not_rejected
        );
        let _ = writeln!(report, "Status: {}", self.summary.status);

        report
    }
}

/// ML-specific nullification hypotheses.
///
/// Each function returns a [`NullificationTest`] that is pre-labelled with
/// the hypothesis identifier and configured with the value expected under H₀.
pub mod ml_hypotheses {
    use super::NullificationTest;

    /// Builds a default-configured test for `id` whose expected value under
    /// H₀ is `expected`.
    fn expecting(id: &str, expected: f64) -> NullificationTest {
        NullificationTest::new(id).with_expected(expected)
    }

    /// H₀-TRAIN-01: Training converges identically across runs with same seed.
    #[must_use]
    pub fn training_determinism() -> NullificationTest {
        // Expected bitwise difference = 0
        expecting("H0-TRAIN-01", 0.0)
    }

    /// H₀-TRAIN-02: Loss monotonically decreases (no spikes).
    #[must_use]
    pub fn loss_stability(threshold_sigma: f64) -> NullificationTest {
        // Z-score threshold
        expecting("H0-TRAIN-02", threshold_sigma)
    }

    /// H₀-TRAIN-03: Gradient norms remain bounded.
    #[must_use]
    pub fn gradient_bounded(max_norm: f64) -> NullificationTest {
        expecting("H0-TRAIN-03", max_norm)
    }

    /// H₀-TRAIN-05: Model parameters remain finite.
    #[must_use]
    pub fn params_finite() -> NullificationTest {
        // Expected NaN count = 0
        expecting("H0-TRAIN-05", 0.0)
    }

    /// H₀-PRED-01: Predictions are deterministic for temperature=0.
    #[must_use]
    pub fn prediction_determinism() -> NullificationTest {
        // Expected variance = 0
        expecting("H0-PRED-01", 0.0)
    }

    /// H₀-PRED-03: Latency within specified SLA.
    #[must_use]
    pub fn latency_sla(sla_ms: f64) -> NullificationTest {
        expecting("H0-PRED-03", sla_ms)
    }

    /// H₀-MULTI-01: Accuracy >= oracle baseline.
    #[must_use]
    pub fn accuracy_baseline(baseline: f64) -> NullificationTest {
        expecting("H0-MULTI-01", baseline)
    }

    /// H₀-MULTI-05: Results are statistically significant (5 runs, t-test).
    #[must_use]
    pub fn statistical_significance() -> NullificationTest {
        let test = NullificationTest::new("H0-MULTI-05");
        test.with_runs(5).with_alpha(0.05)
    }
}

// ============================================================================
// Energy Conservation Hypothesis
// ============================================================================

/// Energy conservation hypothesis.
///
/// Tests whether total energy is conserved within tolerance.
///
/// The hypothesis is described by a reference energy and the largest drift
/// from it that still counts as conserved.
#[derive(Debug, Clone)]
pub struct EnergyConservationHypothesis {
    /// Initial energy.
    pub initial_energy: f64,
    /// Tolerance for energy drift.
    pub tolerance: f64,
}

impl EnergyConservationHypothesis {
    /// Create a new energy conservation hypothesis.
    ///
    /// Stores both arguments unchanged; no validation is performed.
    #[must_use]
    pub fn new(initial_energy: f64, tolerance: f64) -> Self {
        Self {
            initial_energy,
            tolerance,
        }
    }
}

impl FalsifiableHypothesis for EnergyConservationHypothesis {
    type State = f64; // Current energy

    /// Predicts that the energy stays at `initial_energy`, with `tolerance`
    /// recorded as the prediction's uncertainty.
    ///
    /// The current state is not consulted: conservation predicts the same
    /// value whatever the state is.
    fn predict(&self, _state: &Self::State) -> Predictions {
        let mut predictions = Predictions::new();
        predictions.add(self.initial_energy, self.tolerance);
        predictions
    }

    /// Returns the single criterion `energy_drift < tolerance`.
    fn falsification_criteria(&self) -> Vec<FalsificationCriteria> {
        vec![FalsificationCriteria::less_than(
            "energy_drift",
            self.tolerance,
        )]
    }

    /// Returns `tolerance - drift`, where drift is the relative deviation of
    /// `state` from `initial_energy`.
    ///
    /// The denominator is floored at `f64::EPSILON` so that an initial
    /// energy of zero does not divide by zero.
    fn robustness(&self, state: &Self::State) -> f64 {
        let drift =
            (state - self.initial_energy).abs() / self.initial_energy.abs().max(f64::EPSILON);
        self.tolerance - drift
    }

    /// Tests whether the observed energies are consistent with the
    /// predicted ones.
    ///
    /// Returns `NotRejected { p_value: 1.0, power: 0.0 }` when either side
    /// is empty. With exactly one predicted value, which is what
    /// [`Self::predict`] produces, the observations are tested against that
    /// value with a one-sample t-test. A two-sample test needs at least two
    /// values per sample and could therefore never reject in that case.
    /// With several predicted values a two-sample t-test is used.
    fn null_hypothesis_test(
        &self,
        predictions: &Predictions,
        observations: &Observations,
        significance: f64,
    ) -> NHSTResult {
        if predictions.is_empty() || observations.is_empty() {
            return NHSTResult::NotRejected {
                p_value: 1.0,
                power: 0.0,
            };
        }

        match predictions.values.as_slice() {
            [predicted] => stats::one_sample_t_test(&observations.values, *predicted, significance),
            _ => stats::two_sample_t_test(&predictions.values, &observations.values, significance),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::stats::*;
    use super::*;

    // -------------------------------------------------------------------------
    // NullificationTest Tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_nullification_test_new() {
        let test = NullificationTest::new("H0-TEST-01");
        assert_eq!(test.hypothesis_id, "H0-TEST-01");
        assert_eq!(test.n_runs, 5);
        assert!((test.alpha - 0.05).abs() < f64::EPSILON);
    }

    #[test]
    fn test_nullification_test_with_runs() {
        let test = NullificationTest::new("H0-TEST").with_runs(10);
        assert_eq!(test.n_runs, 10);

        // Should enforce minimum of 5
        let test = NullificationTest::new("H0-TEST").with_runs(3);
        assert_eq!(test.n_runs, 5);
    }

    #[test]
    fn test_nullification_test_execute() {
        let test = NullificationTest::new("H0-TEST").with_expected(0.0);
        let result = test.execute(|| 0.0);
        assert!(!result.rejected); // All zeros = no difference from expected
    }

    #[test]
    fn test_nullification_test_execute_reject() {
        let test = NullificationTest::new("H0-TEST").with_expected(0.0);
        let result = test.execute(|| 100.0); // All 100s, clearly different from 0
        assert!(result.rejected);
    }

    #[test]
    fn test_nullification_test_evaluate() {
        let test = NullificationTest::new("H0-TEST").with_expected(0.0);
        let observations = vec![0.1, 0.05, -0.05, 0.02, -0.01];
        let result = test.evaluate(&observations);
        assert!(!result.rejected); // Small values close to 0
    }

    #[test]
    fn test_nullification_result_report() {
        let result = NullificationResult {
            hypothesis_id: "H0-TEST".to_string(),
            rejected: true,
            p_value: 0.01,
            confidence_interval: (0.5, 1.5),
            effect_size: 0.8,
            observations: vec![1.0],
        };
        let report = result.report();
        assert!(report.contains("H0-TEST"));
        assert!(report.contains("REJECTED"));
        assert!(report.contains("0.0100"));
    }

    #[test]
    fn test_nullification_report() {
        let mut report = NullificationReport::new();

        report.add(NullificationResult {
            hypothesis_id: "H0-1".to_string(),
            rejected: false,
            p_value: 0.1,
            confidence_interval: (0.0, 1.0),
            effect_size: 0.1,
            observations: vec![],
        });

        report.add(NullificationResult {
            hypothesis_id: "H0-2".to_string(),
            rejected: true,
            p_value: 0.01,
            confidence_interval: (0.0, 1.0),
            effect_size: 0.8,
            observations: vec![],
        });

        assert_eq!(report.summary.total_tests, 2);
        assert_eq!(report.summary.rejected, 1);
        assert_eq!(report.summary.not_rejected, 1);
        assert_eq!(report.summary.status, "PARTIAL_PASS");
    }

    #[test]
    fn test_nullification_report_full() {
        let mut report = NullificationReport::new();
        report.add(NullificationResult {
            hypothesis_id: "H0-PASS".to_string(),
            rejected: false,
            p_value: 0.5,
            confidence_interval: (0.0, 1.0),
            effect_size: 0.1,
            observations: vec![1.0, 2.0],
        });

        let text = report.full_report();
        assert!(text.contains("Nullification Report"));
        assert!(text.contains("H0-PASS"));
        assert!(text.contains("NOT REJECTED"));
    }

    #[test]
    fn test_ml_hypotheses() {
        let test = ml_hypotheses::training_determinism();
        assert_eq!(test.hypothesis_id, "H0-TRAIN-01");

        let test = ml_hypotheses::loss_stability(3.0);
        assert_eq!(test.hypothesis_id, "H0-TRAIN-02");

        let test = ml_hypotheses::prediction_determinism();
        assert_eq!(test.hypothesis_id, "H0-PRED-01");

        let test = ml_hypotheses::accuracy_baseline(0.9);
        assert!((test.expected_value - 0.9).abs() < f64::EPSILON);
    }

    // -------------------------------------------------------------------------
    // Original Tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_nhst_result() {
        let rejected = NHSTResult::Rejected {
            p_value: 0.01,
            effect_size: 0.5,
            test_statistic: 2.5,
        };
        assert!(rejected.is_rejected());
        assert!((rejected.p_value() - 0.01).abs() < f64::EPSILON);

        let not_rejected = NHSTResult::NotRejected {
            p_value: 0.1,
            power: 0.8,
        };
        assert!(!not_rejected.is_rejected());
    }

    #[test]
    fn test_predictions_observations() {
        let mut pred = Predictions::new();
        pred.add(1.0, 0.1);
        pred.add(2.0, 0.2);
        assert_eq!(pred.len(), 2);
        assert!(!pred.is_empty());

        let mut obs = Observations::new();
        obs.add(1.1, 0.05);
        assert_eq!(obs.len(), 1);
    }

    #[test]
    fn test_falsification_criteria() {
        let crit = FalsificationCriteria::less_than("energy_drift", 0.01);
        assert!(crit.check(0.005));
        assert!(!crit.check(0.02));

        let crit = FalsificationCriteria::greater_than("power", 0.8);
        assert!(crit.check(0.9));
        assert!(!crit.check(0.7));
    }

    #[test]
    fn test_stats_mean() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        assert!((mean(&values) - 3.0).abs() < f64::EPSILON);

        let empty: Vec<f64> = vec![];
        assert!((mean(&empty) - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_stats_variance() {
        let values = vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
        let var = variance(&values);
        assert!((var - 4.571_428_571_428_571).abs() < 0.001);
    }

    #[test]
    fn test_stats_std_dev() {
        let values = vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
        let sd = std_dev(&values);
        assert!((sd - 2.138).abs() < 0.01);
    }

    #[test]
    fn test_one_sample_t_test() {
        // Test with values clearly different from null hypothesis
        let values = vec![5.0, 5.1, 4.9, 5.2, 4.8, 5.0, 5.1, 4.9];
        let result = one_sample_t_test(&values, 0.0, 0.05);
        assert!(result.is_rejected());

        // Test with values close to null hypothesis
        let values = vec![0.1, -0.1, 0.05, -0.05, 0.0, 0.02, -0.02, 0.01];
        let result = one_sample_t_test(&values, 0.0, 0.05);
        assert!(!result.is_rejected());
    }

    #[test]
    fn test_two_sample_t_test() {
        // Clearly different samples
        let sample1 = vec![10.0, 11.0, 12.0, 10.5, 11.5];
        let sample2 = vec![1.0, 2.0, 1.5, 2.5, 1.2];
        let result = two_sample_t_test(&sample1, &sample2, 0.05);
        assert!(result.is_rejected());

        // Similar samples
        let sample1 = vec![5.0, 5.1, 4.9, 5.2, 4.8];
        let sample2 = vec![5.05, 5.15, 4.95, 5.1, 4.85];
        let result = two_sample_t_test(&sample1, &sample2, 0.05);
        assert!(!result.is_rejected());
    }

    /// Exercises `robustness` and `falsification_criteria` on an
    /// `EnergyConservationHypothesis` built with `new(100.0, 0.01)`.
    #[test]
    fn test_energy_conservation_hypothesis() {
        let law = EnergyConservationHypothesis::new(100.0, 0.01);

        // 0.5% drift: robustness must be positive.
        let within = law.robustness(&100.5);
        assert!(within > 0.0);

        // 10% drift: robustness must be negative.
        let beyond = law.robustness(&110.0);
        assert!(beyond < 0.0);

        // Exactly one falsification criterion, on the "energy_drift" metric.
        let listed = law.falsification_criteria();
        assert_eq!(listed.len(), 1);
        assert_eq!(listed[0].metric, "energy_drift");
    }

    // -------------------------------------------------------------------------
    // Additional Tests for Coverage
    // -------------------------------------------------------------------------

    /// Covers the `Le`, `Ge`, `Eq` and `Ne` operators of
    /// `FalsificationCriteria::check` against a threshold of 10.0.
    #[test]
    fn test_falsification_criteria_all_operators() {
        // All criteria share the metric name and threshold; only the operator varies.
        let with_operator = |operator: ComparisonOp| FalsificationCriteria {
            metric: String::from("test"),
            operator,
            threshold: 10.0,
        };

        // Le (less than or equal)
        let le = with_operator(ComparisonOp::Le);
        assert!(le.check(9.0));
        assert!(le.check(10.0));
        assert!(!le.check(11.0));

        // Ge (greater than or equal)
        let ge = with_operator(ComparisonOp::Ge);
        assert!(!ge.check(9.0));
        assert!(ge.check(10.0));
        assert!(ge.check(11.0));

        // Eq: the threshold itself matches, 10.1 does not
        let eq = with_operator(ComparisonOp::Eq);
        assert!(eq.check(10.0));
        assert!(!eq.check(10.1));

        // Ne: the mirror image of Eq
        let ne = with_operator(ComparisonOp::Ne);
        assert!(!ne.check(10.0));
        assert!(ne.check(10.1));
    }

    /// `std_error` must be positive for a spread-out sample and zero for an
    /// empty one.
    #[test]
    fn test_stats_std_error() {
        let sample = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        assert!(std_error(&sample) > 0.0);

        let no_data: Vec<f64> = Vec::new();
        assert!(std_error(&no_data).abs() < f64::EPSILON);
    }

    /// `chi_square_test` against a uniform expectation of 25 per bin:
    /// strongly skewed counts must be rejected, near-uniform counts must not.
    #[test]
    fn test_stats_chi_square_test() {
        let uniform = [25.0; 4];

        // Clearly different distribution.
        let skewed = [10.0, 20.0, 30.0, 40.0];
        assert!(chi_square_test(&skewed, &uniform, 0.05).is_rejected());

        // Similar distribution.
        let nearly_uniform = [24.0, 26.0, 24.0, 26.0];
        assert!(!chi_square_test(&nearly_uniform, &uniform, 0.05).is_rejected());
    }

    /// Degenerate inputs to `chi_square_test` must never reject: mismatched
    /// lengths yield a p-value of 1.0, and empty inputs are not rejected.
    #[test]
    fn test_chi_square_test_edge_cases() {
        // Observed and expected have different lengths.
        let mismatched = chi_square_test(&[1.0, 2.0], &[1.0], 0.05);
        assert!(!mismatched.is_rejected());
        assert!((mismatched.p_value() - 1.0).abs() < f64::EPSILON);

        // Both inputs empty.
        let empty = chi_square_test(&[], &[], 0.05);
        assert!(!empty.is_rejected());
    }

    /// Spot-checks `normal_cdf_pub` at 0 (≈ 0.5) and at ±3 (above 0.99 and
    /// below 0.01 respectively).
    #[test]
    fn test_stats_normal_cdf_pub() {
        let at_zero = normal_cdf_pub(0.0);
        assert!((at_zero - 0.5).abs() < 0.01);

        let upper_tail = normal_cdf_pub(3.0);
        assert!(upper_tail > 0.99);

        let lower_tail = normal_cdf_pub(-3.0);
        assert!(lower_tail < 0.01);
    }

    /// Checks the hypothesis ids (and, where set, the expected value, run
    /// count and alpha) produced by four `ml_hypotheses` constructors.
    #[test]
    fn test_ml_hypotheses_all() {
        let gradient = ml_hypotheses::gradient_bounded(10.0);
        assert_eq!(gradient.hypothesis_id, "H0-TRAIN-03");
        assert!((gradient.expected_value - 10.0).abs() < f64::EPSILON);

        let finite = ml_hypotheses::params_finite();
        assert_eq!(finite.hypothesis_id, "H0-TRAIN-05");

        let latency = ml_hypotheses::latency_sla(100.0);
        assert_eq!(latency.hypothesis_id, "H0-PRED-03");
        assert!((latency.expected_value - 100.0).abs() < f64::EPSILON);

        let significance = ml_hypotheses::statistical_significance();
        assert_eq!(significance.hypothesis_id, "H0-MULTI-05");
        assert_eq!(significance.n_runs, 5);
        assert!((significance.alpha - 0.05).abs() < f64::EPSILON);
    }

    /// `with_alpha` keeps an in-range value and clamps out-of-range ones:
    /// 0.0001 ends up as 0.001 and 0.5 ends up as 0.1.
    #[test]
    fn test_nullification_test_with_alpha() {
        // Alpha stored on a fresh test after requesting `requested`.
        let alpha_after = |requested: f64| {
            NullificationTest::new("H0-TEST")
                .with_alpha(requested)
                .alpha
        };

        assert!((alpha_after(0.01) - 0.01).abs() < f64::EPSILON);

        // Clamping: below the minimum, then above the maximum.
        assert!((alpha_after(0.0001) - 0.001).abs() < f64::EPSILON);
        assert!((alpha_after(0.5) - 0.1).abs() < f64::EPSILON);
    }

    /// A report holding a single non-rejected result must summarise as "PASS".
    #[test]
    fn test_nullification_report_all_pass() {
        let not_rejected = NullificationResult {
            hypothesis_id: String::from("H0-1"),
            rejected: false,
            p_value: 0.5,
            confidence_interval: (0.0, 1.0),
            effect_size: 0.1,
            observations: Vec::new(),
        };

        let mut report = NullificationReport::new();
        report.add(not_rejected);

        assert_eq!(report.summary.status, "PASS");
    }

    /// A report holding a single rejected result must summarise as "FAIL".
    #[test]
    fn test_nullification_report_all_fail() {
        let rejected = NullificationResult {
            hypothesis_id: String::from("H0-1"),
            rejected: true,
            p_value: 0.01,
            confidence_interval: (0.0, 1.0),
            effect_size: 0.8,
            observations: Vec::new(),
        };

        let mut report = NullificationReport::new();
        report.add(rejected);

        assert_eq!(report.summary.status, "FAIL");
    }

    /// Evaluating with no observations must give a confidence interval whose
    /// two bounds are both zero.
    #[test]
    fn test_bootstrap_ci_empty() {
        let test = NullificationTest::new("H0-TEST");
        let outcome = test.evaluate(&[]);

        let interval = &outcome.confidence_interval;
        assert!(interval.0.abs() < f64::EPSILON);
        assert!(interval.1.abs() < f64::EPSILON);
    }

    /// Evaluating a single observation must report a p-value of 1.0.
    #[test]
    fn test_compute_stats_single_value() {
        let test = NullificationTest::new("H0-TEST").with_expected(0.0);
        let outcome = test.evaluate(&[1.0]);
        let gap = (outcome.p_value - 1.0).abs();
        assert!(gap < f64::EPSILON);
    }

    /// Zero-variance observations: the p-value must be 1.0 when they equal
    /// the expected value and 0.0 when they differ from it.
    #[test]
    fn test_compute_stats_no_variance() {
        // Every observation equals the expected value.
        let matching = NullificationTest::new("H0-TEST").with_expected(5.0);
        let outcome = matching.evaluate(&[5.0; 5]);
        assert!((outcome.p_value - 1.0).abs() < f64::EPSILON);

        // Every observation is identical but differs from the expected value.
        let differing = NullificationTest::new("H0-TEST").with_expected(0.0);
        let outcome = differing.evaluate(&[10.0; 5]);
        assert!(outcome.p_value.abs() < f64::EPSILON);
    }

    /// Degenerate inputs to `one_sample_t_test`: a single value gives a
    /// non-rejection with p-value 1.0, and a constant sample equal to the
    /// compared value is not rejected.
    #[test]
    fn test_one_sample_t_test_edge_cases() {
        // Single value.
        let lone = one_sample_t_test(&[5.0], 0.0, 0.05);
        assert!(!lone.is_rejected());
        assert!((lone.p_value() - 1.0).abs() < f64::EPSILON);

        // No variance.
        let constant = one_sample_t_test(&[5.0; 5], 5.0, 0.05);
        assert!(!constant.is_rejected());
    }

    /// Degenerate inputs to `two_sample_t_test` must not reject: one value
    /// per sample, and two constant samples holding different values.
    #[test]
    fn test_two_sample_t_test_edge_cases() {
        // Single value in each sample.
        let singletons = two_sample_t_test(&[5.0], &[10.0], 0.05);
        assert!(!singletons.is_rejected());

        // No variance in either sample, although the values differ.
        let fives = [5.0; 5];
        let tens = [10.0; 5];
        let constants = two_sample_t_test(&fives, &tens, 0.05);
        assert!(!constants.is_rejected()); // se is near zero
    }

    /// `predict(&100.0)` on a hypothesis built with `new(100.0, 0.01)` must
    /// yield exactly one prediction whose value is 100.0.
    #[test]
    fn test_energy_conservation_hypothesis_predict() {
        let law = EnergyConservationHypothesis::new(100.0, 0.01);
        let predicted = law.predict(&100.0);

        assert_eq!(predicted.len(), 1);
        let first = predicted.values[0];
        assert!((first - 100.0).abs() < f64::EPSILON);
    }

    /// `null_hypothesis_test` must not reject for empty inputs, nor for three
    /// observations scattered closely around three predictions of 100.0.
    #[test]
    fn test_energy_conservation_hypothesis_null_test() {
        let law = EnergyConservationHypothesis::new(100.0, 0.01);

        // Empty predictions and observations.
        let no_predictions = Predictions::new();
        let no_observations = Observations::new();
        let outcome = law.null_hypothesis_test(&no_predictions, &no_observations, 0.05);
        assert!(!outcome.is_rejected());

        // With data: three identical predictions ...
        let mut predictions = Predictions::new();
        for _ in 0..3 {
            predictions.add(100.0, 0.1);
        }

        // ... and three observations near them.
        let mut observations = Observations::new();
        for observed in [100.5, 99.5, 100.2] {
            observations.add(observed, 0.1);
        }

        // Similar values, so the null hypothesis should not be rejected.
        let outcome = law.null_hypothesis_test(&predictions, &observations, 0.05);
        assert!(!outcome.is_rejected());
    }

    /// A default-constructed `Predictions` is empty and has length 0.
    #[test]
    fn test_predictions_default() {
        let fresh = Predictions::default();
        assert!(fresh.is_empty());
        assert_eq!(fresh.len(), 0);
    }

    /// A default-constructed `Observations` is empty and has length 0.
    #[test]
    fn test_observations_default() {
        let fresh = Observations::default();
        assert!(fresh.is_empty());
        assert_eq!(fresh.len(), 0);
    }

    /// The `Debug` rendering of a criterion must mention its metric name.
    #[test]
    fn test_falsification_criteria_debug() {
        let criterion = FalsificationCriteria::less_than("test", 0.01);
        assert!(format!("{:?}", criterion).contains("test"));
    }

    /// `ComparisonOp` equality: a variant equals itself and differs from
    /// another variant.
    #[test]
    fn test_comparison_op_eq() {
        let less_than = ComparisonOp::Lt;
        assert_eq!(less_than, ComparisonOp::Lt);
        assert_ne!(less_than, ComparisonOp::Gt);
    }

    /// Cloning a `Rejected` result must yield a result that is still rejected.
    #[test]
    fn test_nhst_result_clone() {
        let original = NHSTResult::Rejected {
            p_value: 0.01,
            effect_size: 0.5,
            test_statistic: 2.5,
        };
        assert!(original.clone().is_rejected());
    }

    /// Cloning a `NullificationResult` must carry over the hypothesis id and
    /// the `rejected` flag.
    #[test]
    fn test_nullification_result_clone() {
        let original = NullificationResult {
            hypothesis_id: String::from("H0-TEST"),
            rejected: true,
            p_value: 0.01,
            confidence_interval: (0.5, 1.5),
            effect_size: 0.8,
            observations: vec![1.0, 2.0],
        };

        let copy = original.clone();
        assert_eq!(copy.hypothesis_id, "H0-TEST");
        assert!(copy.rejected);
    }

    /// Cloning an `EnergyConservationHypothesis` must carry over
    /// `initial_energy`.
    #[test]
    fn test_energy_conservation_clone() {
        let original = EnergyConservationHypothesis::new(100.0, 0.01);
        let copy = original.clone();
        let gap = (copy.initial_energy - 100.0).abs();
        assert!(gap < f64::EPSILON);
    }
}

#[cfg(test)]
mod proptests {
    use super::stats::*;
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// Falsification: the mean of `n` copies of a value is that value
        /// (to within 1e-10).
        #[test]
        fn prop_mean_constant(value in -1000.0f64..1000.0, n in 1usize..100) {
            let repeated = vec![value; n];
            let gap = (mean(&repeated) - value).abs();
            prop_assert!(gap < 1e-10);
        }

        /// Falsification: the variance of `n >= 2` copies of a value is zero
        /// (to within 1e-10).
        #[test]
        fn prop_variance_constant(value in -1000.0f64..1000.0, n in 2usize..100) {
            let repeated = vec![value; n];
            let spread = variance(&repeated);
            prop_assert!(spread.abs() < 1e-10);
        }

        /// Falsification: robustness is positive whenever the applied
        /// relative drift is strictly below the tolerance.
        #[test]
        fn prop_robustness_sign(
            initial in 1.0f64..1000.0,
            tolerance in 0.001f64..0.1,
            drift_factor in 0.0f64..0.2,
        ) {
            let law = EnergyConservationHypothesis::new(initial, tolerance);
            // Scale the initial energy up by the drawn relative drift.
            let drifted = initial * (1.0 + drift_factor);
            let margin = law.robustness(&drifted);

            // Only the strictly-inside case is asserted;
            // drift_factor == tolerance gives robustness ≈ 0.
            if drift_factor < tolerance {
                prop_assert!(margin > 0.0,
                    "Expected positive robustness for drift {} < tolerance {}",
                    drift_factor, tolerance);
            }
        }
    }
}