//! Differential comparison of two byte streams.
//!
//! All functions operate on raw `&[u8]` — no session I/O, no formatting.
//! The CLI layer handles loading, printing, and serialization.
//!
//! # Two-sample philosophy
//!
//! Per-stream metrics (Shannon H, autocorrelation, etc.) are computed by
//! [`analysis::full_analysis`] and displayed side-by-side by the CLI.
//! This module adds **cross-stream** statistics that directly test whether
//! two byte streams differ: two-sample KS, chi-squared homogeneity,
//! Cliff's delta, and Mann-Whitney U.
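//!
//! # Example
//!
//! A minimal usage sketch. The import path assumes this module is exported
//! as `openentropy_core::comparison` (per the file path); the block is
//! `ignore`d for that reason.
//!
//! ```ignore
//! use openentropy_core::comparison::compare;
//!
//! let report = compare("a", &[0u8, 1, 2, 3], "b", &[4u8, 5, 6, 7]);
//! // Cross-stream verdicts live under `two_sample`:
//! println!("KS D = {}", report.two_sample.ks_statistic);
//! ```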

use serde::Serialize;

use crate::analysis::{self, SourceAnalysis};
use crate::conditioning::{quick_min_entropy, quick_shannon};
use crate::math;

// ---------------------------------------------------------------------------
// Result types
// ---------------------------------------------------------------------------

/// Full comparison result between two byte streams.
#[derive(Debug, Clone, Serialize)]
pub struct ComparisonResult {
    pub label_a: String,
    pub label_b: String,
    pub size_a: usize,
    pub size_b: usize,
    pub aggregate: AggregateDelta,
    pub two_sample: TwoSampleTests,
    pub temporal: TemporalAnalysis,
    pub digram: DigramAnalysis,
    pub markov: MarkovAnalysis,
    pub multi_lag: MultiLagAnalysis,
    pub run_lengths: RunLengthComparison,
}

/// Aggregate statistics delta between two streams.
#[derive(Debug, Clone, Serialize)]
pub struct AggregateDelta {
    pub shannon_a: f64,
    pub shannon_b: f64,
    pub min_entropy_a: f64,
    pub min_entropy_b: f64,
    pub mean_a: f64,
    pub mean_b: f64,
    pub variance_a: f64,
    pub variance_b: f64,
    pub bit_bias_a: f64,
    pub bit_bias_b: f64,
    pub per_bit_bias_a: [f64; 8],
    pub per_bit_bias_b: [f64; 8],
    pub chi_squared_a: f64,
    pub chi_squared_b: f64,
    pub ks_statistic_a: f64,
    pub ks_statistic_b: f64,
    /// Cohen's d effect size for byte mean difference (assumes normality —
    /// use [`TwoSampleTests::cliffs_delta`] for the non-parametric alternative).
    pub cohens_d: f64,
}

/// Two-sample statistical tests that directly compare stream A against stream B.
#[derive(Debug, Clone, Serialize)]
pub struct TwoSampleTests {
    /// Two-sample Kolmogorov-Smirnov statistic: max|F_A(x) - F_B(x)|.
    /// Tests whether both samples come from the same distribution.
    pub ks_statistic: f64,
    /// Approximate p-value for the two-sample KS test.
    pub ks_p_value: f64,
    /// Chi-squared homogeneity statistic comparing the 256-bin byte histograms.
    /// Tests whether A and B have the same byte frequency distribution.
    pub chi2_homogeneity: f64,
    /// Degrees of freedom for the homogeneity test (255 for byte data).
    pub chi2_df: usize,
    /// Approximate p-value for the chi-squared homogeneity test.
    pub chi2_p_value: f64,
    /// Whether the chi-squared approximation is reliable (expected count >= 5
    /// in every cell of the 2×k contingency table).
    pub chi2_reliable: bool,
    /// Cliff's delta: non-parametric effect size for ordinal data.
    /// Range [-1, 1]. 0 = no difference. Appropriate for discrete byte values
    /// (unlike Cohen's d which assumes normality).
    pub cliffs_delta: f64,
    /// Mann-Whitney U statistic (normalized to [0, 1]).
    /// Tests whether values from A tend to be larger or smaller than values from B.
    /// 0.5 = no difference.
    pub mann_whitney_u: f64,
    /// Approximate p-value for the Mann-Whitney U test (two-tailed, z-approximation).
    /// Valid for large samples (n, m >> 20).
    pub mann_whitney_p_value: f64,
}

/// A single anomalous window detected during temporal analysis.
#[derive(Debug, Clone, Serialize)]
pub struct WindowAnomaly {
    pub offset: usize,
    pub mean: f64,
    /// Z-score vs theoretical Uniform(0,255) parameters, not empirical.
    pub z_score: f64,
}

/// Sliding-window temporal anomaly detection using theoretical parameters.
///
/// Window means are compared against the theoretical mean (127.5) and
/// theoretical standard deviation (sqrt(Var(U(0,255)) / window_size)) rather
/// than empirical estimates. This avoids the circular-statistics pitfall where
/// z-scores computed from empirical parameters always produce ~0.27% outliers.
#[derive(Debug, Clone, Serialize)]
pub struct TemporalAnalysis {
    pub window_size: usize,
    pub anomaly_count_a: usize,
    pub anomaly_count_b: usize,
    pub max_z_a: f64,
    pub max_z_b: f64,
    pub top_anomalies_a: Vec<WindowAnomaly>,
    pub top_anomalies_b: Vec<WindowAnomaly>,
    /// Per-window Shannon entropy. Capped at 1024 entries in each vec to keep
    /// JSON output bounded; for larger sessions the windows are sub-sampled.
    pub windowed_entropy_a: Vec<f64>,
    pub windowed_entropy_b: Vec<f64>,
}

/// Digram (byte bigram) chi-squared uniformity.
///
/// Trigrams are intentionally omitted: hashing 16M possible trigrams into
/// fewer bins produces systematic chi-squared bias from hash collisions,
/// making the statistic unreliable. Digrams with 65536 exact bins are sound
/// when the sample has >= ~320KB (expected count >= 5 per bin).
#[derive(Debug, Clone, Serialize)]
pub struct DigramAnalysis {
    /// Chi-squared statistic for digram uniformity, or NaN if insufficient data.
    pub chi2_a: f64,
    /// Chi-squared statistic for digram uniformity, or NaN if insufficient data.
    pub chi2_b: f64,
    /// Whether the sample was large enough for the chi-squared approximation.
    pub sufficient_data: bool,
    /// Minimum sample size needed (bytes) for expected >= 5 per bin.
    pub min_sample_bytes: usize,
}

/// Per-bit Markov transition probabilities.
#[derive(Debug, Clone, Serialize)]
pub struct MarkovAnalysis {
    /// Per-bit P(to|from): `transitions[bit][from][to]`.
    pub transitions_a: [[[f64; 2]; 2]; 8],
    pub transitions_b: [[[f64; 2]; 2]; 8],
}

/// Autocorrelation at multiple lags.
#[derive(Debug, Clone, Serialize)]
pub struct MultiLagAnalysis {
    pub lags: Vec<usize>,
    pub autocorr_a: Vec<f64>,
    pub autocorr_b: Vec<f64>,
}

/// Byte run-length distributions.
#[derive(Debug, Clone, Serialize)]
pub struct RunLengthComparison {
    /// (run_length, count) pairs for stream A.
    pub distribution_a: Vec<(usize, usize)>,
    /// (run_length, count) pairs for stream B.
    pub distribution_b: Vec<(usize, usize)>,
}

// ---------------------------------------------------------------------------
// Top-level comparison
// ---------------------------------------------------------------------------

/// Compare two byte streams and produce a full differential report.
///
/// This is the standalone entry point that computes everything from scratch.
/// If you already have [`SourceAnalysis`] results, prefer [`compare_with_analysis`]
/// to avoid redundant computation.
pub fn compare(label_a: &str, data_a: &[u8], label_b: &str, data_b: &[u8]) -> ComparisonResult {
    ComparisonResult {
        label_a: label_a.to_string(),
        label_b: label_b.to_string(),
        size_a: data_a.len(),
        size_b: data_b.len(),
        aggregate: aggregate_delta(data_a, data_b),
        two_sample: two_sample_tests(data_a, data_b),
        temporal: temporal_analysis(data_a, data_b, 1024, 3.0),
        digram: digram_analysis(data_a, data_b),
        markov: markov_analysis(data_a, data_b),
        multi_lag: multi_lag_analysis(data_a, data_b),
        run_lengths: run_length_comparison(data_a, data_b),
    }
}

/// Compare two byte streams, reusing pre-computed [`SourceAnalysis`] results.
///
/// This avoids re-computing `distribution_stats` and `bit_bias` which are
/// already available from [`analysis::full_analysis`].
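///
/// # Example
///
/// A minimal sketch of the reuse pattern. It assumes `full_analysis` is
/// callable as below (as the tests at the bottom of this file do) and that
/// the crate-root paths are public; the block is `ignore`d for that reason.
///
/// ```ignore
/// use openentropy_core::{analysis, comparison};
///
/// let a = vec![0u8; 1024];
/// let b = vec![255u8; 1024];
/// let ana_a = analysis::full_analysis("a", &a);
/// let ana_b = analysis::full_analysis("b", &b);
/// let report = comparison::compare_with_analysis("a", &a, &ana_a, "b", &b, &ana_b);
/// assert_eq!(report.size_a, 1024);
/// ```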
pub fn compare_with_analysis(
    label_a: &str,
    data_a: &[u8],
    analysis_a: &SourceAnalysis,
    label_b: &str,
    data_b: &[u8],
    analysis_b: &SourceAnalysis,
) -> ComparisonResult {
    ComparisonResult {
        label_a: label_a.to_string(),
        label_b: label_b.to_string(),
        size_a: data_a.len(),
        size_b: data_b.len(),
        aggregate: aggregate_delta_from_analysis(analysis_a, analysis_b),
        two_sample: two_sample_tests(data_a, data_b),
        temporal: temporal_analysis(data_a, data_b, 1024, 3.0),
        digram: digram_analysis(data_a, data_b),
        markov: markov_analysis(data_a, data_b),
        multi_lag: multi_lag_analysis(data_a, data_b),
        run_lengths: run_length_comparison(data_a, data_b),
    }
}

// ---------------------------------------------------------------------------
// Aggregate delta
// ---------------------------------------------------------------------------

/// Compute aggregate statistics for both streams and Cohen's d effect size.
pub fn aggregate_delta(data_a: &[u8], data_b: &[u8]) -> AggregateDelta {
    let dist_a = analysis::distribution_stats(data_a);
    let dist_b = analysis::distribution_stats(data_b);
    let bias_a = analysis::bit_bias(data_a);
    let bias_b = analysis::bit_bias(data_b);

    build_aggregate(
        quick_shannon(data_a),
        quick_shannon(data_b),
        quick_min_entropy(data_a),
        quick_min_entropy(data_b),
        &dist_a,
        &dist_b,
        &bias_a,
        &bias_b,
    )
}

/// Build aggregate delta from pre-computed analysis results.
fn aggregate_delta_from_analysis(a: &SourceAnalysis, b: &SourceAnalysis) -> AggregateDelta {
    build_aggregate(
        a.shannon_entropy,
        b.shannon_entropy,
        a.min_entropy,
        b.min_entropy,
        &a.distribution,
        &b.distribution,
        &a.bit_bias,
        &b.bit_bias,
    )
}

#[allow(clippy::too_many_arguments)]
fn build_aggregate(
    shannon_a: f64,
    shannon_b: f64,
    min_entropy_a: f64,
    min_entropy_b: f64,
    dist_a: &analysis::DistributionResult,
    dist_b: &analysis::DistributionResult,
    bias_a: &analysis::BitBiasResult,
    bias_b: &analysis::BitBiasResult,
) -> AggregateDelta {
    // Cohen's d = (mean_a - mean_b) / pooled_std
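    // Note: sqrt((var_a + var_b) / 2) is the equal-weight pooled SD; it
    // matches the textbook pooled estimate exactly when both streams have
    // the same length.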
    let pooled_std = ((dist_a.variance + dist_b.variance) / 2.0).sqrt();
    let cohens_d = if pooled_std > 1e-10 {
        (dist_a.mean - dist_b.mean) / pooled_std
    } else {
        0.0
    };

    AggregateDelta {
        shannon_a,
        shannon_b,
        min_entropy_a,
        min_entropy_b,
        mean_a: dist_a.mean,
        mean_b: dist_b.mean,
        variance_a: dist_a.variance,
        variance_b: dist_b.variance,
        bit_bias_a: bias_a.overall_bias,
        bit_bias_b: bias_b.overall_bias,
        per_bit_bias_a: bias_a.bit_probabilities,
        per_bit_bias_b: bias_b.bit_probabilities,
        chi_squared_a: bias_a.chi_squared,
        chi_squared_b: bias_b.chi_squared,
        ks_statistic_a: dist_a.ks_statistic,
        ks_statistic_b: dist_b.ks_statistic,
        cohens_d,
    }
}

// ---------------------------------------------------------------------------
// Two-sample tests
// ---------------------------------------------------------------------------

/// Run proper two-sample statistical tests comparing stream A directly to stream B.
pub fn two_sample_tests(data_a: &[u8], data_b: &[u8]) -> TwoSampleTests {
    let (ks_stat, ks_p) = two_sample_ks(data_a, data_b);
    let (chi2, chi2_df, chi2_p, chi2_reliable) = chi2_homogeneity(data_a, data_b);
    let (mw_u, mw_p) = mann_whitney_u_test(data_a, data_b);

    TwoSampleTests {
        ks_statistic: ks_stat,
        ks_p_value: ks_p,
        chi2_homogeneity: chi2,
        chi2_df,
        chi2_p_value: chi2_p,
        chi2_reliable,
        cliffs_delta: cliffs_delta(data_a, data_b),
        mann_whitney_u: mw_u,
        mann_whitney_p_value: mw_p,
    }
}

/// Two-sample Kolmogorov-Smirnov test.
///
/// Returns (D_statistic, approximate_p_value).
/// D = max|F_A(x) - F_B(x)| over all byte values 0..=255.
fn two_sample_ks(data_a: &[u8], data_b: &[u8]) -> (f64, f64) {
    if data_a.is_empty() || data_b.is_empty() {
        return (0.0, 1.0);
    }

    // Build CDFs from byte histograms.
    let mut hist_a = [0u64; 256];
    let mut hist_b = [0u64; 256];
    for &b in data_a {
        hist_a[b as usize] += 1;
    }
    for &b in data_b {
        hist_b[b as usize] += 1;
    }

    let n_a = data_a.len() as f64;
    let n_b = data_b.len() as f64;
    let mut cdf_a = 0.0;
    let mut cdf_b = 0.0;
    let mut d_max = 0.0f64;

    for i in 0..256 {
        cdf_a += hist_a[i] as f64 / n_a;
        cdf_b += hist_b[i] as f64 / n_b;
        let d = (cdf_a - cdf_b).abs();
        if d > d_max {
            d_max = d;
        }
    }

    // Asymptotic p-value via the Kolmogorov distribution survival function:
    //   P(K > lambda) = 2 * sum_{k=1}^{inf} (-1)^{k+1} * exp(-2*k^2*lambda^2)
    // where lambda = D * sqrt(n_a * n_b / (n_a + n_b)).
    // The alternating series converges to machine precision within ~10 terms.
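    // Worked example for orientation (illustrative figures, not measured
    // here): D = 0.02 with n_a = n_b = 10_000 gives n_eff = 5_000,
    // lambda ≈ 0.02 * 70.7 ≈ 1.41, and P(K > 1.41) ≈ 0.036.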
    let n_eff = (n_a * n_b) / (n_a + n_b);
    let lambda = d_max * n_eff.sqrt();
    let p_value = kolmogorov_survival(lambda);

    (d_max, p_value)
}

/// Chi-squared test of homogeneity between two byte histograms.
///
/// Tests H0: both samples come from the same (unknown) distribution.
/// Returns (chi2_statistic, degrees_of_freedom, approximate_p_value, reliable).
/// The `reliable` flag is false when any cell's expected count is below 5
/// (Cochran's rule), indicating the chi-squared approximation may be poor.
fn chi2_homogeneity(data_a: &[u8], data_b: &[u8]) -> (f64, usize, f64, bool) {
    if data_a.is_empty() || data_b.is_empty() {
        return (0.0, 255, 1.0, false);
    }

    let mut hist_a = [0u64; 256];
    let mut hist_b = [0u64; 256];
    for &b in data_a {
        hist_a[b as usize] += 1;
    }
    for &b in data_b {
        hist_b[b as usize] += 1;
    }

    let n_a = data_a.len() as f64;
    let n_b = data_b.len() as f64;
    let n_total = n_a + n_b;

    let mut chi2 = 0.0;
    let mut df = 0usize;
    let mut reliable = true;

    for i in 0..256 {
        let pooled = hist_a[i] + hist_b[i];
        if pooled == 0 {
            continue; // empty bin — skip, don't count in df
        }
        df += 1;

        let expected_a = pooled as f64 * n_a / n_total;
        let expected_b = pooled as f64 * n_b / n_total;

        if expected_a < CHI2_MIN_EXPECTED || expected_b < CHI2_MIN_EXPECTED {
            reliable = false;
        }

        let diff_a = hist_a[i] as f64 - expected_a;
        let diff_b = hist_b[i] as f64 - expected_b;

        chi2 += diff_a * diff_a / expected_a + diff_b * diff_b / expected_b;
    }

    // df = (rows-1)*(cols-1) = (non_empty_bins - 1) * (2 - 1)
    df = df.saturating_sub(1);
    let p_value = math::chi_squared_survival(chi2, df);

    (chi2, df, p_value, reliable)
}

/// Cliff's delta: non-parametric effect size for ordinal data.
///
/// delta = (P(A > B) - P(A < B)), range [-1, 1].
/// Computed efficiently via histograms in O(256) rather than O(n*m).
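///
/// # Example
///
/// A tiny sketch; the block is `ignore`d because the public import path is
/// an assumption.
///
/// ```ignore
/// use openentropy_core::comparison::cliffs_delta;
///
/// // Every A value is below every B value, so delta = -1.0.
/// assert_eq!(cliffs_delta(&[0u8, 0], &[1u8, 1]), -1.0);
/// ```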
pub fn cliffs_delta(data_a: &[u8], data_b: &[u8]) -> f64 {
    if data_a.is_empty() || data_b.is_empty() {
        return 0.0;
    }

    let mut hist_a = [0u64; 256];
    let mut hist_b = [0u64; 256];
    for &b in data_a {
        hist_a[b as usize] += 1;
    }
    for &b in data_b {
        hist_b[b as usize] += 1;
    }

    // Prefix sum of hist_b to efficiently compute P(A > B) and P(A < B).
    let mut cum_b = [0u64; 257]; // cum_b[i] = count of B values < i
    for i in 0..256 {
        cum_b[i + 1] = cum_b[i] + hist_b[i];
    }

    let mut greater = 0i128; // count(A > B) - count(A < B)
    let n_a = data_a.len() as i128;
    let n_b = data_b.len() as i128;

    for val in 0..256 {
        let count_a = hist_a[val] as i128;
        if count_a == 0 {
            continue;
        }
        let b_less = cum_b[val] as i128; // B values strictly less than val
        let b_greater = n_b - cum_b[val + 1] as i128; // B values strictly greater
        greater += count_a * (b_less - b_greater);
    }

    greater as f64 / (n_a * n_b) as f64
}

/// Mann-Whitney U test with two-tailed z-approximation p-value.
///
/// Returns (U_normalized, p_value) where U_normalized = U / (n_a * n_b) ∈ [0,1]
/// and 0.5 = no difference. The p-value uses the standard large-sample
/// normal approximation with tie correction.
/// Computed efficiently via histograms.
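///
/// Tiny worked case: for `a = [0, 0]` and `b = [1, 1]` there are no A > B
/// pairs and no ties, so U_A = 0 and U_normalized = 0.0.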
fn mann_whitney_u_test(data_a: &[u8], data_b: &[u8]) -> (f64, f64) {
    if data_a.is_empty() || data_b.is_empty() {
        return (0.5, 1.0);
    }

    let mut hist_a = [0u64; 256];
    let mut hist_b = [0u64; 256];
    for &b in data_a {
        hist_a[b as usize] += 1;
    }
    for &b in data_b {
        hist_b[b as usize] += 1;
    }

    // U_A = sum over all (a_i, b_j) pairs where a_i > b_j, + 0.5 * ties
    let mut cum_b = [0u64; 257];
    for i in 0..256 {
        cum_b[i + 1] = cum_b[i] + hist_b[i];
    }

    let mut u: f64 = 0.0;
    for val in 0..256 {
        let count_a = hist_a[val] as f64;
        if count_a == 0.0 {
            continue;
        }
        let b_less = cum_b[val] as f64;
        let b_equal = hist_b[val] as f64;
        u += count_a * (b_less + 0.5 * b_equal);
    }

    let n_a = data_a.len() as f64;
    let n_b = data_b.len() as f64;
    let u_norm = u / (n_a * n_b);

    // Z-approximation: z = (U - mu_U) / sigma_U
    // mu_U = n_a * n_b / 2
    // sigma_U = sqrt(n_a * n_b * (n_a + n_b + 1) / 12 - tie_correction)
    // Tie correction: n_a * n_b / (12 * N * (N-1)) * sum(t^3 - t)
    // where t = size of each tied group, N = n_a + n_b.
    let n_total = n_a + n_b;
    let mu = n_a * n_b / 2.0;

    let mut tie_sum = 0.0;
    for i in 0..256 {
        let t = (hist_a[i] + hist_b[i]) as f64;
        if t > 1.0 {
            tie_sum += t * t * t - t;
        }
    }

    let sigma_sq = n_a * n_b / 12.0 * ((n_total + 1.0) - tie_sum / (n_total * (n_total - 1.0)));
    let p_value = if sigma_sq > 0.0 {
        let z = (u - mu).abs() / sigma_sq.sqrt();
        // Two-tailed p-value from standard normal: p = 2 * Phi(-|z|)
        // Using the complementary error function: erfc(x/sqrt(2))
        math::erfc(z / std::f64::consts::SQRT_2)
    } else {
        1.0
    };

    (u_norm, p_value.clamp(0.0, 1.0))
}

// ---------------------------------------------------------------------------
// Temporal analysis
// ---------------------------------------------------------------------------

/// Maximum number of per-window entropy values retained in the result.
/// Keeps JSON output bounded for very large sessions.
const MAX_WINDOWED_ENTROPY: usize = 1024;

/// Variance of the discrete uniform distribution on 0..=255:
/// Var = (n^2 - 1) / 12 with n = 256, i.e. 65535 / 12 = 5461.25.
const UNIFORM_BYTE_VARIANCE: f64 = 65535.0 / 12.0;

/// Theoretical mean of Uniform(0, 255).
const UNIFORM_BYTE_MEAN: f64 = 127.5;

/// Sliding-window anomaly detection over both streams.
///
/// Uses **theoretical** parameters from Uniform(0,255) for z-score computation,
/// avoiding the circular-statistics pitfall of comparing data against itself.
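///
/// Worked numbers for the default 1024-byte window: the theoretical std of
/// the window mean is sqrt(5461.25 / 1024) ≈ 2.31, so with `z_threshold` =
/// 3.0 a window is flagged when its mean falls outside roughly
/// [120.6, 134.4].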
pub fn temporal_analysis(
    data_a: &[u8],
    data_b: &[u8],
    window: usize,
    z_threshold: f64,
) -> TemporalAnalysis {
    let (anomalies_a, entropy_a) = windowed_scan(data_a, window, z_threshold);
    let (anomalies_b, entropy_b) = windowed_scan(data_b, window, z_threshold);

    let max_z_a = anomalies_a
        .iter()
        .map(|a| a.z_score.abs())
        .fold(0.0f64, f64::max);
    let max_z_b = anomalies_b
        .iter()
        .map(|a| a.z_score.abs())
        .fold(0.0f64, f64::max);

    let count_a = anomalies_a.len();
    let count_b = anomalies_b.len();

    // Keep top 20 anomalies by |z|.
    let top_a = top_anomalies(anomalies_a, 20);
    let top_b = top_anomalies(anomalies_b, 20);

    TemporalAnalysis {
        window_size: window,
        anomaly_count_a: count_a,
        anomaly_count_b: count_b,
        max_z_a,
        max_z_b,
        top_anomalies_a: top_a,
        top_anomalies_b: top_b,
        windowed_entropy_a: subsample(entropy_a, MAX_WINDOWED_ENTROPY),
        windowed_entropy_b: subsample(entropy_b, MAX_WINDOWED_ENTROPY),
    }
}

/// Scan data in non-overlapping windows using theoretical Uniform(0,255) parameters.
fn windowed_scan(data: &[u8], window: usize, z_threshold: f64) -> (Vec<WindowAnomaly>, Vec<f64>) {
    if data.is_empty() || window == 0 {
        return (Vec::new(), Vec::new());
    }

    let n_windows = data.len() / window;
    if n_windows == 0 {
        return (Vec::new(), Vec::new());
    }

    // Theoretical std of window mean under Uniform(0,255).
    let theoretical_std = (UNIFORM_BYTE_VARIANCE / window as f64).sqrt();

    let mut anomalies = Vec::new();
    let mut entropies = Vec::with_capacity(n_windows);

    for i in 0..n_windows {
        let start = i * window;
        let chunk = &data[start..start + window];
        let sum: f64 = chunk.iter().map(|&b| b as f64).sum();
        let mean = sum / window as f64;
        entropies.push(quick_shannon(chunk));

        // Z-score against theoretical parameters — not circular.
        let z = if theoretical_std > 1e-10 {
            (mean - UNIFORM_BYTE_MEAN) / theoretical_std
        } else {
            0.0
        };

        if z.abs() > z_threshold {
            anomalies.push(WindowAnomaly {
                offset: i * window,
                mean,
                z_score: z,
            });
        }
    }

    (anomalies, entropies)
}

/// Keep top N anomalies by |z_score|.
fn top_anomalies(mut anomalies: Vec<WindowAnomaly>, n: usize) -> Vec<WindowAnomaly> {
    anomalies.sort_by(|a, b| {
        b.z_score
            .abs()
            .partial_cmp(&a.z_score.abs())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    anomalies.truncate(n);
    anomalies
}

/// Uniformly subsample a vector to at most `max` elements.
fn subsample(v: Vec<f64>, max: usize) -> Vec<f64> {
    if v.len() <= max {
        return v;
    }
    let step = v.len() as f64 / max as f64;
    (0..max).map(|i| v[(i as f64 * step) as usize]).collect()
}

// ---------------------------------------------------------------------------
// Digram analysis
// ---------------------------------------------------------------------------

/// Minimum expected count per bin for chi-squared to be valid.
const CHI2_MIN_EXPECTED: f64 = 5.0;

/// Number of possible byte digrams (256^2).
const N_DIGRAM_BINS: usize = 65536;

/// Compute digram chi-squared statistics for both streams.
///
/// Returns NaN for chi-squared values when the sample is too small for the
/// chi-squared approximation to be valid (expected count per bin < 5).
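///
/// # Example
///
/// Sketch of the intended guard; the block is `ignore`d because the public
/// import path is an assumption.
///
/// ```ignore
/// use openentropy_core::comparison::digram_analysis;
///
/// let a = vec![0u8; 1_000]; // far below min_sample_bytes (327,681)
/// let d = digram_analysis(&a, &a);
/// assert!(!d.sufficient_data);
/// assert!(d.chi2_a.is_nan());
/// ```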
pub fn digram_analysis(data_a: &[u8], data_b: &[u8]) -> DigramAnalysis {
    // Need n-1 >= N_DIGRAM_BINS * 5 = 327,680 digrams, so ~327,681 bytes.
    let min_bytes = N_DIGRAM_BINS * CHI2_MIN_EXPECTED as usize + 1;
    let sufficient = data_a.len() >= min_bytes && data_b.len() >= min_bytes;

    DigramAnalysis {
        chi2_a: digram_chi2(data_a),
        chi2_b: digram_chi2(data_b),
        sufficient_data: sufficient,
        min_sample_bytes: min_bytes,
    }
}

/// Chi-squared statistic for byte digram (bigram) uniformity.
///
/// Returns NaN if the sample is too small for the approximation to be valid.
fn digram_chi2(data: &[u8]) -> f64 {
    if data.len() < 2 {
        return f64::NAN;
    }
    let n_digrams = data.len() - 1;
    let expected = n_digrams as f64 / N_DIGRAM_BINS as f64;
    if expected < CHI2_MIN_EXPECTED {
        return f64::NAN;
    }

    let mut counts = vec![0u64; N_DIGRAM_BINS];
    for pair in data.windows(2) {
        let idx = (pair[0] as usize) * 256 + pair[1] as usize;
        counts[idx] += 1;
    }

    counts
        .iter()
        .map(|&c| {
            let diff = c as f64 - expected;
            diff * diff / expected
        })
        .sum()
}

// ---------------------------------------------------------------------------
// Markov analysis
// ---------------------------------------------------------------------------

/// Per-bit first-order Markov transition probabilities.
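///
/// For unbiased, memoryless data every row is near [0.5, 0.5]; a sticky bit
/// shows up as P(1|1) and P(0|0) above 0.5. A minimal sketch (`ignore`d
/// because the public import path is an assumption):
///
/// ```ignore
/// use openentropy_core::comparison::markov_analysis;
///
/// let a = vec![0x55u8; 64]; // constant bytes: bit 0 is always 1
/// let m = markov_analysis(&a, &a);
/// assert_eq!(m.transitions_a[0][1][1], 1.0); // P(bit0 = 1 | bit0 was 1)
/// ```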
pub fn markov_analysis(data_a: &[u8], data_b: &[u8]) -> MarkovAnalysis {
    MarkovAnalysis {
        transitions_a: bit_markov_transitions(data_a),
        transitions_b: bit_markov_transitions(data_b),
    }
}

/// Compute P(to|from) for each of the 8 bit positions.
fn bit_markov_transitions(data: &[u8]) -> [[[f64; 2]; 2]; 8] {
    let mut counts = [[[0u64; 2]; 2]; 8]; // [bit][from][to]

    for pair in data.windows(2) {
        let prev = pair[0];
        let curr = pair[1];
        for bit in 0..8u8 {
            let from = ((prev >> bit) & 1) as usize;
            let to = ((curr >> bit) & 1) as usize;
            counts[bit as usize][from][to] += 1;
        }
    }

    let mut probs = [[[0.0f64; 2]; 2]; 8];
    for (bit, count_bit) in counts.iter().enumerate() {
        for (from, count_from) in count_bit.iter().enumerate() {
            let total = count_from[0] + count_from[1];
            if total > 0 {
                probs[bit][from][0] = count_from[0] as f64 / total as f64;
                probs[bit][from][1] = count_from[1] as f64 / total as f64;
            }
        }
    }

    probs
}

// ---------------------------------------------------------------------------
// Multi-lag autocorrelation
// ---------------------------------------------------------------------------

/// Autocorrelation at standard lags for both streams.
pub fn multi_lag_analysis(data_a: &[u8], data_b: &[u8]) -> MultiLagAnalysis {
    let lags: Vec<usize> = vec![1, 2, 3, 4, 5, 8, 16, 32, 64, 128];

    let profile_a = analysis::autocorrelation_profile(data_a, 128);
    let profile_b = analysis::autocorrelation_profile(data_b, 128);

    let extract = |profile: &analysis::AutocorrResult| -> Vec<f64> {
        lags.iter()
            .map(|&lag| {
                profile
                    .lags
                    .iter()
                    .find(|lc| lc.lag == lag)
                    .map(|lc| lc.correlation)
                    .unwrap_or(0.0)
            })
            .collect()
    };

    MultiLagAnalysis {
        autocorr_a: extract(&profile_a),
        autocorr_b: extract(&profile_b),
        lags,
    }
}

// ---------------------------------------------------------------------------
// Run-length comparison
// ---------------------------------------------------------------------------

/// Byte run-length distributions for both streams.
pub fn run_length_comparison(data_a: &[u8], data_b: &[u8]) -> RunLengthComparison {
    RunLengthComparison {
        distribution_a: run_length_distribution(data_a),
        distribution_b: run_length_distribution(data_b),
    }
}

/// Compute (run_length, count) distribution for consecutive identical bytes.
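///
/// Example: `[7, 7, 7, 9]` has one run of length 3 and one of length 1, so
/// the result is `[(1, 1), (3, 1)]` (sorted by run length).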
fn run_length_distribution(data: &[u8]) -> Vec<(usize, usize)> {
    if data.is_empty() {
        return Vec::new();
    }

    let mut dist: std::collections::BTreeMap<usize, usize> = std::collections::BTreeMap::new();
    let mut current_len = 1usize;

    for i in 1..data.len() {
        if data[i] == data[i - 1] {
            current_len += 1;
        } else {
            *dist.entry(current_len).or_default() += 1;
            current_len = 1;
        }
    }
    // Final run.
    *dist.entry(current_len).or_default() += 1;

    dist.into_iter().collect()
}

// ---------------------------------------------------------------------------
// Shared statistical helpers
// ---------------------------------------------------------------------------

/// Kolmogorov distribution survival function: P(K > lambda).
///
/// Uses the standard alternating series:
///   P(K > x) = 2 * sum_{k=1}^{inf} (-1)^{k+1} * exp(-2*k^2*x^2)
/// Converges to machine precision within ~10 terms for any lambda > 0.
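///
/// Reference point: lambda ≈ 1.358 yields P(K > lambda) ≈ 0.05, the
/// classical 5% critical value of the Kolmogorov distribution.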
fn kolmogorov_survival(lambda: f64) -> f64 {
    if lambda <= 0.0 {
        return 1.0;
    }
    let mut sum = 0.0;
    for k in 1..=100 {
        let kf = k as f64;
        let term = (-2.0 * kf * kf * lambda * lambda).exp();
        if term < 1e-15 {
            break;
        }
        if k % 2 == 1 {
            sum += term;
        } else {
            sum -= term;
        }
    }
    (2.0 * sum).clamp(0.0, 1.0)
}

// The chi-squared survival function, P(X > chi2 | df) computed via the
// regularized incomplete gamma function, lives in `crate::math::chi_squared_survival`.

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::math::{chi_squared_survival, erfc};

    fn pseudo_random(n: usize, seed: u64) -> Vec<u8> {
        let mut data = Vec::with_capacity(n);
        let mut state: u64 = seed;
        for _ in 0..n {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.push((state >> 33) as u8);
        }
        data
    }

    // -- Top-level compare --

    #[test]
    fn compare_two_random_streams() {
        let a = pseudo_random(10_000, 0xdeadbeef);
        let b = pseudo_random(10_000, 0xcafebabe);
        let result = compare("a", &a, "b", &b);
        assert_eq!(result.size_a, 10_000);
        assert_eq!(result.size_b, 10_000);
        assert!(result.aggregate.shannon_a > 7.0);
        assert!(result.aggregate.shannon_b > 7.0);
        assert!(result.aggregate.cohens_d.abs() < 0.5);
    }

    #[test]
    fn compare_biased_vs_random() {
        let a = vec![128u8; 10_000];
        let b = pseudo_random(10_000, 42);
        let result = compare("constant", &a, "random", &b);
        assert!(result.aggregate.shannon_a < 0.01);
        assert!(result.aggregate.shannon_b > 7.0);
        assert!(result.aggregate.variance_a < 0.01);
        assert!(result.aggregate.variance_b > 100.0);
    }

    #[test]
    fn compare_with_precomputed_analysis() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let analysis_a = analysis::full_analysis("a", &a);
        let analysis_b = analysis::full_analysis("b", &b);

        let result = compare_with_analysis("a", &a, &analysis_a, "b", &b, &analysis_b);
        let standalone = compare("a", &a, "b", &b);

        assert!((result.aggregate.shannon_a - standalone.aggregate.shannon_a).abs() < 1e-10);
        assert!((result.aggregate.mean_a - standalone.aggregate.mean_a).abs() < 1e-10);
        assert!((result.aggregate.cohens_d - standalone.aggregate.cohens_d).abs() < 1e-10);
    }

    #[test]
    fn compare_empty_inputs() {
        let result = compare("empty_a", &[], "empty_b", &[]);
        assert_eq!(result.size_a, 0);
        assert_eq!(result.size_b, 0);
        assert!(result.temporal.windowed_entropy_a.is_empty());
        assert!(result.run_lengths.distribution_a.is_empty());
    }

    #[test]
    fn compare_small_inputs() {
        let a = pseudo_random(100, 1);
        let b = pseudo_random(100, 2);
        let result = compare("small_a", &a, "small_b", &b);
        assert_eq!(result.size_a, 100);
        assert!(result.temporal.windowed_entropy_a.is_empty());
        assert!(result.aggregate.shannon_a > 0.0);
    }

    // -- Two-sample tests --

    #[test]
    fn two_sample_ks_identical_streams() {
        let a = pseudo_random(10_000, 42);
        let b = a.clone();
        let (d, p) = two_sample_ks(&a, &b);
        assert_eq!(d, 0.0);
        assert_eq!(p, 1.0);
    }

    #[test]
    fn two_sample_ks_different_distributions() {
        let a = vec![0u8; 10_000]; // all zeros
        let b = vec![255u8; 10_000]; // all 255s
        let (d, p) = two_sample_ks(&a, &b);
        assert!((d - 1.0).abs() < 1e-10); // maximally different CDFs
        assert!(p < 0.001);
    }

    #[test]
    fn two_sample_ks_similar_random() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let (d, _p) = two_sample_ks(&a, &b);
        // Two pseudo-random streams should have small D.
        assert!(d < 0.05, "D = {d}");
    }

    #[test]
    fn chi2_homogeneity_identical() {
        let a = pseudo_random(10_000, 42);
        let b = a.clone();
        let (chi2, _df, p, reliable) = chi2_homogeneity(&a, &b);
        assert_eq!(chi2, 0.0);
        assert!((p - 1.0).abs() < 0.01);
        assert!(reliable);
    }

    #[test]
    fn chi2_homogeneity_different() {
        let a = vec![0u8; 10_000];
        let b = vec![255u8; 10_000];
        let (chi2, _df, p, reliable) = chi2_homogeneity(&a, &b);
        assert!(chi2 > 1000.0);
        assert!(p < 0.001);
        // Only 2 bins populated, each with large counts — reliable.
        assert!(reliable);
    }

    #[test]
    fn chi2_homogeneity_unreliable_small_sample() {
        let a = pseudo_random(20, 1);
        let b = pseudo_random(20, 2);
        let (_chi2, _df, _p, reliable) = chi2_homogeneity(&a, &b);
        // 20 bytes spread across 256 bins → most expected counts well below 5.
        assert!(!reliable);
    }

    #[test]
    fn cliffs_delta_identical() {
        let a = pseudo_random(5_000, 42);
        let b = a.clone();
        let d = cliffs_delta(&a, &b);
        assert_eq!(d, 0.0);
    }

    #[test]
    fn cliffs_delta_maximally_different() {
        let a = vec![0u8; 1_000];
        let b = vec![255u8; 1_000];
        let d = cliffs_delta(&a, &b);
        // A is always less than B → delta = -1.0
        assert!((d - (-1.0)).abs() < 1e-10);
    }

    #[test]
    fn cliffs_delta_small_for_similar_random() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let d = cliffs_delta(&a, &b);
        assert!(d.abs() < 0.1, "Cliff's delta = {d}");
    }

    #[test]
    fn mann_whitney_identical() {
        let a = pseudo_random(5_000, 42);
        let b = a.clone();
        let (u, p) = mann_whitney_u_test(&a, &b);
        assert!((u - 0.5).abs() < 0.01, "U = {u}");
        // Identical streams → not significant.
        assert!(p > 0.05, "p = {p}");
    }

    #[test]
    fn mann_whitney_all_less() {
        let a = vec![0u8; 1_000];
        let b = vec![255u8; 1_000];
        let (u, p) = mann_whitney_u_test(&a, &b);
        // A is always less: U_A ≈ 0
        assert!(u < 0.01, "U = {u}");
        // Maximally different → highly significant.
        assert!(p < 0.001, "p = {p}");
    }

    #[test]
    fn mann_whitney_similar_random() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let (u, _p) = mann_whitney_u_test(&a, &b);
        // Similar random streams: U ≈ 0.5
        assert!((u - 0.5).abs() < 0.05, "U = {u}");
    }

    // -- Temporal --

    #[test]
    fn temporal_analysis_basic() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let result = temporal_analysis(&a, &b, 1024, 3.0);
        assert_eq!(result.window_size, 1024);
        assert!(!result.windowed_entropy_a.is_empty());
    }

    #[test]
    fn temporal_uses_theoretical_params() {
        // For truly uniform data, theoretical z-scores flag ~0.27% of windows
        // in expectation (two-sided |z| > 3), i.e. ~0.26 of the 97 windows here.
        // Circular empirical z-scores, by contrast, would pin the rate near
        // 0.27% no matter how structured the data is.
        let a = pseudo_random(100_000, 1);
        let result = temporal_analysis(&a, &a, 1024, 3.0);
        let n_windows = 100_000 / 1024;
        // A good PRNG should therefore produce very few anomalies.
        assert!(
            result.anomaly_count_a < n_windows / 5,
            "Too many anomalies: {}/{}",
            result.anomaly_count_a,
            n_windows
        );
    }

    #[test]
    fn temporal_analysis_subsamples_large_output() {
        let a = pseudo_random(100_000, 1);
        let b = pseudo_random(100_000, 2);
        let result = temporal_analysis(&a, &b, 64, 3.0);
        assert!(result.windowed_entropy_a.len() <= MAX_WINDOWED_ENTROPY);
        assert!(result.windowed_entropy_b.len() <= MAX_WINDOWED_ENTROPY);
    }

    // -- Digram --

    #[test]
    fn digram_insufficient_data() {
        let a = pseudo_random(1_000, 1);
        let b = pseudo_random(1_000, 2);
        let result = digram_analysis(&a, &b);
        assert!(!result.sufficient_data);
        assert!(result.chi2_a.is_nan());
    }

    #[test]
    fn digram_sufficient_data() {
        let a = pseudo_random(400_000, 1);
        let b = pseudo_random(400_000, 2);
        let result = digram_analysis(&a, &b);
        assert!(result.sufficient_data);
        assert!(result.chi2_a.is_finite());
        assert!(result.chi2_a > 0.0);
    }

    // -- Markov --

    #[test]
    fn markov_rows_sum_to_one() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let result = markov_analysis(&a, &b);
        for bit in 0..8 {
            for from in 0..2 {
                let sum_a = result.transitions_a[bit][from][0] + result.transitions_a[bit][from][1];
                assert!(
                    (sum_a - 1.0).abs() < 0.01,
                    "A bit={bit} from={from} sum={sum_a}"
                );
                let sum_b = result.transitions_b[bit][from][0] + result.transitions_b[bit][from][1];
                assert!(
                    (sum_b - 1.0).abs() < 0.01,
                    "B bit={bit} from={from} sum={sum_b}"
                );
            }
        }
    }

    // -- Multi-lag --

    #[test]
    fn multi_lag_lengths_match() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let result = multi_lag_analysis(&a, &b);
        assert_eq!(result.lags.len(), result.autocorr_a.len());
        assert_eq!(result.lags.len(), result.autocorr_b.len());
    }

    // -- Run-length --

    #[test]
    fn run_length_basic() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let result = run_length_comparison(&a, &b);
        assert!(!result.distribution_a.is_empty());
        assert!(!result.distribution_b.is_empty());
    }

    #[test]
    fn run_length_constant_stream() {
        let data = vec![42u8; 100];
        let dist = run_length_distribution(&data);
        assert_eq!(dist.len(), 1);
        assert_eq!(dist[0], (100, 1));
    }

    // -- Chi-squared survival helper (via crate::math) --

    #[test]
    fn chi_squared_survival_zero() {
        // P(X > 0) should be ~1.0
        let p = chi_squared_survival(0.0, 10);
        assert!((p - 1.0).abs() < 0.01);
    }

    #[test]
    fn chi_squared_survival_large() {
        // P(X > 1000 | df=10) should be ~0.0
        let p = chi_squared_survival(1000.0, 10);
        assert!(p < 0.001);
    }

    #[test]
    fn chi_squared_survival_midrange() {
        // chi2=18.31, df=10 is the 0.05 critical value.
        // SciPy reference: chi2.sf(18.31, 10) = 0.049954
        let p = chi_squared_survival(18.31, 10);
        assert!((p - 0.05).abs() < 0.005, "Expected p ≈ 0.05, got p = {p}");

        // chi2=23.21, df=10 is the 0.01 critical value.
        // SciPy reference: chi2.sf(23.21, 10) = 0.009997
        let p = chi_squared_survival(23.21, 10);
        assert!((p - 0.01).abs() < 0.002, "Expected p ≈ 0.01, got p = {p}");
    }

    #[test]
    fn chi_squared_survival_high_df() {
        // chi2=290, df=255 → SciPy gives p ≈ 0.065
        let p = chi_squared_survival(290.0, 255);
        assert!((p - 0.065).abs() < 0.01, "Expected p ≈ 0.065, got p = {p}");
    }

    // -- Kolmogorov survival --

    #[test]
    fn kolmogorov_survival_zero() {
        assert_eq!(kolmogorov_survival(0.0), 1.0);
    }

    #[test]
    fn kolmogorov_survival_large() {
        let p = kolmogorov_survival(3.0);
        assert!(p < 1e-6);
    }

    #[test]
    fn kolmogorov_survival_matches_scipy() {
        // SciPy reference: for lambda=1.2304, p ≈ 0.096852
        let p = kolmogorov_survival(1.2304);
        assert!(
            (p - 0.0969).abs() < 0.001,
            "Expected p ≈ 0.097, got p = {p}"
        );

        // For small lambda where single-term would be wrong:
        // lambda=0.5 → full series gives ~0.964
        let p = kolmogorov_survival(0.5);
        assert!((p - 0.964).abs() < 0.01, "Expected p ≈ 0.964, got p = {p}");
    }

    // -- erfc --

    #[test]
    fn erfc_known_values() {
        // erfc(0) = 1
        assert!((erfc(0.0) - 1.0).abs() < 1e-6);
        // erfc(large) ≈ 0
        assert!(erfc(5.0) < 1e-6);
        // erfc(1.0) ≈ 0.1573 (from tables)
        assert!((erfc(1.0) - 0.1573).abs() < 0.001);
    }
}