Skip to main content

openentropy_core/
analysis.rs

1//! Comprehensive entropy source analysis beyond NIST min-entropy.
2//!
3//! This module provides research-oriented metrics for characterizing entropy
4//! sources: autocorrelation profiles, spectral analysis, bit bias, distribution
5//! statistics, stationarity, runs analysis, and entropy scaling.
6
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::f64::consts::PI;
10
11use crate::math;
12
13// ---------------------------------------------------------------------------
14// Result types
15// ---------------------------------------------------------------------------
16
/// Autocorrelation at a single lag.
#[derive(Debug, Clone, Serialize)]
pub struct LagCorrelation {
    /// Lag distance in samples (bytes).
    pub lag: usize,
    /// Normalized autocorrelation coefficient at this lag.
    pub correlation: f64,
}

/// Autocorrelation profile across multiple lags.
#[derive(Debug, Clone, Serialize)]
pub struct AutocorrResult {
    /// Per-lag correlations for lags 1..=max_lag.
    pub lags: Vec<LagCorrelation>,
    /// Largest absolute correlation observed across all lags.
    pub max_abs_correlation: f64,
    /// Lag at which the largest absolute correlation occurs.
    pub max_abs_lag: usize,
    /// 95% significance threshold (2/sqrt(n)).
    pub threshold: f64,
    /// Number of lags exceeding the threshold.
    pub violations: usize,
}
35
/// Single spectral bin.
#[derive(Debug, Clone, Serialize)]
pub struct SpectralBin {
    /// Normalized frequency (0.0 to 0.5).
    pub frequency: f64,
    /// Power (squared DFT magnitude) in this bin.
    pub power: f64,
}

/// FFT-based spectral analysis result.
#[derive(Debug, Clone, Serialize)]
pub struct SpectralResult {
    /// Top 10 spectral peaks by power, strongest first.
    pub peaks: Vec<SpectralBin>,
    /// Spectral flatness (Wiener entropy): 1.0 = white noise, 0.0 = tonal.
    pub flatness: f64,
    /// Dominant frequency (normalized, 0.0–0.5).
    pub dominant_frequency: f64,
    /// Total spectral power.
    pub total_power: f64,
}
56
/// Per-bit-position bias analysis.
#[derive(Debug, Clone, Serialize)]
pub struct BitBiasResult {
    /// Probability of 1 for each bit position (0=LSB, 7=MSB).
    pub bit_probabilities: [f64; 8],
    /// Overall bias: mean absolute deviation of P(1) from 0.5 across positions.
    pub overall_bias: f64,
    /// Chi-squared statistic for uniformity.
    pub chi_squared: f64,
    /// Approximate p-value (chi-squared with 8 degrees of freedom).
    pub p_value: f64,
    /// Any bit position deviating > 0.01 from 0.5.
    pub has_significant_bias: bool,
}
71
/// Distribution statistics.
#[derive(Debug, Clone, Serialize)]
pub struct DistributionResult {
    /// Mean byte value.
    pub mean: f64,
    /// Population variance of the byte values.
    pub variance: f64,
    /// Standard deviation (sqrt of variance).
    pub std_dev: f64,
    /// Standardized third central moment.
    pub skewness: f64,
    /// Excess kurtosis (standardized fourth moment minus 3).
    pub kurtosis: f64,
    /// Byte value histogram (256 bins).
    pub histogram: Vec<u64>,
    /// KS-style statistic vs uniform.
    pub ks_statistic: f64,
    /// Approximate p-value for KS-style statistic (heuristic for discrete bytes).
    pub ks_p_value: f64,
}
87
/// Anderson-Darling goodness-of-fit test against byte-uniform distribution.
#[derive(Debug, Clone, Serialize)]
pub struct AndersonDarlingResult {
    /// Size-corrected A² statistic.
    pub statistic: f64,
    /// Approximate p-value for the statistic.
    pub p_value: f64,
    /// True when p_value > 0.05 (uniformity not rejected).
    pub is_uniform: bool,
    /// (significance level, critical A² value) reference pairs.
    pub critical_values: Vec<(f64, f64)>,
    /// False when the input was too short to compute the statistic.
    pub is_valid: bool,
}

/// Approximate entropy (ApEn) computation result.
#[derive(Debug, Clone, Serialize)]
pub struct ApproxEntropyResult {
    /// ApEn = phi(m) - phi(m+1); NaN when invalid.
    pub apen: f64,
    /// Template (embedding) length used.
    pub m: usize,
    /// Match tolerance used.
    pub r: f64,
    /// Mean log template-match frequency at length m.
    pub phi_m: f64,
    /// Mean log template-match frequency at length m + 1.
    pub phi_m_plus_1: f64,
    /// Number of samples actually analyzed (after any decimation).
    pub actual_samples: usize,
    /// False when parameters or data made ApEn uncomputable.
    pub is_valid: bool,
}
108
/// Stationarity test result.
#[derive(Debug, Clone, Serialize)]
pub struct StationarityResult {
    /// Heuristic stationarity flag based on windowed F-statistic threshold.
    pub is_stationary: bool,
    /// ANOVA-like F-statistic comparing window means (heuristic).
    pub f_statistic: f64,
    /// Per-window means.
    pub window_means: Vec<f64>,
    /// Per-window standard deviations.
    pub window_std_devs: Vec<f64>,
    /// Number of windows used (0 when the data was too short to test).
    pub n_windows: usize,
}

/// Runs analysis result.
///
/// A "run" is a maximal stretch of consecutive identical byte values.
#[derive(Debug, Clone, Serialize)]
pub struct RunsResult {
    /// Longest consecutive run of the same byte value.
    pub longest_run: usize,
    /// Expected longest run for random data of this size.
    pub expected_longest_run: f64,
    /// Total number of runs.
    pub total_runs: usize,
    /// Expected total runs for random data.
    pub expected_runs: f64,
}
136
/// Permutation (ordinal-pattern) entropy result.
#[derive(Debug, Clone, Serialize)]
pub struct PermutationEntropyResult {
    /// Shannon entropy (nats) of the ordinal pattern distribution.
    pub permutation_entropy: f64,
    /// Entropy divided by its maximum ln(order!), in [0, 1].
    pub normalized_entropy: f64,
    /// Embedding order (pattern length).
    pub order: usize,
    /// Delay between samples within a pattern window.
    pub delay: usize,
    /// Number of distinct patterns observed.
    pub n_patterns: usize,
    /// Most frequent patterns with their counts (top 20).
    pub pattern_counts: Vec<(Vec<usize>, usize)>,
    /// False when parameters were degenerate or data too short.
    pub is_valid: bool,
}
147
/// Entropy at a specific sample size.
#[derive(Debug, Clone, Serialize)]
pub struct EntropyPoint {
    /// Number of bytes in this measurement.
    pub sample_size: usize,
    /// Shannon entropy estimate at this sample size.
    pub shannon_h: f64,
    /// Min-entropy estimate at this sample size.
    pub min_entropy: f64,
    /// Time spent collecting this sample, in milliseconds.
    pub collection_time_ms: u64,
}

/// Entropy scaling across sample sizes.
#[derive(Debug, Clone, Serialize)]
pub struct ScalingResult {
    /// One measurement per tested sample size.
    pub points: Vec<EntropyPoint>,
}
162
/// Throughput measurement.
#[derive(Debug, Clone, Serialize)]
pub struct ThroughputResult {
    /// Bytes per second at each tested sample size.
    pub measurements: Vec<ThroughputPoint>,
}

/// Throughput at a single sample size.
#[derive(Debug, Clone, Serialize)]
pub struct ThroughputPoint {
    /// Number of bytes collected.
    pub sample_size: usize,
    /// Measured collection rate.
    pub bytes_per_second: f64,
    /// Time spent collecting, in milliseconds.
    pub collection_time_ms: u64,
}
176
/// Cross-correlation between two sources.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossCorrPair {
    /// Name of the first source.
    pub source_a: String,
    /// Name of the second source.
    pub source_b: String,
    /// Pearson correlation over the pair's common data prefix.
    pub correlation: f64,
    /// True when |correlation| > 0.3.
    pub flagged: bool,
}

/// Cross-correlation matrix result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossCorrMatrix {
    /// All compared source pairs.
    pub pairs: Vec<CrossCorrPair>,
    /// Pairs with |r| > 0.3.
    pub flagged_count: usize,
}
193
/// Full per-source analysis.
#[derive(Debug, Clone, Serialize)]
pub struct SourceAnalysis {
    /// Name of the analyzed entropy source.
    pub source_name: String,
    /// Number of bytes analyzed.
    pub sample_size: usize,
    /// Shannon entropy (bits/byte, max 8.0).
    pub shannon_entropy: f64,
    /// Min-entropy via MCV estimator (bits/byte, max 8.0).
    pub min_entropy: f64,
    /// Lag-correlation profile of the raw bytes.
    pub autocorrelation: AutocorrResult,
    /// DFT-based power-spectrum summary.
    pub spectral: SpectralResult,
    /// Per-bit-position bias statistics.
    pub bit_bias: BitBiasResult,
    /// Moments, histogram, and KS-style uniformity statistics.
    pub distribution: DistributionResult,
    /// Windowed-mean stationarity heuristic.
    pub stationarity: StationarityResult,
    /// Runs-of-identical-bytes statistics.
    pub runs: RunsResult,
}
210
211// ---------------------------------------------------------------------------
212// Analysis functions
213// ---------------------------------------------------------------------------
214
215/// Compute autocorrelation profile for lags 1..max_lag.
216pub fn autocorrelation_profile(data: &[u8], max_lag: usize) -> AutocorrResult {
217    let n = data.len();
218    if n == 0 || max_lag == 0 {
219        return AutocorrResult {
220            lags: Vec::new(),
221            max_abs_correlation: 0.0,
222            max_abs_lag: 0,
223            threshold: 0.0,
224            violations: 0,
225        };
226    }
227
228    let max_lag = max_lag.min(n / 2);
229    if max_lag == 0 {
230        return AutocorrResult {
231            lags: Vec::new(),
232            max_abs_correlation: 0.0,
233            max_abs_lag: 0,
234            threshold: 2.0 / (n as f64).sqrt(),
235            violations: 0,
236        };
237    }
238    let arr: Vec<f64> = data.iter().map(|&b| b as f64).collect();
239    let mean: f64 = arr.iter().sum::<f64>() / n as f64;
240    let var: f64 = arr.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n as f64;
241
242    let threshold = 2.0 / (n as f64).sqrt();
243    let mut lags = Vec::with_capacity(max_lag);
244    let mut max_abs = 0.0f64;
245    let mut max_abs_lag = 1;
246    let mut violations = 0;
247
248    for lag in 1..=max_lag {
249        let corr = if var < 1e-10 {
250            0.0
251        } else {
252            let mut sum = 0.0;
253            let count = n - lag;
254            for i in 0..count {
255                sum += (arr[i] - mean) * (arr[i + lag] - mean);
256            }
257            sum / (count as f64 * var)
258        };
259
260        if corr.abs() > max_abs {
261            max_abs = corr.abs();
262            max_abs_lag = lag;
263        }
264        if corr.abs() > threshold {
265            violations += 1;
266        }
267
268        lags.push(LagCorrelation {
269            lag,
270            correlation: corr,
271        });
272    }
273
274    AutocorrResult {
275        lags,
276        max_abs_correlation: max_abs,
277        max_abs_lag,
278        threshold,
279        violations,
280    }
281}
282
283/// Compute spectral analysis via DFT (no external FFT crate).
284pub fn spectral_analysis(data: &[u8]) -> SpectralResult {
285    let n = data.len().min(4096); // Cap for performance
286    if n < 2 {
287        return SpectralResult {
288            peaks: Vec::new(),
289            flatness: 0.0,
290            dominant_frequency: 0.0,
291            total_power: 0.0,
292        };
293    }
294
295    let arr: Vec<f64> = data[..n].iter().map(|&b| b as f64 - 127.5).collect();
296
297    // Compute power spectrum via DFT (only positive frequencies).
298    let n_freq = n / 2;
299    let mut power_spectrum: Vec<f64> = Vec::with_capacity(n_freq);
300
301    for k in 1..=n_freq {
302        let mut re = 0.0;
303        let mut im = 0.0;
304        let freq = 2.0 * PI * k as f64 / n as f64;
305        for (j, &x) in arr.iter().enumerate() {
306            re += x * (freq * j as f64).cos();
307            im -= x * (freq * j as f64).sin();
308        }
309        power_spectrum.push((re * re + im * im) / n as f64);
310    }
311
312    let total_power: f64 = power_spectrum.iter().sum();
313
314    // Spectral flatness = geometric_mean / arithmetic_mean
315    let arith_mean = total_power / n_freq as f64;
316    let log_sum: f64 = power_spectrum
317        .iter()
318        .map(|&p| if p > 1e-20 { p.ln() } else { -46.0 }) // ln(1e-20) ≈ -46
319        .sum();
320    let geo_mean = (log_sum / n_freq as f64).exp();
321    let flatness = if arith_mean > 1e-20 {
322        (geo_mean / arith_mean).clamp(0.0, 1.0)
323    } else {
324        0.0
325    };
326
327    // Find peaks.
328    let mut indexed: Vec<(usize, f64)> = power_spectrum.iter().copied().enumerate().collect();
329    indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
330    let peaks: Vec<SpectralBin> = indexed
331        .iter()
332        .take(10)
333        .map(|&(i, p)| SpectralBin {
334            frequency: (i + 1) as f64 / n as f64,
335            power: p,
336        })
337        .collect();
338
339    let dominant_frequency = peaks.first().map(|p| p.frequency).unwrap_or(0.0);
340
341    SpectralResult {
342        peaks,
343        flatness,
344        dominant_frequency,
345        total_power,
346    }
347}
348
349/// Analyze per-bit-position bias.
350pub fn bit_bias(data: &[u8]) -> BitBiasResult {
351    if data.is_empty() {
352        return BitBiasResult {
353            bit_probabilities: [0.0; 8],
354            overall_bias: 0.0,
355            chi_squared: 0.0,
356            p_value: 1.0,
357            has_significant_bias: false,
358        };
359    }
360
361    let n = data.len() as f64;
362    let mut counts = [0u64; 8];
363
364    for &byte in data {
365        for (bit, count) in counts.iter_mut().enumerate() {
366            if byte & (1 << bit) != 0 {
367                *count += 1;
368            }
369        }
370    }
371
372    let bit_probs: [f64; 8] = {
373        let mut arr = [0.0; 8];
374        for (i, &c) in counts.iter().enumerate() {
375            arr[i] = c as f64 / n;
376        }
377        arr
378    };
379
380    let overall_bias: f64 = bit_probs.iter().map(|&p| (p - 0.5).abs()).sum::<f64>() / 8.0;
381
382    // Chi-squared: for each bit position, expected = n/2
383    let expected = n / 2.0;
384    let chi_squared: f64 = counts
385        .iter()
386        .map(|&c| {
387            let diff = c as f64 - expected;
388            diff * diff / expected
389        })
390        .sum();
391
392    // Approximate p-value from chi-squared with 8 degrees of freedom.
393    // Use the incomplete gamma function approximation.
394    let p_value = math::chi_squared_p_value(chi_squared, 8);
395
396    let has_significant_bias = bit_probs.iter().any(|&p| (p - 0.5).abs() > 0.01);
397
398    BitBiasResult {
399        bit_probabilities: bit_probs,
400        overall_bias,
401        chi_squared,
402        p_value,
403        has_significant_bias,
404    }
405}
406
407/// Compute distribution statistics.
408pub fn distribution_stats(data: &[u8]) -> DistributionResult {
409    if data.is_empty() {
410        return DistributionResult {
411            mean: 0.0,
412            variance: 0.0,
413            std_dev: 0.0,
414            skewness: 0.0,
415            kurtosis: 0.0,
416            histogram: vec![0u64; 256],
417            ks_statistic: 0.0,
418            ks_p_value: 1.0,
419        };
420    }
421
422    let n = data.len() as f64;
423    let arr: Vec<f64> = data.iter().map(|&b| b as f64).collect();
424
425    let mean = arr.iter().sum::<f64>() / n;
426    let variance = arr.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;
427    let std_dev = variance.sqrt();
428
429    let skewness = if std_dev > 1e-10 {
430        arr.iter()
431            .map(|&x| ((x - mean) / std_dev).powi(3))
432            .sum::<f64>()
433            / n
434    } else {
435        0.0
436    };
437
438    let kurtosis = if std_dev > 1e-10 {
439        arr.iter()
440            .map(|&x| ((x - mean) / std_dev).powi(4))
441            .sum::<f64>()
442            / n
443            - 3.0 // excess kurtosis
444    } else {
445        0.0
446    };
447
448    // Histogram
449    let mut histogram = vec![0u64; 256];
450    for &b in data {
451        histogram[b as usize] += 1;
452    }
453
454    // KS test vs uniform [0, 255]
455    let mut sorted: Vec<f64> = arr.clone();
456    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
457    let mut ks_stat = 0.0f64;
458    for (i, &x) in sorted.iter().enumerate() {
459        let empirical = (i + 1) as f64 / n;
460        let theoretical = (x + 0.5) / 256.0; // uniform over [0, 255]
461        let diff = (empirical - theoretical).abs();
462        if diff > ks_stat {
463            ks_stat = diff;
464        }
465    }
466    // Approximate p-value (Kolmogorov-Smirnov)
467    let sqrt_n = n.sqrt();
468    let lambda = (sqrt_n + 0.12 + 0.11 / sqrt_n) * ks_stat;
469    let ks_p = (-2.0 * lambda * lambda).exp().clamp(0.0, 1.0) * 2.0;
470
471    DistributionResult {
472        mean,
473        variance,
474        std_dev,
475        skewness,
476        kurtosis,
477        histogram,
478        ks_statistic: ks_stat,
479        ks_p_value: ks_p.min(1.0),
480    }
481}
482
/// Approximate entropy (ApEn, Pincus-style) of the byte series.
///
/// The series is decimated to at most 5000 points (uniform stride), then
/// ApEn = phi(m) - phi(m+1), where phi(m) is the mean natural-log frequency
/// of length-m templates matching another template within Chebyshev
/// distance strictly less than `r` (self-matches included).
///
/// * `m` - template (embedding) length; must be >= 1.
/// * `r` - match tolerance in byte-value units; must be finite and > 0.
///
/// Returns an `is_valid = false` result (NaN fields) for bad parameters,
/// too-short data, constant data, or when any template has no match.
pub fn approximate_entropy(data: &[u8], m: usize, r: f64) -> ApproxEntropyResult {
    // Cap for the O(n^2) template-matching loops below.
    const MAX_SAMPLES: usize = 5000;

    let mut sampled: Vec<f64> = data.iter().map(|&b| b as f64).collect();
    if sampled.len() > MAX_SAMPLES {
        // Decimate with a uniform stride rather than truncating the tail.
        let step = (sampled.len() / MAX_SAMPLES).max(1);
        sampled = sampled
            .iter()
            .step_by(step)
            .take(MAX_SAMPLES)
            .copied()
            .collect();
    }

    let actual_samples = sampled.len();
    // Parameter sanity: need at least m + 2 points and a usable tolerance.
    if m == 0 || actual_samples <= m + 1 || !r.is_finite() || r <= 0.0 {
        return ApproxEntropyResult {
            apen: f64::NAN,
            m,
            r,
            phi_m: f64::NAN,
            phi_m_plus_1: f64::NAN,
            actual_samples,
            is_valid: false,
        };
    }

    // Sample standard deviation (n - 1 denominator), computed only to
    // reject constant series, for which ApEn is undefined.
    let mean = sampled.iter().sum::<f64>() / actual_samples as f64;
    let variance = sampled
        .iter()
        .map(|x| {
            let d = x - mean;
            d * d
        })
        .sum::<f64>()
        / (actual_samples as f64 - 1.0);
    let std_dev = variance.sqrt();
    if std_dev == 0.0 {
        return ApproxEntropyResult {
            apen: f64::NAN,
            m,
            r,
            phi_m: f64::NAN,
            phi_m_plus_1: f64::NAN,
            actual_samples,
            is_valid: false,
        };
    }

    // phi(m): mean over all templates i of ln(C_i), where C_i is the
    // fraction of templates j within Chebyshev distance < r of template i.
    // Returns None when the data is too short or any log would be undefined.
    fn phi(sampled: &[f64], m: usize, r: f64) -> Option<f64> {
        if sampled.len() < m {
            return None;
        }

        let n_templates = sampled.len() - m + 1;
        if n_templates == 0 {
            return None;
        }

        let mut sum_ln = 0.0;
        for i in 0..n_templates {
            let mut count = 0usize;
            for j in 0..n_templates {
                // Chebyshev (max-coordinate) distance with early exit once
                // the tolerance is already exceeded.
                let mut max_dist = 0.0;
                for k in 0..m {
                    let dist = (sampled[i + k] - sampled[j + k]).abs();
                    if dist > max_dist {
                        max_dist = dist;
                    }
                    if max_dist >= r {
                        break;
                    }
                }
                if max_dist < r {
                    count += 1;
                }
            }

            // i == j always matches (distance 0 < r), so count == 0 should
            // be impossible; guard anyway to keep the ln well-defined.
            if count == 0 {
                return None;
            }

            let c_i = count as f64 / n_templates as f64;
            if c_i <= 0.0 {
                return None;
            }
            sum_ln += c_i.ln();
        }

        Some(sum_ln / n_templates as f64)
    }

    let Some(phi_m) = phi(&sampled, m, r) else {
        return ApproxEntropyResult {
            apen: f64::NAN,
            m,
            r,
            phi_m: f64::NAN,
            phi_m_plus_1: f64::NAN,
            actual_samples,
            is_valid: false,
        };
    };

    let Some(phi_m_plus_1) = phi(&sampled, m + 1, r) else {
        return ApproxEntropyResult {
            apen: f64::NAN,
            m,
            r,
            phi_m,
            phi_m_plus_1: f64::NAN,
            actual_samples,
            is_valid: false,
        };
    };

    // ApEn: drop in log match frequency as the template grows by one.
    let apen = phi_m - phi_m_plus_1;
    ApproxEntropyResult {
        apen,
        m,
        r,
        phi_m,
        phi_m_plus_1,
        actual_samples,
        is_valid: apen.is_finite(),
    }
}
610
611pub fn approximate_entropy_default(data: &[u8]) -> ApproxEntropyResult {
612    const DEFAULT_M: usize = 2;
613    const MAX_SAMPLES: usize = 5000;
614
615    let mut sampled: Vec<f64> = data.iter().map(|&b| b as f64).collect();
616    if sampled.len() > MAX_SAMPLES {
617        let step = (sampled.len() / MAX_SAMPLES).max(1);
618        sampled = sampled
619            .iter()
620            .step_by(step)
621            .take(MAX_SAMPLES)
622            .copied()
623            .collect();
624    }
625
626    if sampled.len() <= DEFAULT_M + 1 {
627        return ApproxEntropyResult {
628            apen: f64::NAN,
629            m: DEFAULT_M,
630            r: f64::NAN,
631            phi_m: f64::NAN,
632            phi_m_plus_1: f64::NAN,
633            actual_samples: sampled.len(),
634            is_valid: false,
635        };
636    }
637
638    let mean = sampled.iter().sum::<f64>() / sampled.len() as f64;
639    let variance = sampled
640        .iter()
641        .map(|x| {
642            let d = x - mean;
643            d * d
644        })
645        .sum::<f64>()
646        / (sampled.len() as f64 - 1.0);
647    let std_dev = variance.sqrt();
648    if std_dev == 0.0 {
649        return ApproxEntropyResult {
650            apen: f64::NAN,
651            m: DEFAULT_M,
652            r: 0.0,
653            phi_m: f64::NAN,
654            phi_m_plus_1: f64::NAN,
655            actual_samples: sampled.len(),
656            is_valid: false,
657        };
658    }
659
660    approximate_entropy(data, DEFAULT_M, 0.2 * std_dev)
661}
662
663pub fn anderson_darling(data: &[u8]) -> AndersonDarlingResult {
664    const MAX_SAMPLES: usize = 2048;
665
666    let critical_values = vec![
667        (0.25, 0.787),
668        (0.10, 1.248),
669        (0.05, 1.610),
670        (0.025, 2.092),
671        (0.01, 2.880),
672    ];
673
674    if data.len() < 2 {
675        return AndersonDarlingResult {
676            statistic: 0.0,
677            p_value: 0.0,
678            is_uniform: false,
679            critical_values,
680            is_valid: false,
681        };
682    }
683
684    let sample = if data.len() > MAX_SAMPLES {
685        &data[..MAX_SAMPLES]
686    } else {
687        data
688    };
689
690    let n = sample.len();
691    let n_f = n as f64;
692
693    let mut sorted = sample.to_vec();
694    sorted.sort_unstable();
695
696    let u: Vec<f64> = sorted.iter().map(|&x| (x as f64 + 0.5) / 256.0).collect();
697
698    let mut sum = 0.0;
699    for i in 0..n {
700        let coeff = (2 * i + 1) as f64;
701        let left = u[i].ln();
702        let right = (1.0 - u[n - 1 - i]).ln();
703        sum += coeff * (left + right);
704    }
705
706    let a2 = -n_f - (sum / n_f);
707    let statistic = a2 * (1.0 + 4.0 / n_f - 25.0 / (n_f * n_f));
708
709    let p_value = if statistic > 10.0 {
710        0.0
711    } else if statistic >= 0.6 {
712        (1.2937 - 5.709 * statistic + 0.0186 * statistic * statistic).exp()
713    } else if statistic >= 0.34 {
714        (0.9177 - 4.279 * statistic - 1.38 * statistic * statistic).exp()
715    } else if statistic >= 0.2 {
716        1.0 - (-8.318 + 42.796 * statistic - 59.938 * statistic * statistic).exp()
717    } else {
718        1.0 - (-13.436 + 101.14 * statistic - 223.73 * statistic * statistic).exp()
719    }
720    .clamp(0.0, 1.0);
721
722    AndersonDarlingResult {
723        statistic,
724        p_value,
725        is_uniform: p_value > 0.05,
726        critical_values,
727        is_valid: true,
728    }
729}
730
731/// Test stationarity by comparing window means (ANOVA-like).
732pub fn stationarity_test(data: &[u8]) -> StationarityResult {
733    let n_windows = 10usize;
734    let window_size = data.len() / n_windows;
735    if window_size < 10 {
736        return StationarityResult {
737            is_stationary: true,
738            f_statistic: 0.0,
739            window_means: vec![],
740            window_std_devs: vec![],
741            n_windows: 0,
742        };
743    }
744
745    let mut window_means = Vec::with_capacity(n_windows);
746    let mut window_std_devs = Vec::with_capacity(n_windows);
747
748    for w in 0..n_windows {
749        let start = w * window_size;
750        let end = start + window_size;
751        let window = &data[start..end];
752        let arr: Vec<f64> = window.iter().map(|&b| b as f64).collect();
753        let mean = arr.iter().sum::<f64>() / arr.len() as f64;
754        let var = arr.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / arr.len() as f64;
755        window_means.push(mean);
756        window_std_devs.push(var.sqrt());
757    }
758
759    // One-way ANOVA F-statistic
760    let grand_mean: f64 = window_means.iter().sum::<f64>() / n_windows as f64;
761    let between_var: f64 = window_means
762        .iter()
763        .map(|&m| (m - grand_mean).powi(2))
764        .sum::<f64>()
765        / (n_windows - 1) as f64
766        * window_size as f64;
767
768    let within_var: f64 = window_std_devs.iter().map(|&s| s * s).sum::<f64>() / n_windows as f64;
769
770    let f_stat = if within_var > 1e-10 {
771        between_var / within_var
772    } else {
773        0.0
774    };
775
776    // F critical value at α=0.05, df1=9, df2→∞ ≈ 1.88.
777    // This is the asymptotic value; for finite df2 the true critical value
778    // is slightly higher (more permissive), so this is conservative.
779    let is_stationary = f_stat < 1.88;
780
781    StationarityResult {
782        is_stationary,
783        f_statistic: f_stat,
784        window_means,
785        window_std_devs,
786        n_windows,
787    }
788}
789
790/// Analyze runs of consecutive identical byte values.
791pub fn runs_analysis(data: &[u8]) -> RunsResult {
792    if data.is_empty() {
793        return RunsResult {
794            longest_run: 0,
795            expected_longest_run: 0.0,
796            total_runs: 0,
797            expected_runs: 0.0,
798        };
799    }
800
801    let mut longest = 1usize;
802    let mut current = 1usize;
803    let mut total_runs = 1usize;
804
805    for i in 1..data.len() {
806        if data[i] == data[i - 1] {
807            current += 1;
808            if current > longest {
809                longest = current;
810            }
811        } else {
812            total_runs += 1;
813            current = 1;
814        }
815    }
816
817    let n = data.len() as f64;
818    // Expected longest run of same byte ≈ log_256(n) for byte-level
819    let expected_longest = (n.ln() / 256.0_f64.ln()).max(1.0);
820    // Expected total runs ≈ n * (1 - 1/256) + 1
821    let expected_runs = n * (1.0 - 1.0 / 256.0) + 1.0;
822
823    RunsResult {
824        longest_run: longest,
825        expected_longest_run: expected_longest,
826        total_runs,
827        expected_runs,
828    }
829}
830
831pub fn permutation_entropy(data: &[u8], order: usize, delay: usize) -> PermutationEntropyResult {
832    if order < 2 || delay == 0 {
833        return PermutationEntropyResult {
834            permutation_entropy: 0.0,
835            normalized_entropy: 0.0,
836            order,
837            delay,
838            n_patterns: 0,
839            pattern_counts: Vec::new(),
840            is_valid: false,
841        };
842    }
843
844    let required_len = (order - 1).saturating_mul(delay).saturating_add(2);
845    if data.len() < required_len {
846        return PermutationEntropyResult {
847            permutation_entropy: 0.0,
848            normalized_entropy: 0.0,
849            order,
850            delay,
851            n_patterns: 0,
852            pattern_counts: Vec::new(),
853            is_valid: false,
854        };
855    }
856
857    let arr: Vec<f64> = data.iter().map(|&b| b as f64).collect();
858    let n_windows = data.len() - (order - 1) * delay;
859
860    let mut counts: HashMap<Vec<usize>, usize> = HashMap::new();
861    for i in 0..n_windows {
862        let mut window = Vec::with_capacity(order);
863        for j in 0..order {
864            window.push(arr[i + j * delay]);
865        }
866
867        let mut pattern: Vec<usize> = (0..order).collect();
868        pattern.sort_by(|&a, &b| {
869            window[a]
870                .partial_cmp(&window[b])
871                .unwrap_or(std::cmp::Ordering::Equal)
872        });
873        *counts.entry(pattern).or_insert(0) += 1;
874    }
875
876    let total_patterns = n_windows as f64;
877    let mut entropy = 0.0;
878    for &count in counts.values() {
879        let p = count as f64 / total_patterns;
880        if p > 0.0 {
881            entropy -= p * p.ln();
882        }
883    }
884
885    let max_entropy = (2..=order).map(|v| (v as f64).ln()).sum::<f64>();
886    let normalized_entropy = if max_entropy > 0.0 {
887        entropy / max_entropy
888    } else {
889        0.0
890    };
891
892    let n_patterns = counts.len();
893    let mut pattern_counts: Vec<(Vec<usize>, usize)> = counts.into_iter().collect();
894    pattern_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
895    pattern_counts.truncate(20);
896
897    PermutationEntropyResult {
898        permutation_entropy: entropy,
899        normalized_entropy,
900        order,
901        delay,
902        n_patterns,
903        pattern_counts,
904        is_valid: true,
905    }
906}
907
/// Permutation entropy with conventional defaults: order 3, delay 1.
pub fn permutation_entropy_default(data: &[u8]) -> PermutationEntropyResult {
    permutation_entropy(data, 3, 1)
}
911
912/// Compute cross-correlation matrix between multiple sources.
913pub fn cross_correlation_matrix(sources_data: &[(String, Vec<u8>)]) -> CrossCorrMatrix {
914    let mut pairs = Vec::new();
915    let mut flagged_count = 0;
916
917    for i in 0..sources_data.len() {
918        for j in (i + 1)..sources_data.len() {
919            let (ref name_a, ref data_a) = sources_data[i];
920            let (ref name_b, ref data_b) = sources_data[j];
921            let min_len = data_a.len().min(data_b.len());
922            if min_len < 100 {
923                continue;
924            }
925            let corr = pearson_correlation(&data_a[..min_len], &data_b[..min_len]);
926            let flagged = corr.abs() > 0.3;
927            if flagged {
928                flagged_count += 1;
929            }
930            pairs.push(CrossCorrPair {
931                source_a: name_a.clone(),
932                source_b: name_b.clone(),
933                correlation: corr,
934                flagged,
935            });
936        }
937    }
938
939    CrossCorrMatrix {
940        pairs,
941        flagged_count,
942    }
943}
944
/// Run all per-source analysis on raw byte data.
///
/// Aggregates the individual analyses in this module (autocorrelation over
/// up to 100 lags, spectral, bit bias, distribution, stationarity, runs)
/// plus quick Shannon/min-entropy estimates into one `SourceAnalysis`.
pub fn full_analysis(source_name: &str, data: &[u8]) -> SourceAnalysis {
    use crate::conditioning::{quick_min_entropy, quick_shannon};
    SourceAnalysis {
        source_name: source_name.to_string(),
        sample_size: data.len(),
        shannon_entropy: quick_shannon(data),
        min_entropy: quick_min_entropy(data),
        // 100 requested lags; autocorrelation_profile caps this at n/2.
        autocorrelation: autocorrelation_profile(data, 100),
        spectral: spectral_analysis(data),
        bit_bias: bit_bias(data),
        distribution: distribution_stats(data),
        stationarity: stationarity_test(data),
        runs: runs_analysis(data),
    }
}
961
962// ---------------------------------------------------------------------------
963// Helpers
964// ---------------------------------------------------------------------------
965
/// Pearson correlation coefficient between two byte slices.
///
/// The slices are compared over their common prefix, so unequal lengths no
/// longer panic (the previous version indexed `b` by `a`'s length).
/// Returns 0.0 for empty input (previously NaN via 0/0) or when either
/// side has (near-)zero variance, where the coefficient is undefined.
pub fn pearson_correlation(a: &[u8], b: &[u8]) -> f64 {
    let len = a.len().min(b.len());
    if len == 0 {
        return 0.0;
    }
    let n = len as f64;
    let a_f: Vec<f64> = a[..len].iter().map(|&x| x as f64).collect();
    let b_f: Vec<f64> = b[..len].iter().map(|&x| x as f64).collect();
    let mean_a = a_f.iter().sum::<f64>() / n;
    let mean_b = b_f.iter().sum::<f64>() / n;

    let mut cov = 0.0;
    let mut var_a = 0.0;
    let mut var_b = 0.0;
    for (&xa, &xb) in a_f.iter().zip(&b_f) {
        let da = xa - mean_a;
        let db = xb - mean_b;
        cov += da * db;
        var_a += da * da;
        var_b += db * db;
    }

    // Zero variance on either side makes the coefficient undefined.
    let denom = (var_a * var_b).sqrt();
    if denom < 1e-10 { 0.0 } else { cov / denom }
}
988
989// ---------------------------------------------------------------------------
990// Tests
991// ---------------------------------------------------------------------------
992
#[cfg(test)]
mod tests {
    use super::*;

    /// Deterministic pseudo-random bytes from a 64-bit LCG (Knuth MMIX
    /// constants), keeping the upper state bits for better quality.
    fn lcg_bytes(n: usize, seed: u64) -> Vec<u8> {
        let mut state = seed;
        (0..n)
            .map(|_| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (state >> 33) as u8
            })
            .collect()
    }

    /// Convenience wrapper using a fixed default seed.
    fn default_bytes(n: usize) -> Vec<u8> {
        lcg_bytes(n, 0xdeadbeef)
    }

    #[test]
    fn test_autocorrelation_random() {
        let data = default_bytes(10000);
        let profile = autocorrelation_profile(&data, 50);
        assert_eq!(profile.lags.len(), 50);
        // White-noise-like input should show no strong lag correlation.
        assert!(profile.max_abs_correlation < 0.1);
    }

    #[test]
    fn test_autocorrelation_correlated() {
        // Alternating high/low bytes produce a strong lag-1 correlation.
        let data: Vec<u8> = (0..1000u32)
            .map(|i| if i % 2 == 0 { 200 } else { 50 })
            .collect();
        let profile = autocorrelation_profile(&data, 10);
        assert!(profile.max_abs_correlation > 0.5);
    }

    #[test]
    fn test_spectral_analysis() {
        let result = spectral_analysis(&default_bytes(1024));
        assert!(result.flatness > 0.0);
        assert!(!result.peaks.is_empty());
    }

    #[test]
    fn test_bit_bias_random() {
        let result = bit_bias(&default_bytes(10000));
        // Every bit position should be close to fair for random input.
        assert!(result
            .bit_probabilities
            .iter()
            .all(|&p| (p - 0.5).abs() < 0.05));
    }

    #[test]
    fn test_bit_bias_all_ones() {
        let data = vec![0xFFu8; 1000];
        let result = bit_bias(&data);
        // All-ones input: every bit position reads 1 with certainty.
        assert!(result
            .bit_probabilities
            .iter()
            .all(|&p| (p - 1.0).abs() < 0.001));
        assert!(result.has_significant_bias);
    }

    #[test]
    fn test_distribution_stats() {
        let result = distribution_stats(&default_bytes(10000));
        // Uniform bytes center near 127.5.
        assert!((result.mean - 127.5).abs() < 10.0);
        assert!(result.variance > 0.0);
    }

    #[test]
    fn test_anderson_darling_random_uniform() {
        let result = anderson_darling(&lcg_bytes(5000, 0xdeadbeef));
        assert!(result.is_valid);
        assert!(result.is_uniform);
        assert!(result.p_value > 0.05);
    }

    #[test]
    fn test_anderson_darling_constant_rejects_uniform() {
        let result = anderson_darling(&vec![42u8; 1000]);
        assert!(result.is_valid);
        assert!(!result.is_uniform);
        assert!(result.statistic > 100.0);
    }

    #[test]
    fn test_anderson_darling_biased_zero_data() {
        let data = vec![0u8; 1000];
        let result = anderson_darling(&data);
        assert!(result.is_valid);
        assert!(!result.is_uniform);
    }

    #[test]
    fn test_anderson_darling_empty_invalid() {
        assert!(!anderson_darling(&[]).is_valid);
    }

    #[test]
    fn test_stationarity_stationary() {
        let result = stationarity_test(&default_bytes(10000));
        assert!(result.is_stationary);
        assert_eq!(result.n_windows, 10);
    }

    #[test]
    fn test_runs_analysis() {
        let result = runs_analysis(&default_bytes(10000));
        assert!(result.total_runs > 0);
        assert!(result.longest_run >= 1);
    }

    #[test]
    fn test_cross_correlation() {
        let a = lcg_bytes(1000, 0xdeadbeef);
        let b = lcg_bytes(1000, 0xcafebabe12345678);
        let sources = [(String::from("a"), a), (String::from("b"), b)];
        let result = cross_correlation_matrix(&sources);
        assert_eq!(result.pairs.len(), 1);
        // Independent streams should be weakly correlated at most.
        assert!(result.pairs[0].correlation.abs() < 0.3);
    }

    #[test]
    fn test_approximate_entropy_random_high() {
        let result = approximate_entropy_default(&lcg_bytes(5000, 0xdeadbeef));
        assert!(result.is_valid);
        assert!(result.apen > 0.5);
    }

    #[test]
    fn test_approximate_entropy_periodic_lower_than_random() {
        let random = lcg_bytes(5000, 0xdeadbeef);
        // Period-4 repeating pattern 0,1,2,3,0,1,2,3,...
        let periodic: Vec<u8> = [0u8, 1, 2, 3].iter().copied().cycle().take(5000).collect();

        let random_result = approximate_entropy_default(&random);
        let periodic_result = approximate_entropy_default(&periodic);

        assert!(periodic_result.is_valid);
        assert!(random_result.is_valid);
        // Periodicity must depress ApEn relative to noise.
        assert!(periodic_result.apen < random_result.apen);
    }

    #[test]
    fn test_approximate_entropy_constant_invalid() {
        assert!(!approximate_entropy_default(&vec![42u8; 5000]).is_valid);
    }

    #[test]
    fn test_approximate_entropy_empty_invalid() {
        assert!(!approximate_entropy_default(&[]).is_valid);
    }

    #[test]
    fn test_permutation_entropy_random_high() {
        let result = permutation_entropy(&lcg_bytes(5000, 0xdeadbeef), 3, 1);
        assert!(result.is_valid);
        assert!(result.normalized_entropy > 0.95);
    }

    #[test]
    fn test_permutation_entropy_monotone_low() {
        // Sawtooth ramp: nearly all ordinal patterns are ascending.
        let data: Vec<u8> = (0..1000u32).map(|i| i as u8).collect();
        let result = permutation_entropy(&data, 3, 1);
        assert!(result.is_valid);
        assert!(result.normalized_entropy < 0.3);
    }

    #[test]
    fn test_permutation_entropy_constant_zero() {
        let result = permutation_entropy_default(&vec![42u8; 1000]);
        assert!(result.is_valid);
        assert!(result.normalized_entropy.abs() < 1e-12);
        assert_eq!(result.n_patterns, 1);
    }

    #[test]
    fn test_permutation_entropy_empty_invalid() {
        let result = permutation_entropy_default(&[]);
        assert!(!result.is_valid);
        assert_eq!(result.n_patterns, 0);
    }

    #[test]
    fn test_full_analysis() {
        let result = full_analysis("test_source", &default_bytes(1000));
        assert_eq!(result.source_name, "test_source");
        assert_eq!(result.sample_size, 1000);
    }
}