1use serde::Serialize;
15
16use crate::analysis::{self, SourceAnalysis};
17use crate::conditioning::{quick_min_entropy, quick_shannon};
18use crate::math;
19
/// Complete result of comparing two byte streams.
///
/// Produced by [`compare`] and [`compare_with_analysis`]; serializable for
/// downstream reporting.
#[derive(Debug, Clone, Serialize)]
pub struct ComparisonResult {
    /// Display label for the first stream.
    pub label_a: String,
    /// Display label for the second stream.
    pub label_b: String,
    /// Length of the first stream, in bytes.
    pub size_a: usize,
    /// Length of the second stream, in bytes.
    pub size_b: usize,
    /// Side-by-side aggregate statistics (entropy, moments, bit bias).
    pub aggregate: AggregateDelta,
    /// Two-sample hypothesis tests (KS, χ² homogeneity, Cliff's delta, Mann–Whitney U).
    pub two_sample: TwoSampleTests,
    /// Windowed scan for localized mean anomalies and entropy drift.
    pub temporal: TemporalAnalysis,
    /// Byte-pair (digram) uniformity statistics.
    pub digram: DigramAnalysis,
    /// Per-bit first-order Markov transition probabilities.
    pub markov: MarkovAnalysis,
    /// Autocorrelation of each stream at a fixed set of lags.
    pub multi_lag: MultiLagAnalysis,
    /// Run-length distributions of both streams.
    pub run_lengths: RunLengthComparison,
}
39
/// Per-stream aggregate statistics reported side by side for streams A and B.
#[derive(Debug, Clone, Serialize)]
pub struct AggregateDelta {
    /// Shannon entropy (bits/byte) of stream A.
    pub shannon_a: f64,
    /// Shannon entropy (bits/byte) of stream B.
    pub shannon_b: f64,
    /// Min-entropy of stream A.
    pub min_entropy_a: f64,
    /// Min-entropy of stream B.
    pub min_entropy_b: f64,
    /// Mean byte value of stream A.
    pub mean_a: f64,
    /// Mean byte value of stream B.
    pub mean_b: f64,
    /// Byte-value variance of stream A.
    pub variance_a: f64,
    /// Byte-value variance of stream B.
    pub variance_b: f64,
    /// Overall bit bias of stream A (from `BitBiasResult::overall_bias`).
    pub bit_bias_a: f64,
    /// Overall bit bias of stream B.
    pub bit_bias_b: f64,
    /// Per-bit-position one-probabilities for stream A.
    pub per_bit_bias_a: [f64; 8],
    /// Per-bit-position one-probabilities for stream B.
    pub per_bit_bias_b: [f64; 8],
    /// Bit-bias χ² statistic for stream A.
    pub chi_squared_a: f64,
    /// Bit-bias χ² statistic for stream B.
    pub chi_squared_b: f64,
    /// One-sample KS statistic for stream A (from its distribution stats).
    pub ks_statistic_a: f64,
    /// One-sample KS statistic for stream B.
    pub ks_statistic_b: f64,
    /// Cohen's d effect size between the two means (pooled std denominator;
    /// 0.0 when the pooled std is ~0).
    pub cohens_d: f64,
}
63
/// Results of the two-sample test battery comparing streams A and B directly.
#[derive(Debug, Clone, Serialize)]
pub struct TwoSampleTests {
    /// Two-sample Kolmogorov–Smirnov D statistic (max CDF distance).
    pub ks_statistic: f64,
    /// Asymptotic p-value for the KS statistic.
    pub ks_p_value: f64,
    /// Pearson χ² statistic of homogeneity over the byte histograms.
    pub chi2_homogeneity: f64,
    /// Degrees of freedom used for the χ² test (occupied bins − 1).
    pub chi2_df: usize,
    /// Survival-function p-value for the χ² statistic.
    pub chi2_p_value: f64,
    /// False when any bin's expected count fell below the χ² reliability
    /// threshold, making the p-value suspect.
    pub chi2_reliable: bool,
    /// Cliff's delta effect size in [-1, 1].
    pub cliffs_delta: f64,
    /// Mann–Whitney U statistic normalised by n_a·n_b (0.5 ≈ no dominance).
    pub mann_whitney_u: f64,
    /// p-value for the Mann–Whitney test (normal approximation, tie-corrected).
    pub mann_whitney_p_value: f64,
}
94
/// A single window flagged by the temporal scan.
#[derive(Debug, Clone, Serialize)]
pub struct WindowAnomaly {
    /// Byte offset of the window's start within its stream.
    pub offset: usize,
    /// Mean byte value over the window.
    pub mean: f64,
    /// z-score of the window mean against the uniform-byte expectation.
    pub z_score: f64,
}
103
/// Windowed (temporal) comparison of the two streams.
#[derive(Debug, Clone, Serialize)]
pub struct TemporalAnalysis {
    /// Window size, in bytes, used for the scan.
    pub window_size: usize,
    /// Number of anomalous windows found in stream A.
    pub anomaly_count_a: usize,
    /// Number of anomalous windows found in stream B.
    pub anomaly_count_b: usize,
    /// Largest |z-score| among stream A's anomalies (0.0 when none).
    pub max_z_a: f64,
    /// Largest |z-score| among stream B's anomalies (0.0 when none).
    pub max_z_b: f64,
    /// Most extreme anomalies of stream A (capped at 20, largest |z| first).
    pub top_anomalies_a: Vec<WindowAnomaly>,
    /// Most extreme anomalies of stream B (capped at 20, largest |z| first).
    pub top_anomalies_b: Vec<WindowAnomaly>,
    /// Per-window Shannon entropy of stream A, subsampled to at most
    /// `MAX_WINDOWED_ENTROPY` points.
    pub windowed_entropy_a: Vec<f64>,
    /// Per-window Shannon entropy of stream B, subsampled likewise.
    pub windowed_entropy_b: Vec<f64>,
}
124
/// Byte-pair (digram) uniformity statistics for both streams.
#[derive(Debug, Clone, Serialize)]
pub struct DigramAnalysis {
    /// Digram χ² statistic for stream A (NaN when the sample is too small).
    pub chi2_a: f64,
    /// Digram χ² statistic for stream B (NaN when the sample is too small).
    pub chi2_b: f64,
    /// True only when both streams meet `min_sample_bytes`.
    pub sufficient_data: bool,
    /// Minimum stream length, in bytes, for a trustworthy digram χ².
    pub min_sample_bytes: usize,
}
142
/// Per-bit first-order Markov transition probabilities for both streams.
#[derive(Debug, Clone, Serialize)]
pub struct MarkovAnalysis {
    /// Stream A transition table, indexed `[bit][from][to]`; each observed
    /// row sums to 1.0, unobserved rows are all zeros.
    pub transitions_a: [[[f64; 2]; 2]; 8],
    /// Stream B transition table, same layout as `transitions_a`.
    pub transitions_b: [[[f64; 2]; 2]; 8],
}
150
/// Autocorrelation of both streams sampled at a fixed set of lags.
#[derive(Debug, Clone, Serialize)]
pub struct MultiLagAnalysis {
    /// The lags at which autocorrelation was sampled.
    pub lags: Vec<usize>,
    /// Autocorrelation of stream A, aligned index-for-index with `lags`.
    pub autocorr_a: Vec<f64>,
    /// Autocorrelation of stream B, aligned index-for-index with `lags`.
    pub autocorr_b: Vec<f64>,
}
158
/// Run-length histograms of both streams.
#[derive(Debug, Clone, Serialize)]
pub struct RunLengthComparison {
    /// `(run_length, occurrence_count)` pairs for stream A, ascending by length.
    pub distribution_a: Vec<(usize, usize)>,
    /// `(run_length, occurrence_count)` pairs for stream B, ascending by length.
    pub distribution_b: Vec<(usize, usize)>,
}
167
168pub fn compare(label_a: &str, data_a: &[u8], label_b: &str, data_b: &[u8]) -> ComparisonResult {
178 ComparisonResult {
179 label_a: label_a.to_string(),
180 label_b: label_b.to_string(),
181 size_a: data_a.len(),
182 size_b: data_b.len(),
183 aggregate: aggregate_delta(data_a, data_b),
184 two_sample: two_sample_tests(data_a, data_b),
185 temporal: temporal_analysis(data_a, data_b, 1024, 3.0),
186 digram: digram_analysis(data_a, data_b),
187 markov: markov_analysis(data_a, data_b),
188 multi_lag: multi_lag_analysis(data_a, data_b),
189 run_lengths: run_length_comparison(data_a, data_b),
190 }
191}
192
193pub fn compare_with_analysis(
198 label_a: &str,
199 data_a: &[u8],
200 analysis_a: &SourceAnalysis,
201 label_b: &str,
202 data_b: &[u8],
203 analysis_b: &SourceAnalysis,
204) -> ComparisonResult {
205 ComparisonResult {
206 label_a: label_a.to_string(),
207 label_b: label_b.to_string(),
208 size_a: data_a.len(),
209 size_b: data_b.len(),
210 aggregate: aggregate_delta_from_analysis(analysis_a, analysis_b),
211 two_sample: two_sample_tests(data_a, data_b),
212 temporal: temporal_analysis(data_a, data_b, 1024, 3.0),
213 digram: digram_analysis(data_a, data_b),
214 markov: markov_analysis(data_a, data_b),
215 multi_lag: multi_lag_analysis(data_a, data_b),
216 run_lengths: run_length_comparison(data_a, data_b),
217 }
218}
219
220pub fn aggregate_delta(data_a: &[u8], data_b: &[u8]) -> AggregateDelta {
226 let dist_a = analysis::distribution_stats(data_a);
227 let dist_b = analysis::distribution_stats(data_b);
228 let bias_a = analysis::bit_bias(data_a);
229 let bias_b = analysis::bit_bias(data_b);
230
231 build_aggregate(
232 quick_shannon(data_a),
233 quick_shannon(data_b),
234 quick_min_entropy(data_a),
235 quick_min_entropy(data_b),
236 &dist_a,
237 &dist_b,
238 &bias_a,
239 &bias_b,
240 )
241}
242
/// Build an [`AggregateDelta`] from two precomputed [`SourceAnalysis`]
/// results, avoiding any recomputation over the raw bytes.
fn aggregate_delta_from_analysis(a: &SourceAnalysis, b: &SourceAnalysis) -> AggregateDelta {
    build_aggregate(
        a.shannon_entropy,
        b.shannon_entropy,
        a.min_entropy,
        b.min_entropy,
        &a.distribution,
        &b.distribution,
        &a.bit_bias,
        &b.bit_bias,
    )
}
256
257#[allow(clippy::too_many_arguments)]
258fn build_aggregate(
259 shannon_a: f64,
260 shannon_b: f64,
261 min_entropy_a: f64,
262 min_entropy_b: f64,
263 dist_a: &analysis::DistributionResult,
264 dist_b: &analysis::DistributionResult,
265 bias_a: &analysis::BitBiasResult,
266 bias_b: &analysis::BitBiasResult,
267) -> AggregateDelta {
268 let pooled_std = ((dist_a.variance + dist_b.variance) / 2.0).sqrt();
270 let cohens_d = if pooled_std > 1e-10 {
271 (dist_a.mean - dist_b.mean) / pooled_std
272 } else {
273 0.0
274 };
275
276 AggregateDelta {
277 shannon_a,
278 shannon_b,
279 min_entropy_a,
280 min_entropy_b,
281 mean_a: dist_a.mean,
282 mean_b: dist_b.mean,
283 variance_a: dist_a.variance,
284 variance_b: dist_b.variance,
285 bit_bias_a: bias_a.overall_bias,
286 bit_bias_b: bias_b.overall_bias,
287 per_bit_bias_a: bias_a.bit_probabilities,
288 per_bit_bias_b: bias_b.bit_probabilities,
289 chi_squared_a: bias_a.chi_squared,
290 chi_squared_b: bias_b.chi_squared,
291 ks_statistic_a: dist_a.ks_statistic,
292 ks_statistic_b: dist_b.ks_statistic,
293 cohens_d,
294 }
295}
296
297pub fn two_sample_tests(data_a: &[u8], data_b: &[u8]) -> TwoSampleTests {
303 let (ks_stat, ks_p) = two_sample_ks(data_a, data_b);
304 let (chi2, chi2_df, chi2_p, chi2_reliable) = chi2_homogeneity(data_a, data_b);
305 let (mw_u, mw_p) = mann_whitney_u_test(data_a, data_b);
306
307 TwoSampleTests {
308 ks_statistic: ks_stat,
309 ks_p_value: ks_p,
310 chi2_homogeneity: chi2,
311 chi2_df,
312 chi2_p_value: chi2_p,
313 chi2_reliable,
314 cliffs_delta: cliffs_delta(data_a, data_b),
315 mann_whitney_u: mw_u,
316 mann_whitney_p_value: mw_p,
317 }
318}
319
320fn two_sample_ks(data_a: &[u8], data_b: &[u8]) -> (f64, f64) {
325 if data_a.is_empty() || data_b.is_empty() {
326 return (0.0, 1.0);
327 }
328
329 let mut hist_a = [0u64; 256];
331 let mut hist_b = [0u64; 256];
332 for &b in data_a {
333 hist_a[b as usize] += 1;
334 }
335 for &b in data_b {
336 hist_b[b as usize] += 1;
337 }
338
339 let n_a = data_a.len() as f64;
340 let n_b = data_b.len() as f64;
341 let mut cdf_a = 0.0;
342 let mut cdf_b = 0.0;
343 let mut d_max = 0.0f64;
344
345 for i in 0..256 {
346 cdf_a += hist_a[i] as f64 / n_a;
347 cdf_b += hist_b[i] as f64 / n_b;
348 let d = (cdf_a - cdf_b).abs();
349 if d > d_max {
350 d_max = d;
351 }
352 }
353
354 let n_eff = (n_a * n_b) / (n_a + n_b);
359 let lambda = d_max * n_eff.sqrt();
360 let p_value = kolmogorov_survival(lambda);
361
362 (d_max, p_value)
363}
364
365fn chi2_homogeneity(data_a: &[u8], data_b: &[u8]) -> (f64, usize, f64, bool) {
372 if data_a.is_empty() || data_b.is_empty() {
373 return (0.0, 255, 1.0, false);
374 }
375
376 let mut hist_a = [0u64; 256];
377 let mut hist_b = [0u64; 256];
378 for &b in data_a {
379 hist_a[b as usize] += 1;
380 }
381 for &b in data_b {
382 hist_b[b as usize] += 1;
383 }
384
385 let n_a = data_a.len() as f64;
386 let n_b = data_b.len() as f64;
387 let n_total = n_a + n_b;
388
389 let mut chi2 = 0.0;
390 let mut df = 0usize;
391 let mut reliable = true;
392
393 for i in 0..256 {
394 let pooled = hist_a[i] + hist_b[i];
395 if pooled == 0 {
396 continue; }
398 df += 1;
399
400 let expected_a = pooled as f64 * n_a / n_total;
401 let expected_b = pooled as f64 * n_b / n_total;
402
403 if expected_a < CHI2_MIN_EXPECTED || expected_b < CHI2_MIN_EXPECTED {
404 reliable = false;
405 }
406
407 let diff_a = hist_a[i] as f64 - expected_a;
408 let diff_b = hist_b[i] as f64 - expected_b;
409
410 chi2 += diff_a * diff_a / expected_a + diff_b * diff_b / expected_b;
411 }
412
413 df = df.saturating_sub(1);
415 let p_value = math::chi_squared_survival(chi2, df);
416
417 (chi2, df, p_value, reliable)
418}
419
/// Cliff's delta effect size between two byte streams.
///
/// Returns a value in `[-1, 1]`: the probability that a byte drawn from
/// `data_a` exceeds one drawn from `data_b`, minus the reverse
/// probability. Ties contribute zero. Either input being empty yields 0.0.
pub fn cliffs_delta(data_a: &[u8], data_b: &[u8]) -> f64 {
    if data_a.is_empty() || data_b.is_empty() {
        return 0.0;
    }

    let mut hist_a = [0u64; 256];
    let mut hist_b = [0u64; 256];
    for &byte in data_a {
        hist_a[byte as usize] += 1;
    }
    for &byte in data_b {
        hist_b[byte as usize] += 1;
    }

    // cum_b[v] = number of bytes in B strictly below value v.
    let mut cum_b = [0u64; 257];
    for v in 0..256 {
        cum_b[v + 1] = cum_b[v] + hist_b[v];
    }

    let n_a = data_a.len() as i128;
    let n_b = data_b.len() as i128;

    // Net count of (a, b) pairs with a > b minus pairs with a < b,
    // computed per byte value in O(256) instead of O(|A|·|B|).
    let net: i128 = (0..256)
        .filter(|&v| hist_a[v] != 0)
        .map(|v| {
            let below = cum_b[v] as i128;
            let above = n_b - cum_b[v + 1] as i128;
            hist_a[v] as i128 * (below - above)
        })
        .sum();

    net as f64 / (n_a * n_b) as f64
}
460
/// Mann–Whitney U test on the two byte samples, computed via value
/// histograms instead of a full rank sort.
///
/// Returns `(U / (n_a·n_b), p)`: the U statistic for sample A normalised
/// into [0, 1] (0.5 ≈ no stochastic dominance) and a p-value from the
/// normal approximation with tie correction. Either input being empty
/// yields the null result `(0.5, 1.0)`.
fn mann_whitney_u_test(data_a: &[u8], data_b: &[u8]) -> (f64, f64) {
    if data_a.is_empty() || data_b.is_empty() {
        return (0.5, 1.0);
    }

    // Byte-value histograms stand in for the sorted samples.
    let mut hist_a = [0u64; 256];
    let mut hist_b = [0u64; 256];
    for &b in data_a {
        hist_a[b as usize] += 1;
    }
    for &b in data_b {
        hist_b[b as usize] += 1;
    }

    // cum_b[v] = number of B bytes strictly less than value v.
    let mut cum_b = [0u64; 257];
    for i in 0..256 {
        cum_b[i + 1] = cum_b[i] + hist_b[i];
    }

    // U_A = Σ over A-values of (#B below + 0.5 · #B tied), per byte value.
    let mut u: f64 = 0.0;
    for val in 0..256 {
        let count_a = hist_a[val] as f64;
        if count_a == 0.0 {
            continue;
        }
        let b_less = cum_b[val] as f64;
        let b_equal = hist_b[val] as f64;
        u += count_a * (b_less + 0.5 * b_equal);
    }

    let n_a = data_a.len() as f64;
    let n_b = data_b.len() as f64;
    // Normalise U into [0, 1]; 0.5 means neither sample dominates.
    let u_norm = u / (n_a * n_b);

    let n_total = n_a + n_b;
    // Mean of U under the null hypothesis.
    let mu = n_a * n_b / 2.0;

    // Tie correction term Σ (t³ − t) over tied groups (bytes tie heavily).
    let mut tie_sum = 0.0;
    for i in 0..256 {
        let t = (hist_a[i] + hist_b[i]) as f64;
        if t > 1.0 {
            tie_sum += t * t * t - t;
        }
    }

    // Variance of U under the null, with the tie correction applied.
    let sigma_sq = n_a * n_b / 12.0 * ((n_total + 1.0) - tie_sum / (n_total * (n_total - 1.0)));
    let p_value = if sigma_sq > 0.0 {
        let z = (u - mu).abs() / sigma_sq.sqrt();
        // Two-sided tail of the normal approximation via erfc.
        math::erfc(z / std::f64::consts::SQRT_2)
    } else {
        // All observations tied: no variance, nothing to reject.
        1.0
    };

    (u_norm, p_value.clamp(0.0, 1.0))
}
530
/// Cap on the number of per-window entropy samples kept in
/// [`TemporalAnalysis`]; longer series are subsampled down to this length.
const MAX_WINDOWED_ENTROPY: usize = 1024;

/// Variance of a uniformly distributed byte: (256² − 1) / 12.
const UNIFORM_BYTE_VARIANCE: f64 = 65535.0 / 12.0;

/// Mean of a uniformly distributed byte: (0 + 255) / 2.
const UNIFORM_BYTE_MEAN: f64 = 127.5;
545
546pub fn temporal_analysis(
551 data_a: &[u8],
552 data_b: &[u8],
553 window: usize,
554 z_threshold: f64,
555) -> TemporalAnalysis {
556 let (anomalies_a, entropy_a) = windowed_scan(data_a, window, z_threshold);
557 let (anomalies_b, entropy_b) = windowed_scan(data_b, window, z_threshold);
558
559 let max_z_a = anomalies_a
560 .iter()
561 .map(|a| a.z_score.abs())
562 .fold(0.0f64, f64::max);
563 let max_z_b = anomalies_b
564 .iter()
565 .map(|a| a.z_score.abs())
566 .fold(0.0f64, f64::max);
567
568 let count_a = anomalies_a.len();
569 let count_b = anomalies_b.len();
570
571 let top_a = top_anomalies(anomalies_a, 20);
573 let top_b = top_anomalies(anomalies_b, 20);
574
575 TemporalAnalysis {
576 window_size: window,
577 anomaly_count_a: count_a,
578 anomaly_count_b: count_b,
579 max_z_a,
580 max_z_b,
581 top_anomalies_a: top_a,
582 top_anomalies_b: top_b,
583 windowed_entropy_a: subsample(entropy_a, MAX_WINDOWED_ENTROPY),
584 windowed_entropy_b: subsample(entropy_b, MAX_WINDOWED_ENTROPY),
585 }
586}
587
588fn windowed_scan(data: &[u8], window: usize, z_threshold: f64) -> (Vec<WindowAnomaly>, Vec<f64>) {
590 if data.is_empty() || window == 0 {
591 return (Vec::new(), Vec::new());
592 }
593
594 let n_windows = data.len() / window;
595 if n_windows == 0 {
596 return (Vec::new(), Vec::new());
597 }
598
599 let theoretical_std = (UNIFORM_BYTE_VARIANCE / window as f64).sqrt();
601
602 let mut anomalies = Vec::new();
603 let mut entropies = Vec::with_capacity(n_windows);
604
605 for i in 0..n_windows {
606 let start = i * window;
607 let chunk = &data[start..start + window];
608 let sum: f64 = chunk.iter().map(|&b| b as f64).sum();
609 let mean = sum / window as f64;
610 entropies.push(quick_shannon(chunk));
611
612 let z = if theoretical_std > 1e-10 {
614 (mean - UNIFORM_BYTE_MEAN) / theoretical_std
615 } else {
616 0.0
617 };
618
619 if z.abs() > z_threshold {
620 anomalies.push(WindowAnomaly {
621 offset: i * window,
622 mean,
623 z_score: z,
624 });
625 }
626 }
627
628 (anomalies, entropies)
629}
630
631fn top_anomalies(mut anomalies: Vec<WindowAnomaly>, n: usize) -> Vec<WindowAnomaly> {
633 anomalies.sort_by(|a, b| {
634 b.z_score
635 .abs()
636 .partial_cmp(&a.z_score.abs())
637 .unwrap_or(std::cmp::Ordering::Equal)
638 });
639 anomalies.truncate(n);
640 anomalies
641}
642
/// Downsample `v` to at most `max` entries at an even fractional stride.
///
/// Vectors already within the limit are returned untouched; otherwise the
/// first element is always kept and subsequent picks are spaced
/// `len / max` apart (truncated to an index).
fn subsample(v: Vec<f64>, max: usize) -> Vec<f64> {
    if v.len() <= max {
        return v;
    }
    let stride = v.len() as f64 / max as f64;
    (0..max)
        .map(|i| {
            let idx = (i as f64 * stride) as usize;
            v[idx]
        })
        .collect()
}
651
/// Minimum expected count per bin for a χ² statistic to be considered
/// reliable (the usual rule of thumb for the χ² approximation).
const CHI2_MIN_EXPECTED: f64 = 5.0;

/// Number of possible byte digrams: 256².
const N_DIGRAM_BINS: usize = 65536;
661
662pub fn digram_analysis(data_a: &[u8], data_b: &[u8]) -> DigramAnalysis {
667 let min_bytes = N_DIGRAM_BINS * CHI2_MIN_EXPECTED as usize + 1;
669 let sufficient = data_a.len() >= min_bytes && data_b.len() >= min_bytes;
670
671 DigramAnalysis {
672 chi2_a: digram_chi2(data_a),
673 chi2_b: digram_chi2(data_b),
674 sufficient_data: sufficient,
675 min_sample_bytes: min_bytes,
676 }
677}
678
679fn digram_chi2(data: &[u8]) -> f64 {
683 if data.len() < 2 {
684 return f64::NAN;
685 }
686 let n_digrams = data.len() - 1;
687 let expected = n_digrams as f64 / N_DIGRAM_BINS as f64;
688 if expected < CHI2_MIN_EXPECTED {
689 return f64::NAN;
690 }
691
692 let mut counts = vec![0u64; N_DIGRAM_BINS];
693 for pair in data.windows(2) {
694 let idx = (pair[0] as usize) * 256 + pair[1] as usize;
695 counts[idx] += 1;
696 }
697
698 counts
699 .iter()
700 .map(|&c| {
701 let diff = c as f64 - expected;
702 diff * diff / expected
703 })
704 .sum()
705}
706
/// Estimate per-bit first-order Markov transition probabilities for both
/// streams; the tables are indexed `[bit][from][to]`.
pub fn markov_analysis(data_a: &[u8], data_b: &[u8]) -> MarkovAnalysis {
    MarkovAnalysis {
        transitions_a: bit_markov_transitions(data_a),
        transitions_b: bit_markov_transitions(data_b),
    }
}
718
/// Per-bit first-order Markov transition probabilities.
///
/// For each of the 8 bit positions, estimates P(next bit = `to` |
/// current bit = `from`) from consecutive byte pairs. The result is
/// indexed `[bit][from][to]`; rows with no observations are left as all
/// zeros (they do not sum to 1).
fn bit_markov_transitions(data: &[u8]) -> [[[f64; 2]; 2]; 8] {
    let mut counts = [[[0u64; 2]; 2]; 8];
    for pair in data.windows(2) {
        let (prev, curr) = (pair[0], pair[1]);
        for bit in 0..8 {
            let from = usize::from((prev >> bit) & 1);
            let to = usize::from((curr >> bit) & 1);
            counts[bit][from][to] += 1;
        }
    }

    // Normalise each observed row into a probability distribution.
    let mut probs = [[[0.0f64; 2]; 2]; 8];
    for bit in 0..8 {
        for from in 0..2 {
            let total = counts[bit][from][0] + counts[bit][from][1];
            if total == 0 {
                continue;
            }
            let denom = total as f64;
            probs[bit][from][0] = counts[bit][from][0] as f64 / denom;
            probs[bit][from][1] = counts[bit][from][1] as f64 / denom;
        }
    }

    probs
}
746
747pub fn multi_lag_analysis(data_a: &[u8], data_b: &[u8]) -> MultiLagAnalysis {
753 let lags: Vec<usize> = vec![1, 2, 3, 4, 5, 8, 16, 32, 64, 128];
754
755 let profile_a = analysis::autocorrelation_profile(data_a, 128);
756 let profile_b = analysis::autocorrelation_profile(data_b, 128);
757
758 let extract = |profile: &analysis::AutocorrResult| -> Vec<f64> {
759 lags.iter()
760 .map(|&lag| {
761 profile
762 .lags
763 .iter()
764 .find(|lc| lc.lag == lag)
765 .map(|lc| lc.correlation)
766 .unwrap_or(0.0)
767 })
768 .collect()
769 };
770
771 MultiLagAnalysis {
772 autocorr_a: extract(&profile_a),
773 autocorr_b: extract(&profile_b),
774 lags,
775 }
776}
777
/// Compute the run-length distributions of both streams for side-by-side
/// comparison.
pub fn run_length_comparison(data_a: &[u8], data_b: &[u8]) -> RunLengthComparison {
    RunLengthComparison {
        distribution_a: run_length_distribution(data_a),
        distribution_b: run_length_distribution(data_b),
    }
}
789
/// Histogram of run lengths (maximal stretches of one repeated byte).
///
/// Returns `(run_length, occurrence_count)` pairs in ascending run-length
/// order; empty input yields an empty vector.
fn run_length_distribution(data: &[u8]) -> Vec<(usize, usize)> {
    use std::collections::BTreeMap;

    let (&first, rest) = match data.split_first() {
        Some(parts) => parts,
        None => return Vec::new(),
    };

    let mut dist = BTreeMap::<usize, usize>::new();
    let mut run_byte = first;
    let mut run_len = 1usize;

    for &byte in rest {
        if byte == run_byte {
            run_len += 1;
        } else {
            // Run ended: record it and start a fresh one.
            *dist.entry(run_len).or_default() += 1;
            run_byte = byte;
            run_len = 1;
        }
    }
    // Close the final run.
    *dist.entry(run_len).or_default() += 1;

    dist.into_iter().collect()
}
812
/// Survival function Q(λ) of the Kolmogorov distribution.
///
/// Q(λ) = 2 · Σ_{k≥1} (−1)^{k−1} · exp(−2 k² λ²), with the alternating
/// series truncated once a term drops below 1e-15 (or after 100 terms).
/// Non-positive λ maps to 1.0 and the result is clamped into [0, 1].
fn kolmogorov_survival(lambda: f64) -> f64 {
    if lambda <= 0.0 {
        return 1.0;
    }

    let mut sum = 0.0;
    let mut sign = 1.0;
    for k in 1..=100 {
        let kf = k as f64;
        let term = (-2.0 * kf * kf * lambda * lambda).exp();
        if term < 1e-15 {
            // Remaining terms are negligible.
            break;
        }
        sum += sign * term;
        sign = -sign;
    }

    (2.0 * sum).clamp(0.0, 1.0)
}
841
#[cfg(test)]
mod tests {
    use super::*;
    use crate::math::{chi_squared_survival, erfc};

    /// Deterministic 64-bit LCG byte generator; keeps tests reproducible
    /// without pulling in an RNG crate.
    fn pseudo_random(n: usize, seed: u64) -> Vec<u8> {
        let mut data = Vec::with_capacity(n);
        let mut state: u64 = seed;
        for _ in 0..n {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.push((state >> 33) as u8);
        }
        data
    }

    // --- end-to-end `compare` / `compare_with_analysis` ---

    #[test]
    fn compare_two_random_streams() {
        let a = pseudo_random(10_000, 0xdeadbeef);
        let b = pseudo_random(10_000, 0xcafebabe);
        let result = compare("a", &a, "b", &b);
        assert_eq!(result.size_a, 10_000);
        assert_eq!(result.size_b, 10_000);
        // Two random-looking streams: near-maximal entropy, small effect size.
        assert!(result.aggregate.shannon_a > 7.0);
        assert!(result.aggregate.shannon_b > 7.0);
        assert!(result.aggregate.cohens_d.abs() < 0.5);
    }

    #[test]
    fn compare_biased_vs_random() {
        let a = vec![128u8; 10_000];
        let b = pseudo_random(10_000, 42);
        let result = compare("constant", &a, "random", &b);
        // Constant stream: ~zero entropy and variance; random stream: high both.
        assert!(result.aggregate.shannon_a < 0.01);
        assert!(result.aggregate.shannon_b > 7.0);
        assert!(result.aggregate.variance_a < 0.01);
        assert!(result.aggregate.variance_b > 100.0);
    }

    #[test]
    fn compare_with_precomputed_analysis() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let analysis_a = analysis::full_analysis("a", &a);
        let analysis_b = analysis::full_analysis("b", &b);

        let result = compare_with_analysis("a", &a, &analysis_a, "b", &b, &analysis_b);
        let standalone = compare("a", &a, "b", &b);

        // Reusing precomputed analyses must not change the aggregate numbers.
        assert!((result.aggregate.shannon_a - standalone.aggregate.shannon_a).abs() < 1e-10);
        assert!((result.aggregate.mean_a - standalone.aggregate.mean_a).abs() < 1e-10);
        assert!((result.aggregate.cohens_d - standalone.aggregate.cohens_d).abs() < 1e-10);
    }

    #[test]
    fn compare_empty_inputs() {
        let result = compare("empty_a", &[], "empty_b", &[]);
        assert_eq!(result.size_a, 0);
        assert_eq!(result.size_b, 0);
        assert!(result.temporal.windowed_entropy_a.is_empty());
        assert!(result.run_lengths.distribution_a.is_empty());
    }

    #[test]
    fn compare_small_inputs() {
        let a = pseudo_random(100, 1);
        let b = pseudo_random(100, 2);
        let result = compare("small_a", &a, "small_b", &b);
        assert_eq!(result.size_a, 100);
        // 100 bytes is less than one 1024-byte window, so no windowed entropy.
        assert!(result.temporal.windowed_entropy_a.is_empty());
        assert!(result.aggregate.shannon_a > 0.0);
    }

    // --- two-sample KS ---

    #[test]
    fn two_sample_ks_identical_streams() {
        let a = pseudo_random(10_000, 42);
        let b = a.clone();
        let (d, p) = two_sample_ks(&a, &b);
        assert_eq!(d, 0.0);
        assert_eq!(p, 1.0);
    }

    #[test]
    fn two_sample_ks_different_distributions() {
        // Disjoint supports: CDFs are maximally separated, so D = 1.
        let a = vec![0u8; 10_000];
        let b = vec![255u8; 10_000];
        let (d, p) = two_sample_ks(&a, &b);
        assert!((d - 1.0).abs() < 1e-10);
        assert!(p < 0.001);
    }

    #[test]
    fn two_sample_ks_similar_random() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let (d, _p) = two_sample_ks(&a, &b);
        assert!(d < 0.05, "D = {d}");
    }

    // --- χ² homogeneity ---

    #[test]
    fn chi2_homogeneity_identical() {
        let a = pseudo_random(10_000, 42);
        let b = a.clone();
        let (chi2, _df, p, reliable) = chi2_homogeneity(&a, &b);
        assert_eq!(chi2, 0.0);
        assert!((p - 1.0).abs() < 0.01);
        assert!(reliable);
    }

    #[test]
    fn chi2_homogeneity_different() {
        let a = vec![0u8; 10_000];
        let b = vec![255u8; 10_000];
        let (chi2, _df, p, reliable) = chi2_homogeneity(&a, &b);
        assert!(chi2 > 1000.0);
        assert!(p < 0.001);
        assert!(reliable);
    }

    #[test]
    fn chi2_homogeneity_unreliable_small_sample() {
        // 20 bytes over up to 256 bins: expected counts are far below 5.
        let a = pseudo_random(20, 1);
        let b = pseudo_random(20, 2);
        let (_chi2, _df, _p, reliable) = chi2_homogeneity(&a, &b);
        assert!(!reliable);
    }

    // --- Cliff's delta ---

    #[test]
    fn cliffs_delta_identical() {
        let a = pseudo_random(5_000, 42);
        let b = a.clone();
        let d = cliffs_delta(&a, &b);
        assert_eq!(d, 0.0);
    }

    #[test]
    fn cliffs_delta_maximally_different() {
        // Every a < every b, so delta hits the lower bound of -1.
        let a = vec![0u8; 1_000];
        let b = vec![255u8; 1_000];
        let d = cliffs_delta(&a, &b);
        assert!((d - (-1.0)).abs() < 1e-10);
    }

    #[test]
    fn cliffs_delta_small_for_similar_random() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let d = cliffs_delta(&a, &b);
        assert!(d.abs() < 0.1, "Cliff's delta = {d}");
    }

    // --- Mann–Whitney U ---

    #[test]
    fn mann_whitney_identical() {
        let a = pseudo_random(5_000, 42);
        let b = a.clone();
        let (u, p) = mann_whitney_u_test(&a, &b);
        // Normalised U of 0.5 means neither sample dominates.
        assert!((u - 0.5).abs() < 0.01, "U = {u}");
        assert!(p > 0.05, "p = {p}");
    }

    #[test]
    fn mann_whitney_all_less() {
        let a = vec![0u8; 1_000];
        let b = vec![255u8; 1_000];
        let (u, p) = mann_whitney_u_test(&a, &b);
        assert!(u < 0.01, "U = {u}");
        assert!(p < 0.001, "p = {p}");
    }

    #[test]
    fn mann_whitney_similar_random() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let (u, _p) = mann_whitney_u_test(&a, &b);
        assert!((u - 0.5).abs() < 0.05, "U = {u}");
    }

    // --- temporal scan ---

    #[test]
    fn temporal_analysis_basic() {
        let a = pseudo_random(10_000, 1);
        let b = pseudo_random(10_000, 2);
        let result = temporal_analysis(&a, &b, 1024, 3.0);
        assert_eq!(result.window_size, 1024);
        assert!(!result.windowed_entropy_a.is_empty());
    }

    #[test]
    fn temporal_uses_theoretical_params() {
        let a = pseudo_random(100_000, 1);
        let result = temporal_analysis(&a, &a, 1024, 3.0);
        let n_windows = 100_000 / 1024;
        // With a |z| > 3 threshold against the uniform null, only a small
        // fraction of random windows should be flagged.
        assert!(
            result.anomaly_count_a < n_windows / 5,
            "Too many anomalies: {}/{}",
            result.anomaly_count_a,
            n_windows
        );
    }

    #[test]
    fn temporal_analysis_subsamples_large_output() {
        let a = pseudo_random(100_000, 1);
        let b = pseudo_random(100_000, 2);
        // 100_000 / 64 windows exceeds the cap, forcing subsampling.
        let result = temporal_analysis(&a, &b, 64, 3.0);
        assert!(result.windowed_entropy_a.len() <= MAX_WINDOWED_ENTROPY);
        assert!(result.windowed_entropy_b.len() <= MAX_WINDOWED_ENTROPY);
    }

    // --- digram χ² ---

    #[test]
    fn digram_insufficient_data() {
        let a = pseudo_random(1_000, 1);
        let b = pseudo_random(1_000, 2);
        let result = digram_analysis(&a, &b);
        assert!(!result.sufficient_data);
        assert!(result.chi2_a.is_nan());
    }

    #[test]
    fn digram_sufficient_data() {
        // 400_000 bytes clears the 65_536 * 5 + 1 minimum.
        let a = pseudo_random(400_000, 1);
        let b = pseudo_random(400_000, 2);
        let result = digram_analysis(&a, &b);
        assert!(result.sufficient_data);
        assert!(result.chi2_a.is_finite());
        assert!(result.chi2_a > 0.0);
    }

    // --- Markov transitions ---

    #[test]
    fn markov_rows_sum_to_one() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let result = markov_analysis(&a, &b);
        for bit in 0..8 {
            for from in 0..2 {
                let sum_a = result.transitions_a[bit][from][0] + result.transitions_a[bit][from][1];
                assert!(
                    (sum_a - 1.0).abs() < 0.01,
                    "A bit={bit} from={from} sum={sum_a}"
                );
                let sum_b = result.transitions_b[bit][from][0] + result.transitions_b[bit][from][1];
                assert!(
                    (sum_b - 1.0).abs() < 0.01,
                    "B bit={bit} from={from} sum={sum_b}"
                );
            }
        }
    }

    // --- multi-lag autocorrelation ---

    #[test]
    fn multi_lag_lengths_match() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let result = multi_lag_analysis(&a, &b);
        assert_eq!(result.lags.len(), result.autocorr_a.len());
        assert_eq!(result.lags.len(), result.autocorr_b.len());
    }

    // --- run lengths ---

    #[test]
    fn run_length_basic() {
        let a = pseudo_random(5_000, 1);
        let b = pseudo_random(5_000, 2);
        let result = run_length_comparison(&a, &b);
        assert!(!result.distribution_a.is_empty());
        assert!(!result.distribution_b.is_empty());
    }

    #[test]
    fn run_length_constant_stream() {
        let data = vec![42u8; 100];
        let dist = run_length_distribution(&data);
        // One run of length 100, occurring once.
        assert_eq!(dist.len(), 1);
        assert_eq!(dist[0], (100, 1));
    }

    // --- math helpers: χ² survival, Kolmogorov survival, erfc ---

    #[test]
    fn chi_squared_survival_zero() {
        let p = chi_squared_survival(0.0, 10);
        assert!((p - 1.0).abs() < 0.01);
    }

    #[test]
    fn chi_squared_survival_large() {
        let p = chi_squared_survival(1000.0, 10);
        assert!(p < 0.001);
    }

    #[test]
    fn chi_squared_survival_midrange() {
        // Standard χ² table values for df = 10.
        let p = chi_squared_survival(18.31, 10);
        assert!((p - 0.05).abs() < 0.005, "Expected p ≈ 0.05, got p = {p}");

        let p = chi_squared_survival(23.21, 10);
        assert!((p - 0.01).abs() < 0.002, "Expected p ≈ 0.01, got p = {p}");
    }

    #[test]
    fn chi_squared_survival_high_df() {
        let p = chi_squared_survival(290.0, 255);
        assert!((p - 0.065).abs() < 0.01, "Expected p ≈ 0.065, got p = {p}");
    }

    #[test]
    fn kolmogorov_survival_zero() {
        assert_eq!(kolmogorov_survival(0.0), 1.0);
    }

    #[test]
    fn kolmogorov_survival_large() {
        let p = kolmogorov_survival(3.0);
        assert!(p < 1e-6);
    }

    #[test]
    fn kolmogorov_survival_matches_scipy() {
        let p = kolmogorov_survival(1.2304);
        assert!(
            (p - 0.0969).abs() < 0.001,
            "Expected p ≈ 0.097, got p = {p}"
        );

        let p = kolmogorov_survival(0.5);
        assert!((p - 0.964).abs() < 0.01, "Expected p ≈ 0.964, got p = {p}");
    }

    #[test]
    fn erfc_known_values() {
        assert!((erfc(0.0) - 1.0).abs() < 1e-6);
        assert!(erfc(5.0) < 1e-6);
        assert!((erfc(1.0) - 0.1573).abs() < 0.001);
    }
}
1226}