1use serde::Serialize;
8use std::f64::consts::PI;
9
/// Autocorrelation coefficient observed at one lag.
#[derive(Debug, Clone, Serialize)]
pub struct LagCorrelation {
    /// Lag in bytes (reported for lags >= 1; lag 0 is never emitted).
    pub lag: usize,
    /// Normalized autocorrelation r(lag) at this lag.
    pub correlation: f64,
}
20
/// Result of an autocorrelation profile over a range of lags.
#[derive(Debug, Clone, Serialize)]
pub struct AutocorrResult {
    /// Per-lag correlations, in increasing lag order.
    pub lags: Vec<LagCorrelation>,
    /// Largest |correlation| seen across all lags.
    pub max_abs_correlation: f64,
    /// Lag at which `max_abs_correlation` occurred.
    pub max_abs_lag: usize,
    /// Significance threshold (2 / sqrt(n)) used to count violations.
    pub threshold: f64,
    /// Number of lags whose |correlation| exceeded `threshold`.
    pub violations: usize,
}
32
/// One frequency bin of the power spectrum.
#[derive(Debug, Clone, Serialize)]
pub struct SpectralBin {
    /// Normalized frequency of the bin (bin index / sample count, in (0, 0.5]).
    pub frequency: f64,
    /// Power in this bin.
    pub power: f64,
}
40
/// Summary of a spectral (DFT-based) analysis.
#[derive(Debug, Clone, Serialize)]
pub struct SpectralResult {
    /// Up to ten strongest bins, sorted by descending power.
    pub peaks: Vec<SpectralBin>,
    /// Spectral flatness: geometric / arithmetic mean of bin powers, in [0, 1].
    pub flatness: f64,
    /// Frequency of the single strongest bin (0.0 when no spectrum was computed).
    pub dominant_frequency: f64,
    /// Sum of power over all computed bins.
    pub total_power: f64,
}
53
/// Per-bit-position bias statistics for a byte stream.
#[derive(Debug, Clone, Serialize)]
pub struct BitBiasResult {
    /// For each bit position 0..8 (LSB first), fraction of bytes with that bit set.
    pub bit_probabilities: [f64; 8],
    /// Mean absolute deviation of the bit probabilities from 0.5.
    pub overall_bias: f64,
    /// Chi-squared statistic of the set-bit counts against a 50/50 expectation.
    pub chi_squared: f64,
    /// Upper-tail p-value of `chi_squared` with 8 degrees of freedom.
    pub p_value: f64,
    /// True when any single bit deviates from 0.5 by more than 0.01.
    pub has_significant_bias: bool,
}
68
/// Moment and goodness-of-fit statistics for a byte sample.
#[derive(Debug, Clone, Serialize)]
pub struct DistributionResult {
    pub mean: f64,
    /// Population variance (divides by n, not n - 1).
    pub variance: f64,
    pub std_dev: f64,
    pub skewness: f64,
    /// Excess kurtosis (normal distribution = 0).
    pub kurtosis: f64,
    /// 256-bin byte-value histogram.
    pub histogram: Vec<u64>,
    /// Kolmogorov-Smirnov statistic against the uniform distribution on 0..=255.
    pub ks_statistic: f64,
    /// Asymptotic two-sided p-value for `ks_statistic`, clamped to [0, 1].
    pub ks_p_value: f64,
}
84
/// Result of a windowed (ANOVA-style) stationarity check.
#[derive(Debug, Clone, Serialize)]
pub struct StationarityResult {
    /// True when the F statistic stays below the hard-coded 1.88 cutoff.
    pub is_stationary: bool,
    /// Between-window vs. within-window variance ratio.
    pub f_statistic: f64,
    /// Mean of each analysis window.
    pub window_means: Vec<f64>,
    /// Standard deviation of each analysis window.
    pub window_std_devs: Vec<f64>,
    /// Number of windows actually analyzed (0 when the sample was too small).
    pub n_windows: usize,
}
99
/// Run-length statistics over identical consecutive bytes.
#[derive(Debug, Clone, Serialize)]
pub struct RunsResult {
    /// Longest observed run of one repeated byte value.
    pub longest_run: usize,
    /// Expected longest run for an i.i.d. uniform byte stream (~log_256(n)).
    pub expected_longest_run: f64,
    /// Observed number of runs.
    pub total_runs: usize,
    /// Expected number of runs for an i.i.d. uniform byte stream.
    pub expected_runs: f64,
}
112
/// One measurement in an entropy-vs-sample-size sweep.
/// NOTE(review): the producer of this struct is not in this file; field
/// meanings below are inferred from names — confirm against the caller.
#[derive(Debug, Clone, Serialize)]
pub struct EntropyPoint {
    pub sample_size: usize,
    /// Shannon entropy estimate for this sample size (presumably bits/byte — verify).
    pub shannon_h: f64,
    /// Min-entropy estimate for this sample size.
    pub min_entropy: f64,
    /// Wall-clock time spent collecting this sample, in milliseconds.
    pub collection_time_ms: u64,
}
121
/// Series of entropy measurements across increasing sample sizes.
#[derive(Debug, Clone, Serialize)]
pub struct ScalingResult {
    pub points: Vec<EntropyPoint>,
}
127
/// Series of throughput measurements across sample sizes.
#[derive(Debug, Clone, Serialize)]
pub struct ThroughputResult {
    pub measurements: Vec<ThroughputPoint>,
}
134
/// One throughput measurement.
/// NOTE(review): producer not in this file; field semantics inferred from names.
#[derive(Debug, Clone, Serialize)]
pub struct ThroughputPoint {
    pub sample_size: usize,
    pub bytes_per_second: f64,
    /// Wall-clock collection time in milliseconds.
    pub collection_time_ms: u64,
}
141
/// Pearson correlation between one pair of named sources.
#[derive(Debug, Clone, Serialize)]
pub struct CrossCorrPair {
    pub source_a: String,
    pub source_b: String,
    pub correlation: f64,
    /// True when |correlation| exceeded the 0.3 flagging threshold.
    pub flagged: bool,
}
150
/// All pairwise cross-correlations between a set of sources.
#[derive(Debug, Clone, Serialize)]
pub struct CrossCorrMatrix {
    /// One entry per unordered pair with at least 100 common bytes.
    pub pairs: Vec<CrossCorrPair>,
    /// Number of entries in `pairs` with `flagged == true`.
    pub flagged_count: usize,
}
158
/// Bundle of every per-source analysis produced by [`full_analysis`].
#[derive(Debug, Clone, Serialize)]
pub struct SourceAnalysis {
    pub source_name: String,
    /// Number of bytes analyzed.
    pub sample_size: usize,
    pub autocorrelation: AutocorrResult,
    pub spectral: SpectralResult,
    pub bit_bias: BitBiasResult,
    pub distribution: DistributionResult,
    pub stationarity: StationarityResult,
    pub runs: RunsResult,
}
171
172pub fn autocorrelation_profile(data: &[u8], max_lag: usize) -> AutocorrResult {
178 let n = data.len();
179 if n == 0 || max_lag == 0 {
180 return AutocorrResult {
181 lags: Vec::new(),
182 max_abs_correlation: 0.0,
183 max_abs_lag: 0,
184 threshold: 0.0,
185 violations: 0,
186 };
187 }
188
189 let max_lag = max_lag.min(n / 2);
190 if max_lag == 0 {
191 return AutocorrResult {
192 lags: Vec::new(),
193 max_abs_correlation: 0.0,
194 max_abs_lag: 0,
195 threshold: 2.0 / (n as f64).sqrt(),
196 violations: 0,
197 };
198 }
199 let arr: Vec<f64> = data.iter().map(|&b| b as f64).collect();
200 let mean: f64 = arr.iter().sum::<f64>() / n as f64;
201 let var: f64 = arr.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n as f64;
202
203 let threshold = 2.0 / (n as f64).sqrt();
204 let mut lags = Vec::with_capacity(max_lag);
205 let mut max_abs = 0.0f64;
206 let mut max_abs_lag = 1;
207 let mut violations = 0;
208
209 for lag in 1..=max_lag {
210 let corr = if var < 1e-10 {
211 0.0
212 } else {
213 let mut sum = 0.0;
214 let count = n - lag;
215 for i in 0..count {
216 sum += (arr[i] - mean) * (arr[i + lag] - mean);
217 }
218 sum / (count as f64 * var)
219 };
220
221 if corr.abs() > max_abs {
222 max_abs = corr.abs();
223 max_abs_lag = lag;
224 }
225 if corr.abs() > threshold {
226 violations += 1;
227 }
228
229 lags.push(LagCorrelation {
230 lag,
231 correlation: corr,
232 });
233 }
234
235 AutocorrResult {
236 lags,
237 max_abs_correlation: max_abs,
238 max_abs_lag,
239 threshold,
240 violations,
241 }
242}
243
244pub fn spectral_analysis(data: &[u8]) -> SpectralResult {
246 let n = data.len().min(4096); if n < 2 {
248 return SpectralResult {
249 peaks: Vec::new(),
250 flatness: 0.0,
251 dominant_frequency: 0.0,
252 total_power: 0.0,
253 };
254 }
255
256 let arr: Vec<f64> = data[..n].iter().map(|&b| b as f64 - 127.5).collect();
257
258 let n_freq = n / 2;
260 let mut power_spectrum: Vec<f64> = Vec::with_capacity(n_freq);
261
262 for k in 1..=n_freq {
263 let mut re = 0.0;
264 let mut im = 0.0;
265 let freq = 2.0 * PI * k as f64 / n as f64;
266 for (j, &x) in arr.iter().enumerate() {
267 re += x * (freq * j as f64).cos();
268 im -= x * (freq * j as f64).sin();
269 }
270 power_spectrum.push((re * re + im * im) / n as f64);
271 }
272
273 let total_power: f64 = power_spectrum.iter().sum();
274
275 let arith_mean = total_power / n_freq as f64;
277 let log_sum: f64 = power_spectrum
278 .iter()
279 .map(|&p| if p > 1e-20 { p.ln() } else { -46.0 }) .sum();
281 let geo_mean = (log_sum / n_freq as f64).exp();
282 let flatness = if arith_mean > 1e-20 {
283 (geo_mean / arith_mean).clamp(0.0, 1.0)
284 } else {
285 0.0
286 };
287
288 let mut indexed: Vec<(usize, f64)> = power_spectrum.iter().copied().enumerate().collect();
290 indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
291 let peaks: Vec<SpectralBin> = indexed
292 .iter()
293 .take(10)
294 .map(|&(i, p)| SpectralBin {
295 frequency: (i + 1) as f64 / n as f64,
296 power: p,
297 })
298 .collect();
299
300 let dominant_frequency = peaks.first().map(|p| p.frequency).unwrap_or(0.0);
301
302 SpectralResult {
303 peaks,
304 flatness,
305 dominant_frequency,
306 total_power,
307 }
308}
309
310pub fn bit_bias(data: &[u8]) -> BitBiasResult {
312 if data.is_empty() {
313 return BitBiasResult {
314 bit_probabilities: [0.0; 8],
315 overall_bias: 0.0,
316 chi_squared: 0.0,
317 p_value: 1.0,
318 has_significant_bias: false,
319 };
320 }
321
322 let n = data.len() as f64;
323 let mut counts = [0u64; 8];
324
325 for &byte in data {
326 for (bit, count) in counts.iter_mut().enumerate() {
327 if byte & (1 << bit) != 0 {
328 *count += 1;
329 }
330 }
331 }
332
333 let bit_probs: [f64; 8] = {
334 let mut arr = [0.0; 8];
335 for (i, &c) in counts.iter().enumerate() {
336 arr[i] = c as f64 / n;
337 }
338 arr
339 };
340
341 let overall_bias: f64 = bit_probs.iter().map(|&p| (p - 0.5).abs()).sum::<f64>() / 8.0;
342
343 let expected = n / 2.0;
345 let chi_squared: f64 = counts
346 .iter()
347 .map(|&c| {
348 let diff = c as f64 - expected;
349 diff * diff / expected
350 })
351 .sum();
352
353 let p_value = chi_squared_p_value(chi_squared, 8);
356
357 let has_significant_bias = bit_probs.iter().any(|&p| (p - 0.5).abs() > 0.01);
358
359 BitBiasResult {
360 bit_probabilities: bit_probs,
361 overall_bias,
362 chi_squared,
363 p_value,
364 has_significant_bias,
365 }
366}
367
368pub fn distribution_stats(data: &[u8]) -> DistributionResult {
370 if data.is_empty() {
371 return DistributionResult {
372 mean: 0.0,
373 variance: 0.0,
374 std_dev: 0.0,
375 skewness: 0.0,
376 kurtosis: 0.0,
377 histogram: vec![0u64; 256],
378 ks_statistic: 0.0,
379 ks_p_value: 1.0,
380 };
381 }
382
383 let n = data.len() as f64;
384 let arr: Vec<f64> = data.iter().map(|&b| b as f64).collect();
385
386 let mean = arr.iter().sum::<f64>() / n;
387 let variance = arr.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;
388 let std_dev = variance.sqrt();
389
390 let skewness = if std_dev > 1e-10 {
391 arr.iter()
392 .map(|&x| ((x - mean) / std_dev).powi(3))
393 .sum::<f64>()
394 / n
395 } else {
396 0.0
397 };
398
399 let kurtosis = if std_dev > 1e-10 {
400 arr.iter()
401 .map(|&x| ((x - mean) / std_dev).powi(4))
402 .sum::<f64>()
403 / n
404 - 3.0 } else {
406 0.0
407 };
408
409 let mut histogram = vec![0u64; 256];
411 for &b in data {
412 histogram[b as usize] += 1;
413 }
414
415 let mut sorted: Vec<f64> = arr.clone();
417 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
418 let mut ks_stat = 0.0f64;
419 for (i, &x) in sorted.iter().enumerate() {
420 let empirical = (i + 1) as f64 / n;
421 let theoretical = (x + 0.5) / 256.0; let diff = (empirical - theoretical).abs();
423 if diff > ks_stat {
424 ks_stat = diff;
425 }
426 }
427 let sqrt_n = n.sqrt();
429 let lambda = (sqrt_n + 0.12 + 0.11 / sqrt_n) * ks_stat;
430 let ks_p = (-2.0 * lambda * lambda).exp().clamp(0.0, 1.0) * 2.0;
431
432 DistributionResult {
433 mean,
434 variance,
435 std_dev,
436 skewness,
437 kurtosis,
438 histogram,
439 ks_statistic: ks_stat,
440 ks_p_value: ks_p.min(1.0),
441 }
442}
443
444pub fn stationarity_test(data: &[u8]) -> StationarityResult {
446 let n_windows = 10usize;
447 let window_size = data.len() / n_windows;
448 if window_size < 10 {
449 return StationarityResult {
450 is_stationary: true,
451 f_statistic: 0.0,
452 window_means: vec![],
453 window_std_devs: vec![],
454 n_windows: 0,
455 };
456 }
457
458 let mut window_means = Vec::with_capacity(n_windows);
459 let mut window_std_devs = Vec::with_capacity(n_windows);
460
461 for w in 0..n_windows {
462 let start = w * window_size;
463 let end = start + window_size;
464 let window = &data[start..end];
465 let arr: Vec<f64> = window.iter().map(|&b| b as f64).collect();
466 let mean = arr.iter().sum::<f64>() / arr.len() as f64;
467 let var = arr.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / arr.len() as f64;
468 window_means.push(mean);
469 window_std_devs.push(var.sqrt());
470 }
471
472 let grand_mean: f64 = window_means.iter().sum::<f64>() / n_windows as f64;
474 let between_var: f64 = window_means
475 .iter()
476 .map(|&m| (m - grand_mean).powi(2))
477 .sum::<f64>()
478 / (n_windows - 1) as f64
479 * window_size as f64;
480
481 let within_var: f64 = window_std_devs.iter().map(|&s| s * s).sum::<f64>() / n_windows as f64;
482
483 let f_stat = if within_var > 1e-10 {
484 between_var / within_var
485 } else {
486 0.0
487 };
488
489 let is_stationary = f_stat < 1.88;
491
492 StationarityResult {
493 is_stationary,
494 f_statistic: f_stat,
495 window_means,
496 window_std_devs,
497 n_windows,
498 }
499}
500
501pub fn runs_analysis(data: &[u8]) -> RunsResult {
503 if data.is_empty() {
504 return RunsResult {
505 longest_run: 0,
506 expected_longest_run: 0.0,
507 total_runs: 0,
508 expected_runs: 0.0,
509 };
510 }
511
512 let mut longest = 1usize;
513 let mut current = 1usize;
514 let mut total_runs = 1usize;
515
516 for i in 1..data.len() {
517 if data[i] == data[i - 1] {
518 current += 1;
519 if current > longest {
520 longest = current;
521 }
522 } else {
523 total_runs += 1;
524 current = 1;
525 }
526 }
527
528 let n = data.len() as f64;
529 let expected_longest = (n.ln() / 256.0_f64.ln()).max(1.0);
531 let expected_runs = n * (1.0 - 1.0 / 256.0) + 1.0;
533
534 RunsResult {
535 longest_run: longest,
536 expected_longest_run: expected_longest,
537 total_runs,
538 expected_runs,
539 }
540}
541
542pub fn cross_correlation_matrix(sources_data: &[(String, Vec<u8>)]) -> CrossCorrMatrix {
544 let mut pairs = Vec::new();
545 let mut flagged_count = 0;
546
547 for i in 0..sources_data.len() {
548 for j in (i + 1)..sources_data.len() {
549 let (ref name_a, ref data_a) = sources_data[i];
550 let (ref name_b, ref data_b) = sources_data[j];
551 let min_len = data_a.len().min(data_b.len());
552 if min_len < 100 {
553 continue;
554 }
555 let corr = pearson_correlation(&data_a[..min_len], &data_b[..min_len]);
556 let flagged = corr.abs() > 0.3;
557 if flagged {
558 flagged_count += 1;
559 }
560 pairs.push(CrossCorrPair {
561 source_a: name_a.clone(),
562 source_b: name_b.clone(),
563 correlation: corr,
564 flagged,
565 });
566 }
567 }
568
569 CrossCorrMatrix {
570 pairs,
571 flagged_count,
572 }
573}
574
575pub fn full_analysis(source_name: &str, data: &[u8]) -> SourceAnalysis {
577 SourceAnalysis {
578 source_name: source_name.to_string(),
579 sample_size: data.len(),
580 autocorrelation: autocorrelation_profile(data, 100),
581 spectral: spectral_analysis(data),
582 bit_bias: bit_bias(data),
583 distribution: distribution_stats(data),
584 stationarity: stationarity_test(data),
585 runs: runs_analysis(data),
586 }
587}
588
/// Pearson correlation coefficient between two byte slices.
///
/// Compares only the first `min(a.len(), b.len())` elements, so mismatched
/// lengths no longer panic (the previous version iterated over `a.len()` and
/// indexed out of bounds when `b` was shorter). Returns 0.0 for empty input
/// (where the means would be NaN) or when either side has near-zero variance,
/// since the coefficient is undefined in those cases. Works directly on the
/// slices instead of allocating intermediate `Vec<f64>` buffers.
fn pearson_correlation(a: &[u8], b: &[u8]) -> f64 {
    let len = a.len().min(b.len());
    if len == 0 {
        return 0.0;
    }
    let n = len as f64;
    let mean_a = a[..len].iter().map(|&x| f64::from(x)).sum::<f64>() / n;
    let mean_b = b[..len].iter().map(|&x| f64::from(x)).sum::<f64>() / n;

    let (mut cov, mut var_a, mut var_b) = (0.0f64, 0.0f64, 0.0f64);
    for (&x, &y) in a[..len].iter().zip(&b[..len]) {
        let da = f64::from(x) - mean_a;
        let db = f64::from(y) - mean_b;
        cov += da * db;
        var_a += da * da;
        var_b += db * db;
    }

    let denom = (var_a * var_b).sqrt();
    if denom < 1e-10 { 0.0 } else { cov / denom }
}
615
616fn chi_squared_p_value(chi2: f64, df: usize) -> f64 {
618 let a = df as f64 / 2.0;
621 let x = chi2 / 2.0;
622
623 if x < 0.0 {
624 return 1.0;
625 }
626
627 let mut sum = 0.0;
629 let mut term = 1.0 / a;
630 sum += term;
631 for n in 1..200 {
632 term *= x / (a + n as f64);
633 sum += term;
634 if term.abs() < 1e-12 {
635 break;
636 }
637 }
638 let lower_gamma = (-x + a * x.ln() - ln_gamma(a)).exp() * sum;
639 (1.0 - lower_gamma).clamp(0.0, 1.0)
640}
641
/// Natural logarithm of the gamma function via the Lanczos approximation
/// (g = 7, nine coefficients). Non-positive inputs return 0.0 as a defensive
/// fallback rather than panicking.
fn ln_gamma(x: f64) -> f64 {
    if x <= 0.0 {
        return 0.0;
    }

    // Lanczos coefficients for g = 7, n = 9.
    const COEFFS: [f64; 9] = [
        0.999_999_999_999_809_9,
        676.5203681218851,
        -1259.1392167224028,
        771.323_428_777_653_1,
        -176.615_029_162_140_6,
        12.507343278686905,
        -0.13857109526572012,
        9.984_369_578_019_572e-6,
        1.5056327351493116e-7,
    ];

    // Standard Lanczos form: ln Gamma(x) with the series evaluated at x - 1.
    let z = x - 1.0;
    let series = COEFFS[1..]
        .iter()
        .enumerate()
        .fold(COEFFS[0], |acc, (i, &c)| acc + c / (z + 1.0 + i as f64));
    let t = z + 7.0 + 0.5;
    0.5 * (2.0 * PI).ln() + (x - 0.5) * t.ln() - t + series.ln()
}
669
#[cfg(test)]
mod tests {
    use super::*;

    // Deterministic PCG-style LCG (MMIX multiplier/increment) so every test
    // sees the same pseudo-random byte stream for a given seed.
    fn random_data_seeded(n: usize, seed: u64) -> Vec<u8> {
        let mut data = Vec::with_capacity(n);
        let mut state: u64 = seed;
        for _ in 0..n {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.push((state >> 33) as u8);
        }
        data
    }

    // Default fixture: 0xdeadbeef-seeded stream.
    fn random_data(n: usize) -> Vec<u8> {
        random_data_seeded(n, 0xdeadbeef)
    }

    #[test]
    fn test_autocorrelation_random() {
        let data = random_data(10000);
        let result = autocorrelation_profile(&data, 50);
        assert_eq!(result.lags.len(), 50);
        // White-noise input: all lag correlations should stay small.
        assert!(result.max_abs_correlation < 0.1);
    }

    #[test]
    fn test_autocorrelation_correlated() {
        let mut data = vec![0u8; 1000];
        // Perfectly alternating signal -> strong correlation at lag 1 and 2.
        for (i, byte) in data.iter_mut().enumerate().take(1000) {
            *byte = if i % 2 == 0 { 200 } else { 50 };
        }
        let result = autocorrelation_profile(&data, 10);
        assert!(result.max_abs_correlation > 0.5);
    }

    #[test]
    fn test_spectral_analysis() {
        let data = random_data(1024);
        let result = spectral_analysis(&data);
        assert!(result.flatness > 0.0);
        assert!(!result.peaks.is_empty());
    }

    #[test]
    fn test_bit_bias_random() {
        let data = random_data(10000);
        let result = bit_bias(&data);
        // Each bit of a pseudo-random stream should be close to fair.
        for &p in &result.bit_probabilities {
            assert!((p - 0.5).abs() < 0.05);
        }
    }

    #[test]
    fn test_bit_bias_all_ones() {
        let data = vec![0xFF; 1000];
        let result = bit_bias(&data);
        // Every bit is always set -> probability 1.0 at each position.
        for &p in &result.bit_probabilities {
            assert!((p - 1.0).abs() < 0.001);
        }
        assert!(result.has_significant_bias);
    }

    #[test]
    fn test_distribution_stats() {
        let data = random_data(10000);
        let result = distribution_stats(&data);
        // Uniform bytes have mean ~127.5.
        assert!((result.mean - 127.5).abs() < 10.0);
        assert!(result.variance > 0.0);
    }

    #[test]
    fn test_stationarity_stationary() {
        let data = random_data(10000);
        let result = stationarity_test(&data);
        assert!(result.is_stationary);
        assert_eq!(result.n_windows, 10);
    }

    #[test]
    fn test_runs_analysis() {
        let data = random_data(10000);
        let result = runs_analysis(&data);
        assert!(result.total_runs > 0);
        assert!(result.longest_run >= 1);
    }

    #[test]
    fn test_cross_correlation() {
        // Independently seeded streams should not correlate.
        let a = random_data_seeded(1000, 0xdeadbeef);
        let b = random_data_seeded(1000, 0xcafebabe12345678);
        let result = cross_correlation_matrix(&[("a".to_string(), a), ("b".to_string(), b)]);
        assert_eq!(result.pairs.len(), 1);
        assert!(result.pairs[0].correlation.abs() < 0.3);
    }

    #[test]
    fn test_full_analysis() {
        let data = random_data(1000);
        let result = full_analysis("test_source", &data);
        assert_eq!(result.source_name, "test_source");
        assert_eq!(result.sample_size, 1000);
    }
}