Skip to main content

shadow_network_sim/
analysis.rs

1//! Statistical analysis — entropy, chi-squared, and uniformity tests for traffic.
2
3use serde::{Serialize, Deserialize};
4
5/// Result of statistical analysis on a dataset.
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct AnalysisResult {
8    pub test_name: String,
9    pub statistic: f64,
10    pub p_value: f64,
11    pub is_suspicious: bool,
12    pub description: String,
13}
14
15/// Comprehensive traffic statistical analyzer.
16pub struct StatisticalAnalyzer {
17    /// P-value threshold for significance.
18    pub significance_level: f64,
19}
20
21impl StatisticalAnalyzer {
22    pub fn new() -> Self {
23        Self {
24            significance_level: 0.05,
25        }
26    }
27
28    pub fn with_significance(mut self, level: f64) -> Self {
29        self.significance_level = level;
30        self
31    }
32
33    /// Shannon entropy of a byte distribution (0.0–8.0).
34    pub fn byte_entropy(&self, data: &[u8]) -> f64 {
35        if data.is_empty() {
36            return 0.0;
37        }
38
39        let mut counts = [0u64; 256];
40        for &byte in data {
41            counts[byte as usize] += 1;
42        }
43
44        let len = data.len() as f64;
45        let mut entropy = 0.0;
46
47        for &count in &counts {
48            if count > 0 {
49                let p = count as f64 / len;
50                entropy -= p * p.log2();
51            }
52        }
53
54        entropy
55    }
56
57    /// Analyze byte entropy: encrypted/random data should be close to 8.0.
58    pub fn entropy_analysis(&self, data: &[u8]) -> AnalysisResult {
59        let entropy = self.byte_entropy(data);
60        // Encrypted data: entropy > 7.5
61        // English text: entropy ~ 4.0-5.0
62        // Compressed: entropy > 7.0
63        let is_suspicious = entropy < 6.0; // Low entropy = potentially unencrypted
64
65        AnalysisResult {
66            test_name: "Byte Entropy".into(),
67            statistic: entropy,
68            p_value: if is_suspicious { 0.01 } else { 0.5 },
69            is_suspicious,
70            description: format!(
71                "Entropy: {:.3} bits/byte (max 8.0). {}",
72                entropy,
73                if is_suspicious {
74                    "Low entropy may indicate unencrypted/structured data."
75                } else {
76                    "High entropy consistent with encryption."
77                }
78            ),
79        }
80    }
81
82    /// Chi-squared test for uniformity of byte distribution.
83    pub fn chi_squared_uniformity(&self, data: &[u8]) -> AnalysisResult {
84        if data.is_empty() {
85            return AnalysisResult {
86                test_name: "Chi-Squared".into(),
87                statistic: 0.0,
88                p_value: 1.0,
89                is_suspicious: false,
90                description: "No data".into(),
91            };
92        }
93
94        let mut counts = [0u64; 256];
95        for &byte in data {
96            counts[byte as usize] += 1;
97        }
98
99        let expected = data.len() as f64 / 256.0;
100        let chi_sq: f64 = counts
101            .iter()
102            .map(|&c| {
103                let diff = c as f64 - expected;
104                diff * diff / expected
105            })
106            .sum();
107
108        // Degrees of freedom = 255
109        // For 255 df, critical value at 0.05 is ~293
110        let is_suspicious = chi_sq > 293.0;
111
112        AnalysisResult {
113            test_name: "Chi-Squared Uniformity".into(),
114            statistic: chi_sq,
115            p_value: if is_suspicious { 0.01 } else { 0.5 },
116            is_suspicious,
117            description: format!(
118                "Chi-squared: {:.1} (critical: 293.0 at α=0.05). {}",
119                chi_sq,
120                if is_suspicious {
121                    "Non-uniform distribution detected."
122                } else {
123                    "Distribution consistent with random/encrypted data."
124                }
125            ),
126        }
127    }
128
129    /// Test for repeated patterns in packet sizes.
130    pub fn pattern_detection(&self, packet_sizes: &[usize]) -> AnalysisResult {
131        if packet_sizes.len() < 4 {
132            return AnalysisResult {
133                test_name: "Pattern Detection".into(),
134                statistic: 0.0,
135                p_value: 1.0,
136                is_suspicious: false,
137                description: "Insufficient data".into(),
138            };
139        }
140
141        // Count unique sizes
142        let mut unique: std::collections::HashSet<usize> = std::collections::HashSet::new();
143        for &s in packet_sizes {
144            unique.insert(s);
145        }
146
147        let unique_ratio = unique.len() as f64 / packet_sizes.len() as f64;
148
149        // Check for constant size (padding)
150        let all_same = unique.len() == 1;
151
152        // Check for repeating pattern
153        let has_pattern = detect_repeating_pattern(packet_sizes);
154
155        let is_suspicious = has_pattern && !all_same;
156
157        AnalysisResult {
158            test_name: "Pattern Detection".into(),
159            statistic: unique_ratio,
160            p_value: if is_suspicious { 0.01 } else { 0.5 },
161            is_suspicious,
162            description: format!(
163                "Unique ratio: {:.2}, all_same: {}, repeating: {}. {}",
164                unique_ratio,
165                all_same,
166                has_pattern,
167                if all_same {
168                    "Constant-size padding detected (good for privacy)."
169                } else if is_suspicious {
170                    "Repeating pattern may enable fingerprinting."
171                } else {
172                    "No obvious pattern detected."
173                }
174            ),
175        }
176    }
177
178    /// Timing regularity analysis: detect constant-rate traffic.
179    pub fn timing_regularity(&self, inter_packet_delays_us: &[u64]) -> AnalysisResult {
180        if inter_packet_delays_us.is_empty() {
181            return AnalysisResult {
182                test_name: "Timing Regularity".into(),
183                statistic: 0.0,
184                p_value: 1.0,
185                is_suspicious: false,
186                description: "No data".into(),
187            };
188        }
189
190        let mean = inter_packet_delays_us.iter().sum::<u64>() as f64
191            / inter_packet_delays_us.len() as f64;
192
193        let variance: f64 = inter_packet_delays_us
194            .iter()
195            .map(|&d| {
196                let diff = d as f64 - mean;
197                diff * diff
198            })
199            .sum::<f64>()
200            / inter_packet_delays_us.len() as f64;
201
202        let cv = if mean > 0.0 {
203            variance.sqrt() / mean
204        } else {
205            0.0
206        };
207
208        // Very low CV = constant rate (which is defensive but detectable)
209        // Very high CV = bursty (natural but fingerprintable)
210        let is_suspicious = cv < 0.05 || cv > 2.0;
211
212        AnalysisResult {
213            test_name: "Timing Regularity".into(),
214            statistic: cv,
215            p_value: if is_suspicious { 0.01 } else { 0.5 },
216            is_suspicious,
217            description: format!(
218                "CV: {:.4} (mean delay: {:.0}µs). {}",
219                cv,
220                mean,
221                if cv < 0.05 {
222                    "Very regular timing (constant-rate shaping detected)."
223                } else if cv > 2.0 {
224                    "Highly bursty traffic (may enable fingerprinting)."
225                } else {
226                    "Normal timing variation."
227                }
228            ),
229        }
230    }
231
232    /// Run all analysis tests on a traffic sample.
233    pub fn full_analysis(
234        &self,
235        payload: &[u8],
236        packet_sizes: &[usize],
237        delays_us: &[u64],
238    ) -> Vec<AnalysisResult> {
239        vec![
240            self.entropy_analysis(payload),
241            self.chi_squared_uniformity(payload),
242            self.pattern_detection(packet_sizes),
243            self.timing_regularity(delays_us),
244        ]
245    }
246
247    /// Overall suspicion score (0.0–1.0) from all analyses.
248    pub fn suspicion_score(
249        &self,
250        payload: &[u8],
251        packet_sizes: &[usize],
252        delays_us: &[u64],
253    ) -> f64 {
254        let results = self.full_analysis(payload, packet_sizes, delays_us);
255        let suspicious_count = results.iter().filter(|r| r.is_suspicious).count();
256        suspicious_count as f64 / results.len() as f64
257    }
258}
259
260impl Default for StatisticalAnalyzer {
261    fn default() -> Self {
262        Self::new()
263    }
264}
265
/// Reports whether `data` is periodic: its first `period` elements repeated
/// to the end (the last repetition may be cut short), for some `period`
/// between 1 and half the length.
///
/// Sequences shorter than four elements are never considered periodic.
/// A constant sequence counts as periodic with period 1.
fn detect_repeating_pattern(data: &[usize]) -> bool {
    let total = data.len();
    if total < 4 {
        return false;
    }

    // A sequence repeats with a given period exactly when every element
    // equals the one `period` positions before it.
    (1..=total / 2).any(|period| {
        data[period..]
            .iter()
            .zip(data)
            .all(|(later, earlier)| later == earlier)
    })
}
289
#[cfg(test)]
mod tests {
    use super::*;

    /// Deterministic pseudorandom bytes from a SplitMix64 generator, so the
    /// tests need nothing outside this module and the standard library.
    fn pseudo_random_bytes(len: usize) -> Vec<u8> {
        let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
        let mut out = Vec::with_capacity(len + 8);
        while out.len() < len {
            state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
            let mut z = state;
            z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
            z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
            z ^= z >> 31;
            out.extend_from_slice(&z.to_le_bytes());
        }
        out.truncate(len);
        out
    }

    /// Every byte value exactly `rounds` times, in scrambled order.
    ///
    /// `i -> 167 * i + 13 (mod 256)` is a bijection on byte values because
    /// 167 is odd, so each block of 256 indices covers every value once.
    fn balanced_bytes(rounds: usize) -> Vec<u8> {
        (0..rounds * 256)
            .map(|i| (i.wrapping_mul(167).wrapping_add(13) % 256) as u8)
            .collect()
    }

    #[test]
    fn test_entropy_random_data() {
        let analyzer = StatisticalAnalyzer::new();
        let data = pseudo_random_bytes(32_000);
        let entropy = analyzer.byte_entropy(&data);
        assert!(entropy > 7.0, "Random data should have high entropy: {}", entropy);
    }

    #[test]
    fn test_entropy_structured_data() {
        let analyzer = StatisticalAnalyzer::new();
        let data = b"AAAAAABBBBBBCCCCCC".repeat(100);
        let result = analyzer.entropy_analysis(&data);
        assert!(result.is_suspicious, "Structured data should be flagged");
        assert!(result.statistic < 4.0);
    }

    #[test]
    fn test_chi_squared_uniform() {
        let analyzer = StatisticalAnalyzer::new();
        // Perfectly balanced histogram: the outcome does not hinge on how a
        // particular pseudorandom stream happens to fall.
        let data = balanced_bytes(64);
        let result = analyzer.chi_squared_uniformity(&data);
        assert!(!result.is_suspicious, "Uniform-like data should pass: chi²={}", result.statistic);
    }

    #[test]
    fn test_pattern_detection() {
        let analyzer = StatisticalAnalyzer::new();

        // Repeating pattern: should detect
        let sizes = vec![100, 200, 100, 200, 100, 200, 100, 200];
        let result = analyzer.pattern_detection(&sizes);
        assert!(result.is_suspicious, "Should detect repeating pattern");

        // Constant size: not suspicious (padding)
        let constant = vec![256, 256, 256, 256, 256];
        let result2 = analyzer.pattern_detection(&constant);
        assert!(!result2.is_suspicious, "Constant size is not suspicious (padding)");
    }

    #[test]
    fn test_timing_regularity() {
        let analyzer = StatisticalAnalyzer::new();

        // Very regular timing
        let regular = vec![1000, 1000, 1000, 1000, 1000];
        let result = analyzer.timing_regularity(&regular);
        assert!(result.is_suspicious, "Perfectly regular timing should be detected");

        // Normal variation
        let varied = vec![800, 1200, 950, 1100, 1050, 900];
        let result2 = analyzer.timing_regularity(&varied);
        assert!(!result2.is_suspicious, "Normal variation should not flag");
    }
}