openentropy_core/conditioning.rs

//! Centralized entropy conditioning module.
//!
//! **ALL** post-processing of raw entropy lives here — no conditioning code
//! should exist in individual source implementations. Sources produce raw bytes;
//! this module is the single, auditable gateway for any transformation.
//!
//! # Architecture
//!
//! ```text
//! Source → Raw Bytes → Conditioning Layer (this module) → Output
//! ```
//!
//! # Conditioning Modes
//!
//! - **Raw**: No processing. XOR-combined bytes pass through unchanged.
//!   Preserves the actual hardware noise signal for research.
//! - **VonNeumann**: Debias only. Removes first-order bias without destroying
//!   the noise structure. Output is shorter than the input (~25% bit yield for
//!   unbiased input).
//! - **Sha256**: Chained SHA-256 conditioning with counter mixing (the stateful
//!   [`sha256_condition`] variant additionally mixes in a timestamp). Produces
//!   cryptographically strong output but destroys the raw signal.
//!
//! Most QRNG APIs (ANU, Outshift/Cisco) apply DRBG post-processing that makes
//! their output indistinguishable from PRNG output. The `Raw` mode here is what
//! makes openentropy useful for researchers studying actual hardware noise.
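//!
//! # Example
//!
//! A minimal sketch of the gateway in each mode. The module path
//! `openentropy_core::conditioning` is assumed here; adjust to the crate's
//! actual re-exports.
//!
//! ```
//! use openentropy_core::conditioning::{condition, ConditioningMode};
//!
//! let raw = vec![0b1010_1010u8; 64];
//!
//! // Raw: pass-through, truncated to the requested length.
//! assert_eq!(condition(&raw, 32, ConditioningMode::Raw), &raw[..32]);
//!
//! // VonNeumann: each (1,0) bit pair yields a 1-bit, so this input debiases
//! // to all-ones; for general input the output may be shorter than requested.
//! assert_eq!(condition(&raw, 32, ConditioningMode::VonNeumann), vec![0xFFu8; 32]);
//!
//! // Sha256: always exactly the requested length.
//! assert_eq!(condition(&raw, 32, ConditioningMode::Sha256).len(), 32);
//! ```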

use sha2::{Digest, Sha256};
use std::collections::HashMap;

/// Conditioning mode for entropy output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum ConditioningMode {
    /// No conditioning. Raw bytes pass through unchanged.
    Raw,
    /// Von Neumann debiasing only.
    VonNeumann,
    /// SHA-256 hash conditioning (default). Cryptographically strong output.
    #[default]
    Sha256,
}

impl std::fmt::Display for ConditioningMode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Raw => write!(f, "raw"),
            Self::VonNeumann => write!(f, "von_neumann"),
            Self::Sha256 => write!(f, "sha256"),
        }
    }
}

// ---------------------------------------------------------------------------
// Central conditioning gateway
// ---------------------------------------------------------------------------

/// Apply the specified conditioning mode to raw entropy bytes.
///
/// This is the **single gateway** for all entropy conditioning. No other code
/// in the crate should perform SHA-256, Von Neumann debiasing, or any other
/// form of whitening/post-processing on entropy data.
///
/// - `Raw`: returns the input unchanged, truncated to at most `n_output` bytes
///   (shorter input is returned as-is; there is no padding)
/// - `VonNeumann`: debiases, then truncates to at most `n_output` bytes
/// - `Sha256`: chained SHA-256 hashing to produce exactly `n_output` bytes
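///
/// A small doctest-style sketch of the length semantics (no padding in `Raw`
/// mode; module path assumed as in the module-level example):
///
/// ```
/// use openentropy_core::conditioning::{condition, ConditioningMode};
///
/// // Input shorter than requested: Raw returns it as-is.
/// assert_eq!(condition(&[7u8, 8], 4, ConditioningMode::Raw), vec![7u8, 8]);
/// // Sha256 stretches to exactly the requested length.
/// assert_eq!(condition(&[7u8, 8], 4, ConditioningMode::Sha256).len(), 4);
/// ```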
pub fn condition(raw: &[u8], n_output: usize, mode: ConditioningMode) -> Vec<u8> {
    match mode {
        ConditioningMode::Raw => {
            let mut out = raw.to_vec();
            out.truncate(n_output);
            out
        }
        ConditioningMode::VonNeumann => {
            let mut out = von_neumann_debias(raw);
            out.truncate(n_output);
            out
        }
        ConditioningMode::Sha256 => sha256_condition_bytes(raw, n_output),
    }
}

// ---------------------------------------------------------------------------
// SHA-256 conditioning
// ---------------------------------------------------------------------------

/// SHA-256 chained conditioning: stretches or compresses raw bytes to exactly
/// `n_output` bytes using counter-mode hashing.
///
/// Each 32-byte output block is `SHA-256(state || chunk || counter)`, where
/// `state` is chained from the previous block's digest.
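///
/// A short sketch of the determinism and exact-length guarantees:
///
/// ```
/// use openentropy_core::conditioning::sha256_condition_bytes;
///
/// let a = sha256_condition_bytes(&[1, 2, 3], 100);
/// let b = sha256_condition_bytes(&[1, 2, 3], 100);
/// assert_eq!(a.len(), 100); // stretched from 3 input bytes
/// assert_eq!(a, b);         // deterministic: no timestamp in this variant
/// ```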
pub fn sha256_condition_bytes(raw: &[u8], n_output: usize) -> Vec<u8> {
    if raw.is_empty() {
        return vec![0u8; n_output];
    }
    let mut output = Vec::with_capacity(n_output);
    let mut state = [0u8; 32];
    let mut offset = 0;
    let mut counter: u64 = 0;
    while output.len() < n_output {
        let end = (offset + 64).min(raw.len());
        let chunk = &raw[offset..end];
        let mut h = Sha256::new();
        h.update(state);
        h.update(chunk);
        h.update(counter.to_le_bytes());
        state = h.finalize().into();
        output.extend_from_slice(&state);
        offset += 64;
        counter += 1;
        if offset >= raw.len() {
            offset = 0;
        }
    }
    output.truncate(n_output);
    output
}

/// SHA-256 condition with explicit state, sample, counter, and extra data.
/// Also mixes in a nanosecond timestamp, so the output is deliberately
/// non-deterministic (unlike [`sha256_condition_bytes`]).
/// Returns `(new_state, digest)`; the new state is the 32-byte digest itself.
pub fn sha256_condition(
    state: &[u8; 32],
    sample: &[u8],
    counter: u64,
    extra: &[u8],
) -> ([u8; 32], [u8; 32]) {
    let mut h = Sha256::new();
    h.update(state);
    h.update(sample);
    h.update(counter.to_le_bytes());

    let ts = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default();
    h.update(ts.as_nanos().to_le_bytes());

    h.update(extra);

    let digest: [u8; 32] = h.finalize().into();
    (digest, digest)
}

// ---------------------------------------------------------------------------
// Von Neumann debiasing
// ---------------------------------------------------------------------------

/// Von Neumann debiasing: extract unbiased bits from a biased stream.
///
/// Takes pairs of bits: (0,1) → 0, (1,0) → 1, same → discard.
/// Expected yield: ~25% of input bits (for unbiased input).
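///
/// A tiny sketch of the pair rules:
///
/// ```
/// use openentropy_core::conditioning::von_neumann_debias;
///
/// // Eight (1,0) pairs across two bytes -> eight 1-bits -> one 0xFF byte.
/// assert_eq!(von_neumann_debias(&[0b1010_1010u8; 2]), vec![0xFFu8]);
/// // Equal pairs carry no extractable bit and are discarded entirely.
/// assert!(von_neumann_debias(&[0xFFu8; 8]).is_empty());
/// ```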
pub fn von_neumann_debias(data: &[u8]) -> Vec<u8> {
    let mut bits = Vec::new();
    for byte in data {
        for i in (0..8).step_by(2) {
            let b1 = (byte >> (7 - i)) & 1;
            let b2 = (byte >> (6 - i)) & 1;
            if b1 != b2 {
                bits.push(b1);
            }
        }
    }

    // Pack bits back into bytes; any trailing partial byte (< 8 bits) is dropped.
    let mut result = Vec::with_capacity(bits.len() / 8);
    for chunk in bits.chunks_exact(8) {
        let mut byte = 0u8;
        for (i, &bit) in chunk.iter().enumerate() {
            byte |= bit << (7 - i);
        }
        result.push(byte);
    }
    result
}

// ---------------------------------------------------------------------------
// XOR folding
// ---------------------------------------------------------------------------

/// XOR-fold: reduce data by XORing the first half with the second half.
/// For inputs of length ≥ 2 the output has `len / 2` bytes, so an odd final
/// byte is dropped; inputs shorter than 2 bytes are returned unchanged.
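///
/// A quick sketch:
///
/// ```
/// use openentropy_core::conditioning::xor_fold;
///
/// assert_eq!(xor_fold(&[0xF0u8, 0x0F, 0xFF, 0x00]), vec![0x0Fu8, 0x0F]);
/// ```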
pub fn xor_fold(data: &[u8]) -> Vec<u8> {
    if data.len() < 2 {
        return data.to_vec();
    }
    let half = data.len() / 2;
    (0..half).map(|i| data[i] ^ data[half + i]).collect()
}

// ---------------------------------------------------------------------------
// Min-Entropy Estimators (NIST SP 800-90B Section 6.3)
// ---------------------------------------------------------------------------

/// Min-entropy estimate: H∞ = -log2(max probability).
/// More conservative than Shannon — reflects worst-case guessing probability.
/// Returns bits per sample (0.0 to 8.0 for byte-valued data).
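///
/// A quick sketch: a constant stream has `p_max = 1`, hence zero min-entropy.
///
/// ```
/// use openentropy_core::conditioning::min_entropy;
///
/// assert_eq!(min_entropy(&[42u8; 100]), 0.0);
/// ```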
pub fn min_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    let mut counts = [0u64; 256];
    for &b in data {
        counts[b as usize] += 1;
    }
    let n = data.len() as f64;
    let p_max = counts.iter().map(|&c| c as f64 / n).fold(0.0f64, f64::max);
    if p_max <= 0.0 {
        return 0.0;
    }
    -p_max.log2()
}

/// Most Common Value (MCV) estimator — NIST SP 800-90B Section 6.3.1.
/// Estimates min-entropy from an upper confidence bound on p_max.
/// Returns (min_entropy_bits_per_sample, p_max_upper_bound).
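///
/// A small sketch on a constant stream, where `p_hat = 1` and the bound
/// collapses to 1:
///
/// ```
/// use openentropy_core::conditioning::mcv_estimate;
///
/// let (h, p_upper) = mcv_estimate(&[0u8; 100]);
/// assert!(h < 0.01);
/// assert!((p_upper - 1.0).abs() < 1e-12);
/// ```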
pub fn mcv_estimate(data: &[u8]) -> (f64, f64) {
    if data.is_empty() {
        return (0.0, 1.0);
    }
    let mut counts = [0u64; 256];
    for &b in data {
        counts[b as usize] += 1;
    }
    let n = data.len() as f64;
    let max_count = *counts.iter().max().unwrap() as f64;
    let p_hat = max_count / n;

    // Upper bound of the 99% confidence interval:
    // p_u = min(1, p_hat + 2.576 * sqrt(p_hat * (1 - p_hat) / n))
    let z = 2.576; // z_{0.995} for a 99% CI
    let p_u = (p_hat + z * (p_hat * (1.0 - p_hat) / n).sqrt()).min(1.0);

    let h = if p_u >= 1.0 {
        0.0
    } else {
        (-p_u.log2()).max(0.0)
    };
    (h, p_u)
}

/// Collision estimator — simplified variant of NIST SP 800-90B Section 6.3.2.
/// Measures the average distance between repeated values.
/// Returns estimated min-entropy bits per sample.
pub fn collision_estimate(data: &[u8]) -> f64 {
    if data.len() < 3 {
        return 0.0;
    }

    // Count collision distances
    let mut distances = Vec::new();
    let mut i = 0;
    while i < data.len() - 1 {
        let mut j = i + 1;
        // Find next collision (repeated value pair)
        while j < data.len() && data[j] != data[i] {
            j += 1;
        }
        if j < data.len() {
            distances.push((j - i) as f64);
            i = j + 1;
        } else {
            break;
        }
    }

    if distances.is_empty() {
        return 8.0; // No collisions found — maximum entropy
    }

    let mean_dist = distances.iter().sum::<f64>() / distances.len() as f64;

    // The mean collision distance relates to entropy: for a distribution with
    // symbol probabilities p_i, E[distance] ≈ 1 / sum(p_i^2). Since
    // sum(p_i^2) >= p_max^2, a lower confidence bound on the mean distance
    // yields the conservative bound p_max <= sqrt(1 / mean).
    let n_collisions = distances.len() as f64;
    let variance = distances
        .iter()
        .map(|d| (d - mean_dist).powi(2))
        .sum::<f64>()
        / (n_collisions - 1.0).max(1.0);
    let std_err = (variance / n_collisions).sqrt();

    // Lower bound on mean distance (conservative → lower entropy)
    let z = 2.576;
    let mean_lower = (mean_dist - z * std_err).max(1.0);

    // p_max <= sqrt(1 / mean_lower), then H∞ = -log2(p_max)
    let p_max = (1.0 / mean_lower).sqrt().min(1.0);

    if p_max <= 0.0 {
        8.0
    } else {
        (-p_max.log2()).min(8.0)
    }
}

/// Markov estimator — NIST SP 800-90B Section 6.3.3.
/// Models first-order dependencies between consecutive samples.
/// Returns estimated min-entropy bits per sample.
pub fn markov_estimate(data: &[u8]) -> f64 {
    if data.len() < 2 {
        return 0.0;
    }

    // Build a transition matrix. A full 256×256 matrix needs far more samples
    // than are typically available, so bin byte values into 16 levels.
    let bins = 16u8;
    let bin_of = |b: u8| -> usize { (b as usize * bins as usize) / 256 };

    let mut transitions = vec![vec![0u64; bins as usize]; bins as usize];

    for w in data.windows(2) {
        let from = bin_of(w[0]);
        let to = bin_of(w[1]);
        transitions[from][to] += 1;
    }

    // Compute transition probabilities and find max path probability
    let n = data.len() as f64;

    // Initial distribution
    let p_init: Vec<f64> = {
        let mut counts = vec![0u64; bins as usize];
        for &b in data {
            counts[bin_of(b)] += 1;
        }
        counts.iter().map(|&c| c as f64 / n).collect()
    };

    // Transition probabilities
    let mut p_trans = vec![vec![0.0f64; bins as usize]; bins as usize];
    for (i, row) in transitions.iter().enumerate() {
        let row_sum: u64 = row.iter().sum();
        if row_sum > 0 {
            for (j, &count) in row.iter().enumerate() {
                p_trans[i][j] = count as f64 / row_sum as f64;
            }
        }
    }

    // Max probability of any single sample given the Markov model:
    // p_max = max over states s of max(p_init[s], max_t(p_trans[t][s]))
    let mut p_max = 0.0f64;
    for s in 0..bins as usize {
        p_max = p_max.max(p_init[s]);
        for row in p_trans.iter().take(bins as usize) {
            p_max = p_max.max(row[s]);
        }
    }

    // We report H∞ ≈ -log2(p_max_bin) at bin granularity. Since 16 bins carry
    // at most log2(16) = 4 bits, this is deliberately conservative for
    // byte-valued data (binning reduces apparent entropy).
    if p_max <= 0.0 {
        8.0
    } else {
        (-p_max.log2()).min(8.0)
    }
}


/// Compression estimator — NIST SP 800-90B Section 6.3.4.
/// Uses Maurer's universal statistic to estimate entropy via compression.
/// Returns estimated min-entropy bits per sample.
pub fn compression_estimate(data: &[u8]) -> f64 {
    if data.len() < 100 {
        return 0.0;
    }

    // Maurer's universal statistic:
    // for each byte, record the distance to its previous occurrence.
    let l = 8; // bits per symbol (bytes)
    let q = 256.min(data.len() / 4); // initialization segment length
    let k = data.len() - q; // test segment length

    if k == 0 {
        return 0.0;
    }

    // Initialize: record last position of each byte value
    let mut last_pos = [0usize; 256];
    for (i, &b) in data[..q].iter().enumerate() {
        last_pos[b as usize] = i + 1; // 1-indexed
    }

    // Test segment: compute log2 of distances
    let mut sum = 0.0f64;
    let mut count = 0u64;
    for (i, &b) in data[q..].iter().enumerate() {
        let pos = q + i + 1; // 1-indexed
        let prev = last_pos[b as usize];
        if prev > 0 {
            let distance = pos - prev;
            sum += (distance as f64).log2();
            count += 1;
        }
        last_pos[b as usize] = pos;
    }

    if count == 0 {
        return l as f64; // No repeated values
    }

    let f_n = sum / count as f64;

    // Variance estimate for the confidence bound. The distances are recomputed
    // in a second pass because the mean f_n is needed first.
    let mut var_sum = 0.0f64;
    let mut last_pos2 = [0usize; 256];
    for (i, &b) in data[..q].iter().enumerate() {
        last_pos2[b as usize] = i + 1;
    }
    for (i, &b) in data[q..].iter().enumerate() {
        let pos = q + i + 1;
        let prev = last_pos2[b as usize];
        if prev > 0 {
            let distance = pos - prev;
            let log_d = (distance as f64).log2();
            var_sum += (log_d - f_n).powi(2);
        }
        last_pos2[b as usize] = pos;
    }
    let variance = var_sum / (count as f64 - 1.0).max(1.0);
    let std_err = (variance / count as f64).sqrt();

    // Lower confidence bound (conservative)
    let z = 2.576;
    let f_lower = (f_n - z * std_err).max(0.0);

    // f_n approximates per-sample Shannon entropy, and min-entropy is at most
    // Shannon entropy, so the lower confidence bound is used directly as a
    // (non-strict) min-entropy estimate, capped at `l` bits per symbol.
    f_lower.min(l as f64)
}

/// t-Tuple estimator — NIST SP 800-90B Section 6.3.5.
/// Estimates entropy from the most frequent t-length tuple.
/// Returns estimated min-entropy bits per sample.
pub fn t_tuple_estimate(data: &[u8]) -> f64 {
    if data.len() < 20 {
        return 0.0;
    }

    // Try t = 1, 2, 3 and take the minimum (most conservative)
    let mut min_h = 8.0f64;

    for t in 1..=3usize {
        if data.len() < t + 1 {
            break;
        }
        let mut counts: HashMap<&[u8], u64> = HashMap::new();
        for window in data.windows(t) {
            *counts.entry(window).or_insert(0) += 1;
        }
        let n = (data.len() - t + 1) as f64;
        let max_count = *counts.values().max().unwrap_or(&0) as f64;
        let p_max = max_count / n;

        if p_max > 0.0 {
            // For t-tuples, per-sample entropy is -log2(p_max) / t
            let h = -p_max.log2() / t as f64;
            min_h = min_h.min(h);
        }
    }

    min_h.min(8.0)
}

/// Combined min-entropy estimate using multiple estimators.
/// Takes the minimum (most conservative) across all methods.
/// Returns a [`MinEntropyReport`] with individual and combined estimates.
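///
/// A small sketch; by construction the combined value never exceeds any
/// individual estimator:
///
/// ```
/// use openentropy_core::conditioning::min_entropy_estimate;
///
/// let report = min_entropy_estimate(&[0u8; 1000]);
/// assert!(report.min_entropy <= report.mcv_estimate);
/// println!("{report}"); // pretty-printed per-estimator breakdown
/// ```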
pub fn min_entropy_estimate(data: &[u8]) -> MinEntropyReport {
    let shannon = quick_shannon(data);
    let (mcv_h, mcv_p_upper) = mcv_estimate(data);
    let collision_h = collision_estimate(data);
    let markov_h = markov_estimate(data);
    let compression_h = compression_estimate(data);
    let t_tuple_h = t_tuple_estimate(data);

    // Min-entropy is the minimum of all estimators (most conservative)
    let combined = mcv_h
        .min(collision_h)
        .min(markov_h)
        .min(compression_h)
        .min(t_tuple_h);

    MinEntropyReport {
        shannon_entropy: shannon,
        min_entropy: combined,
        mcv_estimate: mcv_h,
        mcv_p_upper,
        collision_estimate: collision_h,
        markov_estimate: markov_h,
        compression_estimate: compression_h,
        t_tuple_estimate: t_tuple_h,
        samples: data.len(),
    }
}

/// Min-entropy analysis report with individual estimator results.
#[derive(Debug, Clone)]
pub struct MinEntropyReport {
    /// Shannon entropy (bits/byte, max 8.0). Upper bound, not conservative.
    pub shannon_entropy: f64,
    /// Combined min-entropy estimate (bits/byte). Most conservative across all estimators.
    pub min_entropy: f64,
    /// Most Common Value estimator (NIST 6.3.1)
    pub mcv_estimate: f64,
    /// Upper bound on max probability from MCV
    pub mcv_p_upper: f64,
    /// Collision estimator (NIST 6.3.2)
    pub collision_estimate: f64,
    /// Markov estimator (NIST 6.3.3)
    pub markov_estimate: f64,
    /// Compression estimator (NIST 6.3.4)
    pub compression_estimate: f64,
    /// t-Tuple estimator (NIST 6.3.5)
    pub t_tuple_estimate: f64,
    /// Number of samples analyzed
    pub samples: usize,
}

impl std::fmt::Display for MinEntropyReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Min-Entropy Analysis ({} samples)", self.samples)?;
        writeln!(f, "  Shannon H:      {:.3} bits/byte", self.shannon_entropy)?;
        writeln!(f, "  Min-Entropy H∞: {:.3} bits/byte", self.min_entropy)?;
        writeln!(f, "  ─────────────────────────────")?;
        writeln!(
            f,
            "  MCV:            {:.3}  (p_upper={:.4})",
            self.mcv_estimate, self.mcv_p_upper
        )?;
        writeln!(f, "  Collision:      {:.3}", self.collision_estimate)?;
        writeln!(f, "  Markov:         {:.3}", self.markov_estimate)?;
        writeln!(f, "  Compression:    {:.3}", self.compression_estimate)?;
        writeln!(f, "  t-Tuple:        {:.3}", self.t_tuple_estimate)?;
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// Quick analysis utilities
// ---------------------------------------------------------------------------

/// Quick min-entropy (just the combined estimate, no full report).
pub fn quick_min_entropy(data: &[u8]) -> f64 {
    min_entropy_estimate(data).min_entropy
}

/// Quick Shannon entropy in bits/byte for a byte slice.
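///
/// A quick sketch: one pass over all 256 byte values is exactly uniform, so
/// the result is 8 bits/byte.
///
/// ```
/// use openentropy_core::conditioning::quick_shannon;
///
/// let data: Vec<u8> = (0..=255).collect();
/// assert!((quick_shannon(&data) - 8.0).abs() < 1e-9);
/// ```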
pub fn quick_shannon(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    let mut counts = [0u64; 256];
    for &b in data {
        counts[b as usize] += 1;
    }
    let n = data.len() as f64;
    let mut h = 0.0;
    for &c in &counts {
        if c > 0 {
            let p = c as f64 / n;
            h -= p * p.log2();
        }
    }
    h
}

/// Grade a source based on its min-entropy (H∞) value.
///
/// This is the **single source of truth** for entropy grading. All CLI commands,
/// server endpoints, and reports should use this function instead of duplicating
/// threshold logic.
///
/// | Grade | Min-Entropy (H∞) |
/// |-------|------------------|
/// | A     | ≥ 6.0            |
/// | B     | ≥ 4.0            |
/// | C     | ≥ 2.0            |
/// | D     | ≥ 1.0            |
/// | F     | < 1.0            |
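///
/// A quick sketch:
///
/// ```
/// use openentropy_core::conditioning::grade_min_entropy;
///
/// assert_eq!(grade_min_entropy(7.2), 'A');
/// assert_eq!(grade_min_entropy(0.3), 'F');
/// ```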
pub fn grade_min_entropy(min_entropy: f64) -> char {
    if min_entropy >= 6.0 {
        'A'
    } else if min_entropy >= 4.0 {
        'B'
    } else if min_entropy >= 2.0 {
        'C'
    } else if min_entropy >= 1.0 {
        'D'
    } else {
        'F'
    }
}

/// Quick quality assessment.
pub fn quick_quality(data: &[u8]) -> QualityReport {
    if data.len() < 16 {
        return QualityReport {
            samples: data.len(),
            unique_values: 0,
            shannon_entropy: 0.0,
            compression_ratio: 0.0,
            quality_score: 0.0,
            grade: 'F',
        };
    }

    let shannon = quick_shannon(data);

    // Compression ratio
    use flate2::Compression;
    use flate2::write::ZlibEncoder;
    use std::io::Write;
    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::best());
    encoder.write_all(data).unwrap_or_default();
    let compressed = encoder.finish().unwrap_or_default();
    let comp_ratio = compressed.len() as f64 / data.len() as f64;

    // Unique values
    let mut seen = [false; 256];
    for &b in data {
        seen[b as usize] = true;
    }
    let unique = seen.iter().filter(|&&s| s).count();

    let eff = shannon / 8.0;
    let score = eff * 60.0 + comp_ratio.min(1.0) * 20.0 + (unique as f64 / 256.0).min(1.0) * 20.0;
    let grade = if score >= 80.0 {
        'A'
    } else if score >= 60.0 {
        'B'
    } else if score >= 40.0 {
        'C'
    } else if score >= 20.0 {
        'D'
    } else {
        'F'
    };

    QualityReport {
        samples: data.len(),
        unique_values: unique,
        shannon_entropy: shannon,
        compression_ratio: comp_ratio,
        quality_score: score,
        grade,
    }
}

#[derive(Debug, Clone)]
pub struct QualityReport {
    pub samples: usize,
    pub unique_values: usize,
    pub shannon_entropy: f64,
    pub compression_ratio: f64,
    pub quality_score: f64,
    pub grade: char,
}

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Conditioning mode tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_condition_raw_passthrough() {
        let data = vec![1, 2, 3, 4, 5];
        let out = condition(&data, 3, ConditioningMode::Raw);
        assert_eq!(out, vec![1, 2, 3]);
    }

    #[test]
    fn test_condition_raw_exact_length() {
        let data: Vec<u8> = (0..100).map(|i| i as u8).collect();
        let out = condition(&data, 100, ConditioningMode::Raw);
        assert_eq!(out, data);
    }

    #[test]
    fn test_condition_raw_truncates() {
        let data: Vec<u8> = (0..100).map(|i| i as u8).collect();
        let out = condition(&data, 50, ConditioningMode::Raw);
        assert_eq!(out.len(), 50);
        assert_eq!(out, &data[..50]);
    }

    #[test]
    fn test_condition_sha256_produces_exact_length() {
        let data = vec![42u8; 100];
        for len in [1, 16, 32, 64, 100, 256] {
            let out = condition(&data, len, ConditioningMode::Sha256);
            assert_eq!(out.len(), len, "SHA256 should produce exactly {len} bytes");
        }
    }

    #[test]
    fn test_sha256_deterministic() {
        let data = vec![42u8; 100];
        let out1 = sha256_condition_bytes(&data, 64);
        let out2 = sha256_condition_bytes(&data, 64);
        assert_eq!(
            out1, out2,
            "SHA256 conditioning should be deterministic for same input"
        );
    }

    #[test]
    fn test_sha256_different_inputs_differ() {
        let data1 = vec![1u8; 100];
        let data2 = vec![2u8; 100];
        let out1 = sha256_condition_bytes(&data1, 32);
        let out2 = sha256_condition_bytes(&data2, 32);
        assert_ne!(out1, out2);
    }

    #[test]
    fn test_sha256_empty_input() {
        let out = sha256_condition_bytes(&[], 32);
        assert_eq!(out.len(), 32);
        assert_eq!(out, vec![0u8; 32], "Empty input should produce zero bytes");
    }

    #[test]
    fn test_von_neumann_reduces_size() {
        let input = vec![0b10101010u8; 128];
        let output = von_neumann_debias(&input);
        assert!(output.len() < input.len());
    }

    #[test]
    fn test_von_neumann_known_output() {
        // Input byte 0b10_10_10_10 = pairs (1,0)(1,0)(1,0)(1,0).
        // Each (1,0) pair yields a 1-bit, so one input byte gives 4 output bits,
        // and two input bytes give the 8 bits needed for one output byte:
        // eight (1,0) pairs -> 0b11111111.
        let input = vec![0b10101010u8; 2];
        let output = von_neumann_debias(&input);
        assert_eq!(output.len(), 1);
        assert_eq!(output[0], 0b11111111);
    }

    #[test]
    fn test_von_neumann_alternating_01() {
        // Input: 0b01_01_01_01 = pairs (0,1)(0,1)(0,1)(0,1)
        // Von Neumann: (0,1) -> 0, repeated 4 times per byte
        // Two input bytes = 8 pairs -> 8 zero bits -> 0b00000000
        let input = vec![0b01010101u8; 2];
        let output = von_neumann_debias(&input);
        assert_eq!(output.len(), 1);
        assert_eq!(output[0], 0b00000000);
    }

    #[test]
    fn test_von_neumann_all_same_discards() {
        // Input: all 0xFF = pairs (1,1)(1,1)... -> all discarded
        let input = vec![0xFF; 100];
        let output = von_neumann_debias(&input);
        assert!(output.is_empty(), "All-ones should produce no output");
    }

    #[test]
    fn test_von_neumann_all_zeros_discards() {
        // Input: all 0x00 = pairs (0,0)(0,0)... -> all discarded
        let input = vec![0x00; 100];
        let output = von_neumann_debias(&input);
        assert!(output.is_empty(), "All-zeros should produce no output");
    }

    #[test]
    fn test_condition_modes_differ() {
        let data: Vec<u8> = (0..256).map(|i| i as u8).collect();
        let raw = condition(&data, 64, ConditioningMode::Raw);
        let sha = condition(&data, 64, ConditioningMode::Sha256);
        assert_ne!(raw, sha);
    }

    #[test]
    fn test_conditioning_mode_display() {
        assert_eq!(ConditioningMode::Raw.to_string(), "raw");
        assert_eq!(ConditioningMode::VonNeumann.to_string(), "von_neumann");
        assert_eq!(ConditioningMode::Sha256.to_string(), "sha256");
    }

    #[test]
    fn test_conditioning_mode_default() {
        assert_eq!(ConditioningMode::default(), ConditioningMode::Sha256);
    }

    // -----------------------------------------------------------------------
    // XOR fold tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_xor_fold_basic() {
        let data = vec![0xFF, 0x00, 0xAA, 0x55];
        let folded = xor_fold(&data);
        assert_eq!(folded.len(), 2);
        assert_eq!(folded[0], 0xFF ^ 0xAA);
        assert_eq!(folded[1], 0x00 ^ 0x55);
    }

    #[test]
    fn test_xor_fold_single_byte() {
        let data = vec![42];
        let folded = xor_fold(&data);
        assert_eq!(folded, vec![42]);
    }

    #[test]
    fn test_xor_fold_empty() {
        let folded = xor_fold(&[]);
        assert!(folded.is_empty());
    }

    #[test]
    fn test_xor_fold_odd_length() {
        // With 5 bytes, half=2, so XOR data[0..2] with data[2..4]
        let data = vec![1, 2, 3, 4, 5];
        let folded = xor_fold(&data);
        assert_eq!(folded.len(), 2);
        assert_eq!(folded[0], 1 ^ 3);
        assert_eq!(folded[1], 2 ^ 4);
    }

    // -----------------------------------------------------------------------
    // Shannon entropy tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_shannon_empty() {
        assert_eq!(quick_shannon(&[]), 0.0);
    }

    #[test]
    fn test_shannon_single_byte() {
        // One byte = one value, p=1.0, H = -1.0 * log2(1.0) = 0.0
        assert_eq!(quick_shannon(&[42]), 0.0);
    }

    #[test]
    fn test_shannon_all_same() {
        let data = vec![0u8; 1000];
        assert_eq!(quick_shannon(&data), 0.0);
    }

    #[test]
    fn test_shannon_two_values_equal() {
        // 50/50 split between two values = 1.0 bits
        let mut data = vec![0u8; 500];
        data.extend(vec![1u8; 500]);
        let h = quick_shannon(&data);
        assert!((h - 1.0).abs() < 0.01, "Expected ~1.0, got {h}");
    }

    #[test]
    fn test_shannon_uniform_256() {
        // Perfectly uniform over 256 values = 8.0 bits
        let data: Vec<u8> = (0..=255).collect();
        let h = quick_shannon(&data);
        assert!((h - 8.0).abs() < 0.01, "Expected ~8.0, got {h}");
    }

    #[test]
    fn test_shannon_uniform_large() {
        // Large uniform sample — each value appears exactly 40 times
        let mut data = Vec::with_capacity(256 * 40);
        for _ in 0..40 {
            for b in 0..=255u8 {
                data.push(b);
            }
        }
        let h = quick_shannon(&data);
        assert!((h - 8.0).abs() < 0.01, "Expected ~8.0, got {h}");
    }

    // -----------------------------------------------------------------------
    // Min-entropy estimator tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_min_entropy_empty() {
        assert_eq!(min_entropy(&[]), 0.0);
    }

    #[test]
    fn test_min_entropy_all_same() {
        let data = vec![42u8; 1000];
        let h = min_entropy(&data);
        assert!(h < 0.01, "All-same should have ~0 min-entropy, got {h}");
    }

    #[test]
    fn test_min_entropy_uniform() {
        let mut data = Vec::with_capacity(256 * 40);
        for _ in 0..40 {
            for b in 0..=255u8 {
                data.push(b);
            }
        }
        let h = min_entropy(&data);
        assert!(
            (h - 8.0).abs() < 0.1,
            "Uniform should have ~8.0 min-entropy, got {h}"
        );
    }

    #[test]
    fn test_min_entropy_two_values() {
        let mut data = vec![0u8; 500];
        data.extend(vec![1u8; 500]);
        let h = min_entropy(&data);
        // p_max = 0.5, H∞ = -log2(0.5) = 1.0
        assert!((h - 1.0).abs() < 0.01, "Expected ~1.0, got {h}");
    }

    #[test]
    fn test_min_entropy_biased() {
        // 90% value 0, 10% value 1: p_max=0.9, H∞ = -log2(0.9) ≈ 0.152
        let mut data = vec![0u8; 900];
        data.extend(vec![1u8; 100]);
        let h = min_entropy(&data);
        let expected = -(0.9f64.log2());
        assert!(
            (h - expected).abs() < 0.02,
            "Expected ~{expected:.3}, got {h}"
        );
    }

    // -----------------------------------------------------------------------
    // MCV estimator tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_mcv_empty() {
        let (h, p) = mcv_estimate(&[]);
        assert_eq!(h, 0.0);
        assert_eq!(p, 1.0);
    }

    #[test]
    fn test_mcv_all_same() {
        let data = vec![42u8; 1000];
        let (h, p_upper) = mcv_estimate(&data);
        assert!(h < 0.1, "All-same should have ~0 MCV entropy, got {h}");
        assert!((p_upper - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_mcv_uniform() {
        let mut data = Vec::with_capacity(256 * 100);
        for _ in 0..100 {
            for b in 0..=255u8 {
                data.push(b);
            }
        }
        let (h, _p_upper) = mcv_estimate(&data);
        assert!(h > 7.0, "Uniform should have high MCV entropy, got {h}");
    }

    // -----------------------------------------------------------------------
    // Collision estimator tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_collision_too_short() {
        assert_eq!(collision_estimate(&[1, 2]), 0.0);
    }

    #[test]
    fn test_collision_all_same() {
        let data = vec![0u8; 1000];
        let h = collision_estimate(&data);
        // All same -> every adjacent pair is a collision -> mean distance = 1
        // -> p_max = 1.0 -> H = 0
        assert!(
            h < 1.0,
            "All-same should have very low collision entropy, got {h}"
        );
    }

    #[test]
    fn test_collision_uniform_large() {
        let mut data = Vec::with_capacity(256 * 100);
        for _ in 0..100 {
            for b in 0..=255u8 {
                data.push(b);
            }
        }
        let h = collision_estimate(&data);
        assert!(
            h > 3.0,
            "Uniform should have reasonable collision entropy, got {h}"
        );
    }

    // -----------------------------------------------------------------------
    // Markov estimator tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_markov_too_short() {
        assert_eq!(markov_estimate(&[42]), 0.0);
    }

    #[test]
    fn test_markov_all_same() {
        let data = vec![0u8; 1000];
        let h = markov_estimate(&data);
        assert!(h < 1.0, "All-same should have low Markov entropy, got {h}");
    }

    #[test]
    fn test_markov_uniform_large() {
        // Markov estimator bins into 16 levels and finds max transition probability.
        // Even good pseudo-random data will show some transition bias due to binning
        // and finite sample size. We just verify it's meaningfully above the all-same
        // baseline (~0) while accepting the conservative nature of this estimator.
        let mut data = Vec::with_capacity(256 * 100);
        for i in 0..(256 * 100) {
            let v = ((i as u64)
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407)
                >> 56) as u8;
            data.push(v);
        }
        let h = markov_estimate(&data);
        assert!(
            h > 0.5,
            "Pseudo-random should have Markov entropy > 0.5, got {h}"
        );
    }

    // -----------------------------------------------------------------------
    // Compression estimator tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_compression_too_short() {
        assert_eq!(compression_estimate(&[1; 50]), 0.0);
    }

    #[test]
    fn test_compression_all_same() {
        let data = vec![0u8; 1000];
        let h = compression_estimate(&data);
        assert!(
            h < 2.0,
            "All-same should have low compression entropy, got {h}"
        );
    }

    #[test]
    fn test_compression_uniform_large() {
        let mut data = Vec::with_capacity(256 * 100);
        for _ in 0..100 {
            for b in 0..=255u8 {
                data.push(b);
            }
        }
        let h = compression_estimate(&data);
        assert!(
            h > 4.0,
            "Uniform should have reasonable compression entropy, got {h}"
        );
    }

    // -----------------------------------------------------------------------
    // t-Tuple estimator tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_t_tuple_too_short() {
        assert_eq!(t_tuple_estimate(&[1; 10]), 0.0);
    }

    #[test]
    fn test_t_tuple_all_same() {
        let data = vec![0u8; 1000];
        let h = t_tuple_estimate(&data);
        assert!(h < 0.1, "All-same should have ~0 t-tuple entropy, got {h}");
    }

    #[test]
    fn test_t_tuple_uniform_large() {
        // t-Tuple estimator finds the most frequent t-length tuple and computes
        // -log2(p_max)/t. For t>1, pseudo-random data with sequential correlation
        // may show elevated tuple frequencies. We verify the result is well above
        // the all-same baseline (~0).
        let mut data = Vec::with_capacity(256 * 100);
        for i in 0..(256 * 100) {
            let v = ((i as u64)
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407)
                >> 56) as u8;
            data.push(v);
        }
        let h = t_tuple_estimate(&data);
        assert!(
            h > 2.5,
            "Pseudo-random should have t-tuple entropy > 2.5, got {h}"
        );
    }

    // -----------------------------------------------------------------------
    // Combined min-entropy report tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_min_entropy_estimate_all_same() {
        let data = vec![0u8; 1000];
        let report = min_entropy_estimate(&data);
        assert!(
            report.min_entropy < 1.0,
            "All-same combined estimate: {}",
            report.min_entropy
        );
        assert!(report.shannon_entropy < 0.01);
        assert_eq!(report.samples, 1000);
    }

    #[test]
    fn test_min_entropy_estimate_uniform() {
        // Combined estimate takes the minimum across all estimators, so it will
        // be limited by the most conservative one (often Markov). We verify it's
        // meaningfully above the all-same baseline and Shannon is near maximum.
        let mut data = Vec::with_capacity(256 * 100);
        for i in 0..(256 * 100) {
            let v = ((i as u64)
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407)
                >> 56) as u8;
            data.push(v);
        }
        let report = min_entropy_estimate(&data);
        assert!(
            report.min_entropy > 0.5,
            "Combined estimate should be > 0.5: {}",
            report.min_entropy
        );
        assert!(
            report.shannon_entropy > 7.9,
            "Shannon should be near 8.0 for uniform marginals: {}",
            report.shannon_entropy
        );
    }

    #[test]
    fn test_min_entropy_report_display() {
        let data = vec![0u8; 1000];
        let report = min_entropy_estimate(&data);
        let s = format!("{report}");
        assert!(s.contains("Min-Entropy Analysis"));
        assert!(s.contains("1000 samples"));
    }

    #[test]
    fn test_quick_min_entropy_matches_report() {
        let data: Vec<u8> = (0..=255).collect();
        let quick = quick_min_entropy(&data);
        let report = min_entropy_estimate(&data);
        assert!((quick - report.min_entropy).abs() < f64::EPSILON);
    }

    // -----------------------------------------------------------------------
    // Quality report tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_quality_too_short() {
        let q = quick_quality(&[1, 2, 3]);
        assert_eq!(q.grade, 'F');
        assert_eq!(q.quality_score, 0.0);
    }

    #[test]
    fn test_quality_all_same() {
        let data = vec![0u8; 1000];
        let q = quick_quality(&data);
        assert!(
            q.grade == 'F' || q.grade == 'D',
            "All-same should grade poorly, got {}",
            q.grade
        );
        assert_eq!(q.unique_values, 1);
        assert!(q.shannon_entropy < 0.01);
    }

    #[test]
    fn test_quality_uniform() {
        let mut data = Vec::with_capacity(256 * 40);
        for _ in 0..40 {
            for b in 0..=255u8 {
                data.push(b);
            }
        }
        let q = quick_quality(&data);
        assert!(
            q.grade == 'A' || q.grade == 'B',
            "Uniform should grade well, got {}",
            q.grade
        );
        assert_eq!(q.unique_values, 256);
        assert!(q.shannon_entropy > 7.9);
    }

    // -----------------------------------------------------------------------
    // grade_min_entropy tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_grade_boundaries() {
        assert_eq!(grade_min_entropy(8.0), 'A');
        assert_eq!(grade_min_entropy(6.0), 'A');
        assert_eq!(grade_min_entropy(5.99), 'B');
        assert_eq!(grade_min_entropy(4.0), 'B');
        assert_eq!(grade_min_entropy(3.99), 'C');
        assert_eq!(grade_min_entropy(2.0), 'C');
        assert_eq!(grade_min_entropy(1.99), 'D');
        assert_eq!(grade_min_entropy(1.0), 'D');
        assert_eq!(grade_min_entropy(0.99), 'F');
        assert_eq!(grade_min_entropy(0.0), 'F');
    }

    #[test]
    fn test_grade_negative() {
        assert_eq!(grade_min_entropy(-1.0), 'F');
    }
}