openentropy_core/conditioning.rs

1//! Centralized entropy conditioning module.
2//!
3//! **ALL** post-processing of raw entropy lives here — no conditioning code
4//! should exist in individual source implementations. Sources produce raw bytes;
5//! this module is the single, auditable gateway for any transformation.
6//!
7//! # Architecture
8//!
9//! ```text
10//! Source → Raw Bytes → Conditioning Layer (this module) → Output
11//! ```
12//!
13//! # Conditioning Modes
14//!
15//! - **Raw**: No processing. XOR-combined bytes pass through unchanged.
16//!   Preserves the actual hardware noise signal for research.
17//! - **VonNeumann**: Debias only. Removes first-order bias without destroying
18//!   the noise structure. Output is shorter than input (~25% yield).
19//! - **Sha256**: Full SHA-256 conditioning with counter and timestamp mixing.
20//!   Produces cryptographically strong output but destroys the raw signal.
21//!
//! Most QRNG APIs (ANU, Outshift/Cisco) apply DRBG post-processing that makes their
//! output statistically indistinguishable from a PRNG's. The `Raw` mode here is what
//! makes openentropy useful for researchers studying actual hardware noise.
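//!
//! # Example
//!
//! An illustrative sketch (using the items defined in this module; not part of the
//! original docs) of how the three modes treat the same raw buffer: only `Sha256`
//! stretches to the requested length, while `Raw` and `VonNeumann` may return fewer
//! bytes than requested.
//!
//! ```ignore
//! // 0b0110_0110 keeps 4 unbiased bits per byte under Von Neumann debiasing.
//! let raw = vec![0b0110_0110u8; 64];
//! assert_eq!(condition(&raw, 100, ConditioningMode::Raw).len(), 64);        // truncate only
//! assert_eq!(condition(&raw, 100, ConditioningMode::VonNeumann).len(), 32); // 64 bytes * 4 bits
//! assert_eq!(condition(&raw, 100, ConditioningMode::Sha256).len(), 100);    // exact length
//! ```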
25
26use sha2::{Digest, Sha256};
27use std::collections::HashMap;
28
29/// Conditioning mode for entropy output.
30#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
31pub enum ConditioningMode {
32    /// No conditioning. Raw bytes pass through unchanged.
33    Raw,
34    /// Von Neumann debiasing only.
35    VonNeumann,
36    /// SHA-256 hash conditioning (default). Cryptographically strong output.
37    #[default]
38    Sha256,
39}
40
41impl std::fmt::Display for ConditioningMode {
42    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43        match self {
44            Self::Raw => write!(f, "raw"),
45            Self::VonNeumann => write!(f, "von_neumann"),
46            Self::Sha256 => write!(f, "sha256"),
47        }
48    }
49}
50
51// ---------------------------------------------------------------------------
52// Central conditioning gateway
53// ---------------------------------------------------------------------------
54
55/// Apply the specified conditioning mode to raw entropy bytes.
56///
57/// This is the **single gateway** for all entropy conditioning. No other code
58/// in the crate should perform SHA-256, Von Neumann debiasing, or any other
59/// form of whitening/post-processing on entropy data.
60///
/// - `Raw`: passes the input through untransformed, truncated to at most `n_output` bytes
/// - `VonNeumann`: debiases, then truncates to at most `n_output` bytes
/// - `Sha256`: chained SHA-256 hashing producing exactly `n_output` bytes
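///
/// # Example
///
/// A minimal sketch (illustrative): the default mode is `Sha256`, which always returns
/// exactly `n_output` bytes.
///
/// ```ignore
/// let raw = vec![0xA5u8; 500];
/// let out = condition(&raw, 32, ConditioningMode::default()); // Sha256
/// assert_eq!(out.len(), 32);
/// ```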
64pub fn condition(raw: &[u8], n_output: usize, mode: ConditioningMode) -> Vec<u8> {
65    match mode {
66        ConditioningMode::Raw => {
67            let mut out = raw.to_vec();
68            out.truncate(n_output);
69            out
70        }
71        ConditioningMode::VonNeumann => {
72            let debiased = von_neumann_debias(raw);
73            let mut out = debiased;
74            out.truncate(n_output);
75            out
76        }
77        ConditioningMode::Sha256 => sha256_condition_bytes(raw, n_output),
78    }
79}
80
81// ---------------------------------------------------------------------------
82// SHA-256 conditioning
83// ---------------------------------------------------------------------------
84
85/// SHA-256 chained conditioning: stretches or compresses raw bytes to exactly
86/// `n_output` bytes using counter-mode hashing.
87///
88/// Each 32-byte output block is: SHA-256(state || chunk || counter).
89/// State is chained from the previous block's digest.
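///
/// # Example
///
/// A sketch of the block construction (illustrative; uses `sha2::{Digest, Sha256}` as
/// imported in this module): the first output block can be reproduced by hand.
///
/// ```ignore
/// let raw = vec![7u8; 64];
/// let out = sha256_condition_bytes(&raw, 32);
/// // Block 0 = SHA-256(zero state || first 64-byte chunk || counter 0).
/// let mut h = Sha256::new();
/// h.update([0u8; 32]);
/// h.update(&raw);
/// h.update(0u64.to_le_bytes());
/// assert_eq!(out, h.finalize().to_vec());
/// ```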
90pub fn sha256_condition_bytes(raw: &[u8], n_output: usize) -> Vec<u8> {
91    if raw.is_empty() {
92        return Vec::new();
93    }
94    let mut output = Vec::with_capacity(n_output);
95    let mut state = [0u8; 32];
96    let mut offset = 0;
97    let mut counter: u64 = 0;
98    while output.len() < n_output {
99        let end = (offset + 64).min(raw.len());
100        let chunk = &raw[offset..end];
101        let mut h = Sha256::new();
102        h.update(state);
103        h.update(chunk);
104        h.update(counter.to_le_bytes());
105        state = h.finalize().into();
106        output.extend_from_slice(&state);
107        offset += 64;
108        counter += 1;
109        if offset >= raw.len() {
110            offset = 0;
111        }
112    }
113    output.truncate(n_output);
114    output
115}
116
/// SHA-256 conditioning step with an explicit chaining state, sample, counter, and
/// caller-supplied extra data. A current-time nanosecond timestamp is also mixed in,
/// so the output is not deterministic. Returns `(new_state, digest)`; both values are
/// the same 32-byte digest.
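///
/// # Example
///
/// A usage sketch (illustrative; `samples` is a hypothetical `Vec<Vec<u8>>`): chain the
/// returned state across successive samples. The digests are not asserted because the
/// mixed-in timestamp makes them non-deterministic.
///
/// ```ignore
/// let mut state = [0u8; 32];
/// for (i, sample) in samples.iter().enumerate() {
///     let (next_state, digest) = sha256_condition(&state, sample, i as u64, b"source-id");
///     state = next_state;
///     // ... consume `digest` ...
/// }
/// ```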
119pub fn sha256_condition(
120    state: &[u8; 32],
121    sample: &[u8],
122    counter: u64,
123    extra: &[u8],
124) -> ([u8; 32], [u8; 32]) {
125    let mut h = Sha256::new();
126    h.update(state);
127    h.update(sample);
128    h.update(counter.to_le_bytes());
129
130    let ts = std::time::SystemTime::now()
131        .duration_since(std::time::UNIX_EPOCH)
132        .unwrap_or_default();
133    h.update(ts.as_nanos().to_le_bytes());
134
135    h.update(extra);
136
137    let digest: [u8; 32] = h.finalize().into();
138    (digest, digest)
139}
140
141// ---------------------------------------------------------------------------
142// Von Neumann debiasing
143// ---------------------------------------------------------------------------
144
145/// Von Neumann debiasing: extract unbiased bits from a biased stream.
146///
147/// Takes pairs of bits: (0,1) → 0, (1,0) → 1, same → discard.
148/// Expected yield: ~25% of input bits (for unbiased input).
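///
/// # Example
///
/// A worked sketch (illustrative): `0b0110_0110` splits into the bit pairs
/// (0,1)(1,0)(0,1)(1,0), which keep the bits 0,1,0,1; two such bytes yield eight
/// kept bits.
///
/// ```ignore
/// let out = von_neumann_debias(&[0b0110_0110u8, 0b0110_0110]);
/// assert_eq!(out, vec![0b0101_0101]);
/// ```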
149pub fn von_neumann_debias(data: &[u8]) -> Vec<u8> {
150    let mut bits = Vec::new();
151    for byte in data {
152        for i in (0..8).step_by(2) {
153            let b1 = (byte >> (7 - i)) & 1;
154            let b2 = (byte >> (6 - i)) & 1;
155            if b1 != b2 {
156                bits.push(b1);
157            }
158        }
159    }
160
161    // Pack bits back into bytes
162    let mut result = Vec::with_capacity(bits.len() / 8);
163    for chunk in bits.chunks_exact(8) {
164        let mut byte = 0u8;
165        for (i, &bit) in chunk.iter().enumerate() {
166            byte |= bit << (7 - i);
167        }
168        result.push(byte);
169    }
170    result
171}
172
173// ---------------------------------------------------------------------------
174// XOR folding
175// ---------------------------------------------------------------------------
176
177/// XOR-fold: reduce data by XORing the first half with the second half.
178pub fn xor_fold(data: &[u8]) -> Vec<u8> {
179    if data.len() < 2 {
180        return data.to_vec();
181    }
182    let half = data.len() / 2;
183    (0..half).map(|i| data[i] ^ data[half + i]).collect()
184}
185
186// ---------------------------------------------------------------------------
187// Quick analysis utilities
188// ---------------------------------------------------------------------------
189
190// ---------------------------------------------------------------------------
191// Min-entropy estimators
192//
193// Notes:
194// - `mcv_estimate` follows the NIST 800-90B MCV style closely and is used as
195//   the primary conservative estimate.
196// - The other estimators are retained as NIST-inspired diagnostics. They are
197//   useful for comparative/source characterization, but this implementation is
198//   not a strict validation harness for 800-90B.
199// ---------------------------------------------------------------------------
200
201/// Min-entropy estimate: H∞ = -log2(max probability).
202/// More conservative than Shannon — reflects worst-case guessing probability.
203/// Returns bits per sample (0.0 to 8.0 for byte-valued data).
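///
/// # Example
///
/// A minimal sketch (illustrative): a 50/50 split over two values has p_max = 0.5,
/// so H∞ = -log2(0.5) = 1.0 bit per sample.
///
/// ```ignore
/// let mut data = vec![0u8; 500];
/// data.extend(vec![1u8; 500]);
/// assert!((min_entropy(&data) - 1.0).abs() < 1e-9);
/// ```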
204pub fn min_entropy(data: &[u8]) -> f64 {
205    if data.is_empty() {
206        return 0.0;
207    }
208    let mut counts = [0u64; 256];
209    for &b in data {
210        counts[b as usize] += 1;
211    }
212    let n = data.len() as f64;
213    let p_max = counts.iter().map(|&c| c as f64 / n).fold(0.0f64, f64::max);
214    if p_max <= 0.0 {
215        return 0.0;
216    }
217    -p_max.log2()
218}
219
/// Most Common Value (MCV) estimator, in the style of NIST SP 800-90B §6.3.1.
/// Estimates min-entropy from a 99% upper confidence bound on p_max.
/// Returns (min_entropy_bits_per_sample, p_max_upper_bound).
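///
/// # Example
///
/// An illustrative sketch: for a 50/50 split over two values the plug-in estimate is
/// 1.0 bit, but the 99% upper bound on p_max (≈0.54 at n = 1000) pulls the MCV
/// estimate below 1.0. That extra conservatism is the point of the bound.
///
/// ```ignore
/// let mut data = vec![0u8; 500];
/// data.extend(vec![1u8; 500]);
/// let (h, p_u) = mcv_estimate(&data);
/// assert!(p_u > 0.5 && p_u < 0.56);
/// assert!(h > 0.8 && h < 1.0);
/// ```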
223pub fn mcv_estimate(data: &[u8]) -> (f64, f64) {
224    if data.is_empty() {
225        return (0.0, 1.0);
226    }
227    let mut counts = [0u64; 256];
228    for &b in data {
229        counts[b as usize] += 1;
230    }
231    let n = data.len() as f64;
232    let max_count = *counts.iter().max().unwrap() as f64;
233    let p_hat = max_count / n;
234
235    // Upper bound of 99% confidence interval
236    // p_u = min(1, p_hat + 2.576 * sqrt(p_hat * (1 - p_hat) / n))
237    let z = 2.576; // z_{0.995} for 99% CI
238    let p_u = (p_hat + z * (p_hat * (1.0 - p_hat) / n).sqrt()).min(1.0);
239
240    let h = if p_u >= 1.0 {
241        0.0
242    } else {
243        (-p_u.log2()).max(0.0)
244    };
245    (h, p_u)
246}
247
248/// Collision estimator (NIST-inspired diagnostic).
249///
250/// Scans the data sequentially, finding the distance between successive
251/// "collisions" — where any two adjacent samples in the sequence are equal
252/// (data[i] == data[i+1]). The mean collision distance relates to the
253/// collision probability q = sum(p_i^2), from which we derive min-entropy.
254///
/// Note on the definition used here: NIST defines a collision as any two consecutive
/// equal values, not a repeat of a specific starting value, so we scan pairs
/// sequentially and measure the gap between successive collisions.
258///
259/// Returns estimated min-entropy bits per sample.
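///
/// # Example
///
/// An illustrative caveat (sketch): a strictly alternating pattern never produces two
/// adjacent equal values, so no collisions are observed and the fallback returns the
/// 8.0 ceiling even though the data is perfectly predictable. Pair this diagnostic
/// with the other estimators.
///
/// ```ignore
/// let data = [0u8, 1u8].repeat(500); // 0,1,0,1,...
/// assert_eq!(collision_estimate(&data), 8.0);
/// ```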
260pub fn collision_estimate(data: &[u8]) -> f64 {
261    if data.len() < 3 {
262        return 0.0;
263    }
264
265    // Scan for collisions: positions where data[i] == data[i+1].
266    // Record the distance (in samples) between successive collisions.
267    let mut distances = Vec::new();
268    let mut last_collision: Option<usize> = None;
269
270    for i in 0..data.len() - 1 {
271        if data[i] == data[i + 1] {
272            if let Some(prev) = last_collision {
273                distances.push((i - prev) as f64);
274            }
275            last_collision = Some(i);
276        }
277    }
278
279    if distances.is_empty() {
        // Fewer than two collisions found: either very high entropy or too little data.
        // Fall back to counting total collisions vs total pairs.
282        let mut collision_count = 0usize;
283        for i in 0..data.len() - 1 {
284            if data[i] == data[i + 1] {
285                collision_count += 1;
286            }
287        }
288        if collision_count == 0 {
289            return 8.0; // No collisions at all
290        }
291        // q_hat ≈ collision_count / (n-1), min-entropy from q >= p_max^2
292        let q_hat = collision_count as f64 / (data.len() - 1) as f64;
293        let p_max = q_hat.sqrt().min(1.0);
294        return if p_max <= 0.0 {
295            8.0
296        } else {
297            (-p_max.log2()).min(8.0)
298        };
299    }
300
301    let mean_dist = distances.iter().sum::<f64>() / distances.len() as f64;
302
303    // The mean inter-collision distance ≈ 1/q where q = sum(p_i^2).
304    // Since p_max^2 <= q, we have p_max <= sqrt(q) <= sqrt(1/mean_dist).
305    // Apply a confidence bound: use the lower bound on mean distance
306    // (conservative → higher q → higher p_max → lower entropy).
307    let n_collisions = distances.len() as f64;
308    let variance = distances
309        .iter()
310        .map(|d| (d - mean_dist).powi(2))
311        .sum::<f64>()
312        / (n_collisions - 1.0).max(1.0);
313    let std_err = (variance / n_collisions).sqrt();
314
315    let z = 2.576; // 99% CI
316    let mean_lower = (mean_dist - z * std_err).max(1.0);
317
318    // q_upper ≈ 1/mean_lower, p_max <= sqrt(q_upper)
319    let p_max = (1.0 / mean_lower).sqrt().min(1.0);
320
321    if p_max <= 0.0 {
322        8.0
323    } else {
324        (-p_max.log2()).min(8.0)
325    }
326}
327
328/// Markov estimator (NIST-inspired diagnostic).
329///
330/// Models first-order dependencies between consecutive samples using byte-level
331/// transition counts. For each byte value, computes the maximum transition
332/// probability from any predecessor. The per-sample entropy is then bounded by
333/// the maximum over all values of: p_init[s] * max_predecessor(p_trans[pred][s]).
334///
335/// Unlike a binned approach, this operates on all 256 byte values directly.
336/// To keep memory bounded (256x256 = 64KB), we use a flat array.
337///
338/// Returns estimated min-entropy bits per sample.
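///
/// # Example
///
/// An illustrative sketch: a strictly periodic sequence has uniform-looking marginals
/// (MCV stays high) but fully deterministic transitions, which this estimator drives
/// to ~0 bits.
///
/// ```ignore
/// let data: Vec<u8> = (0..=255u8).cycle().take(2560).collect();
/// assert!(mcv_estimate(&data).0 > 6.0);   // marginals look uniform
/// assert!(markov_estimate(&data) < 0.01); // transitions are deterministic
/// ```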
339pub fn markov_estimate(data: &[u8]) -> f64 {
340    if data.len() < 2 {
341        return 0.0;
342    }
343
344    let n = data.len() as f64;
345
346    // Initial distribution: count of each byte value
347    let mut init_counts = [0u64; 256];
348    for &b in data {
349        init_counts[b as usize] += 1;
350    }
351
352    // Transition counts: transitions[from * 256 + to]
353    let mut transitions = vec![0u64; 256 * 256];
354    for w in data.windows(2) {
355        transitions[w[0] as usize * 256 + w[1] as usize] += 1;
356    }
357
358    // Row sums for transition probabilities
359    let mut row_sums = [0u64; 256];
360    for (from, row_sum) in row_sums.iter_mut().enumerate() {
361        let base = from * 256;
362        *row_sum = transitions[base..base + 256].iter().sum();
363    }
364
365    // NIST-inspired Markov-style bound:
366    // For each output value s, find the maximum probability of producing s
367    // considering all possible predecessor states.
368    //
369    // p_max_markov = max over s of: max over pred of (p_init[pred] * p_trans[pred][s])
370    //
371    // But a simpler conservative bound: for each value s, compute
372    //   p_s = max(p_init[s], max over pred of p_trans[pred][s])
373    // and take p_max = max over s of p_s.
374    //
375    // This bounds the per-sample probability under the first-order Markov model.
376    let mut p_max = 0.0f64;
377    for s in 0..256usize {
378        // Initial probability
379        let p_init_s = init_counts[s] as f64 / n;
380        p_max = p_max.max(p_init_s);
381
382        // Max transition probability into s from any predecessor
383        for pred in 0..256usize {
384            if row_sums[pred] > 0 {
385                let p_trans = transitions[pred * 256 + s] as f64 / row_sums[pred] as f64;
386                p_max = p_max.max(p_trans);
387            }
388        }
389    }
390
391    if p_max <= 0.0 {
392        8.0
393    } else {
394        (-p_max.log2()).min(8.0)
395    }
396}
397
398/// Compression estimator (NIST-inspired diagnostic).
399///
400/// Uses Maurer's universal statistic to estimate entropy via compression.
401/// Maurer's f_n converges to the Shannon entropy rate, NOT min-entropy.
402///
403/// To convert to a min-entropy bound, we use the relationship:
404///   H∞ <= H_Shannon
405/// and apply a conservative correction. For IID data with alphabet size k=256:
406///   H∞ = -log2(p_max), H_Shannon = -sum(p_i * log2(p_i))
407/// The gap between them grows with distribution skew. We use:
408///   H∞_est ≈ f_lower * (f_lower / log2(k))
409/// which maps f_lower=log2(256)=8.0 → 8.0 (uniform) and compresses lower
410/// values quadratically, reflecting that low Shannon entropy implies even
411/// lower min-entropy.
412///
413/// Returns estimated min-entropy bits per sample.
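///
/// # Example
///
/// A minimal sketch (illustrative): for constant data every repeat distance is 1, so
/// Maurer's statistic, and with it the scaled min-entropy estimate, collapses to ~0.
///
/// ```ignore
/// assert!(compression_estimate(&vec![0u8; 1000]) < 0.01);
/// ```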
414pub fn compression_estimate(data: &[u8]) -> f64 {
415    if data.len() < 100 {
416        return 0.0;
417    }
418
419    // Maurer's universal statistic
420    // For each byte, record the distance to its previous occurrence
421    let l = 8.0f64; // log2(alphabet_size) = log2(256) = 8
422    let q = 256.min(data.len() / 4); // initialization segment length
423    let k = data.len() - q; // test segment length
424
425    if k == 0 {
426        return 0.0;
427    }
428
429    // Initialize: record last position of each byte value
430    let mut last_pos = [0usize; 256];
431    for (i, &b) in data[..q].iter().enumerate() {
432        last_pos[b as usize] = i + 1; // 1-indexed
433    }
434
435    // Test segment: compute log2 of distances
436    let mut sum = 0.0f64;
437    let mut count = 0u64;
438    for (i, &b) in data[q..].iter().enumerate() {
439        let pos = q + i + 1; // 1-indexed
440        let prev = last_pos[b as usize];
441        if prev > 0 {
442            let distance = pos - prev;
443            sum += (distance as f64).log2();
444            count += 1;
445        }
446        last_pos[b as usize] = pos;
447    }
448
449    if count == 0 {
450        return l; // No repeated values
451    }
452
453    let f_n = sum / count as f64;
454
455    // Variance estimate for confidence bound
456    let mut var_sum = 0.0f64;
457    // Reset for second pass
458    let mut last_pos2 = [0usize; 256];
459    for (i, &b) in data[..q].iter().enumerate() {
460        last_pos2[b as usize] = i + 1;
461    }
462    for (i, &b) in data[q..].iter().enumerate() {
463        let pos = q + i + 1;
464        let prev = last_pos2[b as usize];
465        if prev > 0 {
466            let distance = pos - prev;
467            let log_d = (distance as f64).log2();
468            var_sum += (log_d - f_n).powi(2);
469        }
470        last_pos2[b as usize] = pos;
471    }
472    let variance = var_sum / (count as f64 - 1.0).max(1.0);
473    let std_err = (variance / count as f64).sqrt();
474
475    // Lower confidence bound on Shannon estimate (conservative)
476    let z = 2.576; // 99% CI
477    let f_lower = (f_n - z * std_err).max(0.0);
478
479    // Convert Shannon estimate to min-entropy bound.
480    // Maurer's statistic ≈ Shannon entropy. Min-entropy <= Shannon entropy.
481    // Apply quadratic scaling: H∞_est = f_lower^2 / log2(k).
482    // This correctly maps: 8.0 → 8.0 (uniform), 4.0 → 2.0, 1.0 → 0.125.
483    // The quadratic penalty reflects that skewed distributions have a larger
484    // gap between Shannon and min-entropy.
485    (f_lower * f_lower / l).min(l)
486}
487
488/// t-Tuple estimator (NIST-inspired diagnostic).
489/// Estimates entropy from most frequent t-length tuple.
490/// Returns estimated min-entropy bits per sample.
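///
/// # Example
///
/// An illustrative sketch: in a strictly periodic sequence every 3-byte window is one
/// of only 256 distinct tuples, so the t = 3 term caps the per-sample estimate near
/// 8/3 bits.
///
/// ```ignore
/// let data: Vec<u8> = (0..=255u8).cycle().take(2560).collect();
/// let h = t_tuple_estimate(&data);
/// assert!(h > 2.0 && h < 3.0);
/// ```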
491pub fn t_tuple_estimate(data: &[u8]) -> f64 {
492    if data.len() < 20 {
493        return 0.0;
494    }
495
496    // Try t=1,2,3 and take the minimum (most conservative)
497    let mut min_h = 8.0f64;
498
499    for t in 1..=3usize {
500        if data.len() < t + 1 {
501            break;
502        }
503        let mut counts: HashMap<&[u8], u64> = HashMap::new();
504        for window in data.windows(t) {
505            *counts.entry(window).or_insert(0) += 1;
506        }
507        let n = (data.len() - t + 1) as f64;
508        let max_count = *counts.values().max().unwrap_or(&0) as f64;
509        let p_max = max_count / n;
510
511        if p_max > 0.0 {
512            // For t-tuples, per-sample entropy is -log2(p_max) / t
513            let h = -p_max.log2() / t as f64;
514            min_h = min_h.min(h);
515        }
516    }
517
518    min_h.min(8.0)
519}
520
521/// Min-entropy estimate with diagnostic side metrics.
522///
/// The primary `min_entropy` field is the conservative MCV-based estimate. The
/// remaining estimators are reported as diagnostics, and their minimum is exposed
/// as `heuristic_floor`.
526pub fn min_entropy_estimate(data: &[u8]) -> MinEntropyReport {
527    let shannon = quick_shannon(data);
528    let (mcv_h, mcv_p_upper) = mcv_estimate(data);
529    let collision_h = collision_estimate(data);
530    let markov_h = markov_estimate(data);
531    let compression_h = compression_estimate(data);
532    let t_tuple_h = t_tuple_estimate(data);
533
534    let heuristic_floor = collision_h.min(markov_h).min(compression_h).min(t_tuple_h);
535
536    MinEntropyReport {
537        shannon_entropy: shannon,
538        min_entropy: mcv_h,
539        heuristic_floor,
540        mcv_estimate: mcv_h,
541        mcv_p_upper,
542        collision_estimate: collision_h,
543        markov_estimate: markov_h,
544        compression_estimate: compression_h,
545        t_tuple_estimate: t_tuple_h,
546        samples: data.len(),
547    }
548}
549
550/// Min-entropy analysis report with individual estimator results.
551#[derive(Debug, Clone)]
552pub struct MinEntropyReport {
553    /// Shannon entropy (bits/byte, max 8.0). Upper bound, not conservative.
554    pub shannon_entropy: f64,
555    /// Primary conservative min-entropy estimate (bits/byte), MCV-based.
556    pub min_entropy: f64,
557    /// Minimum across heuristic diagnostic estimators.
558    pub heuristic_floor: f64,
559    /// Most Common Value estimator.
560    pub mcv_estimate: f64,
561    /// Upper bound on max probability from MCV
562    pub mcv_p_upper: f64,
563    /// Collision estimator (diagnostic)
564    pub collision_estimate: f64,
565    /// Markov estimator (diagnostic)
566    pub markov_estimate: f64,
567    /// Compression estimator (diagnostic)
568    pub compression_estimate: f64,
569    /// t-Tuple estimator (diagnostic)
570    pub t_tuple_estimate: f64,
571    /// Number of samples analyzed
572    pub samples: usize,
573}
574
575impl std::fmt::Display for MinEntropyReport {
576    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
577        writeln!(f, "Min-Entropy Analysis ({} samples)", self.samples)?;
578        writeln!(
579            f,
580            "  Shannon H:       {:.3} bits/byte  (upper bound)",
581            self.shannon_entropy
582        )?;
583        writeln!(
584            f,
585            "  Min-Entropy H∞:  {:.3} bits/byte  (primary, MCV)",
586            self.min_entropy
587        )?;
588        writeln!(
589            f,
590            "  Heuristic floor: {:.3} bits/byte  (diagnostic minimum)",
591            self.heuristic_floor
592        )?;
593        writeln!(f, "  ─────────────────────────────────")?;
594        writeln!(
595            f,
596            "  MCV:                 {:.3}  (p_upper={:.4})",
597            self.mcv_estimate, self.mcv_p_upper
598        )?;
599        writeln!(f, "  Collision (diag):    {:.3}", self.collision_estimate)?;
600        writeln!(f, "  Markov (diag):       {:.3}", self.markov_estimate)?;
601        writeln!(
602            f,
603            "  Compression (diag):  {:.3}  (Maurer-inspired)",
604            self.compression_estimate
605        )?;
606        writeln!(f, "  t-Tuple (diag):      {:.3}", self.t_tuple_estimate)?;
607        Ok(())
608    }
609}
610
611/// Quick min-entropy estimate using only the MCV estimator (NIST SP 800-90B 6.3.1).
612///
613/// This is the fast path used by the entropy pool and TUI for per-collection
614/// health checks. It uses only the Most Common Value estimator — the most
615/// well-established and computationally cheap NIST estimator (O(n) single pass).
616///
617/// For a full multi-estimator breakdown, use [`min_entropy_estimate`] instead.
618pub fn quick_min_entropy(data: &[u8]) -> f64 {
619    mcv_estimate(data).0
620}
621
622/// Quick Shannon entropy in bits/byte for a byte slice.
623pub fn quick_shannon(data: &[u8]) -> f64 {
624    if data.is_empty() {
625        return 0.0;
626    }
627    let mut counts = [0u64; 256];
628    for &b in data {
629        counts[b as usize] += 1;
630    }
631    let n = data.len() as f64;
632    let mut h = 0.0;
633    for &c in &counts {
634        if c > 0 {
635            let p = c as f64 / n;
636            h -= p * p.log2();
637        }
638    }
639    h
640}
641
642/// Grade a source based on its min-entropy (H∞) value.
643///
644/// This is the **single source of truth** for entropy grading. All CLI commands,
645/// server endpoints, and reports should use this function instead of duplicating
646/// threshold logic.
647///
648/// | Grade | Min-Entropy (H∞) |
649/// |-------|-------------------|
650/// | A     | ≥ 6.0             |
651/// | B     | ≥ 4.0             |
652/// | C     | ≥ 2.0             |
653/// | D     | ≥ 1.0             |
654/// | F     | < 1.0             |
655pub fn grade_min_entropy(min_entropy: f64) -> char {
656    if min_entropy >= 6.0 {
657        'A'
658    } else if min_entropy >= 4.0 {
659        'B'
660    } else if min_entropy >= 2.0 {
661        'C'
662    } else if min_entropy >= 1.0 {
663        'D'
664    } else {
665        'F'
666    }
667}
668
669/// Quick quality assessment.
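///
/// The score combines three terms (a summary of the formula in the body below):
/// `60 * (shannon / 8) + 20 * min(compression_ratio, 1) + 20 * (unique_values / 256)`,
/// graded A (>= 80), B (>= 60), C (>= 40), D (>= 20), F (< 20).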
670pub fn quick_quality(data: &[u8]) -> QualityReport {
671    if data.len() < 16 {
672        return QualityReport {
673            samples: data.len(),
674            unique_values: 0,
675            shannon_entropy: 0.0,
676            compression_ratio: 0.0,
677            quality_score: 0.0,
678            grade: 'F',
679        };
680    }
681
682    let shannon = quick_shannon(data);
683
684    // Compression ratio
685    use flate2::Compression;
686    use flate2::write::ZlibEncoder;
687    use std::io::Write;
688    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::best());
689    encoder.write_all(data).unwrap_or_default();
690    let compressed = encoder.finish().unwrap_or_default();
691    let comp_ratio = compressed.len() as f64 / data.len() as f64;
692
693    // Unique values
694    let mut seen = [false; 256];
695    for &b in data {
696        seen[b as usize] = true;
697    }
698    let unique = seen.iter().filter(|&&s| s).count();
699
700    let eff = shannon / 8.0;
701    let score = eff * 60.0 + comp_ratio.min(1.0) * 20.0 + (unique as f64 / 256.0).min(1.0) * 20.0;
702    let grade = if score >= 80.0 {
703        'A'
704    } else if score >= 60.0 {
705        'B'
706    } else if score >= 40.0 {
707        'C'
708    } else if score >= 20.0 {
709        'D'
710    } else {
711        'F'
712    };
713
714    QualityReport {
715        samples: data.len(),
716        unique_values: unique,
717        shannon_entropy: shannon,
718        compression_ratio: comp_ratio,
719        quality_score: score,
720        grade,
721    }
722}
723
724#[derive(Debug, Clone)]
725pub struct QualityReport {
726    pub samples: usize,
727    pub unique_values: usize,
728    pub shannon_entropy: f64,
729    pub compression_ratio: f64,
730    pub quality_score: f64,
731    pub grade: char,
732}
733
734#[cfg(test)]
735mod tests {
736    use super::*;
737
738    // -----------------------------------------------------------------------
739    // Conditioning mode tests
740    // -----------------------------------------------------------------------
741
742    #[test]
743    fn test_condition_raw_passthrough() {
744        let data = vec![1, 2, 3, 4, 5];
745        let out = condition(&data, 3, ConditioningMode::Raw);
746        assert_eq!(out, vec![1, 2, 3]);
747    }
748
749    #[test]
750    fn test_condition_raw_exact_length() {
751        let data: Vec<u8> = (0..100).map(|i| i as u8).collect();
752        let out = condition(&data, 100, ConditioningMode::Raw);
753        assert_eq!(out, data);
754    }
755
756    #[test]
757    fn test_condition_raw_truncates() {
758        let data: Vec<u8> = (0..100).map(|i| i as u8).collect();
759        let out = condition(&data, 50, ConditioningMode::Raw);
760        assert_eq!(out.len(), 50);
761        assert_eq!(out, &data[..50]);
762    }
763
764    #[test]
765    fn test_condition_sha256_produces_exact_length() {
766        let data = vec![42u8; 100];
767        for len in [1, 16, 32, 64, 100, 256] {
768            let out = condition(&data, len, ConditioningMode::Sha256);
769            assert_eq!(out.len(), len, "SHA256 should produce exactly {len} bytes");
770        }
771    }
772
773    #[test]
774    fn test_sha256_deterministic() {
775        let data = vec![42u8; 100];
776        let out1 = sha256_condition_bytes(&data, 64);
777        let out2 = sha256_condition_bytes(&data, 64);
778        assert_eq!(
779            out1, out2,
780            "SHA256 conditioning should be deterministic for same input"
781        );
782    }
783
784    #[test]
785    fn test_sha256_different_inputs_differ() {
786        let data1 = vec![1u8; 100];
787        let data2 = vec![2u8; 100];
788        let out1 = sha256_condition_bytes(&data1, 32);
789        let out2 = sha256_condition_bytes(&data2, 32);
790        assert_ne!(out1, out2);
791    }
792
793    #[test]
794    fn test_sha256_empty_input() {
795        let out = sha256_condition_bytes(&[], 32);
796        assert!(out.is_empty(), "Empty input should produce no output");
797    }
798
799    #[test]
800    fn test_von_neumann_reduces_size() {
801        let input = vec![0b10101010u8; 128];
802        let output = von_neumann_debias(&input);
803        assert!(output.len() < input.len());
804    }
805
806    #[test]
807    fn test_von_neumann_known_output() {
        // Input byte 0b10_10_10_10 = pairs (1,0)(1,0)(1,0)(1,0); each (1,0) -> 1,
        // so one input byte yields 4 one-bits. Packing needs 8 bits per output byte,
        // so two input bytes -> 8 one-bits -> 0b11111111.
812        let input = vec![0b10101010u8; 2];
813        let output = von_neumann_debias(&input);
814        assert_eq!(output.len(), 1);
815        assert_eq!(output[0], 0b11111111);
816    }
817
818    #[test]
819    fn test_von_neumann_alternating_01() {
820        // Input: 0b01_01_01_01 = pairs (0,1)(0,1)(0,1)(0,1)
821        // Von Neumann: (0,1) -> 0, repeated 4 times per byte
822        // Two input bytes = 8 pairs -> 8 zero bits -> 0b00000000
823        let input = vec![0b01010101u8; 2];
824        let output = von_neumann_debias(&input);
825        assert_eq!(output.len(), 1);
826        assert_eq!(output[0], 0b00000000);
827    }
828
829    #[test]
830    fn test_von_neumann_all_same_discards() {
831        // Input: all 0xFF = pairs (1,1)(1,1)... -> all discarded
832        let input = vec![0xFF; 100];
833        let output = von_neumann_debias(&input);
834        assert!(output.is_empty(), "All-ones should produce no output");
835    }
836
837    #[test]
838    fn test_von_neumann_all_zeros_discards() {
839        // Input: all 0x00 = pairs (0,0)(0,0)... -> all discarded
840        let input = vec![0x00; 100];
841        let output = von_neumann_debias(&input);
842        assert!(output.is_empty(), "All-zeros should produce no output");
843    }
844
845    #[test]
846    fn test_condition_modes_differ() {
847        let data: Vec<u8> = (0..256).map(|i| i as u8).collect();
848        let raw = condition(&data, 64, ConditioningMode::Raw);
849        let sha = condition(&data, 64, ConditioningMode::Sha256);
850        assert_ne!(raw, sha);
851    }
852
853    #[test]
854    fn test_conditioning_mode_display() {
855        assert_eq!(ConditioningMode::Raw.to_string(), "raw");
856        assert_eq!(ConditioningMode::VonNeumann.to_string(), "von_neumann");
857        assert_eq!(ConditioningMode::Sha256.to_string(), "sha256");
858    }
859
860    #[test]
861    fn test_conditioning_mode_default() {
862        assert_eq!(ConditioningMode::default(), ConditioningMode::Sha256);
863    }
864
865    // -----------------------------------------------------------------------
866    // XOR fold tests
867    // -----------------------------------------------------------------------
868
869    #[test]
870    fn test_xor_fold_basic() {
871        let data = vec![0xFF, 0x00, 0xAA, 0x55];
872        let folded = xor_fold(&data);
873        assert_eq!(folded.len(), 2);
874        assert_eq!(folded[0], 0xFF ^ 0xAA);
875        assert_eq!(folded[1], 0x55);
876    }
877
878    #[test]
879    fn test_xor_fold_single_byte() {
880        let data = vec![42];
881        let folded = xor_fold(&data);
882        assert_eq!(folded, vec![42]);
883    }
884
885    #[test]
886    fn test_xor_fold_empty() {
887        let folded = xor_fold(&[]);
888        assert!(folded.is_empty());
889    }
890
891    #[test]
892    fn test_xor_fold_odd_length() {
893        // With 5 bytes, half=2, so XOR data[0..2] with data[2..4]
894        let data = vec![1, 2, 3, 4, 5];
895        let folded = xor_fold(&data);
896        assert_eq!(folded.len(), 2);
897        assert_eq!(folded[0], 1 ^ 3);
898        assert_eq!(folded[1], 2 ^ 4);
899    }
900
901    // -----------------------------------------------------------------------
902    // Shannon entropy tests
903    // -----------------------------------------------------------------------
904
905    #[test]
906    fn test_shannon_empty() {
907        assert_eq!(quick_shannon(&[]), 0.0);
908    }
909
910    #[test]
911    fn test_shannon_single_byte() {
912        // One byte = one value, p=1.0, H = -1.0 * log2(1.0) = 0.0
913        assert_eq!(quick_shannon(&[42]), 0.0);
914    }
915
916    #[test]
917    fn test_shannon_all_same() {
918        let data = vec![0u8; 1000];
919        assert_eq!(quick_shannon(&data), 0.0);
920    }
921
922    #[test]
923    fn test_shannon_two_values_equal() {
924        // 50/50 split between two values = 1.0 bits
925        let mut data = vec![0u8; 500];
926        data.extend(vec![1u8; 500]);
927        let h = quick_shannon(&data);
928        assert!((h - 1.0).abs() < 0.01, "Expected ~1.0, got {h}");
929    }
930
931    #[test]
932    fn test_shannon_uniform_256() {
933        // Perfectly uniform over 256 values = 8.0 bits
934        let data: Vec<u8> = (0..=255).collect();
935        let h = quick_shannon(&data);
936        assert!((h - 8.0).abs() < 0.01, "Expected ~8.0, got {h}");
937    }
938
939    #[test]
940    fn test_shannon_uniform_large() {
941        // Large uniform sample — each value appears ~40 times
942        let mut data = Vec::with_capacity(256 * 40);
943        for _ in 0..40 {
944            for b in 0..=255u8 {
945                data.push(b);
946            }
947        }
948        let h = quick_shannon(&data);
949        assert!((h - 8.0).abs() < 0.01, "Expected ~8.0, got {h}");
950    }
951
952    // -----------------------------------------------------------------------
953    // Min-entropy estimator tests
954    // -----------------------------------------------------------------------
955
956    #[test]
957    fn test_min_entropy_empty() {
958        assert_eq!(min_entropy(&[]), 0.0);
959    }
960
961    #[test]
962    fn test_min_entropy_all_same() {
963        let data = vec![42u8; 1000];
964        let h = min_entropy(&data);
965        assert!(h < 0.01, "All-same should have ~0 min-entropy, got {h}");
966    }
967
968    #[test]
969    fn test_min_entropy_uniform() {
970        let mut data = Vec::with_capacity(256 * 40);
971        for _ in 0..40 {
972            for b in 0..=255u8 {
973                data.push(b);
974            }
975        }
976        let h = min_entropy(&data);
977        assert!(
978            (h - 8.0).abs() < 0.1,
979            "Uniform should have ~8.0 min-entropy, got {h}"
980        );
981    }
982
983    #[test]
984    fn test_min_entropy_two_values() {
985        let mut data = vec![0u8; 500];
986        data.extend(vec![1u8; 500]);
987        let h = min_entropy(&data);
988        // p_max = 0.5, H∞ = -log2(0.5) = 1.0
989        assert!((h - 1.0).abs() < 0.01, "Expected ~1.0, got {h}");
990    }
991
992    #[test]
993    fn test_min_entropy_biased() {
994        // 90% value 0, 10% value 1: p_max=0.9, H∞ = -log2(0.9) ≈ 0.152
995        let mut data = vec![0u8; 900];
996        data.extend(vec![1u8; 100]);
997        let h = min_entropy(&data);
998        let expected = -(0.9f64.log2());
999        assert!(
1000            (h - expected).abs() < 0.02,
1001            "Expected ~{expected:.3}, got {h}"
1002        );
1003    }
1004
1005    // -----------------------------------------------------------------------
1006    // MCV estimator tests
1007    // -----------------------------------------------------------------------
1008
1009    #[test]
1010    fn test_mcv_empty() {
1011        let (h, p) = mcv_estimate(&[]);
1012        assert_eq!(h, 0.0);
1013        assert_eq!(p, 1.0);
1014    }
1015
1016    #[test]
1017    fn test_mcv_all_same() {
1018        let data = vec![42u8; 1000];
1019        let (h, p_upper) = mcv_estimate(&data);
1020        assert!(h < 0.1, "All-same should have ~0 MCV entropy, got {h}");
1021        assert!((p_upper - 1.0).abs() < 0.01);
1022    }
1023
1024    #[test]
1025    fn test_mcv_uniform() {
1026        let mut data = Vec::with_capacity(256 * 100);
1027        for _ in 0..100 {
1028            for b in 0..=255u8 {
1029                data.push(b);
1030            }
1031        }
1032        let (h, _p_upper) = mcv_estimate(&data);
1033        assert!(h > 7.0, "Uniform should have high MCV entropy, got {h}");
1034    }
1035
1036    // -----------------------------------------------------------------------
1037    // Collision estimator tests
1038    // -----------------------------------------------------------------------
1039
1040    #[test]
1041    fn test_collision_too_short() {
1042        assert_eq!(collision_estimate(&[1, 2]), 0.0);
1043    }
1044
1045    #[test]
1046    fn test_collision_all_same() {
1047        let data = vec![0u8; 1000];
1048        let h = collision_estimate(&data);
1049        // All same -> every adjacent pair is a collision -> mean distance = 1
1050        // -> p_max = 1.0 -> H = 0
1051        assert!(
1052            h < 1.0,
1053            "All-same should have very low collision entropy, got {h}"
1054        );
1055    }
1056
1057    #[test]
1058    fn test_collision_uniform_large() {
1059        let mut data = Vec::with_capacity(256 * 100);
1060        for _ in 0..100 {
1061            for b in 0..=255u8 {
1062                data.push(b);
1063            }
1064        }
1065        let h = collision_estimate(&data);
1066        assert!(
1067            h > 3.0,
1068            "Uniform should have reasonable collision entropy, got {h}"
1069        );
1070    }
1071
1072    // -----------------------------------------------------------------------
1073    // Markov estimator tests
1074    // -----------------------------------------------------------------------
1075
1076    #[test]
1077    fn test_markov_too_short() {
1078        assert_eq!(markov_estimate(&[42]), 0.0);
1079    }
1080
1081    #[test]
1082    fn test_markov_all_same() {
1083        let data = vec![0u8; 1000];
1084        let h = markov_estimate(&data);
1085        assert!(h < 1.0, "All-same should have low Markov entropy, got {h}");
1086    }
1087
1088    #[test]
1089    fn test_markov_uniform_large() {
1090        // Byte-level Markov estimator finds the max transition probability across
1091        // all 256x256 = 65536 transitions. With ~25600 samples, the transition
1092        // matrix is very sparse (~0.4 counts per cell on average). Some cells will
1093        // get a disproportionate share by chance, making p_max high.
1094        //
1095        // This is the correct, expected behavior: the Markov estimator is inherently
1096        // conservative with small sample sizes relative to the state space.
1097        // With truly uniform IID data you'd need ~1M+ samples for the Markov
1098        // estimate to converge near 8.0.
1099        //
1100        // We verify it's meaningfully above zero (all-same baseline).
1101        let mut data = Vec::with_capacity(256 * 100);
1102        for i in 0..(256 * 100) {
1103            let v = ((i as u64)
1104                .wrapping_mul(6364136223846793005)
1105                .wrapping_add(1442695040888963407)
1106                >> 56) as u8;
1107            data.push(v);
1108        }
1109        let h = markov_estimate(&data);
1110        assert!(
1111            h > 0.1,
1112            "Pseudo-random should have Markov entropy > 0.1, got {h}"
1113        );
1114    }
1115
1116    // -----------------------------------------------------------------------
1117    // Compression estimator tests
1118    // -----------------------------------------------------------------------
1119
1120    #[test]
1121    fn test_compression_too_short() {
1122        assert_eq!(compression_estimate(&[1; 50]), 0.0);
1123    }
1124
1125    #[test]
1126    fn test_compression_all_same() {
1127        let data = vec![0u8; 1000];
1128        let h = compression_estimate(&data);
1129        assert!(
1130            h < 2.0,
1131            "All-same should have low compression entropy, got {h}"
1132        );
1133    }
1134
1135    #[test]
1136    fn test_compression_uniform_large() {
1137        let mut data = Vec::with_capacity(256 * 100);
1138        for _ in 0..100 {
1139            for b in 0..=255u8 {
1140                data.push(b);
1141            }
1142        }
1143        let h = compression_estimate(&data);
1144        assert!(
1145            h > 4.0,
1146            "Uniform should have reasonable compression entropy, got {h}"
1147        );
1148    }
1149
1150    // -----------------------------------------------------------------------
1151    // t-Tuple estimator tests
1152    // -----------------------------------------------------------------------
1153
1154    #[test]
1155    fn test_t_tuple_too_short() {
1156        assert_eq!(t_tuple_estimate(&[1; 10]), 0.0);
1157    }
1158
1159    #[test]
1160    fn test_t_tuple_all_same() {
1161        let data = vec![0u8; 1000];
1162        let h = t_tuple_estimate(&data);
1163        assert!(h < 0.1, "All-same should have ~0 t-tuple entropy, got {h}");
1164    }
1165
1166    #[test]
1167    fn test_t_tuple_uniform_large() {
1168        // t-Tuple estimator finds the most frequent t-length tuple and computes
1169        // -log2(p_max)/t. For t>1, pseudo-random data with sequential correlation
1170        // may show elevated tuple frequencies. We verify the result is well above
1171        // the all-same baseline (~0).
1172        let mut data = Vec::with_capacity(256 * 100);
1173        for i in 0..(256 * 100) {
1174            let v = ((i as u64)
1175                .wrapping_mul(6364136223846793005)
1176                .wrapping_add(1442695040888963407)
1177                >> 56) as u8;
1178            data.push(v);
1179        }
1180        let h = t_tuple_estimate(&data);
1181        assert!(
1182            h > 2.5,
1183            "Pseudo-random should have t-tuple entropy > 2.5, got {h}"
1184        );
1185    }
1186
1187    // -----------------------------------------------------------------------
1188    // Combined min-entropy report tests
1189    // -----------------------------------------------------------------------
1190
1191    #[test]
1192    fn test_min_entropy_estimate_all_same() {
1193        let data = vec![0u8; 1000];
1194        let report = min_entropy_estimate(&data);
1195        assert!(
1196            report.min_entropy < 1.0,
1197            "All-same combined estimate: {}",
1198            report.min_entropy
1199        );
1200        assert!(report.shannon_entropy < 0.01);
1201        assert_eq!(report.samples, 1000);
1202    }
1203
1204    #[test]
1205    fn test_min_entropy_estimate_uniform() {
1206        // Primary min-entropy is MCV-based; heuristic floor remains available
1207        // as an additional diagnostic view.
1208        let mut data = Vec::with_capacity(256 * 100);
1209        for i in 0..(256 * 100) {
1210            let v = ((i as u64)
1211                .wrapping_mul(6364136223846793005)
1212                .wrapping_add(1442695040888963407)
1213                >> 56) as u8;
1214            data.push(v);
1215        }
1216        let report = min_entropy_estimate(&data);
1217        assert!(
1218            report.min_entropy > 6.0,
1219            "Primary min-entropy should be high for uniform marginals: {}",
1220            report.min_entropy
1221        );
1222        assert!(
1223            report.shannon_entropy > 7.9,
1224            "Shannon should be near 8.0 for uniform marginals: {}",
1225            report.shannon_entropy
1226        );
1227        // MCV should be close to 8.0 for uniform-ish data
1228        assert!(
1229            report.mcv_estimate > 6.0,
1230            "MCV should be high for uniform data: {}",
1231            report.mcv_estimate
1232        );
1233        assert!(
1234            report.heuristic_floor <= report.min_entropy + 1e-9,
1235            "heuristic floor should not exceed primary min-entropy"
1236        );
1237    }
1238
1239    #[test]
1240    fn test_min_entropy_report_display() {
1241        let data = vec![0u8; 1000];
1242        let report = min_entropy_estimate(&data);
1243        let s = format!("{report}");
1244        assert!(s.contains("Min-Entropy Analysis"));
1245        assert!(s.contains("1000 samples"));
1246    }
1247
1248    #[test]
1249    fn test_quick_min_entropy_uses_mcv() {
1250        let data: Vec<u8> = (0..=255).collect();
1251        let quick = quick_min_entropy(&data);
1252        let (mcv_h, _) = mcv_estimate(&data);
1253        // quick_min_entropy uses MCV only — should match exactly
1254        assert!(
1255            (quick - mcv_h).abs() < f64::EPSILON,
1256            "quick_min_entropy ({quick}) should equal MCV estimate ({mcv_h})"
1257        );
1258    }
1259
1260    #[test]
1261    fn test_quick_min_entropy_leq_shannon() {
1262        // Min-entropy should always be <= Shannon entropy
1263        let data: Vec<u8> = (0..=255).cycle().take(2560).collect();
1264        let quick = quick_min_entropy(&data);
1265        let shannon = quick_shannon(&data);
1266        assert!(
1267            quick <= shannon + 0.01,
1268            "H∞ ({quick}) should be <= Shannon ({shannon})"
1269        );
1270    }
1271
1272    // -----------------------------------------------------------------------
1273    // Quality report tests
1274    // -----------------------------------------------------------------------
1275
1276    #[test]
1277    fn test_quality_too_short() {
1278        let q = quick_quality(&[1, 2, 3]);
1279        assert_eq!(q.grade, 'F');
1280        assert_eq!(q.quality_score, 0.0);
1281    }
1282
1283    #[test]
1284    fn test_quality_all_same() {
1285        let data = vec![0u8; 1000];
1286        let q = quick_quality(&data);
1287        assert!(
1288            q.grade == 'F' || q.grade == 'D',
1289            "All-same should grade poorly, got {}",
1290            q.grade
1291        );
1292        assert_eq!(q.unique_values, 1);
1293        assert!(q.shannon_entropy < 0.01);
1294    }
1295
1296    #[test]
1297    fn test_quality_uniform() {
1298        let mut data = Vec::with_capacity(256 * 40);
1299        for _ in 0..40 {
1300            for b in 0..=255u8 {
1301                data.push(b);
1302            }
1303        }
1304        let q = quick_quality(&data);
1305        assert!(
1306            q.grade == 'A' || q.grade == 'B',
1307            "Uniform should grade well, got {}",
1308            q.grade
1309        );
1310        assert_eq!(q.unique_values, 256);
1311        assert!(q.shannon_entropy > 7.9);
1312    }
1313
1314    // -----------------------------------------------------------------------
1315    // grade_min_entropy tests
1316    // -----------------------------------------------------------------------
1317
1318    #[test]
1319    fn test_grade_boundaries() {
1320        assert_eq!(grade_min_entropy(8.0), 'A');
1321        assert_eq!(grade_min_entropy(6.0), 'A');
1322        assert_eq!(grade_min_entropy(5.99), 'B');
1323        assert_eq!(grade_min_entropy(4.0), 'B');
1324        assert_eq!(grade_min_entropy(3.99), 'C');
1325        assert_eq!(grade_min_entropy(2.0), 'C');
1326        assert_eq!(grade_min_entropy(1.99), 'D');
1327        assert_eq!(grade_min_entropy(1.0), 'D');
1328        assert_eq!(grade_min_entropy(0.99), 'F');
1329        assert_eq!(grade_min_entropy(0.0), 'F');
1330    }
1331
1332    #[test]
1333    fn test_grade_negative() {
1334        assert_eq!(grade_min_entropy(-1.0), 'F');
1335    }
1336}