1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
use crate::{data_error, Vecu8, RE};
impl Vecu8 for &[u8] {
/// Probability density function of bytes data
fn pdfu8(self) -> Vec<f64> {
let nf = self.len() as f64;
let mut pdfv = vec![0_f64; 256];
for &x in self {
pdfv[x as usize] += 1_f64
}
pdfv.iter_mut().for_each(|p| {
if *p > 0.0 {
*p /= nf
}
});
pdfv
}
/// Information (entropy) of &[u8] (in nats)
fn entropyu8(self) -> f64 {
let pdfv = self.pdfu8();
pdfv.iter()
.map(|&p| if p > 0.0 { -p * (p.ln()) } else { 0.0 })
.sum::<f64>()
}
/// Joint probability density function (here just co-occurence counts)
/// of successive pairs of values from two vectors of bytes
/// of the same lenghts n. Needs 4*256^2=262144 bytes of heap memory,
/// which will be sparse except for long input vectors.
fn jointpdfu8(self, v: &[u8]) -> Result<Vec<Vec<u32>>, RE> {
let n = self.len();
if v.len() != n {
return data_error("jointpdfu8: argument vectors must be of equal length!");
}
let mut res: Vec<Vec<u32>> = vec![vec![0_u32; 256]; 256];
self.iter()
.zip(v)
.for_each(|(&si, &vi)| res[si as usize][vi as usize] += 1);
Ok(res)
}
/// Joint entropy of &[u8],&[u8] (in nats)
fn jointentropyu8(self, v: &[u8]) -> Result<f64, RE> {
let n = self.len();
let nf = n as f64;
let mut entropy = 0_f64;
// for short vecs, it is quicker to iterate through args
if n < 65000 {
let mut jpdf = self.jointpdfu8(v)?;
for (&si, &vi) in self.iter().zip(v) {
let c = jpdf[si as usize][vi as usize];
if c > 0 {
let p = (c as f64) / nf;
entropy -= p * (p.ln());
// prevent this pair's count being counted again
jpdf[si as usize][vi as usize] = 0;
}
}
return Ok(entropy); // return value
}
// for long vecs, iterate through the counts array
let jpdf = self.jointpdfu8(v)?;
for v in jpdf {
for c in v {
if c > 0_u32 {
let p = (c as f64) / nf;
entropy -= p * (p.ln());
}
}
}
Ok(entropy)
}
/// Statistical pairwise dependence in range [0,1] of two &[u8] variables
/// returns 0 iff they are statistically pairwise independent
/// returns 1 if they are identical or all values are unique
fn dependenceu8(self, v: &[u8]) -> Result<f64, RE> {
Ok((self.entropyu8() + v.entropyu8()) / self.jointentropyu8(v)? - 1.0)
}
/// Independence in the range [1,2] of two &[u8] variables
/// e.g. 2 is returned iff they are statistically pairwise independent
/// returns 1 if they are identical or all values are unique
fn independenceu8(self, v: &[u8]) -> Result<f64, RE> {
Ok(2.0 * self.jointentropyu8(v)? / (self.entropyu8() + v.entropyu8()))
}
}