1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use crate::{Vecu8,here};

impl Vecu8 for &[u8] {

    /// Scalar product of two (positive) u8 slices.   
    /// Must be of the same length - no error checking (for speed)
    fn dotpu8(self, v: &[u8]) -> u64 {
        self.iter().zip(v).map(|(&xi, &vi)| (xi as u64)*(vi as u64)).sum::<u64>()
    }

    /// Cosine between two (positive) u8 slices.
    fn cosineu8(self, v: &[u8]) -> f64 {
     let (mut sxy, mut sy2) = (0_f64, 0_f64);
     let sx2: f64 = self
         .iter()
         .zip(v)
         .map(|(&ux, &uy)| {
             let x  = ux as f64;
             let y = uy as f64;
             sxy += x * y;
             sy2 += y * y;
             x*x
         })
         .sum();
     sxy / (sx2*sy2).sqrt()
    }
 
    /// Vector subtraction (converts results to f64 as they can be negative)
    fn vsubu8(self, v: &[u8]) -> Vec<f64> {
        self.iter().zip(v).map(|(&xi, &vi)| (xi as f64) - (vi as f64)).collect()
    }

    /// Vector addition ( converts results to f64, as they can exceed 255 )
    fn vaddu8(self, v: &[u8]) -> Vec<f64> {
        self.iter().zip(v).map(|(&xi, &vi)| (xi as f64)+(vi as f64)).collect()
    }

    /// Euclidian distance between self &[u8] and v:&[u8].  
    /// Faster than vsub followed by vmag, as both are done in one loop
    fn vdistu8(self, v: &[u8]) -> f64 {
        self.iter()
            .zip(v)
            .map(|(&xi, &vi)| ((xi as f64)-(vi as f64)).powi(2))
            .sum::<f64>()
            .sqrt()
    }

    /// cityblock distance
    fn cityblockdu8(self, v:&[u8]) -> f64 {
        self.iter()
        .zip(v)
        .map(|(&xi, &vi)| { let d = xi as f64 -vi as f64; if d<0_f64 {-d} else {d} } ) 
        .sum::<f64>()      
    }
    ///Euclidian distance squared, the arguments are both of &[u8] type  
    fn vdistsqu8(self, v: &[u8]) -> u64 {
        self.iter()
            .zip(v)
            .map(|(&xi, &vi)| {
               let x = xi as i32;
               let y = vi as i32;
                (x - y).pow(2) as u64})           
            .sum::<u64>()
    }
 
    /// Probability density function of bytes data
    fn pdfu8(self) -> Vec<f64> {  
        let nf = self.len() as f64;
        let mut pdfv = vec![0_f64;256];        
        for &x in self { pdfv[x as usize] += 1_f64 };
        pdfv.iter_mut().for_each(|p| if *p > 0.0 { *p /= nf });
        pdfv
    }

    /// Information (entropy) of &[u8] (in nats)
    fn entropyu8(self) -> f64 {
        let pdfv = self.pdfu8();
        pdfv.iter().map(|&p|if p > 0.0 {-p*(p.ln())} else {0.0}).sum::<f64>()    
    }

    /// Joint probability density function (here just co-occurence counts) 
    /// of successive pairs of values from two vectors of bytes 
    /// of the same lenghts n. Needs 4*256^2=262144 bytes of heap memory, 
    /// which will be sparse except for long input vectors. 
    fn jointpdfu8(self, v:&[u8]) -> Vec<Vec<u32>> {  
        let n = self.len();
        if v.len() != n { panic!("{} argument vectors must be of equal length!",here!()) }    
        let mut res:Vec<Vec<u32>> = vec![vec![0_u32;256];256]; 
        self.iter().zip(v).for_each(|(&si,&vi)| res[si as usize][vi as usize] += 1 ); 
        res
    }

    /// Joint entropy of &[u8],&[u8] (in nats)
    fn jointentropyu8(self, v:&[u8]) -> f64 {
        let n = self.len(); 
        let nf = n as f64;
        let mut entropy = 0_f64;
        // for short vecs, it is quicker to iterate through args
        if n < 65000 { 
            let mut jpdf = self.jointpdfu8(v); 
            self.iter().zip(v).for_each(|(&si,&vi)| {
              let c = jpdf[si as usize][vi as usize];
              if c > 0 {
                let p = (c as f64)/nf;                   
                entropy -= p*(p.ln()); 
                // prevent this pair's count being counted again
                jpdf[si as usize][vi as usize] = 0;  
              }
            });
            return entropy; // return value 
        } 
        // for long vecs, iterate through the counts array
        let jpdf = self.jointpdfu8(v); 
        for v in jpdf {
            for c in v {  
                if c > 0_u32 {
                    let p = (c as f64)/nf; 
                    entropy -= p*(p.ln()); 
                }
            }
        } 
        entropy              
    }

    /// Statistical pairwise dependence in range [0,1] of two &[u8] variables
    /// returns 0 iff they are statistically pairwise independent
    /// returns 1 if they are identical or all values are unique
    fn dependenceu8(self, v:&[u8]) -> f64 {     
        (self.entropyu8() + v.entropyu8())/self.jointentropyu8(v)-1.0
    }

    /// Independence in the range [1,2] of two &[u8] variables
    /// e.g. 2 is returned iff they are statistically pairwise independent
    /// returns 1 if they are identical or all values are unique
    fn independenceu8(self, v:&[u8]) -> f64 {     
        2.0*self.jointentropyu8(v)/(self.entropyu8() + v.entropyu8())
    }
}