1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
use anyhow::{Result,Context,ensure};
use crate::{RStats,MutVectors,Vectors,Med};
use crate::f64impls::{emsg};
use std::cmp::Ordering::Equal;
use std::fmt;

impl MutVectors for &mut[f64] {

   /// Scalar multiplication of a vector, mutates self
   fn mutsmult(self, s:f64) {
     self.iter_mut().for_each(|x|{ *x*=s });
   }
   /// Vector subtraction, mutates self
   fn mutvsub(self, v: &[f64]) {
     self.iter_mut().enumerate().for_each(|(i,x)|*x-=v[i])
   }
  /// Vector addition, mutates self
   fn mutvadd(self, v: &[f64]) {
     self.iter_mut().enumerate().for_each(|(i,x)|*x+=v[i])
   }
  /// Mutate to unit vector
   fn mutvunit(self) { 
      self.mutsmult(1_f64/self.iter().map(|x|x.powi(2)).sum::<f64>().sqrt())
   }
   /// Vector magnitude duplicated for mutable type 
   fn mutvmag(self) -> f64 { self.iter().map(|x|x.powi(2)).sum::<f64>().sqrt() }
}

impl Vectors for &[f64] { 
   
   /// Scalar multiplication of a vector, creates new vec
   fn smult(self, s:f64) -> Vec<f64> {
      self.iter().map(|&x|s*x).collect()
   }
  
   /// Scalar product of two f64 slices.   
   /// Must be of the same length - no error checking for speed
   fn dotp(self, v: &[f64]) -> f64 {
      self.iter().enumerate().map(|(i,&x)| x*v[i]).sum::<f64>()    
   }

   /// Vector subtraction, creates a new Vec result
   fn vsub(self, v: &[f64]) -> Vec<f64> {
      self.iter().enumerate().map(|(i,&x)|x-v[i]).collect()
   }
   
   /// Vector addition, creates a new Vec result
   fn vadd(self, v: &[f64]) -> Vec<f64> {
      self.iter().enumerate().map(|(i,&x)|x+v[i]).collect()
   }
 
   /// Euclidian distance between two n dimensional points (vectors).  
   /// Slightly faster than vsub followed by vmag, as both are done in one loop
   fn vdist(self, v: &[f64]) -> f64 {
      self.iter().enumerate().map(|(i,&x)|(x-v[i]).powi(2)).sum::<f64>().sqrt()
   }

   /// Vector magnitude 
   fn vmag(self) -> f64 { self.iter().map(|&x|x.powi(2)).sum::<f64>().sqrt() }

   /// Unit vector - creates a new one
   fn vunit(self) ->Vec<f64> { 
      self.smult(1./self.iter().map(|x|x.powi(2)).sum::<f64>().sqrt())
   }
    
   /// Correlation coefficient of a sample of two f64 variables.
   /// # Example
   /// ```
   /// use rstats::Vectors;
   /// let v1 = vec![1_f64,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.,13.,14.];
   /// let v2 = vec![14_f64,13.,12.,11.,10.,9.,8.,7.,6.,5.,4.,3.,2.,1.];
   /// assert_eq!(v1.correlation(&v2).unwrap(),-1_f64);
   /// ```
   fn correlation(self,v:&[f64]) -> Result<f64> {
      let n = self.len();
      ensure!(n>0,emsg(file!(),line!(),"correlation - first sample is empty"));
      ensure!(n==v.len(),emsg(file!(),line!(),"correlation - samples are not of the same size"));
      let (mut sy,mut sxy,mut sx2,mut sy2) = (0_f64,0_f64,0_f64,0_f64);
      let sx:f64 = self.iter().enumerate().map(|(i,&x)| {
         let y = v[i]; 
         sy += y; sxy += x*y; sx2 += x*x; sy2 += y*y; x    
      }).sum();
   let nf = n as f64;
   Ok( (sxy-sx/nf*sy)/(((sx2-sx/nf*sx)*(sy2-sy/nf*sy)).sqrt()) )
   }

   /// Kendall Tau-B correlation coefficient of a sample of two f64 variables.
   /// Defined by: tau = (conc - disc) / sqrt((conc + disc + tiesx) * (conc + disc + tiesy))
   /// This is the simplest implementation with no sorting.
   /// # Example
   /// ```
   /// use rstats::Vectors;
   /// let v1 = vec![1_f64,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.,13.,14.];
   /// let v2 = vec![14_f64,13.,12.,11.,10.,9.,8.,7.,6.,5.,4.,3.,2.,1.];
   /// assert_eq!(v1.kendalcorr(&v2).unwrap(),-1_f64);
   /// ```
   fn kendalcorr(self,v:&[f64]) -> Result<f64> {
      let n = self.len();
      ensure!(n>0,emsg(file!(),line!(),"kendalcorr - first sample is empty"));
      ensure!(n==v.len(),emsg(file!(),line!(),"kendalcorr - samples are not of the same size"));
      let (mut conc, mut disc, mut tiesx, mut tiesy) = (0_i64,0_i64,0_i64,0_i64);
      for i in 1..n {
         let x = self[i];
         let y = v[i];
         for j in 0..i {
            let xd = x - self[j];
            let yd = y - v[j];
            if !xd.is_normal() {
               if !yd.is_normal() { continue } else { tiesx += 1; continue }
            };
            if !yd.is_normal() { tiesy += 1; continue };
            if (xd*yd).signum() > 0_f64 { conc += 1 } else { disc += 1 }               
            }
         }
      Ok((conc-disc) as f64/(((conc+disc+tiesx)*(conc+disc+tiesy)) as f64).sqrt())
   }
   /// Spearman rho correlation coefficient of a sample of two f64 variables.
   /// This is the simplest implementation with no sorting.
   /// # Example
   /// ```
   /// use rstats::Vectors;
   /// let v1 = vec![1_f64,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.,13.,14.];
   /// let v2 = vec![14_f64,13.,12.,11.,10.,9.,8.,7.,6.,5.,4.,3.,2.,1.];
   /// assert_eq!(v1.spearmancorr(&v2).unwrap(),-1_f64);
   /// ```
   fn spearmancorr(self,v:&[f64]) -> Result<f64> {
      let n = self.len();
      ensure!(n>0,emsg(file!(),line!(),"spearmancorr - first sample is empty"));
      ensure!(n==v.len(),emsg(file!(),line!(),"spearmancorr - samples are not of the same size"));
      let xvec = self.ranks().unwrap();
      let yvec = v.ranks().unwrap(); 
      let mx = xvec.ameanstd().unwrap();
      let my = yvec.ameanstd().unwrap();
      let mut covar = 0_f64;
      for i in 0..n {
         covar += (xvec[i]-mx.mean)*(yvec[i]-my.mean);
      }
      covar /= mx.std*my.std*(n as f64);
      // remove small truncation errors
      if covar > 1.0 { covar=1_f64 } else if covar < -1_f64 { covar=-1.0 }; 
      Ok(covar)
   }


   /// (Auto)correlation coefficient of pairs of successive values of (time series) f64 variable.
   /// # Example
   /// ```
   /// use rstats::Vectors;
   /// let v1 = vec![1_f64,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.,13.,14.];
   /// assert_eq!(v1.autocorr().unwrap(),0.9984603532054123_f64);
   /// ```
   fn autocorr(self) -> Result<f64> {
      let n = self.len();
       ensure!(n>=2,emsg(file!(),line!(),"autocorr - sample is too small"));
       let (mut sx,mut sy,mut sxy,mut sx2,mut sy2) = (0_f64,0_f64,0_f64,0_f64,0_f64);
       for i in 0..n-1 {
          let x = self[i]; let y = self[i+1]; 
          sx += x; sy += y; sxy += x*y; sx2 += x*x; sy2 += y*y 
       }    
       let nf = n as f64;
       Ok( (sxy-sx/nf*sy)/(((sx2-sx/nf*sx)*(sy2-sy/nf*sy)).sqrt()) )
   }

   /// Centroid = simple multidimensional arithmetic mean
   /// # Example
   /// ```
   /// use rstats::{Vectors,vimpls::genvec};
   /// let pts = genvec(15,15,255,30);
   /// let centre = pts.acentroid(15);
   /// let dist = pts.distsum(15,&centre);
   /// assert_eq!(dist, 4.14556218326653_f64);
   /// ```
   fn acentroid(self, d:usize) -> Vec<f64> {
      let n = self.len()/d;
      let mut centre = vec![0_f64;d];
      for i in 0..n {
         centre.as_mut_slice().mutvadd(self.get(i*d .. (i+1)*d).unwrap())
      }
      centre.as_mut_slice().mutsmult(1.0/n as f64);
      centre
   }

    /// Finds minimum, minimum's index, maximum, maximum's index of &[f64]
   fn minmax(self) -> (f64,usize,f64,usize) {
      let mut min = self[0]; // initialise to the first value
      let mut mini = 0;
      let mut max = self[0]; // initialised as min, allowing 'else' below
      let mut maxi = 0;
      for i in 1..self.len() {
         let x = self[i];
         if x < min { min = x; mini = i }
         else if x > max { max = x; maxi = i } 
      }
   (min,mini,max,maxi)
   }

   /// For each point, gives its sum of distances to all other points
   fn distances(self, d:usize) -> Result<Vec <f64>> {  
     let n = self.len()/d;
     ensure!(n*d == self.len(),emsg(file!(),line!(),"distances - d must divide vector length"));
     let mut dists = vec![0_f64;n]; // distances accumulator for all points
      // examine all unique pairings (lower triangular part of symmetric flat matrix)
      for i in 1..n {
         let thisp = self.get(i*d .. (i+1)*d)
            .with_context(||emsg(file!(),line!(),"distances failed to get this slice"))?;
         for j in 0..i {
            let thatp = self.get(j*d .. (j+1)*d)
               .with_context(||emsg(file!(),line!(),"distances failed to get that slice"))?;
            let d = thisp.vdist(&thatp); // calculate each distance relation just once
            dists[i] += d; dists[j] += d;   // but add it to both points   
         }
      }
      Ok(dists)           
   }   

 /// Eccentricity vector for each point
 fn eccentricities(self, d:usize) -> Result<Vec<Vec<f64>>> {  
   let n = self.len()/d;
   ensure!(n*d == self.len(),emsg(file!(),line!(),"distances - d must divide vector length"));
   // allocate vectors for the results
   let mut eccs = vec![vec![0_f64;d];n];
   // ecentricities vectors accumulator for all points
   // examine all unique pairings (lower triangular part of symmetric flat matrix)
   for i in 1..n {
      let thisp = self.get(i*d .. (i+1)*d).unwrap();
      for j in 0..i {
         let thatp = self.get(j*d .. (j+1)*d).unwrap();
         let e = thatp.vsub(&thisp).vunit(); // calculate each vector just once
         eccs[i].as_mut_slice().mutvadd(&e); 
         eccs[j].as_mut_slice().mutvsub(&e);  // mind the orientation!   
      }
   }
   Ok(eccs)           
 }   

   /// Medoid is the point belonging to set of points `self`,
   /// which has the least sum of distances to all other points. 
   /// Outlier is the point with the greatest sum of distances. 
   /// This function returns a four-tuple:  
   /// (medoid_distance, medoid_index, outlier_distance, outlier_index).
   /// `d` is the number of dimensions = length of the point sub-slices. 
   /// The entire set of points is held in one flat `&[f64]`.  
   /// This is faster than vec of vecs but we have to handle the indices.  
   /// Note: `medoid` computes each distance twice but it is probably faster than memoizing and looking them up,  
   /// unless the dimensionality is somewhat large; and it saves memory.
   /// # Example
   /// ```
   /// use rstats::{Vectors,vimpls::genvec};
   /// let pts = genvec(15,15,255,30);
   /// let (dm,_,_,_) = pts.medoid(15);
   /// assert_eq!(dm,4.812334638782327_f64);
   /// ```
   fn medoid(self, d:usize) -> (f64,usize,f64,usize) {
      self.distances(d).unwrap().minmax()
   }

   /// The sum of distances of all points in &self to given point v.    
   /// Geometric Median is defined as v which minimises this function. 
   fn distsum(self, d:usize, v:&[f64]) -> f64 {
      let n = self.len()/v.len();
      let mut sum = 0_f64;
      for i in 0..n {
         let thisp = self.get(i*d .. (i+1)*d).unwrap();
         sum += v.vdist(&thisp)              
      }
      sum
   }

   /// `Eccentricity` of a d-dimensional point belonging to the set self, specified by its indx.  
   /// Eccentricity is a measure between 0.0 and 1.0 of  a point `not being a median` of the given set. It does not need the median. 
   /// The perfect median has eccentricity zero.
   /// Of all the set points, Medoid has the lowest ecentricity and Outlier the highest.
   fn eccentr(self, d:usize, indx:usize) -> f64 {
      let n = self.len()/d;
      let mut vsum = vec![0_f64;d];
      let thisp = self.get(indx*d .. (indx+1)*d).unwrap();
      for i in 0..n {
         if i == indx { continue }; // exclude this point  
         let thatp = self.get(i*d .. (i+1)*d).unwrap();
         let unitdv = thatp.vsub(thisp).vunit();
         vsum.as_mut_slice().mutvadd(&unitdv);   // add it to their sum
      }
      vsum.vmag()/n as f64
   }

   /// Ecentricity measure and the eccentricity vector of any point (typically not one of the set).
   /// It is a measure  between 0.0 and 1.0 of `not being a median` but does not need the median.
   /// The eccentricity vector points towards the median and has maximum possible magnitude of n.
   fn veccentr(self, d:usize, thisp:&[f64]) -> Result<(f64,Vec<f64>)> {
      let n = self.len()/d;
      let mut vsum = vec![0_f64;d];
      for i in 0..n {
         let thatp = self.get(i*d .. (i+1)*d)
            .with_context(||emsg(file!(),line!(),"veccentr failed to extract that point"))?;
         let mut vdif = thatp.vsub(thisp);
         let mag = vdif.vmag();
         if !mag.is_normal() { continue }; // thisp belongs to the set
         // make vdif into a unit vector with its already known magnitude
         vdif.as_mut_slice().mutsmult(1./mag); 
         vsum.as_mut_slice().mutvadd(&vdif);   // add it to their sum
      }
      Ok((vsum.vmag()/n as f64, vsum))
   }

   /// This convenience wrapper calls `veccentr` and extracts just the eccentricity (residual error for median).
   fn ecc(self, d:usize, v:&[f64]) -> f64 {
      let (eccentricity,_) = self.veccentr(d,&v).unwrap();
      eccentricity
   }

   /// We now define MOE (median of ecentricities), a new measure of spread of multidimensional points 
   /// (or multivariate sample)  
   fn moe(self, d:usize) -> Med {
      scalarecc(self.eccentricities(d).unwrap()).median().unwrap()
   }

   /// Geometric Median (gm) is the point that minimises the sum of distances to a given set of points.
   /// It has (provably) only vector iterative solutions. 
   /// Searching methods are slow and difficult in highly dimensional space. 
   /// Weiszfeld's fixed point iteration formula had known problems with sometimes failing to converge.
   /// Especially, when the points are dense in the close proximity of the gm, 
   /// or it coincides with one of them.  
   /// However, these problems are fixed in my improved algorithm here.      
   /// There is eventually going to be a multithreaded version of `nmedian`.
   /// # Example
   /// ```
   /// use rstats::{Vectors,vimpls::genvec};
   /// let pt = genvec(15,15,255,30);
   /// let gm = pt.nmedian(15, 1e-5).unwrap();
   /// let error = pt.ecc(15,&gm);
   /// assert_eq!(error,0.000004826966175302838_f64);
   /// ```
   fn nmedian(self, d:usize, eps:f64) -> Result<Vec<f64>> {
      let n = self.len()/d;
      ensure!(n*d == self.len(),emsg(file!(),line!(),"gmedian d must divide vector length"));
      let mut oldpoint = self.acentroid(d); // start with the centroid
      loop {
        let (rsum,mut newv) = betterpoint(self,d,&oldpoint)
            .with_context(||emsg(file!(),line!(),"nmedian betterpoint call failed"))?; // find new point 
         newv.as_mut_slice().mutsmult(1.0/rsum); // adding unit vectors
         if newv.vdist(&oldpoint) < eps { 
            oldpoint = newv; break // use the last iteration anyway
         };
         oldpoint = newv                       
      }
      Ok(oldpoint)
   }  
}

/// betterpoint is called by nmedian. 
/// Scaling by rsum is left as the final step at higher level, 
/// in order to facilitate data parallelism. 
fn betterpoint(set:&[f64], d:usize, v:&[f64]) -> Result<(f64,Vec<f64>)> {
   let n = set.len()/d;
   let mut rsum = 0_f64;
   let mut vsum = vec![0_f64;d];
   for i in 0..n {
      let thatp = set.get(i*d .. (i+1)*d)
         .with_context(||emsg(file!(),line!(),"betterpoint failed to extract that point"))?; 
      let dist = v.vdist(&thatp);
      if !dist.is_normal() { continue };  
      let recip = 1.0/dist;
      rsum += recip;
      vsum.as_mut_slice().mutvadd(&thatp.smult(recip));
   }
   Ok((rsum,vsum))    
}

/// Converts the set of vectors produced by `eccentricities`
/// to their magnitudes normalised by n.
/// the output can be typically passed to `median` 
/// or `minmax` to find the Outlier and the Medoid
pub fn scalarecc(ev:Vec<Vec<f64>>) -> Vec<f64> {
   let mut scalars = Vec::new();
   let n = ev.len();
   let nf = n as f64;
   for i in 0..n { scalars.push(ev[i].vmag()/nf) };
   scalars
}

/// Sorts a mutable `Vec<f64>` in place.  
/// It is the responsibility of the user to ensure that there are no NaNs etc.
pub fn sortf(v: &mut [f64]) { 
   v.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Equal))
}

/// Generates a random f64 vector of size d x n suitable for testing. It needs two seeds.  
/// Uses local closure `rand` to generate random numbers (avoids dependencies).  
/// Random numbers are in the open interval 0..1 with uniform distribution.  
pub fn genvec(d:usize, n:usize, s1:u32, s2:u32 ) -> Vec<f64> {
   let size = d*n;
   // change the seeds as desired
   let mut m_z = s1 as u32;
   let mut m_w = s2 as u32;
   let mut rand = || {
      m_z = 36969 * (m_z & 65535) + (m_z >> 16);
      m_w = 18000 * (m_w & 65535) + (m_w >> 16);
      (((m_z << 16) & m_w) as f64 + 1.0)*2.328306435454494e-10
   };
   let mut v = Vec::with_capacity(size); 
   for _i in 0..size { v.push(rand()) }; // fills the lot with random numbers
   return v
}

/// GreenIt struct facilitates printing (in green) any type
/// that has Display implemented.
pub struct GreenIt<T: fmt::Display>(pub T);
impl<T: fmt::Display> fmt::Display for GreenIt<T> {
   fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
      write!(f,"\x1B[01;92m{}\x1B[0m",self.0.to_string())  
   }
}

/// GreenVec struct facilitates printing (in green) of vectors of any type
/// that has Display implemented.
pub struct GreenVec<T: fmt::Display>(pub Vec<T>);
impl<T: fmt::Display> fmt::Display for GreenVec<T> {
   fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
      let mut s = String::from("\x1B[01;92m[");
      let n = self.0.len();
      if n > 0 {
         s.push_str(&self.0[0].to_string()); // first item
         for i in 1..n {
            s.push_str(", ");
            s.push_str(&self.0[i].to_string());
         }
      }   
      write!(f,"{}]\x1B[0m", s)
   }
}