1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
pub mod statsg;
pub mod mutstats;
pub mod vecg;
pub mod vecu8;
pub mod mutvec;
pub mod vecvec;
pub mod vecvecg;

// reexporting to avoid duplication and for backwards compatibility
pub use indxvec::{MinMax,here,wi,wv,printvv}; 
/// simple error handling
use anyhow::{Result,bail}; 
use core::iter::FromIterator;

// Structs //

/// Median and quartiles
#[derive(Default)]
pub struct Med {
    pub lquartile: f64,
    pub median: f64,
    pub uquartile: f64,
}
impl std::fmt::Display for Med {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "median:\n\tLower Q: {}\n\tMedian:  {}\n\tUpper Q: {}",
            wi(&self.lquartile),
            wi(&self.median),
            wi(&self.uquartile)
        )
    }
}

/// Mean and standard deviation (or std ratio for geometric mean)
#[derive(Default)]
pub struct MStats {
    pub mean: f64,
    pub std: f64,
}
impl std::fmt::Display for MStats {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "mean±std: {}±{}", wi(&self.mean), wi(&self.std))
    }
}

// Auxiliary Functions //

/// Potentially useful recast of a whole slice
pub fn tof64<T>(s: &[T]) -> Vec<f64> where T: Copy, f64: From<T> {
    s.iter().map(| &x | f64::from(x)).collect()
}

/// Necessary recast of a whole i64 slice to f64 
pub fn i64tof64(s: &[i64]) -> Vec<f64> {
    s.iter().map(| &x | x as f64).collect()
}

/// Sum of linear weights 1..n.
/// Also the size of an upper or lower triangle 
/// of a square array (including the diagonal)
pub fn wsum(n: usize) -> f64 {
    (n * (n + 1)) as f64 / 2.
}

/// Generates a vector of n vectors, each of length d, all filled with random numbers for testing.
/// It needs two seeds s1 and s2. Same seeds will produce the same random sequence.  
/// Uses local closure `rand` to generate random numbers (avoids dependencies).  
/// Random numbers are in the open interval 0..1 with uniform distribution.  
pub fn genvec(d: usize, n: usize, mut s1: u32, mut s2: u32) -> Vec<Vec<f64>> {
    if n * d < 1 { panic!("{}\n\tnon positive dimensions",here!()) }
    // random numbers generating closure with captured seeds s1,s2 
    let mut rand = || {
        s1 = 36969 * (s1 & 65535) + (s1 >> 16);
        s2 = 18000 * (s2 & 65535) + (s2 >> 16);
        (((s1 << 16) & s2) as f64 + 1.0) * 2.328306435454494e-10
    };
    let mut v: Vec<Vec<f64>> = Vec::with_capacity(n);
    for _i in 0..n {
        let mut pt = Vec::with_capacity(d);
        for _j in 0..d {
            pt.push(rand())
        }
        v.push(pt)
    } // fills the lot with random numbers
    v
}

/// Generates a vector of n vectors, each of length d, all filled with random numbers for testing.
/// It needs two seeds s1 and s2. Same seeds will produce the same random sequence.  
/// Uses local closure `rand` to generate random numbers (avoids dependencies).  
/// Random numbers are in the closed interval 0..255 with uniform distribution.  
pub fn genvecu8(d: usize, n: usize, mut s1: u32, mut s2: u32) -> Vec<Vec<u8>> {
    if n * d < 1 { panic!("{}\n\tgenvecu8: non positive dimensions",here!()) } 
    // random numbers generating closure with captured seeds s1,s2 
    let mut rand = || {
        s1 = 36969 * (s1 & 65535) + (s1 >> 16);
        s2 = 18000 * (s2 & 65535) + (s2 >> 16);
        (256.0*(((s1 << 16) & s2) as f64 + 1.0)*2.328306435454494e-10).floor() as u8
    };
    let mut v: Vec<Vec<u8>> = Vec::with_capacity(n);
    for _i in 0..n {
        let mut pt = Vec::with_capacity(d);
        for _j in 0..d {
            pt.push(rand())
        }
        v.push(pt)
    } // fills the lot with random numbers
    v
}

// Traits

/// Statistical measures of a single variable (one generic vector of data) and 
/// vector algebra applicable to a single (generic) vector. 
/// Thus these methods take no arguments.
/// There is just one limitation: data of end type `i64` has to be explicitly converted to `f64`.
/// That is to raise awareness that, in this particular case, some precision may be lost. 
/// Function `statsg::i64tof64(&s)` will convert the whole slice.
pub trait Stats { 

    /// Vector magnitude
    fn vmag(self) -> f64;
    /// Vector magnitude squared (sum of squares)
    fn vmagsq(self) -> f64;
    /// vector with reciprocal components
    fn vreciprocal(self) -> Result<Vec<f64>>;
    /// vector with inverse magnitude 
    fn vinverse(self) -> Result<Vec<f64>>;
    // negated vector (all components swap sign)
    fn negv(self) -> Vec<f64>;
    /// Unit vector
    fn vunit(self) -> Vec<f64>;
    
    /// Arithmetic mean
    fn amean(self) -> Result<f64> 
        where Self: std::marker::Sized { bail!("amean not implemented for this type")}
    /// Arithmetic mean and standard deviation
    fn ameanstd(self) -> Result<MStats> 
        where Self: std::marker::Sized { bail!("ameanstd not implemented for this type")}
    /// Weighted arithmetic mean
    fn awmean(self) -> Result<f64> 
        where Self: std::marker::Sized { bail!("awmean not implemented for this type")}
    /// Weighted arithmetic men and standard deviation
    fn awmeanstd(self) -> Result<MStats>
        where Self: std::marker::Sized { bail!("awmeanstd not implemented for this type")}
    /// Harmonic mean
    fn hmean(self) -> Result<f64>
        where Self: std::marker::Sized { bail!("hmean not implemented for this type")}
    /// Harmonic mean and experimental standard deviation 
    fn hmeanstd(self) -> Result<MStats>
        where Self: std::marker::Sized { bail!("hmeanstd not implemented for this type")}
    /// Weighted harmonic mean
    fn hwmean(self) -> Result<f64> 
        where Self: std::marker::Sized { bail!("hwmean not implemented for this type")}
    /// Weighted harmonic mean and standard deviation 
    fn hwmeanstd(self) -> Result<MStats>
        where Self: std::marker::Sized { bail!("hwgmeanstd not implemented for this type")}
    /// Geometric mean
    fn gmean(self) -> Result<f64>
        where Self: std::marker::Sized { bail!("gmean not implemented for this type")}
    /// Geometric mean and standard deviation ratio
    fn gmeanstd(self) -> Result<MStats>
        where Self: std::marker::Sized { bail!("gmeanstd not implemented for this type")}
    /// Weighed geometric mean
    fn gwmean(self) -> Result<f64> 
        where Self: std::marker::Sized { bail!("gwmean not implemented for this type")}
    /// Weighted geometric mean and standard deviation ratio
    fn gwmeanstd(self) -> Result<MStats>
        where Self: std::marker::Sized { bail!("gwmeanstd not implemented for this type")}
    /// Median and quartiles
    fn median(self) -> Result<Med>
        where Self: std::marker::Sized { bail!("median not implemented for this type")}
    /// Probability density function of a sorted slice
    fn pdf(self) -> Vec<f64>;
    /// Information (entropy) in nats
    fn entropy(self) -> f64;
    /// (Auto)correlation coefficient of pairs of successive values of (time series) variable.
    fn autocorr(self) -> f64; 
    /// Linear transform to interval [0,1]
    fn lintrans(self) -> Vec<f64>;
    /// Reconstructs the full symmetric square matrix from its lower diagonal compact form,
    /// produced by covar, covone, wcovar
    fn symmatrix(self) -> Vec<Vec<f64>>;
   }

/// A few of the `Stats` methods are reimplemented here
/// (only for f64), so that they mutate `self` in-place.
/// This is more efficient and convenient in some circumstances.
pub trait MutStats {
    /// Invert the magnitude
    fn minvert(self); 
    // negate vector (all components swap sign)
    fn mneg(self);
    /// Make into a unit vector
    fn munit(self);
    /// Linearly transform to interval [0,1]
    fn mlintrans(self);
    /// Sort in place  
    fn msortf(self); 
}

/// Vector Algebra on two vectors (represented here as generic slices).
/// Also included are scalar operations on the `self` vector.
pub trait Vecg<T,U> { 
    /// Scalar addition to vector
    fn sadd(self, s:U) -> Vec<f64>;
    /// Scalar multiplication of vector, creates new vec
     fn smult(self, s:U) -> Vec<f64>;
    /// Scalar product 
    fn dotp(self, v:&[U]) -> f64;
    /// Cosine of angle between two slices
    fn cosine(self, v:&[U]) -> f64;
    /// Vectors' subtraction (difference)
    fn vsub(self, v:&[U]) -> Vec<f64>;
    /// Vectors difference as unit vector (done together for efficiency)
    fn vsubunit(self, v: &[U]) -> Vec<f64>; 
    /// Vector addition
    fn vadd(self, v:&[U]) -> Vec<f64>; 
    /// Euclidian distance 
    fn vdist(self, v:&[U]) -> f64;
    /// Euclidian distance squared
    fn vdistsq(self, v:&[U]) -> f64; 
     /// cityblock distance
    fn cityblockd(self, v:&[U]) -> f64;
    /// Magnitude of the cross product |a x b| = |a||b|sin(theta).
    /// Attains maximum `|a|.|b|` when the vectors are othogonal.
    fn varea(self, v:&[U]) -> f64;
    /// Area proportional to the swept arc
     fn varc(self, v:&[U]) -> f64; 
    /// Vector similarity S in the interval [0,1]: S = (1+cos(theta))/2
    fn vsim(self, v:&[U]) -> f64;
    /// We define vector dissimilarity D in the interval [0,1]: D = 1-S = (1-cos(theta))/2
    fn vdisim(self, v:&[U]) -> f64;
    /// Lower triangular part of a covariance matrix for a single f64 vector.
    fn covone(self, m:&[U]) -> Vec<f64>;
    /// Kronecker product of two vectors 
    fn kron(self, m:&[U]) -> Vec<f64>; 
    /// Outer product of two vectors 
    fn outer(self, m:&[U]) -> Vec<Vec<f64>>; 
    /// Joint probability density function 
    fn jointpdf(self,v:&[U]) -> Vec<f64>;     
    /// Joint entropy of &[T],&[U] in nats (units of e)
    fn jointentropy(self, v:&[U]) -> f64;
    /// Statistical dependence based on joint entropy
    fn dependence(self, v:&[U]) -> f64; 
    /// Pearson's correlation.  
    fn correlation(self, v:&[U]) -> f64; 
    /// Kendall Tau-B correlation. 
    fn kendalcorr(self, v:&[U]) -> f64;
    /// Spearman rho correlation.
    fn spearmancorr(self, v:&[U]) -> f64;   
}
/// Vector algebra: methods operaing on one vector of generic end type
/// and the second argument, which is a scalar or vector of f64 end type.
pub trait Vecf64<_T> {
    /// Scalar multiplication by f64
    fn smultf64(self, s:f64) -> Vec<f64>;
    /// Vector subtraction of `&[f64]`
    fn vsubf64(self, v:&[f64]) -> Vec<f64>;
    /// Vectors difference unitised (done together for efficiency)
    fn vsubunitf64(self, v: &[f64]) -> Vec<f64>;
    /// Euclidian distance to `&[f64]`
    fn vdistf64(self, v:&[f64]) -> f64;
    /// Adding `&[f64]` 
    fn vaddf64(self, v:&[f64]) -> Vec<f64>;
    /// Euclidian distance to `&[f64]` squared  
    fn vdistsqf64(self, v:&[f64]) -> f64;
}

/// Mutable vector operations that take one generic argument. 
/// A few of the essential `Vecg` methods are reimplemented here 
/// to mutate `self` in-place (only for f64). 
/// This is for efficiency and convenience, for example, in
/// vector iterative methods.
pub trait MutVecg<U> {
    /// mutable multiplication by a scalar
    fn mutsmult(self, _s:U);  
    /// mutable vector subtraction
    fn mutvsub(self, _v: &[U]); 
    /// mutable vector addition
    fn mutvadd(self, _v: &[U]);  
}

/// Vector mutating operations that take one argument of f64 end type.
pub trait MutVecf64 {
    /// mutable multiplication by a scalar
    fn mutsmultf64(self, _s:f64); 
    /// mutable vector subtraction 
    fn mutvsubf64(self, _v:&[f64]);
    /// mutable vector addition
    fn mutvaddf64(self, _v:&[f64]); 
}

/// Methods specialised to, or more efficient for `&[u8]`
pub trait Vecu8 {
    /// Scalar product of two (positive) u8 slices.   
    /// Must be of the same length - no error checking (for speed)
    fn dotpu8(self, v: &[u8]) -> u64; 
    /// Cosine between two (positive) u8 slices.
    fn cosineu8(self, v: &[u8]) -> f64; 
    /// Vector subtraction (converts results to f64 as they can be negative)
    fn vsubu8(self, v: &[u8]) -> Vec<f64>;
    /// Vector addition ( converts results to f64, as they can exceed 255 )
    fn vaddu8(self, v: &[u8]) -> Vec<f64>;
    /// Euclidian distance between self &[u8] and v:&[u8].  
    /// Faster than vsub followed by vmag, as both are done in one loop
    fn vdistu8(self, v: &[u8]) -> f64; 
    /// cityblock distance
    fn cityblockdu8(self, v:&[u8]) -> f64;
    ///Euclidian distance squared, the arguments are both of &[u8] type  
    fn vdistsqu8(self, v: &[u8]) -> u64; 
    /// Probability density function of bytes data
    fn pdfu8(self) -> Vec<f64>;
    /// Information (entropy) of &[u8] (in nats)
    fn entropyu8(self) -> f64;
    /// Joint probability density function (here just co-occurence counts) 
    /// of paired values in two vectors of bytes of the same length.
    /// Needs n^2 x 32bits of memory. 
    fn jointpdfu8(self, v:&[u8]) -> Vec<Vec<u32>>;
    /// Joint entropy of &[u8],&[u8] (in nats)
    fn jointentropyu8(self, v:&[u8]) -> f64;
    /// Dependence of two &[u8] variables, the range is [0,1],
    /// i.e. it returns 0 iff they are statistically independent
    /// and 1 when they are identical
    fn dependenceu8(self, v:&[u8]) -> f64;
}

/// A few specialised methods applicable to `Vec<Vec<u8>>` (vector of vectors of bytes).
pub trait VecVecu8 { 
    /// Centroid = euclidian mean of a set of points  
    fn acentroid(self) -> Vec<f64>;
    /// Weighted Centre
    fn wacentroid(self,ws: &[u8]) -> Vec<f64>;
    /// Eccentricity vector added to a non member point,
    fn nxnonmember(self, p:&[f64]) -> Vec<f64>;
    /// Weighted eccentricity vector for a non member point
    fn wnxnonmember(self, ws:&[u8], p:&[f64]) -> Vec<f64>; 
    /// Weighted geometric median, sorted eccentricities magnitudes, cpdf of the weights
    fn gmedian(self, eps:f64) -> Vec<f64>; 
    /// The weighted geometric median
    fn wgmedian(self, ws:&[u8], eps: f64) -> Vec<f64>;
    /// Lower triangular part of a covariance matrix of a Vec of u8 vectors.
    fn covar(self, med:&[f64]) -> Vec<f64>; 
    fn wcovar(self, ws:&[u8], m:&[f64]) -> Vec<f64>;
}

/// Methods applicable to a single argument: a vector of vectors of generic end type.
/// Operations on a set of multidimensional vectors.
pub trait VecVec<T> {
    /// Arithmetic Centre = euclidian mean of a set of points
    fn acentroid(self) -> Vec<f64>;
    /// Geometric Centroid
    fn gcentroid(self) -> Vec<f64>;
    /// Harmonic Centroid = harmonic mean of a set of points
    fn hcentroid(self) -> Vec<f64>; 
    /// Possible first iteration point for geometric medians
    fn firstpoint(self) -> Vec<f64>;
    /// Sums of distances from each point to all other points
    fn distsums(self) -> Vec<f64>;
    /// Medoid distance, its index, outlier distance, its index
    fn medout(self) -> MinMax<f64>; 
    /// Fast sums of distances from each point to all other points 
    fn distsuminset(self, indx: usize) -> f64;
    /// Next approx gm computed at a member point given by its indx
    fn nxmember(self, indx: usize) -> Vec<f64>;
    /// Next approx gm computed at a nonmember point p
    fn nxnonmember(self, p:&[f64]) -> Vec<f64>;
    /// Estimated ecentricity vector from a member point given by its indx to gm
    fn eccmember(self, indx: usize) -> Vec<f64>;
    /// Estimated eccentricity vector from a non member point
    fn eccnonmember(self, p:&[f64]) -> Vec<f64>;
    /// Estimated eccentricity vectors from each member point
    fn eccentricities(self) -> Vec<Vec<f64>>;
    /// Exact eccentricity vectors from each member point by first finding the gm.
    /// As well as being more accurate, it is usually faster than `eccentricities` above, 
    /// especially for large numbers of points.
    fn exacteccs(self, eps: f64) -> Vec<Vec<f64>>; 
    /// Median and quartiles of eccentricities (new robust measure of spread of a multivariate sample)
    fn eccinfo(self, eps: f64) -> (MStats,Med,MinMax<f64>) where Vec<f64>:FromIterator<f64>;
    /// Medoid and Outlier as defined by eccentricities.
    fn emedoid(self, eps: f64) -> MinMax<f64> where Vec<f64>:FromIterator<f64>;
    /// Returns sorted eccentricities magnitudes
    fn sortedeccs(self, ascending:bool, gm:&[f64]) -> Vec<f64>; 
    /// Improved Weizsfeld's Algorithm for geometric median, to accuracy eps
    // fn nmedian(self, eps: f64) -> Vec<f64>;
    /// New algorithm for geometric median, to accuracy eps    
    fn gmedian(self, eps: f64) -> Vec<f64>;
    /// Secant method geometric median
    fn smedian(self, eps: f64) -> Vec<f64>; 
    /// Same a smedian but returns also the number of iterations 
    fn ismedian(self, eps: f64) -> (Vec<f64>,usize);   
}

/// Methods applicable to vector of vectors of generic end type and one argument
/// of a similar kind.
pub trait VecVecg<T,U> { 
    /// Weighted Arithmetic Centre = weighted euclidian mean of a set of points
    fn wacentroid(self,ws: &[U]) -> Vec<f64>;
    /// Trend between two sets
    fn trend(self, eps: f64, v: Vec<Vec<U>>) -> Vec<f64>;
    /// Subtract m from all points - e.g. transform to zero median form
    fn translate(self, m: &[U]) -> Vec<Vec<f64>>;
    /// Sum of distances from arbitrary point (v) to all the points in self   
    fn distsum(self, v: &[U]) -> f64;
    /// Individual distances from any point v (typically not in self) to all the points in self.    
    fn dists(self, v: &[U]) -> Vec<f64>;
    /// Medoid and Outlier (by distance) of a set of points
    // fn medoid(self) -> MinMax<f64>; 
    /// ( wgm, sorted eccentricities magnitudes, associated cpdf )
    fn wsortedeccs(self,ws:&[U],gm:&[f64]) -> (Vec<f64>,Vec<f64>); 
    /// Sorted cosines magnitudes and cpdf, needs central median
    fn wsortedcos(self,medmed:&[U],unitzmed:&[U],ws:&[U]) -> (Vec<f64>,Vec<f64>); 
    /// Estimated weighted gm computed at a non member point
    fn wnxnonmember(self, ws:&[U], p:&[f64]) -> Vec<f64>; 
    /// Estimated weighted eccentricity for a non-member point 
    fn weccnonmember(self, ws:&[U], p:&[f64]) -> Vec<f64>;
    /// The weighted geometric median to accuracy eps 
    fn wgmedian(self, ws: &[U], eps: f64) -> Vec<f64>; 
    /// The weighted geometric median to accuracy eps
    fn wsmedian(self, ws: &[U], eps: f64) -> Vec<f64>; 
    /// Flattened lower triangular part of a covariance matrix of a Vec of f64 vectors.
    fn covar(self, med:&[U]) -> Vec<f64>; 
    /// Flattened lower triangular part of a comediance matrix of a Vec of f64 vectors.
    fn comed(self, m:&[U], eps:f64) -> Vec<f64>;
    /// Flattened lower triangular part of a covariance matrix for weighted f64 vectors.
    fn wcovar(self, ws:&[U], m:&[f64]) -> Vec<f64>;
    /// Flattened lower triangular part of a comediance matrix for weighted f64 vectors.
    fn wcomed(self, ws:&[U], m:&[f64], eps:f64) -> Vec<f64>;
}