// lorikeet_genome/utils/math_utils.rs

1use ordered_float::OrderedFloat;
2use statrs::function::gamma::ln_gamma;
3use std::clone::Clone;
4use std::ops::{Add, AddAssign, Mul, Sub};
5
6use crate::utils::natural_log_utils::NaturalLogUtils;
7
lazy_static! {
    // Lookup table backing JacobianLogTable::get: entry k holds
    // log10(1 + 10^(-k * TABLE_STEP)) for k in 0..=(MAX_TOLERANCE / TABLE_STEP),
    // i.e. the correction term of the Jacobian log identity
    // log10(10^a + 10^b) = max + log10(1 + 10^-(|a-b|)).
    static ref cache: Vec<f64> = (0..((JacobianLogTable::MAX_TOLERANCE
        / JacobianLogTable::TABLE_STEP)
        + 1.0) as usize)
        .into_iter()
        .map(|k| { (1.0 + (10.0_f64).powf(-(k as f64) * JacobianLogTable::TABLE_STEP)).log10() })
        .collect::<Vec<f64>>();
    // Common log-space constants, computed once on first access.
    pub static ref LOG10_ONE_HALF: f64 = (0.5 as f64).log10();
    pub static ref LOG10_ONE_THIRD: f64 = -((3.0 as f64).log10());
    pub static ref LOG_ONE_THIRD: f64 = -((3.0 as f64).ln());
    pub static ref INV_LOG_2: f64 = (1.0 as f64) / (2.0 as f64).ln();
    static ref LOG_10: f64 = (10. as f64).ln(); // ln(10)
    static ref INV_LOG_10: f64 = (1.0) / *LOG_10; // 1 / ln(10) == log10(e)
    pub static ref LOG10_E: f64 = std::f64::consts::E.log10();
    static ref ROOT_TWO_PI: f64 = (2.0 * std::f64::consts::PI).sqrt(); // Normal pdf normalizer
}
24
/// Stateless namespace for numeric helpers (log-space sums, normalization,
/// element-wise vector operations); all functionality lives in associated
/// functions — no instances are ever constructed.
pub struct MathUtils {}
26
27impl MathUtils {
    /// Floor for log10 probabilities that are effectively zero (10^-1_000_000);
    /// used instead of true -infinity so downstream arithmetic stays finite.
    pub const LOG10_P_OF_ZERO: f64 = -1000000.0;

    // Placeholder caches referenced by the original implementation, not yet ported:
    // const LOG_10_CACHE: Log10Cache
    // const LOG_10_FACTORIAL_CACHE: Log10FactorialCache
    // const DIGAMMA_CACHE: DiGammaCache
33
34    pub fn median_clone<T: PartialOrd + Copy>(numbers: &[T]) -> T {
35        let mut numbers = numbers.to_vec();
36        numbers.sort_by(|a, b| a.partial_cmp(b).unwrap());
37        let mid = numbers.len() / 2;
38        numbers[mid]
39    }
40
41    pub fn median<T: Ord + PartialOrd + Copy>(numbers: &mut [T]) -> T {
42        numbers.sort();
43        let mid = numbers.len() / 2;
44        numbers[mid]
45    }
46
47    pub fn normalize_pls(pls: &[f64]) -> Vec<f64> {
48        let mut new_pls = vec![0.0; pls.len()];
49        let smallest = *pls
50            .iter()
51            .min_by_key(|x| OrderedFloat(**x))
52            .unwrap_or(&std::f64::NAN);
53        new_pls.iter_mut().enumerate().for_each(|(i, pl)| {
54            *pl = pls[i] - smallest;
55        });
56
57        return new_pls;
58    }
59
60    /**
61     * Element by elemnt addition of two vectors in place
62     */
63    pub fn ebe_add_in_place<T: Send + Sync + Add + Copy + AddAssign>(a: &mut [T], b: &[T]) {
64        a.iter_mut().enumerate().for_each(|(i, val)| *val += b[i]);
65    }
66
67    /**
68     * Element by elemnt addition of two vectors
69     */
70    pub fn ebe_add<T: Send + Sync + Add + Copy + Add<Output = T>>(a: &[T], b: &[T]) -> Vec<T> {
71        let z = a
72            .iter()
73            .zip(b.iter())
74            .map(|(aval, bval)| *aval + *bval)
75            .collect::<Vec<T>>();
76        z
77    }
78
79    /**
80     * Element by elemnt subtraction of two vectors
81     */
82    pub fn ebe_subtract<T: Send + Sync + Sub + Copy + Sub<Output = T>>(a: &[T], b: &[T]) -> Vec<T> {
83        let z = a
84            .iter()
85            .zip(b.iter())
86            .map(|(aval, bval)| *aval - *bval)
87            .collect::<Vec<T>>();
88        z
89    }
90
91    /**
92     * Element by elemnt multiplication of two vectors
93     */
94    pub fn ebe_multiply<T: Send + Sync + Mul + Copy + Mul<Output = T>>(a: &[T], b: &[T]) -> Vec<T> {
95        let z = a
96            .into_iter()
97            .zip(b.iter())
98            .map(|(aval, bval)| *aval * *bval)
99            .collect::<Vec<T>>();
100        z
101    }
102
103    /**
104     * calculates the dot product of two vectors
105     */
106    pub fn dot_product<
107        T: Send + Sync + Mul + Add + Copy + Mul<Output = T> + Add<Output = T> + std::iter::Sum,
108    >(
109        a: &[T],
110        b: &[T],
111    ) -> T {
112        Self::ebe_multiply(a, b).into_iter().sum::<T>()
113    }
114
    /**
     * Converts LN to LOG10
     * @param ln log(x)
     * @return log10(x)
     */
    pub fn log_to_log10(ln: f64) -> f64 {
        // log10(x) = ln(x) * log10(e); LOG10_E is the file-level cached constant.
        ln * *LOG10_E
    }
123
    /**
     * log10 of the binomial coefficient C(n, k), computed via log-gamma
     * factorials: log10(n!) - log10(k!) - log10((n-k)!).
     *
     * Name keeps the existing "coeffecient" spelling for caller compatibility.
     * NOTE(review): presumably callers guarantee 0 <= k <= n; other inputs hit
     * ln_gamma of non-positive arguments — TODO confirm.
     */
    pub fn log10_binomial_coeffecient(n: f64, k: f64) -> f64 {
        return MathUtils::log10_factorial(n)
            - MathUtils::log10_factorial(k)
            - MathUtils::log10_factorial(n - k);
    }
132
    /// log10(n!) via the log-gamma function: ln(Γ(n + 1)) * log10(e).
    /// Accepts non-integer `n` (generalized factorial).
    pub fn log10_factorial(n: f64) -> f64 {
        ln_gamma(n + 1.0) * *LOG10_E
    }
136
137    /**
138     * Gets the maximum element's index of an array of f64 values
139     * Rather convoluted due to Rust not allowing proper comparisons between floats
140     */
141    pub fn max_element_index(array: &[f64], start: usize, finish: usize) -> usize {
142        let mut max_i = start;
143        for i in (start + 1)..finish {
144            if array[i] > array[max_i] {
145                max_i = i;
146            }
147        }
148
149        return max_i;
150    }
151
152    pub fn normalize_log10(mut array: Vec<f64>, take_log10_of_output: bool) -> Vec<f64> {
153        let log10_sum = MathUtils::log10_sum_log10(&array, 0, array.len());
154        array.iter_mut().for_each(|x| *x = *x - log10_sum);
155        if !take_log10_of_output {
156            array.iter_mut().for_each(|x| *x = 10.0_f64.powf(*x))
157        }
158        return array;
159    }
160
161    pub fn log10_sum_log10(log10_values: &[f64], start: usize, finish: usize) -> f64 {
162        if start >= finish {
163            return std::f64::NEG_INFINITY;
164        }
165
166        let max_element_index = MathUtils::max_element_index(log10_values, start, finish);
167
168        let max_value = log10_values[max_element_index];
169
170        if max_value == std::f64::NEG_INFINITY {
171            return max_value;
172        }
173
174        let sum_tot = 1.0
175            + log10_values[start..finish]
176                .iter()
177                .enumerate()
178                .filter(|(index, value)| {
179                    *index != max_element_index && **value != std::f64::NEG_INFINITY
180                })
181                .map(|(_, value)| {
182                    let scaled_val = value - max_value;
183                    10.0_f64.powf(scaled_val)
184                })
185                .sum::<f64>();
186
187        if sum_tot.is_nan() || sum_tot == std::f64::INFINITY {
188            panic!("log10 p: Values must be non-infinite and non-NAN")
189        }
190
191        max_value
192            + (if (sum_tot - 1.0).abs() > f64::EPSILON {
193                sum_tot.log10()
194            } else {
195                0.0
196            })
197    }
198
199    pub fn log10_sum_log10_two_values(a: f64, b: f64) -> f64 {
200        if a > b {
201            a + (1. + 10.0_f64.powf(b - a)).log10()
202        } else {
203            b + (1. + 10.0_f64.powf(a - b)).log10()
204        }
205    }
206
207    /**
208     * Do the log-sum trick for three double values.
209     * @param a
210     * @param b
211     * @param c
212     * @return the sum... perhaps NaN or infinity if it applies.
213     */
214    pub fn log10_sum_log10_three_values(a: f64, b: f64, c: f64) -> f64 {
215        if a >= b && a >= c {
216            a + (1.0 + 10.0_f64.powf(b - a) + 10.0_f64.powf(c - a)).log10()
217        } else if b >= c {
218            b + (1.0 + 10.0_f64.powf(a - b) + 10.0_f64.powf(c - b)).log10()
219        } else {
220            c + (1.0 + 10.0_f64.powf(a - c) + 10.0_f64.powf(b - c)).log10()
221        }
222    }
223
224    /**
225     * Given an array of log space (log or log10) values, subtract all values by the array maximum so that the max element in log space
226     * is zero.  This is equivalent to dividing by the maximum element in real space and is useful for avoiding underflow/overflow
227     * when the array's values matter only up to an arbitrary normalizing factor, for example, an array of likelihoods.
228     *
229     * @param array
230     * @return the scaled-in-place array
231     */
232    pub fn scale_log_space_array_for_numeric_stability(array: &[f64]) -> Vec<f64> {
233        let max_value: f64 = *array
234            .iter()
235            .max_by_key(|x| OrderedFloat(**x))
236            .unwrap_or(&std::f64::NAN);
237        let result = array.iter().map(|x| *x - max_value).collect::<Vec<f64>>();
238        result
239    }
240
    /**
     * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space
     *
     * @param array              log10 values to normalize
     * @param takeLog10OfOutput  if true, return the normalized values in log10 space
     * @param keepInLogSpace     if true, only subtract the max (stay in log space)
     *
     * @return array
     */
    //TODO: Check that this works
    pub fn normalize_from_log10(
        array: &[f64],
        take_log10_of_output: bool,
        keep_in_log_space: bool,
    ) -> Vec<f64> {
        // for precision purposes, we need to add (or really subtract, since they're
        // all negative) the largest value; also, we need to convert to normal-space.
        // NOTE(review): OrderedFloat treats NaN as the greatest value, so any NaN in
        // `array` becomes the max; an empty array yields NaN (result stays empty).
        let max_value: f64 = *array
            .iter()
            .max_by_key(|x| OrderedFloat(**x))
            .unwrap_or(&std::f64::NAN);

        // we may decide to just normalize in log space without converting to linear space
        if keep_in_log_space {
            let array: Vec<f64> = array.iter().map(|x| *x - max_value).collect::<Vec<f64>>();
            return array;
        }
        // default case: go to linear space
        let mut normalized: Vec<f64> = (0..array.len())
            .into_iter()
            .map(|i| 10.0_f64.powf(array[i] - max_value))
            .collect::<Vec<f64>>();

        let sum: f64 = normalized.iter().sum::<f64>();

        normalized.iter_mut().enumerate().for_each(|(i, x)| {
            *x = *x / sum;
            if take_log10_of_output {
                *x = x.log10();
                // Guard against underflow past the LOG10_P_OF_ZERO floor (or to
                // -infinity): fall back to the max-shifted log10 value, which is
                // exact in log space.
                if *x < MathUtils::LOG10_P_OF_ZERO || x.is_infinite() {
                    *x = array[i] - max_value
                }
            }
        });

        normalized
    }
288
    /// True iff `result` can be a log10 probability, i.e. result <= 0
    /// (real-space probability <= 1). NaN compares false and is rejected.
    pub fn is_valid_log10_probability(result: f64) -> bool {
        result <= 0.0
    }
292
    /// Converts a log10 value to natural log: ln(x) = log10(x) * ln(10).
    pub fn log10_to_log(log10: f64) -> f64 {
        log10 * (*LOG_10)
    }
    /**
     * Calculates {@code log10(1-10^a)} without losing precision.
     *
     * @param a the input exponent.
     * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value.
     */
    pub fn log10_one_minus_pow10(a: f64) -> f64 {
        // a > 0 means 10^a > 1, so 1 - 10^a is negative: log undefined.
        if a > 0.0 {
            return std::f64::NAN;
        }
        // 1 - 10^0 == 0, whose log10 is -infinity.
        if a == 0.0 {
            return std::f64::NEG_INFINITY;
        }

        // Convert to natural log, delegate to log1mexp (log(1 - e^b)), convert back.
        let b = a * *LOG_10;
        return NaturalLogUtils::log1mexp(b) * *INV_LOG_10;
    }
313
314    pub fn approximate_log10_sum_log10(a: f64, b: f64) -> f64 {
315        // this code works only when a <= b so we flip them if the order is opposite
316        if a > b {
317            MathUtils::approximate_log10_sum_log10(b, a)
318        } else if a == std::f64::NEG_INFINITY {
319            b
320        } else {
321            // if |b-a| < tol we need to compute log(e^a + e^b) = log(e^b(1 + e^(a-b))) = b + log(1 + e^(-(b-a)))
322            // we compute the second term as a table lookup with integer quantization
323            // we have pre-stored correction for 0,0.1,0.2,... 10.0
324            let diff = b - a;
325
326            b + if diff < JacobianLogTable::MAX_TOLERANCE {
327                JacobianLogTable::get(diff)
328            } else {
329                0.0
330            }
331        }
332    }
333
334    /**
335     * Calculate the approximate log10 sum of an array range.
336     * @param vals the input values.
337     * @param fromIndex the first inclusive index in the input array.
338     * @param toIndex index following the last element to sum in the input array (exclusive).
339     * @return the approximate sum.
340     * @throws IllegalArgumentException if {@code vals} is {@code null} or  {@code fromIndex} is out of bounds
341     * or if {@code toIndex} is larger than
342     * the length of the input array or {@code fromIndex} is larger than {@code toIndex}.
343     */
344    pub fn approximate_log10_sum_log10_vec(
345        vals: &[f64],
346        from_index: usize,
347        to_index: usize,
348    ) -> f64 {
349        if from_index == to_index {
350            return std::f64::NEG_INFINITY;
351        };
352        let max_element_index = Self::max_element_index(vals, from_index, to_index);
353        let mut approx_sum = vals[max_element_index];
354
355        let mut i = from_index;
356        for val in vals[from_index..to_index].iter() {
357            if i == max_element_index || val == &std::f64::NEG_INFINITY {
358                i += 1;
359                continue;
360            };
361            let diff = approx_sum - val;
362            if diff < JacobianLogTable::MAX_TOLERANCE {
363                approx_sum += JacobianLogTable::get(diff);
364            };
365
366            i += 1;
367        }
368
369        return approx_sum;
370    }
371
372    pub fn well_formed_f64(val: f64) -> bool {
373        return !val.is_nan() && !val.is_infinite();
374    }
375
376    /**
377     * Calculate f(x) = Normal(x | mu = mean, sigma = sd)
378     * @param mean the desired mean of the Normal distribution
379     * @param sd the desired standard deviation of the Normal distribution
380     * @param x the value to evaluate
381     * @return a well-formed double
382     */
383    pub fn normal_distribution(mean: f64, sd: f64, x: f64) -> f64 {
384        assert!(sd >= 0.0, "Standard deviation must be >= 0.0");
385        // assert!(
386        //     Self::well_formed_f64(mean) && Self::well_formed_f64(sd) && Self::well_formed_f64(x),
387        //     "mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"
388        // );
389
390        return (-(x - mean) * (x - mean) / (2.0 * sd * sd)).exp() / (sd * *ROOT_TWO_PI);
391    }
392
393    /**
394     * normalizes the real-space probability array.
395     *
396     * Does not assume anything about the values in the array, beyond that no elements are below 0.  It's ok
397     * to have values in the array of > 1, or have the sum go above 0.
398     *
399     * @param array the array to be normalized
400     * @return a newly allocated array corresponding the normalized values in array
401     */
402    pub fn normalize_sum_to_one(mut array: Vec<f64>) -> Vec<f64> {
403        if array.len() == 0 {
404            return array;
405        }
406
407        let sum = array.iter().sum::<f64>();
408        assert!(
409            sum >= 0.0,
410            "Values in probability array sum to a negative number"
411        );
412        array.iter_mut().for_each(|x| *x = *x / sum);
413
414        return array;
415    }
416
417    /**
418     * Computes the entropy -p*ln(p) - (1-p)*ln(1-p) of a Bernoulli distribution with success probability p
419     * using an extremely fast Pade approximation that is very accurate for all values of 0 <= p <= 1.
420     *
421     * See http://www.nezumi.demon.co.uk/consult/logx.htm
422     */
423    pub fn fast_bernoulli_entropy(p: f64) -> f64 {
424        let product = p * (1.0 - p);
425        return product * ((11.0 + 33.0 * product) / (2.0 + 20.0 * product));
426    }
427
428    pub fn is_valid_probability(result: f64) -> bool {
429        return result >= 0.0 && result <= 1.0;
430    }
431}
432
/// Streaming (single-pass) accumulator for the mean and sample variance of a
/// sequence of f64 observations. `add` follows Welford's online update: the
/// running mean is nudged toward each new observation and `s` accumulates the
/// sum of squared deviations.
#[derive(Debug, Clone, Copy)]
pub struct RunningAverage {
    mean: f64,        // running mean of all observations seen so far
    s: f64,           // running sum of squared deviations from the mean
    obs_count: usize, // number of observations added
}

impl RunningAverage {
    /// Creates an empty accumulator (zero observations).
    pub fn new() -> RunningAverage {
        RunningAverage {
            mean: 0.0,
            s: 0.0,
            obs_count: 0,
        }
    }

    /// Folds one observation into the running mean and squared-deviation sum.
    pub fn add(&mut self, obs: f64) {
        self.obs_count += 1;
        let old_mean = self.mean;
        self.mean += (obs - self.mean) / self.obs_count as f64;
        self.s += (obs - old_mean) * (obs - self.mean)
    }

    /// Adds every observation in `col`.
    pub fn add_all(&mut self, col: &[f64]) {
        for obs in col {
            self.add(*obs)
        }
    }

    /// Mean of all observations so far (0.0 before any observation).
    pub fn mean(&self) -> f64 {
        self.mean
    }

    /// Sample standard deviation; NaN with fewer than two observations.
    pub fn stddev(&self) -> f64 {
        self.var().sqrt()
    }

    /// Sample variance (n - 1 denominator); NaN with fewer than two observations.
    pub fn var(&self) -> f64 {
        // BUG FIX: with zero observations `self.obs_count - 1` underflowed
        // usize (panic in debug builds); with one observation the result was
        // already NaN (0.0 / 0.0). Report NaN uniformly for n < 2, where
        // sample variance is undefined.
        if self.obs_count < 2 {
            return std::f64::NAN;
        }
        self.s / (self.obs_count - 1) as f64
    }

    /// Number of observations added so far.
    pub fn obs_count(&self) -> usize {
        self.obs_count
    }
}
478
/**
 * Encapsulates the second term of Jacobian log identity for differences up to MAX_TOLERANCE,
 * i.e. the correction log10(1 + 10^(-diff)) in
 * log10(10^a + 10^b) = max(a, b) + log10(1 + 10^(-|a - b|)),
 * served from the file-level `cache` lazy static at TABLE_STEP resolution.
 */
struct JacobianLogTable {}

impl JacobianLogTable {
    // if log(a) - log(b) > MAX_TOLERANCE, b is effectively treated as zero in approximateLogSumLog
    // MAX_TOLERANCE = 8.0 introduces an error of at most one part in 10^8 in sums
    pub const MAX_TOLERANCE: f64 = 8.0;

    //  Phred scores Q and Q+1 differ by 0.1 in their corresponding log-10 probabilities, and by
    // 0.1 * log(10) in natural log probabilities.  Setting TABLE_STEP to an exact divisor of this
    // quantity ensures that approximateSumLog in fact caches exact values for integer phred scores
    pub const TABLE_STEP: f64 = 0.0001;
    // Reciprocal of TABLE_STEP: converts a difference into a table index.
    pub const INV_STEP: f64 = (1.0) / JacobianLogTable::TABLE_STEP;

    /// Table lookup of log10(1 + 10^(-difference)).
    /// `cache` holds MAX_TOLERANCE / TABLE_STEP + 1 entries, so differences in
    /// [0, MAX_TOLERANCE] stay in range; larger values panic on the index.
    /// NOTE(review): negative differences saturate to index 0 via the
    /// float-to-usize cast rather than panicking.
    pub fn get(difference: f64) -> f64 {
        let index = (difference * JacobianLogTable::INV_STEP).round() as usize;
        return cache[index];
    }

    // pub fn fast_round(d: f64) -> usize {
    //     if d > 0.0 {
    //         (d + 0.5) as usize
    //     } else {
    //         (d - 0.5) as usize
    //     }
    // }
507}