lightmotif_tfmpvalue/
lib.rs

1#![doc = include_str!("../README.md")]
2
3use std::cmp::Ordering;
4use std::collections::HashMap;
5use std::fmt::Debug;
6use std::ops::RangeInclusive;
7
8use lightmotif::abc::Alphabet;
9use lightmotif::dense::DenseMatrix;
10use lightmotif::num::Unsigned;
11use lightmotif::pwm::ScoringMatrix;
12
13mod hash;
14
15/// The fast integer map type used to record *Q*-values and *p*-values.
16pub type IntMap<V> = HashMap<i64, V, self::hash::IntHasherBuilder>;
17
18/// The TFM-PVALUE algorithm.
19#[derive(Debug)]
20pub struct TfmPvalue<A: Alphabet, M: AsRef<ScoringMatrix<A>>> {
21    /// A reference to the original scoring matrix.
22    matrix: M,
23    /// A permutation of the original matrix rows.
24    permutation: Vec<usize>,
25    /// The granularity with which the round matrix has been built.
26    granularity: f64,
27    /// The round matrix offsets.
28    offsets: Vec<i64>,
29    /// Rescaled PSSM in integer space.
30    int_matrix: DenseMatrix<i64, A::K>,
31    /// The maximum error caused by integer rescale.
32    error_max: f64,
33    /// The maximum integer score reachable at each row of the matrix.
34    max_score_rows: Vec<i64>,
35    /// The minimum integer score reachable at each row of the matrix.
36    min_score_rows: Vec<i64>,
37    /// The Q-values for the current granularity
38    qvalues: Vec<IntMap<f64>>,
39}
40
41#[allow(non_snake_case)]
42impl<A: Alphabet, M: AsRef<ScoringMatrix<A>>> TfmPvalue<A, M> {
43    /// Initialize the TFM-PVALUE algorithm for the given scoring matrix.
44    pub fn new(matrix: M) -> Self {
45        let m = matrix.as_ref();
46        let M = m.len();
47
48        // Compute the column permutation by decreasing score range
49        // over each row to minimize the total size of score ranges
50        // (see TFM-PVALUE paper, Lemma 7).
51        let range = (0..M)
52            .map(|i| {
53                let row = &m[i][..A::K::USIZE - 1];
54                let max_score = row.iter().cloned().reduce(f32::max).unwrap_or_default();
55                let min_score = row.iter().cloned().reduce(f32::min).unwrap_or_default();
56                max_score - min_score
57            })
58            .collect::<Vec<_>>();
59        let mut permutation: Vec<usize> = (0..M).collect();
60        permutation.sort_unstable_by(|i, j| range[*j].partial_cmp(&range[*i]).unwrap());
61
62        Self {
63            granularity: f64::NAN,
64            matrix,
65            permutation,
66            offsets: vec![0; M],
67            int_matrix: DenseMatrix::new(M),
68            max_score_rows: vec![0; M],
69            min_score_rows: vec![0; M],
70            qvalues: vec![IntMap::default(); M + 1],
71            error_max: 0.0,
72        }
73    }
74
75    /// Return a reference to the wrapped matrix reference.
76    pub fn as_inner(&self) -> &M {
77        &self.matrix
78    }
79
80    /// Extract the wrapped matrix reference.
81    pub fn into_inner(self) -> M {
82        self.matrix
83    }
84
85    /// Compute the approximate score matrix with the given granularity.
86    fn recompute(&mut self, granularity: f64) {
87        assert!(granularity < 1.0);
88        let matrix = self.matrix.as_ref();
89        let M: usize = matrix.len();
90        let K: usize = <A as Alphabet>::K::USIZE;
91
92        // compute effective granularity
93        self.granularity = granularity;
94
95        // compute integer matrix using optimal column permutation
96        for (i, &p) in self.permutation.iter().enumerate() {
97            for j in 0..K - 1 {
98                self.int_matrix[i][j] = (matrix[p][j] as f64 / self.granularity).floor() as i64;
99            }
100        }
101
102        // compute maximum error by summing max error at each row
103        self.error_max = 0.0;
104        for i in 1..M {
105            let max_e = matrix[self.permutation[i]]
106                .iter()
107                .enumerate()
108                .map(|(j, &x)| (x as f64) / self.granularity - self.int_matrix[i][j] as f64)
109                .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Less))
110                .unwrap();
111            self.error_max += max_e;
112        }
113
114        // compute offsets
115        for i in 0..M {
116            self.offsets[i] = -*self.int_matrix[i][..K - 1].iter().min().unwrap();
117            for j in 0..K - 1 {
118                self.int_matrix[i][j] += self.offsets[i];
119            }
120        }
121
122        // look for the minimum score of the matrix for each row
123        for i in 0..M {
124            self.min_score_rows[i] = *self.int_matrix[i][..K - 1].iter().min().unwrap();
125            self.max_score_rows[i] = *self.int_matrix[i][..K - 1].iter().max().unwrap();
126        }
127    }
128
129    /// Compute the score distribution between `min` and `max`.
130    ///
131    /// The resulting distributions is stored in `self.qvalues`.
132    fn distribution(&mut self, min: i64, max: i64) {
133        // Clear Q-values
134        for map in self.qvalues.iter_mut() {
135            map.clear();
136        }
137
138        //
139        let matrix = self.matrix.as_ref();
140        let M: usize = matrix.len();
141        let K: usize = <A as Alphabet>::K::USIZE;
142
143        // background frequencies
144        let bg = matrix.background().frequencies();
145
146        // maximum score reachable with the suffix matrix from i to M-1
147        let mut maxs = vec![0; M + 1];
148        for i in (0..M).rev() {
149            maxs[i] = maxs[i + 1] + self.max_score_rows[i];
150        }
151
152        // initialize the map at first position with background frequencies
153        for k in 0..K - 1 {
154            if self.int_matrix[0][k] + maxs[1] >= min {
155                *self.qvalues[0].entry(self.int_matrix[0][k]).or_default() += bg[k] as f64;
156            }
157        }
158
159        // compute q values for scores greater or equal to min
160        self.qvalues[M - 1].insert(max + 1, 0.0);
161        for pos in 1..M {
162            // get the matrix row at the current position
163            let int_row = &self.int_matrix[pos];
164            // split the array in two to make the borrow checker happy
165            let (l, r) = self.qvalues.split_at_mut(pos);
166            // iterate on every reachable score at the current position
167            for (key, val) in &l[pos - 1] {
168                for k in 0..K - 1 {
169                    let sc = key + int_row[k];
170                    if sc + maxs[pos + 1] >= min {
171                        // the score min can be reached
172                        let occ = val * bg[k] as f64;
173                        if sc > max {
174                            // the score will be greater than max for all suffixes
175                            *r[M - 1 - pos].entry(max + 1).or_default() += occ;
176                        } else {
177                            *r[0].entry(sc).or_default() += occ;
178                        }
179                    }
180                }
181            }
182        }
183    }
184
185    /// Search the p-value range for the given score.
186    fn lookup_pvalue(&mut self, score: f64) -> RangeInclusive<f64> {
187        assert!(!self.granularity.is_nan());
188        let matrix = self.matrix.as_ref();
189        let M: usize = matrix.len();
190
191        // Compute the integer score range from the given score.
192        let scaled = score / self.granularity + self.offsets.iter().sum::<i64>() as f64;
193        let avg = scaled.floor() as i64;
194        let max = (scaled + self.error_max + 1.0).floor() as i64;
195        let min = (scaled - self.error_max - 1.0).floor() as i64;
196
197        // Compute q values for the given scores
198        self.distribution(min, max);
199
200        // Compute p-values
201        let mut pvalues = IntMap::default();
202        let mut s = max + 1;
203        let mut last = self.qvalues[M - 1].keys().cloned().collect::<Vec<i64>>();
204        last.sort_unstable_by(|x, y| x.partial_cmp(y).unwrap());
205        let mut sum = self.qvalues[0].get(&(max + 1)).cloned().unwrap_or_default();
206        for &l in last.iter().rev() {
207            sum += self.qvalues[M - 1][&l];
208            if l >= avg {
209                s = l;
210            }
211            pvalues.insert(l, sum);
212        }
213
214        // Find the p-value range for the requested score
215        let mut keys = pvalues.keys().cloned().collect::<Vec<i64>>();
216        keys.sort_unstable_by(|x, y| x.partial_cmp(y).unwrap());
217        let mut kmax = keys.iter().position(|&k| k == s).unwrap();
218        while kmax > 0 && keys[kmax] as f64 >= s as f64 - self.error_max {
219            kmax -= 1;
220        }
221
222        // Return p-value range
223        let pmax = pvalues[&keys[kmax]];
224        let pmin = pvalues[&s];
225        RangeInclusive::new(pmin, pmax)
226    }
227
228    /// Search the score and p-value range for a given p-value.
229    fn lookup_score(
230        &mut self,
231        pvalue: f64,
232        range: RangeInclusive<i64>,
233    ) -> (i64, RangeInclusive<f64>) {
234        assert!(!self.granularity.is_nan());
235        let matrix = self.matrix.as_ref();
236        let M: usize = matrix.len();
237
238        // compute score range for target pvalue
239        let min = *range.start();
240        let max = *range.end();
241
242        // compute q values
243        self.distribution(min, max);
244        let mut pvalues = IntMap::default();
245
246        // find most likely scores at the end of the matrix
247        let mut keys = self.qvalues[M - 1].keys().cloned().collect::<Vec<_>>();
248        keys.sort_unstable_by(|x, y| x.partial_cmp(y).unwrap());
249
250        // compute pvalues
251        let mut sum = 0.0;
252        let mut riter = keys.len() - 1;
253        let alpha;
254        let alpha_e;
255        while riter > 0 {
256            sum += self.qvalues[M - 1][&keys[riter]];
257            pvalues.insert(keys[riter], sum);
258            if sum >= pvalue {
259                break;
260            }
261            riter -= 1;
262        }
263
264        if sum > pvalue {
265            alpha_e = keys[riter];
266            alpha = keys[riter + 1];
267        } else {
268            if riter == 0 {
269                alpha = keys[0];
270                alpha_e = keys[0];
271            } else {
272                alpha = keys[riter];
273                alpha_e = keys[riter - 1];
274                sum += pvalues.get(&alpha_e).cloned().unwrap_or_default();
275            }
276            pvalues.insert(alpha_e, sum);
277        }
278
279        if (alpha - alpha_e) as f64 > self.error_max {
280            (alpha, RangeInclusive::new(pvalues[&alpha], pvalues[&alpha]))
281        } else {
282            (
283                alpha,
284                RangeInclusive::new(pvalues[&alpha_e], pvalues[&alpha]),
285            )
286        }
287    }
288
289    /// Compute the exact P-value for the given score.
290    ///
291    /// # Caution
292    /// This method internally calls `approximate_pvalue` without bounds on
293    /// the granularity, which may require a very large amount of memory for
294    /// some scoring matrices. Use `approximate_pvalue` directly to add
295    /// limits on the number of iterations or on the granularity.
296    pub fn pvalue(&mut self, score: f64) -> f64 {
297        let it = self.approximate_pvalue(score).last().unwrap();
298        assert!(it.converged); // algorithm should always converge
299        *it.range.start()
300    }
301
302    /// Iterate with decreasing granularity to compute an approximate *p*-value for a score.
303    ///
304    /// # Example
305    /// Approximate a *p*-value for a score of `10.0` with a granularity of
306    /// `0.001`:
307    /// ```rust
308    /// # use lightmotif::abc::Dna;
309    /// # let pssm = lightmotif::pwm::CountMatrix::<Dna>::new(
310    /// #     lightmotif::dense::DenseMatrix::from_rows([
311    /// #         [1, 0, 1, 0, 0],
312    /// #         [0, 1, 1, 0, 0],
313    /// #         [0, 0, 0, 2, 0],
314    /// #         [0, 0, 2, 0, 0],
315    /// #     ])
316    /// # ).unwrap().to_freq(0.1).to_scoring(None);
317    /// // Initialize the TFM-PVALUE algorithm for a lightmotif PSSM
318    /// let mut tfmp = lightmotif_tfmpvalue::TfmPvalue::new(&pssm);
319    ///
320    /// // Compute the p-value for a score by iterating
321    /// // until granularity or convergence are reached.
322    /// let p_value = tfmp.approximate_pvalue(10.0)
323    ///     .find(|it| it.converged || it.granularity <= 0.001)
324    ///     .map(|it| *it.range.start())
325    ///     .unwrap();
326    /// ```
327    pub fn approximate_pvalue(&mut self, score: f64) -> PvaluesIterator<'_, A, M> {
328        PvaluesIterator {
329            tfmp: self,
330            score,
331            decay: 10.0,
332            granularity: 0.1,
333            target: 0.0,
334            converged: false,
335        }
336    }
337
338    /// Compute the exact score associated with a given *p*-value.
339    ///
340    /// # Caution
341    /// This method internally calls `approximate_score` without bounds on
342    /// the granularity, which may require a very large amount of memory for
343    /// some scoring matrices. Use `approximate_score` directly to add
344    /// limits on the number of iterations or on the granularity.
345    pub fn score(&mut self, pvalue: f64) -> f64 {
346        let it = self.approximate_score(pvalue).last().unwrap();
347        assert!(it.converged); // algorithm should always converge
348        it.score
349    }
350
351    /// Iterate with decreasing granularity to compute an approximate score for a *p*-value.
352    pub fn approximate_score(&mut self, pvalue: f64) -> ScoresIterator<'_, A, M> {
353        self.recompute(0.1);
354        ScoresIterator {
355            min: self.min_score_rows.iter().sum(),
356            max: self.max_score_rows.iter().sum::<i64>() + (self.error_max + 0.5).ceil() as i64,
357            tfmp: self,
358            pvalue,
359            decay: 10.0,
360            granularity: 0.1,
361            target: 0.0,
362            converged: false,
363        }
364    }
365}
366
367impl<A: Alphabet, M: AsRef<ScoringMatrix<A>>> From<M> for TfmPvalue<A, M> {
368    fn from(matrix: M) -> Self {
369        Self::new(matrix)
370    }
371}
372
373/// The result of an iteration of the TFM-PVALUE algorithm.
374#[derive(Debug, Clone)]
375pub struct Iteration {
376    /// The score computed for the current iteration, or the query score
377    /// if approximating p-value.
378    pub score: f64,
379    /// The p-value range for the current iteration.
380    pub range: RangeInclusive<f64>,
381    /// The granularity with which scores and p-values were computed.
382    pub granularity: f64,
383    /// A flag to mark whether the approximation converged on this iteration.
384    pub converged: bool,
385    #[allow(unused)]
386    _hidden: (),
387}
388
389/// A helper type running iterations to approximate the *p*-value for a score.
390#[derive(Debug)]
391pub struct PvaluesIterator<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> {
392    tfmp: &'tfmp mut TfmPvalue<A, M>,
393    score: f64,
394    decay: f64,
395    granularity: f64,
396    target: f64,
397    converged: bool,
398}
399
400impl<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> Iterator for PvaluesIterator<'tfmp, A, M> {
401    type Item = Iteration;
402    fn next(&mut self) -> Option<Self::Item> {
403        if self.converged || self.granularity <= self.target {
404            return None;
405        }
406
407        self.tfmp.recompute(self.granularity);
408        let granularity = self.granularity;
409        let range = self.tfmp.lookup_pvalue(self.score);
410
411        self.granularity /= self.decay;
412        if range.start() == range.end() {
413            self.converged = true;
414        }
415
416        Some(Iteration {
417            range,
418            granularity,
419            converged: self.converged,
420            score: self.score,
421            _hidden: (),
422        })
423    }
424}
425
426/// A helper type running iterations to approximate the score for a *p*-value.
427#[derive(Debug)]
428pub struct ScoresIterator<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> {
429    tfmp: &'tfmp mut TfmPvalue<A, M>,
430    pvalue: f64,
431    decay: f64,
432    granularity: f64,
433    target: f64,
434    converged: bool,
435    min: i64,
436    max: i64,
437}
438
439impl<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> Iterator for ScoresIterator<'tfmp, A, M> {
440    type Item = Iteration;
441    fn next(&mut self) -> Option<Self::Item> {
442        if self.converged || self.granularity <= self.target {
443            return None;
444        }
445
446        self.tfmp.recompute(self.granularity);
447        let granularity = self.granularity;
448        let (iscore, range) = self
449            .tfmp
450            .lookup_score(self.pvalue, RangeInclusive::new(self.min, self.max));
451
452        self.granularity /= self.decay;
453        self.min =
454            ((iscore as f64 - (self.tfmp.error_max + 0.5).ceil()) * self.decay).floor() as i64;
455        self.max =
456            ((iscore as f64 + (self.tfmp.error_max + 0.5).ceil()) * self.decay).floor() as i64;
457        if range.start() == range.end() {
458            self.converged = true;
459        }
460
461        let offset = self.tfmp.offsets.iter().sum::<i64>();
462        Some(Iteration {
463            granularity,
464            range,
465            score: (iscore - offset) as f64 * granularity,
466            converged: self.converged,
467            _hidden: (),
468        })
469    }
470}
471
472#[cfg(test)]
473mod test {
474
475    use lightmotif::abc::Dna;
476    use lightmotif::dense::DenseMatrix;
477    use lightmotif::pwm::CountMatrix;
478
479    use super::*;
480
481    macro_rules! assert_almost_eq {
482        ($x:expr, $y:expr, places = $places:expr) => {{
483            assert_eq!(
484                ($x * 10f64.powi($places)).round(),
485                ($y * 10f64.powi($places)).round(),
486            )
487        }};
488    }
489
490    /// Build the MA0045 PSSM from Jaspar using a uniform background and 0.25 pseudocounts.
491    fn build_ma0045() -> ScoringMatrix<Dna> {
492        #[rustfmt::skip]
493        let counts = DenseMatrix::from_rows([
494            //A   C   T   G   N
495            [ 3,  5,  2,  4,  0],
496            [ 7,  0,  4,  3,  0],
497            [ 9,  1,  3,  1,  0],
498            [ 3,  6,  1,  4,  0],
499            [11,  0,  0,  3,  0],
500            [11,  0,  1,  2,  0],
501            [11,  0,  1,  2,  0],
502            [ 3,  3,  6,  2,  0],
503            [ 4,  1,  1,  8,  0],
504            [ 3,  4,  1,  6,  0],
505            [ 8,  5,  0,  1,  0],
506            [ 8,  1,  1,  4,  0],
507            [ 9,  0,  3,  2,  0],
508            [ 9,  5,  0,  0,  0],
509            [11,  0,  0,  3,  0],
510            [ 2,  7,  5,  0,  0],
511        ]);
512        CountMatrix::new(counts)
513            .unwrap()
514            .to_freq(0.25)
515            .to_scoring(None)
516    }
517
518    #[test]
519    fn approximate_pvalue() {
520        let pssm = build_ma0045();
521        let mut tfmp = TfmPvalue::new(&pssm);
522        let mut pvalues = tfmp.approximate_pvalue(10.0);
523
524        // Reference values computed with pytfmpval:
525        //
526        // granularity  pmin                    pmax
527        //         0.1  5.7484256103634834e-05  0.000185822369530797
528        //        0.01  0.00011981534771621227  0.00012914929538965225
529        //       0.001  0.00012489012442529202  0.0001261131837964058
530        //      0.0001  0.00012567872181534767  0.00012605986557900906
531        //       1e-05  0.00012601236812770367  0.0001260137651115656
532        //       1e-06  0.00012601329945027828  0.0001260137651115656
533        //       1e-07  0.00012601329945027828  0.00012601329945027828
534
535        let it = pvalues.next().unwrap();
536        assert_almost_eq!(it.granularity, 1e-1, places = 5);
537        assert_almost_eq!(it.range.start(), 5.74842561e-5, places = 7);
538        assert_almost_eq!(it.range.end(), 0.000185822369, places = 7);
539        assert!(!it.converged);
540
541        let it = pvalues.next().unwrap();
542        assert_almost_eq!(it.granularity, 1e-2, places = 7);
543        assert_almost_eq!(it.range.start(), 0.000119815, places = 5);
544        assert_almost_eq!(it.range.end(), 0.000129149, places = 7);
545        assert!(!it.converged);
546
547        let it = pvalues.next().unwrap();
548        assert_almost_eq!(it.granularity, 1e-3, places = 5);
549        assert_almost_eq!(it.range.start(), 0.000124890, places = 7);
550        assert_almost_eq!(it.range.end(), 0.000126113, places = 7);
551        assert!(!it.converged);
552
553        let it = pvalues.next().unwrap();
554        assert_almost_eq!(it.granularity, 1e-4, places = 5);
555        assert_almost_eq!(it.range.start(), 0.00012567, places = 5);
556        assert_almost_eq!(it.range.end(), 0.000126059, places = 5);
557        assert!(!it.converged);
558
559        let it = pvalues.next().unwrap();
560        assert_almost_eq!(it.granularity, 1e-5, places = 5);
561        assert_almost_eq!(it.range.start(), 0.00012601, places = 5);
562        assert_almost_eq!(it.range.end(), 0.0001260137, places = 5);
563        assert!(!it.converged);
564
565        let it = pvalues.next().unwrap();
566        assert_almost_eq!(it.granularity, 1e-6, places = 5);
567        assert_almost_eq!(it.range.start(), 0.00012601, places = 5);
568        assert_almost_eq!(it.range.end(), 0.0001260137, places = 5);
569        assert!(!it.converged);
570
571        let it = pvalues.next().unwrap();
572        assert_almost_eq!(it.granularity, 1e-7, places = 5);
573        assert_almost_eq!(it.range.start(), 0.0001260, places = 5);
574        assert_almost_eq!(it.range.end(), 0.0001260132, places = 5);
575        assert!(it.converged);
576
577        assert!(pvalues.next().is_none());
578    }
579
580    #[test]
581    fn pvalue() {
582        let pssm = build_ma0045();
583        let mut tfmp = TfmPvalue::new(&pssm);
584
585        assert_almost_eq!(tfmp.pvalue(8.882756), 0.0003, places = 5);
586        assert_almost_eq!(tfmp.pvalue(12.657785), 0.00001, places = 5);
587        assert_almost_eq!(tfmp.pvalue(19.1), 1e-10, places = 5);
588    }
589
590    #[test]
591    fn score() {
592        let pssm = build_ma0045();
593        let mut tfmp = TfmPvalue::new(&pssm);
594
595        assert_almost_eq!(tfmp.score(0.00001), 12.657785, places = 4);
596        assert_almost_eq!(tfmp.score(0.0003), 8.882756, places = 5);
597        assert_almost_eq!(tfmp.score(1e-10), 19.1, places = 5);
598    }
599}