1#![doc = include_str!("../README.md")]
2
3use std::cmp::Ordering;
4use std::collections::HashMap;
5use std::fmt::Debug;
6use std::ops::RangeInclusive;
7
8use lightmotif::abc::Alphabet;
9use lightmotif::dense::DenseMatrix;
10use lightmotif::num::Unsigned;
11use lightmotif::pwm::ScoringMatrix;
12
13mod hash;
14
15pub type IntMap<V> = HashMap<i64, V, self::hash::IntHasherBuilder>;
17
18#[derive(Debug)]
20pub struct TfmPvalue<A: Alphabet, M: AsRef<ScoringMatrix<A>>> {
21 matrix: M,
23 permutation: Vec<usize>,
25 granularity: f64,
27 offsets: Vec<i64>,
29 int_matrix: DenseMatrix<i64, A::K>,
31 error_max: f64,
33 max_score_rows: Vec<i64>,
35 min_score_rows: Vec<i64>,
37 qvalues: Vec<IntMap<f64>>,
39}
40
41#[allow(non_snake_case)]
42impl<A: Alphabet, M: AsRef<ScoringMatrix<A>>> TfmPvalue<A, M> {
43 pub fn new(matrix: M) -> Self {
45 let m = matrix.as_ref();
46 let M = m.len();
47
48 let range = (0..M)
52 .map(|i| {
53 let row = &m[i][..A::K::USIZE - 1];
54 let max_score = row.iter().cloned().reduce(f32::max).unwrap_or_default();
55 let min_score = row.iter().cloned().reduce(f32::min).unwrap_or_default();
56 max_score - min_score
57 })
58 .collect::<Vec<_>>();
59 let mut permutation: Vec<usize> = (0..M).collect();
60 permutation.sort_unstable_by(|i, j| range[*j].partial_cmp(&range[*i]).unwrap());
61
62 Self {
63 granularity: f64::NAN,
64 matrix,
65 permutation,
66 offsets: vec![0; M],
67 int_matrix: DenseMatrix::new(M),
68 max_score_rows: vec![0; M],
69 min_score_rows: vec![0; M],
70 qvalues: vec![IntMap::default(); M + 1],
71 error_max: 0.0,
72 }
73 }
74
75 pub fn as_inner(&self) -> &M {
77 &self.matrix
78 }
79
80 pub fn into_inner(self) -> M {
82 self.matrix
83 }
84
85 fn recompute(&mut self, granularity: f64) {
87 assert!(granularity < 1.0);
88 let matrix = self.matrix.as_ref();
89 let M: usize = matrix.len();
90 let K: usize = <A as Alphabet>::K::USIZE;
91
92 self.granularity = granularity;
94
95 for (i, &p) in self.permutation.iter().enumerate() {
97 for j in 0..K - 1 {
98 self.int_matrix[i][j] = (matrix[p][j] as f64 / self.granularity).floor() as i64;
99 }
100 }
101
102 self.error_max = 0.0;
104 for i in 1..M {
105 let max_e = matrix[self.permutation[i]]
106 .iter()
107 .enumerate()
108 .map(|(j, &x)| (x as f64) / self.granularity - self.int_matrix[i][j] as f64)
109 .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Less))
110 .unwrap();
111 self.error_max += max_e;
112 }
113
114 for i in 0..M {
116 self.offsets[i] = -*self.int_matrix[i][..K - 1].iter().min().unwrap();
117 for j in 0..K - 1 {
118 self.int_matrix[i][j] += self.offsets[i];
119 }
120 }
121
122 for i in 0..M {
124 self.min_score_rows[i] = *self.int_matrix[i][..K - 1].iter().min().unwrap();
125 self.max_score_rows[i] = *self.int_matrix[i][..K - 1].iter().max().unwrap();
126 }
127 }
128
129 fn distribution(&mut self, min: i64, max: i64) {
133 for map in self.qvalues.iter_mut() {
135 map.clear();
136 }
137
138 let matrix = self.matrix.as_ref();
140 let M: usize = matrix.len();
141 let K: usize = <A as Alphabet>::K::USIZE;
142
143 let bg = matrix.background().frequencies();
145
146 let mut maxs = vec![0; M + 1];
148 for i in (0..M).rev() {
149 maxs[i] = maxs[i + 1] + self.max_score_rows[i];
150 }
151
152 for k in 0..K - 1 {
154 if self.int_matrix[0][k] + maxs[1] >= min {
155 *self.qvalues[0].entry(self.int_matrix[0][k]).or_default() += bg[k] as f64;
156 }
157 }
158
159 self.qvalues[M - 1].insert(max + 1, 0.0);
161 for pos in 1..M {
162 let int_row = &self.int_matrix[pos];
164 let (l, r) = self.qvalues.split_at_mut(pos);
166 for (key, val) in &l[pos - 1] {
168 for k in 0..K - 1 {
169 let sc = key + int_row[k];
170 if sc + maxs[pos + 1] >= min {
171 let occ = val * bg[k] as f64;
173 if sc > max {
174 *r[M - 1 - pos].entry(max + 1).or_default() += occ;
176 } else {
177 *r[0].entry(sc).or_default() += occ;
178 }
179 }
180 }
181 }
182 }
183 }
184
185 fn lookup_pvalue(&mut self, score: f64) -> RangeInclusive<f64> {
187 assert!(!self.granularity.is_nan());
188 let matrix = self.matrix.as_ref();
189 let M: usize = matrix.len();
190
191 let scaled = score / self.granularity + self.offsets.iter().sum::<i64>() as f64;
193 let avg = scaled.floor() as i64;
194 let max = (scaled + self.error_max + 1.0).floor() as i64;
195 let min = (scaled - self.error_max - 1.0).floor() as i64;
196
197 self.distribution(min, max);
199
200 let mut pvalues = IntMap::default();
202 let mut s = max + 1;
203 let mut last = self.qvalues[M - 1].keys().cloned().collect::<Vec<i64>>();
204 last.sort_unstable_by(|x, y| x.partial_cmp(y).unwrap());
205 let mut sum = self.qvalues[0].get(&(max + 1)).cloned().unwrap_or_default();
206 for &l in last.iter().rev() {
207 sum += self.qvalues[M - 1][&l];
208 if l >= avg {
209 s = l;
210 }
211 pvalues.insert(l, sum);
212 }
213
214 let mut keys = pvalues.keys().cloned().collect::<Vec<i64>>();
216 keys.sort_unstable_by(|x, y| x.partial_cmp(y).unwrap());
217 let mut kmax = keys.iter().position(|&k| k == s).unwrap();
218 while kmax > 0 && keys[kmax] as f64 >= s as f64 - self.error_max {
219 kmax -= 1;
220 }
221
222 let pmax = pvalues[&keys[kmax]];
224 let pmin = pvalues[&s];
225 RangeInclusive::new(pmin, pmax)
226 }
227
228 fn lookup_score(
230 &mut self,
231 pvalue: f64,
232 range: RangeInclusive<i64>,
233 ) -> (i64, RangeInclusive<f64>) {
234 assert!(!self.granularity.is_nan());
235 let matrix = self.matrix.as_ref();
236 let M: usize = matrix.len();
237
238 let min = *range.start();
240 let max = *range.end();
241
242 self.distribution(min, max);
244 let mut pvalues = IntMap::default();
245
246 let mut keys = self.qvalues[M - 1].keys().cloned().collect::<Vec<_>>();
248 keys.sort_unstable_by(|x, y| x.partial_cmp(y).unwrap());
249
250 let mut sum = 0.0;
252 let mut riter = keys.len() - 1;
253 let alpha;
254 let alpha_e;
255 while riter > 0 {
256 sum += self.qvalues[M - 1][&keys[riter]];
257 pvalues.insert(keys[riter], sum);
258 if sum >= pvalue {
259 break;
260 }
261 riter -= 1;
262 }
263
264 if sum > pvalue {
265 alpha_e = keys[riter];
266 alpha = keys[riter + 1];
267 } else {
268 if riter == 0 {
269 alpha = keys[0];
270 alpha_e = keys[0];
271 } else {
272 alpha = keys[riter];
273 alpha_e = keys[riter - 1];
274 sum += pvalues.get(&alpha_e).cloned().unwrap_or_default();
275 }
276 pvalues.insert(alpha_e, sum);
277 }
278
279 if (alpha - alpha_e) as f64 > self.error_max {
280 (alpha, RangeInclusive::new(pvalues[&alpha], pvalues[&alpha]))
281 } else {
282 (
283 alpha,
284 RangeInclusive::new(pvalues[&alpha_e], pvalues[&alpha]),
285 )
286 }
287 }
288
289 pub fn pvalue(&mut self, score: f64) -> f64 {
297 let it = self.approximate_pvalue(score).last().unwrap();
298 assert!(it.converged); *it.range.start()
300 }
301
302 pub fn approximate_pvalue(&mut self, score: f64) -> PvaluesIterator<'_, A, M> {
328 PvaluesIterator {
329 tfmp: self,
330 score,
331 decay: 10.0,
332 granularity: 0.1,
333 target: 0.0,
334 converged: false,
335 }
336 }
337
338 pub fn score(&mut self, pvalue: f64) -> f64 {
346 let it = self.approximate_score(pvalue).last().unwrap();
347 assert!(it.converged); it.score
349 }
350
351 pub fn approximate_score(&mut self, pvalue: f64) -> ScoresIterator<'_, A, M> {
353 self.recompute(0.1);
354 ScoresIterator {
355 min: self.min_score_rows.iter().sum(),
356 max: self.max_score_rows.iter().sum::<i64>() + (self.error_max + 0.5).ceil() as i64,
357 tfmp: self,
358 pvalue,
359 decay: 10.0,
360 granularity: 0.1,
361 target: 0.0,
362 converged: false,
363 }
364 }
365}
366
367impl<A: Alphabet, M: AsRef<ScoringMatrix<A>>> From<M> for TfmPvalue<A, M> {
368 fn from(matrix: M) -> Self {
369 Self::new(matrix)
370 }
371}
372
373#[derive(Debug, Clone)]
375pub struct Iteration {
376 pub score: f64,
379 pub range: RangeInclusive<f64>,
381 pub granularity: f64,
383 pub converged: bool,
385 #[allow(unused)]
386 _hidden: (),
387}
388
389#[derive(Debug)]
391pub struct PvaluesIterator<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> {
392 tfmp: &'tfmp mut TfmPvalue<A, M>,
393 score: f64,
394 decay: f64,
395 granularity: f64,
396 target: f64,
397 converged: bool,
398}
399
400impl<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> Iterator for PvaluesIterator<'tfmp, A, M> {
401 type Item = Iteration;
402 fn next(&mut self) -> Option<Self::Item> {
403 if self.converged || self.granularity <= self.target {
404 return None;
405 }
406
407 self.tfmp.recompute(self.granularity);
408 let granularity = self.granularity;
409 let range = self.tfmp.lookup_pvalue(self.score);
410
411 self.granularity /= self.decay;
412 if range.start() == range.end() {
413 self.converged = true;
414 }
415
416 Some(Iteration {
417 range,
418 granularity,
419 converged: self.converged,
420 score: self.score,
421 _hidden: (),
422 })
423 }
424}
425
426#[derive(Debug)]
428pub struct ScoresIterator<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> {
429 tfmp: &'tfmp mut TfmPvalue<A, M>,
430 pvalue: f64,
431 decay: f64,
432 granularity: f64,
433 target: f64,
434 converged: bool,
435 min: i64,
436 max: i64,
437}
438
439impl<'tfmp, A: Alphabet, M: AsRef<ScoringMatrix<A>>> Iterator for ScoresIterator<'tfmp, A, M> {
440 type Item = Iteration;
441 fn next(&mut self) -> Option<Self::Item> {
442 if self.converged || self.granularity <= self.target {
443 return None;
444 }
445
446 self.tfmp.recompute(self.granularity);
447 let granularity = self.granularity;
448 let (iscore, range) = self
449 .tfmp
450 .lookup_score(self.pvalue, RangeInclusive::new(self.min, self.max));
451
452 self.granularity /= self.decay;
453 self.min =
454 ((iscore as f64 - (self.tfmp.error_max + 0.5).ceil()) * self.decay).floor() as i64;
455 self.max =
456 ((iscore as f64 + (self.tfmp.error_max + 0.5).ceil()) * self.decay).floor() as i64;
457 if range.start() == range.end() {
458 self.converged = true;
459 }
460
461 let offset = self.tfmp.offsets.iter().sum::<i64>();
462 Some(Iteration {
463 granularity,
464 range,
465 score: (iscore - offset) as f64 * granularity,
466 converged: self.converged,
467 _hidden: (),
468 })
469 }
470}
471
472#[cfg(test)]
473mod test {
474
475 use lightmotif::abc::Dna;
476 use lightmotif::dense::DenseMatrix;
477 use lightmotif::pwm::CountMatrix;
478
479 use super::*;
480
481 macro_rules! assert_almost_eq {
482 ($x:expr, $y:expr, places = $places:expr) => {{
483 assert_eq!(
484 ($x * 10f64.powi($places)).round(),
485 ($y * 10f64.powi($places)).round(),
486 )
487 }};
488 }
489
490 fn build_ma0045() -> ScoringMatrix<Dna> {
492 #[rustfmt::skip]
493 let counts = DenseMatrix::from_rows([
494 [ 3, 5, 2, 4, 0],
496 [ 7, 0, 4, 3, 0],
497 [ 9, 1, 3, 1, 0],
498 [ 3, 6, 1, 4, 0],
499 [11, 0, 0, 3, 0],
500 [11, 0, 1, 2, 0],
501 [11, 0, 1, 2, 0],
502 [ 3, 3, 6, 2, 0],
503 [ 4, 1, 1, 8, 0],
504 [ 3, 4, 1, 6, 0],
505 [ 8, 5, 0, 1, 0],
506 [ 8, 1, 1, 4, 0],
507 [ 9, 0, 3, 2, 0],
508 [ 9, 5, 0, 0, 0],
509 [11, 0, 0, 3, 0],
510 [ 2, 7, 5, 0, 0],
511 ]);
512 CountMatrix::new(counts)
513 .unwrap()
514 .to_freq(0.25)
515 .to_scoring(None)
516 }
517
518 #[test]
519 fn approximate_pvalue() {
520 let pssm = build_ma0045();
521 let mut tfmp = TfmPvalue::new(&pssm);
522 let mut pvalues = tfmp.approximate_pvalue(10.0);
523
524 let it = pvalues.next().unwrap();
536 assert_almost_eq!(it.granularity, 1e-1, places = 5);
537 assert_almost_eq!(it.range.start(), 5.74842561e-5, places = 7);
538 assert_almost_eq!(it.range.end(), 0.000185822369, places = 7);
539 assert!(!it.converged);
540
541 let it = pvalues.next().unwrap();
542 assert_almost_eq!(it.granularity, 1e-2, places = 7);
543 assert_almost_eq!(it.range.start(), 0.000119815, places = 5);
544 assert_almost_eq!(it.range.end(), 0.000129149, places = 7);
545 assert!(!it.converged);
546
547 let it = pvalues.next().unwrap();
548 assert_almost_eq!(it.granularity, 1e-3, places = 5);
549 assert_almost_eq!(it.range.start(), 0.000124890, places = 7);
550 assert_almost_eq!(it.range.end(), 0.000126113, places = 7);
551 assert!(!it.converged);
552
553 let it = pvalues.next().unwrap();
554 assert_almost_eq!(it.granularity, 1e-4, places = 5);
555 assert_almost_eq!(it.range.start(), 0.00012567, places = 5);
556 assert_almost_eq!(it.range.end(), 0.000126059, places = 5);
557 assert!(!it.converged);
558
559 let it = pvalues.next().unwrap();
560 assert_almost_eq!(it.granularity, 1e-5, places = 5);
561 assert_almost_eq!(it.range.start(), 0.00012601, places = 5);
562 assert_almost_eq!(it.range.end(), 0.0001260137, places = 5);
563 assert!(!it.converged);
564
565 let it = pvalues.next().unwrap();
566 assert_almost_eq!(it.granularity, 1e-6, places = 5);
567 assert_almost_eq!(it.range.start(), 0.00012601, places = 5);
568 assert_almost_eq!(it.range.end(), 0.0001260137, places = 5);
569 assert!(!it.converged);
570
571 let it = pvalues.next().unwrap();
572 assert_almost_eq!(it.granularity, 1e-7, places = 5);
573 assert_almost_eq!(it.range.start(), 0.0001260, places = 5);
574 assert_almost_eq!(it.range.end(), 0.0001260132, places = 5);
575 assert!(it.converged);
576
577 assert!(pvalues.next().is_none());
578 }
579
580 #[test]
581 fn pvalue() {
582 let pssm = build_ma0045();
583 let mut tfmp = TfmPvalue::new(&pssm);
584
585 assert_almost_eq!(tfmp.pvalue(8.882756), 0.0003, places = 5);
586 assert_almost_eq!(tfmp.pvalue(12.657785), 0.00001, places = 5);
587 assert_almost_eq!(tfmp.pvalue(19.1), 1e-10, places = 5);
588 }
589
590 #[test]
591 fn score() {
592 let pssm = build_ma0045();
593 let mut tfmp = TfmPvalue::new(&pssm);
594
595 assert_almost_eq!(tfmp.score(0.00001), 12.657785, places = 4);
596 assert_almost_eq!(tfmp.score(0.0003), 8.882756, places = 5);
597 assert_almost_eq!(tfmp.score(1e-10), 19.1, places = 5);
598 }
599}