pattern_partition_prediction/
lib.rs

1extern crate tabfile;
2
3pub mod error;
4
5use std::borrow::Cow;
6use std::convert::TryInto;
7use std::path::Path;
8
9use tabfile::Tabfile;
10
11use error::Error;
12
13pub type Float = f32;
14type TransitionProbabilities = [Float; 3];
15
16#[derive(Copy, Clone, Debug)]
17pub struct IndelTransitionProbabilities {
18    pub outframe: Float, // frameshift
19    pub inframe: Float, // not a frameshift
20}
21
22impl Default for IndelTransitionProbabilities {
23    fn default() -> Self {
24        Self{outframe: Float::NAN, inframe: Float::NAN}
25    }
26}
27
28// X -> X will always be NaN
29#[derive(Debug)]
30pub struct SubstitutionRates {
31    pub a: Float,
32    pub c: Float,
33    pub g: Float,
34    pub t: Float,
35}
36
37impl SubstitutionRates {
38    /// Dynamic field access to a specific nucleotide
39    ///
40    /// ```
41    /// extern crate pattern_partition_prediction;
42    /// use pattern_partition_prediction::SubstitutionRates;
43    ///
44    /// let rates = SubstitutionRates{
45    ///   a: 0.1,
46    ///   c: 0.2,
47    ///   g: 0.3,
48    ///   t: 0.4,
49    /// };
50    /// assert_eq!( rates.by_name( 'a' ), 0.1 );
51    /// assert_eq!( rates.by_name( 'c' ), 0.2 );
52    /// assert_eq!( rates.by_name( 'g' ), 0.3 );
53    /// assert_eq!( rates.by_name( 't' ), 0.4 );
54    /// ```
55    pub fn by_name(&self, nucleotide: char) -> Float {
56        match nucleotide {
57            'A' | 'a' => self.a,
58            'C' | 'c' => self.c,
59            'G' | 'g' => self.g,
60            'T' | 't' => self.t,
61            _ => panic!("Not a valid nucleotide: {}", nucleotide),
62        }
63    }
64}
65
66/// A pattern partition prediction lookup table
67///
68/// Use the `new` method to read in a sequence-context point mutation probabilities file.
69/// This file has the following format:
70///
71/// ```text
72/// A->C ANAANVY 9.624201240304532e-06
73/// A->C BNAANVY 7.1908831554369445e-06
74/// A->C KNAANVR 5.345747358414394e-06
75/// A->C MNAANVR 7.589541872637903e-06
76/// A->C NAAAVTN 7.0416965447897126e-06
77/// [and so on]
78/// ```
79/// Each column is separated by a single space.
80/// The first column is the point mutation.
81/// The second column is the sequence context where the mutation takes place (UIPAC notation)
82/// The third column is the point mutation rate with this context
83///
84/// The reverse complement is handled automatically. All sequence must be different (even when you
85/// expand the UIPAC code)
86///
87
88pub struct PaPaPred {
89    radius: usize,
90    lookup: Vec<TransitionProbabilities>, // ( odd kmer -> base 4 ) -> TransitionProbabilities
91}
92
93impl PaPaPred {
94    /// Create a PaPaPred instance.
95    ///
96    /// The input file must consist of 3 space-separated columns:
97    /// 1) The substitution in the form X->Y where X, Y in {ACGT}
98    /// 2) The pattern in UIPAC notation.
99    /// 3) The probability for the substitution with that pattern
100    ///
101    /// You provide the `path` to the sequence-context point mutation probabilities file.
102    /// If you want to ensure that the patters have a minimum size, you can provide a value for
103    /// `min_kmer_size` which will pad all patterns with Ns at the end. The minimum kmer size has
104    /// to be an odd number because there has to be a single mutated base in the center of the
105    /// sequence.
106    pub fn new<P: AsRef<Path>>(path: P, min_kmer_size: Option<usize>) -> Result<PaPaPred, Error> {
107        let mut kmer_size = 0; // will be detected from input size
108        let template: Vec<TransitionProbabilities> = vec![[Float::NAN; 3]]; // missing values should be treated as NaNs
109        let mut lookup: Vec<TransitionProbabilities> = Vec::new();
110        for record_result in Tabfile::open(path)?.separator(' ') {
111            let record = record_result?;
112            let tokens = record.fields();
113
114            if tokens[0].starts_with('#') || tokens.len() == 0 {
115                continue // ignore header or comments
116            }
117
118            let substitution = tokens[0].parse::<Substitution>().unwrap();
119            let uipac_context = pad_size(tokens[1], min_kmer_size.unwrap_or(1));
120            let rate = tokens[2].parse::<Float>().unwrap();
121
122            if kmer_size == 0 {
123                kmer_size = uipac_context.len();
124                if kmer_size % 2 == 0 {
125                    return Err(Error::BadKmerSize(kmer_size));
126                }
127                lookup = template.repeat(4usize.pow(kmer_size as u32));
128            } else if uipac_context.len() != kmer_size {
129                let message = format!(
130                    "The following line does not have length {}: {}",
131                    kmer_size,
132                    record.line()
133                );
134                return Err(Error::FileFormat { message });
135            }
136
137            for context in expand_uipac(&uipac_context) {
138                let context_base4 = seq2base_four(&context)?;
139                let index = substitution.transition_probabilities_index();
140                lookup[context_base4][index] = rate;
141                let complement_context = base4_reverse_complement(context_base4, kmer_size);
142                let complement_index = substitution.complement().transition_probabilities_index();
143                lookup[complement_context][complement_index] = rate;
144            }
145        }
146        let radius = kmer_size / 2;
147        Ok(PaPaPred { radius, lookup })
148    }
149
150    /// Query the mutation rates for a sequence context
151    ///
152    /// The middle position of the `seq` is assumed to be mutating.
153    pub fn rates(&self, seq: &str) -> Result<SubstitutionRates, Error> {
154        let index = seq2base_four(seq)?;
155        let rates = self.lookup[index];
156        let ref_base: Base = seq
157            .chars()
158            .nth(self.radius)
159            .expect("sequence is too short")
160            .try_into()
161            .unwrap();
162        Ok(match ref_base {
163            Base::A => SubstitutionRates {
164                a: Float::NAN,
165                c: rates[0],
166                g: rates[1],
167                t: rates[2],
168            },
169            Base::C => SubstitutionRates {
170                a: rates[0],
171                c: Float::NAN,
172                g: rates[1],
173                t: rates[2],
174            },
175            Base::G => SubstitutionRates {
176                a: rates[0],
177                c: rates[1],
178                g: Float::NAN,
179                t: rates[2],
180            },
181            Base::T => SubstitutionRates {
182                a: rates[0],
183                c: rates[1],
184                g: rates[2],
185                t: Float::NAN,
186            },
187        })
188    }
189
190    pub fn kmer_size(&self) -> usize {
191        1 + 2 * self.radius
192    }
193
194    pub fn radius(&self) -> usize {
195        self.radius
196    }
197}
198
199pub struct PaPaPredIndel {
200    radius: usize,
201    lookup: Vec<IndelTransitionProbabilities>, // ( oven kmer -> base 4 ) -> IndelTransitionProbabilities
202}
203
204impl PaPaPredIndel {
205    /// Create a PaPaPredIndel instance.
206    ///
207    /// Note that this class treats insertions and deletions the same, hence "indel".
208    /// But it distinguishes between outframe (=frameshift) and inframe (non-frameshift) mutations.
209    ///
210    /// The input file must consist of 3 space-separated columns:
211    /// 1) The pattern in UIPAC notation.
212    /// 2) The probability for a frameshift indel
213    /// 3) The probability for a non-frameshift indel
214    ///
215    /// You provide the `path` to the sequence-context point mutation probabilities file.
216    /// If you want to ensure that the patters have a minimum size, you can provide a value for
217    /// `min_kmer_size` which will pad all patterns with Ns at the end. The minimum kmer size has
218    /// to be an even number because the insertion or deletion should happen right in the middle
219    /// of the sequence
220    pub fn new<P: AsRef<Path>>(path: P, min_kmer_size: Option<usize>) -> Result<Self, Error> {
221        let mut kmer_size = 0; // will be detected from input size
222        let template: Vec<IndelTransitionProbabilities> = vec![IndelTransitionProbabilities::default()]; // missing values should be treated as NaNs
223        let mut lookup: Vec<IndelTransitionProbabilities> = Vec::new();
224        for record_result in Tabfile::open(path)?.separator(' ') {
225            let record = record_result?;
226            let tokens = record.fields();
227            if tokens[0].starts_with('#') || tokens.len() == 0 {
228                continue // ignore header or comments
229            }
230
231            let uipac_context = pad_size(tokens[0], min_kmer_size.unwrap_or(1));
232            let frameshift_probability = tokens[1].parse::<Float>().unwrap();
233            let inframe_probability = tokens[2].parse::<Float>().unwrap();
234
235            if kmer_size == 0 { // first line of input
236                kmer_size = uipac_context.len();
237                if kmer_size % 2 == 1 {
238                    return Err(Error::BadKmerSize(kmer_size));
239                }
240                lookup = template.repeat(4usize.pow(kmer_size as u32));
241            } else if uipac_context.len() != kmer_size {
242                let message = format!(
243                    "The following line does not have length {}: {}",
244                    kmer_size,
245                    record.line()
246                );
247                return Err(Error::FileFormat { message });
248            }
249
250            let rate = IndelTransitionProbabilities{
251                outframe: frameshift_probability,
252                inframe: inframe_probability,
253            };
254
255            for context in expand_uipac(&uipac_context) {
256                let context_base4 = seq2base_four(&context)?;
257                lookup[context_base4] = rate;
258                //let complement_context = base4_reverse_complement(context_base4, kmer_size);
259                //lookup[complement_context] = rate;
260            }
261        }
262        let radius = kmer_size / 2;
263        Ok(Self{radius, lookup})
264    }
265
266    /// Query the mutation rates for a sequence context
267    ///
268    /// The indel is assumed to happen inbetween the two middle nuleotides
269    pub fn rates(&self, seq: &str) -> Result<IndelTransitionProbabilities, Error> {
270        let index = seq2base_four(seq)?;
271        let rates = self.lookup[index];
272        Ok(rates)
273    }
274
275    pub fn kmer_size(&self) -> usize {
276        2 * self.radius
277    }
278
279    pub fn radius(&self) -> usize {
280        self.radius
281    }
282}
283
284#[derive(PartialEq)]
285enum Base {
286    A,
287    C,
288    G,
289    T,
290}
291
292impl Base {
293    fn complement(&self) -> Self {
294        match self {
295            Base::A => Base::T,
296            Base::C => Base::G,
297            Base::G => Base::C,
298            Base::T => Base::A,
299        }
300    }
301}
302
303struct Substitution {
304    from: Base,
305    to: Base,
306}
307
308impl Substitution {
309    /// There are no transitions from nucleotide X to X
310    /// Therefore it is sufficient to store only 3 transitions
311    /// But we need to know how to adjust the index inside TransitionProbabilities
312    fn transition_probabilities_index(&self) -> usize {
313        match self.from {
314            Base::A => match self.to {
315                Base::A => panic!("Can't transition from A to A"),
316                Base::C => 0,
317                Base::G => 1,
318                Base::T => 2,
319            },
320            Base::C => match self.to {
321                Base::A => 0,
322                Base::C => panic!("Can't transition from C to C"),
323                Base::G => 1,
324                Base::T => 2,
325            },
326            Base::G => match self.to {
327                Base::A => 0,
328                Base::C => 1,
329                Base::G => panic!("Can't transition from G to G"),
330                Base::T => 2,
331            },
332            Base::T => match self.to {
333                Base::A => 0,
334                Base::C => 1,
335                Base::G => 2,
336                Base::T => panic!("Can't transition from T to T"),
337            },
338        }
339    }
340
341    fn complement(&self) -> Self {
342        Substitution {
343            from: self.from.complement(),
344            to: self.to.complement(),
345        }
346    }
347}
348
349impl std::str::FromStr for Substitution {
350    type Err = Error;
351    fn from_str(s: &str) -> Result<Self, Self::Err> {
352        if s.len() == 4 {
353            if &s[1..3] != "->" {
354                Err(Error::FileFormat {
355                    message: "expected '->' between nucleotides".to_string(),
356                })
357            } else {
358                let from = s.chars().next().expect("not empty").try_into()?;
359                let to = s.chars().nth(3).expect("4 chars").try_into()?;
360                Ok(Substitution { from, to })
361            }
362        } else {
363            Err( Error::FileFormat{ message:
364                format!( "The string {} does not confirm to the scheme X->Y (where X and Y are nucleotides", s ) } )
365        }
366    }
367}
368
369impl std::convert::TryFrom<char> for Base {
370    type Error = Error;
371    fn try_from(value: char) -> Result<Self, Self::Error> {
372        Ok(match value {
373            'A' => Base::A,
374            'C' => Base::C,
375            'G' => Base::G,
376            'T' => Base::T,
377            _ => return Err(Error::BadNucleotide(value)),
378        })
379    }
380}
381
382fn seq2base_four(seq: &str) -> Result<usize, Error> {
383    let mut result = 0;
384    for c in seq.chars() {
385        let value = match c {
386            'A' => 0,
387            'C' => 1,
388            'G' => 2,
389            'T' => 3,
390            _ => return Err(Error::BadNucleotide(c))
391        };
392        result <<= 2;
393        result += value;
394    }
395    Ok(result)
396}
397
398fn expand_uipac(seq: &str) -> Vec<String> {
399    // this is fairly expensive. Can I speed this up?
400    if !seq.is_empty() {
401        let (c, rest) = seq.split_at(1);
402        let suffixes = expand_uipac(rest);
403
404        let mut result = Vec::new();
405        let expansions = match c.chars().next().unwrap() {
406            'A' => "A",
407            'C' => "C",
408            'G' => "G",
409            'T' => "T",
410            'R' => "AG",
411            'Y' => "CT",
412            'S' => "GC",
413            'W' => "AT",
414            'K' => "GT",
415            'M' => "AC",
416            'B' => "CGT",
417            'D' => "AGT",
418            'H' => "ACT",
419            'V' => "ACG",
420            'N' => "ACGT",
421            _ => panic!("Invalid code: {}", c),
422        };
423        for expansion in expansions.chars() {
424            if !suffixes.is_empty() {
425                for suffix in &suffixes {
426                    let mut s = String::with_capacity(suffix.len() + 1);
427                    s.push(expansion);
428                    s.push_str(&suffix);
429                    result.push(s);
430                }
431            } else {
432                result.push(expansion.to_string());
433            }
434        }
435        result
436    } else {
437        Vec::new()
438    }
439}
440
441fn pad_size(seq: &str, min_size: usize) -> Cow<str> {
442    let len = seq.len();
443    if len >= min_size {
444        Cow::Borrowed(seq)
445    } else {
446        let flanking = "N".repeat((min_size - len) / 2);
447        Cow::Owned(format!("{}{}{}", flanking, seq, flanking))
448    }
449}
450
451// based on the code in seq2base_four, the following conventions apply:
452// A -> 0
453// C -> 1
454// G -> 2
455// T -> 3
456//
457// And therefore the following complements apply:
458// 0 -> A => T -> 3
459// 1 -> C => G -> 2
460// 2 -> G => C -> 1
461// 3 -> T => A -> 0
462//
463// So, basically: complement(x) -> 3 - x
464fn base4_reverse_complement(mut value: usize, digits: usize) -> usize {
465    let mut result = 0;
466    for _ in 0..digits {
467        result <<= 2; // multiply result by 4
468        result += 3 - (value & 3); // bitmask the last two bits and complement
469        value >>= 2; // shift the last two bits away
470    }
471    result
472}
473
474#[cfg(test)]
475mod tests {
476    use super::*;
477
478    #[test]
479    fn test_seq2base_four() {
480        let fun = seq2base_four;
481        assert_eq!(fun("ACGT").unwrap(), 0 * 64 + 1 * 16 + 2 * 4 + 3 * 1);
482        assert_eq!(fun("TGCA").unwrap(), 3 * 64 + 2 * 16 + 1 * 4 + 0 * 1);
483        assert_eq!(fun("").unwrap(), 0);
484        assert_eq!(fun("TTTG").unwrap(), 3 * 64 + 3 * 16 + 3 * 4 + 2);
485        assert_eq!(fun("AAAAAAAA").unwrap(), 0);
486        assert_eq!(fun("CCCCC").unwrap(), 256 + 64 + 16 + 4 + 1);
487        assert_eq!(
488            fun("ACCGCCT").unwrap(),
489            1 * 1024 + 1 * 256 + 2 * 64 + 1 * 16 + 1 * 4 + 3
490        );
491        assert_eq!(fun("e").unwrap_err(), Error::BadNucleotide('e'));
492        assert_eq!(fun("ACGTmACGT").unwrap_err(), Error::BadNucleotide('m'));
493    }
494
495    #[test]
496    fn test_papapred() {
497        let full_input_file = "test_assets/PaPa_rates.txt";
498        let papa = PaPaPred::new(full_input_file, None).unwrap();
499
500        let rates = papa.rates("ACCGCCT").unwrap();
501        assert_eq!(rates.a, 6.0526200e-04);
502        assert_eq!(rates.c, 2.3540691e-05);
503        assert!(rates.g.is_nan());
504        assert_eq!(rates.t, 3.8108174e-05);
505
506        let rates = papa.rates("AGGCGGT").unwrap();
507        assert_eq!(rates.a, 3.8108174e-05);
508        assert!(rates.c.is_nan());
509        assert_eq!(rates.g, 2.3540691e-05);
510        assert_eq!(rates.t, 6.0526200e-04);
511
512        let rates = papa.rates("AAACAAA").unwrap();
513        assert_eq!(rates.a, 1.94144068e-05);
514        assert!(rates.c.is_nan());
515        assert_eq!(rates.g, 1.33842095e-05);
516        assert_eq!(rates.t, 3.95017705e-05);
517
518        let rates = papa.rates("AAACAAA").unwrap();
519        assert_eq!(rates.a, 1.94144068e-05);
520        assert!(rates.c.is_nan());
521        assert_eq!(rates.g, 1.33842095e-05);
522        assert_eq!(rates.t, 3.95017705e-05);
523
524        let rates = papa.rates("TCATTTT").unwrap();
525        assert_eq!(rates.a, 5.1615812e-06);
526        assert_eq!(rates.c, 2.6344846e-05);
527        assert_eq!(rates.g, 7.5895418e-06);
528        assert!(rates.t.is_nan());
529
530        let rates = papa.rates("TAGCTTA").unwrap();
531        assert_eq!(rates.a, 1.18828875e-05);
532        assert!(rates.c.is_nan());
533        assert_eq!(rates.g, 1.92736370e-05);
534        assert_eq!(rates.t, 5.04661366e-05);
535
536        let rates = papa.rates("GTCGTGC").unwrap();
537        assert_eq!(rates.a, 6.9644750e-04);
538        assert_eq!(rates.c, 2.0450439e-05);
539        assert!(rates.g.is_nan());
540        assert_eq!(rates.t, 1.7836264e-05);
541
542        let rates = papa.rates("GGTACTT").unwrap();
543        assert!(rates.a.is_nan());
544        assert_eq!(rates.c, 7.8399744e-06);
545        assert_eq!(rates.g, 3.8916667e-05);
546        assert_eq!(rates.t, 6.6361449e-06);
547    }
548
549    #[test]
550    fn test_papapred_indel() {
551        let full_input_file = "test_assets/PaPa_rates_indel.txt";
552        let papa = PaPaPredIndel::new(full_input_file, None).unwrap();
553
554        let rates = papa.rates("ACGTAC").unwrap();
555        assert_eq!(rates.outframe, 0.25);
556        assert_eq!(rates.inframe, 0.75);
557
558        let rates = papa.rates("CATCAT").unwrap();
559        assert_eq!(rates.outframe, 0.125);
560        assert_eq!(rates.inframe, 0.5625);
561
562        let rates = papa.rates("TTATGG").unwrap();
563        assert_eq!(rates.outframe, 0.5);
564        assert_eq!(rates.inframe, 0.625);
565
566        let rates = papa.rates("GGCGTA").unwrap();
567        assert_eq!(rates.outframe, 0.3125);
568        assert_eq!(rates.inframe, 0.375);
569
570        let rates = papa.rates("TTTTTT").unwrap();
571        assert!(rates.outframe.is_nan());
572        assert!(rates.inframe.is_nan());
573
574    }
575
576    #[test]
577    fn test_expand_uipac() {
578        let mut foo = expand_uipac("ACGT");
579        assert_eq!(foo.len(), 1);
580        assert_eq!(foo.pop().unwrap(), "ACGT");
581
582        foo = expand_uipac("");
583        assert_eq!(foo.len(), 0);
584
585        foo = expand_uipac("NN");
586        let bar = vec![
587            "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC",
588            "TG", "TT",
589        ];
590        assert_eq!(foo, bar);
591
592        foo = expand_uipac("RYS"); //AG CT GC
593        let bar = vec!["ACG", "ACC", "ATG", "ATC", "GCG", "GCC", "GTG", "GTC"];
594        assert_eq!(foo, bar);
595    }
596    #[test]
597    fn test_base4_reverse_complement() {
598        assert_eq!(
599            base4_reverse_complement(seq2base_four("AAGG").unwrap(), 4),
600            seq2base_four("CCTT").unwrap()
601        );
602        assert_eq!(
603            base4_reverse_complement(seq2base_four("GTACTAG").unwrap(), 7),
604            seq2base_four("CTAGTAC").unwrap()
605        );
606        assert_eq!(
607            base4_reverse_complement(seq2base_four("TCGATTGCAT").unwrap(), 10),
608            seq2base_four("ATGCAATCGA").unwrap()
609        );
610        assert_eq!(
611            base4_reverse_complement(0, 6), // 6 times A
612            seq2base_four("TTTTTT").unwrap()
613        );
614        assert_eq!(
615            base4_reverse_complement(seq2base_four("AG").unwrap(), 8),
616            seq2base_four("CTTTTTTT").unwrap()
617        );
618        assert_eq!(
619            base4_reverse_complement(seq2base_four("ACCGCCT").unwrap(), 7),
620            seq2base_four("AGGCGGT").unwrap()
621        );
622    }
623
624    #[test]
625    fn test_pad_size() {
626        assert_eq!(pad_size("123", 1), "123");
627        assert_eq!(pad_size("123", 3), "123");
628        assert_eq!(pad_size("123", 5), "N123N");
629        assert_eq!(pad_size("123", 7), "NN123NN");
630
631        assert_eq!(pad_size("1234", 2), "1234");
632        assert_eq!(pad_size("1234", 4), "1234");
633        assert_eq!(pad_size("1234", 6), "N1234N");
634        assert_eq!(pad_size("1234", 8), "NN1234NN");
635    }
636}