1extern crate tabfile;
2
3pub mod error;
4
5use std::borrow::Cow;
6use std::convert::TryInto;
7use std::path::Path;
8
9use tabfile::Tabfile;
10
11use error::Error;
12
13pub type Float = f32;
14type TransitionProbabilities = [Float; 3];
15
16#[derive(Copy, Clone, Debug)]
17pub struct IndelTransitionProbabilities {
18 pub outframe: Float, pub inframe: Float, }
21
22impl Default for IndelTransitionProbabilities {
23 fn default() -> Self {
24 Self{outframe: Float::NAN, inframe: Float::NAN}
25 }
26}
27
28#[derive(Debug)]
30pub struct SubstitutionRates {
31 pub a: Float,
32 pub c: Float,
33 pub g: Float,
34 pub t: Float,
35}
36
37impl SubstitutionRates {
38 pub fn by_name(&self, nucleotide: char) -> Float {
56 match nucleotide {
57 'A' | 'a' => self.a,
58 'C' | 'c' => self.c,
59 'G' | 'g' => self.g,
60 'T' | 't' => self.t,
61 _ => panic!("Not a valid nucleotide: {}", nucleotide),
62 }
63 }
64}
65
66pub struct PaPaPred {
89 radius: usize,
90 lookup: Vec<TransitionProbabilities>, }
92
93impl PaPaPred {
94 pub fn new<P: AsRef<Path>>(path: P, min_kmer_size: Option<usize>) -> Result<PaPaPred, Error> {
107 let mut kmer_size = 0; let template: Vec<TransitionProbabilities> = vec![[Float::NAN; 3]]; let mut lookup: Vec<TransitionProbabilities> = Vec::new();
110 for record_result in Tabfile::open(path)?.separator(' ') {
111 let record = record_result?;
112 let tokens = record.fields();
113
114 if tokens[0].starts_with('#') || tokens.len() == 0 {
115 continue }
117
118 let substitution = tokens[0].parse::<Substitution>().unwrap();
119 let uipac_context = pad_size(tokens[1], min_kmer_size.unwrap_or(1));
120 let rate = tokens[2].parse::<Float>().unwrap();
121
122 if kmer_size == 0 {
123 kmer_size = uipac_context.len();
124 if kmer_size % 2 == 0 {
125 return Err(Error::BadKmerSize(kmer_size));
126 }
127 lookup = template.repeat(4usize.pow(kmer_size as u32));
128 } else if uipac_context.len() != kmer_size {
129 let message = format!(
130 "The following line does not have length {}: {}",
131 kmer_size,
132 record.line()
133 );
134 return Err(Error::FileFormat { message });
135 }
136
137 for context in expand_uipac(&uipac_context) {
138 let context_base4 = seq2base_four(&context)?;
139 let index = substitution.transition_probabilities_index();
140 lookup[context_base4][index] = rate;
141 let complement_context = base4_reverse_complement(context_base4, kmer_size);
142 let complement_index = substitution.complement().transition_probabilities_index();
143 lookup[complement_context][complement_index] = rate;
144 }
145 }
146 let radius = kmer_size / 2;
147 Ok(PaPaPred { radius, lookup })
148 }
149
150 pub fn rates(&self, seq: &str) -> Result<SubstitutionRates, Error> {
154 let index = seq2base_four(seq)?;
155 let rates = self.lookup[index];
156 let ref_base: Base = seq
157 .chars()
158 .nth(self.radius)
159 .expect("sequence is too short")
160 .try_into()
161 .unwrap();
162 Ok(match ref_base {
163 Base::A => SubstitutionRates {
164 a: Float::NAN,
165 c: rates[0],
166 g: rates[1],
167 t: rates[2],
168 },
169 Base::C => SubstitutionRates {
170 a: rates[0],
171 c: Float::NAN,
172 g: rates[1],
173 t: rates[2],
174 },
175 Base::G => SubstitutionRates {
176 a: rates[0],
177 c: rates[1],
178 g: Float::NAN,
179 t: rates[2],
180 },
181 Base::T => SubstitutionRates {
182 a: rates[0],
183 c: rates[1],
184 g: rates[2],
185 t: Float::NAN,
186 },
187 })
188 }
189
190 pub fn kmer_size(&self) -> usize {
191 1 + 2 * self.radius
192 }
193
194 pub fn radius(&self) -> usize {
195 self.radius
196 }
197}
198
199pub struct PaPaPredIndel {
200 radius: usize,
201 lookup: Vec<IndelTransitionProbabilities>, }
203
204impl PaPaPredIndel {
205 pub fn new<P: AsRef<Path>>(path: P, min_kmer_size: Option<usize>) -> Result<Self, Error> {
221 let mut kmer_size = 0; let template: Vec<IndelTransitionProbabilities> = vec![IndelTransitionProbabilities::default()]; let mut lookup: Vec<IndelTransitionProbabilities> = Vec::new();
224 for record_result in Tabfile::open(path)?.separator(' ') {
225 let record = record_result?;
226 let tokens = record.fields();
227 if tokens[0].starts_with('#') || tokens.len() == 0 {
228 continue }
230
231 let uipac_context = pad_size(tokens[0], min_kmer_size.unwrap_or(1));
232 let frameshift_probability = tokens[1].parse::<Float>().unwrap();
233 let inframe_probability = tokens[2].parse::<Float>().unwrap();
234
235 if kmer_size == 0 { kmer_size = uipac_context.len();
237 if kmer_size % 2 == 1 {
238 return Err(Error::BadKmerSize(kmer_size));
239 }
240 lookup = template.repeat(4usize.pow(kmer_size as u32));
241 } else if uipac_context.len() != kmer_size {
242 let message = format!(
243 "The following line does not have length {}: {}",
244 kmer_size,
245 record.line()
246 );
247 return Err(Error::FileFormat { message });
248 }
249
250 let rate = IndelTransitionProbabilities{
251 outframe: frameshift_probability,
252 inframe: inframe_probability,
253 };
254
255 for context in expand_uipac(&uipac_context) {
256 let context_base4 = seq2base_four(&context)?;
257 lookup[context_base4] = rate;
258 }
261 }
262 let radius = kmer_size / 2;
263 Ok(Self{radius, lookup})
264 }
265
266 pub fn rates(&self, seq: &str) -> Result<IndelTransitionProbabilities, Error> {
270 let index = seq2base_four(seq)?;
271 let rates = self.lookup[index];
272 Ok(rates)
273 }
274
275 pub fn kmer_size(&self) -> usize {
276 2 * self.radius
277 }
278
279 pub fn radius(&self) -> usize {
280 self.radius
281 }
282}
283
284#[derive(PartialEq)]
285enum Base {
286 A,
287 C,
288 G,
289 T,
290}
291
292impl Base {
293 fn complement(&self) -> Self {
294 match self {
295 Base::A => Base::T,
296 Base::C => Base::G,
297 Base::G => Base::C,
298 Base::T => Base::A,
299 }
300 }
301}
302
303struct Substitution {
304 from: Base,
305 to: Base,
306}
307
308impl Substitution {
309 fn transition_probabilities_index(&self) -> usize {
313 match self.from {
314 Base::A => match self.to {
315 Base::A => panic!("Can't transition from A to A"),
316 Base::C => 0,
317 Base::G => 1,
318 Base::T => 2,
319 },
320 Base::C => match self.to {
321 Base::A => 0,
322 Base::C => panic!("Can't transition from C to C"),
323 Base::G => 1,
324 Base::T => 2,
325 },
326 Base::G => match self.to {
327 Base::A => 0,
328 Base::C => 1,
329 Base::G => panic!("Can't transition from G to G"),
330 Base::T => 2,
331 },
332 Base::T => match self.to {
333 Base::A => 0,
334 Base::C => 1,
335 Base::G => 2,
336 Base::T => panic!("Can't transition from T to T"),
337 },
338 }
339 }
340
341 fn complement(&self) -> Self {
342 Substitution {
343 from: self.from.complement(),
344 to: self.to.complement(),
345 }
346 }
347}
348
349impl std::str::FromStr for Substitution {
350 type Err = Error;
351 fn from_str(s: &str) -> Result<Self, Self::Err> {
352 if s.len() == 4 {
353 if &s[1..3] != "->" {
354 Err(Error::FileFormat {
355 message: "expected '->' between nucleotides".to_string(),
356 })
357 } else {
358 let from = s.chars().next().expect("not empty").try_into()?;
359 let to = s.chars().nth(3).expect("4 chars").try_into()?;
360 Ok(Substitution { from, to })
361 }
362 } else {
363 Err( Error::FileFormat{ message:
364 format!( "The string {} does not confirm to the scheme X->Y (where X and Y are nucleotides", s ) } )
365 }
366 }
367}
368
369impl std::convert::TryFrom<char> for Base {
370 type Error = Error;
371 fn try_from(value: char) -> Result<Self, Self::Error> {
372 Ok(match value {
373 'A' => Base::A,
374 'C' => Base::C,
375 'G' => Base::G,
376 'T' => Base::T,
377 _ => return Err(Error::BadNucleotide(value)),
378 })
379 }
380}
381
382fn seq2base_four(seq: &str) -> Result<usize, Error> {
383 let mut result = 0;
384 for c in seq.chars() {
385 let value = match c {
386 'A' => 0,
387 'C' => 1,
388 'G' => 2,
389 'T' => 3,
390 _ => return Err(Error::BadNucleotide(c))
391 };
392 result <<= 2;
393 result += value;
394 }
395 Ok(result)
396}
397
398fn expand_uipac(seq: &str) -> Vec<String> {
399 if !seq.is_empty() {
401 let (c, rest) = seq.split_at(1);
402 let suffixes = expand_uipac(rest);
403
404 let mut result = Vec::new();
405 let expansions = match c.chars().next().unwrap() {
406 'A' => "A",
407 'C' => "C",
408 'G' => "G",
409 'T' => "T",
410 'R' => "AG",
411 'Y' => "CT",
412 'S' => "GC",
413 'W' => "AT",
414 'K' => "GT",
415 'M' => "AC",
416 'B' => "CGT",
417 'D' => "AGT",
418 'H' => "ACT",
419 'V' => "ACG",
420 'N' => "ACGT",
421 _ => panic!("Invalid code: {}", c),
422 };
423 for expansion in expansions.chars() {
424 if !suffixes.is_empty() {
425 for suffix in &suffixes {
426 let mut s = String::with_capacity(suffix.len() + 1);
427 s.push(expansion);
428 s.push_str(&suffix);
429 result.push(s);
430 }
431 } else {
432 result.push(expansion.to_string());
433 }
434 }
435 result
436 } else {
437 Vec::new()
438 }
439}
440
441fn pad_size(seq: &str, min_size: usize) -> Cow<str> {
442 let len = seq.len();
443 if len >= min_size {
444 Cow::Borrowed(seq)
445 } else {
446 let flanking = "N".repeat((min_size - len) / 2);
447 Cow::Owned(format!("{}{}{}", flanking, seq, flanking))
448 }
449}
450
451fn base4_reverse_complement(mut value: usize, digits: usize) -> usize {
465 let mut result = 0;
466 for _ in 0..digits {
467 result <<= 2; result += 3 - (value & 3); value >>= 2; }
471 result
472}
473
474#[cfg(test)]
475mod tests {
476 use super::*;
477
478 #[test]
479 fn test_seq2base_four() {
480 let fun = seq2base_four;
481 assert_eq!(fun("ACGT").unwrap(), 0 * 64 + 1 * 16 + 2 * 4 + 3 * 1);
482 assert_eq!(fun("TGCA").unwrap(), 3 * 64 + 2 * 16 + 1 * 4 + 0 * 1);
483 assert_eq!(fun("").unwrap(), 0);
484 assert_eq!(fun("TTTG").unwrap(), 3 * 64 + 3 * 16 + 3 * 4 + 2);
485 assert_eq!(fun("AAAAAAAA").unwrap(), 0);
486 assert_eq!(fun("CCCCC").unwrap(), 256 + 64 + 16 + 4 + 1);
487 assert_eq!(
488 fun("ACCGCCT").unwrap(),
489 1 * 1024 + 1 * 256 + 2 * 64 + 1 * 16 + 1 * 4 + 3
490 );
491 assert_eq!(fun("e").unwrap_err(), Error::BadNucleotide('e'));
492 assert_eq!(fun("ACGTmACGT").unwrap_err(), Error::BadNucleotide('m'));
493 }
494
495 #[test]
496 fn test_papapred() {
497 let full_input_file = "test_assets/PaPa_rates.txt";
498 let papa = PaPaPred::new(full_input_file, None).unwrap();
499
500 let rates = papa.rates("ACCGCCT").unwrap();
501 assert_eq!(rates.a, 6.0526200e-04);
502 assert_eq!(rates.c, 2.3540691e-05);
503 assert!(rates.g.is_nan());
504 assert_eq!(rates.t, 3.8108174e-05);
505
506 let rates = papa.rates("AGGCGGT").unwrap();
507 assert_eq!(rates.a, 3.8108174e-05);
508 assert!(rates.c.is_nan());
509 assert_eq!(rates.g, 2.3540691e-05);
510 assert_eq!(rates.t, 6.0526200e-04);
511
512 let rates = papa.rates("AAACAAA").unwrap();
513 assert_eq!(rates.a, 1.94144068e-05);
514 assert!(rates.c.is_nan());
515 assert_eq!(rates.g, 1.33842095e-05);
516 assert_eq!(rates.t, 3.95017705e-05);
517
518 let rates = papa.rates("AAACAAA").unwrap();
519 assert_eq!(rates.a, 1.94144068e-05);
520 assert!(rates.c.is_nan());
521 assert_eq!(rates.g, 1.33842095e-05);
522 assert_eq!(rates.t, 3.95017705e-05);
523
524 let rates = papa.rates("TCATTTT").unwrap();
525 assert_eq!(rates.a, 5.1615812e-06);
526 assert_eq!(rates.c, 2.6344846e-05);
527 assert_eq!(rates.g, 7.5895418e-06);
528 assert!(rates.t.is_nan());
529
530 let rates = papa.rates("TAGCTTA").unwrap();
531 assert_eq!(rates.a, 1.18828875e-05);
532 assert!(rates.c.is_nan());
533 assert_eq!(rates.g, 1.92736370e-05);
534 assert_eq!(rates.t, 5.04661366e-05);
535
536 let rates = papa.rates("GTCGTGC").unwrap();
537 assert_eq!(rates.a, 6.9644750e-04);
538 assert_eq!(rates.c, 2.0450439e-05);
539 assert!(rates.g.is_nan());
540 assert_eq!(rates.t, 1.7836264e-05);
541
542 let rates = papa.rates("GGTACTT").unwrap();
543 assert!(rates.a.is_nan());
544 assert_eq!(rates.c, 7.8399744e-06);
545 assert_eq!(rates.g, 3.8916667e-05);
546 assert_eq!(rates.t, 6.6361449e-06);
547 }
548
549 #[test]
550 fn test_papapred_indel() {
551 let full_input_file = "test_assets/PaPa_rates_indel.txt";
552 let papa = PaPaPredIndel::new(full_input_file, None).unwrap();
553
554 let rates = papa.rates("ACGTAC").unwrap();
555 assert_eq!(rates.outframe, 0.25);
556 assert_eq!(rates.inframe, 0.75);
557
558 let rates = papa.rates("CATCAT").unwrap();
559 assert_eq!(rates.outframe, 0.125);
560 assert_eq!(rates.inframe, 0.5625);
561
562 let rates = papa.rates("TTATGG").unwrap();
563 assert_eq!(rates.outframe, 0.5);
564 assert_eq!(rates.inframe, 0.625);
565
566 let rates = papa.rates("GGCGTA").unwrap();
567 assert_eq!(rates.outframe, 0.3125);
568 assert_eq!(rates.inframe, 0.375);
569
570 let rates = papa.rates("TTTTTT").unwrap();
571 assert!(rates.outframe.is_nan());
572 assert!(rates.inframe.is_nan());
573
574 }
575
576 #[test]
577 fn test_expand_uipac() {
578 let mut foo = expand_uipac("ACGT");
579 assert_eq!(foo.len(), 1);
580 assert_eq!(foo.pop().unwrap(), "ACGT");
581
582 foo = expand_uipac("");
583 assert_eq!(foo.len(), 0);
584
585 foo = expand_uipac("NN");
586 let bar = vec![
587 "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC",
588 "TG", "TT",
589 ];
590 assert_eq!(foo, bar);
591
592 foo = expand_uipac("RYS"); let bar = vec!["ACG", "ACC", "ATG", "ATC", "GCG", "GCC", "GTG", "GTC"];
594 assert_eq!(foo, bar);
595 }
596 #[test]
597 fn test_base4_reverse_complement() {
598 assert_eq!(
599 base4_reverse_complement(seq2base_four("AAGG").unwrap(), 4),
600 seq2base_four("CCTT").unwrap()
601 );
602 assert_eq!(
603 base4_reverse_complement(seq2base_four("GTACTAG").unwrap(), 7),
604 seq2base_four("CTAGTAC").unwrap()
605 );
606 assert_eq!(
607 base4_reverse_complement(seq2base_four("TCGATTGCAT").unwrap(), 10),
608 seq2base_four("ATGCAATCGA").unwrap()
609 );
610 assert_eq!(
611 base4_reverse_complement(0, 6), seq2base_four("TTTTTT").unwrap()
613 );
614 assert_eq!(
615 base4_reverse_complement(seq2base_four("AG").unwrap(), 8),
616 seq2base_four("CTTTTTTT").unwrap()
617 );
618 assert_eq!(
619 base4_reverse_complement(seq2base_four("ACCGCCT").unwrap(), 7),
620 seq2base_four("AGGCGGT").unwrap()
621 );
622 }
623
624 #[test]
625 fn test_pad_size() {
626 assert_eq!(pad_size("123", 1), "123");
627 assert_eq!(pad_size("123", 3), "123");
628 assert_eq!(pad_size("123", 5), "N123N");
629 assert_eq!(pad_size("123", 7), "NN123NN");
630
631 assert_eq!(pad_size("1234", 2), "1234");
632 assert_eq!(pad_size("1234", 4), "1234");
633 assert_eq!(pad_size("1234", 6), "N1234N");
634 assert_eq!(pad_size("1234", 8), "NN1234NN");
635 }
636}