extern crate tabfile;
pub mod error;
use std::borrow::Cow;
use std::convert::TryInto;
use std::path::Path;
use tabfile::Tabfile;
use error::Error;
pub type Float = f32;
type TransitionProbabilities = [Float; 3];
#[derive(Copy, Clone, Debug)]
pub struct IndelTransitionProbabilities {
pub outframe: Float,
pub inframe: Float,
}
impl Default for IndelTransitionProbabilities {
fn default() -> Self {
Self{outframe: Float::NAN, inframe: Float::NAN}
}
}
#[derive(Debug)]
pub struct SubstitutionRates {
pub a: Float,
pub c: Float,
pub g: Float,
pub t: Float,
}
impl SubstitutionRates {
pub fn by_name(&self, nucleotide: char) -> Float {
match nucleotide {
'A' | 'a' => self.a,
'C' | 'c' => self.c,
'G' | 'g' => self.g,
'T' | 't' => self.t,
_ => panic!("Not a valid nucleotide: {}", nucleotide),
}
}
}
pub struct PaPaPred {
radius: usize,
lookup: Vec<TransitionProbabilities>,
}
impl PaPaPred {
pub fn new<P: AsRef<Path>>(path: P, min_kmer_size: Option<usize>) -> Result<PaPaPred, Error> {
let mut kmer_size = 0;
let template: Vec<TransitionProbabilities> = vec![[Float::NAN; 3]];
let mut lookup: Vec<TransitionProbabilities> = Vec::new();
for record_result in Tabfile::open(path)?.separator(' ') {
let record = record_result?;
let tokens = record.fields();
let substitution = tokens[0].parse::<Substitution>().unwrap();
let uipac_context = pad_size(tokens[1], min_kmer_size.unwrap_or(1));
let rate = tokens[2].parse::<Float>().unwrap();
if kmer_size == 0 {
kmer_size = uipac_context.len();
if kmer_size % 2 == 0 {
return Err(Error::BadKmerSize(kmer_size));
}
lookup = template.repeat(4usize.pow(kmer_size as u32));
} else if uipac_context.len() != kmer_size {
let message = format!(
"The following line does not have length {}: {}",
kmer_size,
record.line()
);
return Err(Error::FileFormat { message });
}
for context in expand_uipac(&uipac_context) {
let context_base4 = seq2base_four(&context)?;
let index = substitution.transition_probabilities_index();
lookup[context_base4][index] = rate;
let complement_context = base4_reverse_complement(context_base4, kmer_size);
let complement_index = substitution.complement().transition_probabilities_index();
lookup[complement_context][complement_index] = rate;
}
}
let radius = kmer_size / 2;
Ok(PaPaPred { radius, lookup })
}
pub fn rates(&self, seq: &str) -> Result<SubstitutionRates, Error> {
let index = seq2base_four(seq)?;
let rates = self.lookup[index];
let ref_base: Base = seq
.chars()
.nth(self.radius)
.expect("sequence is too short")
.try_into()
.unwrap();
Ok(match ref_base {
Base::A => SubstitutionRates {
a: Float::NAN,
c: rates[0],
g: rates[1],
t: rates[2],
},
Base::C => SubstitutionRates {
a: rates[0],
c: Float::NAN,
g: rates[1],
t: rates[2],
},
Base::G => SubstitutionRates {
a: rates[0],
c: rates[1],
g: Float::NAN,
t: rates[2],
},
Base::T => SubstitutionRates {
a: rates[0],
c: rates[1],
g: rates[2],
t: Float::NAN,
},
})
}
pub fn kmer_size(&self) -> usize {
1 + 2 * self.radius
}
pub fn radius(&self) -> usize {
self.radius
}
}
pub struct PaPaPredIndel {
radius: usize,
lookup: Vec<IndelTransitionProbabilities>,
}
impl PaPaPredIndel {
pub fn new<P: AsRef<Path>>(path: P, min_kmer_size: Option<usize>) -> Result<Self, Error> {
let mut kmer_size = 0;
let template: Vec<IndelTransitionProbabilities> = vec![IndelTransitionProbabilities::default()];
let mut lookup: Vec<IndelTransitionProbabilities> = Vec::new();
for record_result in Tabfile::open(path)?.separator(' ') {
let record = record_result?;
let tokens = record.fields();
let uipac_context = pad_size(tokens[0], min_kmer_size.unwrap_or(1));
let frameshift_probability = tokens[1].parse::<Float>().unwrap();
let inframe_probability = tokens[2].parse::<Float>().unwrap();
if kmer_size == 0 {
kmer_size = uipac_context.len();
if kmer_size % 2 == 1 {
return Err(Error::BadKmerSize(kmer_size));
}
lookup = template.repeat(4usize.pow(kmer_size as u32));
} else if uipac_context.len() != kmer_size {
let message = format!(
"The following line does not have length {}: {}",
kmer_size,
record.line()
);
return Err(Error::FileFormat { message });
}
let rate = IndelTransitionProbabilities{
outframe: frameshift_probability,
inframe: inframe_probability,
};
for context in expand_uipac(&uipac_context) {
let context_base4 = seq2base_four(&context)?;
lookup[context_base4] = rate;
let complement_context = base4_reverse_complement(context_base4, kmer_size);
lookup[complement_context] = rate;
}
}
let radius = kmer_size / 2;
Ok(Self{radius, lookup})
}
pub fn rates(&self, seq: &str) -> Result<IndelTransitionProbabilities, Error> {
let index = seq2base_four(seq)?;
let rates = self.lookup[index];
Ok(rates)
}
pub fn kmer_size(&self) -> usize {
2 * self.radius
}
pub fn radius(&self) -> usize {
self.radius
}
}
#[derive(PartialEq)]
enum Base {
A,
C,
G,
T,
}
impl Base {
fn complement(&self) -> Self {
match self {
Base::A => Base::T,
Base::C => Base::G,
Base::G => Base::C,
Base::T => Base::A,
}
}
}
struct Substitution {
from: Base,
to: Base,
}
impl Substitution {
fn transition_probabilities_index(&self) -> usize {
match self.from {
Base::A => match self.to {
Base::A => panic!("Can't transition from A to A"),
Base::C => 0,
Base::G => 1,
Base::T => 2,
},
Base::C => match self.to {
Base::A => 0,
Base::C => panic!("Can't transition from C to C"),
Base::G => 1,
Base::T => 2,
},
Base::G => match self.to {
Base::A => 0,
Base::C => 1,
Base::G => panic!("Can't transition from G to G"),
Base::T => 2,
},
Base::T => match self.to {
Base::A => 0,
Base::C => 1,
Base::G => 2,
Base::T => panic!("Can't transition from T to T"),
},
}
}
fn complement(&self) -> Self {
Substitution {
from: self.from.complement(),
to: self.to.complement(),
}
}
}
impl std::str::FromStr for Substitution {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
if s.len() == 4 {
if &s[1..3] != "->" {
Err(Error::FileFormat {
message: "expected '->' between nucleotides".to_string(),
})
} else {
let from = s.chars().next().expect("not empty").try_into()?;
let to = s.chars().nth(3).expect("4 chars").try_into()?;
Ok(Substitution { from, to })
}
} else {
Err( Error::FileFormat{ message:
format!( "The string {} does not confirm to the scheme X->Y (where X and Y are nucleotides", s ) } )
}
}
}
impl std::convert::TryFrom<char> for Base {
type Error = Error;
fn try_from(value: char) -> Result<Self, Self::Error> {
Ok(match value {
'A' => Base::A,
'C' => Base::C,
'G' => Base::G,
'T' => Base::T,
_ => return Err(Error::BadNucleotide(value)),
})
}
}
fn seq2base_four(seq: &str) -> Result<usize, Error> {
let mut result = 0;
for c in seq.chars() {
let value = match c {
'A' => 0,
'C' => 1,
'G' => 2,
'T' => 3,
_ => return Err(Error::BadNucleotide(c))
};
result <<= 2;
result += value;
}
Ok(result)
}
fn expand_uipac(seq: &str) -> Vec<String> {
if !seq.is_empty() {
let (c, rest) = seq.split_at(1);
let suffixes = expand_uipac(rest);
let mut result = Vec::new();
let expansions = match c.chars().next().unwrap() {
'A' => "A",
'C' => "C",
'G' => "G",
'T' => "T",
'R' => "AG",
'Y' => "CT",
'S' => "GC",
'W' => "AT",
'K' => "GT",
'M' => "AC",
'B' => "CGT",
'D' => "AGT",
'H' => "ACT",
'V' => "ACG",
'N' => "ACGT",
_ => panic!("Invalid code: {}", c),
};
for expansion in expansions.chars() {
if !suffixes.is_empty() {
for suffix in &suffixes {
let mut s = String::with_capacity(suffix.len() + 1);
s.push(expansion);
s.push_str(&suffix);
result.push(s);
}
} else {
result.push(expansion.to_string());
}
}
result
} else {
Vec::new()
}
}
fn pad_size(seq: &str, min_size: usize) -> Cow<str> {
if min_size % 2 == 0 {
panic!(
"Only odd numbers are allowed. Provided number: {}",
min_size
);
}
let len = seq.len();
if len >= min_size {
Cow::Borrowed(seq)
} else {
let flanking = "N".repeat((min_size - len) / 2);
Cow::Owned(format!("{}{}{}", flanking, seq, flanking))
}
}
fn base4_reverse_complement(mut value: usize, digits: usize) -> usize {
let mut result = 0;
for _ in 0..digits {
result <<= 2;
result += 3 - (value & 3);
value >>= 2;
}
result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_seq2base_four() {
let fun = seq2base_four;
assert_eq!(fun("ACGT").unwrap(), 0 * 64 + 1 * 16 + 2 * 4 + 3 * 1);
assert_eq!(fun("TGCA").unwrap(), 3 * 64 + 2 * 16 + 1 * 4 + 0 * 1);
assert_eq!(fun("").unwrap(), 0);
assert_eq!(fun("TTTG").unwrap(), 3 * 64 + 3 * 16 + 3 * 4 + 2);
assert_eq!(fun("AAAAAAAA").unwrap(), 0);
assert_eq!(fun("CCCCC").unwrap(), 256 + 64 + 16 + 4 + 1);
assert_eq!(
fun("ACCGCCT").unwrap(),
1 * 1024 + 1 * 256 + 2 * 64 + 1 * 16 + 1 * 4 + 3
);
assert_eq!(fun("e").unwrap_err(), Error::BadNucleotide('e'));
assert_eq!(fun("ACGTmACGT").unwrap_err(), Error::BadNucleotide('m'));
}
#[test]
fn test_papapred() {
let full_input_file = "test_assets/PaPa_rates.txt";
let papa = PaPaPred::new(full_input_file, None).unwrap();
let rates = papa.rates("ACCGCCT").unwrap();
assert_eq!(rates.a, 6.0526200e-04);
assert_eq!(rates.c, 2.3540691e-05);
assert!(rates.g.is_nan());
assert_eq!(rates.t, 3.8108174e-05);
let rates = papa.rates("AGGCGGT").unwrap();
assert_eq!(rates.a, 3.8108174e-05);
assert!(rates.c.is_nan());
assert_eq!(rates.g, 2.3540691e-05);
assert_eq!(rates.t, 6.0526200e-04);
let rates = papa.rates("AAACAAA").unwrap();
assert_eq!(rates.a, 1.94144068e-05);
assert!(rates.c.is_nan());
assert_eq!(rates.g, 1.33842095e-05);
assert_eq!(rates.t, 3.95017705e-05);
let rates = papa.rates("AAACAAA").unwrap();
assert_eq!(rates.a, 1.94144068e-05);
assert!(rates.c.is_nan());
assert_eq!(rates.g, 1.33842095e-05);
assert_eq!(rates.t, 3.95017705e-05);
let rates = papa.rates("TCATTTT").unwrap();
assert_eq!(rates.a, 5.1615812e-06);
assert_eq!(rates.c, 2.6344846e-05);
assert_eq!(rates.g, 7.5895418e-06);
assert!(rates.t.is_nan());
let rates = papa.rates("TAGCTTA").unwrap();
assert_eq!(rates.a, 1.18828875e-05);
assert!(rates.c.is_nan());
assert_eq!(rates.g, 1.92736370e-05);
assert_eq!(rates.t, 5.04661366e-05);
let rates = papa.rates("GTCGTGC").unwrap();
assert_eq!(rates.a, 6.9644750e-04);
assert_eq!(rates.c, 2.0450439e-05);
assert!(rates.g.is_nan());
assert_eq!(rates.t, 1.7836264e-05);
let rates = papa.rates("GGTACTT").unwrap();
assert!(rates.a.is_nan());
assert_eq!(rates.c, 7.8399744e-06);
assert_eq!(rates.g, 3.8916667e-05);
assert_eq!(rates.t, 6.6361449e-06);
}
#[test]
fn test_papapred_indel() {
let full_input_file = "test_assets/PaPa_rates_indel.txt";
let papa = PaPaPredIndel::new(full_input_file, None).unwrap();
let rates = papa.rates("ACGTAC").unwrap();
assert_eq!(rates.outframe, 0.25);
assert_eq!(rates.inframe, 0.75);
let rates = papa.rates("CATCAT").unwrap();
assert_eq!(rates.outframe, 0.125);
assert_eq!(rates.inframe, 0.5625);
let rates = papa.rates("TTATGG").unwrap();
assert_eq!(rates.outframe, 0.5);
assert_eq!(rates.inframe, 0.625);
let rates = papa.rates("GGCGTA").unwrap();
assert_eq!(rates.outframe, 0.3125);
assert_eq!(rates.inframe, 0.375);
let rates = papa.rates("TTTTTT").unwrap();
assert!(rates.outframe.is_nan());
assert!(rates.inframe.is_nan());
}
#[test]
fn test_expand_uipac() {
let mut foo = expand_uipac("ACGT");
assert_eq!(foo.len(), 1);
assert_eq!(foo.pop().unwrap(), "ACGT");
foo = expand_uipac("");
assert_eq!(foo.len(), 0);
foo = expand_uipac("NN");
let bar = vec![
"AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC",
"TG", "TT",
];
assert_eq!(foo, bar);
foo = expand_uipac("RYS");
let bar = vec!["ACG", "ACC", "ATG", "ATC", "GCG", "GCC", "GTG", "GTC"];
assert_eq!(foo, bar);
}
#[test]
fn test_base4_reverse_complement() {
assert_eq!(
base4_reverse_complement(seq2base_four("AAGG").unwrap(), 4),
seq2base_four("CCTT").unwrap()
);
assert_eq!(
base4_reverse_complement(seq2base_four("GTACTAG").unwrap(), 7),
seq2base_four("CTAGTAC").unwrap()
);
assert_eq!(
base4_reverse_complement(seq2base_four("TCGATTGCAT").unwrap(), 10),
seq2base_four("ATGCAATCGA").unwrap()
);
assert_eq!(
base4_reverse_complement(0, 6),
seq2base_four("TTTTTT").unwrap()
);
assert_eq!(
base4_reverse_complement(seq2base_four("AG").unwrap(), 8),
seq2base_four("CTTTTTTT").unwrap()
);
assert_eq!(
base4_reverse_complement(seq2base_four("ACCGCCT").unwrap(), 7),
seq2base_four("AGGCGGT").unwrap()
);
}
#[test]
fn test_pad_size() {
assert_eq!(pad_size("123", 1), "123");
assert_eq!(pad_size("123", 3), "123");
assert_eq!(pad_size("123", 5), "N123N");
assert_eq!(pad_size("123", 7), "NN123NN");
}
}