mod amino_acid;
mod cds;
pub mod error;
pub mod interval;
mod mutation;
mod point_mutation_classifier;
mod seq_window_slider;
mod sequence_annotation;
use std::collections::hash_map::HashMap;
use std::convert::{From, TryFrom};
use std::fmt;
use serde_repr::{Deserialize_repr, Serialize_repr};
use pattern_partition_prediction::{PaPaPred, PaPaPredIndel};
use twobit::TwoBitFile;
pub use crate::cds::{Phase, CDS};
use crate::error::{MutexpectError, ParseError};
pub use crate::interval::Interval;
use crate::mutation::{PointMutation, Indel};
pub use crate::point_mutation_classifier::PointMutationClassifier;
use crate::seq_window_slider::SeqWindowSlider;
pub use crate::sequence_annotation::{read_sequence_annotations_from_file, SeqAnnotation, Strand};
pub fn possible_mutations(
seq: &str,
seq_annotation: &SeqAnnotation,
point_mutation_probabilities: &PaPaPred,
indel_probabilities: &Option<PaPaPredIndel>,
drop_nan: bool,
include_intronic : bool,
include_unknown : bool,
filter_plof : bool,
) -> Result<Vec<MutationEvent>, MutexpectError> {
let mut result = Vec::new();
let seq_flanking_length = point_mutation_probabilities.kmer_size() / 2; let window_length = 1 + 2 * seq_flanking_length;
let sequence_genomic_start_position = seq_annotation.range.start;
let classifier = PointMutationClassifier::new(&seq_annotation, seq_flanking_length);
let introns = seq_annotation.get_introns();
let mut introns_iter = introns.iter();
let mut cds_iter = seq_annotation.coding_sequences.iter();
let mut current_intron: Option<Interval> = introns_iter.next().copied();
let mut current_cds: Option<CDS> = cds_iter.next().copied();
for (i, seq_window) in SeqWindowSlider::new(seq, window_length)
.into_iter()
.enumerate()
{
let genomic_position = sequence_genomic_start_position + i;
let seq_window_vec: Vec<char> = seq_window.chars().collect();
let ref_base: Base = seq_window_vec[seq_flanking_length].into();
let mutation_probability = point_mutation_probabilities
.rates(seq_window)
.map_err(|e| {
error::SequenceError::new(
seq_annotation.name.clone(),
"Bad sequence (for point mutation)",
Some(Box::new(e)),
)
})?;
let overlapping_intron = {
loop {
if let Some(intron) = current_intron {
if genomic_position < intron.start {
break None; } else if genomic_position < intron.stop {
break Some(intron); } else {
current_intron = introns_iter.next().copied();
}
} else {
break None; }
}
};
let overlapping_cds = {
loop {
if let Some(cds) = ¤t_cds {
if genomic_position < cds.range.start {
break None;
} else if genomic_position < cds.range.stop {
break Some(cds); } else {
current_cds = cds_iter.next().copied();
}
} else {
break None; }
}
};
let mut mutation_type = classifier.classify_by_position(
genomic_position,
&seq_window_vec,
&overlapping_intron, );
for other_nuc in &[Base::A, Base::C, Base::G, Base::T] {
if *other_nuc == ref_base {
} else {
let probability = mutation_probability.by_name(other_nuc.name());
if drop_nan && probability.is_nan() {
continue;
}
if let Some(cds) = overlapping_cds {
mutation_type = classifier.classify_coding_mutation(
genomic_position,
&seq_window_vec,
other_nuc.name(),
&cds,
filter_plof,
)
} if mutation_type == MutationType::Intronic && !include_intronic {
continue;
}
if mutation_type == MutationType::Unknown && !include_unknown {
continue;
}
result.push(MutationEvent {
mutation_type,
probability,
});
}
}
if let Some(p) = indel_probabilities {
if overlapping_cds.is_some() && overlapping_cds.expect("some").range.start < genomic_position { let rates = p.rates(&seq_window[0 .. seq_window.len() - 1] ) .map_err(|e| {
error::SequenceError::new(
seq_annotation.name.clone(),
"Bad sequence (for indel split)",
Some(Box::new(e)),
)
})?;
result.push(MutationEvent{
mutation_type: MutationType::InFrameIndel,
probability: rates.inframe,
});
if filter_plof {
match seq_annotation.strand {
Strand::Plus => {
if genomic_position >= seq_annotation.get_end_minus50() {
continue;
}
},
Strand::Minus => {
if genomic_position <= seq_annotation.get_end_minus50() {
continue;
}
},
}
}
result.push(MutationEvent{
mutation_type: MutationType::FrameshiftIndel,
probability: rates.outframe,
});
}
}
}
Ok(result)
}
pub fn expected_number_of_mutations(
possible_mutations: &[MutationEvent],
) -> HashMap<MutationType, f64> {
let mut result = HashMap::new();
for event in possible_mutations {
*result.entry(event.mutation_type).or_insert(0.0) += event.probability as f64
}
result
}
pub fn observed_number_of_mutations(
mutations: &[PointMutation], indels: &[Indel],
seq_annotation: &SeqAnnotation, twobit_ref_seq: &TwoBitFile,
filter_plof : bool,
) -> Result<HashMap<MutationType, usize>, MutexpectError> {
let mut result = HashMap::new();
let dna = twobit_ref_seq;
let classifier = PointMutationClassifier::new(seq_annotation, 5);
for mutation in mutations {
let start = mutation.position - 2; let stop = mutation.position + 3; let nucleotides: Vec<char> = dna
.sequence(&mutation.chromosome, start, stop)
.unwrap()
.chars()
.collect();
let mut mut_type = classifier.classify_by_position(
mutation.position,
&nucleotides,
&seq_annotation.find_intron(mutation.position),
);
if mut_type == MutationType::Unknown {
if let Some(cds) = seq_annotation.find_cds(mutation.position) {
mut_type = classifier.classify_coding_mutation(
mutation.position,
&nucleotides,
mutation.alternative,
&cds,
filter_plof,
);
} } result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
}
for indel in indels {
let mut mut_type = MutationType::Unknown;
for intron in seq_annotation.get_introns() {
if intron.contains(indel.position) {
mut_type = MutationType::Intronic;
break;
}
}
if mut_type == MutationType::Unknown { for cds in &seq_annotation.coding_sequences {
if cds.range.contains(indel.position) {
if indel.is_inframe() {
mut_type = MutationType::InFrameIndel
} else {
mut_type = MutationType::FrameshiftIndel;
if filter_plof {
match seq_annotation.strand {
Strand::Plus => {
if indel.position >= seq_annotation.get_end_minus50() {
mut_type = MutationType::Unknown
}
},
Strand::Minus => {
if indel.position <= seq_annotation.get_end_minus50() {
mut_type = MutationType::Unknown
}
},
}
}
}
break
}
}
}
result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
}
Ok(result)
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Serialize_repr, Deserialize_repr)]
#[repr(u8)]
pub enum MutationType {
Unknown = 0,
Synonymous = 1,
Missense = 2,
Nonsense = 3,
StopLoss = 4,
StartCodon = 5,
SpliceSite = 6,
Intronic = 7,
InFrameIndel = 8,
FrameshiftIndel = 9,
}
impl MutationType {
pub fn as_str(&self) -> &'static str {
match self {
Self::Unknown => "unknown",
Self::Synonymous => "synonymous",
Self::Missense => "missense",
Self::Nonsense => "nonsense",
Self::StopLoss => "stop_loss",
Self::StartCodon => "start_codon",
Self::SpliceSite => "splice_site",
Self::Intronic => "intronic",
Self::InFrameIndel => "in_frame_indel",
Self::FrameshiftIndel => "frameshift_indel",
}
}
pub fn iter() -> MutationTypeIter {
MutationTypeIter { index: 0 }
}
}
impl fmt::Display for MutationType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let string = match self {
Self::Unknown => "Unknown",
Self::Synonymous => "Synonymous",
Self::Missense => "Missense",
Self::Nonsense => "Nonsense",
Self::StopLoss => "StopLoss",
Self::StartCodon => "StartCodon",
Self::SpliceSite => "SpliceSite",
Self::Intronic => "Intronic",
Self::InFrameIndel => "InFrameIndel",
Self::FrameshiftIndel => "FrameshiftIndel",
};
write!(f, "{}", string)
}
}
impl TryFrom<&str> for MutationType {
type Error = ParseError;
fn try_from(s: &str) -> Result<Self, Self::Error> {
Ok(match s.to_lowercase().as_str() {
"unknown" => Self::Unknown,
"synonymous" => Self::Synonymous,
"missense" => Self::Missense,
"nonsense" => Self::Nonsense,
"stoploss" | "stop_loss" => Self::StopLoss,
"startcodon" | "start_codon" => Self::StartCodon,
"splicesite" | "splice_site" => Self::SpliceSite,
"intronic" => Self::Intronic,
"in_frame" | "in_frame_indel" | "inframe" => Self::InFrameIndel,
"out_frame" | "out_frame_outdel" | "outframe" => Self::FrameshiftIndel,
_ => {
return Err(ParseError::somewhere(
"name of mutation type",
s.to_string(),
))
}
})
}
}
impl From<u8> for MutationType {
fn from(n: u8) -> Self {
match n {
0 => Self::Unknown,
1 => Self::Synonymous,
2 => Self::Missense,
3 => Self::Nonsense,
4 => Self::StopLoss,
5 => Self::StartCodon,
6 => Self::SpliceSite,
7 => Self::Intronic,
8 => Self::InFrameIndel,
9 => Self::FrameshiftIndel,
_ => Self::Unknown,
}
}
}
pub struct MutationTypeIter {
index: u8,
}
impl std::iter::Iterator for MutationTypeIter {
type Item = MutationType;
fn next(&mut self) -> Option<Self::Item> {
let mut_type: MutationType = self.index.into();
self.index += 1;
if mut_type == MutationType::Unknown && self.index != 1 { None
} else {
Some(mut_type)
}
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct MutationEvent {
pub mutation_type: MutationType,
pub probability: f32,
}
impl MutationEvent {
pub fn new(mutation_type: MutationType, probability: f32) -> Self {
Self {
mutation_type,
probability,
}
}
}
#[derive(PartialEq, Clone, Copy, Debug)]
pub enum Base {
A,
C,
G,
T,
N,
}
impl Base {
pub fn name(&self) -> char {
match self {
Base::A => 'A',
Base::C => 'C',
Base::G => 'G',
Base::T => 'T',
Base::N => 'N',
}
}
}
impl From<u8> for Base {
fn from(byte: u8) -> Self {
match byte {
65 | 97 => Self::A,
67 | 99 => Self::C,
71 | 103 => Self::G,
84 | 116 => Self::T,
78 | 110 => Self::N,
_ => panic!("Bad nucleotide: {}", byte),
}
}
}
impl From<char> for Base {
fn from(c: char) -> Self {
match c {
'A' | 'a' => Self::A,
'C' | 'c' => Self::C,
'G' | 'g' => Self::G,
'T' | 't' => Self::T,
'N' | 'n' => Self::N,
_ => panic!("Bad nucleotide: {}", c),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_mutation_type_iterator() {
let mutation_types = vec![
MutationType::Unknown,
MutationType::Synonymous,
MutationType::Missense,
MutationType::Nonsense,
MutationType::StopLoss,
MutationType::StartCodon,
MutationType::SpliceSite,
MutationType::Intronic,
MutationType::InFrameIndel,
MutationType::FrameshiftIndel,
];
let mut iter = MutationType::iter();
for mut_type in mutation_types {
assert_eq!(mut_type, iter.next().unwrap());
}
assert!(iter.next().is_none());
}
}