mutexpect/
lib.rs

1mod amino_acid;
2mod cds;
3pub mod error;
4pub mod interval;
5mod mutation;
6mod point_mutation_classifier;
7mod seq_window_slider;
8mod sequence_annotation;
9
10use std::collections::hash_map::HashMap;
11use std::convert::{From, TryFrom};
12use std::fmt;
13
14use serde_repr::{Deserialize_repr, Serialize_repr};
15
16use pattern_partition_prediction::{PaPaPred, PaPaPredIndel};
17use twobit::TwoBitFile;
18
19pub use crate::cds::{Phase, CDS};
20use crate::error::{MutexpectError, ParseError};
21pub use crate::interval::Interval;
22use crate::mutation::{PointMutation, Indel};
23pub use crate::point_mutation_classifier::PointMutationClassifier;
24use crate::seq_window_slider::SeqWindowSlider;
25pub use crate::sequence_annotation::{read_sequence_annotations_from_file, SeqAnnotation, Strand};
26
27pub fn possible_mutations(
28    seq: &str,
29    seq_annotation: &SeqAnnotation,
30    point_mutation_probabilities: &PaPaPred,
31    indel_probabilities: &Option<PaPaPredIndel>,
32    drop_nan: bool,
33    include_intronic : bool,
34    include_unknown : bool,
35    filter_plof : bool,
36) -> Result<Vec<MutationEvent>, MutexpectError> {
37    let mut result = Vec::new();
38    let seq_flanking_length = point_mutation_probabilities.kmer_size() / 2; // this should round down
39    let window_length = 1 + 2 * seq_flanking_length;
40    let sequence_genomic_start_position = seq_annotation.range.start;
41    let classifier = PointMutationClassifier::new(&seq_annotation, seq_flanking_length);
42    let introns = seq_annotation.get_introns();
43    let mut introns_iter = introns.iter();
44    let mut cds_iter = seq_annotation.coding_sequences.iter();
45
46    let mut current_intron: Option<Interval> = introns_iter.next().copied();
47    let mut current_cds: Option<CDS> = cds_iter.next().copied();
48    //for ( i, seq_window ) in seq.as_bytes().windows( window_length ).enumerate() {
49    for (i, seq_window) in SeqWindowSlider::new(seq, window_length)
50        .into_iter()
51        .enumerate()
52    {
53        let genomic_position = sequence_genomic_start_position + i;
54        let seq_window_vec: Vec<char> = seq_window.chars().collect();
55        let ref_base: Base = seq_window_vec[seq_flanking_length].into();
56
57        let mutation_probability = point_mutation_probabilities
58            .rates(seq_window)
59            .map_err(|e| {
60                error::SequenceError::new(
61                    seq_annotation.name.clone(),
62                    "Bad sequence (for point mutation)",
63                    Some(Box::new(e)),
64                )
65            })?;
66
67
68        let overlapping_intron = {
69            loop {
70                if let Some(intron) = current_intron {
71                    if genomic_position < intron.start {
72                        break None; // still has some way to go to the next intron
73                    } else if genomic_position < intron.stop {
74                        break Some(intron); // yay, we are inside the current intron!
75                    } else {
76                        current_intron = introns_iter.next().copied();
77                    }
78                } else {
79                    break None; // iterator is exhausted
80                }
81            }
82        };
83
84        let overlapping_cds = {
85            loop {
86                if let Some(cds) = &current_cds {
87                    if genomic_position < cds.range.start {
88                        break None;
89                    } else if genomic_position < cds.range.stop {
90                        break Some(cds); // yay, we are inside the current cds!
91                    } else {
92                        current_cds = cds_iter.next().copied();
93                    }
94                } else {
95                    break None; // iterator is exhausted
96                }
97            }
98        };
99
100        let mut mutation_type = classifier.classify_by_position(
101            genomic_position,
102            &seq_window_vec,
103            &overlapping_intron, // may be none
104        );
105
106        for other_nuc in &[Base::A, Base::C, Base::G, Base::T] {
107            if *other_nuc == ref_base {
108                // NOP
109            } else {
110                let probability = mutation_probability.by_name(other_nuc.name());
111                if drop_nan && probability.is_nan() {
112                    // avoid trouble but make it impossible to reconstruct the positions of the point mutations
113                    // but that's not something I'm concerned with at the moment anyway.
114                    continue;
115                }
116                if let Some(cds) = overlapping_cds {
117                    // if we are in a coding region we need to do additional classifications
118                    mutation_type = classifier.classify_coding_mutation(
119                        genomic_position,
120                        &seq_window_vec,
121                        other_nuc.name(),
122                        &cds,
123                        filter_plof,
124                    )
125                } // else just recycle the mutation class
126                if mutation_type == MutationType::Intronic && !include_intronic {
127                    continue;
128                }
129                if mutation_type == MutationType::Unknown && !include_unknown {
130                    continue;
131                }
132                result.push(MutationEvent {
133                    mutation_type,
134                    probability,
135                });
136                // For testing. Would be nice to be able to print this based on a cli argument
137                //println!("{} {:?} {:?} {:?}", genomic_position +1 , ref_base, other_nuc, mutation_type)
138            }
139        }
140
141        // determine indels
142        if let Some(p) = indel_probabilities {
143            // we don't care about intronic indels because most of them don't matter
144            if overlapping_cds.is_some() && overlapping_cds.expect("some").range.start < genomic_position { // needs to be greater, because of anchor base
145                let rates = p.rates(&seq_window[0 .. seq_window.len() - 1] ) // 01[2]34 -> 01[]23
146                .map_err(|e| {
147                    error::SequenceError::new(
148                        seq_annotation.name.clone(),
149                        "Bad sequence (for indel split)",
150                        Some(Box::new(e)),
151                    )
152                })?;
153                result.push(MutationEvent{
154                    mutation_type: MutationType::InFrameIndel,
155                    probability: rates.inframe,
156                });
157                if filter_plof {
158                    match seq_annotation.strand {
159                        Strand::Plus => {
160                            if genomic_position >= seq_annotation.get_end_minus50() {
161                                continue;
162                            }  
163                        },      
164                        Strand::Minus => {
165                            if genomic_position <= seq_annotation.get_end_minus50() {
166                                continue;
167                            } 
168                        },
169                    }
170                }
171                result.push(MutationEvent{
172                    mutation_type: MutationType::FrameshiftIndel,
173                    probability: rates.outframe,
174                });
175            }
176        }
177    }
178    Ok(result)
179}
180
181pub fn expected_number_of_mutations(
182    possible_mutations: &[MutationEvent],
183) -> HashMap<MutationType, f64> {
184    let mut result = HashMap::new();
185    for event in possible_mutations {
186        *result.entry(event.mutation_type).or_insert(0.0) += event.probability as f64
187    }
188    result
189}
190
191pub fn observed_number_of_mutations(
192    mutations: &[PointMutation], // point mutations within the seq_annotation (the gene/transcript)
193    indels: &[Indel],
194    seq_annotation: &SeqAnnotation, //only one annotation for the current gene/transcript
195    twobit_ref_seq: &TwoBitFile,
196    filter_plof : bool,
197) -> Result<HashMap<MutationType, usize>, MutexpectError> {
198    let mut result = HashMap::new();
199    let dna = twobit_ref_seq; // alias
200
201    // we will provide a 5 nucleotide window around the mutation
202    let classifier = PointMutationClassifier::new(seq_annotation, 5);
203
204    for mutation in mutations {
205        let start = mutation.position - 2; // 2 nucs flanking to the left
206        let stop = mutation.position + 3; // 2 nucs flanking to the right
207        let nucleotides: Vec<char> = dna
208            .sequence(&mutation.chromosome, start, stop)
209            .unwrap()
210            .chars()
211            .collect();
212        let mut mut_type = classifier.classify_by_position(
213            mutation.position,
214            &nucleotides,
215            &seq_annotation.find_intron(mutation.position),
216        );
217        if mut_type == MutationType::Unknown {
218            if let Some(cds) = seq_annotation.find_cds(mutation.position) {
219                mut_type = classifier.classify_coding_mutation(
220                    mutation.position,
221                    &nucleotides,
222                    mutation.alternative,
223                    &cds,
224                    filter_plof,
225                );
226            } // else: not coding
227        } // else: just recycle Unknown 3 times
228        result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
229    }
230
231    for indel in indels {
232        let mut mut_type = MutationType::Unknown;
233        for intron in seq_annotation.get_introns() {
234            if intron.contains(indel.position) {
235                mut_type = MutationType::Intronic;
236                break;
237            }
238        }
239
240        if mut_type == MutationType::Unknown { // it's not intronic. It might be a frameshift
241            for cds in &seq_annotation.coding_sequences {
242                if cds.range.contains(indel.position) {
243                    if indel.is_inframe() {
244                        mut_type = MutationType::InFrameIndel
245                    } else {
246                        mut_type = MutationType::FrameshiftIndel;
247                        if filter_plof {
248                            match seq_annotation.strand {
249                                Strand::Plus => {
250                                    if indel.position >= seq_annotation.get_end_minus50() {
251                                        mut_type = MutationType::Unknown
252                                    }  
253                                },      
254                                Strand::Minus => {
255                                    if indel.position <= seq_annotation.get_end_minus50() {
256                                        mut_type = MutationType::Unknown
257                                    } 
258                                },
259                            }
260                        }
261                    }
262                    break
263                }
264            }
265        }
266        result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
267    }
268    Ok(result)
269}
270
271#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Serialize_repr, Deserialize_repr)]
272#[repr(u8)]
273pub enum MutationType {
274    Unknown = 0,
275    Synonymous = 1,
276    Missense = 2,
277    Nonsense = 3,
278    StopLoss = 4,
279    StartCodon = 5,
280    SpliceSite = 6,
281    Intronic = 7,
282    InFrameIndel = 8,
283    FrameshiftIndel = 9,
284}
285
286impl MutationType {
287    pub fn as_str(&self) -> &'static str {
288        match self {
289            Self::Unknown => "unknown",
290            Self::Synonymous => "synonymous",
291            Self::Missense => "missense",
292            Self::Nonsense => "nonsense",
293            Self::StopLoss => "stop_loss",
294            Self::StartCodon => "start_codon",
295            Self::SpliceSite => "splice_site",
296            Self::Intronic => "intronic",
297            Self::InFrameIndel => "in_frame_indel",
298            Self::FrameshiftIndel => "frameshift_indel",
299        }
300    }
301
302    pub fn iter() -> MutationTypeIter {
303        MutationTypeIter { index: 0 }
304    }
305}
306
307
308impl fmt::Display for MutationType {
309    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
310        // I get a stack overflow if I simply do self.to_string(). So I'll implement the string
311        // representation directly instead.
312        // I need to figure out why that happens though... Seems like it's related to different
313        // formatting functions calling each other recursively.
314        let string = match self {
315            Self::Unknown => "Unknown",
316            Self::Synonymous => "Synonymous",
317            Self::Missense => "Missense",
318            Self::Nonsense => "Nonsense",
319            Self::StopLoss => "StopLoss",
320            Self::StartCodon => "StartCodon",
321            Self::SpliceSite => "SpliceSite",
322            Self::Intronic => "Intronic",
323            Self::InFrameIndel => "InFrameIndel",
324            Self::FrameshiftIndel => "FrameshiftIndel",
325        };
326        write!(f, "{}", string)
327    }
328}
329
330impl TryFrom<&str> for MutationType {
331    type Error = ParseError;
332    fn try_from(s: &str) -> Result<Self, Self::Error> {
333        Ok(match s.to_lowercase().as_str() {
334            "unknown" => Self::Unknown,
335            "synonymous" => Self::Synonymous,
336            "missense" => Self::Missense,
337            "nonsense" => Self::Nonsense,
338            "stoploss" | "stop_loss" => Self::StopLoss,
339            "startcodon" | "start_codon" => Self::StartCodon,
340            "splicesite" | "splice_site" => Self::SpliceSite,
341            "intronic" => Self::Intronic,
342            "in_frame" | "in_frame_indel" | "inframe" => Self::InFrameIndel,
343            "out_frame" | "out_frame_outdel" | "outframe" => Self::FrameshiftIndel,
344            _ => {
345                return Err(ParseError::somewhere(
346                    "name of mutation type",
347                    s.to_string(),
348                ))
349            }
350        })
351    }
352}
353
354impl From<u8> for MutationType {
355    fn from(n: u8) -> Self {
356        match n {
357            0 => Self::Unknown,
358            1 => Self::Synonymous,
359            2 => Self::Missense,
360            3 => Self::Nonsense,
361            4 => Self::StopLoss,
362            5 => Self::StartCodon,
363            6 => Self::SpliceSite,
364            7 => Self::Intronic,
365            8 => Self::InFrameIndel,
366            9 => Self::FrameshiftIndel,
367            _ => Self::Unknown,
368        }
369    }
370}
371
372pub struct MutationTypeIter {
373    index: u8,
374}
375
376impl std::iter::Iterator for MutationTypeIter {
377    type Item = MutationType;
378
379    fn next(&mut self) -> Option<Self::Item> {
380        let mut_type: MutationType = self.index.into();
381        self.index += 1;
382        if mut_type == MutationType::Unknown && self.index != 1 { // 1, because we already incremented
383            None
384        } else {
385            Some(mut_type)
386        }
387    }
388}
389
390#[derive(Clone, Copy, Debug, PartialEq)]
391pub struct MutationEvent {
392    pub mutation_type: MutationType,
393    pub probability: f32,
394}
395
396impl MutationEvent {
397    pub fn new(mutation_type: MutationType, probability: f32) -> Self {
398        Self {
399            mutation_type,
400            probability,
401        }
402    }
403}
404
405#[derive(PartialEq, Clone, Copy, Debug)]
406pub enum Base {
407    A,
408    C,
409    G,
410    T,
411    N,
412}
413
414impl Base {
415    pub fn name(&self) -> char {
416        match self {
417            Base::A => 'A',
418            Base::C => 'C',
419            Base::G => 'G',
420            Base::T => 'T',
421            Base::N => 'N',
422        }
423    }
424}
425
426impl From<u8> for Base {
427    fn from(byte: u8) -> Self {
428        match byte {
429            65 | 97 => Self::A,
430            67 | 99 => Self::C,
431            71 | 103 => Self::G,
432            84 | 116 => Self::T,
433            78 | 110 => Self::N,
434            _ => panic!("Bad nucleotide: {}", byte),
435        }
436    }
437}
438
439impl From<char> for Base {
440    fn from(c: char) -> Self {
441        match c {
442            'A' | 'a' => Self::A,
443            'C' | 'c' => Self::C,
444            'G' | 'g' => Self::G,
445            'T' | 't' => Self::T,
446            'N' | 'n' => Self::N,
447            _ => panic!("Bad nucleotide: {}", c),
448        }
449    }
450}
451
452#[cfg(test)]
453mod tests {
454    use super::*;
455
456    #[test]
457    fn test_mutation_type_iterator() {
458        let mutation_types = vec![
459            MutationType::Unknown,
460            MutationType::Synonymous,
461            MutationType::Missense,
462            MutationType::Nonsense,
463            MutationType::StopLoss,
464            MutationType::StartCodon,
465            MutationType::SpliceSite,
466            MutationType::Intronic,
467            MutationType::InFrameIndel,
468            MutationType::FrameshiftIndel,
469        ];
470        let mut iter = MutationType::iter();
471        for mut_type in mutation_types {
472            assert_eq!(mut_type, iter.next().unwrap());
473        }
474        assert!(iter.next().is_none());
475    }
476}