1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
mod amino_acid;
mod cds;
pub mod error;
pub mod interval;
mod mutation;
mod point_mutation_classifier;
mod seq_window_slider;
mod sequence_annotation;

use std::collections::hash_map::HashMap;
use std::convert::{From, TryFrom};
use std::fmt;

use serde_repr::{Deserialize_repr, Serialize_repr};

use pattern_partition_prediction::{PaPaPred, PaPaPredIndel};
use twobit::TwoBitFile;

pub use crate::cds::{Phase, CDS};
use crate::error::{MutexpectError, ParseError};
pub use crate::interval::Interval;
use crate::mutation::{PointMutation, Indel};
pub use crate::point_mutation_classifier::PointMutationClassifier;
use crate::seq_window_slider::SeqWindowSlider;
pub use crate::sequence_annotation::{read_sequence_annotations_from_file, SeqAnnotation, Strand};

pub fn possible_mutations(
    seq: &str,
    seq_annotation: &SeqAnnotation,
    point_mutation_probabilities: &PaPaPred,
    indel_probabilities: &Option<PaPaPredIndel>,
    drop_nan: bool,
    include_intronic : bool,
    include_unknown : bool,
    filter_plof : bool,
) -> Result<Vec<MutationEvent>, MutexpectError> {
    let mut result = Vec::new();
    let seq_flanking_length = point_mutation_probabilities.kmer_size() / 2; // this should round down
    let window_length = 1 + 2 * seq_flanking_length;
    let sequence_genomic_start_position = seq_annotation.range.start;
    let classifier = PointMutationClassifier::new(&seq_annotation, seq_flanking_length);
    let introns = seq_annotation.get_introns();
    let mut introns_iter = introns.iter();
    let mut cds_iter = seq_annotation.coding_sequences.iter();

    let mut current_intron: Option<Interval> = introns_iter.next().copied();
    let mut current_cds: Option<CDS> = cds_iter.next().copied();
    //for ( i, seq_window ) in seq.as_bytes().windows( window_length ).enumerate() {
    for (i, seq_window) in SeqWindowSlider::new(seq, window_length)
        .into_iter()
        .enumerate()
    {
        let genomic_position = sequence_genomic_start_position + i;
        let seq_window_vec: Vec<char> = seq_window.chars().collect();
        let ref_base: Base = seq_window_vec[seq_flanking_length].into();

        let mutation_probability = point_mutation_probabilities
            .rates(seq_window)
            .map_err(|e| {
                error::SequenceError::new(
                    seq_annotation.name.clone(),
                    "Bad sequence (for point mutation)",
                    Some(Box::new(e)),
                )
            })?;


        let overlapping_intron = {
            loop {
                if let Some(intron) = current_intron {
                    if genomic_position < intron.start {
                        break None; // still has some way to go to the next intron
                    } else if genomic_position < intron.stop {
                        break Some(intron); // yay, we are inside the current intron!
                    } else {
                        current_intron = introns_iter.next().copied();
                    }
                } else {
                    break None; // iterator is exhausted
                }
            }
        };

        let overlapping_cds = {
            loop {
                if let Some(cds) = &current_cds {
                    if genomic_position < cds.range.start {
                        break None;
                    } else if genomic_position < cds.range.stop {
                        break Some(cds); // yay, we are inside the current cds!
                    } else {
                        current_cds = cds_iter.next().copied();
                    }
                } else {
                    break None; // iterator is exhausted
                }
            }
        };

        let mut mutation_type = classifier.classify_by_position(
            genomic_position,
            &seq_window_vec,
            &overlapping_intron, // may be none
        );

        for other_nuc in &[Base::A, Base::C, Base::G, Base::T] {
            if *other_nuc == ref_base {
                // NOP
            } else {
                let probability = mutation_probability.by_name(other_nuc.name());
                if drop_nan && probability.is_nan() {
                    // avoid trouble but make it impossible to reconstruct the positions of the point mutations
                    // but that's not something I'm concerned with at the moment anyway.
                    continue;
                }
                if let Some(cds) = overlapping_cds {
                    // if we are in a coding region we need to do additional classifications
                    mutation_type = classifier.classify_coding_mutation(
                        genomic_position,
                        &seq_window_vec,
                        other_nuc.name(),
                        &cds,
                        filter_plof,
                    )
                } // else just recycle the mutation class
                if mutation_type == MutationType::Intronic && !include_intronic {
                    continue;
                }
                if mutation_type == MutationType::Unknown && !include_unknown {
                    continue;
                }
                result.push(MutationEvent {
                    mutation_type,
                    probability,
                });
                // For testing. Would be nice to be able to print this based on a cli argument
                //println!("{} {:?} {:?} {:?}", genomic_position +1 , ref_base, other_nuc, mutation_type)
            }
        }

        // determine indels
        if let Some(p) = indel_probabilities {
            // we don't care about intronic indels because most of them don't matter
            if overlapping_cds.is_some() && overlapping_cds.expect("some").range.start < genomic_position { // needs to be greater, because of anchor base
                let rates = p.rates(&seq_window[0 .. seq_window.len() - 1] ) // 01[2]34 -> 01[]23
                .map_err(|e| {
                    error::SequenceError::new(
                        seq_annotation.name.clone(),
                        "Bad sequence (for indel split)",
                        Some(Box::new(e)),
                    )
                })?;
                result.push(MutationEvent{
                    mutation_type: MutationType::InFrameIndel,
                    probability: rates.inframe,
                });
                if filter_plof {
                    match seq_annotation.strand {
                        Strand::Plus => {
                            if genomic_position >= seq_annotation.get_end_minus50() {
                                continue;
                            }  
                        },      
                        Strand::Minus => {
                            if genomic_position <= seq_annotation.get_end_minus50() {
                                continue;
                            } 
                        },
                    }
                }
                result.push(MutationEvent{
                    mutation_type: MutationType::FrameshiftIndel,
                    probability: rates.outframe,
                });
            }
        }
    }
    Ok(result)
}

pub fn expected_number_of_mutations(
    possible_mutations: &[MutationEvent],
) -> HashMap<MutationType, f64> {
    let mut result = HashMap::new();
    for event in possible_mutations {
        *result.entry(event.mutation_type).or_insert(0.0) += event.probability as f64
    }
    result
}

pub fn observed_number_of_mutations(
    mutations: &[PointMutation], // point mutations within the seq_annotation (the gene/transcript)
    indels: &[Indel],
    seq_annotation: &SeqAnnotation, //only one annotation for the current gene/transcript
    twobit_ref_seq: &TwoBitFile,
    filter_plof : bool,
) -> Result<HashMap<MutationType, usize>, MutexpectError> {
    let mut result = HashMap::new();
    let dna = twobit_ref_seq; // alias

    // we will provide a 5 nucleotide window around the mutation
    let classifier = PointMutationClassifier::new(seq_annotation, 5);

    for mutation in mutations {
        let start = mutation.position - 2; // 2 nucs flanking to the left
        let stop = mutation.position + 3; // 2 nucs flanking to the right
        let nucleotides: Vec<char> = dna
            .sequence(&mutation.chromosome, start, stop)
            .unwrap()
            .chars()
            .collect();
        let mut mut_type = classifier.classify_by_position(
            mutation.position,
            &nucleotides,
            &seq_annotation.find_intron(mutation.position),
        );
        if mut_type == MutationType::Unknown {
            if let Some(cds) = seq_annotation.find_cds(mutation.position) {
                mut_type = classifier.classify_coding_mutation(
                    mutation.position,
                    &nucleotides,
                    mutation.alternative,
                    &cds,
                    filter_plof,
                );
            } // else: not coding
        } // else: just recycle Unknown 3 times
        result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
    }

    for indel in indels {
        let mut mut_type = MutationType::Unknown;
        for intron in seq_annotation.get_introns() {
            if intron.contains(indel.position) {
                mut_type = MutationType::Intronic;
                break;
            }
        }

        if mut_type == MutationType::Unknown { // it's not intronic. It might be a frameshift
            for cds in &seq_annotation.coding_sequences {
                if cds.range.contains(indel.position) {
                    if indel.is_inframe() {
                        mut_type = MutationType::InFrameIndel
                    } else {
                        mut_type = MutationType::FrameshiftIndel;
                        if filter_plof {
                            match seq_annotation.strand {
                                Strand::Plus => {
                                    if indel.position >= seq_annotation.get_end_minus50() {
                                        mut_type = MutationType::Unknown
                                    }  
                                },      
                                Strand::Minus => {
                                    if indel.position <= seq_annotation.get_end_minus50() {
                                        mut_type = MutationType::Unknown
                                    } 
                                },
                            }
                        }
                    }
                    break
                }
            }
        }
        result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
    }
    Ok(result)
}

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Serialize_repr, Deserialize_repr)]
#[repr(u8)]
pub enum MutationType {
    Unknown = 0,
    Synonymous = 1,
    Missense = 2,
    Nonsense = 3,
    StopLoss = 4,
    StartCodon = 5,
    SpliceSite = 6,
    Intronic = 7,
    InFrameIndel = 8,
    FrameshiftIndel = 9,
}

impl MutationType {
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Unknown => "unknown",
            Self::Synonymous => "synonymous",
            Self::Missense => "missense",
            Self::Nonsense => "nonsense",
            Self::StopLoss => "stop_loss",
            Self::StartCodon => "start_codon",
            Self::SpliceSite => "splice_site",
            Self::Intronic => "intronic",
            Self::InFrameIndel => "in_frame_indel",
            Self::FrameshiftIndel => "frameshift_indel",
        }
    }

    pub fn iter() -> MutationTypeIter {
        MutationTypeIter { index: 0 }
    }
}


impl fmt::Display for MutationType {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // I get a stack overflow if I simply do self.to_string(). So I'll implement the string
        // representation directly instead.
        // I need to figure out why that happens though... Seems like it's related to different
        // formatting functions calling each other recursively.
        let string = match self {
            Self::Unknown => "Unknown",
            Self::Synonymous => "Synonymous",
            Self::Missense => "Missense",
            Self::Nonsense => "Nonsense",
            Self::StopLoss => "StopLoss",
            Self::StartCodon => "StartCodon",
            Self::SpliceSite => "SpliceSite",
            Self::Intronic => "Intronic",
            Self::InFrameIndel => "InFrameIndel",
            Self::FrameshiftIndel => "FrameshiftIndel",
        };
        write!(f, "{}", string)
    }
}

impl TryFrom<&str> for MutationType {
    type Error = ParseError;
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        Ok(match s.to_lowercase().as_str() {
            "unknown" => Self::Unknown,
            "synonymous" => Self::Synonymous,
            "missense" => Self::Missense,
            "nonsense" => Self::Nonsense,
            "stoploss" | "stop_loss" => Self::StopLoss,
            "startcodon" | "start_codon" => Self::StartCodon,
            "splicesite" | "splice_site" => Self::SpliceSite,
            "intronic" => Self::Intronic,
            "in_frame" | "in_frame_indel" | "inframe" => Self::InFrameIndel,
            "out_frame" | "out_frame_outdel" | "outframe" => Self::FrameshiftIndel,
            _ => {
                return Err(ParseError::somewhere(
                    "name of mutation type",
                    s.to_string(),
                ))
            }
        })
    }
}

impl From<u8> for MutationType {
    fn from(n: u8) -> Self {
        match n {
            0 => Self::Unknown,
            1 => Self::Synonymous,
            2 => Self::Missense,
            3 => Self::Nonsense,
            4 => Self::StopLoss,
            5 => Self::StartCodon,
            6 => Self::SpliceSite,
            7 => Self::Intronic,
            8 => Self::InFrameIndel,
            9 => Self::FrameshiftIndel,
            _ => Self::Unknown,
        }
    }
}

pub struct MutationTypeIter {
    index: u8,
}

impl std::iter::Iterator for MutationTypeIter {
    type Item = MutationType;

    fn next(&mut self) -> Option<Self::Item> {
        let mut_type: MutationType = self.index.into();
        self.index += 1;
        if mut_type == MutationType::Unknown && self.index != 1 { // 1, because we already incremented
            None
        } else {
            Some(mut_type)
        }
    }
}

#[derive(Clone, Copy, Debug, PartialEq)]
pub struct MutationEvent {
    pub mutation_type: MutationType,
    pub probability: f32,
}

impl MutationEvent {
    pub fn new(mutation_type: MutationType, probability: f32) -> Self {
        Self {
            mutation_type,
            probability,
        }
    }
}

#[derive(PartialEq, Clone, Copy, Debug)]
pub enum Base {
    A,
    C,
    G,
    T,
    N,
}

impl Base {
    pub fn name(&self) -> char {
        match self {
            Base::A => 'A',
            Base::C => 'C',
            Base::G => 'G',
            Base::T => 'T',
            Base::N => 'N',
        }
    }
}

impl From<u8> for Base {
    fn from(byte: u8) -> Self {
        match byte {
            65 | 97 => Self::A,
            67 | 99 => Self::C,
            71 | 103 => Self::G,
            84 | 116 => Self::T,
            78 | 110 => Self::N,
            _ => panic!("Bad nucleotide: {}", byte),
        }
    }
}

impl From<char> for Base {
    fn from(c: char) -> Self {
        match c {
            'A' | 'a' => Self::A,
            'C' | 'c' => Self::C,
            'G' | 'g' => Self::G,
            'T' | 't' => Self::T,
            'N' | 'n' => Self::N,
            _ => panic!("Bad nucleotide: {}", c),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_mutation_type_iterator() {
        let mutation_types = vec![
            MutationType::Unknown,
            MutationType::Synonymous,
            MutationType::Missense,
            MutationType::Nonsense,
            MutationType::StopLoss,
            MutationType::StartCodon,
            MutationType::SpliceSite,
            MutationType::Intronic,
            MutationType::InFrameIndel,
            MutationType::FrameshiftIndel,
        ];
        let mut iter = MutationType::iter();
        for mut_type in mutation_types {
            assert_eq!(mut_type, iter.next().unwrap());
        }
        assert!(iter.next().is_none());
    }
}