1mod amino_acid;
2mod cds;
3pub mod error;
4pub mod interval;
5mod mutation;
6mod point_mutation_classifier;
7mod seq_window_slider;
8mod sequence_annotation;
9
10use std::collections::hash_map::HashMap;
11use std::convert::{From, TryFrom};
12use std::fmt;
13
14use serde_repr::{Deserialize_repr, Serialize_repr};
15
16use pattern_partition_prediction::{PaPaPred, PaPaPredIndel};
17use twobit::TwoBitFile;
18
19pub use crate::cds::{Phase, CDS};
20use crate::error::{MutexpectError, ParseError};
21pub use crate::interval::Interval;
22use crate::mutation::{PointMutation, Indel};
23pub use crate::point_mutation_classifier::PointMutationClassifier;
24use crate::seq_window_slider::SeqWindowSlider;
25pub use crate::sequence_annotation::{read_sequence_annotations_from_file, SeqAnnotation, Strand};
26
27pub fn possible_mutations(
28 seq: &str,
29 seq_annotation: &SeqAnnotation,
30 point_mutation_probabilities: &PaPaPred,
31 indel_probabilities: &Option<PaPaPredIndel>,
32 drop_nan: bool,
33 include_intronic : bool,
34 include_unknown : bool,
35 filter_plof : bool,
36) -> Result<Vec<MutationEvent>, MutexpectError> {
37 let mut result = Vec::new();
38 let seq_flanking_length = point_mutation_probabilities.kmer_size() / 2; let window_length = 1 + 2 * seq_flanking_length;
40 let sequence_genomic_start_position = seq_annotation.range.start;
41 let classifier = PointMutationClassifier::new(&seq_annotation, seq_flanking_length);
42 let introns = seq_annotation.get_introns();
43 let mut introns_iter = introns.iter();
44 let mut cds_iter = seq_annotation.coding_sequences.iter();
45
46 let mut current_intron: Option<Interval> = introns_iter.next().copied();
47 let mut current_cds: Option<CDS> = cds_iter.next().copied();
48 for (i, seq_window) in SeqWindowSlider::new(seq, window_length)
50 .into_iter()
51 .enumerate()
52 {
53 let genomic_position = sequence_genomic_start_position + i;
54 let seq_window_vec: Vec<char> = seq_window.chars().collect();
55 let ref_base: Base = seq_window_vec[seq_flanking_length].into();
56
57 let mutation_probability = point_mutation_probabilities
58 .rates(seq_window)
59 .map_err(|e| {
60 error::SequenceError::new(
61 seq_annotation.name.clone(),
62 "Bad sequence (for point mutation)",
63 Some(Box::new(e)),
64 )
65 })?;
66
67
68 let overlapping_intron = {
69 loop {
70 if let Some(intron) = current_intron {
71 if genomic_position < intron.start {
72 break None; } else if genomic_position < intron.stop {
74 break Some(intron); } else {
76 current_intron = introns_iter.next().copied();
77 }
78 } else {
79 break None; }
81 }
82 };
83
84 let overlapping_cds = {
85 loop {
86 if let Some(cds) = ¤t_cds {
87 if genomic_position < cds.range.start {
88 break None;
89 } else if genomic_position < cds.range.stop {
90 break Some(cds); } else {
92 current_cds = cds_iter.next().copied();
93 }
94 } else {
95 break None; }
97 }
98 };
99
100 let mut mutation_type = classifier.classify_by_position(
101 genomic_position,
102 &seq_window_vec,
103 &overlapping_intron, );
105
106 for other_nuc in &[Base::A, Base::C, Base::G, Base::T] {
107 if *other_nuc == ref_base {
108 } else {
110 let probability = mutation_probability.by_name(other_nuc.name());
111 if drop_nan && probability.is_nan() {
112 continue;
115 }
116 if let Some(cds) = overlapping_cds {
117 mutation_type = classifier.classify_coding_mutation(
119 genomic_position,
120 &seq_window_vec,
121 other_nuc.name(),
122 &cds,
123 filter_plof,
124 )
125 } if mutation_type == MutationType::Intronic && !include_intronic {
127 continue;
128 }
129 if mutation_type == MutationType::Unknown && !include_unknown {
130 continue;
131 }
132 result.push(MutationEvent {
133 mutation_type,
134 probability,
135 });
136 }
139 }
140
141 if let Some(p) = indel_probabilities {
143 if overlapping_cds.is_some() && overlapping_cds.expect("some").range.start < genomic_position { let rates = p.rates(&seq_window[0 .. seq_window.len() - 1] ) .map_err(|e| {
147 error::SequenceError::new(
148 seq_annotation.name.clone(),
149 "Bad sequence (for indel split)",
150 Some(Box::new(e)),
151 )
152 })?;
153 result.push(MutationEvent{
154 mutation_type: MutationType::InFrameIndel,
155 probability: rates.inframe,
156 });
157 if filter_plof {
158 match seq_annotation.strand {
159 Strand::Plus => {
160 if genomic_position >= seq_annotation.get_end_minus50() {
161 continue;
162 }
163 },
164 Strand::Minus => {
165 if genomic_position <= seq_annotation.get_end_minus50() {
166 continue;
167 }
168 },
169 }
170 }
171 result.push(MutationEvent{
172 mutation_type: MutationType::FrameshiftIndel,
173 probability: rates.outframe,
174 });
175 }
176 }
177 }
178 Ok(result)
179}
180
181pub fn expected_number_of_mutations(
182 possible_mutations: &[MutationEvent],
183) -> HashMap<MutationType, f64> {
184 let mut result = HashMap::new();
185 for event in possible_mutations {
186 *result.entry(event.mutation_type).or_insert(0.0) += event.probability as f64
187 }
188 result
189}
190
191pub fn observed_number_of_mutations(
192 mutations: &[PointMutation], indels: &[Indel],
194 seq_annotation: &SeqAnnotation, twobit_ref_seq: &TwoBitFile,
196 filter_plof : bool,
197) -> Result<HashMap<MutationType, usize>, MutexpectError> {
198 let mut result = HashMap::new();
199 let dna = twobit_ref_seq; let classifier = PointMutationClassifier::new(seq_annotation, 5);
203
204 for mutation in mutations {
205 let start = mutation.position - 2; let stop = mutation.position + 3; let nucleotides: Vec<char> = dna
208 .sequence(&mutation.chromosome, start, stop)
209 .unwrap()
210 .chars()
211 .collect();
212 let mut mut_type = classifier.classify_by_position(
213 mutation.position,
214 &nucleotides,
215 &seq_annotation.find_intron(mutation.position),
216 );
217 if mut_type == MutationType::Unknown {
218 if let Some(cds) = seq_annotation.find_cds(mutation.position) {
219 mut_type = classifier.classify_coding_mutation(
220 mutation.position,
221 &nucleotides,
222 mutation.alternative,
223 &cds,
224 filter_plof,
225 );
226 } } result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
229 }
230
231 for indel in indels {
232 let mut mut_type = MutationType::Unknown;
233 for intron in seq_annotation.get_introns() {
234 if intron.contains(indel.position) {
235 mut_type = MutationType::Intronic;
236 break;
237 }
238 }
239
240 if mut_type == MutationType::Unknown { for cds in &seq_annotation.coding_sequences {
242 if cds.range.contains(indel.position) {
243 if indel.is_inframe() {
244 mut_type = MutationType::InFrameIndel
245 } else {
246 mut_type = MutationType::FrameshiftIndel;
247 if filter_plof {
248 match seq_annotation.strand {
249 Strand::Plus => {
250 if indel.position >= seq_annotation.get_end_minus50() {
251 mut_type = MutationType::Unknown
252 }
253 },
254 Strand::Minus => {
255 if indel.position <= seq_annotation.get_end_minus50() {
256 mut_type = MutationType::Unknown
257 }
258 },
259 }
260 }
261 }
262 break
263 }
264 }
265 }
266 result.entry(mut_type).and_modify(|c| *c += 1).or_insert(1);
267 }
268 Ok(result)
269}
270
271#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Serialize_repr, Deserialize_repr)]
272#[repr(u8)]
273pub enum MutationType {
274 Unknown = 0,
275 Synonymous = 1,
276 Missense = 2,
277 Nonsense = 3,
278 StopLoss = 4,
279 StartCodon = 5,
280 SpliceSite = 6,
281 Intronic = 7,
282 InFrameIndel = 8,
283 FrameshiftIndel = 9,
284}
285
286impl MutationType {
287 pub fn as_str(&self) -> &'static str {
288 match self {
289 Self::Unknown => "unknown",
290 Self::Synonymous => "synonymous",
291 Self::Missense => "missense",
292 Self::Nonsense => "nonsense",
293 Self::StopLoss => "stop_loss",
294 Self::StartCodon => "start_codon",
295 Self::SpliceSite => "splice_site",
296 Self::Intronic => "intronic",
297 Self::InFrameIndel => "in_frame_indel",
298 Self::FrameshiftIndel => "frameshift_indel",
299 }
300 }
301
302 pub fn iter() -> MutationTypeIter {
303 MutationTypeIter { index: 0 }
304 }
305}
306
307
308impl fmt::Display for MutationType {
309 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
310 let string = match self {
315 Self::Unknown => "Unknown",
316 Self::Synonymous => "Synonymous",
317 Self::Missense => "Missense",
318 Self::Nonsense => "Nonsense",
319 Self::StopLoss => "StopLoss",
320 Self::StartCodon => "StartCodon",
321 Self::SpliceSite => "SpliceSite",
322 Self::Intronic => "Intronic",
323 Self::InFrameIndel => "InFrameIndel",
324 Self::FrameshiftIndel => "FrameshiftIndel",
325 };
326 write!(f, "{}", string)
327 }
328}
329
330impl TryFrom<&str> for MutationType {
331 type Error = ParseError;
332 fn try_from(s: &str) -> Result<Self, Self::Error> {
333 Ok(match s.to_lowercase().as_str() {
334 "unknown" => Self::Unknown,
335 "synonymous" => Self::Synonymous,
336 "missense" => Self::Missense,
337 "nonsense" => Self::Nonsense,
338 "stoploss" | "stop_loss" => Self::StopLoss,
339 "startcodon" | "start_codon" => Self::StartCodon,
340 "splicesite" | "splice_site" => Self::SpliceSite,
341 "intronic" => Self::Intronic,
342 "in_frame" | "in_frame_indel" | "inframe" => Self::InFrameIndel,
343 "out_frame" | "out_frame_outdel" | "outframe" => Self::FrameshiftIndel,
344 _ => {
345 return Err(ParseError::somewhere(
346 "name of mutation type",
347 s.to_string(),
348 ))
349 }
350 })
351 }
352}
353
354impl From<u8> for MutationType {
355 fn from(n: u8) -> Self {
356 match n {
357 0 => Self::Unknown,
358 1 => Self::Synonymous,
359 2 => Self::Missense,
360 3 => Self::Nonsense,
361 4 => Self::StopLoss,
362 5 => Self::StartCodon,
363 6 => Self::SpliceSite,
364 7 => Self::Intronic,
365 8 => Self::InFrameIndel,
366 9 => Self::FrameshiftIndel,
367 _ => Self::Unknown,
368 }
369 }
370}
371
372pub struct MutationTypeIter {
373 index: u8,
374}
375
376impl std::iter::Iterator for MutationTypeIter {
377 type Item = MutationType;
378
379 fn next(&mut self) -> Option<Self::Item> {
380 let mut_type: MutationType = self.index.into();
381 self.index += 1;
382 if mut_type == MutationType::Unknown && self.index != 1 { None
384 } else {
385 Some(mut_type)
386 }
387 }
388}
389
390#[derive(Clone, Copy, Debug, PartialEq)]
391pub struct MutationEvent {
392 pub mutation_type: MutationType,
393 pub probability: f32,
394}
395
396impl MutationEvent {
397 pub fn new(mutation_type: MutationType, probability: f32) -> Self {
398 Self {
399 mutation_type,
400 probability,
401 }
402 }
403}
404
405#[derive(PartialEq, Clone, Copy, Debug)]
406pub enum Base {
407 A,
408 C,
409 G,
410 T,
411 N,
412}
413
414impl Base {
415 pub fn name(&self) -> char {
416 match self {
417 Base::A => 'A',
418 Base::C => 'C',
419 Base::G => 'G',
420 Base::T => 'T',
421 Base::N => 'N',
422 }
423 }
424}
425
426impl From<u8> for Base {
427 fn from(byte: u8) -> Self {
428 match byte {
429 65 | 97 => Self::A,
430 67 | 99 => Self::C,
431 71 | 103 => Self::G,
432 84 | 116 => Self::T,
433 78 | 110 => Self::N,
434 _ => panic!("Bad nucleotide: {}", byte),
435 }
436 }
437}
438
439impl From<char> for Base {
440 fn from(c: char) -> Self {
441 match c {
442 'A' | 'a' => Self::A,
443 'C' | 'c' => Self::C,
444 'G' | 'g' => Self::G,
445 'T' | 't' => Self::T,
446 'N' | 'n' => Self::N,
447 _ => panic!("Bad nucleotide: {}", c),
448 }
449 }
450}
451
452#[cfg(test)]
453mod tests {
454 use super::*;
455
456 #[test]
457 fn test_mutation_type_iterator() {
458 let mutation_types = vec![
459 MutationType::Unknown,
460 MutationType::Synonymous,
461 MutationType::Missense,
462 MutationType::Nonsense,
463 MutationType::StopLoss,
464 MutationType::StartCodon,
465 MutationType::SpliceSite,
466 MutationType::Intronic,
467 MutationType::InFrameIndel,
468 MutationType::FrameshiftIndel,
469 ];
470 let mut iter = MutationType::iter();
471 for mut_type in mutation_types {
472 assert_eq!(mut_type, iter.next().unwrap());
473 }
474 assert!(iter.next().is_none());
475 }
476}