lorikeet_genome/evolve/
codon_structs.rs

1use bio::alphabets::dna;
2use bio_types::strand::Strand;
3use itertools::{izip, Itertools};
4use rust_htslib::bcf::Read;
5use std::collections::HashMap;
6
7use crate::model::variant_context::VariantContext;
8use crate::model::variant_context_utils::VariantContextUtils;
9use crate::reference::reference_reader::ReferenceReader;
10use crate::utils::utils::{mean, std_deviation};
11
12#[allow(dead_code)]
13pub struct GeneInfo {
14    name: String,
15    locations: HashMap<u32, (u64, u64)>,
16    coverages: HashMap<u32, f64>,
17}
18
19pub struct CodonTable {
20    aminos: HashMap<Vec<u8>, char>,
21    starts: HashMap<Vec<u8>, char>,
22    ns_sites: HashMap<Vec<u8>, f64>,
23}
24
25pub struct NCBITable {
26    aas: String,
27    starts: String,
28    base1: String,
29    base2: String,
30    base3: String,
31}
32
33impl NCBITable {
34    // get translation tables in NCBI format
35    // Kind of lazy storing and then converting every time but would take way too much time
36    // to write out each table into CodonTable format by hand
37    fn get_translation_table(table_id: usize) -> NCBITable {
38        match table_id {
39            1 => NCBITable {
40                aas: "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG".to_owned(),
41                starts: "---M------**--*----M---------------M----------------------------"
42                    .to_owned(),
43                base1: "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"
44                    .to_owned(),
45                base2: "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"
46                    .to_owned(),
47                base3: "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
48                    .to_owned(),
49            },
50            11 => NCBITable {
51                aas: "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG".to_owned(),
52                starts: "---M------**--*----M------------MMMM---------------M------------"
53                    .to_owned(),
54                base1: "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"
55                    .to_owned(),
56                base2: "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"
57                    .to_owned(),
58                base3: "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
59                    .to_owned(),
60            },
61            _ => {
62                panic!("Translation table {} not yet implemented", table_id);
63            }
64        }
65    }
66}
67
68impl CodonTable {
69    pub fn setup() -> CodonTable {
70        CodonTable {
71            aminos: HashMap::new(),
72            starts: HashMap::new(),
73            ns_sites: HashMap::new(),
74        }
75    }
76}
77
78pub trait Translations {
79    fn get_codon_table(&mut self, table_id: usize);
80    fn find_mutations(
81        &self,
82        gene: &bio::io::gff::Record,
83        variants: &mut rust_htslib::bcf::IndexedReader,
84        reference_reader: &mut ReferenceReader,
85        ref_idx: usize,
86        n_samples: usize,
87        qual_by_depth_filter: f64,
88        qual_threshold: f64,
89        depth_per_sample_filter: i64,
90    ) -> (Vec<usize>, Vec<usize>, Vec<f64>);
91    fn calculate_gene_coverage(
92        &self,
93        gene: &bio::io::gff::Record,
94        depth_of_contig: &Vec<i32>,
95    ) -> (f32, f32);
96}
97
98impl Translations for CodonTable {
99    fn get_codon_table(&mut self, table_id: usize) {
100        // Convert NCBI format to something more usable
101        let ncbi_format = NCBITable::get_translation_table(table_id);
102        let mut amino_hash = HashMap::new();
103        let mut start_hash = HashMap::new();
104        for (aa, s, b1, b2, b3) in izip!(
105            ncbi_format.aas.as_bytes(),
106            ncbi_format.starts.as_bytes(),
107            ncbi_format.base1.as_bytes(),
108            ncbi_format.base2.as_bytes(),
109            ncbi_format.base3.as_bytes()
110        ) {
111            let codon = vec![*b1, *b2, *b3];
112            amino_hash.insert(codon.clone(), *aa as char);
113            start_hash.insert(codon, *s as char);
114        }
115        // debug!("Amino hash {:?}", amino_hash);
116        self.aminos = amino_hash;
117        self.starts = start_hash;
118
119        let nucleotides: Vec<u8> = vec![65, 84, 67, 71];
120        // debug!("nucleotides: {} ", String::from_utf8_lossy(&nucleotides));
121        for (codon, _aa) in self.aminos.iter() {
122            let mut n = 0.0;
123            for (pos, cod) in codon.iter().enumerate() {
124                for nuc in nucleotides.iter() {
125                    if cod == nuc {
126                        // Ignore this nucleotide
127                        continue;
128                    } else {
129                        let mut codon_shift = codon.clone();
130                        // Change one position of the codon
131                        codon_shift[pos] = nuc.clone();
132
133                        if self.aminos[codon] != self.aminos[&codon_shift] {
134                            // This change can cause a non-synonymous mutation
135                            n += 1.0 / 3.0;
136                        }
137                    }
138                }
139            }
140            self.ns_sites.insert(codon.clone(), n as f64);
141        }
142    }
143
144    /// Finds all associate mutations within a gene region in the form of a gff record
145    /// If there are associated variants in this gene attempts to calculate dN/dS ratios for
146    /// the given sample
147    /// Returns a tuple of the number of frameshifts and dN/dS ratio
148    /// TODO: Refactor so calculates for all samples at once without having to re-read the variant
149    ///       region each time.
150    fn find_mutations(
151        &self,
152        gene: &bio::io::gff::Record,
153        variants: &mut rust_htslib::bcf::IndexedReader,
154        reference_reader: &mut ReferenceReader,
155        ref_idx: usize,
156        n_samples: usize,
157        qual_by_depth_filter: f64,
158        qual_threshold: f64,
159        depth_per_sample_filter: i64,
160    ) -> (Vec<usize>, Vec<usize>, Vec<f64>) {
161        match gene.strand() {
162            Some(strand) => {
163                let contig_name = format!(
164                    "{}~{}",
165                    reference_reader.retrieve_reference_stem(ref_idx),
166                    gene.seqname()
167                ); // create concatenated contig name format
168                let rid = if let Some(rid) =
169                    VariantContext::get_contig_vcf_tid(variants.header(), contig_name.as_bytes())
170                {
171                    rid
172                } else {
173                    // no variants on this contig so skip
174                    return (vec![0; n_samples], vec![0; n_samples], vec![1.0; n_samples]);
175                };
176
177                reference_reader
178                    .fetch_contig_from_reference_by_contig_name(contig_name.as_bytes(), ref_idx);
179                reference_reader.read_sequence_to_vec();
180                // bio::gff documentation says start and end positions are 1-based, so we minus 1
181                // Additionally, end position is non-inclusive so do minus 1
182                let start = *gene.start() as usize - 1;
183                let end = *gene.end() as usize - 1;
184                // debug!("Start {} End {}", start, end);
185                // fetch variants in this window
186                match variants.fetch(rid, start as u64, Some(end as u64)) {
187                    Ok(_) => {}
188                    Err(_e) => {
189                        return (vec![0; n_samples], vec![0; n_samples], vec![1.0; n_samples]);
190                    }
191                };
192
193                // VariantContext::process_vcf_in_region()
194
195                let frame: usize = match gene.frame().parse() {
196                    Ok(frame_val) => frame_val,
197                    Err(_) => 0,
198                };
199                let gene_sequence = &reference_reader.current_sequence[start..=end];
200                // debug!("Gene Seq {:?}", String::from_utf8_lossy(gene_sequence));
201                let codon_sequence = get_codons(&gene_sequence, frame, strand);
202                // debug!("Codon Sequence {:?}", codon_sequence);
203
204                // Calculate N and S
205                let mut big_n: Vec<f64> = vec![0.0; n_samples];
206                let mut big_s: Vec<f64> = vec![0.0; n_samples];
207                for codon in codon_sequence.iter() {
208                    if std::str::from_utf8(codon)
209                        .expect("Unable to interpret codon")
210                        .contains('N')
211                        || codon.len() != 3
212                    {
213                        continue;
214                    } else {
215                        match self.ns_sites.get(codon) {
216                            Some(n) => {
217                                for sample_idx in 0..n_samples {
218                                    big_n[sample_idx] += n;
219                                    big_s[sample_idx] += 3.0 - n;
220                                }
221                            }
222                            None => {
223                                // debug!(
224                                    // "Codon {:?} not found",
225                                    // std::str::from_utf8(codon.as_slice())
226                                // );
227                                continue;
228                            }
229                        }
230                    }
231                }
232
233                // debug!("getting ns_sites N {:?} S {:?}", &big_n, &big_s);
234
235                // Create Nd and Sd values
236                let mut big_nd: Vec<f64> = vec![0.0; n_samples];
237                let mut big_sd: Vec<f64> = vec![0.0; n_samples];
238
239                // dN/dS calculations when using NGS reads outlined here:
240                // http://bioinformatics.cvr.ac.uk/blog/calculating-dnds-for-ngs-datasets/
241                // Note, we don't normalize for depth here and instead just use Jukes-Cantor model
242                let mut codon: &Vec<u8> = &codon_sequence[0];
243                let mut previous_codon: &Vec<u8> = &codon_sequence[0];
244                let mut new_codons: Vec<Vec<Vec<u8>>> = vec![vec![]; n_samples];
245                let mut positionals = vec![0; n_samples];
246                let mut total_variants = vec![0; n_samples];
247                let mut gene_cursor = 0; // index inside gene, i.e, position of variant
248                let mut reference_cursor = start; // index of reference sequence
249                let mut frameshifts = vec![0; n_samples];
250                let mut snps = vec![0; n_samples];
251                let mut old_codon_idx = None;
252                for record in variants.records().into_iter() {
253                    match record {
254                        Ok(mut record) => {
255                            match VariantContext::from_vcf_record(&mut record, true) {
256                                Some(mut context) => {
257                                    let passes = VariantContextUtils::passes_thresholds(
258                                        &mut context,
259                                        qual_by_depth_filter,
260                                        qual_threshold,
261                                    );
262
263                                    if !passes {
264                                        continue;
265                                    }
266
267                                    // gained bases is the difference between current and previous
268                                    // position
269                                    let gained_bases =
270                                        context.loc.start.saturating_sub(reference_cursor);
271                                    // update previous position to current position
272                                    gene_cursor += gained_bases;
273                                    // add gained bases to reference cursor
274                                    reference_cursor = context.loc.start;
275
276                                    // index of current codon in gene
277                                    // number of codons is gene size divided by three
278                                    let codon_idx = gene_cursor / 3 as usize;
279                                    let process_previous_codon = match old_codon_idx {
280                                        Some(old_idx) => {
281                                            if old_idx != codon_idx {
282                                                old_codon_idx = Some(codon_idx);
283                                                previous_codon = codon;
284                                                true
285                                            } else {
286                                                false
287                                            }
288                                        }
289                                        None => {
290                                            old_codon_idx = Some(codon_idx);
291                                            false
292                                        }
293                                    };
294
295                                    // index of current position in the current codon
296                                    // all codons have size == 3
297                                    let codon_cursor = gene_cursor % 3;
298                                    // debug!(
299                                    //     "reference cursor {} gained_bases {} Gene cursor {} codon idx {} codon cursor {}",
300                                    //     reference_cursor, gained_bases, gene_cursor, codon_idx, codon_cursor
301                                    // );
302
303                                    if codon_idx >= codon_sequence.len() {
304                                        continue;
305                                    }
306                                    codon = &codon_sequence[codon_idx];
307
308                                    if std::str::from_utf8(codon.as_slice())
309                                        .expect("Unable to interpret codon")
310                                        .contains('N')
311                                        || codon.len() != 3
312                                    {
313                                        continue;
314                                    }
315
316                                    for sample_idx in 0..n_samples {
317                                        if process_previous_codon {
318                                            for new_codon in new_codons[sample_idx].iter() {
319                                                // debug!("Positionals start {:?}", &positionals);
320                                                // debug!(
321                                                //     "Sample {} new_codon {:?} ref_codon {:?}",
322                                                //     sample_idx, new_codon, &previous_codon
323                                                // );
324                                                if (previous_codon.len() == 3)
325                                                    && (new_codon.len() == 3)
326                                                    && (previous_codon != new_codon)
327                                                {
328                                                    // get indices of different locations
329                                                    let mut pos = 0 as usize;
330                                                    let mut diffs = vec![];
331                                                    for (c1, c2) in
332                                                        previous_codon.iter().zip(new_codon.iter())
333                                                    {
334                                                        if c1 != c2 {
335                                                            diffs.push(pos);
336                                                        }
337                                                        pos += 1;
338                                                    }
339                                                    total_variants[sample_idx] += diffs.len();
340                                                    // get permuations of positions
341                                                    let permutations: Vec<Vec<&usize>> = diffs
342                                                        .iter()
343                                                        .permutations(diffs.len())
344                                                        .collect();
345
346                                                    // calculate synonymous and non-synonymous for each permutation
347                                                    let mut ns = 0;
348                                                    let mut ss = 0;
349                                                    // debug!(
350                                                    //     "positional difference {:?} permutations {:?}",
351                                                    //     diffs,
352                                                    //     permutations.len()
353                                                    // );
354                                                    positionals[sample_idx] += permutations.len();
355                                                    for permutation in permutations.iter() {
356                                                        let mut shifting = codon.clone();
357                                                        let mut old_shift;
358                                                        for pos in permutation {
359                                                            // Check if one amino acid change causes an syn or non-syn
360                                                            old_shift = shifting.clone();
361                                                            shifting[**pos] = new_codon[**pos];
362                                                            // debug!("Old shift {:?}, new {:?}", old_shift, shifting);
363                                                            if self.aminos[&old_shift]
364                                                                != self.aminos[&shifting]
365                                                            {
366                                                                ns += 1;
367                                                            } else {
368                                                                ss += 1;
369                                                            }
370                                                        }
371                                                    }
372                                                    let nd = ns as f64 / permutations.len() as f64;
373                                                    let sd = ss as f64 / permutations.len() as f64;
374                                                    big_nd[sample_idx] += nd;
375                                                    big_sd[sample_idx] += sd;
376                                                }
377                                                // debug!("Positionals end {:?}", &positionals);
378                                                // debug!(
379                                                //     "Codon idx {} gene cursor {} codons {} gene length {} new_codons {:?}",
380                                                //     codon_idx,
381                                                //     gene_cursor,
382                                                //     codon_sequence.len(),
383                                                //     gene_sequence.len(),
384                                                //     new_codons[sample_idx]
385                                                // );
386                                            }
387                                            // begin working on new codon
388                                            new_codons[sample_idx] = Vec::new();
389                                        }
390
391                                        let which_are_present = context
392                                            .alleles_present_in_sample(
393                                                sample_idx,
394                                                depth_per_sample_filter as i32,
395                                            );
396
397                                        if !which_are_present[1..].iter().any(|v| *v) {
398                                            continue; // no alt alleles are present
399                                        }
400
401                                        let ref_allele = context.get_reference();
402                                        let mut snp_count = 0;
403
404                                        // iterate through non reference alleles
405                                        // if those alleles are present in this sample then
406                                        // increment appropriate values
407                                        for (allele_index, allele) in
408                                            context.get_alternate_alleles_with_index()
409                                        {
410                                            if new_codons[sample_idx].len() == 0 {
411                                                new_codons[sample_idx] = vec![codon.clone()];
412                                            }
413                                            if allele.bases.len() > 1
414                                                || allele.bases.len() != ref_allele.bases.len()
415                                            {
416                                                if which_are_present[allele_index] {
417                                                    frameshifts[sample_idx] += 1;
418                                                }
419                                                continue;
420                                            }
421
422                                            if which_are_present[allele_index] {
423                                                snps[sample_idx] += 1;
424                                                if snp_count >= 1 {
425                                                    // Create a copy of codon up to this point
426                                                    // Not sure if reusing previous variants is bad, but
427                                                    // not doing so can cause randomness to dN/dS values
428                                                    // debug!(
429                                                    //     "before pushing {:?}",
430                                                    //     new_codons[sample_idx]
431                                                    // );
432                                                    new_codons[sample_idx].push(codon.clone());
433                                                    // debug!(
434                                                    //     "Gene cursor {} sample idx {} snp count {} codon cursor {}",
435                                                    //     gene_cursor, sample_idx, snp_count, codon_cursor
436                                                    // );
437                                                    // debug!(
438                                                    //     "new_codons length {}",
439                                                    //     new_codons[sample_idx].len()
440                                                    // );
441                                                    new_codons[sample_idx][snp_count]
442                                                        [codon_cursor] = allele.bases[0];
443
444                                                    // debug!(
445                                                    //     "multi snp codon {:?}",
446                                                    //     new_codons[sample_idx]
447                                                    // );
448                                                } else {
449                                                    for var_idx in 0..new_codons[sample_idx].len() {
450                                                        // debug!(
451                                                        //     "s {} v {} c {}. Size {}",
452                                                        //     sample_idx,
453                                                        //     var_idx,
454                                                        //     codon_cursor,
455                                                        //     new_codons[sample_idx].len()
456                                                        // );
457                                                        new_codons[sample_idx][var_idx]
458                                                            [codon_cursor] = allele.bases[0];
459                                                    }
460                                                }
461                                                snp_count += 1;
462                                            }
463                                        }
464                                    }
465                                }
466                                None => continue,
467                            }
468                        }
469                        Err(_) => {
470                            // Skip error record
471                            continue;
472                        }
473                    }
474                }
475
476                let mut dnds_values = vec![1.0; n_samples];
477                for sample_idx in 0..n_samples {
478                    // debug!(
479                    //     "Nd {} N {}, Sd {} S {} total permutations {} variants {}",
480                    //     big_nd[sample_idx],
481                    //     big_n[sample_idx],
482                    //     big_sd[sample_idx],
483                    //     big_s[sample_idx],
484                    //     positionals[sample_idx],
485                    //     total_variants[sample_idx]
486                    // );
487                    let mut pn = big_nd[sample_idx] / big_n[sample_idx];
488                    let mut ps = big_sd[sample_idx] / big_s[sample_idx];
489                    // debug!("pn {} ps {}", pn, ps);
490                    // Weirdly in the Jukes-Cantor model if pn or ps are 0.75 then the nat log does not resolve
491                    // No one talks about this in the literature for some reason
492                    if pn == 0.75 {
493                        pn = 0.7499
494                    }
495                    if ps == 0.75 {
496                        ps = 0.7499
497                    }
498                    let d_n = -(3.0 / 4.0) * (1.0 - (4.0 * pn) / 3.0).ln();
499                    let d_s = -(3.0 / 4.0) * (1.0 - (4.0 * ps) / 3.0).ln();
500                    // debug!("dN {} dS {}", d_n, d_s);
501                    let mut dnds = d_n / d_s;
502
503                    // negative dnds values make no sense, but occur nonetheless
504                    // Just make them 0.0
505                    if dnds.is_nan() || d_s - 0.0 <= f64::EPSILON {
506                        dnds = 1.
507                    } else if dnds.is_sign_negative() {
508                        dnds = 0.0
509                    }
510                    dnds_values[sample_idx] = dnds
511                }
512
513                return (snps, frameshifts, dnds_values);
514            }
515            _ => return (vec![0; n_samples], vec![0; n_samples], vec![1.0; n_samples]),
516        }
517    }
518
519    fn calculate_gene_coverage(
520        &self,
521        gene: &bio::io::gff::Record,
522        depth_of_contig: &Vec<i32>,
523    ) -> (f32, f32) {
524        let gene_start = *gene.start() as usize - 1;
525        let gene_end = *gene.end() as usize - 1;
526        let gene_depths = &depth_of_contig[gene_start..gene_end];
527        let mean_cov = mean(gene_depths).unwrap_or(0.);
528        let std = std_deviation(gene_depths).unwrap_or(0.);
529
530        return (mean_cov, std);
531    }
532}
533
534pub fn get_codons<'a>(sequence: &'a [u8], frame: usize, strandedness: Strand) -> Vec<Vec<u8>> {
535    match strandedness {
536        Strand::Forward | Strand::Unknown => sequence[0 + frame..]
537            .chunks(3)
538            .map(|chunk| chunk.to_vec())
539            .collect::<Vec<Vec<u8>>>(),
540        Strand::Reverse => {
541            let rc = dna::revcomp(sequence);
542            rc[0 + frame..]
543                .chunks(3)
544                .map(|chunk| chunk.to_vec())
545                .collect::<Vec<Vec<u8>>>()
546        }
547    }
548}