lorikeet_genome/evolve/codon_structs.rs
1use bio::alphabets::dna;
2use bio_types::strand::Strand;
3use itertools::{izip, Itertools};
4use rust_htslib::bcf::Read;
5use std::collections::HashMap;
6
7use crate::model::variant_context::VariantContext;
8use crate::model::variant_context_utils::VariantContextUtils;
9use crate::reference::reference_reader::ReferenceReader;
10use crate::utils::utils::{mean, std_deviation};
11
/// Aggregated per-gene bookkeeping (name, locations, coverages).
///
/// NOTE(review): currently unused (`dead_code`); field semantics below are
/// inferred from types — confirm against call sites before relying on them.
#[allow(dead_code)]
pub struct GeneInfo {
    // Gene name/identifier.
    name: String,
    // Presumably contig id -> (start, end) span of the gene — TODO confirm.
    locations: HashMap<u32, (u64, u64)>,
    // Presumably contig id -> coverage value — TODO confirm.
    coverages: HashMap<u32, f64>,
}
18
/// Codon lookup tables derived from an NCBI translation table.
///
/// Created empty via `CodonTable::setup` and populated by
/// `Translations::get_codon_table`.
pub struct CodonTable {
    // Codon (3 nucleotide bytes) -> single-letter amino acid code ('*' = stop).
    aminos: HashMap<Vec<u8>, char>,
    // Codon -> start-codon marker ('M' start, '*' stop, '-' neither), taken
    // from the NCBI "starts" row.
    starts: HashMap<Vec<u8>, char>,
    // Codon -> expected number of non-synonymous sites, in [0.0, 3.0]
    // (each of the 9 single-base substitutions contributes 1/3 when it
    // changes the encoded amino acid).
    ns_sites: HashMap<Vec<u8>, f64>,
}
24
/// An NCBI genetic-code table in its flat-string representation: five
/// parallel 64-character rows where column i describes the i-th codon.
pub struct NCBITable {
    // Amino acid encoded by each codon column.
    aas: String,
    // Start-codon markers for each codon column ('M', '*' or '-').
    starts: String,
    // First base of each codon column.
    base1: String,
    // Second base of each codon column.
    base2: String,
    // Third base of each codon column.
    base3: String,
}
32
33impl NCBITable {
34 // get translation tables in NCBI format
35 // Kind of lazy storing and then converting every time but would take way too much time
36 // to write out each table into CodonTable format by hand
37 fn get_translation_table(table_id: usize) -> NCBITable {
38 match table_id {
39 1 => NCBITable {
40 aas: "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG".to_owned(),
41 starts: "---M------**--*----M---------------M----------------------------"
42 .to_owned(),
43 base1: "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"
44 .to_owned(),
45 base2: "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"
46 .to_owned(),
47 base3: "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
48 .to_owned(),
49 },
50 11 => NCBITable {
51 aas: "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG".to_owned(),
52 starts: "---M------**--*----M------------MMMM---------------M------------"
53 .to_owned(),
54 base1: "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"
55 .to_owned(),
56 base2: "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"
57 .to_owned(),
58 base3: "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
59 .to_owned(),
60 },
61 _ => {
62 panic!("Translation table {} not yet implemented", table_id);
63 }
64 }
65 }
66}
67
68impl CodonTable {
69 pub fn setup() -> CodonTable {
70 CodonTable {
71 aminos: HashMap::new(),
72 starts: HashMap::new(),
73 ns_sites: HashMap::new(),
74 }
75 }
76}
77
/// Codon-table driven analyses over GFF gene annotations.
pub trait Translations {
    /// Populates the codon lookup tables from the NCBI translation table
    /// identified by `table_id`. Panics if the table is not implemented.
    fn get_codon_table(&mut self, table_id: usize);
    /// Scans `variants` falling inside `gene` and estimates per-sample dN/dS.
    ///
    /// Returns `(snps, frameshifts, dnds_values)`, each with one entry per
    /// sample: SNP counts, length-changing allele counts, and the dN/dS
    /// estimate (1.0 when no usable signal is found).
    fn find_mutations(
        &self,
        gene: &bio::io::gff::Record,
        variants: &mut rust_htslib::bcf::IndexedReader,
        reference_reader: &mut ReferenceReader,
        ref_idx: usize,
        n_samples: usize,
        qual_by_depth_filter: f64,
        qual_threshold: f64,
        depth_per_sample_filter: i64,
    ) -> (Vec<usize>, Vec<usize>, Vec<f64>);
    /// Returns `(mean, standard deviation)` of the per-base depths across the
    /// gene span sliced out of `depth_of_contig`.
    fn calculate_gene_coverage(
        &self,
        gene: &bio::io::gff::Record,
        depth_of_contig: &Vec<i32>,
    ) -> (f32, f32);
}
97
98impl Translations for CodonTable {
99 fn get_codon_table(&mut self, table_id: usize) {
100 // Convert NCBI format to something more usable
101 let ncbi_format = NCBITable::get_translation_table(table_id);
102 let mut amino_hash = HashMap::new();
103 let mut start_hash = HashMap::new();
104 for (aa, s, b1, b2, b3) in izip!(
105 ncbi_format.aas.as_bytes(),
106 ncbi_format.starts.as_bytes(),
107 ncbi_format.base1.as_bytes(),
108 ncbi_format.base2.as_bytes(),
109 ncbi_format.base3.as_bytes()
110 ) {
111 let codon = vec![*b1, *b2, *b3];
112 amino_hash.insert(codon.clone(), *aa as char);
113 start_hash.insert(codon, *s as char);
114 }
115 // debug!("Amino hash {:?}", amino_hash);
116 self.aminos = amino_hash;
117 self.starts = start_hash;
118
119 let nucleotides: Vec<u8> = vec![65, 84, 67, 71];
120 // debug!("nucleotides: {} ", String::from_utf8_lossy(&nucleotides));
121 for (codon, _aa) in self.aminos.iter() {
122 let mut n = 0.0;
123 for (pos, cod) in codon.iter().enumerate() {
124 for nuc in nucleotides.iter() {
125 if cod == nuc {
126 // Ignore this nucleotide
127 continue;
128 } else {
129 let mut codon_shift = codon.clone();
130 // Change one position of the codon
131 codon_shift[pos] = nuc.clone();
132
133 if self.aminos[codon] != self.aminos[&codon_shift] {
134 // This change can cause a non-synonymous mutation
135 n += 1.0 / 3.0;
136 }
137 }
138 }
139 }
140 self.ns_sites.insert(codon.clone(), n as f64);
141 }
142 }
143
    /// Finds all associated mutations within a gene region (given as a GFF
    /// record) and, when variants are present, estimates per-sample dN/dS
    /// ratios using the Jukes-Cantor correction (see
    /// http://bioinformatics.cvr.ac.uk/blog/calculating-dnds-for-ngs-datasets/).
    ///
    /// Returns `(snps, frameshifts, dnds_values)`, one entry per sample:
    /// - `snps`: count of single-base substitutions present in the sample
    /// - `frameshifts`: count of length-changing (indel) alleles present
    /// - `dnds_values`: dN/dS estimate (1.0 when there is no usable signal,
    ///   e.g. no variants on the contig or a fetch failure)
    ///
    /// TODO: Refactor so calculates for all samples at once without having to
    /// re-read the variant region each time.
    fn find_mutations(
        &self,
        gene: &bio::io::gff::Record,
        variants: &mut rust_htslib::bcf::IndexedReader,
        reference_reader: &mut ReferenceReader,
        ref_idx: usize,
        n_samples: usize,
        qual_by_depth_filter: f64,
        qual_threshold: f64,
        depth_per_sample_filter: i64,
    ) -> (Vec<usize>, Vec<usize>, Vec<f64>) {
        match gene.strand() {
            Some(strand) => {
                // VCF contigs are named "<reference stem>~<contig name>".
                let contig_name = format!(
                    "{}~{}",
                    reference_reader.retrieve_reference_stem(ref_idx),
                    gene.seqname()
                ); // create concatenated contig name format
                let rid = if let Some(rid) =
                    VariantContext::get_contig_vcf_tid(variants.header(), contig_name.as_bytes())
                {
                    rid
                } else {
                    // no variants on this contig so skip; neutral defaults
                    return (vec![0; n_samples], vec![0; n_samples], vec![1.0; n_samples]);
                };

                // Load the contig sequence so codons can be read below.
                reference_reader
                    .fetch_contig_from_reference_by_contig_name(contig_name.as_bytes(), ref_idx);
                reference_reader.read_sequence_to_vec();
                // bio::gff positions are 1-based, so subtract 1 for 0-based
                // indexing. NOTE(review): GFF end is 1-based *inclusive*, so
                // `end` here is the 0-based inclusive end — consistent with
                // the `start..=end` slice below.
                let start = *gene.start() as usize - 1;
                let end = *gene.end() as usize - 1;
                // fetch variants in this window
                match variants.fetch(rid, start as u64, Some(end as u64)) {
                    Ok(_) => {}
                    Err(_e) => {
                        // Fetch failure (e.g. region absent from the index)
                        // is treated as "no variants", not a hard error.
                        return (vec![0; n_samples], vec![0; n_samples], vec![1.0; n_samples]);
                    }
                };

                // Reading-frame offset from the GFF record; anything
                // unparseable (e.g. ".") falls back to frame 0.
                let frame: usize = match gene.frame().parse() {
                    Ok(frame_val) => frame_val,
                    Err(_) => 0,
                };
                let gene_sequence = &reference_reader.current_sequence[start..=end];
                // Split into codons, honouring strand and frame.
                let codon_sequence = get_codons(&gene_sequence, frame, strand);

                // Calculate N and S: per-sample totals of non-synonymous (N)
                // and synonymous (S) sites over all complete codons. Every
                // sample sees the same gene, so each gets identical totals.
                let mut big_n: Vec<f64> = vec![0.0; n_samples];
                let mut big_s: Vec<f64> = vec![0.0; n_samples];
                for codon in codon_sequence.iter() {
                    if std::str::from_utf8(codon)
                        .expect("Unable to interpret codon")
                        .contains('N')
                        || codon.len() != 3
                    {
                        // Skip ambiguous ('N') or incomplete trailing codons.
                        continue;
                    } else {
                        match self.ns_sites.get(codon) {
                            Some(n) => {
                                for sample_idx in 0..n_samples {
                                    big_n[sample_idx] += n;
                                    // A codon has 3 sites total; the rest are
                                    // synonymous.
                                    big_s[sample_idx] += 3.0 - n;
                                }
                            }
                            None => {
                                // Codon not in the translation table; skip.
                                continue;
                            }
                        }
                    }
                }

                // Create Nd and Sd values (observed non-syn/syn differences).
                let mut big_nd: Vec<f64> = vec![0.0; n_samples];
                let mut big_sd: Vec<f64> = vec![0.0; n_samples];

                // dN/dS calculations when using NGS reads outlined here:
                // http://bioinformatics.cvr.ac.uk/blog/calculating-dnds-for-ngs-datasets/
                // Note, we don't normalize for depth here and instead just use Jukes-Cantor model
                let mut codon: &Vec<u8> = &codon_sequence[0];
                let mut previous_codon: &Vec<u8> = &codon_sequence[0];
                // Per-sample list of mutated copies of the codon currently
                // being assembled; flushed whenever the codon index advances.
                let mut new_codons: Vec<Vec<Vec<u8>>> = vec![vec![]; n_samples];
                let mut positionals = vec![0; n_samples];
                let mut total_variants = vec![0; n_samples];
                let mut gene_cursor = 0; // index inside gene, i.e, position of variant
                let mut reference_cursor = start; // index of reference sequence
                let mut frameshifts = vec![0; n_samples];
                let mut snps = vec![0; n_samples];
                let mut old_codon_idx = None;
                for record in variants.records().into_iter() {
                    match record {
                        Ok(mut record) => {
                            match VariantContext::from_vcf_record(&mut record, true) {
                                Some(mut context) => {
                                    // Quality gate: drop low-confidence calls.
                                    let passes = VariantContextUtils::passes_thresholds(
                                        &mut context,
                                        qual_by_depth_filter,
                                        qual_threshold,
                                    );

                                    if !passes {
                                        continue;
                                    }

                                    // gained bases is the difference between current and previous
                                    // position
                                    let gained_bases =
                                        context.loc.start.saturating_sub(reference_cursor);
                                    // update previous position to current position
                                    gene_cursor += gained_bases;
                                    // add gained bases to reference cursor
                                    reference_cursor = context.loc.start;

                                    // index of current codon in gene
                                    // number of codons is gene size divided by three
                                    let codon_idx = gene_cursor / 3 as usize;
                                    // Only when the variant lands in a *new*
                                    // codon do we process the accumulated
                                    // mutated copies of the previous one.
                                    let process_previous_codon = match old_codon_idx {
                                        Some(old_idx) => {
                                            if old_idx != codon_idx {
                                                old_codon_idx = Some(codon_idx);
                                                previous_codon = codon;
                                                true
                                            } else {
                                                false
                                            }
                                        }
                                        None => {
                                            // First variant seen; nothing to flush.
                                            old_codon_idx = Some(codon_idx);
                                            false
                                        }
                                    };

                                    // index of current position in the current codon
                                    // all codons have size == 3
                                    let codon_cursor = gene_cursor % 3;

                                    // Variant past the end of the gene's codons.
                                    if codon_idx >= codon_sequence.len() {
                                        continue;
                                    }
                                    codon = &codon_sequence[codon_idx];

                                    // Skip ambiguous or incomplete codons.
                                    if std::str::from_utf8(codon.as_slice())
                                        .expect("Unable to interpret codon")
                                        .contains('N')
                                        || codon.len() != 3
                                    {
                                        continue;
                                    }

                                    for sample_idx in 0..n_samples {
                                        if process_previous_codon {
                                            // Flush: compare each mutated copy
                                            // of the previous codon against the
                                            // reference codon and tally Nd/Sd.
                                            for new_codon in new_codons[sample_idx].iter() {
                                                if (previous_codon.len() == 3)
                                                    && (new_codon.len() == 3)
                                                    && (previous_codon != new_codon)
                                                {
                                                    // get indices of different locations
                                                    let mut pos = 0 as usize;
                                                    let mut diffs = vec![];
                                                    for (c1, c2) in
                                                        previous_codon.iter().zip(new_codon.iter())
                                                    {
                                                        if c1 != c2 {
                                                            diffs.push(pos);
                                                        }
                                                        pos += 1;
                                                    }
                                                    total_variants[sample_idx] += diffs.len();
                                                    // get permutations of positions: every order
                                                    // in which the substitutions could have occurred
                                                    let permutations: Vec<Vec<&usize>> = diffs
                                                        .iter()
                                                        .permutations(diffs.len())
                                                        .collect();

                                                    // calculate synonymous and non-synonymous for each permutation
                                                    let mut ns = 0;
                                                    let mut ss = 0;
                                                    positionals[sample_idx] += permutations.len();
                                                    for permutation in permutations.iter() {
                                                        // Apply substitutions one at a time,
                                                        // classifying each step as syn/non-syn.
                                                        let mut shifting = codon.clone();
                                                        let mut old_shift;
                                                        for pos in permutation {
                                                            // Check if one amino acid change causes an syn or non-syn
                                                            old_shift = shifting.clone();
                                                            shifting[**pos] = new_codon[**pos];
                                                            if self.aminos[&old_shift]
                                                                != self.aminos[&shifting]
                                                            {
                                                                ns += 1;
                                                            } else {
                                                                ss += 1;
                                                            }
                                                        }
                                                    }
                                                    // Average over all substitution orders.
                                                    let nd = ns as f64 / permutations.len() as f64;
                                                    let sd = ss as f64 / permutations.len() as f64;
                                                    big_nd[sample_idx] += nd;
                                                    big_sd[sample_idx] += sd;
                                                }
                                            }
                                            // begin working on new codon
                                            new_codons[sample_idx] = Vec::new();
                                        }

                                        // Which alleles pass the depth filter
                                        // for this sample (index 0 = reference).
                                        let which_are_present = context
                                            .alleles_present_in_sample(
                                                sample_idx,
                                                depth_per_sample_filter as i32,
                                            );

                                        if !which_are_present[1..].iter().any(|v| *v) {
                                            continue; // no alt alleles are present
                                        }

                                        let ref_allele = context.get_reference();
                                        let mut snp_count = 0;

                                        // iterate through non reference alleles
                                        // if those alleles are present in this sample then
                                        // increment appropriate values
                                        for (allele_index, allele) in
                                            context.get_alternate_alleles_with_index()
                                        {
                                            // Seed the mutated-codon list with a
                                            // copy of the reference codon.
                                            if new_codons[sample_idx].len() == 0 {
                                                new_codons[sample_idx] = vec![codon.clone()];
                                            }
                                            // Length-changing alleles are frameshifts,
                                            // not SNPs; count and move on.
                                            if allele.bases.len() > 1
                                                || allele.bases.len() != ref_allele.bases.len()
                                            {
                                                if which_are_present[allele_index] {
                                                    frameshifts[sample_idx] += 1;
                                                }
                                                continue;
                                            }

                                            if which_are_present[allele_index] {
                                                snps[sample_idx] += 1;
                                                if snp_count >= 1 {
                                                    // Create a copy of codon up to this point
                                                    // Not sure if reusing previous variants is bad, but
                                                    // not doing so can cause randomness to dN/dS values
                                                    new_codons[sample_idx].push(codon.clone());
                                                    new_codons[sample_idx][snp_count]
                                                        [codon_cursor] = allele.bases[0];
                                                } else {
                                                    // First alt allele at this position:
                                                    // apply it to every codon copy so far.
                                                    for var_idx in 0..new_codons[sample_idx].len() {
                                                        new_codons[sample_idx][var_idx]
                                                            [codon_cursor] = allele.bases[0];
                                                    }
                                                }
                                                snp_count += 1;
                                            }
                                        }
                                    }
                                }
                                None => continue,
                            }
                        }
                        Err(_) => {
                            // Skip error record
                            continue;
                        }
                    }
                }

                // Jukes-Cantor corrected dN/dS per sample.
                let mut dnds_values = vec![1.0; n_samples];
                for sample_idx in 0..n_samples {
                    // Proportions of observed differences per available site.
                    let mut pn = big_nd[sample_idx] / big_n[sample_idx];
                    let mut ps = big_sd[sample_idx] / big_s[sample_idx];
                    // Weirdly in the Jukes-Cantor model if pn or ps are 0.75 then the nat log does not resolve
                    // (ln(0) below), so nudge just under the singularity.
                    // No one talks about this in the literature for some reason
                    if pn == 0.75 {
                        pn = 0.7499
                    }
                    if ps == 0.75 {
                        ps = 0.7499
                    }
                    let d_n = -(3.0 / 4.0) * (1.0 - (4.0 * pn) / 3.0).ln();
                    let d_s = -(3.0 / 4.0) * (1.0 - (4.0 * ps) / 3.0).ln();
                    let mut dnds = d_n / d_s;

                    // NaN or (near-)zero dS means no synonymous signal: fall
                    // back to the neutral value 1.0. Negative dnds values make
                    // no sense, but occur nonetheless — just make them 0.0.
                    if dnds.is_nan() || d_s - 0.0 <= f64::EPSILON {
                        dnds = 1.
                    } else if dnds.is_sign_negative() {
                        dnds = 0.0
                    }
                    dnds_values[sample_idx] = dnds
                }

                return (snps, frameshifts, dnds_values);
            }
            // No strand annotation: cannot orient codons, return defaults.
            _ => return (vec![0; n_samples], vec![0; n_samples], vec![1.0; n_samples]),
        }
    }
518
519 fn calculate_gene_coverage(
520 &self,
521 gene: &bio::io::gff::Record,
522 depth_of_contig: &Vec<i32>,
523 ) -> (f32, f32) {
524 let gene_start = *gene.start() as usize - 1;
525 let gene_end = *gene.end() as usize - 1;
526 let gene_depths = &depth_of_contig[gene_start..gene_end];
527 let mean_cov = mean(gene_depths).unwrap_or(0.);
528 let std = std_deviation(gene_depths).unwrap_or(0.);
529
530 return (mean_cov, std);
531 }
532}
533
534pub fn get_codons<'a>(sequence: &'a [u8], frame: usize, strandedness: Strand) -> Vec<Vec<u8>> {
535 match strandedness {
536 Strand::Forward | Strand::Unknown => sequence[0 + frame..]
537 .chunks(3)
538 .map(|chunk| chunk.to_vec())
539 .collect::<Vec<Vec<u8>>>(),
540 Strand::Reverse => {
541 let rc = dna::revcomp(sequence);
542 rc[0 + frame..]
543 .chunks(3)
544 .map(|chunk| chunk.to_vec())
545 .collect::<Vec<Vec<u8>>>()
546 }
547 }
548}