lorikeet_genome/reference/
reference_writer.rs

1use std::collections::{BTreeMap, BinaryHeap};
2use std::fs::{create_dir_all, File};
3use std::io::Write;
4use std::path::Path;
5
6use crate::model::byte_array_allele::ByteArrayAllele;
7use crate::model::variant_context::{VariantContext, VariantType};
8use crate::reference::reference_reader::ReferenceReader;
9use crate::utils::simple_interval::Locatable;
10
11/// Struct housing methods for writing out genomes when given specific variant information
12/// Basically a wrapper for reference reader
13pub struct ReferenceWriter<'a> {
14    reference_reader: ReferenceReader,
15    output_prefix: &'a str,
16}
17
18impl<'a> ReferenceWriter<'a> {
19    pub fn new(reference_reader: ReferenceReader, output_prefix: &'a str) -> ReferenceWriter<'a> {
20        // ensure path exists
21        create_dir_all(output_prefix).expect("Unable to create output directory");
22        Self {
23            reference_reader,
24            output_prefix,
25        }
26    }
27
28    /// Generates the potential strain genomes calculated by Lorikeet. The VariantContexts are expected
29    /// To be tagged with one or more strain genomes in their `attributes` with `VariantAnnotation::Strain`
30    /// tag
31    pub fn generate_strains(
32        &mut self,
33        variant_contexts: Vec<VariantContext>,
34        ref_idx: usize,
35        strain_ids_present: Vec<usize>,
36    ) {
37        let mut grouped_variant_contexts = Self::split_variant_contexts_by_tid(variant_contexts);
38        let tids = self
39            .reference_reader
40            .retrieve_tids_for_ref_index(ref_idx)
41            .unwrap()
42            .clone();
43
44        for strain_idx in strain_ids_present {
45            let file_name = format!(
46                "{}/{}_strain_{}.fna",
47                self.output_prefix,
48                self.reference_reader.genomes_and_contigs.genomes[ref_idx],
49                strain_idx,
50            );
51
52            let file_path = Path::new(&file_name);
53            debug!("File path {}", &file_name);
54            // Open new reference file or create one
55            let mut file_open =
56                File::create(file_path).expect("No Read or Write Permission in current directory");
57            for tid in tids.iter() {
58                if self
59                    .reference_reader
60                    .fetch_contig_from_reference_by_tid(*tid, ref_idx)
61                    .is_err()
62                {
63                    continue;
64                };
65                self.reference_reader.read_sequence_to_vec();
66
67                let mut new_bases = std::mem::take(&mut self.reference_reader.current_sequence);
68                let old_length = new_bases.len();
69                // This value holds how far right or left the vc location has shifted as we add indels
70                let mut offset = 0;
71                let variant_contexts_of_contig = grouped_variant_contexts.get_mut(&tid);
72                let mut variations = 0;
73                match variant_contexts_of_contig {
74                    Some(variant_contexts_of_contig) => {
75                        for mut vc in variant_contexts_of_contig.iter_mut() {
76                            if vc.part_of_strain(strain_idx) {
77                                let alternate_allele = vc.get_alternate_alleles()[0].clone();
78                                let variant_type = vc.get_type().clone();
79                                let is_ref = alternate_allele.is_ref;
80                                Self::modify_reference_bases_based_on_variant_type(
81                                    &mut new_bases,
82                                    alternate_allele,
83                                    &mut vc,
84                                    variant_type,
85                                    &mut offset,
86                                );
87                                variations += if is_ref { 0 } else { 1 };
88                            }
89                        }
90                    }
91                    None => {
92                        // pass
93                    }
94                }
95
96                // write the contig header
97                writeln!(
98                    file_open,
99                    ">{} strain_id={} old_length={} new_length={} variations={}",
100                    std::str::from_utf8(self.reference_reader.get_target_name(*tid)).unwrap(),
101                    strain_idx,
102                    old_length,
103                    new_bases.len(),
104                    variations
105                )
106                .expect("Unable to write to file");
107
108                // write out the actual contig
109                for line in new_bases[..].chunks(60).into_iter() {
110                    file_open.write_all(line).unwrap();
111                    file_open.write_all(b"\n").unwrap();
112                }
113            }
114        }
115    }
116
117    /// Generates the per sample consensus genomes based on the provided variant contexts.
118    /// The consensus is defined as the most dominant variant at a given position on the reference
119    /// genome measured by read depth.
120    pub fn generate_consensus(
121        &mut self,
122        variant_contexts: Vec<VariantContext>,
123        ref_idx: usize,
124        samples: &[&str],
125    ) {
126        let mut grouped_variant_contexts = Self::split_variant_contexts_by_tid(variant_contexts);
127        let tids = self
128            .reference_reader
129            .retrieve_tids_for_ref_index(ref_idx)
130            .unwrap()
131            .clone();
132        for (sample_index, sample_name) in samples.into_iter().enumerate() {
133            let file_name = format!(
134                "{}/{}_consensus_{}.fna",
135                self.output_prefix,
136                self.reference_reader.genomes_and_contigs.genomes[ref_idx],
137                &sample_name.rsplitn(2, '/').next().unwrap(),
138            );
139            let file_path = Path::new(&file_name);
140            debug!("File path {}", &file_name);
141            // Open new reference file or create one
142            let mut file_open = File::create(file_path).unwrap_or_else(|_| {
143                panic!(
144                    "No Read or Write Permission in current directory: {:?}",
145                    file_path
146                )
147            });
148            for tid in tids.iter() {
149                if self
150                    .reference_reader
151                    .fetch_contig_from_reference_by_tid(*tid, ref_idx)
152                    .is_err()
153                {
154                    continue;
155                };
156                self.reference_reader.read_sequence_to_vec();
157                debug!(
158                    "Fetched length {} tid {} ref_idx {} ",
159                    self.reference_reader.current_sequence.len(),
160                    *tid,
161                    ref_idx
162                );
163                let mut new_bases = std::mem::take(&mut self.reference_reader.current_sequence);
164                let old_length = new_bases.len();
165                debug!("Contig length {}", old_length);
166                // This value holds how far right or left the vc location has shifted as we add indels
167                let mut offset = 0;
168                let variant_contexts_of_contig = grouped_variant_contexts.get_mut(&tid);
169                let mut variations = 0;
170                match variant_contexts_of_contig {
171                    Some(variant_contexts_of_contig) => {
172                        for mut vc in variant_contexts_of_contig.iter_mut() {
173                            let consensus_allele = vc.get_consensus_allele(sample_index);
174                            match consensus_allele {
175                                Some(consensus_allele) => {
176                                    let variant_type = vc.get_type().clone();
177                                    let is_ref = consensus_allele.is_ref;
178                                    Self::modify_reference_bases_based_on_variant_type(
179                                        &mut new_bases,
180                                        consensus_allele,
181                                        &mut vc,
182                                        variant_type,
183                                        &mut offset,
184                                    );
185                                    variations += if is_ref { 0 } else { 1 };
186                                }
187                                None => continue,
188                            }
189                        }
190                    }
191                    None => {
192                        // pass
193                    }
194                }
195
196                debug!(
197                    "Writing contig {}",
198                    std::str::from_utf8(self.reference_reader.get_target_name(*tid)).unwrap()
199                );
200                // write the contig header
201                writeln!(
202                    file_open,
203                    ">{} sample_consensus={} old_length={} new_length={} variations={}",
204                    std::str::from_utf8(self.reference_reader.get_target_name(*tid)).unwrap(),
205                    sample_name,
206                    old_length,
207                    new_bases.len(),
208                    variations
209                )
210                .expect("Unable to write to file");
211
212                // write out the actual contig
213                for line in new_bases[..].chunks(60).into_iter() {
214                    file_open.write_all(line).unwrap();
215                    file_open.write_all(b"\n").unwrap();
216                }
217            }
218        }
219    }
220
221    /// Takes a list of variant contexts and returns a BTreeMap with contexts grouped by which
222    /// contig they appear on. Additionally, the Vector of contexts for each contig is coordinate
223    /// sorted so the contexts appear in order in which they occur on the contig
224    fn split_variant_contexts_by_tid(
225        variant_contexts: Vec<VariantContext>,
226    ) -> BTreeMap<usize, Vec<VariantContext>> {
227        let mut grouped_variant_contexts = BTreeMap::new();
228        for vc in variant_contexts {
229            let vcs_on_contig = grouped_variant_contexts
230                .entry(vc.loc.get_contig())
231                .or_insert_with(BinaryHeap::new);
232            vcs_on_contig.push(vc);
233        }
234
235        grouped_variant_contexts
236            .into_iter()
237            .map(|(tid, heap)| (tid, heap.into_sorted_vec()))
238            .collect()
239    }
240
241    pub fn modify_reference_bases_based_on_variant_type(
242        new_bases: &mut Vec<u8>,
243        consensus_allele: ByteArrayAllele,
244        vc: &mut VariantContext,
245        variant_type: VariantType,
246        offset: &mut i64,
247    ) {
248        match variant_type {
249            VariantType::Symbolic => {
250                if consensus_allele.is_span_del() {
251                    // delete from reference, replace with nothing
252                    new_bases.splice(
253                        ((vc.loc.start as i64 + 1 + *offset) as usize)
254                            ..=((vc.loc.end as i64 + *offset) as usize),
255                        (0..0).into_iter(),
256                    );
257                    *offset -= vc.loc.get_length_on_reference() as i64 - 1;
258                };
259            }
260            VariantType::Snp => {
261                new_bases[(vc.loc.start as i64 + *offset) as usize] = consensus_allele.bases[0];
262            }
263            VariantType::Indel => {
264                let allele_len = consensus_allele.bases.len();
265                new_bases.splice(
266                    ((vc.loc.start as i64 + *offset) as usize)
267                        ..=(((vc.loc.start + vc.get_reference().bases.len() - 1) as i64 + *offset)
268                            as usize),
269                    consensus_allele.bases.into_iter(),
270                );
271
272                if vc.loc.get_length_on_reference() == 1 {
273                    // insertion
274                    *offset += allele_len as i64 - 1;
275                } else {
276                    *offset -= vc.loc.get_length_on_reference() as i64 - 1;
277                };
278            }
279            VariantType::Mnp => {
280                let allele_len = consensus_allele.bases.len();
281                new_bases.splice(
282                    ((vc.loc.start as i64 + *offset) as usize)
283                        ..=(((vc.loc.start + vc.get_reference().bases.len() - 1) as i64 + *offset)
284                            as usize),
285                    consensus_allele.bases.into_iter(),
286                );
287
288                if vc.loc.get_length_on_reference() < allele_len {
289                    // gaining bases so increase offset
290                    *offset += allele_len as i64 - 1 - vc.loc.get_length_on_reference() as i64;
291                } else {
292                    // losing bases so decrease offset
293                    *offset -= vc.loc.get_length_on_reference() as i64 - allele_len as i64 - 1;
294                }
295            }
296            VariantType::Mixed => {
297                // need to determine the type the actual allele came out as
298                let new_variant_type = VariantContext::type_of_biallelic_variant(
299                    vc.get_reference(),
300                    &consensus_allele,
301                );
302                return Self::modify_reference_bases_based_on_variant_type(
303                    new_bases,
304                    consensus_allele,
305                    vc,
306                    new_variant_type,
307                    offset,
308                );
309            }
310            _ => {
311                // pass on everything else for now.
312            }
313        }
314    }
315}