// lorikeet_genome/reference/reference_writer.rs

use std::collections::{BTreeMap, BinaryHeap};
use std::fs::{create_dir_all, File};
use std::io::{BufWriter, Write};
use std::path::Path;

use crate::model::byte_array_allele::ByteArrayAllele;
use crate::model::variant_context::{VariantContext, VariantType};
use crate::reference::reference_reader::ReferenceReader;
use crate::utils::simple_interval::Locatable;
10
/// Writes modified reference sequences — per-strain genomes and per-sample
/// consensus genomes — as FASTA (`.fna`) files under `output_prefix`.
pub struct ReferenceWriter<'a> {
    /// Reader supplying contig sequences and contig/genome names for the
    /// reference set being rewritten.
    reference_reader: ReferenceReader,
    /// Directory prefix under which all output files are created.
    output_prefix: &'a str,
}
17
18impl<'a> ReferenceWriter<'a> {
19 pub fn new(reference_reader: ReferenceReader, output_prefix: &'a str) -> ReferenceWriter<'a> {
20 create_dir_all(output_prefix).expect("Unable to create output directory");
22 Self {
23 reference_reader,
24 output_prefix,
25 }
26 }
27
28 pub fn generate_strains(
32 &mut self,
33 variant_contexts: Vec<VariantContext>,
34 ref_idx: usize,
35 strain_ids_present: Vec<usize>,
36 ) {
37 let mut grouped_variant_contexts = Self::split_variant_contexts_by_tid(variant_contexts);
38 let tids = self
39 .reference_reader
40 .retrieve_tids_for_ref_index(ref_idx)
41 .unwrap()
42 .clone();
43
44 for strain_idx in strain_ids_present {
45 let file_name = format!(
46 "{}/{}_strain_{}.fna",
47 self.output_prefix,
48 self.reference_reader.genomes_and_contigs.genomes[ref_idx],
49 strain_idx,
50 );
51
52 let file_path = Path::new(&file_name);
53 debug!("File path {}", &file_name);
54 let mut file_open =
56 File::create(file_path).expect("No Read or Write Permission in current directory");
57 for tid in tids.iter() {
58 if self
59 .reference_reader
60 .fetch_contig_from_reference_by_tid(*tid, ref_idx)
61 .is_err()
62 {
63 continue;
64 };
65 self.reference_reader.read_sequence_to_vec();
66
67 let mut new_bases = std::mem::take(&mut self.reference_reader.current_sequence);
68 let old_length = new_bases.len();
69 let mut offset = 0;
71 let variant_contexts_of_contig = grouped_variant_contexts.get_mut(&tid);
72 let mut variations = 0;
73 match variant_contexts_of_contig {
74 Some(variant_contexts_of_contig) => {
75 for mut vc in variant_contexts_of_contig.iter_mut() {
76 if vc.part_of_strain(strain_idx) {
77 let alternate_allele = vc.get_alternate_alleles()[0].clone();
78 let variant_type = vc.get_type().clone();
79 let is_ref = alternate_allele.is_ref;
80 Self::modify_reference_bases_based_on_variant_type(
81 &mut new_bases,
82 alternate_allele,
83 &mut vc,
84 variant_type,
85 &mut offset,
86 );
87 variations += if is_ref { 0 } else { 1 };
88 }
89 }
90 }
91 None => {
92 }
94 }
95
96 writeln!(
98 file_open,
99 ">{} strain_id={} old_length={} new_length={} variations={}",
100 std::str::from_utf8(self.reference_reader.get_target_name(*tid)).unwrap(),
101 strain_idx,
102 old_length,
103 new_bases.len(),
104 variations
105 )
106 .expect("Unable to write to file");
107
108 for line in new_bases[..].chunks(60).into_iter() {
110 file_open.write_all(line).unwrap();
111 file_open.write_all(b"\n").unwrap();
112 }
113 }
114 }
115 }
116
117 pub fn generate_consensus(
121 &mut self,
122 variant_contexts: Vec<VariantContext>,
123 ref_idx: usize,
124 samples: &[&str],
125 ) {
126 let mut grouped_variant_contexts = Self::split_variant_contexts_by_tid(variant_contexts);
127 let tids = self
128 .reference_reader
129 .retrieve_tids_for_ref_index(ref_idx)
130 .unwrap()
131 .clone();
132 for (sample_index, sample_name) in samples.into_iter().enumerate() {
133 let file_name = format!(
134 "{}/{}_consensus_{}.fna",
135 self.output_prefix,
136 self.reference_reader.genomes_and_contigs.genomes[ref_idx],
137 &sample_name.rsplitn(2, '/').next().unwrap(),
138 );
139 let file_path = Path::new(&file_name);
140 debug!("File path {}", &file_name);
141 let mut file_open = File::create(file_path).unwrap_or_else(|_| {
143 panic!(
144 "No Read or Write Permission in current directory: {:?}",
145 file_path
146 )
147 });
148 for tid in tids.iter() {
149 if self
150 .reference_reader
151 .fetch_contig_from_reference_by_tid(*tid, ref_idx)
152 .is_err()
153 {
154 continue;
155 };
156 self.reference_reader.read_sequence_to_vec();
157 debug!(
158 "Fetched length {} tid {} ref_idx {} ",
159 self.reference_reader.current_sequence.len(),
160 *tid,
161 ref_idx
162 );
163 let mut new_bases = std::mem::take(&mut self.reference_reader.current_sequence);
164 let old_length = new_bases.len();
165 debug!("Contig length {}", old_length);
166 let mut offset = 0;
168 let variant_contexts_of_contig = grouped_variant_contexts.get_mut(&tid);
169 let mut variations = 0;
170 match variant_contexts_of_contig {
171 Some(variant_contexts_of_contig) => {
172 for mut vc in variant_contexts_of_contig.iter_mut() {
173 let consensus_allele = vc.get_consensus_allele(sample_index);
174 match consensus_allele {
175 Some(consensus_allele) => {
176 let variant_type = vc.get_type().clone();
177 let is_ref = consensus_allele.is_ref;
178 Self::modify_reference_bases_based_on_variant_type(
179 &mut new_bases,
180 consensus_allele,
181 &mut vc,
182 variant_type,
183 &mut offset,
184 );
185 variations += if is_ref { 0 } else { 1 };
186 }
187 None => continue,
188 }
189 }
190 }
191 None => {
192 }
194 }
195
196 debug!(
197 "Writing contig {}",
198 std::str::from_utf8(self.reference_reader.get_target_name(*tid)).unwrap()
199 );
200 writeln!(
202 file_open,
203 ">{} sample_consensus={} old_length={} new_length={} variations={}",
204 std::str::from_utf8(self.reference_reader.get_target_name(*tid)).unwrap(),
205 sample_name,
206 old_length,
207 new_bases.len(),
208 variations
209 )
210 .expect("Unable to write to file");
211
212 for line in new_bases[..].chunks(60).into_iter() {
214 file_open.write_all(line).unwrap();
215 file_open.write_all(b"\n").unwrap();
216 }
217 }
218 }
219 }
220
221 fn split_variant_contexts_by_tid(
225 variant_contexts: Vec<VariantContext>,
226 ) -> BTreeMap<usize, Vec<VariantContext>> {
227 let mut grouped_variant_contexts = BTreeMap::new();
228 for vc in variant_contexts {
229 let vcs_on_contig = grouped_variant_contexts
230 .entry(vc.loc.get_contig())
231 .or_insert_with(BinaryHeap::new);
232 vcs_on_contig.push(vc);
233 }
234
235 grouped_variant_contexts
236 .into_iter()
237 .map(|(tid, heap)| (tid, heap.into_sorted_vec()))
238 .collect()
239 }
240
241 pub fn modify_reference_bases_based_on_variant_type(
242 new_bases: &mut Vec<u8>,
243 consensus_allele: ByteArrayAllele,
244 vc: &mut VariantContext,
245 variant_type: VariantType,
246 offset: &mut i64,
247 ) {
248 match variant_type {
249 VariantType::Symbolic => {
250 if consensus_allele.is_span_del() {
251 new_bases.splice(
253 ((vc.loc.start as i64 + 1 + *offset) as usize)
254 ..=((vc.loc.end as i64 + *offset) as usize),
255 (0..0).into_iter(),
256 );
257 *offset -= vc.loc.get_length_on_reference() as i64 - 1;
258 };
259 }
260 VariantType::Snp => {
261 new_bases[(vc.loc.start as i64 + *offset) as usize] = consensus_allele.bases[0];
262 }
263 VariantType::Indel => {
264 let allele_len = consensus_allele.bases.len();
265 new_bases.splice(
266 ((vc.loc.start as i64 + *offset) as usize)
267 ..=(((vc.loc.start + vc.get_reference().bases.len() - 1) as i64 + *offset)
268 as usize),
269 consensus_allele.bases.into_iter(),
270 );
271
272 if vc.loc.get_length_on_reference() == 1 {
273 *offset += allele_len as i64 - 1;
275 } else {
276 *offset -= vc.loc.get_length_on_reference() as i64 - 1;
277 };
278 }
279 VariantType::Mnp => {
280 let allele_len = consensus_allele.bases.len();
281 new_bases.splice(
282 ((vc.loc.start as i64 + *offset) as usize)
283 ..=(((vc.loc.start + vc.get_reference().bases.len() - 1) as i64 + *offset)
284 as usize),
285 consensus_allele.bases.into_iter(),
286 );
287
288 if vc.loc.get_length_on_reference() < allele_len {
289 *offset += allele_len as i64 - 1 - vc.loc.get_length_on_reference() as i64;
291 } else {
292 *offset -= vc.loc.get_length_on_reference() as i64 - allele_len as i64 - 1;
294 }
295 }
296 VariantType::Mixed => {
297 let new_variant_type = VariantContext::type_of_biallelic_variant(
299 vc.get_reference(),
300 &consensus_allele,
301 );
302 return Self::modify_reference_bases_based_on_variant_type(
303 new_bases,
304 consensus_allele,
305 vc,
306 new_variant_type,
307 offset,
308 );
309 }
310 _ => {
311 }
313 }
314 }
315}