lorikeet_genome/reference/
reference_reader.rs1use bio::io::fasta::IndexedReader;
2use hashlink::LinkedHashSet;
3use std::cmp::min;
4use std::collections::HashMap;
5use std::fs::File;
6
7use crate::reference::reference_reader_utils::GenomesAndContigs;
8use crate::reference::reference_reader_utils::ReferenceReaderUtils;
9use crate::utils::simple_interval::{Locatable, SimpleInterval};
10
11#[derive(Debug)]
21pub struct ReferenceReader {
22 indexed_reader: IndexedReader<File>,
23 pub current_sequence: Vec<u8>,
24 pub genomes_and_contigs: GenomesAndContigs,
25 reference_index_to_tid: HashMap<usize, LinkedHashSet<usize>>,
26 target_names: HashMap<usize, Vec<u8>>,
27 pub target_lens: HashMap<usize, u64>,
28 genome_path: Option<String>,
29}
30
31impl Clone for ReferenceReader {
32 fn clone(&self) -> Self {
33 let indexed_reader = ReferenceReaderUtils::retrieve_reference(&self.genome_path);
34
35 Self {
36 indexed_reader,
37 current_sequence: Vec::new(),
39 genomes_and_contigs: self.genomes_and_contigs.clone(),
40 reference_index_to_tid: self.reference_index_to_tid.clone(),
41 target_names: self.target_names.clone(),
42 target_lens: self.target_lens.clone(),
43 genome_path: self.genome_path.clone(),
44 }
45 }
46}
47
48impl ReferenceReader {
49 pub fn new(
50 concatenated_genomes: &Option<String>,
51 genomes_and_contigs: GenomesAndContigs,
52 _number_of_contigs: usize,
53 ) -> ReferenceReader {
54 let indexed_reader = ReferenceReaderUtils::retrieve_reference(concatenated_genomes);
55
56 ReferenceReader {
57 indexed_reader,
58 current_sequence: Vec::new(),
59 reference_index_to_tid: HashMap::new(),
60 target_names: HashMap::new(),
61 target_lens: HashMap::new(),
62 genomes_and_contigs,
63 genome_path: concatenated_genomes.clone(),
64 }
65 }
66
67 pub fn new_with_target_names(
68 concatenated_genomes: &Option<String>,
69 genomes_and_contigs: GenomesAndContigs,
70 target_names: Vec<&[u8]>,
71 ) -> Self {
72 let indexed_reader = ReferenceReaderUtils::retrieve_reference(concatenated_genomes);
73
74 Self {
75 indexed_reader,
76 current_sequence: Vec::new(),
77 reference_index_to_tid: HashMap::new(),
78 target_names: target_names
79 .into_iter()
80 .enumerate()
81 .map(|(tid, target)| (tid, target.to_vec()))
82 .collect::<HashMap<usize, Vec<u8>>>(),
83 genomes_and_contigs,
84 target_lens: HashMap::new(),
85 genome_path: concatenated_genomes.clone(),
86 }
87 }
88
89 pub fn new_from_reader_with_reference_index(reader: &Self, ref_idx: usize) -> Self {
91 let indexed_reader = ReferenceReaderUtils::retrieve_reference(&reader.genome_path);
92
93 if reader.reference_index_to_tid.contains_key(&ref_idx) {
94 let ref_tids = reader.reference_index_to_tid.get(&ref_idx).unwrap().clone();
95 let mut target_names = HashMap::with_capacity(ref_tids.len());
96 let mut target_lens = HashMap::with_capacity(ref_tids.len());
97 ref_tids.iter().for_each(|tid| {
98 target_names.insert(*tid, reader.target_names.get(tid).unwrap().clone());
99 target_lens.insert(*tid, reader.target_lens.get(tid).unwrap().clone());
100 });
101
102 let mut ref_idx_to_tid = HashMap::new();
103 ref_idx_to_tid.insert(ref_idx, ref_tids);
104
105 let genomes_and_contigs = GenomesAndContigs::new();
106
107 Self {
108 indexed_reader,
109 current_sequence: Vec::new(),
110 reference_index_to_tid: ref_idx_to_tid,
111 genomes_and_contigs,
112 target_lens,
113 target_names,
114 genome_path: reader.genome_path.clone(),
115 }
116 } else {
117 reader.clone()
118 }
119 }
120
121 pub fn new_from_reader_with_tid_and_rid(
122 reader: &ReferenceReader,
123 ref_idx: usize,
124 tid: usize,
125 ) -> Self {
126 let indexed_reader = ReferenceReaderUtils::retrieve_reference(&reader.genome_path);
127
128 if reader.reference_index_to_tid.contains_key(&ref_idx) {
129 let mut target_names = HashMap::with_capacity(1);
130 let mut target_lens = HashMap::with_capacity(1);
131 target_names.insert(tid, reader.target_names.get(&tid).unwrap().clone());
132 target_lens.insert(tid, reader.target_lens.get(&tid).unwrap().clone());
133
134 let mut ref_idx_to_tid = HashMap::new();
135 let mut tids = LinkedHashSet::with_capacity(1);
136 tids.insert(tid);
137 ref_idx_to_tid.insert(ref_idx, tids);
138
139 let genomes_and_contigs = GenomesAndContigs::new();
140
141 Self {
142 indexed_reader,
143 current_sequence: Vec::new(),
144 reference_index_to_tid: ref_idx_to_tid,
145 genomes_and_contigs,
146 target_lens,
147 target_names,
148 genome_path: reader.genome_path.clone(),
149 }
150 } else {
151 reader.clone()
152 }
153 }
154
155 pub fn get_target_name(&self, tid: usize) -> &[u8] {
156 self.target_names.get(&tid).unwrap()
157 }
158
159 pub fn add_target(&mut self, target: &[u8], tid: usize) {
160 self.target_names.insert(tid, target.to_vec());
161 }
162
163 pub fn update_ref_index_tids(&mut self, ref_index: usize, tid: usize) -> usize {
164 let tids = self
165 .reference_index_to_tid
166 .entry(ref_index)
167 .or_insert_with(LinkedHashSet::new);
168 tids.insert(tid);
169 tids.len()
170 }
171
172 pub fn retrieve_tids_for_ref_index(&self, ref_index: usize) -> Option<&LinkedHashSet<usize>> {
173 self.reference_index_to_tid.get(&ref_index)
174 }
175
176 pub fn add_length(&mut self, tid: usize, length: u64) {
177 self.target_lens.insert(tid, length);
178 }
179
180 pub fn add_lengths(&mut self, target_lengths: HashMap<usize, u64>) {
181 self.target_lens = target_lengths
182 }
183
184 pub fn get_contig_length(&self, tid: usize) -> u64 {
185 *self.target_lens.get(&tid).unwrap_or(&0)
186 }
187
188 pub fn retrieve_reference_stem(&self, ref_idx: usize) -> String {
189 self.genomes_and_contigs.genomes[ref_idx].clone()
190 }
191
192 pub fn update_current_sequence_capacity(&mut self, size: usize) {
193 self.current_sequence = Vec::with_capacity(size);
194 }
195
196 pub fn update_current_sequence_without_capacity(&mut self) {
197 self.current_sequence = Vec::new();
198 }
199
200 pub fn retrieve_contig_name_from_tid(&self, tid: usize) -> Option<&Vec<u8>> {
201 return self.target_names.get(&tid);
202 }
203
204 pub fn fetch_contig_from_reference_by_contig_name(
205 &mut self,
206 contig_name: &[u8],
207 ref_idx: usize,
208 ) {
209 match self
210 .indexed_reader
211 .fetch_all(std::str::from_utf8(contig_name).unwrap())
212 {
213 Ok(reference) => reference,
214 Err(_e) => match self.indexed_reader.fetch_all(&format!(
215 "{}",
216 std::str::from_utf8(contig_name)
217 .unwrap()
218 .splitn(2, '~')
219 .nth(1)
220 .unwrap()
221 )) {
222 Ok(reference) => reference,
223 Err(_e) => {
224 match self.indexed_reader.fetch_all(&format!(
225 "{}~{}",
226 self.genomes_and_contigs.genomes[ref_idx],
227 std::str::from_utf8(contig_name).unwrap()
228 )) {
229 Ok(reference) => reference,
230 Err(e) => {
231 panic!(
232 "Cannot read sequence from {}: {}",
233 std::str::from_utf8(contig_name).unwrap(),
234 e
235 )
236 }
237 }
238 }
239 },
240 };
241 }
242
243 pub fn fetch_contig_from_reference_by_tid(
244 &mut self,
245 tid: usize,
246 ref_idx: usize,
247 ) -> Result<(), std::io::Error> {
248 match self
249 .indexed_reader
250 .fetch_all(std::str::from_utf8(&self.target_names[&tid]).unwrap())
251 {
252 Ok(reference) => Ok(reference),
253 Err(_e) => match self.indexed_reader.fetch_all(&format!(
254 "{}",
255 match std::str::from_utf8(&self.target_names[&tid])
256 .unwrap()
257 .splitn(2, '~')
258 .nth(1)
259 {
260 None => std::str::from_utf8(&self.target_names[&tid]).unwrap(),
261 Some(contig) => contig,
262 }
263 )) {
264 Ok(reference) => Ok(reference),
265 Err(_e) => {
266 match self.indexed_reader.fetch_all(&format!(
267 "{}~{}",
268 self.genomes_and_contigs.genomes[ref_idx],
269 std::str::from_utf8(&self.target_names[&tid]).unwrap()
270 )) {
271 Ok(reference) => Ok(reference),
272 Err(e) => {
273 Err(e)
279 }
280 }
281 }
282 },
283 }
284 }
285
286 pub fn fetch_reference_context(&mut self, ref_idx: usize, interval: &SimpleInterval) {
289 match self.indexed_reader.fetch(
290 std::str::from_utf8(&self.target_names[&interval.get_contig()]).unwrap(),
291 interval.get_start() as u64,
292 min(
293 (interval.get_end() + 1) as u64,
294 *self.target_lens.get(&interval.get_contig()).unwrap(),
295 ),
296 ) {
297 Ok(reference) => reference,
298 Err(_e) => match self.indexed_reader.fetch(
299 &format!(
300 "{}",
301 std::str::from_utf8(&self.target_names[&interval.get_contig()])
302 .unwrap()
303 .splitn(2, '~')
304 .nth(1)
305 .unwrap()
306 ),
307 interval.get_start() as u64,
308 min(
309 (interval.get_end() + 1) as u64,
310 *self.target_lens.get(&interval.get_contig()).unwrap(),
311 ),
312 ) {
313 Ok(reference) => reference,
314 Err(_e) => match self.indexed_reader.fetch(
315 &format!(
316 "{}~{}",
317 self.genomes_and_contigs.genomes[ref_idx],
318 std::str::from_utf8(&self.target_names[&interval.get_contig()]).unwrap()
319 ),
320 interval.get_start() as u64,
321 min(
322 (interval.get_end() + 1) as u64,
323 *self.target_lens.get(&interval.get_contig()).unwrap(),
324 ),
325 ) {
326 Ok(reference) => reference,
327 Err(e) => {
328 panic!(
329 "Cannot read sequence from reference {} {:?}",
330 format!(
331 "{}~{}",
332 &self.genomes_and_contigs.genomes[ref_idx],
333 std::str::from_utf8(&self.target_names[&interval.get_contig()])
334 .unwrap()
335 ),
336 e,
337 );
338 }
339 },
340 },
341 };
342 }
343
344 pub fn read_sequence_to_vec(&mut self) {
345 match self.indexed_reader.read(&mut self.current_sequence) {
346 Ok(reference) => reference,
347 Err(e) => {
348 panic!("Cannot read sequence from reference {:?}", e,);
349 }
350 };
351 }
352
353 pub fn split_contig_name<'b>(contig_name: &'b [u8], separator: u8) -> &'b [u8] {
357 match contig_name.into_iter().position(|&x| x == separator) {
358 Some(position) => return &contig_name[(position + 1)..contig_name.len()], None => return contig_name,
360 }
361 }
362}