lorikeet_genome/reference/
reference_reader.rs

1use bio::io::fasta::IndexedReader;
2use hashlink::LinkedHashSet;
3use std::cmp::min;
4use std::collections::HashMap;
5use std::fs::File;
6
7use crate::reference::reference_reader_utils::GenomesAndContigs;
8use crate::reference::reference_reader_utils::ReferenceReaderUtils;
9use crate::utils::simple_interval::{Locatable, SimpleInterval};
10
11/**
12* Struct handling methods to read and handle information for references
13* indexed_reader represents the #[IndexedReader<File>]
14* current_sequence is the byte array current being read from the indexed_reader file
15* genomes_and_contigs holds the #[GenomesAndContigs] struct matching contig string names to tids
16* reference index to tid holds a reference index matched to a LinkedHashSet of all associated tids
17* target_names matches the tid to a byte array representation of the contigs name in the Fasta file
18* target_lens matches the tid of a contig to its observed length in the Fasta file in base pairs
19*/
20#[derive(Debug)]
21pub struct ReferenceReader {
22    indexed_reader: IndexedReader<File>,
23    pub current_sequence: Vec<u8>,
24    pub genomes_and_contigs: GenomesAndContigs,
25    reference_index_to_tid: HashMap<usize, LinkedHashSet<usize>>,
26    target_names: HashMap<usize, Vec<u8>>,
27    pub target_lens: HashMap<usize, u64>,
28    genome_path: Option<String>,
29}
30
31impl Clone for ReferenceReader {
32    fn clone(&self) -> Self {
33        let indexed_reader = ReferenceReaderUtils::retrieve_reference(&self.genome_path);
34
35        Self {
36            indexed_reader,
37            // current_sequence: self.current_sequence.clone(),
38            current_sequence: Vec::new(),
39            genomes_and_contigs: self.genomes_and_contigs.clone(),
40            reference_index_to_tid: self.reference_index_to_tid.clone(),
41            target_names: self.target_names.clone(),
42            target_lens: self.target_lens.clone(),
43            genome_path: self.genome_path.clone(),
44        }
45    }
46}
47
48impl ReferenceReader {
49    pub fn new(
50        concatenated_genomes: &Option<String>,
51        genomes_and_contigs: GenomesAndContigs,
52        _number_of_contigs: usize,
53    ) -> ReferenceReader {
54        let indexed_reader = ReferenceReaderUtils::retrieve_reference(concatenated_genomes);
55
56        ReferenceReader {
57            indexed_reader,
58            current_sequence: Vec::new(),
59            reference_index_to_tid: HashMap::new(),
60            target_names: HashMap::new(),
61            target_lens: HashMap::new(),
62            genomes_and_contigs,
63            genome_path: concatenated_genomes.clone(),
64        }
65    }
66
67    pub fn new_with_target_names(
68        concatenated_genomes: &Option<String>,
69        genomes_and_contigs: GenomesAndContigs,
70        target_names: Vec<&[u8]>,
71    ) -> Self {
72        let indexed_reader = ReferenceReaderUtils::retrieve_reference(concatenated_genomes);
73
74        Self {
75            indexed_reader,
76            current_sequence: Vec::new(),
77            reference_index_to_tid: HashMap::new(),
78            target_names: target_names
79                .into_iter()
80                .enumerate()
81                .map(|(tid, target)| (tid, target.to_vec()))
82                .collect::<HashMap<usize, Vec<u8>>>(),
83            genomes_and_contigs,
84            target_lens: HashMap::new(),
85            genome_path: concatenated_genomes.clone(),
86        }
87    }
88
89    /// Somewhat efficient cloning function. Only takes the info needed for a given ref idx
90    pub fn new_from_reader_with_reference_index(reader: &Self, ref_idx: usize) -> Self {
91        let indexed_reader = ReferenceReaderUtils::retrieve_reference(&reader.genome_path);
92
93        if reader.reference_index_to_tid.contains_key(&ref_idx) {
94            let ref_tids = reader.reference_index_to_tid.get(&ref_idx).unwrap().clone();
95            let mut target_names = HashMap::with_capacity(ref_tids.len());
96            let mut target_lens = HashMap::with_capacity(ref_tids.len());
97            ref_tids.iter().for_each(|tid| {
98                target_names.insert(*tid, reader.target_names.get(tid).unwrap().clone());
99                target_lens.insert(*tid, reader.target_lens.get(tid).unwrap().clone());
100            });
101
102            let mut ref_idx_to_tid = HashMap::new();
103            ref_idx_to_tid.insert(ref_idx, ref_tids);
104
105            let genomes_and_contigs = GenomesAndContigs::new();
106
107            Self {
108                indexed_reader,
109                current_sequence: Vec::new(),
110                reference_index_to_tid: ref_idx_to_tid,
111                genomes_and_contigs,
112                target_lens,
113                target_names,
114                genome_path: reader.genome_path.clone(),
115            }
116        } else {
117            reader.clone()
118        }
119    }
120
121    pub fn new_from_reader_with_tid_and_rid(
122        reader: &ReferenceReader,
123        ref_idx: usize,
124        tid: usize,
125    ) -> Self {
126        let indexed_reader = ReferenceReaderUtils::retrieve_reference(&reader.genome_path);
127
128        if reader.reference_index_to_tid.contains_key(&ref_idx) {
129            let mut target_names = HashMap::with_capacity(1);
130            let mut target_lens = HashMap::with_capacity(1);
131            target_names.insert(tid, reader.target_names.get(&tid).unwrap().clone());
132            target_lens.insert(tid, reader.target_lens.get(&tid).unwrap().clone());
133
134            let mut ref_idx_to_tid = HashMap::new();
135            let mut tids = LinkedHashSet::with_capacity(1);
136            tids.insert(tid);
137            ref_idx_to_tid.insert(ref_idx, tids);
138
139            let genomes_and_contigs = GenomesAndContigs::new();
140
141            Self {
142                indexed_reader,
143                current_sequence: Vec::new(),
144                reference_index_to_tid: ref_idx_to_tid,
145                genomes_and_contigs,
146                target_lens,
147                target_names,
148                genome_path: reader.genome_path.clone(),
149            }
150        } else {
151            reader.clone()
152        }
153    }
154
155    pub fn get_target_name(&self, tid: usize) -> &[u8] {
156        self.target_names.get(&tid).unwrap()
157    }
158
159    pub fn add_target(&mut self, target: &[u8], tid: usize) {
160        self.target_names.insert(tid, target.to_vec());
161    }
162
163    pub fn update_ref_index_tids(&mut self, ref_index: usize, tid: usize) -> usize {
164        let tids = self
165            .reference_index_to_tid
166            .entry(ref_index)
167            .or_insert_with(LinkedHashSet::new);
168        tids.insert(tid);
169        tids.len()
170    }
171
172    pub fn retrieve_tids_for_ref_index(&self, ref_index: usize) -> Option<&LinkedHashSet<usize>> {
173        self.reference_index_to_tid.get(&ref_index)
174    }
175
176    pub fn add_length(&mut self, tid: usize, length: u64) {
177        self.target_lens.insert(tid, length);
178    }
179
180    pub fn add_lengths(&mut self, target_lengths: HashMap<usize, u64>) {
181        self.target_lens = target_lengths
182    }
183
184    pub fn get_contig_length(&self, tid: usize) -> u64 {
185        *self.target_lens.get(&tid).unwrap_or(&0)
186    }
187
188    pub fn retrieve_reference_stem(&self, ref_idx: usize) -> String {
189        self.genomes_and_contigs.genomes[ref_idx].clone()
190    }
191
192    pub fn update_current_sequence_capacity(&mut self, size: usize) {
193        self.current_sequence = Vec::with_capacity(size);
194    }
195
196    pub fn update_current_sequence_without_capacity(&mut self) {
197        self.current_sequence = Vec::new();
198    }
199
200    pub fn retrieve_contig_name_from_tid(&self, tid: usize) -> Option<&Vec<u8>> {
201        return self.target_names.get(&tid);
202    }
203
204    pub fn fetch_contig_from_reference_by_contig_name(
205        &mut self,
206        contig_name: &[u8],
207        ref_idx: usize,
208    ) {
209        match self
210            .indexed_reader
211            .fetch_all(std::str::from_utf8(contig_name).unwrap())
212        {
213            Ok(reference) => reference,
214            Err(_e) => match self.indexed_reader.fetch_all(&format!(
215                "{}",
216                std::str::from_utf8(contig_name)
217                    .unwrap()
218                    .splitn(2, '~')
219                    .nth(1)
220                    .unwrap()
221            )) {
222                Ok(reference) => reference,
223                Err(_e) => {
224                    match self.indexed_reader.fetch_all(&format!(
225                        "{}~{}",
226                        self.genomes_and_contigs.genomes[ref_idx],
227                        std::str::from_utf8(contig_name).unwrap()
228                    )) {
229                        Ok(reference) => reference,
230                        Err(e) => {
231                            panic!(
232                                "Cannot read sequence from {}: {}",
233                                std::str::from_utf8(contig_name).unwrap(),
234                                e
235                            )
236                        }
237                    }
238                }
239            },
240        };
241    }
242
243    pub fn fetch_contig_from_reference_by_tid(
244        &mut self,
245        tid: usize,
246        ref_idx: usize,
247    ) -> Result<(), std::io::Error> {
248        match self
249            .indexed_reader
250            .fetch_all(std::str::from_utf8(&self.target_names[&tid]).unwrap())
251        {
252            Ok(reference) => Ok(reference),
253            Err(_e) => match self.indexed_reader.fetch_all(&format!(
254                "{}",
255                match std::str::from_utf8(&self.target_names[&tid])
256                    .unwrap()
257                    .splitn(2, '~')
258                    .nth(1)
259                {
260                    None => std::str::from_utf8(&self.target_names[&tid]).unwrap(),
261                    Some(contig) => contig,
262                }
263            )) {
264                Ok(reference) => Ok(reference),
265                Err(_e) => {
266                    match self.indexed_reader.fetch_all(&format!(
267                        "{}~{}",
268                        self.genomes_and_contigs.genomes[ref_idx],
269                        std::str::from_utf8(&self.target_names[&tid]).unwrap()
270                    )) {
271                        Ok(reference) => Ok(reference),
272                        Err(e) => {
273                            // debug!(
274                            //     "Cannot read sequence from {}: {}",
275                            //     std::str::from_utf8(&self.target_names[&tid]).unwrap(),
276                            //     e
277                            // );
278                            Err(e)
279                        }
280                    }
281                }
282            },
283        }
284    }
285
286    /// Fetches the reference sequence from a given SimpleInterval
287    /// The return position is 0-base start and stop inclusive
288    pub fn fetch_reference_context(&mut self, ref_idx: usize, interval: &SimpleInterval) {
289        match self.indexed_reader.fetch(
290            std::str::from_utf8(&self.target_names[&interval.get_contig()]).unwrap(),
291            interval.get_start() as u64,
292            min(
293                (interval.get_end() + 1) as u64,
294                *self.target_lens.get(&interval.get_contig()).unwrap(),
295            ),
296        ) {
297            Ok(reference) => reference,
298            Err(_e) => match self.indexed_reader.fetch(
299                &format!(
300                    "{}",
301                    std::str::from_utf8(&self.target_names[&interval.get_contig()])
302                        .unwrap()
303                        .splitn(2, '~')
304                        .nth(1)
305                        .unwrap()
306                ),
307                interval.get_start() as u64,
308                min(
309                    (interval.get_end() + 1) as u64,
310                    *self.target_lens.get(&interval.get_contig()).unwrap(),
311                ),
312            ) {
313                Ok(reference) => reference,
314                Err(_e) => match self.indexed_reader.fetch(
315                    &format!(
316                        "{}~{}",
317                        self.genomes_and_contigs.genomes[ref_idx],
318                        std::str::from_utf8(&self.target_names[&interval.get_contig()]).unwrap()
319                    ),
320                    interval.get_start() as u64,
321                    min(
322                        (interval.get_end() + 1) as u64,
323                        *self.target_lens.get(&interval.get_contig()).unwrap(),
324                    ),
325                ) {
326                    Ok(reference) => reference,
327                    Err(e) => {
328                        panic!(
329                            "Cannot read sequence from reference {} {:?}",
330                            format!(
331                                "{}~{}",
332                                &self.genomes_and_contigs.genomes[ref_idx],
333                                std::str::from_utf8(&self.target_names[&interval.get_contig()])
334                                    .unwrap()
335                            ),
336                            e,
337                        );
338                    }
339                },
340            },
341        };
342    }
343
344    pub fn read_sequence_to_vec(&mut self) {
345        match self.indexed_reader.read(&mut self.current_sequence) {
346            Ok(reference) => reference,
347            Err(e) => {
348                panic!("Cannot read sequence from reference {:?}", e,);
349            }
350        };
351    }
352
353    /**
354     * Takes a contig name &[u8] and splits around a suspected separator character
355     */
356    pub fn split_contig_name<'b>(contig_name: &'b [u8], separator: u8) -> &'b [u8] {
357        match contig_name.into_iter().position(|&x| x == separator) {
358            Some(position) => return &contig_name[(position + 1)..contig_name.len()], // + 1 because we do  not want to include the sep
359            None => return contig_name,
360        }
361    }
362}