// sigalign_impl/sequence_storage/in_memory/mod.rs

1use std::{io::Read, str::Utf8Error};
2
3use sigalign_core::reference::{
4    SequenceStorage,
5    SequenceBuffer,
6};
7use sigalign_utils::sequence_reader::{
8    SeqRecord, IdRecord,
9    fasta::FastaReader,
10    decompress::get_gzip_decoder,
11};
12
// TODO: Debug impl manually
/// `SequenceStorage` that stores sequences in memory.
///
/// All sequences live back to back in one byte vector and all labels in one
/// string; the two index vectors hold the boundaries between targets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InMemoryStorage {
    /// Number of targets stored so far.
    target_count: usize,
    /// Bytes of every target sequence, concatenated in insertion order.
    concatenated_sequence: Vec<u8>,
    /// Boundaries into `concatenated_sequence`: starts as `[0]` and gains the
    /// running end offset of each added target, so target `i` occupies
    /// `sequence_index[i]..sequence_index[i + 1]`.
    sequence_index: Vec<usize>,
    /// Labels of every target, concatenated in insertion order.
    concatenated_label: String,
    /// Boundaries into `concatenated_label`, laid out like `sequence_index`.
    label_index: Vec<usize>,
}
23
/// `SequenceBuffer` for `InMemoryStorage`.
///
/// The buffer does not own any bytes: it is a raw `(pointer, len)` view.
/// NOTE(review): nothing in this type ties the pointer to the lifetime of the
/// storage it was filled from, so the caller has to keep that storage alive
/// and unmodified while the view is read.
#[derive(Clone)]
pub struct InMemoryBuffer {
    /// Address of the first byte of the viewed sequence.
    pointer: *const u8,
    /// Number of bytes readable from `pointer`.
    len: usize,
}
30
31// Sequence Storage
32impl SequenceStorage for InMemoryStorage {
33    type Buffer = InMemoryBuffer;
34
35    fn num_targets(&self) -> u32 {
36        self.target_count as u32
37    }
38    fn get_buffer(&self) -> Self::Buffer {
39        InMemoryBuffer {
40            pointer: self.concatenated_sequence.as_ptr(),
41            len: 0,
42        }
43    }
44    fn fill_buffer(&self, target_index: u32, buffer: &mut Self::Buffer) {
45        let start_index = self.sequence_index[target_index as usize];
46        buffer.pointer = &self.concatenated_sequence[start_index];
47        buffer.len = self.sequence_index[target_index as usize +1] - start_index;
48    }
49    fn get_concatenated_sequence_with_boundaries_of_targets(&self) -> (
50        Vec<u8>,
51        Vec<u32>,
52    ) {
53        let concatenated_sequence = self.concatenated_sequence.to_vec();
54        let boundaries = self.sequence_index.iter().map(|x| *x as u32).collect();
55        (concatenated_sequence, boundaries)
56    }
57}
58impl SequenceBuffer for InMemoryBuffer {
59    fn buffered_sequence(&self) -> &[u8] {
60        unsafe { std::slice::from_raw_parts(self.pointer, self.len) }
61    }
62}
63
64impl InMemoryStorage {
65    pub fn new() -> Self {
66        Self {
67            target_count: 0,
68            concatenated_sequence: Vec::new(),
69            sequence_index: vec![0],
70            concatenated_label: String::new(),
71            label_index: vec![0],
72        }
73    }
74    pub fn add_target(
75        &mut self,
76        label: &str,
77        sequence: &[u8],
78    ) {
79        self.target_count += 1;
80        self.concatenated_sequence.extend_from_slice(sequence);
81        self.sequence_index.push(self.concatenated_sequence.len());
82        self.concatenated_label.push_str(label);
83        self.label_index.push(self.concatenated_label.len());
84    }
85    pub fn add_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
86        let mut fasta_reader = FastaReader::new(reader);
87        while let Some(mut record) = fasta_reader.next() {
88            self.target_count += 1;
89            record.extend_seq_buf(&mut self.concatenated_sequence);
90            self.sequence_index.push(self.concatenated_sequence.len());
91            record.extend_id_string(&mut self.concatenated_label)?;
92            self.label_index.push(self.concatenated_label.len());
93        }
94        Ok(())
95    }
96    /// Get filled storages
97    /// Each storage has a total length of at most `max_length`
98    /// !If one record is longer than `max_length`, it will be in a storage of its own
99    pub fn fill_fasta_until_max_length<R: Read>(
100        &mut self,
101        reader: R,
102        max_length: u32,
103    ) -> Result<Vec<Self>, Utf8Error> {
104        let mut filled_storages = Vec::new();
105
106        let mut fasta_reader = FastaReader::new(reader);
107        let mut current_seq_length = self.get_total_length();
108        let mut seq_buffer = Vec::new();
109        
110        while let Some(mut record) = fasta_reader.next() {
111            record.extend_seq_buf(&mut seq_buffer);
112            let new_seq_length = seq_buffer.len() as u32;
113
114            if (current_seq_length != 0) && (current_seq_length + new_seq_length > max_length) {
115                let filled_storage = std::mem::replace(self, Self::new());
116                // Save current storage
117                filled_storages.push(filled_storage);
118                // Reset current storage
119                current_seq_length = 0;
120            }
121
122            // Add record to current storage
123            current_seq_length += new_seq_length;
124            self.target_count += 1;
125            self.concatenated_sequence.append(&mut seq_buffer);
126            self.sequence_index.push(self.concatenated_sequence.len());
127            record.extend_id_string(&mut self.concatenated_label)?;
128            self.label_index.push(self.concatenated_label.len());
129        }
130        Ok(filled_storages)
131    }
132    pub fn add_gzip_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
133        let decomp_reader = get_gzip_decoder(reader);
134        let mut fasta_reader = FastaReader::new(decomp_reader);
135        while let Some(mut record) = fasta_reader.next() {
136            self.target_count += 1;
137            record.extend_seq_buf(&mut self.concatenated_sequence);
138            self.sequence_index.push(self.concatenated_sequence.len());
139            record.extend_id_string(&mut self.concatenated_label)?;
140            self.label_index.push(self.concatenated_label.len());
141        }
142        Ok(())
143    }
144    pub fn merge(&mut self, other: Self) {
145        let Self {
146            target_count: other_target_count,
147            concatenated_sequence: mut other_combined_sequence,
148            sequence_index: other_sequence_index,
149            concatenated_label: other_combined_label,
150            label_index: other_label_index,
151        } = other;
152        // record_count
153        self.target_count += other_target_count;
154        // concatenated_sequence
155        self.concatenated_sequence.append(&mut other_combined_sequence);
156        // sequence_index
157        let last_seq_idx = *self.sequence_index.last().unwrap();
158        self.sequence_index.reserve(other_target_count);
159        other_sequence_index[1..].iter().for_each(|v| {
160            self.sequence_index.push(v+last_seq_idx);
161        });
162        // concatenated_label
163        self.concatenated_label.push_str(&other_combined_label);
164        // label_index
165        let last_label_idx = *self.label_index.last().unwrap();
166        self.label_index.reserve(other_target_count);
167        other_label_index[1..].iter().for_each(|v| {
168            self.label_index.push(v+last_label_idx);
169        });
170    }
171    pub fn get_sequence_safely(&self, target_index: u32) -> Option<Vec<u8>> {
172        if target_index as usize >= self.target_count {
173            return None
174        }
175        let mut buffer = self.get_buffer();
176        self.fill_buffer(target_index, &mut buffer);
177        let seq = buffer.buffered_sequence().to_vec();
178        Some(seq)
179    }
180    pub fn get_total_length(&self) -> u32 {
181        self.concatenated_sequence.len() as u32
182    }
183    /// Remove all labels
184    /// !Cannot be undone
185    pub fn remove_labels(&mut self) {
186        self.concatenated_label = String::new();
187        self.label_index = vec![0; self.target_count+1];
188    }
189    /// Set sequence to uppercase
190    /// !Cannot be undone
191    pub fn set_sequences_to_uppercase(&mut self) {
192        self.concatenated_sequence.make_ascii_uppercase();
193    }
194    /// Make all designated bases to defined base
195    /// !Cannot be undone
196    pub fn change_bases_to(&mut self, bases_to_change: &[u8], target_base: u8) {
197        let mut byte_mapper: [u8; 256] = [0; 256];
198        for (i, item) in byte_mapper.iter_mut().enumerate() {
199            *item = i as u8;
200        }
201        bases_to_change.iter().for_each(|v| {
202            byte_mapper[*v as usize] = target_base;
203        });
204        self.concatenated_sequence.iter_mut().for_each(|v| {
205            *v = byte_mapper[*v as usize];
206        });
207    }
208}
209
210impl InMemoryBuffer {
211    pub fn new() -> Self {
212        Self {
213            pointer: std::ptr::null(),
214            len: 0,
215        }
216    }
217}
218
219mod extensions;