sigalign_impl/sequence_storage/in_memory/
mod.rs

1use std::{io::Read, str::Utf8Error};
2
3use sigalign_core::reference::{
4    SequenceStorage,
5    SequenceBuffer,
6};
7use sigalign_utils::sequence_reader::{
8    SeqRecord, IdRecord,
9    fasta::FastaReader,
10    decompress::get_gzip_decoder,
11};
12
// TODO: Replace the derived `Debug` with a hand-written impl.
/// In-memory `SequenceStorage`: all targets are kept in one contiguous byte vector.
///
/// Sequences and labels are stored back to back in insertion order. The two
/// index vectors hold the boundaries between consecutive entries: both start
/// as `[0]` and receive one end offset per added target.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InMemoryStorage {
    /// Number of targets stored.
    target_count: usize,
    /// Sequences of all targets, concatenated.
    concatenated_sequence: Vec<u8>,
    /// Boundaries in `concatenated_sequence`: target `i` occupies
    /// `sequence_index[i]..sequence_index[i + 1]`.
    sequence_index: Vec<usize>,
    /// Labels of all targets, concatenated.
    concatenated_label: String,
    /// Boundaries in `concatenated_label`, laid out like `sequence_index`.
    label_index: Vec<usize>,
}
23
/// `SequenceBuffer` belonging to `InMemoryStorage`.
///
/// The buffer does not own any bytes: it is a raw `(pointer, len)` view into
/// the storage's concatenated sequence.
///
/// ## ⚠️CAUTION⚠️
/// Although this type implements `Send`, it is not thread-safe.
/// The `InMemoryStorage` it points into must not be dropped while the buffer
/// is still in use.
///
/// ## Safety
/// Raw pointers are used internally, so incorrect use leads to undefined
/// behavior. Make sure the `InMemoryStorage` outlives every use of the buffer;
/// otherwise the pointer dangles.
#[derive(Clone)]
pub struct InMemoryBuffer {
    /// Start of the viewed bytes.
    pointer: *const u8,
    /// Number of bytes viewed from `pointer`.
    len: usize,
}

// SAFETY: the buffer only reads through `pointer`. Moving it to another thread
// is sound as long as the storage it points into outlives the buffer and is not
// modified while the buffer is in use; callers must uphold both conditions.
unsafe impl Send for InMemoryBuffer {}
40
41// Sequence Storage
42impl SequenceStorage for InMemoryStorage {
43    type Buffer = InMemoryBuffer;
44
45    fn num_targets(&self) -> u32 {
46        self.target_count as u32
47    }
48    fn get_buffer(&self) -> Self::Buffer {
49        InMemoryBuffer {
50            pointer: self.concatenated_sequence.as_ptr(),
51            len: 0,
52        }
53    }
54    fn fill_buffer(&self, target_index: u32, buffer: &mut Self::Buffer) {
55        let start_index = self.sequence_index[target_index as usize];
56        buffer.pointer = &self.concatenated_sequence[start_index];
57        buffer.len = self.sequence_index[target_index as usize +1] - start_index;
58    }
59    fn get_concatenated_sequence_with_boundaries_of_targets(&self) -> (
60        Vec<u8>,
61        Vec<u32>,
62    ) {
63        let concatenated_sequence = self.concatenated_sequence.to_vec();
64        let boundaries = self.sequence_index.iter().map(|x| *x as u32).collect();
65        (concatenated_sequence, boundaries)
66    }
67}
68impl SequenceBuffer for InMemoryBuffer {
69    fn buffered_sequence(&self) -> &[u8] {
70        unsafe { std::slice::from_raw_parts(self.pointer, self.len) }
71    }
72}
73
74impl InMemoryStorage {
75    pub fn new() -> Self {
76        Self {
77            target_count: 0,
78            concatenated_sequence: Vec::new(),
79            sequence_index: vec![0],
80            concatenated_label: String::new(),
81            label_index: vec![0],
82        }
83    }
84    pub fn add_target(
85        &mut self,
86        label: &str,
87        sequence: &[u8],
88    ) {
89        self.target_count += 1;
90        self.concatenated_sequence.extend_from_slice(sequence);
91        self.sequence_index.push(self.concatenated_sequence.len());
92        self.concatenated_label.push_str(label);
93        self.label_index.push(self.concatenated_label.len());
94    }
95    pub fn add_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
96        let mut fasta_reader = FastaReader::new(reader);
97        while let Some(mut record) = fasta_reader.next() {
98            self.target_count += 1;
99            record.extend_seq_buf(&mut self.concatenated_sequence);
100            self.sequence_index.push(self.concatenated_sequence.len());
101            record.extend_id_string(&mut self.concatenated_label)?;
102            self.label_index.push(self.concatenated_label.len());
103        }
104        Ok(())
105    }
106    /// Get filled storages
107    /// Each storage has a total length of at most `max_length`
108    /// !If one record is longer than `max_length`, it will be in a storage of its own
109    pub fn fill_fasta_until_max_length<R: Read>(
110        &mut self,
111        reader: R,
112        max_length: u32,
113    ) -> Result<Vec<Self>, Utf8Error> {
114        let mut filled_storages = Vec::new();
115
116        let mut fasta_reader = FastaReader::new(reader);
117        let mut current_seq_length = self.get_total_length();
118        let mut seq_buffer = Vec::new();
119        
120        while let Some(mut record) = fasta_reader.next() {
121            record.extend_seq_buf(&mut seq_buffer);
122            let new_seq_length = seq_buffer.len() as u32;
123
124            if (current_seq_length != 0) && (current_seq_length + new_seq_length > max_length) {
125                let filled_storage = std::mem::replace(self, Self::new());
126                // Save current storage
127                filled_storages.push(filled_storage);
128                // Reset current storage
129                current_seq_length = 0;
130            }
131
132            // Add record to current storage
133            current_seq_length += new_seq_length;
134            self.target_count += 1;
135            self.concatenated_sequence.append(&mut seq_buffer);
136            self.sequence_index.push(self.concatenated_sequence.len());
137            record.extend_id_string(&mut self.concatenated_label)?;
138            self.label_index.push(self.concatenated_label.len());
139        }
140        Ok(filled_storages)
141    }
142    pub fn add_gzip_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
143        let decomp_reader = get_gzip_decoder(reader);
144        let mut fasta_reader = FastaReader::new(decomp_reader);
145        while let Some(mut record) = fasta_reader.next() {
146            self.target_count += 1;
147            record.extend_seq_buf(&mut self.concatenated_sequence);
148            self.sequence_index.push(self.concatenated_sequence.len());
149            record.extend_id_string(&mut self.concatenated_label)?;
150            self.label_index.push(self.concatenated_label.len());
151        }
152        Ok(())
153    }
154    pub fn merge(&mut self, other: Self) {
155        let Self {
156            target_count: other_target_count,
157            concatenated_sequence: mut other_combined_sequence,
158            sequence_index: other_sequence_index,
159            concatenated_label: other_combined_label,
160            label_index: other_label_index,
161        } = other;
162        // record_count
163        self.target_count += other_target_count;
164        // concatenated_sequence
165        self.concatenated_sequence.append(&mut other_combined_sequence);
166        // sequence_index
167        let last_seq_idx = *self.sequence_index.last().unwrap();
168        self.sequence_index.reserve(other_target_count);
169        other_sequence_index[1..].iter().for_each(|v| {
170            self.sequence_index.push(v+last_seq_idx);
171        });
172        // concatenated_label
173        self.concatenated_label.push_str(&other_combined_label);
174        // label_index
175        let last_label_idx = *self.label_index.last().unwrap();
176        self.label_index.reserve(other_target_count);
177        other_label_index[1..].iter().for_each(|v| {
178            self.label_index.push(v+last_label_idx);
179        });
180    }
181    pub fn get_sequence_safely(&self, target_index: u32) -> Option<Vec<u8>> {
182        if target_index as usize >= self.target_count {
183            return None
184        }
185        let mut buffer = self.get_buffer();
186        self.fill_buffer(target_index, &mut buffer);
187        let seq = buffer.buffered_sequence().to_vec();
188        Some(seq)
189    }
190    pub fn get_sequence_length_safely(&self, target_index: u32) -> Option<u32> {
191        if target_index as usize >= self.target_count {
192            return None
193        }
194        let start_index = self.sequence_index[target_index as usize];
195        let end_index = self.sequence_index[target_index as usize +1];
196        Some((end_index - start_index) as u32)
197    }
198    pub fn get_total_length(&self) -> u32 {
199        self.concatenated_sequence.len() as u32
200    }
201    /// Remove all labels
202    /// !Cannot be undone
203    pub fn remove_labels(&mut self) {
204        self.concatenated_label = String::new();
205        self.label_index = vec![0; self.target_count+1];
206    }
207    /// Set sequence to uppercase
208    /// !Cannot be undone
209    pub fn set_sequences_to_uppercase(&mut self) {
210        self.concatenated_sequence.make_ascii_uppercase();
211    }
212    /// Make all designated bases to defined base
213    /// !Cannot be undone
214    pub fn change_bases_to(&mut self, bases_to_change: &[u8], target_base: u8) {
215        let mut byte_mapper: [u8; 256] = [0; 256];
216        for (i, item) in byte_mapper.iter_mut().enumerate() {
217            *item = i as u8;
218        }
219        bases_to_change.iter().for_each(|v| {
220            byte_mapper[*v as usize] = target_base;
221        });
222        self.concatenated_sequence.iter_mut().for_each(|v| {
223            *v = byte_mapper[*v as usize];
224        });
225    }
226}
227
228impl InMemoryBuffer {
229    pub fn new() -> Self {
230        Self {
231            pointer: std::ptr::null(),
232            len: 0,
233        }
234    }
235}
236
237mod extensions;