sigalign_impl/sequence_storage/in_memory/
mod.rs1use std::{io::Read, str::Utf8Error};
2
3use sigalign_core::reference::{
4 SequenceStorage,
5 SequenceBuffer,
6};
7use sigalign_utils::sequence_reader::{
8 SeqRecord, IdRecord,
9 fasta::FastaReader,
10 decompress::get_gzip_decoder,
11};
12
#[derive(Debug, Clone, PartialEq, Eq)]
/// Sequence storage that keeps every target in memory, packed back to back.
///
/// All sequences live in one byte buffer and all labels in one string; the two
/// index vectors record where each target's data ends inside those buffers.
pub struct InMemoryStorage {
    /// Number of targets added so far.
    target_count: usize,
    /// Bytes of every target sequence, concatenated in insertion order.
    concatenated_sequence: Vec<u8>,
    /// Cumulative offsets into `concatenated_sequence`. Starts as `[0]` and
    /// gains one entry per target, so target `i` spans
    /// `sequence_index[i]..sequence_index[i + 1]`.
    sequence_index: Vec<usize>,
    /// Labels of every target, concatenated in insertion order.
    concatenated_label: String,
    /// Cumulative byte offsets into `concatenated_label`, laid out like
    /// `sequence_index`.
    label_index: Vec<usize>,
}
23
#[derive(Clone)]
/// Pointer-and-length view of one target sequence held by an `InMemoryStorage`.
///
/// The buffer does not own or copy the bytes; it only remembers where they are.
// NOTE(review): there is no lifetime tying this view to the storage it points
// into, so keeping the storage alive and unmodified is up to the caller.
pub struct InMemoryBuffer {
    /// Address of the first byte of the buffered sequence (null after `new()`).
    pointer: *const u8,
    /// Number of bytes readable starting at `pointer`.
    len: usize,
}
38
// `InMemoryBuffer` contains a raw pointer, which is not `Send` by default, so
// the marker is opted into manually here.
// NOTE(review): this is only sound if the storage the pointer refers to stays
// alive and is not mutated while the buffer is used on another thread; nothing
// in this file enforces that — confirm against the callers.
unsafe impl Send for InMemoryBuffer {}
40
41impl SequenceStorage for InMemoryStorage {
43 type Buffer = InMemoryBuffer;
44
45 fn num_targets(&self) -> u32 {
46 self.target_count as u32
47 }
48 fn get_buffer(&self) -> Self::Buffer {
49 InMemoryBuffer {
50 pointer: self.concatenated_sequence.as_ptr(),
51 len: 0,
52 }
53 }
54 fn fill_buffer(&self, target_index: u32, buffer: &mut Self::Buffer) {
55 let start_index = self.sequence_index[target_index as usize];
56 buffer.pointer = &self.concatenated_sequence[start_index];
57 buffer.len = self.sequence_index[target_index as usize +1] - start_index;
58 }
59 fn get_concatenated_sequence_with_boundaries_of_targets(&self) -> (
60 Vec<u8>,
61 Vec<u32>,
62 ) {
63 let concatenated_sequence = self.concatenated_sequence.to_vec();
64 let boundaries = self.sequence_index.iter().map(|x| *x as u32).collect();
65 (concatenated_sequence, boundaries)
66 }
67}
68impl SequenceBuffer for InMemoryBuffer {
69 fn buffered_sequence(&self) -> &[u8] {
70 unsafe { std::slice::from_raw_parts(self.pointer, self.len) }
71 }
72}
73
74impl InMemoryStorage {
75 pub fn new() -> Self {
76 Self {
77 target_count: 0,
78 concatenated_sequence: Vec::new(),
79 sequence_index: vec![0],
80 concatenated_label: String::new(),
81 label_index: vec![0],
82 }
83 }
84 pub fn add_target(
85 &mut self,
86 label: &str,
87 sequence: &[u8],
88 ) {
89 self.target_count += 1;
90 self.concatenated_sequence.extend_from_slice(sequence);
91 self.sequence_index.push(self.concatenated_sequence.len());
92 self.concatenated_label.push_str(label);
93 self.label_index.push(self.concatenated_label.len());
94 }
95 pub fn add_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
96 let mut fasta_reader = FastaReader::new(reader);
97 while let Some(mut record) = fasta_reader.next() {
98 self.target_count += 1;
99 record.extend_seq_buf(&mut self.concatenated_sequence);
100 self.sequence_index.push(self.concatenated_sequence.len());
101 record.extend_id_string(&mut self.concatenated_label)?;
102 self.label_index.push(self.concatenated_label.len());
103 }
104 Ok(())
105 }
106 pub fn fill_fasta_until_max_length<R: Read>(
110 &mut self,
111 reader: R,
112 max_length: u32,
113 ) -> Result<Vec<Self>, Utf8Error> {
114 let mut filled_storages = Vec::new();
115
116 let mut fasta_reader = FastaReader::new(reader);
117 let mut current_seq_length = self.get_total_length();
118 let mut seq_buffer = Vec::new();
119
120 while let Some(mut record) = fasta_reader.next() {
121 record.extend_seq_buf(&mut seq_buffer);
122 let new_seq_length = seq_buffer.len() as u32;
123
124 if (current_seq_length != 0) && (current_seq_length + new_seq_length > max_length) {
125 let filled_storage = std::mem::replace(self, Self::new());
126 filled_storages.push(filled_storage);
128 current_seq_length = 0;
130 }
131
132 current_seq_length += new_seq_length;
134 self.target_count += 1;
135 self.concatenated_sequence.append(&mut seq_buffer);
136 self.sequence_index.push(self.concatenated_sequence.len());
137 record.extend_id_string(&mut self.concatenated_label)?;
138 self.label_index.push(self.concatenated_label.len());
139 }
140 Ok(filled_storages)
141 }
142 pub fn add_gzip_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
143 let decomp_reader = get_gzip_decoder(reader);
144 let mut fasta_reader = FastaReader::new(decomp_reader);
145 while let Some(mut record) = fasta_reader.next() {
146 self.target_count += 1;
147 record.extend_seq_buf(&mut self.concatenated_sequence);
148 self.sequence_index.push(self.concatenated_sequence.len());
149 record.extend_id_string(&mut self.concatenated_label)?;
150 self.label_index.push(self.concatenated_label.len());
151 }
152 Ok(())
153 }
154 pub fn merge(&mut self, other: Self) {
155 let Self {
156 target_count: other_target_count,
157 concatenated_sequence: mut other_combined_sequence,
158 sequence_index: other_sequence_index,
159 concatenated_label: other_combined_label,
160 label_index: other_label_index,
161 } = other;
162 self.target_count += other_target_count;
164 self.concatenated_sequence.append(&mut other_combined_sequence);
166 let last_seq_idx = *self.sequence_index.last().unwrap();
168 self.sequence_index.reserve(other_target_count);
169 other_sequence_index[1..].iter().for_each(|v| {
170 self.sequence_index.push(v+last_seq_idx);
171 });
172 self.concatenated_label.push_str(&other_combined_label);
174 let last_label_idx = *self.label_index.last().unwrap();
176 self.label_index.reserve(other_target_count);
177 other_label_index[1..].iter().for_each(|v| {
178 self.label_index.push(v+last_label_idx);
179 });
180 }
181 pub fn get_sequence_safely(&self, target_index: u32) -> Option<Vec<u8>> {
182 if target_index as usize >= self.target_count {
183 return None
184 }
185 let mut buffer = self.get_buffer();
186 self.fill_buffer(target_index, &mut buffer);
187 let seq = buffer.buffered_sequence().to_vec();
188 Some(seq)
189 }
190 pub fn get_sequence_length_safely(&self, target_index: u32) -> Option<u32> {
191 if target_index as usize >= self.target_count {
192 return None
193 }
194 let start_index = self.sequence_index[target_index as usize];
195 let end_index = self.sequence_index[target_index as usize +1];
196 Some((end_index - start_index) as u32)
197 }
198 pub fn get_total_length(&self) -> u32 {
199 self.concatenated_sequence.len() as u32
200 }
201 pub fn remove_labels(&mut self) {
204 self.concatenated_label = String::new();
205 self.label_index = vec![0; self.target_count+1];
206 }
207 pub fn set_sequences_to_uppercase(&mut self) {
210 self.concatenated_sequence.make_ascii_uppercase();
211 }
212 pub fn change_bases_to(&mut self, bases_to_change: &[u8], target_base: u8) {
215 let mut byte_mapper: [u8; 256] = [0; 256];
216 for (i, item) in byte_mapper.iter_mut().enumerate() {
217 *item = i as u8;
218 }
219 bases_to_change.iter().for_each(|v| {
220 byte_mapper[*v as usize] = target_base;
221 });
222 self.concatenated_sequence.iter_mut().for_each(|v| {
223 *v = byte_mapper[*v as usize];
224 });
225 }
226}
227
228impl InMemoryBuffer {
229 pub fn new() -> Self {
230 Self {
231 pointer: std::ptr::null(),
232 len: 0,
233 }
234 }
235}
236
237mod extensions;