sigalign_impl/sequence_storage/in_memory/
mod.rs1use std::{io::Read, str::Utf8Error};
2
3use sigalign_core::reference::{
4 SequenceStorage,
5 SequenceBuffer,
6};
7use sigalign_utils::sequence_reader::{
8 SeqRecord, IdRecord,
9 fasta::FastaReader,
10 decompress::get_gzip_decoder,
11};
12
/// Sequence storage that keeps every target in memory, back to back.
///
/// Sequences and labels are each stored as one concatenated buffer; the
/// matching `*_index` vector records the boundaries, so entry `i` is where
/// target `i` starts and entry `i + 1` is where it ends.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InMemoryStorage {
    /// Number of targets added so far.
    target_count: usize,
    /// Bytes of all target sequences, concatenated in insertion order.
    concatenated_sequence: Vec<u8>,
    /// Boundaries into `concatenated_sequence`; starts as `[0]` and gains
    /// one end offset per target.
    sequence_index: Vec<usize>,
    /// All target labels, concatenated in insertion order.
    concatenated_label: String,
    /// Byte boundaries into `concatenated_label`; starts as `[0]` and gains
    /// one end offset per target.
    label_index: Vec<usize>,
}
23
/// View of a single target sequence, stored as a raw start pointer plus a
/// byte count.
///
/// The pointer carries no lifetime, so the compiler cannot check that the
/// bytes it refers to are still alive when the buffer is read.
#[derive(Clone)]
pub struct InMemoryBuffer {
    /// Address of the first byte of the buffered sequence.
    pointer: *const u8,
    /// Number of bytes readable from `pointer`.
    len: usize,
}
30
31impl SequenceStorage for InMemoryStorage {
33 type Buffer = InMemoryBuffer;
34
35 fn num_targets(&self) -> u32 {
36 self.target_count as u32
37 }
38 fn get_buffer(&self) -> Self::Buffer {
39 InMemoryBuffer {
40 pointer: self.concatenated_sequence.as_ptr(),
41 len: 0,
42 }
43 }
44 fn fill_buffer(&self, target_index: u32, buffer: &mut Self::Buffer) {
45 let start_index = self.sequence_index[target_index as usize];
46 buffer.pointer = &self.concatenated_sequence[start_index];
47 buffer.len = self.sequence_index[target_index as usize +1] - start_index;
48 }
49 fn get_concatenated_sequence_with_boundaries_of_targets(&self) -> (
50 Vec<u8>,
51 Vec<u32>,
52 ) {
53 let concatenated_sequence = self.concatenated_sequence.to_vec();
54 let boundaries = self.sequence_index.iter().map(|x| *x as u32).collect();
55 (concatenated_sequence, boundaries)
56 }
57}
58impl SequenceBuffer for InMemoryBuffer {
59 fn buffered_sequence(&self) -> &[u8] {
60 unsafe { std::slice::from_raw_parts(self.pointer, self.len) }
61 }
62}
63
64impl InMemoryStorage {
65 pub fn new() -> Self {
66 Self {
67 target_count: 0,
68 concatenated_sequence: Vec::new(),
69 sequence_index: vec![0],
70 concatenated_label: String::new(),
71 label_index: vec![0],
72 }
73 }
74 pub fn add_target(
75 &mut self,
76 label: &str,
77 sequence: &[u8],
78 ) {
79 self.target_count += 1;
80 self.concatenated_sequence.extend_from_slice(sequence);
81 self.sequence_index.push(self.concatenated_sequence.len());
82 self.concatenated_label.push_str(label);
83 self.label_index.push(self.concatenated_label.len());
84 }
85 pub fn add_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
86 let mut fasta_reader = FastaReader::new(reader);
87 while let Some(mut record) = fasta_reader.next() {
88 self.target_count += 1;
89 record.extend_seq_buf(&mut self.concatenated_sequence);
90 self.sequence_index.push(self.concatenated_sequence.len());
91 record.extend_id_string(&mut self.concatenated_label)?;
92 self.label_index.push(self.concatenated_label.len());
93 }
94 Ok(())
95 }
96 pub fn fill_fasta_until_max_length<R: Read>(
100 &mut self,
101 reader: R,
102 max_length: u32,
103 ) -> Result<Vec<Self>, Utf8Error> {
104 let mut filled_storages = Vec::new();
105
106 let mut fasta_reader = FastaReader::new(reader);
107 let mut current_seq_length = self.get_total_length();
108 let mut seq_buffer = Vec::new();
109
110 while let Some(mut record) = fasta_reader.next() {
111 record.extend_seq_buf(&mut seq_buffer);
112 let new_seq_length = seq_buffer.len() as u32;
113
114 if (current_seq_length != 0) && (current_seq_length + new_seq_length > max_length) {
115 let filled_storage = std::mem::replace(self, Self::new());
116 filled_storages.push(filled_storage);
118 current_seq_length = 0;
120 }
121
122 current_seq_length += new_seq_length;
124 self.target_count += 1;
125 self.concatenated_sequence.append(&mut seq_buffer);
126 self.sequence_index.push(self.concatenated_sequence.len());
127 record.extend_id_string(&mut self.concatenated_label)?;
128 self.label_index.push(self.concatenated_label.len());
129 }
130 Ok(filled_storages)
131 }
132 pub fn add_gzip_fasta<R: Read>(&mut self, reader: R) -> Result<(), Utf8Error> {
133 let decomp_reader = get_gzip_decoder(reader);
134 let mut fasta_reader = FastaReader::new(decomp_reader);
135 while let Some(mut record) = fasta_reader.next() {
136 self.target_count += 1;
137 record.extend_seq_buf(&mut self.concatenated_sequence);
138 self.sequence_index.push(self.concatenated_sequence.len());
139 record.extend_id_string(&mut self.concatenated_label)?;
140 self.label_index.push(self.concatenated_label.len());
141 }
142 Ok(())
143 }
144 pub fn merge(&mut self, other: Self) {
145 let Self {
146 target_count: other_target_count,
147 concatenated_sequence: mut other_combined_sequence,
148 sequence_index: other_sequence_index,
149 concatenated_label: other_combined_label,
150 label_index: other_label_index,
151 } = other;
152 self.target_count += other_target_count;
154 self.concatenated_sequence.append(&mut other_combined_sequence);
156 let last_seq_idx = *self.sequence_index.last().unwrap();
158 self.sequence_index.reserve(other_target_count);
159 other_sequence_index[1..].iter().for_each(|v| {
160 self.sequence_index.push(v+last_seq_idx);
161 });
162 self.concatenated_label.push_str(&other_combined_label);
164 let last_label_idx = *self.label_index.last().unwrap();
166 self.label_index.reserve(other_target_count);
167 other_label_index[1..].iter().for_each(|v| {
168 self.label_index.push(v+last_label_idx);
169 });
170 }
171 pub fn get_sequence_safely(&self, target_index: u32) -> Option<Vec<u8>> {
172 if target_index as usize >= self.target_count {
173 return None
174 }
175 let mut buffer = self.get_buffer();
176 self.fill_buffer(target_index, &mut buffer);
177 let seq = buffer.buffered_sequence().to_vec();
178 Some(seq)
179 }
180 pub fn get_total_length(&self) -> u32 {
181 self.concatenated_sequence.len() as u32
182 }
183 pub fn remove_labels(&mut self) {
186 self.concatenated_label = String::new();
187 self.label_index = vec![0; self.target_count+1];
188 }
189 pub fn set_sequences_to_uppercase(&mut self) {
192 self.concatenated_sequence.make_ascii_uppercase();
193 }
194 pub fn change_bases_to(&mut self, bases_to_change: &[u8], target_base: u8) {
197 let mut byte_mapper: [u8; 256] = [0; 256];
198 for (i, item) in byte_mapper.iter_mut().enumerate() {
199 *item = i as u8;
200 }
201 bases_to_change.iter().for_each(|v| {
202 byte_mapper[*v as usize] = target_base;
203 });
204 self.concatenated_sequence.iter_mut().for_each(|v| {
205 *v = byte_mapper[*v as usize];
206 });
207 }
208}
209
210impl InMemoryBuffer {
211 pub fn new() -> Self {
212 Self {
213 pointer: std::ptr::null(),
214 len: 0,
215 }
216 }
217}
218
219mod extensions;