gtars_refget/digest/
encoder.rs1use super::alphabet;
4use super::alphabet::Alphabet;
5
6pub struct SequenceEncoder {
28 alphabet: &'static alphabet::Alphabet, encoded_sequence: Vec<u8>, bit_pos: usize, buffer: u64, buffer_bits: usize, }
34
35impl SequenceEncoder {
36 pub fn new(alphabet_type: alphabet::AlphabetType, length: usize) -> Self {
37 let alphabet = alphabet::lookup_alphabet(&alphabet_type);
38 let bits_per_symbol = alphabet.bits_per_symbol;
39 let estimated_bytes = (length * bits_per_symbol).div_ceil(8);
40
41 SequenceEncoder {
42 alphabet,
43 encoded_sequence: Vec::with_capacity(estimated_bytes),
44 bit_pos: 0,
45 buffer: 0,
46 buffer_bits: 0,
47 }
48 }
49
50 pub fn update(&mut self, sequence: &[u8]) {
51 for &byte in sequence {
52 let code = self.alphabet.encoding_array[byte as usize] as u64;
53 self.buffer = (self.buffer << self.alphabet.bits_per_symbol) | code;
54 self.buffer_bits += self.alphabet.bits_per_symbol;
55
56 while self.buffer_bits >= 8 {
57 self.buffer_bits -= 8;
58 let out_byte = (self.buffer >> self.buffer_bits) as u8;
59 self.encoded_sequence.push(out_byte);
60 self.bit_pos += 8;
61 self.buffer &= (1 << self.buffer_bits) - 1; }
63 }
64 }
65
66 pub fn finalize(mut self) -> Vec<u8> {
67 if self.buffer_bits > 0 {
68 let out_byte = (self.buffer << (8 - self.buffer_bits)) as u8;
69 self.encoded_sequence.push(out_byte);
70 self.bit_pos += self.buffer_bits;
71 }
72 self.encoded_sequence
73 }
74}
75
76pub fn encode_sequence<T: AsRef<[u8]>>(sequence: T, alphabet: &Alphabet) -> Vec<u8> {
100 let sequence = sequence.as_ref();
101 let total_bits = sequence.len() * alphabet.bits_per_symbol;
102 let mut bytes = vec![0u8; total_bits.div_ceil(8)];
103
104 let mut bit_index = 0;
105 for &byte in sequence {
106 let code = alphabet.encoding_array[byte as usize];
107
108 for i in (0..alphabet.bits_per_symbol).rev() {
109 let bit = (code >> i) & 1;
110 let byte_index = bit_index / 8;
111 let bit_offset = 7 - (bit_index % 8); bytes[byte_index] |= bit << bit_offset;
113 bit_index += 1;
114 }
115 }
116 bytes
117}
118
119pub fn decode_substring_from_bytes(
137 encoded_bytes: &[u8],
138 start: usize,
139 end: usize,
140 alphabet: &Alphabet,
141) -> Vec<u8> {
142 let mut decoded = Vec::with_capacity(end - start);
143 for i in start..end {
144 let bit_offset = i * alphabet.bits_per_symbol;
145 let mut code = 0u8;
146
147 for j in 0..alphabet.bits_per_symbol {
148 let bit_pos = bit_offset + j;
149 let byte_index = bit_pos / 8;
150 let bit_in_byte = 7 - (bit_pos % 8); let bit = if byte_index < encoded_bytes.len() {
153 (encoded_bytes[byte_index] >> bit_in_byte) & 1
154 } else {
155 0
156 };
157
158 code = (code << 1) | bit;
159 }
160 decoded.push(alphabet.decoding_array[code as usize]);
161 }
162 decoded
163}
164
165pub fn decode_string_from_bytes(
166 encoded_bytes: &[u8],
167 seq_len: usize,
168 alphabet: &Alphabet,
169) -> Vec<u8> {
170 let mut decoded = Vec::with_capacity(seq_len);
171 for i in 0..seq_len {
172 let bit_offset = i * alphabet.bits_per_symbol;
173 let mut code = 0u8;
174
175 for j in 0..alphabet.bits_per_symbol {
176 let bit_pos = bit_offset + j;
177 let byte_index = bit_pos / 8;
178 let bit_in_byte = 7 - (bit_pos % 8); let bit = if byte_index < encoded_bytes.len() {
181 (encoded_bytes[byte_index] >> bit_in_byte) & 1
182 } else {
183 0
184 };
185
186 code = (code << 1) | bit;
187 }
188 decoded.push(alphabet.decoding_array[code as usize]);
189 }
190 decoded
191}
192
193#[cfg(test)]
194mod tests {
195 use super::*;
196
197 #[test]
198 fn test_dna_2bit_encoding() {
199 let alphabet = &alphabet::DNA_2BIT_ALPHABET;
200 let sequence = b"ACGT";
201 let encoded = encode_sequence(sequence, alphabet);
202 let ans = [0b10, 0b01, 0b11, 0b00];
203 let packed: Vec<u8> = ans
204 .chunks(8 / alphabet.bits_per_symbol) .map(|chunk| {
206 chunk
207 .iter()
208 .fold(0, |acc, &b| (acc << alphabet.bits_per_symbol) | b)
209 })
210 .collect();
211 assert_eq!(encoded, packed);
212
213 let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
214 assert_eq!(decoded, sequence);
215 }
216
217 #[test]
218 fn test_dna_iupac_encoding() {
219 let sequence = b"ACGTRYMK";
220 let alphabet = &alphabet::DNA_IUPAC_ALPHABET;
221 let encoded = encode_sequence(sequence, alphabet);
222 let ans = [
223 0b0001, 0b0010, 0b0100, 0b1000, 0b0101, 0b1010, 0b0011, 0b0111,
224 ];
225 let packed: Vec<u8> = ans
226 .chunks(8 / alphabet.bits_per_symbol) .map(|chunk| {
228 chunk
229 .iter()
230 .fold(0, |acc, &b| (acc << alphabet.bits_per_symbol) | b)
231 })
232 .collect();
233 assert_eq!(encoded, packed);
234 let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
235 assert_eq!(decoded, sequence);
236 }
237
238 #[test]
239 fn test_protein_encoding() {
240 let sequence = b"ACDEFGHIKLMNPQRSTVWY*X-";
241 let alphabet = &alphabet::PROTEIN_ALPHABET;
242 let encoded = encode_sequence(sequence, alphabet);
243 assert_eq!(
245 encoded.len(),
246 (sequence.len() * alphabet.bits_per_symbol).div_ceil(8)
247 );
248 let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
249 assert_eq!(decoded, sequence);
250 }
251
252 #[test]
253 fn test_ascii_encoding() {
254 let sequence = b"Hello, World!";
255 let alphabet = &alphabet::ASCII_ALPHABET;
256 let encoded = encode_sequence(sequence, alphabet);
257 let decoded = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
258 assert_eq!(decoded, sequence);
259 }
260
261 #[test]
262 fn test_dna_3bit_encoding() {
263 let sequence = b"ACGTNRYX"; let alphabet = &alphabet::DNA_3BIT_ALPHABET;
265 let encoded = encode_sequence(sequence, alphabet);
266 let packed = vec![0b00000101, 0b00111001, 0b01110111]; assert_eq!(encoded, packed);
269 let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
270 assert_eq!(decoded, sequence);
271 }
272}