Skip to main content

gtars_refget/digest/
encoder.rs

1//! Sequence encoding and decoding - WASM-safe, no filesystem dependencies.
2
3use super::alphabet;
4use super::alphabet::Alphabet;
5
6/// A struct for encoding biological sequences into a compact, bit-packed format.
7///
8/// The `SequenceEncoder` is designed to efficiently encode sequences using a specified
9/// alphabet type (e.g., DNA, protein, or ASCII). It uses a bit buffer to pack symbols
10/// into bytes, minimizing memory usage while maintaining fast encoding performance.
11///
12/// # Fields
13///
14/// * `encoding_array` - A lookup table that maps input bytes to their encoded values.
15/// * `bits_per_symbol` - The number of bits used to represent each symbol in the sequence.
16/// * `encoded_sequence` - A vector that stores the resulting bit-packed sequence.
17/// * `bit_pos` - The current bit position in the encoded sequence.
18/// * `buffer` - An internal bit buffer used for packing symbols into bytes.
19/// * `buffer_bits` - The number of bits currently stored in the buffer.
20///
21/// # Usage
22///
23/// 1. Create a new `SequenceEncoder` using the `new` method.
24/// 2. Add sequence chunks with the `update` method.
25/// 3. Call the `finalize` method to retrieve the encoded sequence as a `Vec<u8>`.
26///
27pub struct SequenceEncoder {
28    alphabet: &'static alphabet::Alphabet, // alphabet used by this encoder
29    encoded_sequence: Vec<u8>,             // resulting encoded sequence
30    bit_pos: usize,                        // internal bit position in encoded sequence
31    buffer: u64,                           // internal bit buffer
32    buffer_bits: usize,                    // number of bits currently in buffer
33}
34
35impl SequenceEncoder {
36    pub fn new(alphabet_type: alphabet::AlphabetType, length: usize) -> Self {
37        let alphabet = alphabet::lookup_alphabet(&alphabet_type);
38        let bits_per_symbol = alphabet.bits_per_symbol;
39        let estimated_bytes = (length * bits_per_symbol).div_ceil(8);
40
41        SequenceEncoder {
42            alphabet,
43            encoded_sequence: Vec::with_capacity(estimated_bytes),
44            bit_pos: 0,
45            buffer: 0,
46            buffer_bits: 0,
47        }
48    }
49
50    pub fn update(&mut self, sequence: &[u8]) {
51        for &byte in sequence {
52            let code = self.alphabet.encoding_array[byte as usize] as u64;
53            self.buffer = (self.buffer << self.alphabet.bits_per_symbol) | code;
54            self.buffer_bits += self.alphabet.bits_per_symbol;
55
56            while self.buffer_bits >= 8 {
57                self.buffer_bits -= 8;
58                let out_byte = (self.buffer >> self.buffer_bits) as u8;
59                self.encoded_sequence.push(out_byte);
60                self.bit_pos += 8;
61                self.buffer &= (1 << self.buffer_bits) - 1; // Mask to keep remaining bits
62            }
63        }
64    }
65
66    pub fn finalize(mut self) -> Vec<u8> {
67        if self.buffer_bits > 0 {
68            let out_byte = (self.buffer << (8 - self.buffer_bits)) as u8;
69            self.encoded_sequence.push(out_byte);
70            self.bit_pos += self.buffer_bits;
71        }
72        self.encoded_sequence
73    }
74}
75
76/// Encodes a sequence using the specified encoding array.
77///
78/// This function takes a sequence of bytes and encodes it using the provided
79/// encoding array. The encoding is done by converting each byte in the sequence
80/// to its corresponding code in the encoding array, and then packing those
81/// codes into a byte array. The number of bits used to represent each symbol
82/// is specified by the `bits_per_symbol` parameter. The function returns
83/// a bit-packed vector of bytes containing the encoded sequence.
84///
85/// **Bit Ordering: MSB-first (Most Significant Bit first)**
86///
87/// Example with 2-bit DNA encoding (A=00, C=01, G=10, T=11):
88/// - Sequence "ACGT" → byte 0x1B (00011011)
89/// - A (00) in bits 7-6, C (01) in bits 5-4, G (10) in bits 3-2, T (11) in bits 1-0
90///
91/// # Arguments
92///
93/// * `sequence` - The sequence to encode
94/// * `alphabet` - The alphabet defining the encoding
95///
96/// # Returns
97///
98/// A vector containing the encoded (bit-packed) sequence
99pub fn encode_sequence<T: AsRef<[u8]>>(sequence: T, alphabet: &Alphabet) -> Vec<u8> {
100    let sequence = sequence.as_ref();
101    let total_bits = sequence.len() * alphabet.bits_per_symbol;
102    let mut bytes = vec![0u8; total_bits.div_ceil(8)];
103
104    let mut bit_index = 0;
105    for &byte in sequence {
106        let code = alphabet.encoding_array[byte as usize];
107
108        for i in (0..alphabet.bits_per_symbol).rev() {
109            let bit = (code >> i) & 1;
110            let byte_index = bit_index / 8;
111            let bit_offset = 7 - (bit_index % 8); // MSB-first in each byte
112            bytes[byte_index] |= bit << bit_offset;
113            bit_index += 1;
114        }
115    }
116    bytes
117}
118
119/// Decodes a substring from a bit-packed encoded sequence.
120///
121/// Decodes a substring from a bit-packed sequence of bytes.
122/// Decodes via the decoding array, which maps encoded
123/// values back to original symbols.
124///
125/// # Arguments
126///
127/// * `encoded_bytes` - A slice of bytes containing the bit-packed encoded sequence.
128/// * `start` - The start index of the substring (inclusive), in terms of symbols.
129/// * `end` - The end index of the substring (exclusive), in terms of symbols.
130/// * `bits_per_symbol` - The number of bits used to represent each symbol in the sequence.
131/// * `decoding_array` - A lookup table that maps encoded values back to their original symbols.
132///
133/// # Returns
134///
135/// A `Vec<u8>` containing the decoded substring.
136pub fn decode_substring_from_bytes(
137    encoded_bytes: &[u8],
138    start: usize,
139    end: usize,
140    alphabet: &Alphabet,
141) -> Vec<u8> {
142    let mut decoded = Vec::with_capacity(end - start);
143    for i in start..end {
144        let bit_offset = i * alphabet.bits_per_symbol;
145        let mut code = 0u8;
146
147        for j in 0..alphabet.bits_per_symbol {
148            let bit_pos = bit_offset + j;
149            let byte_index = bit_pos / 8;
150            let bit_in_byte = 7 - (bit_pos % 8); // MSB0
151
152            let bit = if byte_index < encoded_bytes.len() {
153                (encoded_bytes[byte_index] >> bit_in_byte) & 1
154            } else {
155                0
156            };
157
158            code = (code << 1) | bit;
159        }
160        decoded.push(alphabet.decoding_array[code as usize]);
161    }
162    decoded
163}
164
165pub fn decode_string_from_bytes(
166    encoded_bytes: &[u8],
167    seq_len: usize,
168    alphabet: &Alphabet,
169) -> Vec<u8> {
170    let mut decoded = Vec::with_capacity(seq_len);
171    for i in 0..seq_len {
172        let bit_offset = i * alphabet.bits_per_symbol;
173        let mut code = 0u8;
174
175        for j in 0..alphabet.bits_per_symbol {
176            let bit_pos = bit_offset + j;
177            let byte_index = bit_pos / 8;
178            let bit_in_byte = 7 - (bit_pos % 8); // MSB0
179
180            let bit = if byte_index < encoded_bytes.len() {
181                (encoded_bytes[byte_index] >> bit_in_byte) & 1
182            } else {
183                0
184            };
185
186            code = (code << 1) | bit;
187        }
188        decoded.push(alphabet.decoding_array[code as usize]);
189    }
190    decoded
191}
192
193#[cfg(test)]
194mod tests {
195    use super::*;
196
197    #[test]
198    fn test_dna_2bit_encoding() {
199        let alphabet = &alphabet::DNA_2BIT_ALPHABET;
200        let sequence = b"ACGT";
201        let encoded = encode_sequence(sequence, alphabet);
202        let ans = [0b10, 0b01, 0b11, 0b00];
203        let packed: Vec<u8> = ans
204            .chunks(8 / alphabet.bits_per_symbol) // Number of symbols that fit in a byte
205            .map(|chunk| {
206                chunk
207                    .iter()
208                    .fold(0, |acc, &b| (acc << alphabet.bits_per_symbol) | b)
209            })
210            .collect();
211        assert_eq!(encoded, packed);
212
213        let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
214        assert_eq!(decoded, sequence);
215    }
216
217    #[test]
218    fn test_dna_iupac_encoding() {
219        let sequence = b"ACGTRYMK";
220        let alphabet = &alphabet::DNA_IUPAC_ALPHABET;
221        let encoded = encode_sequence(sequence, alphabet);
222        let ans = [
223            0b0001, 0b0010, 0b0100, 0b1000, 0b0101, 0b1010, 0b0011, 0b0111,
224        ];
225        let packed: Vec<u8> = ans
226            .chunks(8 / alphabet.bits_per_symbol) // Number of symbols that fit in a byte
227            .map(|chunk| {
228                chunk
229                    .iter()
230                    .fold(0, |acc, &b| (acc << alphabet.bits_per_symbol) | b)
231            })
232            .collect();
233        assert_eq!(encoded, packed);
234        let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
235        assert_eq!(decoded, sequence);
236    }
237
238    #[test]
239    fn test_protein_encoding() {
240        let sequence = b"ACDEFGHIKLMNPQRSTVWY*X-";
241        let alphabet = &alphabet::PROTEIN_ALPHABET;
242        let encoded = encode_sequence(sequence, alphabet);
243        // Don't want to re-implement bit-packing here for 5-bit symbols, so just check the length.
244        assert_eq!(
245            encoded.len(),
246            (sequence.len() * alphabet.bits_per_symbol).div_ceil(8)
247        );
248        let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
249        assert_eq!(decoded, sequence);
250    }
251
252    #[test]
253    fn test_ascii_encoding() {
254        let sequence = b"Hello, World!";
255        let alphabet = &alphabet::ASCII_ALPHABET;
256        let encoded = encode_sequence(sequence, alphabet);
257        let decoded = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
258        assert_eq!(decoded, sequence);
259    }
260
261    #[test]
262    fn test_dna_3bit_encoding() {
263        let sequence = b"ACGTNRYX"; // 8 chars * 3 bits/char = 24 bits.
264        let alphabet = &alphabet::DNA_3BIT_ALPHABET;
265        let encoded = encode_sequence(sequence, alphabet);
266        // let ans =  vec![0b000, 0b001, 0b010, 0b011, 0b100, 0b101, 0b110, 0b111];
267        let packed = vec![0b00000101, 0b00111001, 0b01110111]; //manually bit-packed above 3-bits
268        assert_eq!(encoded, packed);
269        let decoded: Vec<u8> = decode_substring_from_bytes(&encoded, 0, sequence.len(), alphabet);
270        assert_eq!(decoded, sequence);
271    }
272}