/// Lookup table from an input byte to its 2-bit nucleotide code.
///
/// `A`/`a` map to 0, `C`/`c` to 1, `G`/`g` to 2 and `T`/`t` to 3. Every other
/// byte keeps the table's zero default and therefore shares the code of `A`.
const ENCODE_TABLE: [u8; 256] = {
    // The position of each base in this list is its 2-bit code.
    let bases = *b"ACGT";
    let mut table = [0u8; 256];
    let mut code = 0;
    // `for` loops are not available in const evaluation, hence the `while`.
    while code < bases.len() {
        let upper = bases[code];
        // `code` is always below 4, so the narrowing cast is lossless.
        table[upper as usize] = code as u8;
        table[upper.to_ascii_lowercase() as usize] = code as u8;
        code += 1;
    }
    table
};
/// Lowercase ASCII base for each 2-bit code: 0 is `a`, 1 is `c`, 2 is `g`, 3 is `t`.
const DECODE_TABLE: [u8; 4] = *b"acgt";
pub fn compress(dna: &[u8], out: &mut [u8]) -> usize {
let len = dna.len();
let full_chunks = len / 4;
let remainder = len % 4;
for (i, chunk) in dna.chunks_exact(4).enumerate() {
out[i] = ENCODE_TABLE[chunk[0] as usize]
| (ENCODE_TABLE[chunk[1] as usize] << 2)
| (ENCODE_TABLE[chunk[2] as usize] << 4)
| (ENCODE_TABLE[chunk[3] as usize] << 6);
}
if remainder > 0 {
let base = full_chunks * 4;
let mut byte = ENCODE_TABLE[dna[base] as usize];
if remainder > 1 {
byte |= ENCODE_TABLE[dna[base + 1] as usize] << 2;
}
if remainder > 2 {
byte |= ENCODE_TABLE[dna[base + 2] as usize] << 4;
}
out[full_chunks] = byte;
}
len.div_ceil(4)
}
pub fn decompress(compressed: &[u8], len: usize, out: &mut [u8]) {
let full_chunks = len / 4;
let remainder = len % 4;
for (i, &byte) in compressed[..full_chunks].iter().enumerate() {
let base = i * 4;
out[base] = DECODE_TABLE[(byte & 0x03) as usize];
out[base + 1] = DECODE_TABLE[((byte >> 2) & 0x03) as usize];
out[base + 2] = DECODE_TABLE[((byte >> 4) & 0x03) as usize];
out[base + 3] = DECODE_TABLE[((byte >> 6) & 0x03) as usize];
}
if remainder > 0 {
let byte = compressed[full_chunks];
let base = full_chunks * 4;
out[base] = DECODE_TABLE[(byte & 0x03) as usize];
if remainder > 1 {
out[base + 1] = DECODE_TABLE[((byte >> 2) & 0x03) as usize];
}
if remainder > 2 {
out[base + 2] = DECODE_TABLE[((byte >> 4) & 0x03) as usize];
}
}
}
/// Returns the number of bits occupied by `len` bases at two bits per base.
///
/// The product wraps around on overflow rather than panicking.
pub fn bit_count(len: usize) -> usize {
    const BITS_PER_BASE: usize = 2;
    len.wrapping_mul(BITS_PER_BASE)
}
/// Returns the number of bytes needed to hold `len` bases at four bases per
/// byte, rounding up so a partially filled final byte is counted.
pub fn byte_count(len: usize) -> usize {
    let whole_bytes = len / 4;
    let has_partial_byte = len % 4 != 0;
    whole_bytes + usize::from(has_partial_byte)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Packs `dna` into a `P`-byte buffer and unpacks it again.
    ///
    /// Returns the byte count reported by `compress` together with the
    /// decoded bases.
    fn round_trip<const N: usize, const P: usize>(dna: &[u8; N]) -> (usize, [u8; N]) {
        let mut packed = [0u8; P];
        let written = compress(dna, &mut packed);
        let mut unpacked = [0u8; N];
        decompress(&packed, N, &mut unpacked);
        (written, unpacked)
    }

    #[test]
    fn round_trip_exact_multiple() {
        let (written, decoded) = round_trip::<8, 2>(b"ACGTACGT");
        assert_eq!(written, 2);
        assert_eq!(&decoded, b"acgtacgt");
    }

    #[test]
    fn round_trip_remainder_3() {
        let (written, decoded) = round_trip::<3, 1>(b"ACG");
        assert_eq!(written, 1);
        assert_eq!(&decoded, b"acg");
    }

    #[test]
    fn round_trip_remainder_2() {
        let (_, decoded) = round_trip::<2, 1>(b"TG");
        assert_eq!(&decoded, b"tg");
    }

    #[test]
    fn round_trip_remainder_1() {
        let (_, decoded) = round_trip::<1, 1>(b"C");
        assert_eq!(&decoded, b"c");
    }

    #[test]
    fn case_insensitive() {
        let (mut from_upper, mut from_lower) = ([0u8; 1], [0u8; 1]);
        compress(b"ACGT", &mut from_upper);
        compress(b"acgt", &mut from_lower);
        assert_eq!(from_upper, from_lower);
    }

    #[test]
    fn non_acgt_maps_to_a() {
        // `N` is not in the encode table, so it comes back as `a`.
        let (_, decoded) = round_trip::<4, 1>(b"NCGT");
        assert_eq!(&decoded, b"acgt");
    }

    #[test]
    fn empty_input() {
        // Decoding zero bases into an empty buffer must not panic.
        let (written, _) = round_trip::<0, 1>(b"");
        assert_eq!(written, 0);
    }

    #[test]
    fn bit_and_byte_counts() {
        assert_eq!(bit_count(8), 16);
        for (bases, bytes) in [(8, 2), (3, 1), (5, 2), (0, 0)] {
            assert_eq!(byte_count(bases), bytes);
        }
    }

    #[test]
    fn packing_order() {
        // The first base lands in the two lowest bits, the last in the highest.
        let mut packed = [0u8; 1];
        compress(b"ACGT", &mut packed);
        assert_eq!(packed[0], 0b_11_10_01_00);
    }
}