// onelib 0.2.0
//
// Rust implementation of the ONEcode file format.
// Documentation
//! 2-bit DNA compression codec.
//!
//! Packs DNA bases into 2 bits each, four bases per byte, in little-endian
//! order:
//!
//! ```text
//! byte = base0 | (base1 << 2) | (base2 << 4) | (base3 << 6)
//! ```
//!
//! Encoding: A/a → 0, C/c → 1, G/g → 2, T/t → 3. Any other character
//! (including N) maps to 0 (i.e. 'a'). Decoding always produces lowercase.

/// Maps every possible input byte to its 2-bit base code.
///
/// `A`/`a` → 0, `C`/`c` → 1, `G`/`g` → 2, `T`/`t` → 3. Every other byte
/// keeps the table's initial value of 0, i.e. it encodes like `A`.
const ENCODE_TABLE: [u8; 256] = {
    // The position of a base in this list is its 2-bit code.
    let bases = *b"ACGT";
    let mut codes = [0u8; 256];

    // `for` loops are not available in const initializers, so iterate by hand.
    let mut code = 0;
    while code < bases.len() {
        let upper = bases[code];
        // Upper- and lowercase spellings share the same code.
        codes[upper as usize] = code as u8;
        codes[upper.to_ascii_lowercase() as usize] = code as u8;
        code += 1;
    }

    codes
};

/// Maps a 2-bit base code (0..=3) back to its ASCII letter; the letters are
/// always lowercase.
const DECODE_TABLE: [u8; 4] = *b"acgt";

/// Compress DNA characters to 2-bit encoding.
///
/// Four bases are packed per byte, with the first base in the lowest two
/// bits. A trailing group of one to three bases goes into one final byte
/// whose unused high bits are zero. Only the first `(dna.len() + 3) / 4`
/// bytes of `out` are written; anything beyond that is left untouched.
///
/// Returns the number of bytes written to `out`.
///
/// # Panics
///
/// Panics if `out` is shorter than `(dna.len() + 3) / 4` bytes. The size is
/// checked before anything is written, so an undersized buffer is never left
/// partially overwritten.
pub fn compress(dna: &[u8], out: &mut [u8]) -> usize {
    let needed = dna.len().div_ceil(4);
    assert!(
        out.len() >= needed,
        "compress: output buffer too small: need {} bytes, got {}",
        needed,
        out.len()
    );
    // Narrow the buffer once so the loop below needs no per-byte bounds check.
    let out = &mut out[..needed];

    let quads = dna.chunks_exact(4);
    let tail = quads.remainder();

    // `zip` stops once the full groups run out, leaving the last slot (if
    // there is one) for the partial group handled below.
    for (slot, quad) in out.iter_mut().zip(quads) {
        *slot = ENCODE_TABLE[quad[0] as usize]
            | (ENCODE_TABLE[quad[1] as usize] << 2)
            | (ENCODE_TABLE[quad[2] as usize] << 4)
            | (ENCODE_TABLE[quad[3] as usize] << 6);
    }

    if !tail.is_empty() {
        // One to three leftover bases: pack them from the low bits upward.
        let mut byte = 0u8;
        for (pos, &base) in tail.iter().enumerate() {
            byte |= ENCODE_TABLE[base as usize] << (2 * pos);
        }
        // A non-empty tail means `needed` counts one byte past the full groups.
        out[needed - 1] = byte;
    }

    needed
}

/// Decompress 2-bit encoded DNA to ASCII characters (always lowercase).
///
/// `len` is the number of bases (not bytes). Exactly `len` bytes are written
/// to the start of `out`; anything beyond that is left untouched. Only the
/// first `(len + 3) / 4` bytes of `compressed` are read, and unused high
/// bits in the final byte are ignored.
///
/// # Panics
///
/// Panics if `compressed` holds fewer than `(len + 3) / 4` bytes or if `out`
/// has room for fewer than `len` bytes. Both lengths are checked before
/// anything is written, so `out` is never left partially filled.
pub fn decompress(compressed: &[u8], len: usize, out: &mut [u8]) {
    let needed = len.div_ceil(4);
    assert!(
        compressed.len() >= needed,
        "decompress: compressed input too short: need {} bytes for {} bases, got {}",
        needed,
        len,
        compressed.len()
    );
    assert!(
        out.len() >= len,
        "decompress: output buffer too small: need {} bytes, got {}",
        len,
        out.len()
    );

    // Split the output into whole 4-base groups and a 0..=3 base tail.
    let (body, tail) = out[..len].split_at_mut(len - len % 4);

    // `body` has exactly `len / 4` groups, so `zip` consumes that many bytes.
    for (quad, &byte) in body.chunks_exact_mut(4).zip(compressed) {
        quad[0] = DECODE_TABLE[(byte & 0x03) as usize];
        quad[1] = DECODE_TABLE[((byte >> 2) & 0x03) as usize];
        quad[2] = DECODE_TABLE[((byte >> 4) & 0x03) as usize];
        quad[3] = DECODE_TABLE[((byte >> 6) & 0x03) as usize];
    }

    if !tail.is_empty() {
        // A non-empty tail means the last needed byte holds the leftover bases.
        let byte = compressed[needed - 1];
        for (pos, slot) in tail.iter_mut().enumerate() {
            *slot = DECODE_TABLE[((byte >> (2 * pos)) & 0x03) as usize];
        }
    }
}

/// Number of bits occupied by `len` bases in 2-bit encoding, i.e. twice
/// `len`. This is the value stored as the bit count in binary ONEcode files.
pub fn bit_count(len: usize) -> usize {
    const BITS_PER_BASE: usize = 2;
    // Wrapping multiply: a result too large for `usize` wraps around instead
    // of panicking.
    len.wrapping_mul(BITS_PER_BASE)
}

/// Number of bytes needed to hold `len` bases at four bases per byte,
/// rounding up so that a partial final group still gets a byte.
pub fn byte_count(len: usize) -> usize {
    let full_bytes = len / 4;
    let has_partial_byte = len % 4 != 0;
    full_bytes + usize::from(has_partial_byte)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Compresses `dna` into a zeroed buffer of `capacity` bytes and returns
    /// the byte count reported by `compress` together with the buffer.
    fn pack(dna: &[u8], capacity: usize) -> (usize, Vec<u8>) {
        let mut packed = vec![0u8; capacity];
        let written = compress(dna, &mut packed);
        (written, packed)
    }

    /// Compresses `dna` into `capacity` bytes, then decompresses all
    /// `dna.len()` bases again. Returns the byte count reported by
    /// `compress` and the decoded bases.
    fn round_trip(dna: &[u8], capacity: usize) -> (usize, Vec<u8>) {
        let (written, packed) = pack(dna, capacity);
        let mut decoded = vec![0u8; dna.len()];
        decompress(&packed, dna.len(), &mut decoded);
        (written, decoded)
    }

    #[test]
    fn round_trip_exact_multiple() {
        let (written, decoded) = round_trip(b"ACGTACGT", 2);
        assert_eq!(written, 2);
        assert_eq!(decoded, b"acgtacgt");
    }

    #[test]
    fn round_trip_remainder_3() {
        let (written, decoded) = round_trip(b"ACG", 1);
        assert_eq!(written, 1);
        assert_eq!(decoded, b"acg");
    }

    #[test]
    fn round_trip_remainder_2() {
        let (_, decoded) = round_trip(b"TG", 1);
        assert_eq!(decoded, b"tg");
    }

    #[test]
    fn round_trip_remainder_1() {
        let (_, decoded) = round_trip(b"C", 1);
        assert_eq!(decoded, b"c");
    }

    #[test]
    fn case_insensitive() {
        // Upper- and lowercase input must produce the same packed byte.
        let (_, from_upper) = pack(b"ACGT", 1);
        let (_, from_lower) = pack(b"acgt", 1);
        assert_eq!(from_upper, from_lower);
    }

    #[test]
    fn non_acgt_maps_to_a() {
        // N encodes as 0, the code for A, so it decodes as 'a'.
        let (_, decoded) = round_trip(b"NCGT", 1);
        assert_eq!(decoded, b"acgt");
    }

    #[test]
    fn empty_input() {
        // Zero bases: nothing is reported as written, and decoding zero
        // bases into an empty buffer must not panic.
        let (written, _decoded) = round_trip(b"", 1);
        assert_eq!(written, 0);
    }

    #[test]
    fn bit_and_byte_counts() {
        assert_eq!(bit_count(8), 16);

        // (number of bases, expected packed bytes)
        let expectations = [(8, 2), (3, 1), (5, 2), (0, 0)];
        for (bases, bytes) in expectations {
            assert_eq!(byte_count(bases), bytes);
        }
    }

    #[test]
    fn packing_order() {
        // The first base occupies the lowest bits: with A=0, C=1, G=2, T=3,
        // 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0 + 4 + 32 + 192 = 228.
        let (_, packed) = pack(b"ACGT", 1);
        assert_eq!(packed[0], 0b_11_10_01_00);
    }
}