use std::fmt::{self, Write};
#[cfg(feature = "packed-seq")]
use packed_seq::{PackedSeq, PackedSeqVec};
type T = u128;
const BITS_PER_BP: usize = 2;
const BITS_PER_BLOCK: usize = T::BITS as usize;
const BP_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_BP;
#[cfg(feature = "packed-seq")]
const PADDING: usize = 3;
#[derive(Clone, Default)]
pub struct PackedDNA {
bits: Vec<T>,
cur: T,
num_bits: usize,
}
impl PackedDNA {
#[inline(always)]
pub const fn new() -> Self {
Self {
bits: Vec::new(),
cur: 0,
num_bits: 0,
}
}
#[inline(always)]
pub fn with_capacity(capacity: usize) -> Self {
Self {
bits: Vec::with_capacity(capacity),
cur: 0,
num_bits: 0,
}
}
#[inline(always)]
pub const fn len(&self) -> usize {
self.num_bits / BITS_PER_BP
}
#[inline(always)]
pub const fn is_empty(&self) -> bool {
self.num_bits == 0
}
#[inline(always)]
pub const fn capacity(&self) -> usize {
self.bits.capacity()
}
#[inline(always)]
pub fn clear(&mut self) {
self.bits.clear();
self.cur = 0;
self.num_bits = 0;
}
#[inline(always)]
pub fn append(&mut self, packed: T, num_bits: usize) {
if num_bits == 0 {
return;
}
let rem = self.num_bits % BITS_PER_BLOCK;
let mask = !0 >> (BITS_PER_BLOCK - num_bits);
self.num_bits += num_bits;
let x = packed & mask;
if rem + num_bits >= BITS_PER_BLOCK {
self.cur |= packed << rem;
self.bits.push(self.cur);
self.cur = x.checked_shr((BITS_PER_BLOCK - rem) as u32).unwrap_or(0);
} else {
self.cur |= x << rem;
}
}
#[inline(always)]
pub fn bits(&self) -> (&[T], T) {
(&self.bits[..self.num_bits / BITS_PER_BLOCK], self.cur)
}
#[inline(always)]
pub fn get(&self, i: usize) -> u8 {
if i < self.len() & (!0 << BP_PER_BLOCK.trailing_zeros()) {
((self.bits[i / BP_PER_BLOCK] >> (2 * (i % BP_PER_BLOCK))) & 0b11) as u8
} else {
((self.cur >> (2 * (i % BP_PER_BLOCK))) & 0b11) as u8
}
}
#[inline(always)]
pub fn get_char(&self, i: usize) -> char {
const LUT: [char; 4] = ['A', 'C', 'T', 'G'];
LUT[self.get(i) as usize]
}
#[cfg(feature = "packed-seq")]
#[inline(always)]
pub(crate) fn append_padding(&mut self) {
let len = self.num_bits / BITS_PER_BLOCK;
self.bits.resize(len + 1 + PADDING, 0);
self.bits[len] = self.cur;
}
#[cfg(feature = "packed-seq")]
#[allow(clippy::missing_transmute_annotations)]
#[inline(always)]
pub fn as_packed_seq(&self) -> PackedSeq<'_> {
let len = self.len();
let data = self.bits.as_ptr() as *const u8;
let data_len = self.bits.len() * BITS_PER_BLOCK / 8;
let seq = unsafe { std::slice::from_raw_parts(data, data_len) };
PackedSeq::from_raw_parts(seq, 0, len)
}
#[cfg(feature = "packed-seq")]
#[allow(
clippy::missing_transmute_annotations,
clippy::unsound_collection_transmute
)]
#[inline(always)]
pub fn into_packed_seq_vec(mut self) -> PackedSeqVec {
let len = self.len();
let data_len = self.bits.len() * BITS_PER_BLOCK / 8;
let data_cap = self.bits.capacity() * BITS_PER_BLOCK / 8;
let data = self.bits.as_mut_ptr() as *mut u8;
let seq = unsafe { Vec::from_raw_parts(data, data_len, data_cap) };
PackedSeqVec::from_raw_parts(seq, len)
}
}
impl fmt::Display for PackedDNA {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for i in 0..self.len() {
f.write_char(self.get_char(i))?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
fn encode(s: &str) -> (T, usize) {
let mut packed: T = 0;
for (i, &b) in s.as_bytes().iter().enumerate() {
let code: T = match b {
b'A' | b'a' => 0b00,
b'C' | b'c' => 0b01,
b'T' | b't' => 0b10,
b'G' | b'g' => 0b11,
_ => 0,
};
packed |= code << (2 * i);
}
(packed, s.len() * 2)
}
fn roundtrip(s: &str) -> String {
let mut dna = PackedDNA::new();
let (packed, num_bits) = encode(s);
dna.append(packed, num_bits);
format!("{dna}")
}
#[test]
fn append_full_block_at_boundary() {
let (a_packed, _) = encode(&"A".repeat(BP_PER_BLOCK)); let (g_packed, _) = encode(&"G".repeat(BP_PER_BLOCK));
let mut dna = PackedDNA::new();
dna.append(a_packed, BITS_PER_BLOCK);
dna.append(g_packed, BITS_PER_BLOCK);
assert_eq!(dna.len(), 2 * BP_PER_BLOCK);
for i in 0..BP_PER_BLOCK {
assert_eq!(dna.get(i), 0b00, "base {i} should be A");
}
for i in BP_PER_BLOCK..2 * BP_PER_BLOCK {
assert_eq!(dna.get(i), 0b11, "base {i} should be G");
}
}
#[test]
fn three_full_blocks_distinct() {
let (a, _) = encode(&"A".repeat(BP_PER_BLOCK));
let (c, _) = encode(&"C".repeat(BP_PER_BLOCK));
let (t, _) = encode(&"T".repeat(BP_PER_BLOCK));
let mut dna = PackedDNA::new();
dna.append(a, BITS_PER_BLOCK);
dna.append(c, BITS_PER_BLOCK);
dna.append(t, BITS_PER_BLOCK);
assert_eq!(dna.len(), 3 * BP_PER_BLOCK);
for i in 0..BP_PER_BLOCK {
assert_eq!(dna.get(i), 0b00, "base {i} A");
}
for i in BP_PER_BLOCK..2 * BP_PER_BLOCK {
assert_eq!(dna.get(i), 0b01, "base {i} C");
}
for i in 2 * BP_PER_BLOCK..3 * BP_PER_BLOCK {
assert_eq!(dna.get(i), 0b10, "base {i} T");
}
}
#[test]
fn partial_then_full_block() {
let partial = BP_PER_BLOCK - 6;
let (a_partial, a_bits) = encode(&"A".repeat(partial));
let (g_full, _) = encode(&"G".repeat(BP_PER_BLOCK));
let mut dna = PackedDNA::new();
dna.append(a_partial, a_bits);
dna.append(g_full, BITS_PER_BLOCK);
assert_eq!(dna.len(), partial + BP_PER_BLOCK);
for i in 0..partial {
assert_eq!(dna.get(i), 0b00, "base {i} A");
}
for i in partial..partial + BP_PER_BLOCK {
assert_eq!(dna.get(i), 0b11, "base {i} G");
}
}
#[test]
fn roundtrip_short() {
assert_eq!(roundtrip("ACGT"), "ACGT");
}
#[test]
fn roundtrip_exactly_one_block() {
let s = "ACGT".repeat(BP_PER_BLOCK / 4);
assert_eq!(s.len(), BP_PER_BLOCK);
assert_eq!(roundtrip(&s), s);
}
#[test]
fn roundtrip_two_full_blocks() {
let block = "ACGT".repeat(BP_PER_BLOCK / 4);
let mut dna = PackedDNA::new();
let (p, n) = encode(&block);
dna.append(p, n);
let (p, n) = encode(&block);
dna.append(p, n);
assert_eq!(dna.len(), 2 * BP_PER_BLOCK);
assert_eq!(format!("{dna}"), block.repeat(2));
}
}