use crate::kmer::{Kmer, MAX_PREC};
const BIN2NT: [char; 4] = ['A', 'C', 'T', 'G'];
#[inline]
fn nt_to_bin(c: char) -> u64 {
match c {
'A' | 'a' => 0,
'C' | 'c' => 1,
'T' | 't' => 2,
'G' | 'g' => 3,
_ => 0,
}
}
#[derive(Clone, Debug)]
pub struct ReadHolder {
storage: Vec<u64>,
read_length: Vec<u32>,
total_seq: usize,
contains_paired: bool,
}
impl ReadHolder {
pub fn new(contains_paired: bool) -> Self {
ReadHolder {
storage: Vec::new(),
read_length: Vec::new(),
total_seq: 0,
contains_paired,
}
}
fn reserve_for_read(&mut self, len: usize) {
self.read_length.reserve(1);
self.storage.reserve((2 * len).div_ceil(64) + 1);
}
pub fn push_back_str(&mut self, read: &str) {
self.reserve_for_read(read.len());
let mut shift = (self.total_seq * 2) % 64;
let mut read_len: u32 = 0;
for c in read.chars().rev() {
if shift == 0 {
self.storage.push(0);
}
*self.storage.last_mut().unwrap() += nt_to_bin(c) << shift;
shift = (shift + 2) % 64;
read_len += 1;
}
self.read_length.push(read_len);
self.total_seq += read_len as usize;
}
pub fn push_back_chars(&mut self, read: &[char]) {
self.reserve_for_read(read.len());
let mut shift = (self.total_seq * 2) % 64;
let len = read.len() as u32;
for &c in read.iter().rev() {
if shift == 0 {
self.storage.push(0);
}
*self.storage.last_mut().unwrap() += nt_to_bin(c) << shift;
shift = (shift + 2) % 64;
}
self.read_length.push(len);
self.total_seq += len as usize;
}
pub fn push_back_iter(&mut self, iter: &StringIterator<'_>) {
let read_len = iter.read_len();
self.read_length.push(read_len as u32);
let destination_first_bit = 2 * self.total_seq;
self.total_seq += read_len;
self.storage.resize((2 * self.total_seq).div_ceil(64), 0);
let bit_from = iter.position;
let bit_to = bit_from + 2 * read_len;
let dest_size = self.storage.len();
iter.holder.copy_bits(
bit_from,
bit_to,
&mut self.storage,
destination_first_bit,
dest_size,
);
}
pub fn swap(&mut self, other: &mut ReadHolder) {
std::mem::swap(&mut self.storage, &mut other.storage);
std::mem::swap(&mut self.read_length, &mut other.read_length);
std::mem::swap(&mut self.total_seq, &mut other.total_seq);
}
pub fn clear(&mut self) {
self.storage.clear();
self.storage.shrink_to_fit();
self.read_length.clear();
self.read_length.shrink_to_fit();
self.total_seq = 0;
}
pub fn total_seq(&self) -> usize {
self.total_seq
}
pub fn max_length(&self) -> usize {
self.read_length.iter().copied().max().unwrap_or(0) as usize
}
pub fn kmer_num(&self, kmer_len: usize) -> usize {
let mut num = 0;
for &l in &self.read_length {
let l = l as usize;
if l >= kmer_len {
num += l - kmer_len + 1;
}
}
num
}
pub fn read_num(&self) -> usize {
self.read_length.len()
}
pub fn contains_paired(&self) -> bool {
self.contains_paired
}
pub fn memory_footprint(&self) -> usize {
8 * self.storage.capacity() + 4 * self.read_length.capacity()
}
pub fn reserve(&mut self, seq: usize, num: usize) {
self.storage.reserve(seq / 32 + 1);
if num > 0 {
self.read_length.reserve(num);
}
}
pub fn nxx(&self, xx: f64) -> usize {
let mut lengths: Vec<u32> = self.read_length.clone();
lengths.sort();
let mut nxx = 0usize;
let mut len = 0usize;
let threshold = (xx * self.total_seq as f64) as usize;
for j in (0..lengths.len()).rev() {
nxx = lengths[j] as usize;
len += lengths[j] as usize;
if len >= threshold {
break;
}
}
nxx
}
pub fn n50(&self) -> usize {
self.nxx(0.5)
}
pub fn read_length_at(&self, idx: usize) -> u32 {
self.read_length[idx]
}
pub fn storage(&self) -> &[u64] {
&self.storage
}
pub fn storage_as_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::with_capacity(self.storage.len() * 8);
for &word in &self.storage {
bytes.extend_from_slice(&word.to_ne_bytes());
}
bytes
}
pub fn storage_bytes(&self) -> &[u8] {
let ptr = self.storage.as_ptr() as *const u8;
let len = self.storage.len() * 8;
unsafe { std::slice::from_raw_parts(ptr, len) }
}
pub fn read_lengths(&self) -> &[u32] {
&self.read_length
}
pub fn copy_bits(
&self,
bit_from: usize,
bit_to: usize,
destination: &mut [u64],
dest_bit_from: usize,
dest_size: usize,
) {
if bit_to <= bit_from {
return;
}
let mut word = bit_from / 64;
let last_word = (bit_to - 1) / 64;
let shift = (bit_from % 64) as u32;
let mut dest_word = dest_bit_from / 64;
let mut dest_shift = (dest_bit_from % 64) as u32;
if shift > 0 {
let chunk = self.storage[word] >> shift;
word += 1;
if dest_shift > 0 {
destination[dest_word] = destination[dest_word].wrapping_add(chunk << dest_shift);
if shift <= dest_shift {
dest_word += 1;
}
if shift < dest_shift && dest_word < dest_size {
destination[dest_word] =
destination[dest_word].wrapping_add(chunk >> (64 - dest_shift));
}
} else {
destination[dest_word] = chunk;
}
dest_shift = (dest_shift + 64 - shift) % 64;
}
while word <= last_word {
if dest_shift > 0 {
destination[dest_word] =
destination[dest_word].wrapping_add(self.storage[word] << dest_shift);
if dest_word + 1 < dest_size {
destination[dest_word + 1] = destination[dest_word + 1]
.wrapping_add(self.storage[word] >> (64 - dest_shift));
}
} else {
destination[dest_word] = self.storage[word];
}
word += 1;
dest_word += 1;
}
let partial_bits = (dest_bit_from + bit_to - bit_from) % 64;
if partial_bits > 0 {
let mask = (1u64 << partial_bits) - 1;
destination[dest_size - 1] &= mask;
}
}
pub fn kmer_iter(&self, kmer_len: usize) -> KmerIterator<'_> {
let mut iter = KmerIterator {
holder: self,
read: 0,
position: 0,
kmer_len: kmer_len as u32,
position_in_read: 0,
};
iter.skip_short_reads();
iter
}
pub fn kmer_end(&self) -> KmerIterator<'_> {
KmerIterator {
holder: self,
read: self.read_length.len(),
position: 2 * self.total_seq,
kmer_len: 0,
position_in_read: 0,
}
}
pub fn string_iter(&self) -> StringIterator<'_> {
StringIterator {
holder: self,
position: 0,
read: 0,
}
}
pub fn string_iter_at(&self, read: usize) -> StringIterator<'_> {
let read = read.min(self.read_length.len());
let position = 2 * self
.read_length
.iter()
.take(read)
.map(|&len| len as usize)
.sum::<usize>();
StringIterator {
holder: self,
position,
read,
}
}
pub fn string_end(&self) -> StringIterator<'_> {
StringIterator {
holder: self,
position: 2 * self.total_seq,
read: self.read_length.len(),
}
}
}
#[derive(Clone)]
pub struct KmerIterator<'a> {
holder: &'a ReadHolder,
read: usize,
position: usize, kmer_len: u32,
position_in_read: u32, }
impl<'a> KmerIterator<'a> {
pub fn get(&self) -> Kmer {
let kmer_len = self.kmer_len as usize;
if kmer_len <= 32 {
let val = self.get_val_p1();
return Kmer::from_u64(kmer_len, val);
}
let num_words = (2 * kmer_len).div_ceil(64);
let mut buf = [0u64; MAX_PREC];
self.holder.copy_bits(
self.position,
self.position + 2 * kmer_len,
&mut buf[..num_words],
0,
num_words,
);
let mut kmer = Kmer::zero(kmer_len);
kmer.copy_words_from(&buf[..num_words]);
kmer
}
#[inline]
pub(crate) fn get_val_p1(&self) -> u64 {
let kmer_len = self.kmer_len as usize;
let bit_len = 2 * kmer_len;
let word_idx = self.position / 64;
let bit_offset = self.position % 64;
let storage = self.holder.storage();
let mut val = storage[word_idx] >> bit_offset;
if bit_offset + bit_len > 64 && word_idx + 1 < storage.len() {
val |= storage[word_idx + 1] << (64 - bit_offset);
}
if bit_len < 64 {
val &= (1u64 << bit_len) - 1;
}
val
}
pub fn advance(&mut self) {
let kmer_len = self.kmer_len as usize;
if self.position == 2 * (self.holder.total_seq - kmer_len) {
self.position = 2 * self.holder.total_seq;
return;
}
self.position += 2;
self.position_in_read += 1;
let read_len = self.holder.read_length[self.read];
if self.position_in_read == read_len - self.kmer_len + 1 {
self.position += 2 * (kmer_len - 1);
self.read += 1;
self.position_in_read = 0;
self.skip_short_reads();
}
}
pub fn at_end(&self) -> bool {
self.position >= 2 * self.holder.total_seq
}
fn skip_short_reads(&mut self) {
let kmer_len = self.kmer_len as usize;
while self.position < 2 * self.holder.total_seq
&& self.read < self.holder.read_length.len()
&& (self.holder.read_length[self.read] as usize) < kmer_len
{
self.position += 2 * self.holder.read_length[self.read] as usize;
self.read += 1;
}
}
}
impl<'a> PartialEq for KmerIterator<'a> {
fn eq(&self, other: &Self) -> bool {
std::ptr::eq(self.holder, other.holder) && self.position == other.position
}
}
#[derive(Clone)]
pub struct StringIterator<'a> {
holder: &'a ReadHolder,
position: usize,
read: usize,
}
impl<'a> StringIterator<'a> {
pub fn get(&self) -> String {
let read_length = self.holder.read_length[self.read] as usize;
if read_length == 0 {
return String::new();
}
let mut read = String::with_capacity(read_length);
let mut position = self.position + 2 * (read_length - 1);
for _ in 0..read_length {
let idx = (self.holder.storage[position / 64] >> (position % 64)) & 3;
read.push(BIN2NT[idx as usize]);
position = position.wrapping_sub(2);
}
read
}
pub fn read_len(&self) -> usize {
self.holder.read_length[self.read] as usize
}
pub fn advance(&mut self) {
if self.read >= self.holder.read_length.len() {
return;
}
self.position += 2 * self.holder.read_length[self.read] as usize;
self.read += 1;
}
pub fn at_end(&self) -> bool {
self.read >= self.holder.read_length.len()
}
pub fn has_mate(&self) -> bool {
self.holder.contains_paired
}
pub fn pair_type(&self) -> u8 {
if !self.holder.contains_paired {
0 } else if self.read % 2 == 1 {
2 } else {
1 }
}
pub fn get_mate(&self) -> StringIterator<'a> {
if self.read % 2 == 1 {
StringIterator {
holder: self.holder,
position: self.position - 2 * self.holder.read_length[self.read - 1] as usize,
read: self.read - 1,
}
} else {
StringIterator {
holder: self.holder,
position: self.position + 2 * self.holder.read_length[self.read] as usize,
read: self.read + 1,
}
}
}
pub fn b_seq(&self, shift: usize, destination: &mut [u64]) {
let position = self.position + 2 * shift;
let len = 2 * (self.read_len() - shift);
let dest_size = len.div_ceil(64);
self.holder
.copy_bits(position, position + len, destination, 0, dest_size);
}
pub fn true_b_seq(
&self,
left_clip: usize,
right_clip: usize,
reverse_complement: bool,
destination: &mut [u64],
) {
fn reverse_word(w: &mut u64) {
*w = ((*w & 0x3333_3333_3333_3333) << 2) | ((*w >> 2) & 0x3333_3333_3333_3333);
*w = ((*w & 0x0F0F_0F0F_0F0F_0F0F) << 4) | ((*w >> 4) & 0x0F0F_0F0F_0F0F_0F0F);
*w = ((*w & 0x00FF_00FF_00FF_00FF) << 8) | ((*w >> 8) & 0x00FF_00FF_00FF_00FF);
*w = ((*w & 0x0000_FFFF_0000_FFFF) << 16) | ((*w >> 16) & 0x0000_FFFF_0000_FFFF);
*w = ((*w & 0x0000_0000_FFFF_FFFF) << 32) | ((*w >> 32) & 0x0000_0000_FFFF_FFFF);
}
let position = self.position + 2 * right_clip;
let len = 2 * (self.read_len() - right_clip - left_clip);
let destination_size = len.div_ceil(64);
if reverse_complement {
self.holder
.copy_bits(position, position + len, destination, 0, destination_size);
for p in 0..destination_size {
destination[p] ^= 0xAAAA_AAAA_AAAA_AAAA;
}
let partial_bits = len % 64;
if partial_bits > 0 {
destination[destination_size - 1] &= (1u64 << partial_bits) - 1;
}
} else {
let shift_to_right_end = 64 * destination_size - len;
self.holder.copy_bits(
position,
position + len,
destination,
shift_to_right_end,
destination_size,
);
let half = destination_size / 2;
for p in 0..half {
destination.swap(p, destination_size - 1 - p);
reverse_word(&mut destination[p]);
reverse_word(&mut destination[destination_size - 1 - p]);
}
if destination_size % 2 == 1 {
reverse_word(&mut destination[destination_size / 2]);
}
}
}
pub fn kmers_for_read(&self, kmer_len: usize) -> KmerIterator<'a> {
if kmer_len <= self.holder.read_length[self.read] as usize {
let mut iter = KmerIterator {
holder: self.holder,
read: self.read,
position: self.position,
kmer_len: kmer_len as u32,
position_in_read: 0,
};
iter.skip_short_reads();
iter
} else {
self.holder.kmer_end()
}
}
}
impl<'a> PartialEq for StringIterator<'a> {
fn eq(&self, other: &Self) -> bool {
std::ptr::eq(self.holder, other.holder) && self.read == other.read
}
}
pub fn common_seq_len(seq1: &[u64], seq2: &[u64], word_len: usize) -> usize {
let n = word_len.min(seq1.len()).min(seq2.len());
for i in 0..n {
let a = seq1[i];
let b = seq2[i];
if a != b {
return 32 * i + (a ^ b).trailing_zeros() as usize / 2;
}
}
32 * n
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_push_back_and_read() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("ACGT");
assert_eq!(rh.total_seq(), 4);
assert_eq!(rh.read_num(), 1);
let mut si = rh.string_iter();
assert!(!si.at_end());
assert_eq!(si.get(), "ACGT");
si.advance();
assert!(si.at_end());
}
#[test]
fn test_multiple_reads() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("ACGT");
rh.push_back_str("TTGG");
rh.push_back_str("CCAA");
assert_eq!(rh.total_seq(), 12);
assert_eq!(rh.read_num(), 3);
let mut si = rh.string_iter();
assert_eq!(si.get(), "ACGT");
si.advance();
assert_eq!(si.get(), "TTGG");
si.advance();
assert_eq!(si.get(), "CCAA");
si.advance();
assert!(si.at_end());
}
#[test]
fn test_string_iterator_round_trip_varied_reads() {
let mut seed = 0x5EED_u64;
let mut expected = Vec::new();
let mut rh = ReadHolder::new(false);
for len in [
1usize, 2, 3, 4, 5, 7, 8, 15, 16, 31, 32, 33, 63, 64, 65, 127,
] {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let mut state = seed ^ len as u64;
let seq: String = (0..len)
.map(|_| {
state = state
.wrapping_mul(2862933555777941757)
.wrapping_add(3037000493);
BIN2NT[((state >> 33) & 3) as usize]
})
.collect();
rh.push_back_str(&seq);
expected.push(seq);
}
assert_eq!(rh.read_num(), expected.len());
assert_eq!(
rh.total_seq(),
expected.iter().map(String::len).sum::<usize>()
);
let mut actual = Vec::new();
let mut si = rh.string_iter();
while !si.at_end() {
actual.push(si.get());
si.advance();
}
assert_eq!(actual, expected);
}
#[test]
fn test_kmer_iterator() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("ACGTACGT");
let mut ki = rh.kmer_iter(4);
let mut kmers = Vec::new();
while !ki.at_end() {
kmers.push(ki.get().to_kmer_string(4));
ki.advance();
}
assert_eq!(kmers.len(), 5);
assert_eq!(kmers[0], "ACGT");
assert_eq!(kmers[1], "TACG");
assert_eq!(kmers[2], "GTAC");
assert_eq!(kmers[3], "CGTA");
assert_eq!(kmers[4], "ACGT");
}
#[test]
fn test_kmer_across_reads() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("ACGT"); rh.push_back_str("TTGG");
let mut ki = rh.kmer_iter(4);
let mut kmers = Vec::new();
while !ki.at_end() {
kmers.push(ki.get().to_kmer_string(4));
ki.advance();
}
assert_eq!(kmers.len(), 2);
assert_eq!(kmers[0], "ACGT");
assert_eq!(kmers[1], "TTGG");
}
#[test]
fn test_skip_short_reads() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("AC"); rh.push_back_str("ACGTACGT");
let mut ki = rh.kmer_iter(4);
let mut count = 0;
while !ki.at_end() {
ki.advance();
count += 1;
}
assert_eq!(count, 5); }
#[test]
fn test_max_length() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("AC");
rh.push_back_str("ACGTACGT");
rh.push_back_str("ACGT");
assert_eq!(rh.max_length(), 8);
}
#[test]
fn test_kmer_num() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("ACGTACGT"); rh.push_back_str("ACGT"); rh.push_back_str("AC"); assert_eq!(rh.kmer_num(4), 6);
}
#[test]
fn test_paired_reads() {
let mut rh = ReadHolder::new(true);
rh.push_back_str("ACGT"); rh.push_back_str("TTGG");
let si = rh.string_iter();
assert_eq!(si.pair_type(), 1); let mate = si.get_mate();
assert_eq!(mate.pair_type(), 2); assert_eq!(mate.get(), "TTGG");
}
#[test]
fn test_long_read_crossing_word_boundary() {
let seq = "ACGTACGTACGTACGTACGTACGTACGTACGTACGT"; let mut rh = ReadHolder::new(false);
rh.push_back_str(seq);
assert_eq!(rh.total_seq(), 36);
let mut si = rh.string_iter();
assert_eq!(si.get(), seq);
si.advance();
assert!(si.at_end());
let ki = rh.kmer_iter(21);
let first_kmer = ki.get().to_kmer_string(21);
assert_eq!(first_kmer, &seq[seq.len() - 21..]);
}
#[test]
fn test_clear() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("ACGT");
assert_eq!(rh.read_num(), 1);
rh.clear();
assert_eq!(rh.read_num(), 0);
assert_eq!(rh.total_seq(), 0);
}
#[test]
fn test_common_seq_len_all_match() {
let a: [u64; 2] = [0xAAAA_BBBB_CCCC_DDDD, 0x1122_3344_5566_7788];
assert_eq!(common_seq_len(&a, &a, 2), 64);
}
#[test]
fn test_common_seq_len_mismatch_within_first_word() {
let a: u64 = 0b0011_0000;
let b: u64 = 0b0000_0000;
assert_eq!(common_seq_len(&[a], &[b], 1), 2);
}
#[test]
fn test_common_seq_len_mismatch_in_second_word() {
let a = [0u64, 1u64];
let b = [0u64, 0u64];
assert_eq!(common_seq_len(&a, &b, 2), 32);
}
#[test]
fn test_common_seq_len_respects_word_len() {
let a = [0u64, 0u64, 0u64];
let b = [0u64, 0u64, 0u64];
assert_eq!(common_seq_len(&a, &b, 1), 32);
}
#[test]
fn test_b_seq_returns_stored_reversed_bits() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("AAAC");
let si = rh.string_iter();
let mut dest = [0u64; 1];
si.b_seq(0, &mut dest);
assert_eq!(dest[0], 0x01);
}
#[test]
fn test_true_b_seq_forward_order_restores_read() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("AAAC");
let si = rh.string_iter();
let mut dest = [0u64; 1];
si.true_b_seq(0, 0, false, &mut dest);
assert_eq!(dest[0], 0x40);
}
#[test]
fn test_true_b_seq_reverse_complement() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("AAAC");
let si = rh.string_iter();
let mut dest = [0u64; 1];
si.true_b_seq(0, 0, true, &mut dest);
assert_eq!(dest[0], 0xAB);
}
#[test]
fn test_b_seq_respects_shift() {
let mut rh = ReadHolder::new(false);
rh.push_back_str("AAAC");
let si = rh.string_iter();
let mut dest = [0u64; 1];
si.b_seq(1, &mut dest);
assert_eq!(dest[0], 0x00);
}
}