use ahash::RandomState;
use num_traits::{PrimInt, Unsigned};
use std::hash::{BuildHasher, Hasher};
const LETTER_CODE: [u8; 4] = [b'A', b'C', b'T', b'G'];
#[inline(always)]
pub fn encode_base(base: u8) -> u8 {
(base >> 1) & 0x3
}
#[inline(always)]
pub fn decode_base(bitbase: u8) -> u8 {
LETTER_CODE[bitbase as usize]
}
#[inline(always)]
pub fn rc_base(base: u8) -> u8 {
base ^ 2
}
#[inline(always)]
pub fn valid_base(base: u8) -> bool {
base & 0xF != 14
}
#[inline(always)]
pub fn is_ambiguous(mut base: u8) -> bool {
base |= 0x20; !matches!(base, b'a' | b'c' | b'g' | b't' | b'u' | b'-')
}
pub fn base_to_prob(base: u8) -> [f64; 4] {
match base {
b'A' => [1.0, 0.0, 0.0, 0.0],
b'C' => [0.0, 1.0, 0.0, 0.0],
b'G' => [0.0, 0.0, 0.0, 1.0],
b'T' | b'U' => [0.0, 0.0, 1.0, 0.0],
b'R' => [0.5, 0.0, 0.0, 0.5],
b'Y' => [0.0, 0.5, 0.5, 0.0],
b'S' => [0.0, 0.5, 0.0, 0.5],
b'W' => [0.5, 0.0, 0.5, 0.0],
b'K' => [0.0, 0.0, 0.5, 0.5],
b'M' => [0.5, 0.5, 0.0, 0.0],
b'B' => [0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0],
b'D' => [1.0 / 3.0, 0.0, 1.0 / 3.0, 1.0 / 3.0],
b'H' => [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0],
b'V' => [1.0 / 3.0, 1.0 / 3.0, 0.0, 1.0 / 3.0],
b'N' => [0.0, 0.0, 0.0, 0.0], _ => [0.0, 0.0, 0.0, 0.0],
}
}
pub trait UInt<'a>:
PrimInt
+ Unsigned
+ std::fmt::Display
+ std::fmt::Debug
+ std::marker::Send
+ std::marker::Sync
+ std::hash::Hash
+ std::ops::Shl<usize, Output = Self>
+ std::ops::BitAnd
+ std::ops::ShrAssign<i32>
+ std::ops::ShlAssign<i32>
+ std::ops::BitOrAssign
+ std::borrow::BorrowMut<Self>
+ serde::Serialize
+ serde::Deserialize<'a>
{
fn rev_comp(self, k_size: usize) -> Self;
fn lsb_u8(self) -> u8;
fn as_u8(self) -> u8;
fn generate_masks(k: usize) -> (Self, Self);
fn skalo_mask(k: usize) -> Self;
fn encode_kmer(kmer: &[u8]) -> Self {
kmer.iter().fold(Self::zero_init(), |result, nt| {
(result << 2) | (Self::from_encoded_base(encode_base(*nt)))
})
}
fn encode_kmer_str(kmer: &str) -> Self {
Self::encode_kmer(kmer.as_bytes())
}
fn combine_kmers(encoded_kmer1: Self, encoded_kmer2: Self) -> Self {
let last_nucleotide_mask: Self = Self::from_encoded_base(0b11);
let shifted_kmer1 = encoded_kmer1 << 2;
let last_nucleotide = encoded_kmer2 & last_nucleotide_mask;
shifted_kmer1 | last_nucleotide
}
fn get_last_nucl(encoded_kmer: Self) -> char {
let last_bits = Self::as_u8(encoded_kmer & Self::from_encoded_base(0b11));
decode_base(last_bits) as char
}
fn skalo_decode_kmer(encoded: Self, k: usize) -> String {
let mut kmer = String::with_capacity(k);
let mask: Self = Self::skalo_mask(k);
let mut value = encoded & mask;
for _ in 0..k {
let nucleotide =
decode_base(Self::as_u8(value & Self::from_encoded_base(0b11))) as char;
kmer.insert(0, nucleotide);
value >>= 2;
}
kmer
}
fn zero_init() -> Self {
Self::from_encoded_base(0)
}
fn from_encoded_base(encoded_base: u8) -> Self;
fn n_bits() -> u32;
fn hash_val(self, hash_fn: &RandomState) -> u64;
}
impl UInt<'_> for u64 {
#[inline(always)]
fn rev_comp(mut self, k_size: usize) -> Self {
self = (self >> 2 & 0x3333333333333333) | (self & 0x3333333333333333) << 2;
self = (self >> 4 & 0x0F0F0F0F0F0F0F0F) | (self & 0x0F0F0F0F0F0F0F0F) << 4;
self = (self >> 8 & 0x00FF00FF00FF00FF) | (self & 0x00FF00FF00FF00FF) << 8;
self = (self >> 16 & 0x0000FFFF0000FFFF) | (self & 0x0000FFFF0000FFFF) << 16;
self = (self >> 32 & 0x00000000FFFFFFFF) | (self & 0x00000000FFFFFFFF) << 32;
self ^= 0xAAAAAAAAAAAAAAAA;
self >> (2 * (32 - k_size))
}
#[inline(always)]
fn lsb_u8(self) -> u8 {
(self & 0x3) as u8
}
#[inline(always)]
fn as_u8(self) -> u8 {
self as u8
}
#[inline(always)]
fn generate_masks(k: usize) -> (Self, Self) {
let half_size: usize = (k - 1) / 2;
let lower_mask: Self = (1 << (half_size * 2)) - 1;
let upper_mask: Self = lower_mask << (half_size * 2);
(lower_mask, upper_mask)
}
#[inline(always)]
fn skalo_mask(k: usize) -> Self {
(1 << (k * 2)) - 1
}
#[inline(always)]
fn from_encoded_base(encoded_base: u8) -> Self {
encoded_base as Self
}
#[inline(always)]
fn n_bits() -> u32 {
Self::BITS
}
#[inline(always)]
fn hash_val(self, hash_fn: &RandomState) -> u64 {
let mut hasher = hash_fn.build_hasher();
hasher.write_u64(self);
hasher.finish()
}
}
impl UInt<'_> for u128 {
#[inline(always)]
fn rev_comp(mut self, k_size: usize) -> Self {
self = (self >> 2 & 0x33333333333333333333333333333333)
| (self & 0x33333333333333333333333333333333) << 2;
self = (self >> 4 & 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F)
| (self & 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F) << 4;
self = (self >> 8 & 0x00FF00FF00FF00FF00FF00FF00FF00FF)
| (self & 0x00FF00FF00FF00FF00FF00FF00FF00FF) << 8;
self = (self >> 16 & 0x0000FFFF0000FFFF0000FFFF0000FFFF)
| (self & 0x0000FFFF0000FFFF0000FFFF0000FFFF) << 16;
self = (self >> 32 & 0x00000000FFFFFFFF00000000FFFFFFFF)
| (self & 0x00000000FFFFFFFF00000000FFFFFFFF) << 32;
self = (self >> 64 & 0x0000000000000000FFFFFFFFFFFFFFFF)
| (self & 0x0000000000000000FFFFFFFFFFFFFFFF) << 64;
self ^= 0xAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA;
self >> (2 * (64 - k_size))
}
#[inline(always)]
fn lsb_u8(self) -> u8 {
(self & 0x3) as u8
}
#[inline(always)]
fn as_u8(self) -> u8 {
self as u8
}
#[inline(always)]
fn generate_masks(k: usize) -> (Self, Self) {
let half_size: usize = (k - 1) / 2;
let lower_mask: Self = (1 << (half_size * 2)) - 1;
let upper_mask: Self = lower_mask << (half_size * 2);
(lower_mask, upper_mask)
}
#[inline(always)]
fn skalo_mask(k: usize) -> Self {
(1 << (k * 2)) - 1
}
#[inline(always)]
fn from_encoded_base(encoded_base: u8) -> Self {
encoded_base as Self
}
#[inline(always)]
fn n_bits() -> u32 {
Self::BITS
}
#[inline(always)]
fn hash_val(self, hash_fn: &RandomState) -> u64 {
let mut hasher = hash_fn.build_hasher();
hasher.write_u128(self);
hasher.finish()
}
}
pub fn decode_kmer<IntT>(
k: usize,
kmer: IntT,
upper_mask: IntT,
lower_mask: IntT,
) -> (String, String)
where
IntT: for<'a> UInt<'a>,
{
let half_k: usize = (k - 1) / 2;
let mut upper_bits = (kmer & upper_mask) >> (half_k * 2);
let mut upper_kmer = String::with_capacity(half_k);
for _idx in 0..half_k {
let base = decode_base(upper_bits.lsb_u8());
upper_kmer.push(base as char);
upper_bits >>= 2;
}
upper_kmer = upper_kmer.chars().rev().collect::<String>();
let mut lower_bits = kmer & lower_mask;
let mut lower_kmer = String::with_capacity(half_k);
for _idx in 0..half_k {
let base = decode_base(lower_bits.lsb_u8());
lower_kmer.push(base as char);
lower_bits >>= 2;
}
lower_kmer = lower_kmer.chars().rev().collect::<String>();
(upper_kmer, lower_kmer)
}
pub const IUPAC: [u8; 1024] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'A', b'N', b'M', b'D', 0, 0, b'R', b'H', 0, 0, b'D', 0, b'M', b'N', 0, 0, 0, b'R', b'V', b'W', 0, b'V', b'W', 0, b'H', 0, 0, 0, 0, 0, 0, 0, b'A', b'N', b'M', b'D', 0, 0, b'R', b'H', 0, 0, b'D', 0, b'M', b'N', 0, 0, 0, b'R', b'V', b'W', 0, b'V', b'W', 0, b'H', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'M', b'B', b'C', b'N', 0, 0, b'S', b'H', 0, 0, b'B', 0, b'M', b'N', 0, 0, 0, b'V', b'S', b'Y', 0, b'V', b'H', 0, b'Y', 0, 0, 0, 0, 0, 0, 0, b'M', b'B', b'C', b'N', 0, 0, b'S', b'H', 0, 0, b'B', 0, b'M', b'N', 0, 0, 0, b'V', b'S', b'Y', 0, b'V', b'H', 0, b'Y', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'W', b'B', b'Y', b'D', 0, 0, b'K', b'H', 0, 0, b'K', 0, b'H', b'N', 0, 0, 0, b'D', b'B', b'T', 0, b'N', b'W', 0, b'Y', 0, 0, 0, 0, 0, 0, 0, b'W', b'B', b'Y', b'D', 0, 0, b'K', b'H', 0, 0, b'K', 0, b'H', b'N', 0, 0, 0, b'D', b'B', b'T', 0, b'N', b'W', 0, b'Y', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'R', b'B', b'S', b'D', 0, 0, b'G', b'N', 0, 0, b'K', 0, b'V', b'N', 0, 0, 0, b'R', b'S', b'K', 0, b'V', b'D', 0, b'B', 0, 0, 0, 0, 0, 0, 0, b'R', b'B', b'S', b'D', 0, 0, b'G', b'N', 0, 0, b'K', 0, b'V', b'N', 0, 0, 0, b'R', b'S', b'K', 0, b'V', b'D', 0, b'B', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
pub const RC_IUPAC: [u8; 256] = [
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'T', b'V', b'G', b'H', b'-', b'-', b'C', b'D', b'-', b'-', b'M', b'-', b'K', b'N',
b'-', b'-', b'-', b'Y', b'S', b'A', b'-', b'B', b'W', b'-', b'R', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'T', b'V', b'G', b'H', b'-', b'-', b'C', b'D', b'-', b'-', b'M', b'-', b'K', b'N',
b'-', b'-', b'-', b'Y', b'S', b'A', b'-', b'B', b'W', b'-', b'R', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
b'-', ];
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
fn overlap(b1: &[f64; 4], b2: &[f64; 4]) -> f64 {
b1.iter().zip(b2).map(|(p1, p2)| *p1 * p2).sum()
}
#[test]
fn test_base_to_prob() {
let a = base_to_prob(b'A');
let c = base_to_prob(b'C');
let g = base_to_prob(b'G');
let t = base_to_prob(b'T');
let u = base_to_prob(b'U');
let r = base_to_prob(b'R');
let y = base_to_prob(b'Y');
let s = base_to_prob(b'S');
let w = base_to_prob(b'W');
let k = base_to_prob(b'K');
let m = base_to_prob(b'M');
let b = base_to_prob(b'B');
let d = base_to_prob(b'D');
let h = base_to_prob(b'H');
let v = base_to_prob(b'V');
let n = base_to_prob(b'N');
let empty = base_to_prob(b'-');
assert_eq!(overlap(&a, &a), 1.0);
assert_eq!(overlap(&a, &c), 0.0);
assert_eq!(overlap(&t, &u), 1.0);
assert_eq!(overlap(&g, &u), 0.0);
assert_eq!(overlap(&r, &a), 0.5);
assert_eq!(overlap(&r, &y), 0.0);
assert_eq!(overlap(&s, &g), 0.5);
assert_eq!(overlap(&w, &w), 0.5);
assert_eq!(overlap(&m, &y), 0.25);
assert_eq!(overlap(&k, &b), 1.0 / 3.0);
assert_eq!(overlap(&d, &h), 2.0 / 9.0);
assert_eq!(overlap(&v, &n), 0.0);
assert_eq!(overlap(&n, &empty), 0.0);
}
}