use rustc_hash::FxHashSet;
use std::hash::Hash;
use xxhash_rust::xxh3::{xxh3_64, xxh3_128};
pub struct HashVerifier<T>
where
T: Hash + Eq,
{
memory: FxHashSet<T>,
}
impl<T: Hash + Eq> HashVerifier<T> {
    /// Creates an empty verifier whose backing set is pre-sized for
    /// `estimated_capacity` entries.
    pub fn new(estimated_capacity: usize) -> Self {
        let memory = FxHashSet::with_capacity_and_hasher(estimated_capacity, Default::default());
        Self { memory }
    }

    /// Records `hash` and reports whether it was new.
    ///
    /// Returns `true` the first time a given value is passed in and `false`
    /// on every later call with an equal value.
    pub fn verify(&mut self, hash: T) -> bool {
        // `insert` already answers "was this value absent before?".
        self.memory.insert(hash)
    }
}
/// A hash value type that can be computed directly from raw sequence bytes.
///
/// Implementors are plain copyable values that can be compared, hashed and
/// shared across threads.
pub trait SequenceHasher: Copy + Eq + Hash + Send + Sync {
    /// Computes the hash of a single byte sequence.
    fn hash_sequence(seq: &[u8]) -> Self;

    /// Computes one combined hash for two byte sequences.
    fn hash_pair(seq1: &[u8], seq2: &[u8]) -> Self;
}
impl SequenceHasher for u64 {
    /// Hashes a single sequence with 64-bit XXH3.
    #[inline]
    fn hash_sequence(seq: &[u8]) -> Self {
        xxh3_64(seq)
    }

    /// Hashes an ordered pair of sequences into one 64-bit value.
    ///
    /// Each sequence is digested on its own, the two digests are laid out
    /// back to back (first mate, then second mate) and that 16-byte buffer is
    /// hashed once more.
    ///
    /// A plain XOR/rotate mix of the two digests is deliberately avoided: with
    /// it, swapping the mates merely rotates the result, and two identical
    /// mates always yield a value whose upper and lower halves are equal,
    /// leaving only 32 bits of entropy.
    #[inline]
    fn hash_pair(seq1: &[u8], seq2: &[u8]) -> Self {
        let mut digests = [0u8; 16];
        let (first, second) = digests.split_at_mut(8);
        // Fixed little-endian layout keeps the result platform independent.
        first.copy_from_slice(&xxh3_64(seq1).to_le_bytes());
        second.copy_from_slice(&xxh3_64(seq2).to_le_bytes());
        xxh3_64(&digests)
    }
}
impl SequenceHasher for u128 {
    /// Hashes a single sequence with 128-bit XXH3.
    #[inline]
    fn hash_sequence(seq: &[u8]) -> Self {
        xxh3_128(seq)
    }

    /// Hashes an ordered pair of sequences into one 128-bit value.
    ///
    /// Each sequence is digested on its own, the two digests are laid out
    /// back to back (first mate, then second mate) and that 32-byte buffer is
    /// hashed once more.
    ///
    /// A plain XOR/rotate mix of the two digests is deliberately avoided: with
    /// it, swapping the mates merely rotates the result, and two identical
    /// mates always yield a value whose upper and lower halves are equal,
    /// halving the effective width.
    #[inline]
    fn hash_pair(seq1: &[u8], seq2: &[u8]) -> Self {
        let mut digests = [0u8; 32];
        let (first, second) = digests.split_at_mut(16);
        // Fixed little-endian layout keeps the result platform independent.
        first.copy_from_slice(&xxh3_128(seq1).to_le_bytes());
        second.copy_from_slice(&xxh3_128(seq2).to_le_bytes());
        xxh3_128(&digests)
    }
}
/// Selects which XXH3 variant to use.
///
/// The enum is a plain, fieldless selector, so it is cheap to copy and can be
/// compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HashType {
    /// The 64-bit XXH3 variant.
    XXH3_64,
    /// The 128-bit XXH3 variant.
    XXH3_128,
}
impl HashType {
pub fn to_num(&self) -> usize {
match self {
HashType::XXH3_64 => 64,
HashType::XXH3_128 => 128,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Two distinct sequences shared by the hashing tests below.
    const MATE_A: &[u8] = b"ATCGATCG";
    const MATE_B: &[u8] = b"GCTAGCTA";

    /// Hashing the same sequence twice gives the same value at both widths.
    #[test]
    fn test_hash_sequence_deterministic() {
        let (first_64, second_64) = (u64::hash_sequence(MATE_A), u64::hash_sequence(MATE_A));
        assert_eq!(first_64, second_64);

        let (first_128, second_128) = (u128::hash_sequence(MATE_A), u128::hash_sequence(MATE_A));
        assert_eq!(first_128, second_128);
    }

    /// Hashing the same pair twice gives the same value at both widths.
    #[test]
    fn test_hash_pair_deterministic() {
        let (first_64, second_64) = (
            u64::hash_pair(MATE_A, MATE_B),
            u64::hash_pair(MATE_A, MATE_B),
        );
        assert_eq!(first_64, second_64);

        let (first_128, second_128) = (
            u128::hash_pair(MATE_A, MATE_B),
            u128::hash_pair(MATE_A, MATE_B),
        );
        assert_eq!(first_128, second_128);
    }

    /// Swapping the two sequences must change the pair hash.
    #[test]
    fn test_hash_pair_order_dependent() {
        let forward_64 = u64::hash_pair(MATE_A, MATE_B);
        let swapped_64 = u64::hash_pair(MATE_B, MATE_A);
        assert_ne!(forward_64, swapped_64);

        let forward_128 = u128::hash_pair(MATE_A, MATE_B);
        let swapped_128 = u128::hash_pair(MATE_B, MATE_A);
        assert_ne!(forward_128, swapped_128);
    }

    /// A pair made of the same sequence twice must not hash to zero.
    #[test]
    fn test_hash_pair_identical_mates_still_valid() {
        let same_64 = u64::hash_pair(MATE_A, MATE_A);
        assert_ne!(same_64, 0);

        let same_128 = u128::hash_pair(MATE_A, MATE_A);
        assert_ne!(same_128, 0);
    }

    /// The verifier accepts a hash once and rejects the repeat.
    #[test]
    fn test_verifier_detects_duplicate() {
        let mut verifier = HashVerifier::<u64>::new(16);
        let digest = u64::hash_sequence(b"ATCG");
        assert!(verifier.verify(digest));
        assert!(!verifier.verify(digest));
    }

    /// Hashes of two different sequences are both accepted.
    #[test]
    fn test_verifier_distinct_sequences() {
        let mut verifier = HashVerifier::<u64>::new(16);
        let first = u64::hash_sequence(b"ATCG");
        let second = u64::hash_sequence(b"GCTA");
        assert!(verifier.verify(first));
        assert!(verifier.verify(second));
    }
}