use log::trace;
use rand::distr::*;
use rand::prelude::*;
use rand_xoshiro::Xoshiro256PlusPlus;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::marker::PhantomData;
use num::{Bounded, FromPrimitive, Integer, ToPrimitive, Unsigned};
use crate::fyshuffle::*;
#[allow(unused_imports)]
use crate::invhash;
pub struct NoHashHasher(u64);
impl Default for NoHashHasher {
#[inline]
fn default() -> NoHashHasher {
NoHashHasher(0x0000000000000000)
}
}
impl Hasher for NoHashHasher {
#[inline]
fn write(&mut self, bytes: &[u8]) {
match bytes.len() {
4 => {
*self = NoHashHasher(
((bytes[0] as u64) << 24)
| ((bytes[1] as u64) << 16)
| ((bytes[2] as u64) << 8)
| (bytes[3] as u64),
);
}
8 => {
*self = NoHashHasher(
((bytes[0] as u64) << 56)
| ((bytes[1] as u64) << 48)
| ((bytes[2] as u64) << 40)
| ((bytes[3] as u64) << 32)
| ((bytes[4] as u64) << 24)
| ((bytes[5] as u64) << 16)
| ((bytes[6] as u64) << 8)
| (bytes[7] as u64),
);
}
_ => panic!("bad slice len in NoHashHasher write"),
} }
fn finish(&self) -> u64 {
self.0
}
}
pub struct SuperMinHash2<I: Integer, T: Hash, H: Hasher + Default> {
hsketch: Vec<I>,
values: Vec<usize>,
l: Vec<usize>,
b: Vec<usize>,
item_rank: usize,
a_upper: usize,
permut_generator: FYshuffle,
b_hasher: BuildHasherDefault<H>,
t_marker: PhantomData<T>,
f_marker: PhantomData<I>,
}
impl<I, T, H: Hasher + Default> SuperMinHash2<I, T, H>
where
I: Integer
+ Unsigned
+ ToPrimitive
+ FromPrimitive
+ Bounded
+ Copy
+ Clone
+ Send
+ Sync
+ std::fmt::Debug,
T: Hash,
{
pub fn new(size: usize, build_hasher: BuildHasherDefault<H>) -> SuperMinHash2<I, T, H> {
log::info!("\n allocating-sketcher \n");
let size_i = std::mem::size_of::<I>();
assert!(
size_i >= 4,
"hash values should have at least 4 bytes,so u32 or u64"
); let sketch_init: Vec<I> = (0..size).map(|_| I::zero()).collect();
let values: Vec<usize> = (0..size).map(|_| usize::MAX).collect();
let l: Vec<usize> = (0..size).map(|_| size - 1).collect();
let mut b: Vec<usize> = (0..size).map(|_| 0).collect();
b[size - 1] = size;
let permut_generator = FYshuffle::new(size);
SuperMinHash2 {
hsketch: sketch_init,
values,
l,
b,
item_rank: 0,
a_upper: size - 1,
permut_generator,
b_hasher: build_hasher,
t_marker: PhantomData,
f_marker: PhantomData,
}
}
pub fn reinit(&mut self) {
let size = self.hsketch.len();
self.values.fill(usize::MAX);
self.l.fill(size - 1);
self.b.fill(0);
self.b[size - 1] = size;
self.hsketch.fill(I::zero());
self.item_rank = 0;
self.a_upper = size - 1;
self.permut_generator.reset();
}
pub fn get_hsketch(&self) -> &Vec<I> {
&self.hsketch
}
pub fn get_jaccard_index_estimate(&self, other_sketch: &[I]) -> Result<f64, ()> {
if other_sketch.len() != self.hsketch.len() {
return Err(());
}
let mut count: usize = 0;
for i in 0..self.hsketch.len() {
if self.hsketch[i] == other_sketch[i] {
trace!(" intersection {:?} ", self.hsketch[i]);
count += 1;
}
}
Ok(count as f64 / other_sketch.len() as f64)
}
pub fn sketch(&mut self, to_sketch: &T) -> Result<(), ()> {
let mut hasher = self.b_hasher.build_hasher();
to_sketch.hash(&mut hasher);
let hval_64: u64 = hasher.finish();
let hval_i = I::from_u64(hval_64).unwrap();
let mut rand_generator = Xoshiro256PlusPlus::seed_from_u64(hval_64);
let distr_unif = Uniform::new(0u64, usize::MAX as u64).unwrap();
self.permut_generator.reset();
log::trace!("item-rank : {}", self.item_rank);
let mut j: usize = 0;
while j <= self.a_upper {
let r = distr_unif.sample(&mut rand_generator) as usize;
let k = self.permut_generator.next(&mut rand_generator);
log::trace!(
"before: j {} k {} l[k] {}, upper : {}",
j,
k,
self.l[k],
self.a_upper
);
if self.l[k] >= j {
if self.l[k] == j {
if r <= self.values[k] {
self.values[k] = r;
self.hsketch[k] = hval_i;
}
} else {
self.b[self.l[k]] -= 1;
self.b[j] += 1;
while self.b[self.a_upper] == 0 {
self.a_upper -= 1;
}
self.l[k] = j;
self.values[k] = r;
self.hsketch[k] = hval_i;
log::debug!(
"after : j {} k {} l[k] {}, upper : {}",
j,
k,
self.l[k],
self.a_upper
);
}
}
j += 1;
} self.item_rank += 1;
Ok(())
}
pub fn sketch_slice(&mut self, to_sketch: &[T]) -> Result<(), ()> {
if to_sketch.is_empty() {
println!(" empty arg");
return Err(());
}
for val in to_sketch {
self.sketch(val).unwrap();
}
Ok(())
} }
pub fn compute_superminhash_jaccard<F: PartialEq + num::Zero + std::fmt::Debug>(
hsketch: &[F],
other_sketch: &[F],
) -> Result<f32, ()> {
if hsketch.len() != other_sketch.len() {
return Err(());
}
let mut count: usize = 0;
for i in 0..hsketch.len() {
if hsketch[i] == other_sketch[i] {
trace!(" intersection {:?} ", hsketch[i]);
count += 1;
}
}
Ok(count as f32 / other_sketch.len() as f32)
}
#[inline]
pub fn get_jaccard_index_estimate<F: PartialEq + num::Zero + std::fmt::Debug>(
hsketch: &[F],
other_sketch: &[F],
) -> Result<f32, ()> {
compute_superminhash_jaccard::<F>(hsketch, other_sketch)
}
#[cfg(test)]
mod tests {
use super::*;
use fnv::FnvHasher;
use twox_hash::XxHash32;
#[allow(dead_code)]
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
#[test]
fn test_build_hasher() {
let bh = BuildHasherDefault::<FnvHasher>::default();
let _new_hasher = bh.build_hasher();
let _sminhash: SuperMinHash2<u64, u64, FnvHasher> = SuperMinHash2::new(10, bh);
}
#[test]
fn test_range_intersection_fnv_u64() {
log_init_test();
let va: Vec<usize> = (0..1000).collect();
let vb: Vec<usize> = (900..2000).collect();
let inter = 100;
let jexact = inter as f32 / 2000.;
let size = 100;
let bh = BuildHasherDefault::<FnvHasher>::default();
let mut sminhash: SuperMinHash2<u64, usize, FnvHasher> = SuperMinHash2::new(size, bh);
let resa = sminhash.sketch_slice(&va);
if !resa.is_ok() {
println!("error in sketcing va");
return;
}
let ska = sminhash.get_hsketch().clone();
sminhash.reinit();
let resb = sminhash.sketch_slice(&vb);
if !resb.is_ok() {
println!("error in sketching vb");
return;
}
let skb = sminhash.get_hsketch();
let jac = compute_superminhash_jaccard(&ska, &skb).unwrap();
let sigma = (jac * (1. - jac) / size as f32).sqrt();
println!(
" jaccard estimate {:.3e}, j exact : {:.3e} , sigma : {:.3} ",
jac, jexact, sigma
);
assert!(jac > 0. && jac < jexact + 3. * sigma);
}
#[test]
fn test_range_intersection_fnv_u32() {
log_init_test();
let va: Vec<u64> = (0..1000).collect();
let vb: Vec<u64> = (900..2000).collect();
let inter = 100;
let jexact = inter as f32 / 2000.;
let size = 70000;
let bh = BuildHasherDefault::<XxHash32>::default();
let mut sminhash: SuperMinHash2<u32, u64, XxHash32> = SuperMinHash2::new(size, bh);
let resa = sminhash.sketch_slice(&va);
if !resa.is_ok() {
println!("error in sketcing va");
return;
}
let ska = sminhash.get_hsketch().clone();
sminhash.reinit();
let resb = sminhash.sketch_slice(&vb);
if !resb.is_ok() {
println!("error in sketching vb");
return;
}
let skb = sminhash.get_hsketch();
log::debug!("ska : {:?}", ska);
log::debug!("skb : {:?}", skb);
let jac = compute_superminhash_jaccard(&ska, &skb).unwrap();
let sigma = (jac * (1. - jac) / size as f32).sqrt();
println!(
" jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} ",
jac, jexact, sigma
);
assert!(jac > 0. && jac < jexact + 3. * sigma);
}
#[test]
fn test_range_intersection_already_hashed_u64() {
log_init_test();
let va: Vec<u64> = (0..1000).map(|x| invhash::int64_hash(x)).collect();
let vb: Vec<u64> = (900..2000).map(|x| invhash::int64_hash(x)).collect();
let inter = 100;
let jexact = inter as f32 / 2000.;
let size = 500;
let bh = BuildHasherDefault::<NoHashHasher>::default();
let mut sminhash: SuperMinHash2<u64, u64, NoHashHasher> = SuperMinHash2::new(size, bh);
trace!("sketching a ");
let resa = sminhash.sketch_slice(&va);
if !resa.is_ok() {
println!("error in sketcing va");
return;
}
let ska = sminhash.get_hsketch().clone();
sminhash.reinit();
trace!("\n \n sketching b ");
let resb = sminhash.sketch_slice(&vb);
if !resb.is_ok() {
println!("error in sketching vb");
return;
}
let skb = sminhash.get_hsketch();
let jac = compute_superminhash_jaccard(&ska, &skb).unwrap();
let sigma = (jac * (1. - jac) / size as f32).sqrt();
println!(
" jaccard estimate : {:.3e} exact value : {:.3e} , sigma : {:.3e}",
jac, jexact, sigma
);
assert!(jac > 0. && jac < jexact + 3. * sigma);
} }