use std::io::Cursor;
use rand::distr::*;
use rand::prelude::*;
use rand_distr::uniform::SampleUniform;
use rand_xoshiro::Xoshiro256PlusPlus;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::marker::PhantomData;
use murmur3::murmur3_32;
use rand_chacha::ChaCha12Rng;
use num::Float;
pub struct OptDensMinHash<F: Float, D: Hash, H: Hasher + Default> {
hsketch: Vec<F>,
values: Vec<u64>,
init: Vec<bool>,
nb_empty: i64,
b_hasher: BuildHasherDefault<H>,
t_marker: PhantomData<D>,
}
impl<F: Float + SampleUniform + std::fmt::Debug, D: Hash + Copy, H: Hasher + Default>
OptDensMinHash<F, D, H>
{
pub fn new(size: usize, build_hasher: BuildHasherDefault<H>) -> OptDensMinHash<F, D, H> {
let mut sketch_init = Vec::<F>::with_capacity(size);
let mut values = Vec::<u64>::with_capacity(size);
let mut init = Vec::<bool>::with_capacity(size);
let large: F = F::from(u32::MAX).unwrap(); let nb_empty = size as i64;
for _i in 0..size {
sketch_init.push(large);
values.push(u64::MAX);
init.push(false);
}
OptDensMinHash {
hsketch: sketch_init,
values,
init,
nb_empty,
b_hasher: build_hasher,
t_marker: PhantomData,
}
}
pub fn reinit(&mut self) {
let size = self.hsketch.len();
let large: F = F::from(u32::MAX).unwrap();
for i in 0..size {
self.hsketch[i] = large;
self.values[i] = u64::MAX;
self.init[i] = false;
}
self.nb_empty = size as i64;
}
#[allow(unused)]
fn get_nb_empty(&self) -> i64 {
self.nb_empty
}
pub fn get_hsketch(&self) -> &Vec<F> {
if self.nb_empty > 0 {
log::error!("OptDensMinHash: end_sketch should have been called");
std::panic!("OptDensMinHash: end_sketch should have been called")
}
&self.hsketch
}
pub fn get_hsketch_u64(&self) -> Vec<u64> {
if self.nb_empty > 0 {
log::error!("OptDensMinHash: end_sketch should have been called");
std::panic!("OptDensMinHash: end_sketch should have been called")
}
self.values.clone()
}
pub fn get_hsketch_u32(&self) -> Vec<u32> {
if self.nb_empty > 0 {
log::error!("OptDensMinHash: end_sketch should have been called");
std::panic!("OptDensMinHash: end_sketch should have been called")
}
self.values
.iter()
.map(|v| murmur3_32(&mut Cursor::new(v.to_ne_bytes()), 127).unwrap())
.collect()
}
pub fn sketch_slice(&mut self, to_sketch: &[D]) -> anyhow::Result<()> {
let m: usize = self.hsketch.len();
for d in to_sketch {
self.sketch(d);
}
log::debug!(
"optdensminhash::sketch_slice sketch size : {:?}, nb empy slots : {:?}",
m,
self.nb_empty
);
if self.nb_empty > 0 {
let res = self.densify();
assert!(res.is_ok());
}
Ok(())
}
pub fn sketch(&mut self, to_sketch: &D) {
let m = self.hsketch.len();
let unit_range = Uniform::<F>::new(num::zero::<F>(), num::one::<F>()).unwrap();
let hval1: u64 = self.b_hasher.hash_one(&to_sketch);
let mut rand_generator = Xoshiro256PlusPlus::seed_from_u64(hval1);
let r: F = unit_range.sample(&mut rand_generator);
let k: usize = Uniform::<usize>::new(0, m)
.unwrap()
.sample(&mut rand_generator); if r <= self.hsketch[k] {
self.hsketch[k] = r;
self.values[k] = hval1;
if !self.init[k] {
self.init[k] = true;
self.nb_empty -= 1;
}
}
}
pub fn end_sketch(&mut self) {
if self.nb_empty == 0 {
return;
}
let res = self.densify();
assert!(res.is_ok());
}
fn densify(&mut self) -> anyhow::Result<()> {
let m: usize = self.hsketch.len();
let mut nbpass = 1u64;
let inrange = Uniform::<usize>::new(0, m).unwrap();
for k in 0..m {
if !self.init[k] {
let mut rng2 = ChaCha12Rng::seed_from_u64(k as u64 + 123743);
loop {
let j: usize = inrange.sample(&mut rng2);
if self.init[j] {
self.values[k] = self.values[j];
self.hsketch[k] = self.hsketch[j];
self.init[k] = true;
self.nb_empty -= 1;
break;
}
nbpass += 1;
}
}
}
log::debug!("end of pass {}, nb empty : {}", nbpass, self.nb_empty);
assert_eq!(self.nb_empty, 0);
Ok(())
} }
pub struct RevOptDensMinHash<F: Float, D: Hash, H: Hasher + Default> {
hsketch: Vec<F>,
values: Vec<u64>,
init: Vec<bool>,
nb_empty: i64,
b_hasher: BuildHasherDefault<H>,
t_marker: PhantomData<D>,
}
impl<F: Float + SampleUniform + std::fmt::Debug, D: Hash + Copy, H: Hasher + Default>
RevOptDensMinHash<F, D, H>
{
pub fn new(size: usize, build_hasher: BuildHasherDefault<H>) -> RevOptDensMinHash<F, D, H> {
let mut sketch_init = Vec::<F>::with_capacity(size);
let mut values: Vec<u64> = Vec::<u64>::with_capacity(size);
let mut init: Vec<bool> = Vec::<bool>::with_capacity(size);
let large: F = F::from(u32::MAX).unwrap(); let nb_empty = size as i64;
for _i in 0..size {
sketch_init.push(large);
values.push(u64::MAX);
init.push(false);
}
RevOptDensMinHash {
hsketch: sketch_init,
values,
init,
nb_empty,
b_hasher: build_hasher,
t_marker: PhantomData,
}
}
pub fn reinit(&mut self) {
let size = self.hsketch.len();
let large: F = F::from(u32::MAX).unwrap();
for i in 0..size {
self.hsketch[i] = large;
self.values[i] = u64::MAX;
self.init[i] = false;
}
self.nb_empty = size as i64;
}
pub fn get_hsketch(&self) -> &Vec<F> {
if self.nb_empty > 0 {
log::error!("RevOptDensMinHash: end_sketch should have been called");
std::panic!("RevOptDensMinHash: end_sketch should have been called")
}
&self.hsketch
}
pub fn get_hsketch_u64(&self) -> Vec<u64> {
if self.nb_empty > 0 {
log::error!("RevOptDensMinHash: end_sketch should have been called");
std::panic!("RevOptDensMinHash: end_sketch should have been called")
}
self.values.clone()
}
pub fn get_hsketch_u32(&self) -> Vec<u32> {
if self.nb_empty > 0 {
log::error!("OptDensMinHash: end_sketch should have been called");
std::panic!("OptDensMinHash: end_sketch should have been called")
}
self.values
.iter()
.map(|v| murmur3_32(&mut Cursor::new(v.to_ne_bytes()), 127).unwrap())
.collect()
}
pub fn sketch(&mut self, d: &D) {
let m: usize = self.hsketch.len();
let unif_0m = Uniform::<usize>::new(0, m).unwrap();
let hval1: u64 = self.b_hasher.hash_one(&d);
let mut rand_generator = Xoshiro256PlusPlus::seed_from_u64(hval1);
let unit_range = Uniform::<F>::new(num::zero::<F>(), num::one::<F>()).unwrap();
let r: F = unit_range.sample(&mut rand_generator);
let k: usize = unif_0m.sample(&mut rand_generator); if r <= self.hsketch[k] {
self.hsketch[k] = r;
self.values[k] = hval1;
if !self.init[k] {
self.init[k] = true;
self.nb_empty -= 1;
}
}
}
pub fn sketch_slice(&mut self, to_sketch: &[D]) -> anyhow::Result<()> {
let m: usize = self.hsketch.len();
for d in to_sketch {
self.sketch(d);
}
if self.nb_empty > 0 {
let res = self.densify();
assert!(res.is_ok());
}
log::debug!(
"fastdensminhash::sketch_slice sketch size : {:?}, nb empy slots : {:?}",
m,
self.nb_empty
);
Ok(())
}
fn densify(&mut self) -> anyhow::Result<()> {
let m: usize = self.hsketch.len();
let unif_m = Uniform::<usize>::new(0, m).unwrap();
let mut pass: u64 = 1;
while self.nb_empty > 0 {
for k in 0..m {
if self.init[k] {
let mut rng2 =
ChaCha12Rng::seed_from_u64((k as u64 + 1) * m as u64 + pass + 253713);
let j: usize = unif_m.sample(&mut rng2);
if !self.init[j] {
self.values[j] = self.values[k];
self.hsketch[j] = self.hsketch[k];
self.init[j] = true;
self.nb_empty -= 1;
}
}
}
pass += 1;
log::debug!("end of pass {}, nb empty : {}", pass, self.nb_empty);
}
log::debug!("end of pass {}, nb empty : {}", pass, self.nb_empty);
assert_eq!(self.nb_empty, 0);
Ok(())
}
pub fn end_sketch(&mut self) {
if self.nb_empty == 0 {
return;
}
let res = self.densify();
assert!(res.is_ok());
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::jaccard::*;
use fnv::FnvHasher;
#[allow(dead_code)]
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
#[test]
fn test_optdens_manybins_fnv_f64() {
log_init_test();
let vamax = 1000;
let va: Vec<usize> = (0..vamax).collect();
let vbmin = 900;
let vbmax = 2000;
let vb: Vec<usize> = (vbmin..vbmax).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let size = 1000;
let nbtest: usize = 50;
let mut deltavec = Vec::<f64>::with_capacity(nbtest);
for j in 1..nbtest {
let sketch_size = size * j;
let opt_res = test_optdens(&va, &vb, jexact, sketch_size);
let res = opt_res.unwrap();
let delta = (jexact - res.0).abs() / res.1;
deltavec.push(delta);
}
let mean_delta = deltavec.iter().sum::<f64>() / deltavec.len() as f64;
log::info!(
"test_optdens_manybins_fnv_f64 mean delta-j/sigma : {:.3e}",
mean_delta
);
}
#[test]
fn test_optdens_fewbins_fnv_f64() {
log_init_test();
let vamax = 300000;
let va: Vec<usize> = (0..vamax).collect();
let vbmin = 50000;
let vbmax = 2 * vamax;
let vb: Vec<usize> = (vbmin..vbmax).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let size = 50000;
let _res = test_optdens(&va, &vb, jexact, size);
}
#[test]
fn test_revoptdens_fewbins_fnv_f64() {
log_init_test();
let vamax = 300000;
let va: Vec<usize> = (0..vamax).collect();
let vbmin = 50000;
let vbmax = 2 * vamax;
let vb: Vec<usize> = (vbmin..vbmax).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let size = 50000;
let res = test_revoptdens(&va, &vb, jexact, size).unwrap();
assert!(res.0 > 0. && (res.0 - jexact).abs() < 3. * res.1);
}
#[test]
fn test_revoptdens_manybins_fnv_f64() {
log_init_test();
let vamax = 1000;
let va: Vec<usize> = (0..vamax).collect();
let vbmin = 900;
let vbmax = 2000;
let vb: Vec<usize> = (vbmin..vbmax).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let size = 1000;
let nbtest: usize = 50;
let mut deltavec = Vec::<f64>::with_capacity(nbtest);
for j in 1..nbtest {
let sketch_size = size * j;
let opt_res = test_revoptdens(&va, &vb, jexact, sketch_size);
let res = opt_res.unwrap();
let delta = (jexact - res.0).abs() / res.1;
deltavec.push(delta);
}
let mean_delta = deltavec.iter().sum::<f64>() / deltavec.len() as f64;
log::info!(
"test_revoptdens_manybins_fnv_f64 mean delta-j/sigma : {:.3e}",
mean_delta
);
}
fn test_optdens(
va: &Vec<usize>,
vb: &Vec<usize>,
jexact: f64,
size: usize,
) -> Result<(f64, f64), ()> {
log::info!(
"test_optdens length va : {} length vb : {}",
va.len(),
vb.len(),
);
let bh = BuildHasherDefault::<FnvHasher>::default();
let mut sminhash: OptDensMinHash<f64, usize, FnvHasher> = OptDensMinHash::new(size, bh);
let resa = sminhash.sketch_slice(&va);
if !resa.is_ok() {
println!("error in sketcing va");
return Err(());
}
let ska = sminhash.get_hsketch().clone();
let ska_u64 = sminhash.get_hsketch_u64().clone();
let ska_u32 = sminhash.get_hsketch_u32();
sminhash.reinit();
let resb = sminhash.sketch_slice(&vb);
if !resb.is_ok() {
println!("error in sketching vb");
return Err(());
}
let skb = sminhash.get_hsketch();
let skb_u64 = sminhash.get_hsketch_u64();
let skb_u32 = sminhash.get_hsketch_u32();
let jac = get_jaccard_index_estimate(&ska, &skb).unwrap();
let sigma = (jexact * (1. - jexact) / size as f64).sqrt();
let delta = (jac - jexact).abs() / sigma;
log::info!(
" f64 sketch jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} j-error/sigma : {:.3e}",
jac,
jexact,
sigma,
delta
);
let jac_u64 = get_jaccard_index_estimate(&ska_u64, &skb_u64).unwrap();
let sigma = (jexact * (1. - jexact) / size as f64).sqrt();
let delta = (jac_u64 - jexact).abs() / sigma;
log::info!(
" u64 sketch jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} j-error/sigma : {:.3e}",
jac_u64,
jexact,
sigma,
delta
);
let jac_u32 = get_jaccard_index_estimate(&ska_u32, &skb_u32).unwrap();
let sigma = (jexact * (1. - jexact) / size as f64).sqrt();
let delta = (jac_u32 - jexact).abs() / sigma;
log::info!(
" u32 sketch jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} j-error/sigma : {:.3e}",
jac_u32,
jexact,
sigma,
delta
);
Ok((jac, sigma))
}
fn test_revoptdens(
va: &Vec<usize>,
vb: &Vec<usize>,
jexact: f64,
size: usize,
) -> Result<(f64, f64), ()> {
let bh = BuildHasherDefault::<FnvHasher>::default();
let mut sminhash: RevOptDensMinHash<f64, usize, FnvHasher> =
RevOptDensMinHash::new(size, bh);
let resa = sminhash.sketch_slice(&va);
if !resa.is_ok() {
println!("error in sketcing va");
return Err(());
}
let ska = sminhash.get_hsketch().clone();
let ska_u64 = sminhash.get_hsketch_u64().clone();
let ska_u32 = sminhash.get_hsketch_u32();
sminhash.reinit();
let resb = sminhash.sketch_slice(&vb);
if !resb.is_ok() {
println!("error in sketching vb");
return Err(());
}
let skb = sminhash.get_hsketch();
let skb_u64 = sminhash.get_hsketch_u64();
let skb_u32 = sminhash.get_hsketch_u32();
let jac = get_jaccard_index_estimate(&ska, &skb).unwrap();
let sigma = (jexact * (1. - jexact) / size as f64).sqrt();
let delta = (jac - jexact).abs() / sigma;
log::info!(
" jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} j-error/sigma : {:.3e}",
jac,
jexact,
sigma,
delta
);
assert!(jac > 0.);
let jac_u64 = get_jaccard_index_estimate(&ska_u64, &skb_u64).unwrap();
let sigma = (jexact * (1. - jexact) / size as f64).sqrt();
let delta = (jac_u64 - jexact).abs() / sigma;
log::info!(
" u64 sketch jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} j-error/sigma : {:.3e}",
jac_u64,
jexact,
sigma,
delta
);
let jac_u32 = get_jaccard_index_estimate(&ska_u32, &skb_u32).unwrap();
let sigma = (jexact * (1. - jexact) / size as f64).sqrt();
let delta = (jac_u32 - jexact).abs() / sigma;
log::info!(
" u32 sketch jaccard estimate {:.3e}, j exact : {:.3e}, sigma : {:.3e} j-error/sigma : {:.3e}",
jac_u32,
jexact,
sigma,
delta
);
Ok((jac, sigma))
} }