#[allow(unused_imports)]
use log::{trace,debug};
use std::fmt::{Debug};
use rand::distributions::{Distribution,Uniform};
use rand_distr::Exp1;
use rand::prelude::*;
use rand_xoshiro::Xoshiro256PlusPlus;
use std::hash::{BuildHasher, BuildHasherDefault, Hasher, Hash};
use indexmap::{IndexMap};
/// Sampler parameterised by `lambda` that returns values in the unit interval.
///
/// The three constants are computed once in `new` and reused by every call to `sample`.
/// NOTE(review): the name suggests an exponential law of rate `lambda` restricted to [0,1) —
/// confirm against the reference algorithm.
#[derive(Clone, Copy, Debug)]
pub struct ExpRestricted01 {
/// parameter the constants below are derived from
lambda : f64,
/// (exp(lambda) - 1) / lambda
c1 : f64,
/// ln(2 / (1 + exp(-lambda))) / lambda
c2 : f64,
/// (1 - exp(-lambda)) / lambda
c3 : f64,
/// uniform distribution on [0,1) used for every random draw
unit_range : Uniform<f64>,
}
impl ExpRestricted01 {
    /// Builds the sampler for parameter `lambda`.
    ///
    /// The constants used by `sample` are precomputed here:
    /// `c1 = (exp(lambda) - 1) / lambda`, `c2 = ln(2 / (1 + exp(-lambda))) / lambda`
    /// and `c3 = (1 - exp(-lambda)) / lambda`.
    pub fn new(lambda: f64) -> Self {
        // exp(-lambda) appears in both c2 and c3
        let exp_neg_lambda = (-lambda).exp();
        ExpRestricted01 {
            lambda,
            c1: lambda.exp_m1() / lambda,
            c2: (2. / (1. + exp_neg_lambda)).ln() / lambda,
            c3: (1. - exp_neg_lambda) / lambda,
            unit_range: Uniform::<f64>::new(0., 1.),
        }
    }

    /// Returns the `lambda` this sampler was built with.
    pub fn get_lambda(&self) -> f64 {
        self.lambda
    }
}
/// Sampling by a direct attempt followed, if it fails, by an accept/reject loop.
impl Distribution<f64> for ExpRestricted01 {
/// Draws one value using uniform draws from `rng`.
///
/// Every returned value is below 1: either the first scaled draw when it is < 1,
/// or a uniform draw on [0,1) (possibly mirrored as `1 - x`) accepted inside the loop.
fn sample<R : Rng + ?Sized>(&self, rng: &mut R) -> f64 {
// first attempt : a uniform draw scaled by c1 is returned directly when it is below 1
let mut x = self.c1 * rng.sample(&self.unit_range);
if x < 1. { return x }
// otherwise loop until one of the acceptance tests below succeeds
loop {
x = rng.sample(&self.unit_range);
// immediate acceptance for small x
if x < self.c2 { return x}
// second coordinate, drawn in [0, 0.5)
let mut y = 0.5 * rng.sample(&self.unit_range);
// mirror the point (x, y) -> (1 - x, 1 - y) when y > 1 - x
if y > 1. - x {
x = 1. - x;
y = 1. - y;
}
// two tests using only the precomputed constants, tried before the test calling exp_m1
if x <= self.c3 * (1. - y) { return x }
if self.c1 * y <= (1. - x) { return x }
// last test, the only one evaluating exp_m1; if it fails a new point is drawn
if y * self.c1 * self.lambda <= (self.lambda * (1.- x)).exp_m1() { return x }
}
}
}
/// Keeps one value per slot and makes the maximum over all slots available in constant time.
///
/// `values` is a flat array : indices `0..m` are the slots, the node at index `i` has its
/// parent at index `m + i/2`, and the node at `last_index` is the value returned by `get_max_value`.
struct MaxValueTracker {
/// number of slots
m : usize,
/// index of the last node of `values` (2*m - 2)
last_index : usize,
/// slot values followed by the internal nodes, all initialised to infinity
values : Vec<f64>
}
impl MaxValueTracker {
    /// Creates a tracker for `m` slots, every node initialised to `f64::INFINITY`.
    ///
    /// The storage holds `2 * m - 1` nodes : the `m` slots first, then the internal nodes.
    /// The node at index `i` has its parent at index `m + i / 2`; the last node is the root.
    ///
    /// # Panics
    /// Panics if `m == 0` (the node count `2 * m - 1` would underflow).
    pub fn new(m: usize) -> Self {
        assert!(m >= 1, "MaxValueTracker needs at least one slot");
        let last_index = (m << 1) - 2;
        let values = vec![f64::INFINITY; last_index + 1];
        MaxValueTracker { m, last_index, values }
    }

    /// Offers `value` to slot `k`.
    ///
    /// Nothing happens unless `value` is strictly smaller than the value currently stored
    /// in the slot. Otherwise the slot is overwritten and the change is propagated towards
    /// the root : each parent receives the larger of its two children, and the walk stops
    /// as soon as a parent would not decrease.
    ///
    /// # Panics
    /// Panics if `k >= m`.
    fn update(&mut self, k: usize, value: f64) {
        assert!(k < self.m);
        trace!("\n max value tracker update k, value , value at k {} {} {} ", k, value, self.values[k]);
        let mut current_value = value;
        let mut current_k = k;
        // the loop condition doubles as the "does this node decrease ?" test at every level
        while current_value < self.values[current_k] {
            trace!("mxvt update k value {} {}", current_k, current_value);
            self.values[current_k] = current_value;
            let pidx = self.m + current_k / 2;
            if pidx > self.last_index {
                // current_k was the root
                break;
            }
            let siblidx = current_k ^ 1;
            // a parent is never smaller than either of its children
            assert!(self.values[siblidx] <= self.values[pidx]);
            assert!(self.values[current_k] <= self.values[pidx]);
            if self.values[siblidx] >= self.values[pidx] && self.values[current_k] >= self.values[pidx] {
                break;
            }
            trace!("propagating current_value {} sibling {} ? ", current_value, self.values[siblidx]);
            // the candidate for the parent is the larger of the two children
            if current_value < self.values[siblidx] {
                trace!(" propagating sibling value {} to parent {}", self.values[siblidx], pidx);
                current_value = self.values[siblidx];
            } else {
                trace!(" propagating current_value {} to parent {}", current_value, pidx);
            }
            current_k = pidx;
        }
    }

    /// Returns the value stored in the root node, i.e. the maximum over all slots.
    pub fn get_max_value(&self) -> f64 {
        self.values[self.last_index]
    }

    /// Returns the index of the parent node of `slot`.
    ///
    /// # Panics
    /// Panics if `slot > m`.
    #[allow(dead_code)]
    pub fn get_parent_slot(&self, slot: usize) -> usize {
        assert!(slot <= self.m);
        self.m + slot / 2
    }

    /// Returns the value stored in node `slot` (a slot or an internal node).
    #[allow(dead_code)]
    pub fn get_value(&self, slot: usize) -> f64 {
        self.values[slot]
    }

    /// Prints every node index and value on stdout.
    #[allow(dead_code)]
    pub fn dump(&self) {
        println!("\n\nMaxValueTracker dump : ");
        for (i, value) in self.values.iter().enumerate() {
            println!(" i value {} {} ", i, value);
        }
    }
}
/// A collection that can report the weight associated with one of its objects.
///
/// `hash_wset` methods in this file require the collection to also be an iterator
/// over the same object type.
pub trait WeightedSet {
/// type of the objects stored in the set
type Object;
/// returns the weight of `obj`
fn get_weight(&self, obj:&Self::Object) -> f64;
}
/// Sketcher producing a signature of `m` objects of type `D` from weighted objects,
/// hashing objects with the hasher type `H`.
pub struct ProbMinHash3<D, H: Hasher+Default>
where D:Copy+Eq+Hash+Debug {
/// size of the signature
m : usize,
/// builds the hasher used to turn an object into the seed of its random generator
b_hasher: BuildHasherDefault<H>,
/// smallest hash value seen so far for each signature slot, and their maximum
maxvaluetracker : MaxValueTracker,
/// random distribution built with lambda = ln(m/(m-1))
exp01 : ExpRestricted01,
/// for each slot, the object that produced the smallest hash value
signature : Vec<D>,
}
impl<D, H> ProbMinHash3<D, H>
where
    D: Copy + Eq + Debug + Hash,
    H: Hasher + Default,
{
    /// Allocates a sketcher with a signature of `nbhash` slots, each initialised to `initobj`.
    ///
    /// # Panics
    /// Panics if `nbhash < 2`.
    pub fn new(nbhash: usize, initobj: D) -> Self {
        assert!(nbhash >= 2);
        let lambda = ((nbhash as f64) / ((nbhash - 1) as f64)).ln();
        ProbMinHash3 {
            m: nbhash,
            b_hasher: BuildHasherDefault::<H>::default(),
            maxvaluetracker: MaxValueTracker::new(nbhash),
            exp01: ExpRestricted01::new(lambda),
            signature: vec![initobj; nbhash],
        }
    }

    /// Incrementally adds the object `id` with weight `weight` to the sketch.
    ///
    /// The random generator is seeded with the hash of `id`, so a given object always
    /// replays the same random stream. An increasing sequence of hash values, scaled by
    /// `1 / weight`, is generated; each value is offered to a uniformly chosen slot and
    /// generation stops once the value reaches the current maximum over all slots.
    ///
    /// # Panics
    /// Panics unless `weight > 0`.
    pub fn hash_item(&mut self, id: D, weight: f64) {
        assert!(weight > 0.);
        trace!("hash_item : id {:?} weight {} ", id, weight);
        let winv = 1. / weight;
        let unif0m = Uniform::<usize>::new(0, self.m);
        let mut hasher = self.b_hasher.build_hasher();
        id.hash(&mut hasher);
        let id_hash: u64 = hasher.finish();
        let mut rng = Xoshiro256PlusPlus::seed_from_u64(id_hash);
        let mut h = winv * self.exp01.sample(&mut rng);
        let mut i = 1;
        let mut qmax = self.maxvaluetracker.get_max_value();
        while h < qmax {
            let k = unif0m.sample(&mut rng);
            assert!(k < self.m);
            if h < self.maxvaluetracker.values[k] {
                self.signature[k] = id;
                self.maxvaluetracker.update(k, h);
                qmax = self.maxvaluetracker.get_max_value();
            }
            // lower bound of the next hash value : no need to draw it if already too large
            h = winv * i as f64;
            i += 1;
            if h >= qmax {
                break;
            }
            h += winv * self.exp01.sample(&mut rng);
            trace!("hash_item : i h qmax = {} {} {} ", i, h, qmax);
        }
    }

    /// Returns the current signature, one object per slot.
    pub fn get_signature(&self) -> &Vec<D> {
        &self.signature
    }

    /// Hashes every object yielded by `data`, using the weight `data` reports for it.
    ///
    /// # Panics
    /// Panics if a reported weight is not strictly positive (see `hash_item`).
    pub fn hash_wset<T>(&mut self, data: &mut T)
    where
        T: WeightedSet<Object = D> + Iterator<Item = D>,
    {
        // `data` is needed again inside the loop body, which rules out a `for` loop
        while let Some(obj) = data.next() {
            let weight = data.get_weight(&obj);
            self.hash_item(obj, weight);
        }
    }

    /// Hashes every (object, weight) entry of `data`, in the iteration order of the map.
    ///
    /// # Panics
    /// Panics if a weight is not strictly positive (see `hash_item`).
    pub fn hash_weigthed_idxmap<Hidx>(&mut self, data: &mut IndexMap<D, f64, Hidx>)
    where
        Hidx: std::hash::BuildHasher,
    {
        // iterating over the entries avoids a second lookup of each key
        for (key, weight) in data.iter() {
            trace!(" retrieved key {:?} ", key);
            self.hash_item(*key, *weight);
        }
    }
}
/// Variant of the sketcher that processes a whole weighted map at once, keeping the objects
/// that still need further hash values in a buffer instead of finishing them one by one.
pub struct ProbMinHash3a<D,H>
where D:Copy+Eq+Hash+Debug,
H:Hasher+Default {
/// size of the signature
m : usize,
/// builds the hasher used to turn an object into the seed of its random generator
b_hasher : BuildHasherDefault::<H>,
/// smallest hash value seen so far for each signature slot, and their maximum
maxvaluetracker : MaxValueTracker,
/// random distribution built with lambda = ln(m/(m-1))
exp01 : ExpRestricted01,
/// buffer of (object, inverse weight, generator state) still to be processed
to_be_processed : Vec<(D, f64, Xoshiro256PlusPlus)>,
/// for each slot, the object that produced the smallest hash value
signature : Vec<D>,
}
impl <D,H> ProbMinHash3a<D,H>
where D:Copy+Eq+Debug+Hash, H : Hasher+Default {
/// Allocates a sketcher with a signature of `nbhash` slots, each initialised to `initobj`.
/// Panics if `nbhash < 2`.
pub fn new(nbhash:usize, initobj : D) -> Self {
assert!(nbhash >= 2);
let lambda = ((nbhash as f64)/((nbhash - 1) as f64)).ln();
let h_signature = (0..nbhash).map( |_| initobj).collect();
ProbMinHash3a{m:nbhash,
maxvaluetracker: MaxValueTracker::new(nbhash as usize),
b_hasher : BuildHasherDefault::<H>::default(),
exp01:ExpRestricted01::new(lambda),
to_be_processed : Vec::<(D, f64, Xoshiro256PlusPlus)>::new(),
signature:h_signature}
}
/// Hashes every (object, weight) entry of `data` in two phases.
///
/// First pass : each object gets a generator seeded by its hash and its first hash value;
/// objects that may still improve the signature are pushed in `to_be_processed`.
/// Following rounds : round `i` draws the i-th hash value of each buffered object and
/// keeps in the buffer only the objects whose next value can still be below the current maximum.
///
/// NOTE(review): unlike `hash_item` of `ProbMinHash3`, weights are not asserted to be > 0 here.
pub fn hash_weigthed_idxmap<Hidx>(&mut self, data: &IndexMap<D, f64, Hidx>)
where Hidx : std::hash::BuildHasher {
let mut objects = data.keys();
let unif0m = Uniform::<usize>::new(0, self.m);
let mut qmax:f64 = self.maxvaluetracker.get_max_value();
// first pass : one hash value per object
loop {
if let Some(key) = objects.next() {
if let Some(weight) = data.get(key) {
trace!("hash_item : id {:?} weight {} ", key, weight);
let winv = 1./weight;
// the generator is seeded with the hash of the key
let mut hasher = self.b_hasher.build_hasher();
key.hash(&mut hasher);
let new_hash : u64 = hasher.finish();
let mut rng = Xoshiro256PlusPlus::seed_from_u64(new_hash);
let h = winv * self.exp01.sample(&mut rng);
qmax = self.maxvaluetracker.get_max_value();
if h < qmax {
let k = unif0m.sample(&mut rng);
assert!(k < self.m);
if h < self.maxvaluetracker.values[k] {
self.signature[k] = *key;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
// winv is the value the second hash value starts from (see next phase) : buffer the object only if it is below the maximum
if winv < qmax {
self.to_be_processed.push((*key,winv, rng));
}
}
}
}
else {
break;
}
}
// following rounds : i is the rank of the hash value drawn for each buffered object
let mut i = 2;
while self.to_be_processed.len() > 0 {
// entries kept for the next round are compacted at the beginning of the buffer
let mut insert_pos = 0;
trace!(" i : {:?} , nb to process : {}", i , self.to_be_processed.len());
for j in 0..self.to_be_processed.len() {
let (key, winv, rng) = &mut self.to_be_processed[j];
// value the i-th hash value starts from; the entry is dropped if it is not below the maximum
let mut h = (*winv) * (i - 1) as f64;
if h < self.maxvaluetracker.get_max_value() {
h = h + (*winv) * self.exp01.sample(rng);
let k = unif0m.sample(rng);
if h < self.maxvaluetracker.values[k] {
self.signature[k] = *key;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
// keep the entry (with its advanced generator state) only if the next round can still be below the maximum
if (*winv) * (i as f64) < qmax {
self.to_be_processed[insert_pos] = (*key, *winv, rng.clone());
insert_pos = insert_pos + 1;
}
}
}
// drop everything that was not kept
self.to_be_processed.truncate(insert_pos);
i = i+1;
}
}
/// Returns the current signature, one object per slot.
pub fn get_signature(&self) -> &Vec<D> {
return &self.signature;
}
}
/// Generates, one element per call to `next`, a random arrangement of the integers `0..m`.
pub struct FYshuffle {
/// number of integers to arrange
m: usize,
/// uniform distribution on [0,1) used to pick the next element
unif_01 : Uniform<f64>,
/// current arrangement; positions below `lastidx` have already been returned by `next`
v : Vec<usize>,
/// position that the next call to `next` will fill
lastidx : usize,
}
impl FYshuffle {
    /// Creates a generator over the integers `0..m`.
    ///
    /// `lastidx` starts at `m`, so the first call to `next` begins with a reset.
    pub fn new(m: usize) -> FYshuffle {
        FYshuffle {
            m,
            unif_01: Uniform::<f64>::new(0., 1.),
            v: (0..m).collect(),
            lastidx: m,
        }
    }

    /// Returns the next element of the arrangement.
    ///
    /// An element is picked uniformly among the positions not yet returned, swapped into
    /// position `lastidx` and returned. When all `m` positions have been returned the
    /// generator is reset before picking.
    pub fn next(&mut self, rng: &mut Xoshiro256PlusPlus) -> usize {
        if self.lastidx >= self.m {
            self.reset();
        }
        let xsi = self.unif_01.sample(rng);
        let remaining = self.m - self.lastidx;
        let picked = self.lastidx + (xsi * remaining as f64) as usize;
        self.v.swap(picked, self.lastidx);
        let val = self.v[self.lastidx];
        self.lastidx += 1;
        val
    }

    /// Restores the identity arrangement and restarts from position 0.
    pub fn reset(&mut self) {
        trace!("resetting shuffle lastidx = {}", self.lastidx);
        self.lastidx = 0;
        for (i, slot) in self.v.iter_mut().enumerate() {
            *slot = i;
        }
    }

    /// Returns the arrangement in its current state.
    pub fn get_values(&self) -> &Vec<usize> {
        &self.v
    }
}
/// Sketcher producing a signature of `m` objects of type `D` from weighted objects,
/// visiting the slots in the order given by a random permutation generator.
pub struct ProbMinHash2<D,H>
where D:Copy+Eq+Hash+Debug,H:Hasher+Default {
/// size of the signature
m : usize,
/// builds the hasher used to turn an object into the seed of its random generator
b_hasher : BuildHasherDefault<H>,
/// smallest hash value seen so far for each signature slot, and their maximum
maxvaluetracker : MaxValueTracker,
/// yields the slots to visit for an object; reset at the beginning of each `hash_item`
permut_generator : FYshuffle,
/// betas[i] = m / (m - i - 1), factor applied to the i-th increment of the hash value
betas : Vec<f64>,
/// for each slot, the object that produced the smallest hash value
signature : Vec<D>,
}
impl<D, H> ProbMinHash2<D, H>
where
    D: Copy + Eq + Hash + Debug,
    H: Hasher + Default,
{
    /// Allocates a sketcher with a signature of `nbhash` slots, each initialised to `initobj`.
    pub fn new(nbhash: usize, initobj: D) -> Self {
        // betas[x] = m / (m - x - 1)
        let betas: Vec<f64> = (0..nbhash)
            .map(|x| (nbhash as f64) / (nbhash - x - 1) as f64)
            .collect();
        ProbMinHash2 {
            m: nbhash,
            b_hasher: BuildHasherDefault::<H>::default(),
            maxvaluetracker: MaxValueTracker::new(nbhash),
            permut_generator: FYshuffle::new(nbhash),
            betas,
            signature: vec![initobj; nbhash],
        }
    }

    /// Incrementally adds the object `id` with weight `weight` to the sketch.
    ///
    /// The random generator is seeded with the hash of `id`. An increasing sequence of hash
    /// values is generated; each one is offered to the next slot of a fresh random
    /// permutation, until the value reaches the current maximum over all slots.
    ///
    /// # Panics
    /// Panics unless `weight > 0`.
    pub fn hash_item(&mut self, id: D, weight: f64) {
        assert!(weight > 0.);
        trace!("hash_item : id {:?} weight {} ", id, weight);
        let winv: f64 = 1. / weight;
        let mut hasher = self.b_hasher.build_hasher();
        id.hash(&mut hasher);
        let mut rng = Xoshiro256PlusPlus::seed_from_u64(hasher.finish());
        self.permut_generator.reset();
        let first_draw: f64 = Exp1.sample(&mut rng);
        let mut h: f64 = winv * first_draw;
        let mut step = 0;
        let mut qmax = self.maxvaluetracker.get_max_value();
        while h < qmax {
            let k = self.permut_generator.next(&mut rng);
            if h < self.maxvaluetracker.values[k] {
                self.signature[k] = id;
                self.maxvaluetracker.update(k, h);
                qmax = self.maxvaluetracker.get_max_value();
                if h >= qmax {
                    break;
                }
            }
            let draw: f64 = Exp1.sample(&mut rng);
            h += winv * self.betas[step] * draw;
            step += 1;
            assert!(step < self.m);
        }
    }

    /// Hashes every object yielded by `data`, using the weight `data` reports for it.
    pub fn hash_wset<T>(&mut self, data: &mut T)
    where
        T: WeightedSet<Object = D> + Iterator<Item = D>,
    {
        // `data` is needed again inside the loop body, which rules out a `for` loop
        while let Some(obj) = data.next() {
            let weight = data.get_weight(&obj);
            self.hash_item(obj, weight);
        }
    }

    /// Returns the current signature, one object per slot.
    pub fn get_signature(&self) -> &Vec<D> {
        &self.signature
    }
}
/// Estimates the similarity of two sketches as the fraction of slots holding equal objects.
///
/// Returns the number of positions `i` with `siga[i] == sigb[i]` divided by the signature length.
///
/// # Panics
/// Panics if the two signatures do not have the same length.
pub fn compute_probminhash_jaccard<D: Eq>(siga: &Vec<D>, sigb: &Vec<D>) -> f64 {
    let sig_size = siga.len();
    assert_eq!(sig_size, sigb.len());
    let matching = siga
        .iter()
        .zip(sigb.iter())
        .filter(|(a, b)| a == b)
        .count();
    matching as f64 / sig_size as f64
}
#[cfg(test)]
mod tests {
use log::*;
use indexmap::{IndexMap};
use fnv::{FnvHasher,FnvBuildHasher};
type FnvIndexMap<K, V> = IndexMap<K, V, FnvBuildHasher>;
// Initialises env_logger in test mode. The result of `try_init` is deliberately ignored,
// which lets every test call this helper.
#[allow(dead_code)]
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
use super::*;
#[test]
// Compares the empirical mean of ExpRestricted01 samples with a reference mean,
// in units of the estimated standard deviation of the mean.
fn test_exp01() {
    log_init_test();
    let mut rng = Xoshiro256PlusPlus::seed_from_u64(234567_u64);
    let lambda = 0.5f64;
    // reference mean : (1 - exp(-lambda) - lambda * exp(-lambda)) / (lambda * (1 - exp(-lambda)))
    let exp_neg = (-lambda).exp();
    let expm1_neg = (-lambda).exp_m1();
    let mu_th = (-lambda * exp_neg - expm1_neg) / (-lambda * expm1_neg);
    //
    let nb_sampled: usize = 10_000_000;
    let exp01 = ExpRestricted01::new(lambda);
    let sampled: Vec<f64> = (0..nb_sampled).map(|_| exp01.sample(&mut rng)).collect();
    // empirical mean and variance
    let mean = sampled.iter().fold(0f64, |acc, x| acc + x) / nb_sampled as f64;
    let s2 = sampled.iter().fold(0f64, |acc, x| acc + (x - mean) * (x - mean))
        / (nb_sampled - 1) as f64;
    let sigma_mean = (s2 / nb_sampled as f64).sqrt();
    println!("mu_th {} mean {} sigma {} ", mu_th, mean, sigma_mean);
    let test = (mu_th - mean) / sigma_mean;
    println!("test {}", test);
    assert!(test.abs() < 3.);
}
#[test]
// Feeds random values to random slots and checks the tracker against per-slot minima
// maintained independently by the test.
fn test_max_value_tracker() {
    log_init_test();
    let mut rng = Xoshiro256PlusPlus::seed_from_u64(45678_u64);
    let nbhash = 10;
    let unif_01 = Uniform::<f64>::new(0., 1.);
    let unif_m = Uniform::<usize>::new(0, nbhash);
    let mut tracker = MaxValueTracker::new(nbhash);
    // reference : smallest value offered so far to each slot
    let mut slot_min = vec![f64::INFINITY; nbhash];
    let loop_size = 500;
    for _ in 0..loop_size {
        let k = unif_m.sample(&mut rng);
        assert!(k < nbhash);
        let xsi = unif_01.sample(&mut rng);
        slot_min[k] = slot_min[k].min(xsi);
        tracker.update(k, xsi);
        // a slot holds the minimum of the values it was offered
        assert_eq!(tracker.get_value(k), slot_min[k]);
        // the tracked maximum is the largest of the slot minima
        let expected_max = slot_min.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        assert_eq!(tracker.get_max_value(), expected_max);
    }
    // every parent node holds exactly the larger of its two children
    for i in 0..nbhash {
        let sibling_value = tracker.get_value(i ^ 1);
        let i_value = tracker.get_value(i);
        let pidx_value = tracker.get_value(tracker.get_parent_slot(i));
        assert_eq!(pidx_value, sibling_value.max(i_value));
    }
}
#[test]
// Two sets with constant weights : objects 0..70 in the first, 50..100 in the second.
fn test_probminhash3_count_intersection_equal_weights() {
    log_init_test();
    debug!("test_probminhash3_count_intersection_equal_weights");
    println!("test_probminhash3_count_intersection_equal_weights");
    let set_size: usize = 100;
    let nbhash = 50;
    let wa: Vec<f64> = (0..set_size).map(|i| if i < 70 { 20. } else { 0. }).collect();
    let wb: Vec<f64> = (0..set_size).map(|i| if i < 50 { 0. } else { 10. }).collect();
    // reference value computed from the weights
    let mut jp = 0.;
    for i in 0..set_size {
        if wa[i] > 0. && wb[i] > 0. {
            let mut den = 0.;
            for j in 0..set_size {
                den += (wa[j] / wa[i]).max(wb[j] / wb[i]);
            }
            jp += 1. / den;
        }
    }
    trace!("Jp = {} ",jp);
    trace!("\n\n hashing wa");
    let mut waprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wa.iter().enumerate() {
        if w > 0. {
            waprobhash.hash_item(i, w);
        }
    }
    trace!("\n\n hashing wb");
    let mut wbprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wb.iter().enumerate() {
        if w > 0. {
            wbprobhash.hash_item(i, w);
        }
    }
    let jp_approx =
        compute_probminhash_jaccard(waprobhash.get_signature(), wbprobhash.get_signature());
    info!("exact jp = {} ,jp estimated = {} ", jp, jp_approx);
    assert!(jp_approx > 0.);
}
#[test]
// Same layout as the vector based tests, but weights are stored in index maps and hashed
// in one call by ProbMinHash3a.
fn test_probminhash3a_count_intersection_unequal_weights() {
    log_init_test();
    println!("test_probminhash3a_count_intersection_unequal_weights");
    debug!("test_probminhash3a_count_intersection_unequal_weights");
    let set_size: usize = 100;
    let nbhash = 50;
    // first set : objects 0..70 with weight 2 * i
    let mut wa: FnvIndexMap<usize, f64> =
        FnvIndexMap::with_capacity_and_hasher(70, FnvBuildHasher::default());
    for i in (0..set_size).filter(|&i| i < 70) {
        *wa.entry(i).or_insert(0.) += 2. * i as f64;
    }
    // second set : objects 50..100 with weight i^4
    let mut wb: FnvIndexMap<usize, f64> =
        FnvIndexMap::with_capacity_and_hasher(70, FnvBuildHasher::default());
    for i in (0..set_size).filter(|&i| i >= 50) {
        wb.entry(i).or_insert((i as f64).powi(4));
    }
    trace!("\n\n hashing wa");
    let mut waprobhash = ProbMinHash3a::<usize, FnvHasher>::new(nbhash, 0);
    waprobhash.hash_weigthed_idxmap(&wa);
    trace!("\n\n hashing wb");
    let mut wbprobhash = ProbMinHash3a::<usize, FnvHasher>::new(nbhash, 0);
    wbprobhash.hash_weigthed_idxmap(&wb);
    let jp_approx =
        compute_probminhash_jaccard(waprobhash.get_signature(), wbprobhash.get_signature());
    // reference value computed from the weights, a missing object having weight 0
    let weight_of = |map: &FnvIndexMap<usize, f64>, idx: usize| -> f64 {
        *map.get(&idx).unwrap_or(&0.)
    };
    let mut jp = 0.;
    for i in 0..set_size {
        let wa_i = weight_of(&wa, i);
        let wb_i = weight_of(&wb, i);
        if wa_i > 0. && wb_i > 0. {
            let mut den = 0.;
            for j in 0..set_size {
                den += (weight_of(&wa, j) / wa_i).max(weight_of(&wb, j) / wb_i);
            }
            jp += 1. / den;
        }
    }
    trace!("Jp = {} ",jp);
    info!("jp exact= {jptheo:.3} , jp estimate = {jp_est:.3} ", jptheo=jp, jp_est=jp_approx);
    assert!(jp_approx > 0.);
}
#[test]
// Two sets with object dependent weights : 2 * i on 0..70 and i^4 on 50..100.
fn test_probminhash3_count_intersection_unequal_weights() {
    log_init_test();
    println!("test_probminhash3_count_intersection_unequal_weights");
    debug!("test_probminhash3_count_intersection_unequal_weights");
    let set_size: usize = 100;
    let nbhash = 50;
    let wa: Vec<f64> = (0..set_size)
        .map(|i| if i < 70 { 2. * i as f64 } else { 0. })
        .collect();
    let wb: Vec<f64> = (0..set_size)
        .map(|i| if i < 50 { 0. } else { (i as f64).powi(4) })
        .collect();
    trace!("\n\n hashing wa");
    let mut waprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wa.iter().enumerate() {
        if w > 0. {
            waprobhash.hash_item(i, w);
        }
    }
    trace!("\n\n hashing wb");
    let mut wbprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wb.iter().enumerate() {
        if w > 0. {
            wbprobhash.hash_item(i, w);
        }
    }
    let jp_approx =
        compute_probminhash_jaccard(waprobhash.get_signature(), wbprobhash.get_signature());
    // reference value computed from the weights
    let mut jp = 0.;
    for i in 0..set_size {
        if wa[i] > 0. && wb[i] > 0. {
            let mut den = 0.;
            for j in 0..set_size {
                den += (wa[j] / wa[i]).max(wb[j] / wb[i]);
            }
            jp += 1. / den;
        }
    }
    trace!("Jp = {} ",jp);
    info!("jp exact = {jp_exact:.3} , jp estimate {jp_estimate:.3} ", jp_exact=jp, jp_estimate=jp_approx);
    assert!(jp_approx > 0.);
}
#[test]
// Accumulates, position by position, the values of many complete permutations of 0..4 and
// checks each positional mean against 1.5 with variance 5/4.
fn test_fyshuffle() {
    log_init_test();
    let mut rng = Xoshiro256PlusPlus::seed_from_u64(45678_u64);
    let m = 4;
    let mut fypermut = FYshuffle::new(m);
    let nb_permut = 500000;
    let mut freq = vec![0usize; m];
    for _ in 0..nb_permut {
        // m calls produce one complete permutation
        for _ in 0..m {
            fypermut.next(&mut rng);
        }
        for (acc, value) in freq.iter_mut().zip(fypermut.get_values()) {
            *acc += *value;
        }
        fypermut.reset();
    }
    let th_freq = 1.5;
    let th_var = 5. / 4.;
    let sigma = (th_var / (nb_permut as f64)).sqrt();
    for (i, total) in freq.iter().enumerate() {
        let rel_error = ((*total as f64) / (nb_permut as f64) - th_freq) / sigma;
        info!(" slot i {} , rel error = {}", i, rel_error);
        assert!(rel_error.abs() < 3.)
    }
    info!(" freq = {:?}", freq);
    // print a few permutations
    for _ in 0..15 {
        for _ in 0..m {
            fypermut.next(&mut rng);
        }
        println!("permut state {:?} ", fypermut.get_values());
        fypermut.reset();
    }
}
#[test]
// Two sets with object dependent weights : 2 * i on 0..70 and i^4 on 50..100.
fn test_probminhash2_count_intersection_unequal_weights() {
    log_init_test();
    println!("test_probminhash2_count_intersection_unequal_weights");
    debug!("test_probminhash2_count_intersection_unequal_weights");
    let set_size: usize = 100;
    let nbhash = 50;
    let wa: Vec<f64> = (0..set_size)
        .map(|i| if i < 70 { 2. * i as f64 } else { 0. })
        .collect();
    let wb: Vec<f64> = (0..set_size)
        .map(|i| if i < 50 { 0. } else { (i as f64).powi(4) })
        .collect();
    // reference value computed from the weights
    let mut jp_exact = 0.;
    for i in 0..set_size {
        if wa[i] > 0. && wb[i] > 0. {
            let mut den = 0.;
            for j in 0..set_size {
                den += (wa[j] / wa[i]).max(wb[j] / wb[i]);
            }
            jp_exact += 1. / den;
        }
    }
    trace!("Jp = {} ",jp_exact);
    trace!("\n\n hashing wa");
    let mut waprobhash = ProbMinHash2::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wa.iter().enumerate() {
        if w > 0. {
            waprobhash.hash_item(i, w);
        }
    }
    trace!("\n\n hashing wb");
    let mut wbprobhash = ProbMinHash2::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wb.iter().enumerate() {
        if w > 0. {
            wbprobhash.hash_item(i, w);
        }
    }
    let jp_estimate =
        compute_probminhash_jaccard(waprobhash.get_signature(), wbprobhash.get_signature());
    info!("jp exact = {jp_exact:.3} , jp estimate {jp_estimate:.3} ", jp_exact=jp_exact, jp_estimate=jp_estimate);
    assert!(jp_estimate > 0.);
}
#[test]
// Two sets with unit weights : objects 0..70 in the first, 50..100 in the second.
fn test_probminhash2_count_intersection_equal_weights() {
    log_init_test();
    println!("test_probminhash2_count_intersection_equal_weights");
    debug!("test_probminhash2_count_intersection_equal_weights");
    let set_size: usize = 100;
    let nbhash = 50;
    let wa: Vec<f64> = (0..set_size).map(|i| if i < 70 { 1. } else { 0. }).collect();
    let wb: Vec<f64> = (0..set_size).map(|i| if i < 50 { 0. } else { 1. }).collect();
    // reference value computed from the weights
    let mut jp_exact = 0.;
    for i in 0..set_size {
        if wa[i] > 0. && wb[i] > 0. {
            let mut den = 0.;
            for j in 0..set_size {
                den += (wa[j] / wa[i]).max(wb[j] / wb[i]);
            }
            jp_exact += 1. / den;
        }
    }
    trace!("Jp = {} ",jp_exact);
    trace!("\n\n hashing wa");
    let mut waprobhash = ProbMinHash2::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wa.iter().enumerate() {
        if w > 0. {
            waprobhash.hash_item(i, w);
        }
    }
    trace!("\n\n hashing wb");
    let mut wbprobhash = ProbMinHash2::<usize, FnvHasher>::new(nbhash, 0);
    for (i, &w) in wb.iter().enumerate() {
        if w > 0. {
            wbprobhash.hash_item(i, w);
        }
    }
    let jp_estimate =
        compute_probminhash_jaccard(waprobhash.get_signature(), wbprobhash.get_signature());
    info!("jp exact = {jp_exact:.3} , jp estimate {jp_estimate:.3} ", jp_exact=jp_exact, jp_estimate=jp_estimate);
    assert!(jp_estimate > 0.);
}
}