#[allow(unused_imports)]
use log::{debug, trace};
use std::fmt::Debug;
use num;
use rand::distr::{Distribution, Uniform};
use rand::prelude::*;
use rand_xoshiro::Xoshiro256PlusPlus;
use indexmap::IndexMap;
use std::collections::HashMap;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use crate::exp01::*;
use crate::maxvaluetrack::*;
use crate::weightedset::*;
pub struct ProbMinHash3<D, H: Hasher + Default>
where
D: Copy + Eq + Hash + Debug,
{
m: usize,
b_hasher: BuildHasherDefault<H>,
maxvaluetracker: MaxValueTracker<f64>,
exp01: ExpRestricted01,
signature: Vec<D>,
}
impl<D, H> ProbMinHash3<D, H>
where
D: Copy + Eq + Debug + Hash,
H: Hasher + Default,
{
pub fn new(nbhash: usize, initobj: D) -> Self {
assert!(nbhash >= 2);
let lambda = ((nbhash as f64) / ((nbhash - 1) as f64)).ln();
let h_signature = (0..nbhash).map(|_| initobj).collect();
ProbMinHash3 {
m: nbhash,
b_hasher: BuildHasherDefault::<H>::default(),
maxvaluetracker: MaxValueTracker::new(nbhash),
exp01: ExpRestricted01::new(lambda),
signature: h_signature,
}
}
pub fn hash_item<F>(&mut self, id: D, weight_a: &F)
where
F: num::ToPrimitive + std::fmt::Display,
{
let weight = weight_a.to_f64().unwrap();
assert!(weight > 0.);
trace!("hash_item : id {:?} weight {} ", id, weight);
let winv = 1. / weight;
let unif0m = Uniform::<usize>::new(0, self.m).unwrap();
let id_hash: u64 = self.b_hasher.hash_one(&id);
let mut rng = Xoshiro256PlusPlus::seed_from_u64(id_hash);
let mut h = winv * self.exp01.sample(&mut rng);
let mut i = 1;
let mut qmax = self.maxvaluetracker.get_max_value();
while h < qmax {
let k = unif0m.sample(&mut rng);
assert!(k < self.m);
if h < self.maxvaluetracker.get_value(k) {
self.signature[k] = id;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
h = winv * i as f64;
i += 1;
if h >= qmax {
break;
}
h += winv * self.exp01.sample(&mut rng);
trace!("hash_item : i h qmax = {} {} {} ", i, h, qmax);
}
}
pub fn get_signature(&self) -> &Vec<D> {
&self.signature
}
pub fn hash_wset<T>(&mut self, data: &mut T)
where
T: WeightedSet<Object = D> + Iterator<Item = D>,
{
while let Some(obj) = &data.next() {
let weight = data.get_weight(obj);
self.hash_item(*obj, &weight);
}
}
pub fn hash_weigthed_idxmap<Hidx, F>(&mut self, data: &IndexMap<D, F, Hidx>)
where
Hidx: std::hash::BuildHasher,
F: num::ToPrimitive + std::fmt::Display,
{
let iter = data.iter();
for (key, weigth) in iter {
trace!(" retrieved key {:?} ", key);
self.hash_item(*key, weigth);
}
}
pub fn hash_weigthed_hashmap<Hidx, F>(&mut self, data: &HashMap<D, F, Hidx>)
where
Hidx: std::hash::BuildHasher,
F: num::ToPrimitive + std::fmt::Display,
{
let iter = data.iter();
for (key, weight) in iter {
trace!(" retrieved key {:?} ", key);
self.hash_item(*key, weight);
}
} }
pub struct ProbMinHash3a<D, H>
where
D: Copy + Eq + Hash + Debug,
H: Hasher + Default,
{
m: usize,
b_hasher: BuildHasherDefault<H>,
maxvaluetracker: MaxValueTracker<f64>,
exp01: ExpRestricted01,
to_be_processed: Vec<(D, f64, Xoshiro256PlusPlus)>,
signature: Vec<D>,
}
impl<D, H> ProbMinHash3a<D, H>
where
D: Copy + Eq + Debug + Hash,
H: Hasher + Default,
{
pub fn new(nbhash: usize, initobj: D) -> Self {
assert!(nbhash >= 2);
let lambda = ((nbhash as f64) / ((nbhash - 1) as f64)).ln();
let h_signature = (0..nbhash).map(|_| initobj).collect();
ProbMinHash3a {
m: nbhash,
maxvaluetracker: MaxValueTracker::new(nbhash),
b_hasher: BuildHasherDefault::<H>::default(),
exp01: ExpRestricted01::new(lambda),
to_be_processed: Vec::<(D, f64, Xoshiro256PlusPlus)>::new(),
signature: h_signature,
}
}
pub fn hash_weigthed_idxmap<Hidx, F>(&mut self, data: &IndexMap<D, F, Hidx>)
where
Hidx: std::hash::BuildHasher,
F: num::ToPrimitive + std::fmt::Display,
{
let unif0m = Uniform::<usize>::new(0, self.m).unwrap();
let mut qmax: f64 = self.maxvaluetracker.get_max_value();
let iter = data.iter();
for (key, weight_t) in iter {
trace!("hash_item : id {:?} weight {} ", key, weight_t);
let weight = weight_t.to_f64().unwrap();
assert!(
weight.is_finite() && weight >= 0.,
"conversion to f64 failed"
);
let winv = 1. / weight;
let new_hash: u64 = self.b_hasher.hash_one(&key);
let mut rng = Xoshiro256PlusPlus::seed_from_u64(new_hash);
let h = winv * self.exp01.sample(&mut rng);
qmax = self.maxvaluetracker.get_max_value();
if h < qmax {
let k = unif0m.sample(&mut rng);
assert!(k < self.m);
if h < self.maxvaluetracker.get_value(k) {
self.signature[k] = *key;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
if winv < qmax {
self.to_be_processed.push((*key, winv, rng));
}
} } let mut i = 2; while !self.to_be_processed.is_empty() {
let mut insert_pos = 0;
trace!(
" i : {:?} , nb to process : {}",
i,
self.to_be_processed.len()
);
for j in 0..self.to_be_processed.len() {
let (key, winv, rng) = &mut self.to_be_processed[j];
let mut h = (*winv) * (i - 1) as f64;
if h < self.maxvaluetracker.get_max_value() {
h += (*winv) * self.exp01.sample(rng);
let k = unif0m.sample(rng);
if h < self.maxvaluetracker.get_value(k) {
self.signature[k] = *key;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
if (*winv) * (i as f64) < qmax {
self.to_be_processed[insert_pos] = (*key, *winv, rng.clone());
insert_pos += 1;
}
}
} self.to_be_processed.truncate(insert_pos);
i += 1;
} }
pub fn hash_weigthed_hashmap<Hidx, F>(&mut self, data: &HashMap<D, F, Hidx>)
where
Hidx: std::hash::BuildHasher,
F: num::ToPrimitive + std::fmt::Display,
{
let unif0m = Uniform::<usize>::new(0, self.m).unwrap();
let mut qmax: f64 = self.maxvaluetracker.get_max_value();
let iter = data.iter();
for (key, weight_t) in iter {
trace!("hash_item : id {:?} weight {} ", key, weight_t);
let weight = weight_t.to_f64().unwrap();
assert!(
weight.is_finite() && weight >= 0.,
"conversion to f64 failed"
);
let winv = 1. / weight;
let new_hash: u64 = self.b_hasher.hash_one(&key);
let mut rng = Xoshiro256PlusPlus::seed_from_u64(new_hash);
let h = winv * self.exp01.sample(&mut rng);
qmax = self.maxvaluetracker.get_max_value();
if h < qmax {
let k = unif0m.sample(&mut rng);
assert!(k < self.m);
if h < self.maxvaluetracker.get_value(k) {
self.signature[k] = *key;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
if winv < qmax {
self.to_be_processed.push((*key, winv, rng));
}
} } let mut i = 2; while !self.to_be_processed.is_empty() {
let mut insert_pos = 0;
trace!(
" i : {:?} , nb to process : {}",
i,
self.to_be_processed.len()
);
for j in 0..self.to_be_processed.len() {
let (key, winv, rng) = &mut self.to_be_processed[j];
let mut h = (*winv) * (i - 1) as f64;
if h < self.maxvaluetracker.get_max_value() {
h += (*winv) * self.exp01.sample(rng);
let k = unif0m.sample(rng);
if h < self.maxvaluetracker.get_value(k) {
self.signature[k] = *key;
self.maxvaluetracker.update(k, h);
qmax = self.maxvaluetracker.get_max_value();
}
if (*winv) * (i as f64) < qmax {
self.to_be_processed[insert_pos] = (*key, *winv, rng.clone());
insert_pos += 1;
}
}
} self.to_be_processed.truncate(insert_pos);
i += 1;
} }
pub fn get_signature(&self) -> &Vec<D> {
&self.signature
}
}
#[cfg(test)]
mod tests {
use log::*;
use fnv::{FnvBuildHasher, FnvHasher};
use indexmap::IndexMap;
type FnvIndexMap<K, V> = IndexMap<K, V, FnvBuildHasher>;
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
use crate::jaccard::*;
use super::*;
#[test]
fn test_probminhash3_count_intersection_equal_weights() {
log_init_test();
debug!("test_probminhash3_count_intersection_equal_weights");
println!("test_probminhash3_count_intersection_equal_weights");
let set_size = 100;
let nbhash = 100;
let mut wa = Vec::<f64>::with_capacity(set_size);
let mut wb = Vec::<f64>::with_capacity(set_size);
for i in 0..set_size {
if i < 70 {
wa.push(20.);
} else {
wa.push(0.);
}
}
for i in 0..set_size {
if i < 50 {
wb.push(0.);
} else {
wb.push(10.);
}
}
let mut jp = 0.;
for i in 0..set_size {
if wa[i] > 0. && wb[i] > 0. {
let mut den = 0.;
for j in 0..set_size {
den += (wa[j] / wa[i]).max(wb[j] / wb[i]);
}
jp += 1. / den;
}
}
trace!("Jp = {} ", jp);
trace!("\n\n hashing wa");
let mut waprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
for i in 0..set_size {
if wa[i] > 0. {
waprobhash.hash_item::<f64>(i, &wa[i]);
}
}
trace!("\n\n hashing wb");
let mut wbprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
for i in 0..set_size {
if wb[i] > 0. {
wbprobhash.hash_item(i, &wb[i]);
}
}
let siga = waprobhash.get_signature();
let sigb = wbprobhash.get_signature();
let jp_approx = compute_probminhash_jaccard(siga, sigb);
info!("exact jp = {:.3e} ,jp estimated = {:.3e} ", jp, jp_approx);
assert!(jp_approx > 0.);
}
#[test]
fn test_probminhash3a_count_intersection_unequal_weights() {
log_init_test();
println!("test_probminhash3a_count_intersection_unequal_weights");
debug!("test_probminhash3a_count_intersection_unequal_weights");
let set_size = 100;
let nbhash = 2000;
let mut wa: FnvIndexMap<usize, f64> =
FnvIndexMap::with_capacity_and_hasher(70, FnvBuildHasher::default());
for i in 0..set_size {
if i < 70 {
*wa.entry(i).or_insert(0.) += 2. * i as f64;
}
}
let mut wb: FnvIndexMap<usize, f64> =
FnvIndexMap::with_capacity_and_hasher(70, FnvBuildHasher::default());
for i in 0..set_size {
if i >= 50 {
wb.entry(i).or_insert((i as f64).powi(4)); }
}
trace!("\n\n hashing wa");
let mut waprobhash = ProbMinHash3a::<usize, FnvHasher>::new(nbhash, 0);
waprobhash.hash_weigthed_idxmap(&wa);
trace!("\n\n hashing wb");
let mut wbprobhash = ProbMinHash3a::<usize, FnvHasher>::new(nbhash, 0);
wbprobhash.hash_weigthed_idxmap(&wb);
let siga = waprobhash.get_signature();
let sigb = wbprobhash.get_signature();
let jp_approx = compute_probminhash_jaccard(siga, sigb);
let mut jp = 0.;
for i in 0..set_size {
let wa_i = *wa.get(&i).unwrap_or(&0.);
let wb_i = *wb.get(&i).unwrap_or(&0.);
if wa_i > 0. && wb_i > 0. {
let mut den = 0.;
for j in 0..set_size {
let wa_j = *wa.get(&j).unwrap_or(&0.);
let wb_j = *wb.get(&j).unwrap_or(&0.);
den += (wa_j / wa_i).max(wb_j / wb_i);
}
jp += 1. / den;
}
}
trace!("Jp = {} ", jp);
info!(
"jp exact= {jptheo:.3} , jp estimate = {jp_est:.3} ",
jptheo = jp,
jp_est = jp_approx
);
assert!(jp_approx > 0.);
}
#[test]
fn test_probminhash3_count_intersection_unequal_weights() {
log_init_test();
println!("test_probminhash3_count_intersection_unequal_weights");
debug!("test_probminhash3_count_intersection_unequal_weights");
let set_size = 100;
let nbhash = 2000;
let mut wa = Vec::<f64>::with_capacity(set_size);
let mut wb = Vec::<f64>::with_capacity(set_size);
for i in 0..set_size {
if i < 70 {
wa.push(2. * i as f64);
} else {
wa.push(0.);
}
}
for i in 0..set_size {
if i < 50 {
wb.push(0.);
} else {
wb.push((i as f64).powi(4));
}
}
trace!("\n\n hashing wa");
let mut waprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
for i in 0..set_size {
if wa[i] > 0. {
waprobhash.hash_item(i, &wa[i]);
}
}
trace!("\n\n hashing wb");
let mut wbprobhash = ProbMinHash3::<usize, FnvHasher>::new(nbhash, 0);
for i in 0..set_size {
if wb[i] > 0. {
wbprobhash.hash_item(i, &wb[i]);
}
}
let siga = waprobhash.get_signature();
let sigb = wbprobhash.get_signature();
let jp_approx = compute_probminhash_jaccard(siga, sigb);
let mut jp = 0.;
for i in 0..set_size {
if wa[i] > 0. && wb[i] > 0. {
let mut den = 0.;
for j in 0..set_size {
den += (wa[j] / wa[i]).max(wb[j] / wb[i]);
}
jp += 1. / den;
}
}
trace!("Jp = {} ", jp);
info!(
"jp exact = {jp_exact:.3} , jp estimate {jp_estimate:.3} ",
jp_exact = jp,
jp_estimate = jp_approx
);
assert!(jp_approx > 0.);
} }