use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use rand::prelude::*;
use rand_distr::Exp1;
use rand_xoshiro::Xoshiro256PlusPlus;
use wyhash::WyHash;
use crate::fyshuffle::FYshuffle;
use crate::maxvaluetrack::{MaxValue, MaxValueTracker};
struct OrdMinHashStore<V>
where
V: MaxValue + Copy + PartialOrd + Debug,
{
m: usize,
l: usize,
indices: Vec<u64>,
values: Vec<V>,
hashbuffer: Vec<u64>,
seed_rng: ThreadRng,
wyhash_seed: u64,
}
impl<V> OrdMinHashStore<V>
where
V: MaxValue + Copy + PartialOrd + Debug,
{
pub fn new(m: usize, l: usize) -> Self {
let ml = m * l;
let indices = vec![u64::MAX; ml];
let values = (0..ml).map(|_| V::get_max()).collect::<Vec<V>>();
let hashbuffer = vec![0_u64; l];
let rng = ThreadRng::default();
let wyhash_seed = 0xcf7355744a6e8145;
assert!(l < 16);
OrdMinHashStore {
m,
l,
indices,
values,
hashbuffer,
seed_rng: rng,
wyhash_seed,
}
}
#[allow(unused)]
pub(crate) fn change_wyhash_seed(&mut self) {
self.wyhash_seed = self.seed_rng.next_u64();
log::trace!(
"OrdMinHashStore setting wyhash_seed to : {:?}",
self.wyhash_seed
);
}
pub(crate) fn update_with_maxtracker(
&mut self,
permuted_idx: usize,
value: &V,
data_idx: usize,
maxtracker: &mut MaxValueTracker<V>,
) -> bool {
assert!(permuted_idx < self.m);
let first_idx: usize = permuted_idx * self.l;
let last_idx: usize = first_idx + self.l - 1;
log::trace!("OrdMinHashStore::update permut idx : {}", permuted_idx);
log::trace!("indices : {:?}", self.indices);
log::trace!("values : {:?}", self.values);
if *value < self.values[last_idx] {
let mut array_idx = last_idx;
while array_idx > first_idx && *value < self.values[array_idx - 1] {
self.values[array_idx] = self.values[array_idx - 1];
self.indices[array_idx] = self.indices[array_idx - 1];
array_idx -= 1;
}
self.values[array_idx] = *value;
self.indices[array_idx] = data_idx as u64;
maxtracker.update(permuted_idx, self.values[last_idx]);
log::trace!("exiting update indices : {:?}", self.indices);
true
} else {
false
}
}
pub(crate) fn reset(&mut self) {
self.values.fill(V::get_max());
self.indices.fill(u64::MAX);
}
pub(crate) fn get_l(&self) -> usize {
self.l
}
pub(crate) fn create_signature<D: Hash, H: Hasher + Default>(
&mut self,
data: &[D],
) -> Vec<u64> {
assert!(data.len() >= self.l, "data length must be greater than l");
log::debug!("indices : {:?}", self.indices);
let mut result = Vec::<u64>::with_capacity(self.m);
let b_hasher = BuildHasherDefault::<H>::default();
for i in 0..self.m {
let mut nb_bad_indices = 0;
let mut combine_hasher = WyHash::with_seed(self.wyhash_seed);
let start = i * self.l;
let end = start + self.l;
self.indices[start..end].sort_unstable();
for j in 0..self.l {
let data_idx = usize::try_from(self.indices[start + j]).unwrap();
if data_idx < data.len() {
self.hashbuffer[j] = b_hasher.hash_one(&data[data_idx]);
combine_hasher.write_u64(self.hashbuffer[j]);
} else {
nb_bad_indices += 1;
}
}
result.push(combine_hasher.finish());
if nb_bad_indices > 0 {
log::error!(
"OrdMinHashStore::create_signature slot i : {} nb_bad_indices : {}",
i,
nb_bad_indices
);
assert_eq!(nb_bad_indices, 0);
}
}
log::debug!("create_signature : {:?}", result);
result
} }
pub struct ProbOrdMinHash2<H> {
m: usize,
b_hasher: BuildHasherDefault<H>,
max_tracker: MaxValueTracker<f64>,
min_store: OrdMinHashStore<f64>,
g: Vec<f64>,
permut_generator: FYshuffle,
counter: HashMap<u64, u64>,
seed_rng: ThreadRng,
seed: u64,
}
impl<H> ProbOrdMinHash2<H>
where
H: Hasher + Default,
{
pub fn new(m_s: u32, l: usize) -> Self {
let m = m_s as usize;
let max_tracker = MaxValueTracker::new(m);
let minstore = OrdMinHashStore::<f64>::new(m, l);
let mut g = (0..(m - 1)).map(|_| 0.).collect::<Vec<f64>>();
for i in 1..m {
g[i - 1] = m as f64 / (m - i) as f64;
}
let counter = HashMap::<u64, u64>::new();
let mut rng = ThreadRng::default();
let seed = rng.next_u64();
ProbOrdMinHash2 {
m,
b_hasher: BuildHasherDefault::<H>::default(),
max_tracker,
min_store: minstore,
g,
permut_generator: FYshuffle::new(m),
counter,
seed_rng: rng.clone(),
seed,
}
}
pub fn hash_set<D: Eq + Hash>(&mut self, data: &[D]) -> Vec<u64> {
let size = data.len();
if size < self.min_store.get_l() {
log::error!(
"data length must be greater than {:}",
self.min_store.get_l()
);
std::panic!(
"data length must be greater than {:}",
self.min_store.get_l()
);
}
self.counter.clear();
self.min_store.reset();
self.max_tracker.reset();
let mut x: f64;
for (i, val) in data.iter().enumerate() {
self.permut_generator.reset();
let id_hash: u64 = self.b_hasher.hash_one(val);
let newcount = match self.counter.get_mut(&id_hash) {
Some(count) => {
*count += 1;
*count
}
_ => {
self.counter.insert(id_hash, 1);
1
}
};
let mut seed_256 = [0u8; 32];
seed_256[0..8].copy_from_slice(&id_hash.to_ne_bytes());
seed_256[8..16].copy_from_slice(&newcount.to_ne_bytes());
seed_256[16..24].copy_from_slice(&self.seed.to_ne_bytes());
let mut rng = Xoshiro256PlusPlus::from_seed(seed_256);
x = Exp1.sample(&mut rng);
let mut nb_inserted = 0;
while x < self.max_tracker.get_max_value() {
let k = self.permut_generator.next(&mut rng);
assert!(k < self.m);
let inserted =
self.min_store
.update_with_maxtracker(k, &x, i, &mut self.max_tracker);
if !inserted {
break;
}
if !self.max_tracker.is_update_possible(x) {
break;
}
if nb_inserted + 1 >= self.m {
break;
}
let y: f64 = Exp1.sample(&mut rng);
x += y * self.g[nb_inserted];
nb_inserted += 1;
}
}
self.min_store.create_signature::<D, H>(data)
}
pub fn change_rng_seed(&mut self) {
self.min_store.change_wyhash_seed();
self.seed = self.seed_rng.next_u64();
}
}
#[cfg(test)]
mod tests {
use super::*;
use fnv::FnvHasher;
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
fn gen_01seq(k: usize, n: usize) -> (Vec<u32>, Vec<u32>) {
assert!(k < n);
let vec1 = (0..n)
.map(|i| if i < n - k { 0 } else { 1 })
.collect::<Vec<u32>>();
let vec2 = (0..n)
.map(|i| if i < k { 0 } else { 1 })
.collect::<Vec<u32>>();
(vec1, vec2)
}
fn test_vectors(m: u32, l: usize, v1: &[u32], v2: &[u32], nb_iter: usize) {
let mut pordminhash = ProbOrdMinHash2::<WyHash>::new(m, l);
let mut equals = (0..m + 1).map(|_| 0).collect::<Vec<usize>>();
for _ in 0..nb_iter {
let hash1 = pordminhash.hash_set(&v1);
let hash2 = pordminhash.hash_set(&v2);
let mut nb_equal: u32 = 0;
for i in 0..m as usize {
if hash1[i] == hash2[i] {
nb_equal = nb_equal + 1;
}
}
equals[nb_equal as usize] += 1;
pordminhash.change_rng_seed();
}
log::info!(" equalities at m slots : {:?}", equals);
}
fn get_pattern_1() -> (Vec<u32>, Vec<u32>) {
let v1: Vec<u32> = vec![0, 0, 1, 2];
let v2: Vec<u32> = vec![0, 1, 1, 2];
return (v1, v2);
}
fn get_pattern_2() -> (Vec<u32>, Vec<u32>) {
let v1: Vec<u32> = vec![0, 1, 2, 3, 4, 0, 1, 2, 3, 2, 4, 5];
let v2: Vec<u32> = vec![0, 1, 2, 6, 4, 0, 7, 1, 2, 3, 2, 4, 5];
return (v1, v2);
}
fn get_pattern_3() -> (Vec<u32>, Vec<u32>) {
let v1: Vec<u32> = vec![
0, 1, 2, 3, 4, 0, 1, 2, 3, 2, 4, 5, 0, 1, 2, 3, 4, 0, 1, 2, 6, 2, 4, 5,
];
let v2: Vec<u32> = vec![0, 1, 2, 6, 4, 0, 7, 1, 2, 3, 2, 4, 5];
return (v1, v2);
}
#[test]
fn test_ordminhash_01seq() {
log_init_test();
log::info!("in test_ordminhash_01seq");
let (v1, v2) = gen_01seq(3, 100);
test_vectors(1024, 1, &v1, &v2, 1000);
test_vectors(1024, 3, &v1, &v2, 1000);
test_vectors(1024, 5, &v1, &v2, 1000);
}
#[test]
fn ordminhash2_random_data() {
log_init_test();
log::info!("in test_ordminhash2_random_data");
let mut rng = rand::rng();
let data_size = 50000;
let data = (0..data_size).map(|_| rng.next_u64()).collect::<Vec<u64>>();
let m_s: u32 = 500;
let l = 3;
let mut pordminhash = ProbOrdMinHash2::<FnvHasher>::new(m_s, l);
let _hash = pordminhash.hash_set(&data);
}
#[test]
fn ordminhash2_equality() {
log_init_test();
log::info!("in test_ordminhash2_equality");
let m_s: u32 = 5;
let l = 2;
let nb_iter = 10;
let v = [0, 0, 1, 2, 2, 3, 4];
test_vectors(m_s, l, &v, &v, nb_iter);
}
#[test]
fn test_ordminhash2_p1() {
log_init_test();
log::info!("in test_ordminhash2_1");
let m_s: u32 = 1024;
let l = 3;
let nb_iter = 100000;
let pattern = get_pattern_1();
test_vectors(m_s, l, &pattern.0, &pattern.1, nb_iter);
}
#[test]
fn test_ordminhash2_p2() {
log_init_test();
log::info!("in test_ordminhash2_p2");
let pattern = get_pattern_2();
let nb_iter = 50000;
log::info!("\n m : 32, l : 3");
test_vectors(32, 3, &pattern.0, &pattern.1, nb_iter);
log::info!("\n m : 32, l : 5");
test_vectors(32, 5, &pattern.0, &pattern.1, nb_iter);
log::info!("\n m : 1024, l : 3");
test_vectors(1024, 3, &pattern.0, &pattern.1, nb_iter);
log::info!("\n m : 1024, l : 5");
test_vectors(1024, 5, &pattern.0, &pattern.1, nb_iter);
}
#[test]
fn test_ordminhash2_p3() {
log_init_test();
log::info!("in test_ordminhash2_p3");
let pattern = get_pattern_3();
let nb_iter = 10000;
log::info!("\n m : 32, l : 3");
test_vectors(32, 3, &pattern.0, &pattern.1, nb_iter);
log::info!("\n m : 32, l : 5");
test_vectors(32, 5, &pattern.0, &pattern.1, nb_iter);
log::info!("\n m : 1024, l : 3");
test_vectors(1024, 3, &pattern.0, &pattern.1, nb_iter);
log::info!("\n m : 1024, l : 5");
test_vectors(1024, 5, &pattern.0, &pattern.1, nb_iter);
}
#[test]
fn test_ordminhash2_p5() {
log_init_test();
log::info!("in test_ordminhash2_dist_5");
let nbiter = 100000;
let size: usize = 25;
let shift = 5;
let v1 = (0..size).map(|i| i as u32).collect::<Vec<u32>>();
let v2 = (0..size).map(|i| (i + shift) as u32).collect::<Vec<u32>>();
log::info!("shift : {}", shift);
test_vectors(4, 2, &v1, &v2, nbiter);
let shift = 9;
log::info!("shift : {}", shift);
let v1 = (0..size).map(|i| i as u32).collect::<Vec<u32>>();
let v2 = (0..size).map(|i| (i + shift) as u32).collect::<Vec<u32>>();
test_vectors(4, 2, &v1, &v2, nbiter);
} }