1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#![no_std]
extern crate alloc;
use alloc::{vec, vec::Vec};
use bitarray::BitArray;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct HammingHasher<const B: usize, const H: usize> {
codewords: Vec<BitArray<B>>,
imbalance: f64,
}
impl<const B: usize, const H: usize> HammingHasher<B, H> {
pub fn new() -> Self {
Self::new_with_seed(0)
}
pub fn new_with_seed(seed: u64) -> Self {
assert_ne!(H, 0);
let codewords = hamming_dict::generate_dict_seed(H * 8, seed);
let imbalance = hamming_volume_imbalance(B * 8);
Self {
codewords,
imbalance,
}
}
const fn threshold() -> u32 {
(B * 4) as u32
}
pub fn hash(&self, feature: &BitArray<B>) -> BitArray<H> {
let mut hash = BitArray::zeros();
for (ix, word) in self.codewords.iter().enumerate() {
hash[ix >> 3] |= ((feature.distance(word) <= Self::threshold()) as u8) << (ix & 0b111);
}
hash
}
pub fn hash_bag<'a>(&self, features: impl IntoIterator<Item = &'a BitArray<B>>) -> BitArray<H> {
let mut counts = vec![0isize; H * 8];
let mut num_features = 0usize;
for feature in features {
for (ix, word) in self.codewords.iter().enumerate() {
counts[ix] +=
(((feature.distance(word) <= Self::threshold()) as isize) << 1).wrapping_sub(1);
}
num_features += 1;
}
let correction = (num_features as f64 * -self.imbalance + 0.5) as isize;
for count in &mut counts {
*count += correction;
}
let mut hash = BitArray::zeros();
for ix in 0..H * 8 {
hash[ix >> 3] |= (counts[ix].is_positive() as u8) << (ix & 0b111);
}
hash
}
}
impl<const B: usize, const H: usize> Default for HammingHasher<B, H> {
fn default() -> Self {
Self::new()
}
}
fn hamming_volume_imbalance(n: usize) -> f64 {
let half_space = core::iter::repeat(2.0f64)
.take(n - 1)
.fold(1.0, |a, b| a * b);
let mut volume = 0.0;
for radius in 0..=(n + 1) / 2 {
volume += n_choose_k(n, radius);
}
volume / half_space - 1.0
}
fn n_choose_k(n: usize, k: usize) -> f64 {
if k == 0 {
1.0
} else {
n_choose_k(n - 1, k - 1) * n as f64 / k as f64
}
}