1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
//! Hash-related utils.
use std::any::TypeId;
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hash, Hasher};
use std::marker::PhantomData;

/// Builder for iterators for multiple hash function results for the same given objects.
///
/// In other words: for a given object `x`, HashIterBuilder creates an iterator that emits
/// `h_i(x) in [0, m)` for `i in [0, k)` with `m` and `k` being configurable.
///
/// # Examples
/// ```
/// use pdatastructs::hash_utils::HashIterBuilder;
/// use std::collections::hash_map::DefaultHasher;
/// use std::hash::BuildHasherDefault;
///
/// // set up builder
/// let bh = BuildHasherDefault::<DefaultHasher>::default();
/// let max_result = 42;
/// let number_functions = 2;
/// let builder = HashIterBuilder::new(max_result, number_functions, bh);
///
/// // create iterator for a given object
/// let obj = "my string";
/// let iter = builder.iter_for(&obj);
/// for h_i in iter {
///     println!("{}", h_i);
/// }
/// ```
///
/// # Applications
/// - mostly used as helper for BloomFilter and CountMinSketch
///
/// # How It Works
/// Instead of hashing `x` `i` times with different hash functions, only 2 hash functions `h_1` and
/// `h_2` are used. Furthermore, a function `f: [0, k) -> [0, m)` is generated in a pseudo-random,
/// but derministic manner. Then, the result of `i in [0, k)` is:
///
/// ```text
/// (h_1(x) + i * h_2(x) + f(i)) mod m
/// ```
///
/// This is also called "enhanced double hashing".
///
/// # See Also
/// - `std::collections::hash_map::DefaultHasher`: a solid choice to produce some hash results,
///   but slower for the special case needed for BloomFilter etc.
///
/// # References
/// - ["Less Hashing, Same Performance: Building a Better Bloom Filter", Adam Kirsch, Michael
///   Mitzenmacher, 2008](https://www.eecs.harvard.edu/%7Emichaelm/postscripts/rsa2008.pdf)
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HashIterBuilder<B>
where
    B: BuildHasher,
{
    m: usize,
    k: usize,
    buildhasher: B,
    f: Vec<u64>,
}

impl<B> HashIterBuilder<B>
where
    B: BuildHasher,
{
    /// Create new `HashIterBuilder` with the following parameters:
    ///
    /// - `m`: the maximal result value of `h_i(x)`
    /// - `k`: number of hash functions to generate
    /// - `buildhasher`: `BuildHasher` used to construct new `Hash` objects, must be stable (i.e. create the same `Hash`
    ///   object on every call)
    pub fn new(m: usize, k: usize, buildhasher: B) -> Self {
        let f = Self::setup_f(m, k, &buildhasher);
        Self {
            m,
            k,
            buildhasher,
            f,
        }
    }

    /// Domain aka maximal result of `h_i(x)`.
    pub fn m(&self) -> usize {
        self.m
    }

    /// Number of hash functions.
    pub fn k(&self) -> usize {
        self.k
    }

    /// `BuildHasher` that is used to hash objects and to create `f`.
    pub fn buildhasher(&self) -> &B {
        &self.buildhasher
    }

    /// Shift-function to enhance the hashing scheme.
    pub fn f(&self, i: usize) -> u64 {
        self.f[i]
    }

    /// Create an iterator for `k` hash functions for the given object.
    ///
    /// This call is CPU-expensive and may only be used if the resulting iterator will really be
    /// used.
    pub fn iter_for<T>(&self, obj: &T) -> HashIter<B>
    where
        T: Hash,
    {
        let h1 = self.h_i(&obj, 0) % (self.m as u64);
        let h2 = self.h_i(&obj, 1) % (self.m as u64);
        HashIter::new(&self, h1, h2)
    }

    /// Set up `f`.
    fn setup_f(m: usize, k: usize, buildhasher: &B) -> Vec<u64> {
        (0..k)
            .map(|i| {
                let mut hasher = buildhasher.build_hasher();
                hasher.write_usize(i + 2); // skip 2 for h1 und h2
                hasher.finish() % (m as u64)
            })
            .collect()
    }

    /// Helper to calculate h_0 and h_1.
    fn h_i<T>(&self, obj: &T, i: usize) -> u64
    where
        T: Hash,
    {
        let mut hasher = self.buildhasher.build_hasher();
        hasher.write_usize(i);
        obj.hash(&mut hasher);
        hasher.finish()
    }
}

/// Iterate over `h_i(x) in [0, k)` for `i in [0, m)`.
#[derive(Debug)]
pub struct HashIter<'a, B>
where
    B: 'a + BuildHasher,
{
    builder: &'a HashIterBuilder<B>,
    h1: u64,
    h2: u64,
    i: usize,
}

impl<'a, B> HashIter<'a, B>
where
    B: 'a + BuildHasher,
{
    /// Create new `HashIter` with the following parameters:
    ///
    /// - `builder`: builder
    /// - `h1`: first hash for double hashing
    /// - `h2`: second hash for double hashing
    fn new(builder: &'a HashIterBuilder<B>, h1: u64, h2: u64) -> Self {
        Self {
            builder,
            h1,
            h2,
            i: 0,
        }
    }
}

impl<'a, B> Iterator for HashIter<'a, B>
where
    B: 'a + BuildHasher,
{
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        if self.i < self.builder.k() {
            let m = self.builder.m() as u64;
            let i = (self.i as u64) % m;
            let f = self.builder.f(self.i);
            let x = (self.h1 + (i * self.h2) + f) % m;

            self.i += 1;

            Some(x as usize)
        } else {
            None
        }
    }
}

/// BuildHasher that takes a seed.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BuildHasherSeeded {
    seed: usize,
}

impl BuildHasherSeeded {
    /// Create new BuildHasherSeeded with given seed.
    pub fn new(seed: usize) -> Self {
        Self { seed }
    }
}

impl BuildHasher for BuildHasherSeeded {
    type Hasher = DefaultHasher;

    fn build_hasher(&self) -> DefaultHasher {
        let mut h = DefaultHasher::default();
        h.write_usize(self.seed);
        h
    }
}

/// Hash placeholder that can be used to make certain data structures accept multiple data types.
///
/// # Example
/// ```
/// use pdatastructs::filters::Filter;
/// use pdatastructs::filters::bloomfilter::BloomFilter;
/// use pdatastructs::hash_utils::AnyHash;
///
/// // set up filter
/// let false_positive_rate = 0.02;  // = 2%
/// let expected_elements = 1000;
/// let mut filter = BloomFilter::<AnyHash>::with_properties(expected_elements, false_positive_rate);
///
/// // add different types
/// filter.insert(&AnyHash::new("1"));
/// filter.insert(&AnyHash::new(&1u64));
///
/// // query
/// assert!(filter.query(&AnyHash::new("1")));
/// assert!(filter.query(&AnyHash::new(&1u64)));
/// assert!(!filter.query(&AnyHash::new(&1u32)));
/// ```
///
/// # How It Works
/// During construction, a 64bit hash of the object as well as the `TypeID` of the object type will
/// be preserved. Both will later be fed into the target hasher of the used data structure.
#[derive(Debug)]
pub struct AnyHash<H = DefaultHasher>
where
    H: Hasher + Default,
{
    hash_obj: u64,
    type_id: TypeId,
    phantom: PhantomData<H>,
}

impl<H> AnyHash<H>
where
    H: Hasher + Default,
{
    /// Create new hash placeholder for given object.
    pub fn new<T>(obj: &T) -> Self
    where
        T: Hash + 'static + ?Sized,
    {
        let mut hasher_obj = H::default();
        obj.hash(&mut hasher_obj);
        let hash_obj = hasher_obj.finish();

        Self {
            hash_obj,
            type_id: TypeId::of::<T>(),
            phantom: PhantomData,
        }
    }
}

impl Hash for AnyHash {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.type_id.hash(state);
        self.hash_obj.hash(state);
    }
}

#[cfg(test)]
mod tests {
    use super::{BuildHasherSeeded, HashIterBuilder};
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};

    #[test]
    fn hash_iter_builder_getter() {
        let bh1 = BuildHasherDefault::<DefaultHasher>::default();
        let builder = HashIterBuilder::new(42, 2, bh1);

        assert_eq!(builder.m(), 42);
        assert_eq!(builder.k(), 2);

        let bh2 = BuildHasherDefault::<DefaultHasher>::default();
        assert_eq!(builder.buildhasher(), &bh2);
    }

    #[test]
    fn hash_iter_builder_f() {
        let bh1 = BuildHasherDefault::<DefaultHasher>::default();
        let bh2 = BuildHasherDefault::<DefaultHasher>::default();
        let builder1 = HashIterBuilder::new(42, 2, bh1);
        let builder2 = HashIterBuilder::new(42, 2, bh2);

        assert_eq!(builder1.f(0), builder2.f(0));
        assert_eq!(builder1.f(1), builder2.f(1));
    }

    #[test]
    fn hash_iter_builder_eq() {
        let bh1 = BuildHasherSeeded::new(0);
        let bh2 = BuildHasherSeeded::new(0);
        let bh3 = BuildHasherSeeded::new(0);
        let bh4 = BuildHasherSeeded::new(1);
        let builder1 = HashIterBuilder::new(42, 2, bh1);
        let builder2 = HashIterBuilder::new(42, 2, bh2);
        let builder3 = HashIterBuilder::new(42, 3, bh3);
        let builder4 = HashIterBuilder::new(42, 2, bh4);

        assert_eq!(builder1, builder2);
        assert_ne!(builder1, builder3);
        assert_ne!(builder1, builder4);
    }

    #[test]
    fn hash_iter() {
        let bh = BuildHasherDefault::<DefaultHasher>::default();
        let obj = 1337;

        let builder = HashIterBuilder::new(42, 2, bh);

        let iter1 = builder.iter_for(&obj);
        let v1: Vec<usize> = iter1.collect();
        assert_eq!(v1.len(), 2);
        assert!(v1[0] < 42);
        assert!(v1[1] < 42);
        assert_ne!(v1[0], v1[1]);

        let iter2 = builder.iter_for(&obj);
        let v2: Vec<usize> = iter2.collect();
        assert_eq!(v1, v2);
    }

    #[test]
    fn build_hasher_seeded() {
        let bh1 = BuildHasherSeeded::new(0);
        let bh2 = BuildHasherSeeded::new(0);
        let bh3 = BuildHasherSeeded::new(1);

        let mut hasher1 = bh1.build_hasher();
        let mut hasher2 = bh2.build_hasher();
        let mut hasher3 = bh3.build_hasher();

        let obj = "foo bar";
        obj.hash(&mut hasher1);
        obj.hash(&mut hasher2);
        obj.hash(&mut hasher3);

        let result1 = hasher1.finish();
        let result2 = hasher2.finish();
        let result3 = hasher3.finish();

        assert_eq!(result1, result2);
        assert_ne!(result1, result3);
    }
}