hamming_bow/
lib.rs

1#![no_std]
2
3extern crate alloc;
4
5use alloc::{vec, vec::Vec};
6use bitarray::BitArray;
7#[cfg(feature = "serde")]
8use serde::{Deserialize, Serialize};
9
10/// This structure allows one to generate word presence hashes.
11///
12/// `B` is the number of bytes in the codewords.
13/// `H` is the number of bytes in the hash.
14#[derive(Clone, Debug)]
15#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
16pub struct HammingHasher<const B: usize, const H: usize> {
17    codewords: Vec<BitArray<B>>,
18}
19
20impl<const B: usize, const H: usize> HammingHasher<B, H> {
21    pub fn new() -> Self {
22        Self::new_with_seed(0)
23    }
24
25    pub fn new_with_seed(seed: u64) -> Self {
26        assert_ne!(H, 0);
27        let codewords = hamming_dict::generate_dict_seed(H * 8, seed);
28
29        Self { codewords }
30    }
31
32    /// There must be exactly `H * 8` codewords.
33    pub fn new_with_codewords(codewords: Vec<BitArray<B>>) -> Self {
34        assert_eq!(codewords.len(), H * 8);
35
36        Self { codewords }
37    }
38
39    /// Convert the input features into a hash.
40    pub fn hash_bag<'a>(&self, features: impl IntoIterator<Item = &'a BitArray<B>>) -> BitArray<H> {
41        let mut counts = vec![0usize; H * 8];
42        let mut distances = vec![0u32; H * 8];
43        for feature in features {
44            let mut lowest_distance = u32::MAX;
45            // Iterate through all the codewords.
46            for (distance, codeword) in distances.iter_mut().zip(self.codewords.iter()) {
47                // Compute the distance to the codeword and store it.
48                *distance = feature.distance(codeword);
49                // If this is lower than the lowest distance, update it.
50                if *distance < lowest_distance {
51                    lowest_distance = *distance;
52                }
53            }
54            // Go through all the distances.
55            for (ix, &distance) in distances.iter().enumerate() {
56                // If this distance is equal to the lowest distance, add 1 to that keyword.
57                if distance == lowest_distance {
58                    counts[ix] += 1;
59                }
60            }
61        }
62        // Find the largest count threshold that keeps the number of bits set above half (aside from 0).
63        let threshold = (2..)
64            .find(|&threshold| counts.iter().filter(|&&count| count >= threshold).count() < H * 4)
65            .unwrap()
66            - 1;
67        // Use the threshold to formulate the hash.
68        let mut hash = BitArray::zeros();
69        for ix in 0..H * 8 {
70            hash[ix >> 3] |= ((counts[ix] >= threshold) as u8) << (ix & 0b111);
71        }
72        hash
73    }
74}
75
76impl<const B: usize, const H: usize> Default for HammingHasher<B, H> {
77    fn default() -> Self {
78        Self::new()
79    }
80}