1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
mod consts; pub mod raw; use cfg_if::cfg_if; use raw::fingerprint_scalar; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Fingerprint { pub fingerprint: [u32; 24], } impl Fingerprint { pub const MAX_SIMILARITY_SCORE: u32 = 24; /// Compares two fingerprints, and returns a similarity score in the range of 0 to /// `MAX_SIMILARITY_SCORE`, with 0 being "Not similar at all". pub fn similarity_score(self, other: Self) -> u32 { let mut score = 0; for (a, b) in self.fingerprint.iter().zip(other.fingerprint.iter()) { if a == b { score += 1; } } score } /// Compares two fingerprints, and returns a similarity score in the range of 0.0 to /// 1.0, with 0 being "not similar at all" and 1 being "extremely similar" pub fn similarity_score_ratio(self, other: Self) -> f32 { let score = self.similarity_score(other); score as f32 / Self::MAX_SIMILARITY_SCORE as f32 } } /// Perform GEAR based MinHash fingerprinting of the provided data, using the /// fastest implementation available on the current hardware. /// /// # GEAR Hash /// /// This method derives a fingerprint based on a GEAR hash, a rolling hash where /// each byte's hash is calculated as: /// /// ```text /// hash = (hash << 1) + table[byte] /// ``` /// /// Where `table` is a pre-calculated list of 256 random values, one for each /// possible value of a byte. The with of the rolling hash is determined by the /// bit width of the integer type used, in this case `u32`, corresponding to a /// 32-byte hash window. The GEAR hash function used can thus also be stated as /// such: /// /// ```text /// hash = (hash * 2) + table[byte] mod 2^32 /// ``` /// /// # Derived Hashes /// /// This method uses 24 different hashes derived from the internal GEAR hash, based /// on the following formula, where `X` is in the range of `0..24`, and `hash` is /// the original gear hash: /// /// ```text /// hash_X = (hash * N[X]) + M[X] /// ``` /// /// Here, `N` and `M` are separate, arbitrarily chosen tables of integers. /// /// The method keeps track of the minimum value encountered for each of the derived /// hashes as it consumes the bytes of the data, and those minimum values compose the /// fingerprint. pub fn fingerprint(data: &[u8]) -> Fingerprint { cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { use crate::raw::x86::*; if std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("avx2") { // AVX2 verified, call through to the AVX2 implementation unsafe {fingerprint_avx2(data)} } else if std::is_x86_feature_detected!("sse2") && std::is_x86_feature_detected!("sse4.1") { unsafe {fingerprint_sse41(data)} } else { // None of the optimized implementations are available on this hardware // Use scalar fallback fingerprint_scalar(data) } } else { // Fallback branch, there is no optimized implementation on this hardware yet // Use the scalar fallback implementation fingeprint_scalar(data) } } } #[cfg(test)] mod test_utils { use rand::prelude::*; use rand_chacha::*; pub fn make_data(seed: u64) -> Vec<u8> { let mut rand = ChaChaRng::seed_from_u64(seed); let size = rand.gen_range(0, 32000); let mut data = vec![0_u8; size]; rand.fill(&mut data[..]); data } }