1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
mod consts;
pub mod raw;

use cfg_if::cfg_if;
use raw::fingerprint_scalar;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Fingerprint {
    pub fingerprint: [u32; 24],
}

impl Fingerprint {
    pub const MAX_SIMILARITY_SCORE: u32 = 24;

    /// Compares two fingerprints, and returns a similarity score in the range of 0 to
    /// `MAX_SIMILARITY_SCORE`, with 0 being "Not similar at all".
    pub fn similarity_score(self, other: Self) -> u32 {
        let mut score = 0;
        for (a, b) in self.fingerprint.iter().zip(other.fingerprint.iter()) {
            if a == b {
                score += 1;
            }
        }
        score
    }

    /// Compares two fingerprints, and returns a similarity score in the range of 0.0 to
    /// 1.0, with 0 being "not similar at all" and 1 being "extremely similar"
    pub fn similarity_score_ratio(self, other: Self) -> f32 {
        let score = self.similarity_score(other);
        score as f32 / Self::MAX_SIMILARITY_SCORE as f32
    }
}

/// Perform GEAR based MinHash fingerprinting of the provided data, using the
/// fastest implementation available on the current hardware.
///
/// # GEAR Hash
///
/// This method derives a fingerprint based on a GEAR hash, a rolling hash where
/// each byte's hash is calculated as:
///
/// ```text
/// hash = (hash << 1) + table[byte]
/// ```
///
/// Where `table` is a pre-calculated list of 256 random values, one for each
/// possible value of a byte. The with of the rolling hash is determined by the
/// bit width of the integer type used, in this case `u32`, corresponding to a
/// 32-byte hash window. The GEAR hash function used can thus also be stated as
/// such:
///
/// ```text
/// hash = (hash * 2) + table[byte] mod 2^32
/// ```
///
/// # Derived Hashes
///
/// This method uses 24 different hashes derived from the internal GEAR hash, based
/// on the following formula, where `X` is in the range of `0..24`, and `hash` is
/// the original gear hash:
///
/// ```text
/// hash_X = (hash * N[X]) + M[X]
/// ```
///
/// Here, `N` and `M` are separate, arbitrarily chosen tables of integers.
///
/// The method keeps track of the minimum value encountered for each of the derived
/// hashes as it consumes the bytes of the data, and those minimum values compose the
/// fingerprint.
pub fn fingerprint(data: &[u8]) -> Fingerprint {
    cfg_if! {
        if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
            use crate::raw::x86::*;
            if std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("avx2") {
                // AVX2 verified, call through to the AVX2 implementation
                unsafe {fingerprint_avx2(data)}
            } else if std::is_x86_feature_detected!("sse2") && std::is_x86_feature_detected!("sse4.1") {
                unsafe {fingerprint_sse41(data)}
            } else {
                // None of the optimized implementations are available on this hardware
                // Use scalar fallback
                fingerprint_scalar(data)
            }
        } else {
            // Fallback branch, there is no optimized implementation on this hardware yet
            // Use the scalar fallback implementation
            fingeprint_scalar(data)
        }
    }
}

#[cfg(test)]
mod test_utils {
    use rand::prelude::*;
    use rand_chacha::*;
    pub fn make_data(seed: u64) -> Vec<u8> {
        let mut rand = ChaChaRng::seed_from_u64(seed);
        let size = rand.gen_range(0, 32000);
        let mut data = vec![0_u8; size];
        rand.fill(&mut data[..]);
        data
    }
}