Struct probabilistic_collections::similarity::MinHash[][src]

pub struct MinHash<T, U> { /* fields omitted */ }

MinHash is a locality-sensitive hashing scheme that can estimate the Jaccard Similarity measure between two sets s1 and s2. It uses multiple hash functions and, for each hash function h, finds the minimum hash value obtained from hashing the items in s1 using h and the minimum hash value obtained from hashing the items in s2 using h. Our estimate for the Jaccard Similarity is the number of hash functions for which these two minimum hash values are equal, divided by the total number of hash functions used.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::new(100);

assert_eq!(
    min_hash.get_similarity(
        ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
        ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
    ),
    0.42,
);

Methods

impl<T, U> MinHash<T, U>
[src]

Constructs a new MinHash with a specified number of hash functions to use.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::<ShingleIterator<&str>, &str>::new(100);

Returns the minimum hash values obtained from a specified iterator iter. This function is used in conjunction with get_similarity_from_hashes when doing multiple comparisons.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::new(100);

let shingles = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let min_hashes = min_hash.get_min_hashes(shingles);

Returns the estimated Jaccard Similarity measure from the minimum hashes of two iterators. This function is used in conjunction with get_min_hashes when doing multiple comparisons.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::new(100);

let shingles = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let min_hashes = min_hash.get_min_hashes(shingles);

Returns the estimated Jaccard Similarity measure from two iterators iter_1 and iter_2.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::new(100);

assert_eq!(
    min_hash.get_similarity(
        ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
        ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
    ),
    0.42,
);

Returns the number of hash functions being used in MinHash.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::<ShingleIterator<&str>, &str>::new(100);
assert_eq!(min_hash.hasher_count(), 100);

Auto Trait Implementations

impl<T, U> Send for MinHash<T, U> where
    T: Send,
    U: Send

impl<T, U> Sync for MinHash<T, U> where
    T: Sync,
    U: Sync