pub struct MinHash<T, U, B = SipHasherBuilder> { /* private fields */ }
Expand description
`MinHash` is a locality-sensitive hashing scheme that can estimate the Jaccard Similarity
measure between two sets `s1` and `s2`. It uses multiple hash functions and, for each hash
function `h`, finds the minimum hash value obtained from hashing the items in `s1` using `h`
and the minimum hash value obtained from hashing the items in `s2` using `h`. Our estimate for
the Jaccard Similarity is the number of hash functions for which these two minimum hash values
are equal, divided by the total number of hash functions used.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;
let min_hash = MinHash::with_hashers(
100,
[SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);
assert_eq!(
min_hash.get_similarity(
ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
),
0.49,
);
Implementations§
Source§impl<T, U, B> MinHash<T, U, B>where
T: Iterator<Item = U>,
B: BuildHasher,
impl<T, U, B> MinHash<T, U, B>where
T: Iterator<Item = U>,
B: BuildHasher,
Sourcepub fn with_hashers(hasher_count: usize, hash_builders: [B; 2]) -> Self
pub fn with_hashers(hasher_count: usize, hash_builders: [B; 2]) -> Self
Constructs a new `MinHash` with a specified number of hash functions to use, and a pair of
hasher builders.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;
let min_hash = MinHash::<ShingleIterator<str>, _>::with_hashers(
100,
[SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);
Sourcepub fn get_min_hashes(&self, iter: T) -> Vec<u64>where
U: Hash,
pub fn get_min_hashes(&self, iter: T) -> Vec<u64>where
U: Hash,
Returns the minimum hash values obtained from a specified iterator `iter`. This function is
used in conjunction with `get_similarity_from_hashes` when doing multiple comparisons.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;
let min_hash = MinHash::with_hashers(
100,
[SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);
let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect());
let min_hashes1 = min_hash.get_min_hashes(shingles1);
let min_hashes2 = min_hash.get_min_hashes(shingles2);
assert_eq!(
min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2),
0.49,
);
Sourcepub fn get_similarity_from_hashes(
&self,
min_hashes_1: &[u64],
min_hashes_2: &[u64],
) -> f64
pub fn get_similarity_from_hashes( &self, min_hashes_1: &[u64], min_hashes_2: &[u64], ) -> f64
Returns the estimated Jaccard Similarity measure from the minimum hashes of two iterators.
This function is used in conjunction with `get_min_hashes` when doing multiple comparisons.
§Panics
Panics if the lengths of the two slices of minimum hashes are not equal.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;
let min_hash = MinHash::with_hashers(
100,
[SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);
let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect());
let min_hashes1 = min_hash.get_min_hashes(shingles1);
let min_hashes2 = min_hash.get_min_hashes(shingles2);
assert_eq!(
min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2),
0.49,
);
Sourcepub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64where
U: Hash,
pub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64where
U: Hash,
Returns the estimated Jaccard Similarity measure from two iterators `iter_1` and `iter_2`.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;
let min_hash = MinHash::with_hashers(
100,
[SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);
assert_eq!(
min_hash.get_similarity(
ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
),
0.49,
);
Sourcepub fn hasher_count(&self) -> usize
pub fn hasher_count(&self) -> usize
Returns the number of hash functions being used in `MinHash`.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
assert_eq!(min_hash.hasher_count(), 100);
Sourcepub fn hashers(&self) -> &[B; 2]
pub fn hashers(&self) -> &[B; 2]
Returns a reference to the `MinHash`'s hasher builders.
§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;
let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
let hashers = min_hash.hashers();