Struct probabilistic_collections::similarity::MinHash [−][src]
pub struct MinHash<T, U> { /* fields omitted */ }
MinHash is a locality-sensitive hashing scheme that can estimate the Jaccard Similarity measure between two sets s1 and s2. It uses multiple hash functions and, for each hash function h, finds the minimum hash value obtained from hashing the items in s1 using h and the minimum hash value obtained from hashing the items in s2 using h. Our estimate for the Jaccard Similarity is the number of hash functions for which these two minimum hash values are equal, divided by the total number of hash functions used.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::new(100); assert_eq!( min_hash.get_similarity( ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()), ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()), ), 0.42, );
Methods
impl<T, U> MinHash<T, U>
[src]
impl<T, U> MinHash<T, U>
pub fn new(hasher_count: usize) -> Self
[src]
pub fn new(hasher_count: usize) -> Self
Constructs a new MinHash with a specified number of hash functions to use.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::<ShingleIterator<&str>, &str>::new(100);
pub fn get_min_hashes(&self, iter: T) -> Vec<u64> where
T: Iterator<Item = U>,
U: Hash,
[src]
pub fn get_min_hashes(&self, iter: T) -> Vec<u64> where
T: Iterator<Item = U>,
U: Hash,
Returns the minimum hash values obtained from a specified iterator iter. This function is used in conjunction with get_similarity_from_hashes when doing multiple comparisons.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::new(100); let shingles = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()); let min_hashes = min_hash.get_min_hashes(shingles);
pub fn get_similarity_from_hashes(
&self,
min_hashes_1: Vec<u64>,
min_hashes_2: Vec<u64>
) -> f64
[src]
pub fn get_similarity_from_hashes(
&self,
min_hashes_1: Vec<u64>,
min_hashes_2: Vec<u64>
) -> f64
Returns the estimated Jaccard Similarity measure from the minimum hashes of two iterators. This function is used in conjunction with get_min_hashes when doing multiple comparisons.
Examples
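use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::new(100); let shingles_1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()); let shingles_2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()); let min_hashes_1 = min_hash.get_min_hashes(shingles_1); let min_hashes_2 = min_hash.get_min_hashes(shingles_2); let similarity = min_hash.get_similarity_from_hashes(min_hashes_1, min_hashes_2);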
pub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64 where
T: Iterator<Item = U>,
U: Hash,
[src]
pub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64 where
T: Iterator<Item = U>,
U: Hash,
Returns the estimated Jaccard Similarity measure from two iterators iter_1 and iter_2.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::new(100); assert_eq!( min_hash.get_similarity( ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()), ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()), ), 0.42, );
pub fn hasher_count(&self) -> usize
[src]
pub fn hasher_count(&self) -> usize
Returns the number of hash functions being used in MinHash.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::<ShingleIterator<&str>, &str>::new(100); assert_eq!(min_hash.hasher_count(), 100);