[−][src]Struct probabilistic_collections::similarity::MinHash
MinHash
is a locality sensitive hashing scheme that can estimate the Jaccard Similarity
measure between two sets s1
and s2
. It uses multiple hash functions and for each hash
function h
, finds the minimum hash value obtained from the hashing an item in s1
using h
and hashing an item in s2
using h
. Our estimate for the Jaccard Similarity is the number of
minimum hash values that are equal divided by the number of total hash functions used.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; use probabilistic_collections::SipHasherBuilder; let min_hash = MinHash::with_hashers( 100, [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)], ); assert_eq!( min_hash.get_similarity( ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()), ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()), ), 0.49, );
Implementations
impl<T, U> MinHash<T, U> where
T: Iterator<Item = U>,
[src]
T: Iterator<Item = U>,
pub fn new(hasher_count: usize) -> Self
[src]
Constructs a new MinHash
with a specified number of hash functions to use.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
impl<T, U, B> MinHash<T, U, B> where
T: Iterator<Item = U>,
B: BuildHasher,
[src]
T: Iterator<Item = U>,
B: BuildHasher,
pub fn with_hashers(hasher_count: usize, hash_builders: [B; 2]) -> Self
[src]
Constructs a new MinHash
with a specified number of hash functions to use, and a hasher
builder.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; use probabilistic_collections::SipHasherBuilder; let min_hash = MinHash::<ShingleIterator<str>, _>::with_hashers( 100, [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)], );
pub fn get_min_hashes(&self, iter: T) -> Vec<u64> where
U: Hash,
[src]
U: Hash,
Returns the minimum hash values obtained from a specified iterator iter
. This function is
used in conjunction with get_similarity_from_hashes
when doing multiple comparisons.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; use probabilistic_collections::SipHasherBuilder; let min_hash = MinHash::with_hashers( 100, [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)], ); let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()); let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()); let min_hashes1 = min_hash.get_min_hashes(shingles1); let min_hashes2 = min_hash.get_min_hashes(shingles2); assert_eq!( min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2), 0.49, );
pub fn get_similarity_from_hashes(
&self,
min_hashes_1: &[u64],
min_hashes_2: &[u64]
) -> f64
[src]
&self,
min_hashes_1: &[u64],
min_hashes_2: &[u64]
) -> f64
Returns the estimated Jaccard Similarity measure from the minimum hashes of two iterators.
This function is used in conjunction with get_min_hashes
when doing multiple comparisons.
Panics
Panics if the length of the two hashes are not equal.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; use probabilistic_collections::SipHasherBuilder; let min_hash = MinHash::with_hashers( 100, [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)], ); let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()); let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()); let min_hashes1 = min_hash.get_min_hashes(shingles1); let min_hashes2 = min_hash.get_min_hashes(shingles2); assert_eq!( min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2), 0.49, );
pub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64 where
U: Hash,
[src]
U: Hash,
Returns the estimated Jaccard Similarity measure from two iterators iter_1
and
iter_2
.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; use probabilistic_collections::SipHasherBuilder; let min_hash = MinHash::with_hashers( 100, [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)], ); assert_eq!( min_hash.get_similarity( ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()), ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()), ), 0.49, );
pub fn hasher_count(&self) -> usize
[src]
Returns the number of hash functions being used in MinHash
.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; let min_hash = MinHash::<ShingleIterator<str>, _>::new(100); assert_eq!(min_hash.hasher_count(), 100);
pub fn hashers(&self) -> &[B; 2]
[src]
Returns a reference to the MinHash
's hasher builders.
Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator}; use probabilistic_collections::SipHasherBuilder; let min_hash = MinHash::<ShingleIterator<str>, _>::new(100); let hashers = min_hash.hashers();
Auto Trait Implementations
impl<T, U, B> RefUnwindSafe for MinHash<T, U, B> where
B: RefUnwindSafe,
T: RefUnwindSafe,
U: RefUnwindSafe,
B: RefUnwindSafe,
T: RefUnwindSafe,
U: RefUnwindSafe,
impl<T, U, B> Send for MinHash<T, U, B> where
B: Send,
T: Send,
U: Send,
B: Send,
T: Send,
U: Send,
impl<T, U, B> Sync for MinHash<T, U, B> where
B: Sync,
T: Sync,
U: Sync,
B: Sync,
T: Sync,
U: Sync,
impl<T, U, B> Unpin for MinHash<T, U, B> where
B: Unpin,
T: Unpin,
U: Unpin,
B: Unpin,
T: Unpin,
U: Unpin,
impl<T, U, B> UnwindSafe for MinHash<T, U, B> where
B: UnwindSafe,
T: UnwindSafe,
U: UnwindSafe,
B: UnwindSafe,
T: UnwindSafe,
U: UnwindSafe,
Blanket Implementations
impl<T> Any for T where
T: 'static + ?Sized,
[src]
T: 'static + ?Sized,
impl<T> Borrow<T> for T where
T: ?Sized,
[src]
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
[src]
T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
[src]
impl<T> From<T> for T
[src]
impl<T, U> Into<U> for T where
U: From<T>,
[src]
U: From<T>,
impl<T, U> TryFrom<U> for T where
U: Into<T>,
[src]
U: Into<T>,
type Error = Infallible
The type returned in the event of a conversion error.
fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
[src]
impl<T, U> TryInto<U> for T where
U: TryFrom<T>,
[src]
U: TryFrom<T>,
type Error = <U as TryFrom<T>>::Error
The type returned in the event of a conversion error.
fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>
[src]
impl<V, T> VZip<V> for T where
V: MultiLane<T>,
V: MultiLane<T>,