[][src]Struct probabilistic_collections::similarity::MinHash

pub struct MinHash<T, U, B = SipHasherBuilder> { /* fields omitted */ }

MinHash is a locality-sensitive hashing scheme that can estimate the Jaccard Similarity measure between two sets s1 and s2. It uses multiple hash functions and, for each hash function h, finds the minimum hash value obtained from hashing the items in s1 using h and the minimum hash value obtained from hashing the items in s2 using h. Our estimate for the Jaccard Similarity is the number of hash functions for which these two minimum hash values are equal, divided by the total number of hash functions used.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

assert_eq!(
    min_hash.get_similarity(
        ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
        ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
    ),
    0.49,
);

Implementations

impl<T, U> MinHash<T, U> where
    T: Iterator<Item = U>, 
[src]

pub fn new(hasher_count: usize) -> Self[src]

Constructs a new MinHash with a specified number of hash functions to use.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);

impl<T, U, B> MinHash<T, U, B> where
    T: Iterator<Item = U>,
    B: BuildHasher
[src]

pub fn with_hashers(hasher_count: usize, hash_builders: [B; 2]) -> Self[src]

Constructs a new MinHash with a specified number of hash functions to use, and the two specified hasher builders.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::<ShingleIterator<str>, _>::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

pub fn get_min_hashes(&self, iter: T) -> Vec<u64> where
    U: Hash
[src]

Returns the minimum hash values obtained from a specified iterator iter. This function is used in conjunction with get_similarity_from_hashes when doing multiple comparisons.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect());
let min_hashes1 = min_hash.get_min_hashes(shingles1);
let min_hashes2 = min_hash.get_min_hashes(shingles2);

assert_eq!(
    min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2),
    0.49,
);

pub fn get_similarity_from_hashes(
    &self,
    min_hashes_1: &[u64],
    min_hashes_2: &[u64]
) -> f64
[src]

Returns the estimated Jaccard Similarity measure from the minimum hashes of two iterators. This function is used in conjunction with get_min_hashes when doing multiple comparisons.

Panics

Panics if the lengths of the two slices of minimum hashes are not equal.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect());
let min_hashes1 = min_hash.get_min_hashes(shingles1);
let min_hashes2 = min_hash.get_min_hashes(shingles2);

assert_eq!(
    min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2),
    0.49,
);

pub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64 where
    U: Hash
[src]

Returns the estimated Jaccard Similarity measure from two iterators iter_1 and iter_2.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

assert_eq!(
    min_hash.get_similarity(
        ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
        ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
    ),
    0.49,
);

pub fn hasher_count(&self) -> usize[src]

Returns the number of hash functions being used in MinHash.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
assert_eq!(min_hash.hasher_count(), 100);

pub fn hashers(&self) -> &[B; 2][src]

Returns a reference to the MinHash's hasher builders.

Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
let hashers = min_hash.hashers();

Auto Trait Implementations

impl<T, U, B> RefUnwindSafe for MinHash<T, U, B> where
    B: RefUnwindSafe,
    T: RefUnwindSafe,
    U: RefUnwindSafe

impl<T, U, B> Send for MinHash<T, U, B> where
    B: Send,
    T: Send,
    U: Send

impl<T, U, B> Sync for MinHash<T, U, B> where
    B: Sync,
    T: Sync,
    U: Sync

impl<T, U, B> Unpin for MinHash<T, U, B> where
    B: Unpin,
    T: Unpin,
    U: Unpin

impl<T, U, B> UnwindSafe for MinHash<T, U, B> where
    B: UnwindSafe,
    T: UnwindSafe,
    U: UnwindSafe

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

impl<V, T> VZip<V> for T where
    V: MultiLane<T>,