Struct MinHash

Source
pub struct MinHash<T, U, B = SipHasherBuilder> { /* private fields */ }
Expand description

MinHash is a locality sensitive hashing scheme that can estimate the Jaccard Similarity measure between two sets s1 and s2. It uses multiple hash functions and for each hash function h, finds the minimum hash value obtained from the hashing an item in s1 using h and hashing an item in s2 using h. Our estimate for the Jaccard Similarity is the number of minimum hash values that are equal divided by the number of total hash functions used.

§Examples

use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

assert_eq!(
    min_hash.get_similarity(
        ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
        ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
    ),
    0.49,
);

Implementations§

Source§

impl<T, U> MinHash<T, U>
where T: Iterator<Item = U>,

Source

pub fn new(hasher_count: usize) -> Self

Constructs a new MinHash with a specified number of hash functions to use.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
Source§

impl<T, U, B> MinHash<T, U, B>
where T: Iterator<Item = U>, B: BuildHasher,

Source

pub fn with_hashers(hasher_count: usize, hash_builders: [B; 2]) -> Self

Constructs a new MinHash with a specified number of hash functions to use, and a hasher builder.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::<ShingleIterator<str>, _>::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);
Source

pub fn get_min_hashes(&self, iter: T) -> Vec<u64>
where U: Hash,

Returns the minimum hash values obtained from a specified iterator iter. This function is used in conjunction with get_similarity_from_hashes when doing multiple comparisons.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect());
let min_hashes1 = min_hash.get_min_hashes(shingles1);
let min_hashes2 = min_hash.get_min_hashes(shingles2);

assert_eq!(
    min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2),
    0.49,
);
Source

pub fn get_similarity_from_hashes( &self, min_hashes_1: &[u64], min_hashes_2: &[u64], ) -> f64

Returns the estimated Jaccard Similarity measure from the minimum hashes of two iterators. This function is used in conjunction with get_min_hashes when doing multiple comparisons.

§Panics

Panics if the length of the two hashes are not equal.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

let shingles1 = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
let shingles2 = ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect());
let min_hashes1 = min_hash.get_min_hashes(shingles1);
let min_hashes2 = min_hash.get_min_hashes(shingles2);

assert_eq!(
    min_hash.get_similarity_from_hashes(&min_hashes1, &min_hashes2),
    0.49,
);
Source

pub fn get_similarity(&self, iter_1: T, iter_2: T) -> f64
where U: Hash,

Returns the estimated Jaccard Similarity measure from two iterators iter_1 and iter_2.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::with_hashers(
    100,
    [SipHasherBuilder::from_seed(0, 0), SipHasherBuilder::from_seed(1, 1)],
);

assert_eq!(
    min_hash.get_similarity(
        ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
        ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
    ),
    0.49,
);
Source

pub fn hasher_count(&self) -> usize

Returns the number of hash functions being used in MinHash.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};

let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
assert_eq!(min_hash.hasher_count(), 100);
Source

pub fn hashers(&self) -> &[B; 2]

Returns a reference to the MinHash’s hasher builders.

§Examples
use probabilistic_collections::similarity::{MinHash, ShingleIterator};
use probabilistic_collections::SipHasherBuilder;

let min_hash = MinHash::<ShingleIterator<str>, _>::new(100);
let hashers = min_hash.hashers();

Auto Trait Implementations§

§

impl<T, U, B> Freeze for MinHash<T, U, B>
where B: Freeze,

§

impl<T, U, B> RefUnwindSafe for MinHash<T, U, B>

§

impl<T, U, B> Send for MinHash<T, U, B>
where B: Send, U: Send, T: Send,

§

impl<T, U, B> Sync for MinHash<T, U, B>
where B: Sync, U: Sync, T: Sync,

§

impl<T, U, B> Unpin for MinHash<T, U, B>
where B: Unpin, U: Unpin, T: Unpin,

§

impl<T, U, B> UnwindSafe for MinHash<T, U, B>
where B: UnwindSafe, U: UnwindSafe, T: UnwindSafe,

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V