gaoya/
lib.rs

1/*!
2This library implements probabilistic Locality Sensitive Hashing algorithms
3for indexing and searching text documents.
4* [MinHash](https://en.wikipedia.org/wiki/MinHash)
5* [SimHash](https://en.wikipedia.org/wiki/SimHash)
6
7The main use cases for gaoya are clustering and deduplication.
8
9
10## Example
11
12 ```
13 use gaoya::minhash::{MinHashIndex, MinHasher32, MinHasher} ;
14 use gaoya::text::whitespace_split;
15 use std::collections::HashSet;
16 let corpus = [
17     "This is the first document.",
18     "This document is the second document.",
19     "And this is the third document.",
20     "Is this the first document?",
21     "This not the first nor the second nor the third, but the fourth document"];
22 let (num_bands, band_width) = (42, 3);
23 let minhasher = MinHasher32::new(num_bands * band_width);
24 let mut index = MinHashIndex::new(num_bands, band_width, 0.5);
25 for (i, doc) in corpus.iter().enumerate() {
26     index.insert(i, minhasher.create_signature(whitespace_split(&doc.to_lowercase())));
27 }
28 for (i, doc) in corpus.iter().enumerate() {
29     if i < 4 {
30         let mut expected = HashSet::default();
31         expected.extend(vec![0, 1, 2, 3].into_iter());
32         assert_eq!(index.query_owned(&minhasher.create_signature(whitespace_split(&doc.to_lowercase()))), expected);
33     } else {
34         let mut expected = HashSet::default();
35         expected.insert(4);
36         assert_eq!(index.query_owned(&minhasher.create_signature(whitespace_split(&doc.to_lowercase()))), expected);
37     }
38 }
39
40 ```
41
42## References
43[[1] Chapter 3, Mining of Massive Datasets](http://www.mmds.org)
44
45[[2] Similarity Estimation Techniques from Rounding Algorithms](https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf)
46
47[[3] Detecting Near-Duplicates for Web Crawling](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33026.pdf)
48
49 */
// Crate-wide configuration: inner attributes (`#![...]`) apply to the whole crate.
// Suppress warnings about items that are never used.
50#![allow(dead_code)]
// Suppress the whole `unused` lint group.
// NOTE(review): the `unused` group already contains `dead_code`, so the
// attribute above looks redundant — confirm before removing either one.
51#![allow(unused)]
// When the `unstable` Cargo feature is enabled, switch on the `hash_raw_entry`
// feature gate. Feature gates are only accepted by a nightly compiler.
52#![cfg_attr(feature = "unstable", feature(hash_raw_entry))]
53//#![feature(get_mut_unchecked)]
54
// Public modules that make up the crate's API.
// `minhash` provides `MinHashIndex`, `MinHasher32` and `MinHasher`, as used in
// the crate-level example.
55pub mod minhash;
// Presumably the SimHash counterpart mentioned in the crate docs — verify
// against the module itself.
56pub mod simhash;
// `text` provides `whitespace_split`, as used in the crate-level example.
57pub mod text;
// Presumably clustering built on top of the indexes — verify against the
// module itself.
58pub mod clustering;