// find_cosine/find_cosine.rs

use find_simdoc::tfidf::{Idf, Tf};
use find_simdoc::CosineSearcher;

4fn main() {
5    let documents = vec![
6        "Welcome to Jimbocho, the town of books and curry!",
7        "Welcome to Jimbocho, the city of books and curry!",
8        "We welcome you to Jimbocho, the town of books and curry.",
9        "Welcome to the town of books and curry, Jimbocho!",
10    ];
11
12    // Creates a searcher for word unigrams (with random seed value 42).
13    let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14    // Creates a term frequency (TF) weighter.
15    let tf = Tf::new();
16    // Creates a inverse document frequency (IDF) weighter.
17    let idf = Idf::new()
18        .build(documents.iter().clone(), searcher.config())
19        .unwrap();
20    // Builds the database of binary sketches converted from input documents,
21    let searcher = searcher
22        // with the TF weighter and
23        .tf(Some(tf))
24        // the IDF weighter,
25        .idf(Some(idf))
26        // where binary sketches are in the Hamming space of 10*64 dimensions.
27        .build_sketches_in_parallel(documents.iter(), 10)
28        .unwrap();
29
30    // Searches all similar pairs within radius 0.25.
31    let results = searcher.search_similar_pairs(0.25);
32    // A result consists of the left-side id, the right-side id, and their distance.
33    assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}