pub struct CosineSearcher { /* private fields */ }
Expand description
Searcher for all pairs of similar documents in the Cosine space.
§Approach
The search consists of the following steps:
- Extract features from documents, where a feature is a TF-IDF-weighted vector representation of character or word n-grams.
- Convert the features into binary sketches through the simplified simhash.
- Search for similar sketches in the Hamming space using ChunkedJoiner.
§Examples
use find_simdoc::tfidf::{Idf, Tf};
use find_simdoc::CosineSearcher;
let documents = vec![
"Welcome to Jimbocho, the town of books and curry!",
"Welcome to Jimbocho, the city of books and curry!",
"We welcome you to Jimbocho, the town of books and curry.",
"Welcome to the town of books and curry, Jimbocho!",
];
// Creates a searcher for word unigrams (with random seed value 42).
let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
// Creates a term frequency (TF) weighter.
let tf = Tf::new();
// Creates an inverse document frequency (IDF) weighter.
let idf = Idf::new()
.build(documents.iter().clone(), searcher.config())
.unwrap();
// Builds the database of binary sketches converted from input documents,
let searcher = searcher
// with the TF weighter and
.tf(Some(tf))
// the IDF weighter,
.idf(Some(idf))
// where binary sketches are in the Hamming space of 10*64 dimensions.
.build_sketches_in_parallel(documents.iter(), 10)
.unwrap();
// Searches all similar pairs within radius 0.25.
let results = searcher.search_similar_pairs(0.25);
// A result consists of the left-side id, the right-side id, and their distance.
assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
Implementations§
Source§impl CosineSearcher
impl CosineSearcher
Sourcepub fn new(
window_size: usize,
delimiter: Option<char>,
seed: Option<u64>,
) -> Result<Self>
pub fn new( window_size: usize, delimiter: Option<char>, seed: Option<u64>, ) -> Result<Self>
Creates an instance.
§Arguments
- window_size: Window size for w-shingling in feature extraction (must be more than 0).
- delimiter: Delimiter for recognizing words as tokens in feature extraction. If None, characters are used for tokens.
- seed: Seed value for random values.
Examples found in repository?
examples/find_cosine.rs (line 13)
4fn main() {
5 let documents = vec![
6 "Welcome to Jimbocho, the town of books and curry!",
7 "Welcome to Jimbocho, the city of books and curry!",
8 "We welcome you to Jimbocho, the town of books and curry.",
9 "Welcome to the town of books and curry, Jimbocho!",
10 ];
11
12 // Creates a searcher for word unigrams (with random seed value 42).
13 let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 // Creates a term frequency (TF) weighter.
15 let tf = Tf::new();
16 // Creates an inverse document frequency (IDF) weighter.
17 let idf = Idf::new()
18 .build(documents.iter().clone(), searcher.config())
19 .unwrap();
20 // Builds the database of binary sketches converted from input documents,
21 let searcher = searcher
22 // with the TF weighter and
23 .tf(Some(tf))
24 // the IDF weighter,
25 .idf(Some(idf))
26 // where binary sketches are in the Hamming space of 10*64 dimensions.
27 .build_sketches_in_parallel(documents.iter(), 10)
28 .unwrap();
29
30 // Searches all similar pairs within radius 0.25.
31 let results = searcher.search_similar_pairs(0.25);
32 // A result consists of the left-side id, the right-side id, and their distance.
33 assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}
Sourcepub const fn shows_progress(self, yes: bool) -> Self
pub const fn shows_progress(self, yes: bool) -> Self
Sets whether the progress is shown via the standard error output.
Sourcepub fn tf(self, tf: Option<Tf>) -> Self
pub fn tf(self, tf: Option<Tf>) -> Self
Sets the scheme of TF weighting.
Examples found in repository?
examples/find_cosine.rs (line 23)
4fn main() {
5 let documents = vec![
6 "Welcome to Jimbocho, the town of books and curry!",
7 "Welcome to Jimbocho, the city of books and curry!",
8 "We welcome you to Jimbocho, the town of books and curry.",
9 "Welcome to the town of books and curry, Jimbocho!",
10 ];
11
12 // Creates a searcher for word unigrams (with random seed value 42).
13 let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 // Creates a term frequency (TF) weighter.
15 let tf = Tf::new();
16 // Creates an inverse document frequency (IDF) weighter.
17 let idf = Idf::new()
18 .build(documents.iter().clone(), searcher.config())
19 .unwrap();
20 // Builds the database of binary sketches converted from input documents,
21 let searcher = searcher
22 // with the TF weighter and
23 .tf(Some(tf))
24 // the IDF weighter,
25 .idf(Some(idf))
26 // where binary sketches are in the Hamming space of 10*64 dimensions.
27 .build_sketches_in_parallel(documents.iter(), 10)
28 .unwrap();
29
30 // Searches all similar pairs within radius 0.25.
31 let results = searcher.search_similar_pairs(0.25);
32 // A result consists of the left-side id, the right-side id, and their distance.
33 assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}
Sourcepub fn idf(self, idf: Option<Idf<u64>>) -> Self
pub fn idf(self, idf: Option<Idf<u64>>) -> Self
Sets the scheme of IDF weighting.
Examples found in repository?
examples/find_cosine.rs (line 25)
4fn main() {
5 let documents = vec![
6 "Welcome to Jimbocho, the town of books and curry!",
7 "Welcome to Jimbocho, the city of books and curry!",
8 "We welcome you to Jimbocho, the town of books and curry.",
9 "Welcome to the town of books and curry, Jimbocho!",
10 ];
11
12 // Creates a searcher for word unigrams (with random seed value 42).
13 let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 // Creates a term frequency (TF) weighter.
15 let tf = Tf::new();
16 // Creates an inverse document frequency (IDF) weighter.
17 let idf = Idf::new()
18 .build(documents.iter().clone(), searcher.config())
19 .unwrap();
20 // Builds the database of binary sketches converted from input documents,
21 let searcher = searcher
22 // with the TF weighter and
23 .tf(Some(tf))
24 // the IDF weighter,
25 .idf(Some(idf))
26 // where binary sketches are in the Hamming space of 10*64 dimensions.
27 .build_sketches_in_parallel(documents.iter(), 10)
28 .unwrap();
29
30 // Searches all similar pairs within radius 0.25.
31 let results = searcher.search_similar_pairs(0.25);
32 // A result consists of the left-side id, the right-side id, and their distance.
33 assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}
Sourcepub fn build_sketches<I, D>(
self,
documents: I,
num_chunks: usize,
) -> Result<Self>
pub fn build_sketches<I, D>( self, documents: I, num_chunks: usize, ) -> Result<Self>
Builds the database of sketches from input documents.
§Arguments
- documents: List of documents (must not include an empty string).
- num_chunks: Number of chunks of sketches, indicating that the number of dimensions in the Hamming space is num_chunks*64.
Sourcepub fn build_sketches_in_parallel<I, D>(
self,
documents: I,
num_chunks: usize,
) -> Result<Self>
pub fn build_sketches_in_parallel<I, D>( self, documents: I, num_chunks: usize, ) -> Result<Self>
Builds the database of sketches from input documents in parallel.
§Arguments
- documents: List of documents (must not include an empty string).
- num_chunks: Number of chunks of sketches, indicating that the number of dimensions in the Hamming space is num_chunks*64.
§Notes
The progress is not printed even if shows_progress = true.
Examples found in repository?
examples/find_cosine.rs (line 27)
4fn main() {
5 let documents = vec![
6 "Welcome to Jimbocho, the town of books and curry!",
7 "Welcome to Jimbocho, the city of books and curry!",
8 "We welcome you to Jimbocho, the town of books and curry.",
9 "Welcome to the town of books and curry, Jimbocho!",
10 ];
11
12 // Creates a searcher for word unigrams (with random seed value 42).
13 let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 // Creates a term frequency (TF) weighter.
15 let tf = Tf::new();
16 // Creates an inverse document frequency (IDF) weighter.
17 let idf = Idf::new()
18 .build(documents.iter().clone(), searcher.config())
19 .unwrap();
20 // Builds the database of binary sketches converted from input documents,
21 let searcher = searcher
22 // with the TF weighter and
23 .tf(Some(tf))
24 // the IDF weighter,
25 .idf(Some(idf))
26 // where binary sketches are in the Hamming space of 10*64 dimensions.
27 .build_sketches_in_parallel(documents.iter(), 10)
28 .unwrap();
29
30 // Searches all similar pairs within radius 0.25.
31 let results = searcher.search_similar_pairs(0.25);
32 // A result consists of the left-side id, the right-side id, and their distance.
33 assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}
Sourcepub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)>
pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)>
Searches for all pairs of similar documents within an input radius, returning triplets of the left-side id, the right-side id, and their distance.
Examples found in repository?
examples/find_cosine.rs (line 31)
4fn main() {
5 let documents = vec![
6 "Welcome to Jimbocho, the town of books and curry!",
7 "Welcome to Jimbocho, the city of books and curry!",
8 "We welcome you to Jimbocho, the town of books and curry.",
9 "Welcome to the town of books and curry, Jimbocho!",
10 ];
11
12 // Creates a searcher for word unigrams (with random seed value 42).
13 let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 // Creates a term frequency (TF) weighter.
15 let tf = Tf::new();
16 // Creates an inverse document frequency (IDF) weighter.
17 let idf = Idf::new()
18 .build(documents.iter().clone(), searcher.config())
19 .unwrap();
20 // Builds the database of binary sketches converted from input documents,
21 let searcher = searcher
22 // with the TF weighter and
23 .tf(Some(tf))
24 // the IDF weighter,
25 .idf(Some(idf))
26 // where binary sketches are in the Hamming space of 10*64 dimensions.
27 .build_sketches_in_parallel(documents.iter(), 10)
28 .unwrap();
29
30 // Searches all similar pairs within radius 0.25.
31 let results = searcher.search_similar_pairs(0.25);
32 // A result consists of the left-side id, the right-side id, and their distance.
33 assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}
Sourcepub fn memory_in_bytes(&self) -> usize
pub fn memory_in_bytes(&self) -> usize
Gets the memory usage in bytes.
Sourcepub const fn config(&self) -> &FeatureConfig
pub const fn config(&self) -> &FeatureConfig
Gets the configuration of feature extraction.
Examples found in repository?
examples/find_cosine.rs (line 18)
4fn main() {
5 let documents = vec![
6 "Welcome to Jimbocho, the town of books and curry!",
7 "Welcome to Jimbocho, the city of books and curry!",
8 "We welcome you to Jimbocho, the town of books and curry.",
9 "Welcome to the town of books and curry, Jimbocho!",
10 ];
11
12 // Creates a searcher for word unigrams (with random seed value 42).
13 let searcher = CosineSearcher::new(1, Some(' '), Some(42)).unwrap();
14 // Creates a term frequency (TF) weighter.
15 let tf = Tf::new();
16 // Creates a inverse document frequency (IDF) weighter.
17 let idf = Idf::new()
18 .build(documents.iter().clone(), searcher.config())
19 .unwrap();
20 // Builds the database of binary sketches converted from input documents,
21 let searcher = searcher
22 // with the TF weighter and
23 .tf(Some(tf))
24 // the IDF weighter,
25 .idf(Some(idf))
26 // where binary sketches are in the Hamming space of 10*64 dimensions.
27 .build_sketches_in_parallel(documents.iter(), 10)
28 .unwrap();
29
30 // Searches all similar pairs within radius 0.25.
31 let results = searcher.search_similar_pairs(0.25);
32 // A result consists of the left-side id, the right-side id, and their distance.
33 assert_eq!(results, vec![(0, 1, 0.1671875), (0, 3, 0.246875)]);
34}
Auto Trait Implementations§
impl Freeze for CosineSearcher
impl RefUnwindSafe for CosineSearcher
impl Send for CosineSearcher
impl Sync for CosineSearcher
impl Unpin for CosineSearcher
impl UnwindSafe for CosineSearcher
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
T: ?Sized,
impl<T> BorrowMut<T> for T
where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more