find_simdoc/jaccard.rs
1//! Searcher for all pairs of similar documents in the Jaccard space.
2use std::sync::Mutex;
3
4use crate::errors::{FindSimdocError, Result};
5use crate::feature::{FeatureConfig, FeatureExtractor};
6use crate::lsh::minhash::MinHasher;
7
8use all_pairs_hamming::chunked_join::ChunkedJoiner;
9use rand::{RngCore, SeedableRng};
10use rayon::prelude::*;
11
12/// Searcher for all pairs of similar documents in the Jaccard space.
13///
14/// # Approach
15///
16/// The search steps consist of
17///
18/// 1. Extracts features from documents,
19/// where a feature is a set representation of character or word ngrams.
20/// 2. Convert the features into binary sketches through the [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759).
21/// 3. Search for similar sketches in the Hamming space using [`ChunkedJoiner`].
22///
23/// # Examples
24///
25/// ```
26/// use find_simdoc::JaccardSearcher;
27///
28/// let documents = vec![
29/// "Welcome to Jimbocho, the town of books and curry!",
30/// "Welcome to Jimbocho, the city of books and curry!",
31/// "We welcome you to Jimbocho, the town of books and curry.",
32/// "Welcome to the town of books and curry, Jimbocho!",
33/// ];
34///
35/// // Creates a searcher for character trigrams (with random seed value 42).
36/// let searcher = JaccardSearcher::new(3, None, Some(42))
37/// .unwrap()
38/// // Builds the database of binary sketches converted from input documents,
39/// // where binary sketches are in the Hamming space of 20*64 dimensions.
40/// .build_sketches_in_parallel(documents.iter(), 20)
41/// .unwrap();
42///
43/// // Searches all similar pairs within radius 0.25.
44/// let results = searcher.search_similar_pairs(0.25);
45/// assert_eq!(results, vec![(0, 1, 0.19375), (0, 2, 0.2125), (0, 3, 0.2328125)]);
46/// ```
47pub struct JaccardSearcher {
48 config: FeatureConfig,
49 hasher: MinHasher,
50 joiner: Option<ChunkedJoiner<u64>>,
51 shows_progress: bool,
52}
53
54impl JaccardSearcher {
55 /// Creates an instance.
56 ///
57 /// # Arguments
58 ///
59 /// * `window_size` - Window size for w-shingling in feature extraction (must be more than 0).
60 /// * `delimiter` - Delimiter for recognizing words as tokens in feature extraction.
61 /// If `None`, characters are used for tokens.
62 /// * `seed` - Seed value for random values.
63 pub fn new(window_size: usize, delimiter: Option<char>, seed: Option<u64>) -> Result<Self> {
64 let seed = seed.unwrap_or_else(rand::random::<u64>);
65 let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed);
66 let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?;
67 let hasher = MinHasher::new(seeder.next_u64());
68 Ok(Self {
69 config,
70 hasher,
71 joiner: None,
72 shows_progress: false,
73 })
74 }
75
76 /// Shows the progress via the standard error output?
77 pub const fn shows_progress(mut self, yes: bool) -> Self {
78 self.shows_progress = yes;
79 self
80 }
81
82 /// Builds the database of sketches from input documents.
83 ///
84 /// # Arguments
85 ///
86 /// * `documents` - List of documents (must not include an empty string).
87 /// * `num_chunks` - Number of chunks of sketches, indicating that
88 /// the number of dimensions in the Hamming space is `num_chunks*64`.
89 pub fn build_sketches<I, D>(mut self, documents: I, num_chunks: usize) -> Result<Self>
90 where
91 I: IntoIterator<Item = D>,
92 D: AsRef<str>,
93 {
94 let mut joiner = ChunkedJoiner::<u64>::new(num_chunks).shows_progress(self.shows_progress);
95 let extractor = FeatureExtractor::new(&self.config);
96
97 let mut feature = vec![];
98 for (i, doc) in documents.into_iter().enumerate() {
99 if self.shows_progress && (i + 1) % 10000 == 0 {
100 eprintln!("Processed {} documents...", i + 1);
101 }
102 let doc = doc.as_ref();
103 if doc.is_empty() {
104 return Err(FindSimdocError::input("Input document must not be empty."));
105 }
106 extractor.extract(doc, &mut feature);
107 joiner.add(self.hasher.iter(&feature)).unwrap();
108 }
109 self.joiner = Some(joiner);
110 Ok(self)
111 }
112
113 /// Builds the database of sketches from input documents in parallel.
114 ///
115 /// # Arguments
116 ///
117 /// * `documents` - List of documents (must not include an empty string).
118 /// * `num_chunks` - Number of chunks of sketches, indicating that
119 /// the number of dimensions in the Hamming space is `num_chunks*64`.
120 ///
121 /// # Notes
122 ///
123 /// The progress is not printed even if `shows_progress = true`.
124 pub fn build_sketches_in_parallel<I, D>(
125 mut self,
126 documents: I,
127 num_chunks: usize,
128 ) -> Result<Self>
129 where
130 I: Iterator<Item = D> + Send,
131 D: AsRef<str> + Send,
132 {
133 let extractor = FeatureExtractor::new(&self.config);
134 #[allow(clippy::mutex_atomic)]
135 let processed = Mutex::new(0usize);
136 let mut sketches: Vec<_> = documents
137 .into_iter()
138 .enumerate()
139 .par_bridge()
140 .map(|(i, doc)| {
141 #[allow(clippy::mutex_atomic)]
142 {
143 // Mutex::lock also locks eprintln.
144 let mut cnt = processed.lock().unwrap();
145 *cnt += 1;
146 if self.shows_progress && *cnt % 10000 == 0 {
147 eprintln!("Processed {} documents...", *cnt);
148 }
149 }
150 let doc = doc.as_ref();
151 // TODO: Returns the error value (but I dont know the manner).
152 assert!(!doc.is_empty(), "Input document must not be empty.");
153 let mut feature = vec![];
154 extractor.extract(doc, &mut feature);
155 let mut gen = self.hasher.iter(&feature);
156 let sketch: Vec<_> = (0..num_chunks).map(|_| gen.next().unwrap()).collect();
157 (i, sketch)
158 })
159 .collect();
160 sketches.par_sort_by_key(|&(i, _)| i);
161
162 let mut joiner = ChunkedJoiner::<u64>::new(num_chunks).shows_progress(self.shows_progress);
163 for (_, sketch) in sketches {
164 joiner.add(sketch).unwrap();
165 }
166 self.joiner = Some(joiner);
167 Ok(self)
168 }
169
170 /// Searches for all pairs of similar documents within an input radius, returning
171 /// triplets of the left-side id, the right-side id, and their distance.
172 pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
173 self.joiner.as_ref().map_or_else(Vec::new, |joiner| {
174 // In 1-bit minhash, the collision probability is multiplied by 2 over the original.
175 // Thus, we should search with the half of the actual radius.
176 let mut results = joiner.similar_pairs(radius / 2.);
177 // Modifies the distances.
178 results.iter_mut().for_each(|(_, _, d)| *d *= 2.);
179 results
180 })
181 }
182
183 /// Gets the number of input documents.
184 pub fn len(&self) -> usize {
185 self.joiner
186 .as_ref()
187 .map_or(0, |joiner| joiner.num_sketches())
188 }
189
190 /// Checks if the database is empty.
191 pub fn is_empty(&self) -> bool {
192 self.len() == 0
193 }
194
195 /// Gets the memory usage in bytes.
196 pub fn memory_in_bytes(&self) -> usize {
197 self.joiner
198 .as_ref()
199 .map_or(0, |joiner| joiner.memory_in_bytes())
200 }
201
202 /// Gets the configure of feature extraction.
203 pub const fn config(&self) -> &FeatureConfig {
204 &self.config
205 }
206}