mangahub_spellchecker/lib.rs
1#![feature(exact_div)]
2
3use std::{cmp::Ordering, path::Path, str::from_utf8_unchecked};
4
5use rayon::prelude::*;
6
7mod load_dict;
8mod simd_find_matching_prefix;
9
10pub use load_dict::load_words_dict;
11use simd_find_matching_prefix::{find_matching_prefix_simd_avx2, find_matching_prefix_simd_sse2};
12
13pub enum BinarySearchWordResult {
14 Found(usize, usize),
15 NotFound(usize, usize),
16}
17
18#[derive(Debug, Clone)]
19pub struct LenGroup {
20 blob: String,
21 len: u16,
22 count: u16,
23}
24
25impl LenGroup {
26 pub fn empty(len: u16) -> Self {
27 Self {
28 blob: String::new(),
29 len,
30 count: 0
31 }
32 }
33}
34
35#[deprecated(
36 since = "0.2.2",
37 note = "The crate has been renamed. Please use the 'spel-right' crate instead."
38)]
39pub struct SpellChecker {
40 len_groups: Vec<LenGroup>,
41 max_dif: usize,
42 // added_words: Vec<String>,
43 // added_words_treshhold: usize,
44}
45
46impl SpellChecker {
47 /// Creates a new `SpellChecker` from the given `file`.
48 ///
49 /// The `file` should be formated acording to [Dataset Fixer](https://github.com/Zefirchiky/easy-spell-checker/tree/ca505359efdc0a862d3418ae3c8b9f0418a9f25e/dataset_fixer) (see also `load_words_dict()`)
50 pub fn new(file: impl AsRef<Path>) -> Self {
51 let len_groups = load_words_dict(file).unwrap();
52 Self {
53 len_groups,
54 max_dif: 2,
55 // added_words: vec![],
56 // added_words_treshhold: 20,
57 }
58 }
59
60 /// Sets the maximum difference between words to be considered similar.
61 ///
62 /// This value is used in the suggest method to determine how many words to suggest.
63 ///
64 /// A value of `0` means that only exact matches are considered similar, while a value of `1` means that words that are one `insertion`, `deletion`, or `substitution` away are also considered similar.
65 ///
66 /// A value of `2` (the default) means that words that are up to two `insertions`, `deletions`, or `substitutions` away are also considered similar.
67 pub fn set_max_dif(&mut self, max_dif: usize) -> &mut Self {
68 self.max_dif = max_dif;
69 self
70 }
71
72 pub fn add(&mut self, word: String) -> &mut Self {
73 // self.added_words.push(word);
74 // if self.added_words.len() >= self.added_words_treshhold {
75 // self.save()
76 // }
77 let res = self.find_closest(&word);
78 if let Some((lg, BinarySearchWordResult::NotFound(o1, _))) = res {
79 let i = (lg.len - 1) as usize;
80 self.len_groups.get_mut(i).unwrap().blob.insert_str(o1, &word); // FIXME: Inefficient, needs to move all the words after. It should also be responsibility of LenGroup
81 }
82 self
83 }
84
85 pub fn save(&mut self) {
86 // let added_words = mem::take(&mut self.added_words);
87 // for word in added_words {
88 // let wlen = word.len();
89 // while wlen - 1 > self.len_groups.len() {
90 // self.len_groups.push(LenGroup::empty(self.len_groups.len() as u16));
91 // }
92 // if wlen - 1 == self.len_groups.len() {
93 // self.len_groups.push(LenGroup {
94 // blob: word,
95 // len: wlen as u16,
96 // count: 1,
97 // });
98 // } else {
99 // self.len_groups[wlen-1].blob.push_str(&word);
100 // }
101 // }
102
103 // for gr in self.len_groups {
104 // gr.blob.
105 // }
106 }
107
108 /// Checks if a word exists in the dataset.
109 ///
110 /// Returns true if the word exists, false otherwise.
111 pub fn check(&self, word: &str) -> bool {
112 self.find(word).is_some()
113 }
114
115 pub fn batch_check<'a>(&self, words: &'a [&str]) -> Vec<(&'a str, bool)> {
116 words
117 .iter()
118 .map(|&word| {
119 (word, self.check(word))
120 })
121 .collect()
122 }
123
124 pub fn batch_par_check<'a>(&self, words: &'a [&str]) -> Vec<(&'a str, bool)> {
125 words
126 .par_iter()
127 .map(|&word| {
128 (word, self.check(word))
129 })
130 .collect()
131 }
132
133 /// Finds a word in the dataset and returns its length group and offsets if found.
134 ///
135 /// The word is first converted to lowercase, and then the length group is searched for.
136 /// If the word is found in the length group, its offsets are found using binary search.
137 /// If the word is not found, None is returned.
138 pub fn find(&self, word: &str) -> Option<(&LenGroup, (usize, usize))> {
139 if let (lg, BinarySearchWordResult::Found(o1, o2)) = self.find_closest(word)? {
140 Some((lg, (o1, o2)))
141 } else {
142 None
143 }
144 }
145
146 pub fn find_closest(&self, word: &str) -> Option<(&LenGroup, BinarySearchWordResult)> {
147 let word = word.to_lowercase();
148 let lg = &self.len_groups.get(word.len() - 1)?;
149 if lg.count == 0 {
150 return None;
151 }
152
153 let word = word.as_bytes();
154 let blob = lg.blob.as_bytes();
155 Some((lg, Self::find_word_in_slice_binary_search(word, blob)))
156 }
157
158 /// Finds a word in a given slice of bytes using binary search.
159 ///
160 /// The slice should contain words of the same length, sorted alphabetically.
161 ///
162 /// Returns the offsets of the word in the slice if it exists, otherwise the closest offset to it.
163 /// The offsets are given as a tuple of (start, end) where start is the index of the first byte of the word,
164 /// and end is the index of the last byte of the word plus one.
165 fn find_word_in_slice_binary_search(word: &[u8], slice: &[u8]) -> BinarySearchWordResult { // TODO: move into LenGroup
166 let mut low = 0usize;
167 let mut high = slice.len().checked_div(word.len()).unwrap();
168 let mut mid_off = 0;
169 while low < high {
170 let mid = low + ((high - low) / 2);
171 mid_off = mid * word.len();
172 let candidate = &slice[mid_off..(mid_off + word.len())];
173 match word.cmp(candidate) {
174 Ordering::Equal => return BinarySearchWordResult::Found(mid_off, mid_off + word.len()),
175 Ordering::Less => high = mid,
176 Ordering::Greater => low = mid + 1,
177 }
178 }
179 BinarySearchWordResult::NotFound(mid_off, mid_off + word.len())
180 }
181
182 /// Checks if a word matches a given candidate with at most the given maximum amount of `deletions`, `insertions` and `substitution`.
183 ///
184 /// Returns a tuple of `(bool, u16)` where the boolean is `true` if the word matches the candidate, and the `u16` is the total number of operations done to match the two words.
185 ///
186 /// The algorithm first finds the matching prefix of the two words using `SIMD` if available, and then continues with a scalar algorithm from the mismatch point.
187 ///
188 /// The maximum amount of `deletions`, `insertions` and `substitutions` are given as mutable parameters, and are decreased by one each time an operation is done.
189 ///
190 /// If the word matches the candidate with at most the given maximum amount of operations, the function returns true and the total number of operations done.
191 /// Otherwise, it returns `false` and `0`.
192 #[inline]
193 pub fn matches_single(
194 word: &[u8],
195 candidate: &[u8],
196 mut max_deletions: u16,
197 mut max_insertions: u16,
198 mut max_substitutions: u16,
199 ) -> (bool, u16) {
200 let wlen = word.len();
201 let clen = candidate.len();
202
203 // Find matching prefix using SIMD
204 let mut wi = 0;
205 let mut ci = 0;
206
207 #[cfg(target_arch = "x86_64")]
208 {
209 if is_x86_feature_detected!("avx2") {
210 unsafe {
211 wi = find_matching_prefix_simd_avx2(word, candidate);
212 ci = wi;
213 }
214 }
215 else if is_x86_feature_detected!("sse2") {
216 unsafe {
217 wi = find_matching_prefix_simd_sse2(word, candidate);
218 ci = wi;
219 }
220 }
221 }
222
223 // Continue with scalar algorithm from mismatch point
224 while wi < wlen && ci < clen {
225 if word[wi] == candidate[ci] {
226 wi += 1;
227 ci += 1;
228 }
229 else if max_deletions > 0 && wi + 1 < wlen && word[wi + 1] == candidate[ci] {
230 max_deletions -= 1;
231 wi += 1;
232 }
233 else if max_insertions > 0 && ci + 1 < clen && word[wi] == candidate[ci + 1] {
234 max_insertions -= 1;
235 ci += 1;
236 }
237 else if max_substitutions > 0 {
238 max_substitutions -= 1;
239 wi += 1;
240 ci += 1;
241 }
242 else {
243 return (false, 0);
244 }
245 }
246
247 let remaining_word = (wlen - wi) as u16;
248 let remaining_candidate = (clen - ci) as u16;
249
250 if remaining_word <= max_deletions && remaining_candidate <= max_insertions {
251 (
252 true,
253 max_deletions - remaining_word + max_insertions - remaining_candidate + max_substitutions,
254 )
255 } else {
256 (false, 0)
257 }
258 }
259
260 /// Finds all words in the dataset that are similar to the given `word`.
261 ///
262 /// Similarity is defined as the maximum number of `deletions`, `insertions`, and `substitutions` that can be done to match the two words.
263 /// The maximum difference is specified by the `max_dif` field of the `SpellChecker`.
264 ///
265 /// The function returns a vector of tuples, where the first element of the tuple is the similar word, and the second element is the distance between the two words.
266 ///
267 /// The function uses a parallel iterator to search for similar words in the dataset.
268 ///
269 /// The function first filters out all words that are not of the same length as the given `word`, or that have a difference greater than the maximum difference.
270 /// It then uses the `matches_single` function to check if each word is similar to the given `word`.
271 /// If a word is similar, it is added to the result vector.
272 ///
273 /// The function finally collects the result vector and returns it.
274 pub fn suggest_for_word(&self, word: &[u8]) -> Vec<(&str, usize)> {
275 let word_len = word.len();
276
277 let min_len = word_len.saturating_sub(self.max_dif - 1);
278 let max_len = (word_len + self.max_dif).min(self.len_groups.len());
279
280 let first_char = word[0];
281 let last_char = word[word_len - 1];
282 let words = &self.len_groups[min_len..max_len];
283 words
284 .par_iter()
285 .filter(|group| group.count > 0)
286 .flat_map(|group| {
287 let dif = group.len as isize - word_len as isize;
288 let abs_dif = dif.abs() as usize;
289
290 let max_del = dif.max(0) as u16;
291 let max_ins = (-dif).max(0) as u16;
292 let max_chg = (self.max_dif - abs_dif) as u16;
293
294 group.blob
295 .as_bytes()
296 .par_chunks(group.len as usize)
297 .filter_map(|ch| {
298 if abs_dif == self.max_dif {
299 if ch[0] != first_char && ch[0] != last_char &&
300 ch[ch.len()-1] != first_char && ch[ch.len()-1] != last_char {
301 return None;
302 }
303 }
304
305 let (is_ok, dist) = Self::matches_single(
306 ch, word, max_del, max_ins, max_chg
307 );
308 if is_ok {
309 // Dataset will always be valid, and chars are based on len group. Cant have invalid utf-8.
310 // Trust
311 Some((unsafe { from_utf8_unchecked(ch) }, dist as usize))
312 } else {
313 None
314 }
315 })
316 .collect::<Vec<_>>()
317 })
318 .collect()
319
320 }
321
322 /// Suggests words for a given `word` based on the maximum difference specified in the constructor.
323 ///
324 /// If the `word` is found in the dataset, returns a vector with the given `word`.
325 ///
326 /// If the `word` is not found in the dataset, `SpellChecker::suggest_for_word()` will be used.
327 ///
328 /// Returns the result vector, sorted by the distance, and takes the first `take_first_x` elements.
329 pub fn suggest(&self, word: &str, take_first_x: usize) -> Vec<&str> {
330 let word = word.to_lowercase();
331
332 if let Some((lg, offset)) = self.find(&word) {
333 return vec![&lg.blob[offset.0..offset.1]];
334 }
335
336 let word_bytes = word.as_bytes();
337 let mut result = self.suggest_for_word(word_bytes);
338
339 if result.len() > 1 {
340 result.par_sort_unstable_by_key(|(_, dist)| *dist);
341 result.reverse();
342 }
343
344 if take_first_x == 0 {
345 result.into_iter().map(|(word, _)| word).collect()
346 } else {
347 result.into_iter().take(take_first_x).map(|(word, _)| word).collect()
348 }
349 }
350
351 /// Suggests words for each `word` in the given `words` vector based on the maximum difference specified in the constructor.
352 ///
353 /// If a `word` is found in the dataset, returns a vector with the given `word`.
354 ///
355 /// If a `word` is not found in the dataset, `SpellChecker::suggest_for_word()` will be used.
356 ///
357 /// Returns the result vector, sorted by the distance, and takes the first `take_first_x` elements.
358 ///
359 pub fn batch_suggest<'a>(&self, words: &'a [&str], take_first_x: usize) -> Vec<(&'a str, Vec<&str>)> {
360 self.batch_suggest_iter(words, take_first_x).collect()
361 }
362
363 /// Iterates over each `word` in the given `words` vector and calls the given `callback` function with the suggestions for each word.
364 ///
365 /// The `callback` function will be called with two arguments: the original `word`, and a vector of suggestions for that word.
366 ///
367 /// The suggestions vector will contain all words that are at most `max_dif` away from the given `word`.
368 ///
369 /// The suggestions vector will be sorted by the distance, with the closest words first.
370 /// If the `word` is found in the dataset, the suggestions vector will contain the given `word`.
371 ///
372 /// The `callback` function will be called for each `word` in the given `words` vector.
373 pub fn batch_suggest_with<F>(&self, words: &[&str], take_first_x: usize, mut callback: F)
374 where F: FnMut(&str, Vec<&str>), {
375 words
376 .iter()
377 .for_each(move |word| {
378 let suggestions = self.suggest(word, take_first_x);
379 callback(word, suggestions)
380 });
381 }
382
383 /// Iterates over each `word` in the given `words` vector and calls `suggest` function with the given `word` and `take_first_x`.
384 ///
385 /// The `suggest` function will return a vector of suggestions for each word, sorted by the distance, with the closest words first.
386 ///
387 /// The `suggest` function will also return the given `word` if it is found in the dataset.
388 ///
389 /// The `suggest` function will take the first `take_first_x` elements of the suggestions vector.
390 ///
391 /// The function returns an iterator over the suggestions vectors.
392 pub fn batch_suggest_iter<'a>(&self, words: &'a [&str], take_first_x: usize) -> impl Iterator<Item = (&'a str, Vec<&str>)> {
393 words
394 .iter()
395 .map(move |&word| (word, self.suggest(word, take_first_x)))
396 }
397
398 /// Iterates over each `word` in the given `words` vector and calls `suggest` function with the given `word` and `take_first_x`.
399 ///
400 /// The `suggest` function will return a vector of suggestions for each word, sorted by the distance, with the closest words first.
401 ///
402 /// The `suggest` function will also return the given `word` if it is found in the dataset.
403 ///
404 /// The `suggest` function will take the first `take_first_x` elements of the suggestions vector.
405 ///
406 /// The function returns an iterator over the suggestions vectors.
407 ///
408 /// This function is the same as `batch_suggest`, but it uses rayon's parallel iterator, which means it will use all available CPU cores in parallel to suggest words for all given words.
409 ///
410 /// The function returns a vector of suggestions vectors.
411 ///
412 /// The function is parallel, and will use all available CPU cores in parallel.
413 pub fn batch_par_suggest<'a>(&self, words: &'a [&str], take_first_x: usize) -> Vec<(&'a str, Vec<&str>)> {
414 self.batch_par_suggest_iter(words, take_first_x).collect()
415 }
416
417 /// Iterates over each `word` in the given `words` vector and calls the given `callback` function with the suggestions for each word.
418 ///
419 /// The `callback` function will be called with two arguments: the original `word`, and a vector of suggestions for that word.
420 ///
421 /// The suggestions vector will contain all words that are at most `max_dif` away from the given `word`.
422 ///
423 /// The suggestions vector will be sorted by the distance, with the closest words first.
424 /// If the `word` is found in the dataset, the suggestions vector will contain the given `word`.
425 ///
426 /// The `callback` function will be called for each `word` in the given `words` vector.
427 ///
428 /// The function is parallel, and will use all available CPU cores in parallel.
429 pub fn batch_par_suggest_with<F>(&self, words: &[&str], take_first_x: usize, callback: F)
430 where F: FnMut(&str, Vec<&str>) + Send + Sync + Clone, {
431 words
432 .par_iter()
433 .for_each_with(callback, move |cb, word| {
434 let suggestions = self.suggest(word, take_first_x);
435 cb(word, suggestions)
436 });
437 }
438
439 /// Iterates over each `word` in the given `words` vector and calls `suggest` function with the given `word` and `take_first_x`.
440 ///
441 /// The `suggest` function will return a vector of suggestions for each word, sorted by the distance, with the closest words first.
442 ///
443 /// If the `word` is found in the dataset, the suggestions vector will contain the given `word`.
444 ///
445 /// The `suggest` function will take the first `take_first_x` elements of the suggestions vector.
446 ///
447 /// The function returns a parallel iterator over the suggestions vectors.
448 pub fn batch_par_suggest_iter<'a>(&self, words: &'a [&str], take_first_x: usize) -> impl ParallelIterator<Item = (&'a str, Vec<&str>)> {
449 words
450 .par_iter()
451 .map(move |&word| (word, self.suggest(word, take_first_x)))
452 }
453}