mangahub_spellchecker/
lib.rs

1#![feature(exact_div)]
2
3use std::{cmp::Ordering, path::Path, str::from_utf8_unchecked};
4
5use rayon::prelude::*;
6
7mod load_dict;
8mod simd_find_matching_prefix;
9
10pub use load_dict::load_words_dict;
11use simd_find_matching_prefix::{find_matching_prefix_simd_avx2, find_matching_prefix_simd_sse2};
12
13pub enum BinarySearchWordResult {
14    Found(usize, usize),
15    NotFound(usize, usize),
16}
17
18#[derive(Debug, Clone)]
19pub struct LenGroup {
20    blob: String,
21    len: u16,
22    count: u16,
23}
24
25impl LenGroup {
26    pub fn empty(len: u16) -> Self {
27        Self {
28            blob: String::new(),
29            len,
30            count: 0
31        }
32    }
33}
34
35#[deprecated(
36    since = "0.2.2", 
37    note = "The crate has been renamed. Please use the 'spel-right' crate instead."
38)]
39pub struct SpellChecker {
40    len_groups: Vec<LenGroup>,
41    max_dif: usize,
42    // added_words: Vec<String>,
43    // added_words_treshhold: usize,
44}
45
46impl SpellChecker {
47    /// Creates a new `SpellChecker` from the given `file`.
48    ///
49    /// The `file` should be formated acording to [Dataset Fixer](https://github.com/Zefirchiky/easy-spell-checker/tree/ca505359efdc0a862d3418ae3c8b9f0418a9f25e/dataset_fixer) (see also `load_words_dict()`)
50    pub fn new(file: impl AsRef<Path>) -> Self {
51        let len_groups = load_words_dict(file).unwrap();
52        Self {
53            len_groups,
54            max_dif: 2,
55            // added_words: vec![],
56            // added_words_treshhold: 20,
57        }
58    }
59
60    /// Sets the maximum difference between words to be considered similar.
61    ///
62    /// This value is used in the suggest method to determine how many words to suggest.
63    ///
64    /// A value of `0` means that only exact matches are considered similar, while a value of `1` means that words that are one `insertion`, `deletion`, or `substitution` away are also considered similar.
65    ///
66    /// A value of `2` (the default) means that words that are up to two `insertions`, `deletions`, or `substitutions` away are also considered similar.
67    pub fn set_max_dif(&mut self, max_dif: usize) -> &mut Self {
68        self.max_dif = max_dif;
69        self
70    }
71
72    pub fn add(&mut self, word: String) -> &mut Self {
73        // self.added_words.push(word);
74        // if self.added_words.len() >= self.added_words_treshhold {
75        //     self.save()
76        // }
77        let res = self.find_closest(&word);
78        if let Some((lg, BinarySearchWordResult::NotFound(o1, _))) = res {
79            let i = (lg.len - 1) as usize;
80            self.len_groups.get_mut(i).unwrap().blob.insert_str(o1, &word); // FIXME: Inefficient, needs to move all the words after. It should also be responsibility of LenGroup
81        }
82        self
83    }
84
85    pub fn save(&mut self) {
86        // let added_words = mem::take(&mut self.added_words);
87        // for word in added_words {
88        //     let wlen = word.len();
89        //     while wlen - 1 > self.len_groups.len() {
90        //         self.len_groups.push(LenGroup::empty(self.len_groups.len() as u16));
91        //     }
92        //     if wlen - 1 == self.len_groups.len() {
93        //         self.len_groups.push(LenGroup {
94        //             blob: word,
95        //             len: wlen as u16,
96        //             count: 1,
97        //         });
98        //     } else {
99        //         self.len_groups[wlen-1].blob.push_str(&word);
100        //     }
101        // }
102
103        // for gr in self.len_groups {
104        //     gr.blob.
105        // }
106    }
107
108    /// Checks if a word exists in the dataset.
109    ///
110    /// Returns true if the word exists, false otherwise.
111    pub fn check(&self, word: &str) -> bool {
112        self.find(word).is_some()
113    }
114
115    pub fn batch_check<'a>(&self, words: &'a [&str]) -> Vec<(&'a str, bool)> {
116        words
117            .iter()
118            .map(|&word| {
119                (word, self.check(word))
120            })
121            .collect()
122    }
123
124    pub fn batch_par_check<'a>(&self, words: &'a [&str]) -> Vec<(&'a str, bool)> {
125        words
126            .par_iter()
127            .map(|&word| {
128                (word, self.check(word))
129            })
130            .collect()
131    }
132
133    /// Finds a word in the dataset and returns its length group and offsets if found.
134    ///
135    /// The word is first converted to lowercase, and then the length group is searched for.
136    /// If the word is found in the length group, its offsets are found using binary search.
137    /// If the word is not found, None is returned.
138    pub fn find(&self, word: &str) -> Option<(&LenGroup, (usize, usize))> {
139        if let (lg, BinarySearchWordResult::Found(o1, o2)) = self.find_closest(word)? {
140            Some((lg, (o1, o2)))
141        } else {
142            None
143        }
144    }
145
146    pub fn find_closest(&self, word: &str) -> Option<(&LenGroup, BinarySearchWordResult)> {
147        let word = word.to_lowercase();
148        let lg = &self.len_groups.get(word.len() - 1)?;
149        if lg.count == 0 {
150            return None;
151        }
152
153        let word = word.as_bytes();
154        let blob = lg.blob.as_bytes();
155        Some((lg, Self::find_word_in_slice_binary_search(word, blob)))
156    }
157
158    /// Finds a word in a given slice of bytes using binary search.
159    ///
160    /// The slice should contain words of the same length, sorted alphabetically.
161    ///
162    /// Returns the offsets of the word in the slice if it exists, otherwise the closest offset to it.
163    /// The offsets are given as a tuple of (start, end) where start is the index of the first byte of the word,
164    /// and end is the index of the last byte of the word plus one.
165    fn find_word_in_slice_binary_search(word: &[u8], slice: &[u8]) -> BinarySearchWordResult {  // TODO: move into LenGroup
166        let mut low = 0usize;
167        let mut high = slice.len().checked_div(word.len()).unwrap();
168        let mut mid_off = 0;
169        while low < high {
170            let mid = low + ((high - low) / 2);
171            mid_off = mid * word.len();
172            let candidate = &slice[mid_off..(mid_off + word.len())];
173            match word.cmp(candidate) {
174                Ordering::Equal => return BinarySearchWordResult::Found(mid_off, mid_off + word.len()),
175                Ordering::Less => high = mid,
176                Ordering::Greater => low = mid + 1,
177            }
178        }
179        BinarySearchWordResult::NotFound(mid_off, mid_off + word.len())
180    }
181
182    /// Checks if a word matches a given candidate with at most the given maximum amount of `deletions`, `insertions` and `substitution`.
183    ///
184    /// Returns a tuple of `(bool, u16)` where the boolean is `true` if the word matches the candidate, and the `u16` is the total number of operations done to match the two words.
185    ///
186    /// The algorithm first finds the matching prefix of the two words using `SIMD` if available, and then continues with a scalar algorithm from the mismatch point.
187    ///
188    /// The maximum amount of `deletions`, `insertions` and `substitutions` are given as mutable parameters, and are decreased by one each time an operation is done.
189    ///
190    /// If the word matches the candidate with at most the given maximum amount of operations, the function returns true and the total number of operations done.
191    /// Otherwise, it returns `false` and `0`.
192    #[inline]
193    pub fn matches_single(
194        word: &[u8],
195        candidate: &[u8],
196        mut max_deletions: u16,
197        mut max_insertions: u16,
198        mut max_substitutions: u16,
199    ) -> (bool, u16) {
200        let wlen = word.len();
201        let clen = candidate.len();
202        
203        // Find matching prefix using SIMD
204        let mut wi = 0;
205        let mut ci = 0;
206        
207        #[cfg(target_arch = "x86_64")]
208        {
209            if is_x86_feature_detected!("avx2") {
210                unsafe {
211                    wi = find_matching_prefix_simd_avx2(word, candidate);
212                    ci = wi;
213                }
214            }
215            else if is_x86_feature_detected!("sse2") {
216                unsafe {
217                    wi = find_matching_prefix_simd_sse2(word, candidate);
218                    ci = wi;
219                }
220            }
221        }
222        
223        // Continue with scalar algorithm from mismatch point
224        while wi < wlen && ci < clen {
225            if word[wi] == candidate[ci] {
226                wi += 1;
227                ci += 1;
228            }
229            else if max_deletions > 0 && wi + 1 < wlen && word[wi + 1] == candidate[ci] {
230                max_deletions -= 1;
231                wi += 1;
232            }
233            else if max_insertions > 0 && ci + 1 < clen && word[wi] == candidate[ci + 1] {
234                max_insertions -= 1;
235                ci += 1;
236            }
237            else if max_substitutions > 0 {
238                max_substitutions -= 1;
239                wi += 1;
240                ci += 1;
241            }
242            else {
243                return (false, 0);
244            }
245        }
246        
247        let remaining_word = (wlen - wi) as u16;
248        let remaining_candidate = (clen - ci) as u16;
249        
250        if remaining_word <= max_deletions && remaining_candidate <= max_insertions {
251            (
252                true,
253                max_deletions - remaining_word + max_insertions - remaining_candidate + max_substitutions,
254            )
255        } else {
256            (false, 0)
257        }
258    }
259
260    /// Finds all words in the dataset that are similar to the given `word`.
261    ///
262    /// Similarity is defined as the maximum number of `deletions`, `insertions`, and `substitutions` that can be done to match the two words.
263    /// The maximum difference is specified by the `max_dif` field of the `SpellChecker`.
264    ///
265    /// The function returns a vector of tuples, where the first element of the tuple is the similar word, and the second element is the distance between the two words.
266    ///
267    /// The function uses a parallel iterator to search for similar words in the dataset.
268    ///
269    /// The function first filters out all words that are not of the same length as the given `word`, or that have a difference greater than the maximum difference.
270    /// It then uses the `matches_single` function to check if each word is similar to the given `word`.
271    /// If a word is similar, it is added to the result vector.
272    ///
273    /// The function finally collects the result vector and returns it.
274    pub fn suggest_for_word(&self, word: &[u8]) -> Vec<(&str, usize)> {
275        let word_len = word.len();
276        
277        let min_len = word_len.saturating_sub(self.max_dif - 1);
278        let max_len = (word_len + self.max_dif).min(self.len_groups.len());
279        
280        let first_char = word[0];
281        let last_char = word[word_len - 1];
282        let words = &self.len_groups[min_len..max_len];
283        words
284            .par_iter()
285            .filter(|group| group.count > 0)
286            .flat_map(|group| {
287                let dif = group.len as isize - word_len as isize;
288                let abs_dif = dif.abs() as usize;
289
290                let max_del = dif.max(0) as u16;
291                let max_ins = (-dif).max(0) as u16;
292                let max_chg = (self.max_dif - abs_dif) as u16;
293
294                group.blob
295                    .as_bytes()
296                    .par_chunks(group.len as usize)
297                    .filter_map(|ch| {
298                        if abs_dif == self.max_dif {
299                            if ch[0] != first_char && ch[0] != last_char &&
300                            ch[ch.len()-1] != first_char && ch[ch.len()-1] != last_char {
301                                return None;
302                            }
303                        }
304                        
305                        let (is_ok, dist) = Self::matches_single(
306                            ch, word, max_del, max_ins, max_chg
307                        );
308                        if is_ok {
309                            // Dataset will always be valid, and chars are based on len group. Cant have invalid utf-8.
310                            // Trust
311                            Some((unsafe { from_utf8_unchecked(ch) }, dist as usize))
312                        } else {
313                            None
314                        }
315                    })
316                    .collect::<Vec<_>>()
317            })
318            .collect()
319
320    }
321    
322    /// Suggests words for a given `word` based on the maximum difference specified in the constructor.
323    ///
324    /// If the `word` is found in the dataset, returns a vector with the given `word`.
325    ///
326    /// If the `word` is not found in the dataset, `SpellChecker::suggest_for_word()` will be used.
327    ///
328    /// Returns the result vector, sorted by the distance, and takes the first `take_first_x` elements.
329    pub fn suggest(&self, word: &str, take_first_x: usize) -> Vec<&str> {
330        let word = word.to_lowercase();
331        
332        if let Some((lg, offset)) = self.find(&word) {
333            return vec![&lg.blob[offset.0..offset.1]];
334        }
335        
336        let word_bytes = word.as_bytes();
337        let mut result = self.suggest_for_word(word_bytes);
338
339        if result.len() > 1 {
340            result.par_sort_unstable_by_key(|(_, dist)| *dist);
341            result.reverse();
342        }
343    
344        if take_first_x == 0 {
345            result.into_iter().map(|(word, _)| word).collect()
346        } else {
347            result.into_iter().take(take_first_x).map(|(word, _)| word).collect()
348        }
349    }
350
351    /// Suggests words for each `word` in the given `words` vector based on the maximum difference specified in the constructor.
352    ///
353    /// If a `word` is found in the dataset, returns a vector with the given `word`.
354    ///
355    /// If a `word` is not found in the dataset, `SpellChecker::suggest_for_word()` will be used.
356    ///
357    /// Returns the result vector, sorted by the distance, and takes the first `take_first_x` elements.
358    ///
359    pub fn batch_suggest<'a>(&self, words: &'a [&str], take_first_x: usize) -> Vec<(&'a str, Vec<&str>)> {
360        self.batch_suggest_iter(words, take_first_x).collect()
361    }
362
363    /// Iterates over each `word` in the given `words` vector and calls the given `callback` function with the suggestions for each word.
364    ///
365    /// The `callback` function will be called with two arguments: the original `word`, and a vector of suggestions for that word.
366    ///
367    /// The suggestions vector will contain all words that are at most `max_dif` away from the given `word`.
368    ///
369    /// The suggestions vector will be sorted by the distance, with the closest words first.
370    /// If the `word` is found in the dataset, the suggestions vector will contain the given `word`.
371    ///
372    /// The `callback` function will be called for each `word` in the given `words` vector.
373    pub fn batch_suggest_with<F>(&self, words: &[&str], take_first_x: usize, mut callback: F)
374    where F: FnMut(&str, Vec<&str>), {
375        words
376            .iter()
377            .for_each(move |word| {
378                let suggestions = self.suggest(word, take_first_x);
379                callback(word, suggestions)
380        });
381    }
382
383    /// Iterates over each `word` in the given `words` vector and calls `suggest` function with the given `word` and `take_first_x`.
384    ///
385    /// The `suggest` function will return a vector of suggestions for each word, sorted by the distance, with the closest words first.
386    ///
387    /// The `suggest` function will also return the given `word` if it is found in the dataset.
388    ///
389    /// The `suggest` function will take the first `take_first_x` elements of the suggestions vector.
390    ///
391    /// The function returns an iterator over the suggestions vectors.
392    pub fn batch_suggest_iter<'a>(&self, words: &'a [&str], take_first_x: usize) -> impl Iterator<Item = (&'a str, Vec<&str>)> {
393        words
394            .iter()
395            .map(move |&word| (word, self.suggest(word, take_first_x)))
396    }
397    
398    /// Iterates over each `word` in the given `words` vector and calls `suggest` function with the given `word` and `take_first_x`.
399    ///
400    /// The `suggest` function will return a vector of suggestions for each word, sorted by the distance, with the closest words first.
401    ///
402    /// The `suggest` function will also return the given `word` if it is found in the dataset.
403    ///
404    /// The `suggest` function will take the first `take_first_x` elements of the suggestions vector.
405    ///
406    /// The function returns an iterator over the suggestions vectors.
407    ///
408    /// This function is the same as `batch_suggest`, but it uses rayon's parallel iterator, which means it will use all available CPU cores in parallel to suggest words for all given words.
409    ///
410    /// The function returns a vector of suggestions vectors.
411    ///
412    /// The function is parallel, and will use all available CPU cores in parallel.
413    pub fn batch_par_suggest<'a>(&self, words: &'a [&str], take_first_x: usize) -> Vec<(&'a str, Vec<&str>)> {
414        self.batch_par_suggest_iter(words, take_first_x).collect()
415    }
416
417    /// Iterates over each `word` in the given `words` vector and calls the given `callback` function with the suggestions for each word.
418    ///
419    /// The `callback` function will be called with two arguments: the original `word`, and a vector of suggestions for that word.
420    ///
421    /// The suggestions vector will contain all words that are at most `max_dif` away from the given `word`.
422    ///
423    /// The suggestions vector will be sorted by the distance, with the closest words first.
424    /// If the `word` is found in the dataset, the suggestions vector will contain the given `word`.
425    ///
426    /// The `callback` function will be called for each `word` in the given `words` vector.
427    ///
428    /// The function is parallel, and will use all available CPU cores in parallel.
429    pub fn batch_par_suggest_with<F>(&self, words: &[&str], take_first_x: usize, callback: F)
430    where F: FnMut(&str, Vec<&str>) + Send + Sync + Clone, {
431        words
432            .par_iter()
433            .for_each_with(callback, move |cb, word| {
434                let suggestions = self.suggest(word, take_first_x);
435                cb(word, suggestions)
436        });
437    }
438
439    /// Iterates over each `word` in the given `words` vector and calls `suggest` function with the given `word` and `take_first_x`.
440    ///
441    /// The `suggest` function will return a vector of suggestions for each word, sorted by the distance, with the closest words first.
442    ///
443    /// If the `word` is found in the dataset, the suggestions vector will contain the given `word`.
444    ///
445    /// The `suggest` function will take the first `take_first_x` elements of the suggestions vector.
446    ///
447    /// The function returns a parallel iterator over the suggestions vectors.
448    pub fn batch_par_suggest_iter<'a>(&self, words: &'a [&str], take_first_x: usize) -> impl ParallelIterator<Item = (&'a str, Vec<&str>)> {
449        words
450            .par_iter()
451            .map(move |&word| (word, self.suggest(word, take_first_x)))
452    }
453}