// elipdotter/proximity.rs

1//! Word similarity for typo tolerance and matching beginning of words.
2
3use std::collections::{btree_map, BTreeMap};
4use std::fmt::Debug;
5use std::sync::Arc;
6
7use crate::index::{AlphanumRef, Alphanumeral, Id, Provider};
8
/// Internal either-iterator over words in the index.
///
/// Lets [`ProximateWordIter`] hold either the provider's full word iterator
/// (`WordIter`) or its first-character-filtered iterator (`WordFilteredIter`)
/// behind a single type.
#[derive(Debug)]
enum PIter<
    'a,
    WI: Iterator<Item = &'a Arc<AlphanumRef>>,
    WFI: Iterator<Item = &'a Arc<AlphanumRef>>,
> {
    WordIter(WI),
    WordFilteredIter(WFI),
}
18impl<'a, WI: Iterator<Item = &'a Arc<AlphanumRef>>, WFI: Iterator<Item = &'a Arc<AlphanumRef>>>
19    Iterator for PIter<'a, WI, WFI>
20{
21    type Item = &'a Arc<AlphanumRef>;
22    fn next(&mut self) -> Option<Self::Item> {
23        match self {
24            Self::WordIter(i) => i.next(),
25            Self::WordFilteredIter(i) => i.next(),
26        }
27    }
28}
29
/// The string proximity algorithm to use.
///
/// See [`strsim`] for more details on these algorithms,
/// as that's the library which provides them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Algorithm {
    /// about 10x as slow as [`Self::Exact`],
    /// but actually accepts similar words.
    Hamming,
    /// 2x-ish as slow as [`Self::Hamming`],
    /// but produces higher quality.
    /// Performance hit still nothing compared to parsing a small HTML document.
    Jaro,
    /// Only exact matches are accepted. The absolute fastest.
    Exact,
}
46impl Algorithm {
47    /// Get the similarity between two iterators of [`char`]s.
48    pub fn similarity(&self, a: impl algo::IClo, b: impl algo::IClo) -> f64 {
49        match self {
50            Self::Hamming => algo::hamming(a, b),
51            Self::Jaro => algo::jaro(a, b),
52            Self::Exact => {
53                if a.into_iter().eq(b.into_iter()) {
54                    1.0
55                } else {
56                    0.0
57                }
58            }
59        }
60    }
61}
impl Default for Algorithm {
    fn default() -> Self {
        // Jaro: the quality/speed trade-off recommended by the variant docs.
        Self::Jaro
    }
}
67
/// A list of words close to the target word, and their "rating".
#[derive(Debug, Clone)]
#[must_use]
pub struct ProximateList {
    /// Maps each found word to its similarity rating. Higher is better.
    pub(crate) words: BTreeMap<Arc<AlphanumRef>, f32>,
}
/// Map of words to their [`ProximateList`].
#[derive(Debug, Clone)]
#[must_use]
pub struct ProximateMap<'a> {
    /// Keyed by the target word; values are the words found near it.
    pub(crate) map: BTreeMap<&'a str, ProximateList>,
}
80impl<'a> ProximateMap<'a> {
81    pub fn new() -> Self {
82        Self {
83            map: BTreeMap::new(),
84        }
85    }
86    pub(crate) fn get_or_panic(&'a self, word: &str) -> &'a ProximateList {
87        fn panic_missing_proximate_words(word: &str) -> ! {
88            panic!("Missing proximate words when iterating over occurrences of word {}. Check you are passing the correct `proximity::ProximateMap`.", word)
89        }
90
91        if let Some(s) = self.map.get(word) {
92            s
93        } else {
94            panic_missing_proximate_words(word)
95        }
96    }
97}
impl<'a> Default for ProximateMap<'a> {
    /// Equivalent to [`ProximateMap::new`]: an empty map.
    fn default() -> Self {
        Self::new()
    }
}
103
/// An item of the [`ProximateWordIter`], returned from [`proximate_words`].
// `rating` being `f32` is why this derives only `PartialEq`/`PartialOrd`.
#[derive(Debug, PartialEq, PartialOrd)]
#[must_use]
pub struct ProximateWordItem<'a> {
    /// The found word.
    pub word: &'a Arc<AlphanumRef>,
    /// The word's likeness. Higher is better.
    pub rating: f32,
}
113impl<'a> ProximateWordItem<'a> {
114    /// Wrap a new [`ProximateWordItem`]. Works well with [`Self::into_parts`].
115    pub fn new(item: (&'a Arc<AlphanumRef>, f32)) -> Self {
116        Self {
117            word: item.0,
118            rating: item.1,
119        }
120    }
121    /// Turn `self` into parts. Works well with [`Self::new`].
122    #[must_use]
123    pub fn into_parts(self) -> (&'a Arc<AlphanumRef>, f32) {
124        (self.word, self.rating)
125    }
126}
/// An iterator over the words with high likeness to the target word, as provided when running
/// [`proximate_words`].
#[derive(Debug)]
pub struct ProximateWordIter<'a, 'b, P: Provider<'a>> {
    /// The target word we are searching for.
    word: &'b str,
    /// Candidate words: either every word in the index, or only those
    /// starting with `word`'s first character.
    iter: PIter<'a, P::WordIter, P::WordFilteredIter>,
    /// Similarity cutoff a candidate must clear to be yielded.
    threshold: f32,
    /// Algorithm used to rate candidate similarity.
    algo: Algorithm,
}
136impl<'a, 'b, P: Provider<'a>> ProximateWordIter<'a, 'b, P> {
137    pub fn extend_proximates(self, proximates: &mut ProximateMap<'b>) {
138        proximates.map.insert(
139            self.word,
140            ProximateList {
141                words: self
142                    .map(|item| (Arc::clone(item.word), item.rating))
143                    .collect(),
144            },
145        );
146    }
147}
148impl<'a, 'b, P: Provider<'a>> Iterator for ProximateWordIter<'a, 'b, P> {
149    type Item = ProximateWordItem<'a>;
150    fn next(&mut self) -> Option<Self::Item> {
151        let iter = &mut self.iter;
152        if self.word.len() < 3 {
153            for other_word in iter {
154                #[allow(clippy::cast_possible_truncation)]
155                let similarity = self.algo.similarity(other_word.chars(), self.word.chars()) as f32;
156
157                if similarity > self.threshold {
158                    #[allow(clippy::cast_possible_truncation)]
159                    return Some(ProximateWordItem::new((other_word, similarity)));
160                }
161            }
162        } else {
163            for other_word in iter {
164                // Starts with
165                let other_len = other_word.chars().count();
166                let len_diff = other_len.checked_sub(self.word.len());
167                if let Some(len_diff) = len_diff {
168                    if other_word
169                        .chars()
170                        .take(self.word.chars().count())
171                        .eq(self.word.chars())
172                    {
173                        if len_diff == 0 {
174                            return Some(ProximateWordItem::new((other_word, 1.0)));
175                        }
176                        #[allow(clippy::cast_precision_loss)]
177                        return Some(ProximateWordItem::new((
178                            other_word,
179                            1.0 / ((0.05 * len_diff as f32) + 0.5) - 1.2,
180                        )));
181                    }
182                }
183
184                // Similarity
185                #[allow(clippy::cast_possible_truncation)]
186                let similarity = self.algo.similarity(other_word.chars(), self.word.chars()) as f32;
187
188                if similarity >= self.threshold {
189                    return Some(ProximateWordItem::new((other_word, similarity)));
190                }
191            }
192        }
193        None
194    }
195}
196/// If `threshold` is closer to 0, more results are accepted.
197/// It has a range of [0..1].
198/// E.g. `0.95` means `word` is probably the only word to match.
199pub fn proximate_words<'a, 'b, P: Provider<'a>>(
200    word: &'b str,
201    provider: &'a P,
202) -> ProximateWordIter<'a, 'b, P> {
203    let threshold = provider.word_proximity_threshold();
204    if let Some(c) = Alphanumeral::new(word).chars().next() {
205        if provider.word_count_upper_limit() > provider.word_count_limit() {
206            let iter = PIter::WordFilteredIter(provider.words_starting_with(c));
207            return ProximateWordIter {
208                word,
209                iter,
210                threshold,
211                algo: provider.word_proximity_algorithm(),
212            };
213        }
214    }
215    ProximateWordIter {
216        word,
217        iter: PIter::WordIter(provider.words()),
218        threshold,
219        algo: provider.word_proximity_algorithm(),
220    }
221}
222
/// An item found by [`ProximateDocIter`], an occurrence of [`ProximateWordItem`] in any of the
/// documents in the [`Provider`] passed to [`proximate_word_docs`].
#[derive(Debug)]
#[must_use]
pub struct ProximateDocItem<'a> {
    /// Document id
    pub id: Id,
    /// The word we found.
    pub word: &'a AlphanumRef,
    /// Its proximity compared to the original. Higher is more similar.
    pub rating: f32,
}
235impl<'a> PartialEq for ProximateDocItem<'a> {
236    fn eq(&self, other: &Self) -> bool {
237        self.id == other.id && self.word == other.word
238    }
239}
240impl<'a> Eq for ProximateDocItem<'a> {}
241impl<'a> PartialOrd for ProximateDocItem<'a> {
242    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
243        Some(self.cmp(other))
244    }
245}
246impl<'a> Ord for ProximateDocItem<'a> {
247    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
248        let cmp = self.id.cmp(&other.id);
249        if cmp.is_eq() {
250            self.word.cmp(other.word)
251        } else {
252            cmp
253        }
254    }
255}
256impl<'a> ProximateDocItem<'a> {
257    /// Wrap a new [`ProximateDocItem`]. Works well with [`Self::into_parts`].
258    pub fn new(item: (Id, &'a AlphanumRef, f32)) -> Self {
259        Self {
260            id: item.0,
261            word: item.1,
262            rating: item.2,
263        }
264    }
265    /// Turn `self` into parts. Works well with [`Self::new`].
266    #[must_use]
267    pub fn into_parts(self) -> (Id, &'a AlphanumRef, f32) {
268        (self.id, self.word, self.rating)
269    }
270}
271
/// See [`proximate_word_docs`].
#[derive(Debug)]
pub struct ProximateDocIter<'a, P: Provider<'a>> {
    /// Remaining proximate words (and their ratings) still to look up.
    word_iter: btree_map::Iter<'a, Arc<AlphanumRef>, f32>,
    /// Index queried for the documents containing each word.
    provider: &'a P,
    /// The word currently being drained: the word itself, its document
    /// iterator, and its rating. `None` until the next word is fetched.
    current: Option<(&'a Arc<AlphanumRef>, P::DocumentIter, f32)>,
}
279impl<'a, P: Provider<'a>> Iterator for ProximateDocIter<'a, P> {
280    type Item = ProximateDocItem<'a>;
281    fn next(&mut self) -> Option<Self::Item> {
282        loop {
283            if let Some((word, doc_iter, proximity)) = &mut self.current {
284                if let Some(doc) = doc_iter.next() {
285                    return Some(ProximateDocItem::new((doc, word, *proximity)));
286                }
287                self.current = None;
288                continue;
289            }
290            if let Some((next_word, proximity)) = self.word_iter.next() {
291                self.current = self
292                    .provider
293                    .documents_with_word(&**next_word)
294                    .map(|iter| (next_word, iter, *proximity));
295                continue;
296            }
297            return None;
298        }
299    }
300}
301
/// Search for `words` in `provider`.
///
/// The output isn't ordered according to document IDs.
///
/// Use [`ProximateDocItem`] and [`Iterator::collect`] to map the iterator to the
/// `ProximateDocItem`, then collect it in a [`BTreeSet`](std::collections::BTreeSet), then [`IntoIterator::into_iter`] and map
/// again.
pub fn proximate_word_docs<'a, P: Provider<'a>>(
    provider: &'a P,
    words: &'a ProximateList,
) -> ProximateDocIter<'a, P> {
    // Lazy: no documents are queried until the iterator is advanced.
    ProximateDocIter {
        word_iter: words.words.iter(),
        current: None,
        provider,
    }
}
319
320/// String similarity algorithms.
321#[allow(clippy::missing_panics_doc)] // They don't happen.
322pub mod algo {
323    struct IntoIterClone<I: Iterator<Item = char> + Clone>(I);
324    impl<'a, I: Iterator<Item = char> + Clone> IntoIterator for &'a IntoIterClone<I> {
325        type IntoIter = I;
326        type Item = char;
327        fn into_iter(self) -> Self::IntoIter {
328            self.0.clone()
329        }
330    }
331
332    /// Helper trait for [`char`] [`Iterator`]s which are also [`Clone`].
333    pub trait IClo: Iterator<Item = char> + Clone {}
334    impl<T: Iterator<Item = char> + Clone> IClo for T {}
335
336    /// The currently used default.
337    pub fn jaro(a: impl IClo, b: impl IClo) -> f64 {
338        strsim::generic_jaro(&IntoIterClone(a), &IntoIterClone(b))
339    }
340    /// 5-10x faster than [`jaro`], but is quality same?
341    pub fn hamming(a: impl IClo, b: impl IClo) -> f64 {
342        let a = IntoIterClone(a);
343        let b = IntoIterClone(b);
344
345        let a_count = a.into_iter().count();
346        let b_count = b.into_iter().count();
347
348        let min = std::cmp::min(a_count, b_count);
349        let max = std::cmp::max(a_count, b_count);
350        let len_diff = max - min;
351
352        let clamped_a = a.into_iter().take(min);
353        let clamped_b = b.into_iter().take(min);
354
355        // UNWRAP: They have the same length - ↑ .take
356        let mut differences =
357            strsim::generic_hamming(&IntoIterClone(clamped_a), &IntoIterClone(clamped_b)).unwrap();
358        differences += len_diff;
359
360        // We don't care about precision.
361        #[allow(clippy::cast_precision_loss)]
362        {
363            1.0 / (1.0 * (differences as f64 / min as f64) + 1.0)
364        }
365    }
366}