1use std::collections::{btree_map, BTreeMap};
4use std::fmt::Debug;
5use std::sync::Arc;
6
7use crate::index::{AlphanumRef, Alphanumeral, Id, Provider};
8
9#[derive(Debug)]
10enum PIter<
11 'a,
12 WI: Iterator<Item = &'a Arc<AlphanumRef>>,
13 WFI: Iterator<Item = &'a Arc<AlphanumRef>>,
14> {
15 WordIter(WI),
16 WordFilteredIter(WFI),
17}
18impl<'a, WI: Iterator<Item = &'a Arc<AlphanumRef>>, WFI: Iterator<Item = &'a Arc<AlphanumRef>>>
19 Iterator for PIter<'a, WI, WFI>
20{
21 type Item = &'a Arc<AlphanumRef>;
22 fn next(&mut self) -> Option<Self::Item> {
23 match self {
24 Self::WordIter(i) => i.next(),
25 Self::WordFilteredIter(i) => i.next(),
26 }
27 }
28}
29
30#[derive(Debug, Clone, Copy, PartialEq, Eq)]
35pub enum Algorithm {
36 Hamming,
39 Jaro,
43 Exact,
45}
46impl Algorithm {
47 pub fn similarity(&self, a: impl algo::IClo, b: impl algo::IClo) -> f64 {
49 match self {
50 Self::Hamming => algo::hamming(a, b),
51 Self::Jaro => algo::jaro(a, b),
52 Self::Exact => {
53 if a.into_iter().eq(b.into_iter()) {
54 1.0
55 } else {
56 0.0
57 }
58 }
59 }
60 }
61}
62impl Default for Algorithm {
63 fn default() -> Self {
64 Self::Jaro
65 }
66}
67
68#[derive(Debug, Clone)]
70#[must_use]
71pub struct ProximateList {
72 pub(crate) words: BTreeMap<Arc<AlphanumRef>, f32>,
73}
74#[derive(Debug, Clone)]
76#[must_use]
77pub struct ProximateMap<'a> {
78 pub(crate) map: BTreeMap<&'a str, ProximateList>,
79}
80impl<'a> ProximateMap<'a> {
81 pub fn new() -> Self {
82 Self {
83 map: BTreeMap::new(),
84 }
85 }
86 pub(crate) fn get_or_panic(&'a self, word: &str) -> &'a ProximateList {
87 fn panic_missing_proximate_words(word: &str) -> ! {
88 panic!("Missing proximate words when iterating over occurrences of word {}. Check you are passing the correct `proximity::ProximateMap`.", word)
89 }
90
91 if let Some(s) = self.map.get(word) {
92 s
93 } else {
94 panic_missing_proximate_words(word)
95 }
96 }
97}
98impl<'a> Default for ProximateMap<'a> {
99 fn default() -> Self {
100 Self::new()
101 }
102}
103
104#[derive(Debug, PartialEq, PartialOrd)]
106#[must_use]
107pub struct ProximateWordItem<'a> {
108 pub word: &'a Arc<AlphanumRef>,
110 pub rating: f32,
112}
113impl<'a> ProximateWordItem<'a> {
114 pub fn new(item: (&'a Arc<AlphanumRef>, f32)) -> Self {
116 Self {
117 word: item.0,
118 rating: item.1,
119 }
120 }
121 #[must_use]
123 pub fn into_parts(self) -> (&'a Arc<AlphanumRef>, f32) {
124 (self.word, self.rating)
125 }
126}
127#[derive(Debug)]
130pub struct ProximateWordIter<'a, 'b, P: Provider<'a>> {
131 word: &'b str,
132 iter: PIter<'a, P::WordIter, P::WordFilteredIter>,
133 threshold: f32,
134 algo: Algorithm,
135}
136impl<'a, 'b, P: Provider<'a>> ProximateWordIter<'a, 'b, P> {
137 pub fn extend_proximates(self, proximates: &mut ProximateMap<'b>) {
138 proximates.map.insert(
139 self.word,
140 ProximateList {
141 words: self
142 .map(|item| (Arc::clone(item.word), item.rating))
143 .collect(),
144 },
145 );
146 }
147}
148impl<'a, 'b, P: Provider<'a>> Iterator for ProximateWordIter<'a, 'b, P> {
149 type Item = ProximateWordItem<'a>;
150 fn next(&mut self) -> Option<Self::Item> {
151 let iter = &mut self.iter;
152 if self.word.len() < 3 {
153 for other_word in iter {
154 #[allow(clippy::cast_possible_truncation)]
155 let similarity = self.algo.similarity(other_word.chars(), self.word.chars()) as f32;
156
157 if similarity > self.threshold {
158 #[allow(clippy::cast_possible_truncation)]
159 return Some(ProximateWordItem::new((other_word, similarity)));
160 }
161 }
162 } else {
163 for other_word in iter {
164 let other_len = other_word.chars().count();
166 let len_diff = other_len.checked_sub(self.word.len());
167 if let Some(len_diff) = len_diff {
168 if other_word
169 .chars()
170 .take(self.word.chars().count())
171 .eq(self.word.chars())
172 {
173 if len_diff == 0 {
174 return Some(ProximateWordItem::new((other_word, 1.0)));
175 }
176 #[allow(clippy::cast_precision_loss)]
177 return Some(ProximateWordItem::new((
178 other_word,
179 1.0 / ((0.05 * len_diff as f32) + 0.5) - 1.2,
180 )));
181 }
182 }
183
184 #[allow(clippy::cast_possible_truncation)]
186 let similarity = self.algo.similarity(other_word.chars(), self.word.chars()) as f32;
187
188 if similarity >= self.threshold {
189 return Some(ProximateWordItem::new((other_word, similarity)));
190 }
191 }
192 }
193 None
194 }
195}
196pub fn proximate_words<'a, 'b, P: Provider<'a>>(
200 word: &'b str,
201 provider: &'a P,
202) -> ProximateWordIter<'a, 'b, P> {
203 let threshold = provider.word_proximity_threshold();
204 if let Some(c) = Alphanumeral::new(word).chars().next() {
205 if provider.word_count_upper_limit() > provider.word_count_limit() {
206 let iter = PIter::WordFilteredIter(provider.words_starting_with(c));
207 return ProximateWordIter {
208 word,
209 iter,
210 threshold,
211 algo: provider.word_proximity_algorithm(),
212 };
213 }
214 }
215 ProximateWordIter {
216 word,
217 iter: PIter::WordIter(provider.words()),
218 threshold,
219 algo: provider.word_proximity_algorithm(),
220 }
221}
222
223#[derive(Debug)]
226#[must_use]
227pub struct ProximateDocItem<'a> {
228 pub id: Id,
230 pub word: &'a AlphanumRef,
232 pub rating: f32,
234}
235impl<'a> PartialEq for ProximateDocItem<'a> {
236 fn eq(&self, other: &Self) -> bool {
237 self.id == other.id && self.word == other.word
238 }
239}
240impl<'a> Eq for ProximateDocItem<'a> {}
241impl<'a> PartialOrd for ProximateDocItem<'a> {
242 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
243 Some(self.cmp(other))
244 }
245}
246impl<'a> Ord for ProximateDocItem<'a> {
247 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
248 let cmp = self.id.cmp(&other.id);
249 if cmp.is_eq() {
250 self.word.cmp(other.word)
251 } else {
252 cmp
253 }
254 }
255}
256impl<'a> ProximateDocItem<'a> {
257 pub fn new(item: (Id, &'a AlphanumRef, f32)) -> Self {
259 Self {
260 id: item.0,
261 word: item.1,
262 rating: item.2,
263 }
264 }
265 #[must_use]
267 pub fn into_parts(self) -> (Id, &'a AlphanumRef, f32) {
268 (self.id, self.word, self.rating)
269 }
270}
271
272#[derive(Debug)]
274pub struct ProximateDocIter<'a, P: Provider<'a>> {
275 word_iter: btree_map::Iter<'a, Arc<AlphanumRef>, f32>,
276 provider: &'a P,
277 current: Option<(&'a Arc<AlphanumRef>, P::DocumentIter, f32)>,
278}
279impl<'a, P: Provider<'a>> Iterator for ProximateDocIter<'a, P> {
280 type Item = ProximateDocItem<'a>;
281 fn next(&mut self) -> Option<Self::Item> {
282 loop {
283 if let Some((word, doc_iter, proximity)) = &mut self.current {
284 if let Some(doc) = doc_iter.next() {
285 return Some(ProximateDocItem::new((doc, word, *proximity)));
286 }
287 self.current = None;
288 continue;
289 }
290 if let Some((next_word, proximity)) = self.word_iter.next() {
291 self.current = self
292 .provider
293 .documents_with_word(&**next_word)
294 .map(|iter| (next_word, iter, *proximity));
295 continue;
296 }
297 return None;
298 }
299 }
300}
301
302pub fn proximate_word_docs<'a, P: Provider<'a>>(
310 provider: &'a P,
311 words: &'a ProximateList,
312) -> ProximateDocIter<'a, P> {
313 ProximateDocIter {
314 word_iter: words.words.iter(),
315 current: None,
316 provider,
317 }
318}
319
320#[allow(clippy::missing_panics_doc)] pub mod algo {
323 struct IntoIterClone<I: Iterator<Item = char> + Clone>(I);
324 impl<'a, I: Iterator<Item = char> + Clone> IntoIterator for &'a IntoIterClone<I> {
325 type IntoIter = I;
326 type Item = char;
327 fn into_iter(self) -> Self::IntoIter {
328 self.0.clone()
329 }
330 }
331
332 pub trait IClo: Iterator<Item = char> + Clone {}
334 impl<T: Iterator<Item = char> + Clone> IClo for T {}
335
336 pub fn jaro(a: impl IClo, b: impl IClo) -> f64 {
338 strsim::generic_jaro(&IntoIterClone(a), &IntoIterClone(b))
339 }
340 pub fn hamming(a: impl IClo, b: impl IClo) -> f64 {
342 let a = IntoIterClone(a);
343 let b = IntoIterClone(b);
344
345 let a_count = a.into_iter().count();
346 let b_count = b.into_iter().count();
347
348 let min = std::cmp::min(a_count, b_count);
349 let max = std::cmp::max(a_count, b_count);
350 let len_diff = max - min;
351
352 let clamped_a = a.into_iter().take(min);
353 let clamped_b = b.into_iter().take(min);
354
355 let mut differences =
357 strsim::generic_hamming(&IntoIterClone(clamped_a), &IntoIterClone(clamped_b)).unwrap();
358 differences += len_diff;
359
360 #[allow(clippy::cast_precision_loss)]
362 {
363 1.0 / (1.0 * (differences as f64 / min as f64) + 1.0)
364 }
365 }
366}