1use super::{
2 FstDictionary, WordId,
3 rune::{self, AttributeList, parse_word_list},
4 word_map::{WordMap, WordMapEntry},
5};
6use crate::edit_distance::edit_distance_min_alloc;
7use itertools::Itertools;
8use std::sync::Arc;
9use std::{borrow::Cow, sync::LazyLock};
10
11use crate::{CharString, CharStringExt, DictWordMetadata};
12
13use super::FuzzyMatchResult;
14use super::dictionary::Dictionary;
15
16#[derive(Debug, Clone, Eq, PartialEq)]
25pub struct MutableDictionary {
26 word_map: WordMap,
28}
29
30fn uncached_inner_new() -> Arc<MutableDictionary> {
33 MutableDictionary::from_rune_files(
34 include_str!("../../dictionary.dict"),
35 include_str!("../../annotations.json"),
36 )
37 .map(Arc::new)
38 .unwrap_or_else(|e| panic!("Failed to load curated dictionary: {}", e))
39}
40
41static DICT: LazyLock<Arc<MutableDictionary>> = LazyLock::new(uncached_inner_new);
42
43impl MutableDictionary {
44 pub fn new() -> Self {
45 Self {
46 word_map: WordMap::default(),
47 }
48 }
49
50 pub fn from_rune_files(word_list: &str, attr_list: &str) -> Result<Self, rune::Error> {
51 let word_list = parse_word_list(word_list)?;
52 let attr_list = AttributeList::parse(attr_list)?;
53
54 let mut word_map = WordMap::default();
56
57 attr_list.expand_annotated_words(word_list, &mut word_map);
58
59 Ok(Self { word_map })
60 }
61
62 pub fn curated() -> Arc<Self> {
66 (*DICT).clone()
67 }
68
69 pub fn extend_words(
73 &mut self,
74 words: impl IntoIterator<Item = (impl AsRef<[char]>, DictWordMetadata)>,
75 ) {
76 for (chars, metadata) in words.into_iter() {
77 self.word_map.insert(WordMapEntry {
78 metadata,
79 canonical_spelling: chars.as_ref().into(),
80 })
81 }
82 }
83
84 pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: DictWordMetadata) {
89 self.extend_words(std::iter::once((word.as_ref(), metadata)))
90 }
91
92 pub fn append_word_str(&mut self, word: &str, metadata: DictWordMetadata) {
97 self.append_word(word.chars().collect::<Vec<_>>(), metadata)
98 }
99}
100
101impl Default for MutableDictionary {
102 fn default() -> Self {
103 Self::new()
104 }
105}
106
107impl Dictionary for MutableDictionary {
108 fn get_word_metadata(&self, word: &[char]) -> Option<Cow<'_, DictWordMetadata>> {
109 self.word_map
110 .get_with_chars(word)
111 .map(|v| Cow::Borrowed(&v.metadata))
112 }
113
114 fn contains_word(&self, word: &[char]) -> bool {
115 self.word_map.contains_chars(word)
116 }
117
118 fn contains_word_str(&self, word: &str) -> bool {
119 let chars: CharString = word.chars().collect();
120 self.contains_word(&chars)
121 }
122
123 fn get_word_metadata_str(&self, word: &str) -> Option<Cow<'_, DictWordMetadata>> {
124 let chars: CharString = word.chars().collect();
125 self.get_word_metadata(&chars)
126 }
127
128 fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
129 self.word_map
130 .get_with_chars(word)
131 .map(|v| v.canonical_spelling.as_slice())
132 }
133
134 fn fuzzy_match(
139 &'_ self,
140 word: &[char],
141 max_distance: u8,
142 max_results: usize,
143 ) -> Vec<FuzzyMatchResult<'_>> {
144 let misspelled_charslice = word.normalized();
145 let misspelled_charslice_lower = misspelled_charslice.to_lower();
146
147 let shortest_word_len = if misspelled_charslice.len() <= max_distance as usize {
148 1
149 } else {
150 misspelled_charslice.len() - max_distance as usize
151 };
152 let longest_word_len = misspelled_charslice.len() + max_distance as usize;
153
154 let words_to_search = self
156 .words_iter()
157 .filter(|word| (shortest_word_len..=longest_word_len).contains(&word.len()));
158
159 let mut buf_a = Vec::with_capacity(53);
162 let mut buf_b = Vec::with_capacity(53);
163
164 words_to_search
166 .filter_map(|word| {
167 let dist =
168 edit_distance_min_alloc(&misspelled_charslice, word, &mut buf_a, &mut buf_b);
169 let lowercase_dist = edit_distance_min_alloc(
170 &misspelled_charslice_lower,
171 word,
172 &mut buf_a,
173 &mut buf_b,
174 );
175
176 let smaller_dist = dist.min(lowercase_dist);
177 if smaller_dist <= max_distance {
178 Some((word, smaller_dist))
179 } else {
180 None
181 }
182 })
183 .sorted_unstable_by_key(|a| a.1)
184 .take(max_results)
185 .map(|(word, edit_distance)| FuzzyMatchResult {
186 word,
187 edit_distance,
188 metadata: self.get_word_metadata(word).unwrap(),
189 })
190 .collect()
191 }
192
193 fn fuzzy_match_str(
194 &'_ self,
195 word: &str,
196 max_distance: u8,
197 max_results: usize,
198 ) -> Vec<FuzzyMatchResult<'_>> {
199 let word: Vec<_> = word.chars().collect();
200 self.fuzzy_match(&word, max_distance, max_results)
201 }
202
203 fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
204 Box::new(
205 self.word_map
206 .iter()
207 .map(|v| v.canonical_spelling.as_slice()),
208 )
209 }
210
211 fn word_count(&self) -> usize {
212 self.word_map.len()
213 }
214
215 fn contains_exact_word(&self, word: &[char]) -> bool {
216 let normalized = word.normalized();
217
218 if let Some(found) = self.word_map.get_with_chars(normalized.as_ref())
219 && found.canonical_spelling.as_ref() == normalized.as_ref()
220 {
221 return true;
222 }
223
224 false
225 }
226
227 fn contains_exact_word_str(&self, word: &str) -> bool {
228 let word: CharString = word.chars().collect();
229 self.contains_exact_word(word.as_ref())
230 }
231
232 fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
233 self.word_map.get(id).map(|w| w.canonical_spelling.as_ref())
234 }
235
236 fn find_words_with_prefix(&self, prefix: &[char]) -> Vec<Cow<'_, [char]>> {
237 let mut found = Vec::new();
238
239 for word in self.words_iter() {
240 if let Some(item_prefix) = word.get(0..prefix.len())
241 && item_prefix == prefix
242 {
243 found.push(Cow::Borrowed(word));
244 }
245 }
246
247 found
248 }
249
250 fn find_words_with_common_prefix(&self, word: &[char]) -> Vec<Cow<'_, [char]>> {
251 let mut found = Vec::new();
252
253 for item in self.words_iter() {
254 if let Some(item_prefix) = word.get(0..item.len())
255 && item_prefix == item
256 {
257 found.push(Cow::Borrowed(item));
258 }
259 }
260
261 found
262 }
263}
264
265impl From<MutableDictionary> for FstDictionary {
266 fn from(dict: MutableDictionary) -> Self {
267 let words = dict
268 .word_map
269 .into_iter()
270 .map(|entry| (entry.canonical_spelling, entry.metadata))
271 .collect();
272
273 FstDictionary::new(words)
274 }
275}
276
277#[cfg(test)]
278mod tests {
279 use std::borrow::Cow;
280
281 use hashbrown::HashSet;
282 use itertools::Itertools;
283
284 use crate::spell::{Dictionary, MutableDictionary};
285 use crate::{DictWordMetadata, char_string::char_string};
286
287 #[test]
288 fn curated_contains_no_duplicates() {
289 let dict = MutableDictionary::curated();
290 assert!(dict.words_iter().all_unique());
291 }
292
293 #[test]
294 fn curated_matches_capitalized() {
295 let dict = MutableDictionary::curated();
296 assert!(dict.contains_word_str("this"));
297 assert!(dict.contains_word_str("This"));
298 }
299
300 #[test]
304 fn this_is_determiner() {
305 let dict = MutableDictionary::curated();
306 assert!(dict.get_word_metadata_str("this").unwrap().is_determiner());
307 assert!(dict.get_word_metadata_str("This").unwrap().is_determiner());
308 }
309
310 #[test]
311 fn several_is_quantifier() {
312 let dict = MutableDictionary::curated();
313 assert!(
314 dict.get_word_metadata_str("several")
315 .unwrap()
316 .is_quantifier()
317 );
318 }
319
320 #[test]
321 fn few_is_quantifier() {
322 let dict = MutableDictionary::curated();
323 assert!(dict.get_word_metadata_str("few").unwrap().is_quantifier());
324 }
325
326 #[test]
327 fn fewer_is_quantifier() {
328 let dict = MutableDictionary::curated();
329 assert!(dict.get_word_metadata_str("fewer").unwrap().is_quantifier());
330 }
331
332 #[test]
333 fn than_is_conjunction() {
334 let dict = MutableDictionary::curated();
335 assert!(dict.get_word_metadata_str("than").unwrap().is_conjunction());
336 assert!(dict.get_word_metadata_str("Than").unwrap().is_conjunction());
337 }
338
339 #[test]
340 fn herself_is_pronoun() {
341 let dict = MutableDictionary::curated();
342 assert!(dict.get_word_metadata_str("herself").unwrap().is_pronoun());
343 assert!(dict.get_word_metadata_str("Herself").unwrap().is_pronoun());
344 }
345
346 #[test]
347 fn discussion_171() {
348 let dict = MutableDictionary::curated();
349 assert!(dict.contains_word_str("natively"));
350 }
351
352 #[test]
353 fn im_is_common() {
354 let dict = MutableDictionary::curated();
355 assert!(dict.get_word_metadata_str("I'm").unwrap().common);
356 }
357
358 #[test]
359 fn fuzzy_result_sorted_by_edit_distance() {
360 let dict = MutableDictionary::curated();
361
362 let results = dict.fuzzy_match_str("hello", 3, 100);
363 let is_sorted_by_dist = results
364 .iter()
365 .map(|fm| fm.edit_distance)
366 .tuple_windows()
367 .all(|(a, b)| a <= b);
368
369 assert!(is_sorted_by_dist)
370 }
371
372 #[test]
373 fn there_is_not_a_pronoun() {
374 let dict = MutableDictionary::curated();
375
376 assert!(!dict.get_word_metadata_str("there").unwrap().is_nominal());
377 assert!(!dict.get_word_metadata_str("there").unwrap().is_pronoun());
378 }
379
380 #[test]
381 fn expanded_contains_giants() {
382 assert!(MutableDictionary::curated().contains_word_str("giants"));
383 }
384
385 #[test]
386 fn expanded_contains_deallocate() {
387 assert!(MutableDictionary::curated().contains_word_str("deallocate"));
388 }
389
390 #[test]
391 fn curated_contains_repo() {
392 let dict = MutableDictionary::curated();
393
394 assert!(dict.contains_word_str("repo"));
395 assert!(dict.contains_word_str("repos"));
396 assert!(dict.contains_word_str("repo's"));
397 }
398
399 #[test]
400 fn curated_contains_possessive_abandonment() {
401 assert!(
402 MutableDictionary::curated()
403 .get_word_metadata_str("abandonment's")
404 .unwrap()
405 .is_possessive_noun()
406 )
407 }
408
409 #[test]
410 fn has_is_not_a_nominal() {
411 let dict = MutableDictionary::curated();
412
413 let has = dict.get_word_metadata_str("has");
414 assert!(has.is_some());
415
416 assert!(!has.unwrap().is_nominal())
417 }
418
419 #[test]
420 fn is_is_linking_verb() {
421 let dict = MutableDictionary::curated();
422
423 let is = dict.get_word_metadata_str("is");
424
425 assert!(is.is_some());
426 assert!(is.unwrap().is_linking_verb());
427 }
428
429 #[test]
430 fn are_merged_attrs_same_as_spread_attrs() {
431 let curated_attr_list = include_str!("../../annotations.json");
432
433 let merged = MutableDictionary::from_rune_files("1\nblork/DGS", curated_attr_list).unwrap();
434 let spread =
435 MutableDictionary::from_rune_files("2\nblork/DG\nblork/S", curated_attr_list).unwrap();
436
437 assert_eq!(
438 merged.word_map.into_iter().collect::<HashSet<_>>(),
439 spread.word_map.into_iter().collect::<HashSet<_>>()
440 );
441 }
442
443 #[test]
444 fn apart_is_not_noun() {
445 let dict = MutableDictionary::curated();
446
447 assert!(!dict.get_word_metadata_str("apart").unwrap().is_noun());
448 }
449
450 #[test]
451 fn be_is_verb_lemma() {
452 let dict = MutableDictionary::curated();
453
454 let is = dict.get_word_metadata_str("be");
455
456 assert!(is.is_some());
457 assert!(is.unwrap().is_verb_lemma());
458 }
459
460 #[test]
461 fn gets_prefixes_as_expected() {
462 let mut dict = MutableDictionary::new();
463 dict.append_word_str("predict", DictWordMetadata::default());
464 dict.append_word_str("prelude", DictWordMetadata::default());
465 dict.append_word_str("preview", DictWordMetadata::default());
466 dict.append_word_str("dwight", DictWordMetadata::default());
467
468 let with_prefix = dict.find_words_with_prefix(char_string!("pre").as_slice());
469
470 assert_eq!(with_prefix.len(), 3);
471 assert!(with_prefix.contains(&Cow::Owned(char_string!("predict").into_vec())));
472 assert!(with_prefix.contains(&Cow::Owned(char_string!("prelude").into_vec())));
473 assert!(with_prefix.contains(&Cow::Owned(char_string!("preview").into_vec())));
474 }
475
476 #[test]
477 fn gets_common_prefixes_as_expected() {
478 let mut dict = MutableDictionary::new();
479 dict.append_word_str("pre", DictWordMetadata::default());
480 dict.append_word_str("prep", DictWordMetadata::default());
481 dict.append_word_str("dwight", DictWordMetadata::default());
482
483 let with_prefix =
484 dict.find_words_with_common_prefix(char_string!("preposition").as_slice());
485
486 assert_eq!(with_prefix.len(), 2);
487 assert!(with_prefix.contains(&Cow::Owned(char_string!("pre").into_vec())));
488 assert!(with_prefix.contains(&Cow::Owned(char_string!("prep").into_vec())));
489 }
490}