1use super::{
2 FstDictionary, WordId,
3 rune::{self, AttributeList, parse_word_list},
4 word_map::{WordMap, WordMapEntry},
5};
6use crate::edit_distance::edit_distance_min_alloc;
7use itertools::Itertools;
8use lazy_static::lazy_static;
9use std::borrow::Cow;
10use std::sync::Arc;
11
12use crate::{CharString, CharStringExt, DictWordMetadata};
13
14use super::FuzzyMatchResult;
15use super::dictionary::Dictionary;
16
17#[derive(Debug, Clone, Eq, PartialEq)]
26pub struct MutableDictionary {
27 word_map: WordMap,
29}
30
31fn uncached_inner_new() -> Arc<MutableDictionary> {
34 MutableDictionary::from_rune_files(
35 include_str!("../../dictionary.dict"),
36 include_str!("../../annotations.json"),
37 )
38 .map(Arc::new)
39 .unwrap_or_else(|e| panic!("Failed to load curated dictionary: {}", e))
40}
41
42lazy_static! {
43 static ref DICT: Arc<MutableDictionary> = uncached_inner_new();
44}
45
46impl MutableDictionary {
47 pub fn new() -> Self {
48 Self {
49 word_map: WordMap::default(),
50 }
51 }
52
53 pub fn from_rune_files(word_list: &str, attr_list: &str) -> Result<Self, rune::Error> {
54 let word_list = parse_word_list(word_list)?;
55 let attr_list = AttributeList::parse(attr_list)?;
56
57 let mut word_map = WordMap::default();
59
60 attr_list.expand_annotated_words(word_list, &mut word_map);
61
62 Ok(Self { word_map })
63 }
64
65 pub fn curated() -> Arc<Self> {
69 (*DICT).clone()
70 }
71
72 pub fn extend_words(
76 &mut self,
77 words: impl IntoIterator<Item = (impl AsRef<[char]>, DictWordMetadata)>,
78 ) {
79 for (chars, metadata) in words.into_iter() {
80 self.word_map.insert(WordMapEntry {
81 metadata,
82 canonical_spelling: chars.as_ref().into(),
83 })
84 }
85 }
86
87 pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: DictWordMetadata) {
92 self.extend_words(std::iter::once((word.as_ref(), metadata)))
93 }
94
95 pub fn append_word_str(&mut self, word: &str, metadata: DictWordMetadata) {
100 self.append_word(word.chars().collect::<Vec<_>>(), metadata)
101 }
102}
103
104impl Default for MutableDictionary {
105 fn default() -> Self {
106 Self::new()
107 }
108}
109
110impl Dictionary for MutableDictionary {
111 fn get_word_metadata(&self, word: &[char]) -> Option<Cow<'_, DictWordMetadata>> {
112 self.word_map
113 .get_with_chars(word)
114 .map(|v| Cow::Borrowed(&v.metadata))
115 }
116
117 fn contains_word(&self, word: &[char]) -> bool {
118 self.word_map.contains_chars(word)
119 }
120
121 fn contains_word_str(&self, word: &str) -> bool {
122 let chars: CharString = word.chars().collect();
123 self.contains_word(&chars)
124 }
125
126 fn get_word_metadata_str(&self, word: &str) -> Option<Cow<'_, DictWordMetadata>> {
127 let chars: CharString = word.chars().collect();
128 self.get_word_metadata(&chars)
129 }
130
131 fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
132 self.word_map
133 .get_with_chars(word)
134 .map(|v| v.canonical_spelling.as_slice())
135 }
136
137 fn fuzzy_match(
142 &'_ self,
143 word: &[char],
144 max_distance: u8,
145 max_results: usize,
146 ) -> Vec<FuzzyMatchResult<'_>> {
147 let misspelled_charslice = word.normalized();
148 let misspelled_charslice_lower = misspelled_charslice.to_lower();
149
150 let shortest_word_len = if misspelled_charslice.len() <= max_distance as usize {
151 1
152 } else {
153 misspelled_charslice.len() - max_distance as usize
154 };
155 let longest_word_len = misspelled_charslice.len() + max_distance as usize;
156
157 let words_to_search = self
159 .words_iter()
160 .filter(|word| (shortest_word_len..=longest_word_len).contains(&word.len()));
161
162 let mut buf_a = Vec::with_capacity(53);
165 let mut buf_b = Vec::with_capacity(53);
166
167 words_to_search
169 .filter_map(|word| {
170 let dist =
171 edit_distance_min_alloc(&misspelled_charslice, word, &mut buf_a, &mut buf_b);
172 let lowercase_dist = edit_distance_min_alloc(
173 &misspelled_charslice_lower,
174 word,
175 &mut buf_a,
176 &mut buf_b,
177 );
178
179 let smaller_dist = dist.min(lowercase_dist);
180 if smaller_dist <= max_distance {
181 Some((word, smaller_dist))
182 } else {
183 None
184 }
185 })
186 .sorted_unstable_by_key(|a| a.1)
187 .take(max_results)
188 .map(|(word, edit_distance)| FuzzyMatchResult {
189 word,
190 edit_distance,
191 metadata: self.get_word_metadata(word).unwrap(),
192 })
193 .collect()
194 }
195
196 fn fuzzy_match_str(
197 &'_ self,
198 word: &str,
199 max_distance: u8,
200 max_results: usize,
201 ) -> Vec<FuzzyMatchResult<'_>> {
202 let word: Vec<_> = word.chars().collect();
203 self.fuzzy_match(&word, max_distance, max_results)
204 }
205
206 fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
207 Box::new(
208 self.word_map
209 .iter()
210 .map(|v| v.canonical_spelling.as_slice()),
211 )
212 }
213
214 fn word_count(&self) -> usize {
215 self.word_map.len()
216 }
217
218 fn contains_exact_word(&self, word: &[char]) -> bool {
219 let normalized = word.normalized();
220
221 if let Some(found) = self.word_map.get_with_chars(normalized.as_ref())
222 && found.canonical_spelling.as_ref() == normalized.as_ref()
223 {
224 return true;
225 }
226
227 false
228 }
229
230 fn contains_exact_word_str(&self, word: &str) -> bool {
231 let word: CharString = word.chars().collect();
232 self.contains_exact_word(word.as_ref())
233 }
234
235 fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
236 self.word_map.get(id).map(|w| w.canonical_spelling.as_ref())
237 }
238
239 fn find_words_with_prefix(&self, prefix: &[char]) -> Vec<Cow<'_, [char]>> {
240 let mut found = Vec::new();
241
242 for word in self.words_iter() {
243 if let Some(item_prefix) = word.get(0..prefix.len())
244 && item_prefix == prefix
245 {
246 found.push(Cow::Borrowed(word));
247 }
248 }
249
250 found
251 }
252
253 fn find_words_with_common_prefix(&self, word: &[char]) -> Vec<Cow<'_, [char]>> {
254 let mut found = Vec::new();
255
256 for item in self.words_iter() {
257 if let Some(item_prefix) = word.get(0..item.len())
258 && item_prefix == item
259 {
260 found.push(Cow::Borrowed(item));
261 }
262 }
263
264 found
265 }
266}
267
268impl From<MutableDictionary> for FstDictionary {
269 fn from(dict: MutableDictionary) -> Self {
270 let words = dict
271 .word_map
272 .into_iter()
273 .map(|entry| (entry.canonical_spelling, entry.metadata))
274 .collect();
275
276 FstDictionary::new(words)
277 }
278}
279
280#[cfg(test)]
281mod tests {
282 use std::borrow::Cow;
283
284 use hashbrown::HashSet;
285 use itertools::Itertools;
286
287 use crate::spell::{Dictionary, MutableDictionary};
288 use crate::{DictWordMetadata, char_string::char_string};
289
290 #[test]
291 fn curated_contains_no_duplicates() {
292 let dict = MutableDictionary::curated();
293 assert!(dict.words_iter().all_unique());
294 }
295
296 #[test]
297 fn curated_matches_capitalized() {
298 let dict = MutableDictionary::curated();
299 assert!(dict.contains_word_str("this"));
300 assert!(dict.contains_word_str("This"));
301 }
302
303 #[test]
307 fn this_is_determiner() {
308 let dict = MutableDictionary::curated();
309 assert!(dict.get_word_metadata_str("this").unwrap().is_determiner());
310 assert!(dict.get_word_metadata_str("This").unwrap().is_determiner());
311 }
312
313 #[test]
314 fn several_is_quantifier() {
315 let dict = MutableDictionary::curated();
316 assert!(
317 dict.get_word_metadata_str("several")
318 .unwrap()
319 .is_quantifier()
320 );
321 }
322
323 #[test]
324 fn few_is_quantifier() {
325 let dict = MutableDictionary::curated();
326 assert!(dict.get_word_metadata_str("few").unwrap().is_quantifier());
327 }
328
329 #[test]
330 fn fewer_is_quantifier() {
331 let dict = MutableDictionary::curated();
332 assert!(dict.get_word_metadata_str("fewer").unwrap().is_quantifier());
333 }
334
335 #[test]
336 fn than_is_conjunction() {
337 let dict = MutableDictionary::curated();
338 assert!(dict.get_word_metadata_str("than").unwrap().is_conjunction());
339 assert!(dict.get_word_metadata_str("Than").unwrap().is_conjunction());
340 }
341
342 #[test]
343 fn herself_is_pronoun() {
344 let dict = MutableDictionary::curated();
345 assert!(dict.get_word_metadata_str("herself").unwrap().is_pronoun());
346 assert!(dict.get_word_metadata_str("Herself").unwrap().is_pronoun());
347 }
348
349 #[test]
350 fn discussion_171() {
351 let dict = MutableDictionary::curated();
352 assert!(dict.contains_word_str("natively"));
353 }
354
355 #[test]
356 fn im_is_common() {
357 let dict = MutableDictionary::curated();
358 assert!(dict.get_word_metadata_str("I'm").unwrap().common);
359 }
360
361 #[test]
362 fn fuzzy_result_sorted_by_edit_distance() {
363 let dict = MutableDictionary::curated();
364
365 let results = dict.fuzzy_match_str("hello", 3, 100);
366 let is_sorted_by_dist = results
367 .iter()
368 .map(|fm| fm.edit_distance)
369 .tuple_windows()
370 .all(|(a, b)| a <= b);
371
372 assert!(is_sorted_by_dist)
373 }
374
375 #[test]
376 fn there_is_not_a_pronoun() {
377 let dict = MutableDictionary::curated();
378
379 assert!(!dict.get_word_metadata_str("there").unwrap().is_nominal());
380 assert!(!dict.get_word_metadata_str("there").unwrap().is_pronoun());
381 }
382
383 #[test]
384 fn expanded_contains_giants() {
385 assert!(MutableDictionary::curated().contains_word_str("giants"));
386 }
387
388 #[test]
389 fn expanded_contains_deallocate() {
390 assert!(MutableDictionary::curated().contains_word_str("deallocate"));
391 }
392
393 #[test]
394 fn curated_contains_repo() {
395 let dict = MutableDictionary::curated();
396
397 assert!(dict.contains_word_str("repo"));
398 assert!(dict.contains_word_str("repos"));
399 assert!(dict.contains_word_str("repo's"));
400 }
401
402 #[test]
403 fn curated_contains_possessive_abandonment() {
404 assert!(
405 MutableDictionary::curated()
406 .get_word_metadata_str("abandonment's")
407 .unwrap()
408 .is_possessive_noun()
409 )
410 }
411
412 #[test]
413 fn has_is_not_a_nominal() {
414 let dict = MutableDictionary::curated();
415
416 let has = dict.get_word_metadata_str("has");
417 assert!(has.is_some());
418
419 assert!(!has.unwrap().is_nominal())
420 }
421
422 #[test]
423 fn is_is_linking_verb() {
424 let dict = MutableDictionary::curated();
425
426 let is = dict.get_word_metadata_str("is");
427
428 assert!(is.is_some());
429 assert!(is.unwrap().is_linking_verb());
430 }
431
432 #[test]
433 fn are_merged_attrs_same_as_spread_attrs() {
434 let curated_attr_list = include_str!("../../annotations.json");
435
436 let merged = MutableDictionary::from_rune_files("1\nblork/DGS", curated_attr_list).unwrap();
437 let spread =
438 MutableDictionary::from_rune_files("2\nblork/DG\nblork/S", curated_attr_list).unwrap();
439
440 assert_eq!(
441 merged.word_map.into_iter().collect::<HashSet<_>>(),
442 spread.word_map.into_iter().collect::<HashSet<_>>()
443 );
444 }
445
446 #[test]
447 fn apart_is_not_noun() {
448 let dict = MutableDictionary::curated();
449
450 assert!(!dict.get_word_metadata_str("apart").unwrap().is_noun());
451 }
452
453 #[test]
454 fn be_is_verb_lemma() {
455 let dict = MutableDictionary::curated();
456
457 let is = dict.get_word_metadata_str("be");
458
459 assert!(is.is_some());
460 assert!(is.unwrap().is_verb_lemma());
461 }
462
463 #[test]
464 fn gets_prefixes_as_expected() {
465 let mut dict = MutableDictionary::new();
466 dict.append_word_str("predict", DictWordMetadata::default());
467 dict.append_word_str("prelude", DictWordMetadata::default());
468 dict.append_word_str("preview", DictWordMetadata::default());
469 dict.append_word_str("dwight", DictWordMetadata::default());
470
471 let with_prefix = dict.find_words_with_prefix(char_string!("pre").as_slice());
472
473 assert_eq!(with_prefix.len(), 3);
474 assert!(with_prefix.contains(&Cow::Owned(char_string!("predict").into_vec())));
475 assert!(with_prefix.contains(&Cow::Owned(char_string!("prelude").into_vec())));
476 assert!(with_prefix.contains(&Cow::Owned(char_string!("preview").into_vec())));
477 }
478
479 #[test]
480 fn gets_common_prefixes_as_expected() {
481 let mut dict = MutableDictionary::new();
482 dict.append_word_str("pre", DictWordMetadata::default());
483 dict.append_word_str("prep", DictWordMetadata::default());
484 dict.append_word_str("dwight", DictWordMetadata::default());
485
486 let with_prefix =
487 dict.find_words_with_common_prefix(char_string!("preposition").as_slice());
488
489 assert_eq!(with_prefix.len(), 2);
490 assert!(with_prefix.contains(&Cow::Owned(char_string!("pre").into_vec())));
491 assert!(with_prefix.contains(&Cow::Owned(char_string!("prep").into_vec())));
492 }
493}