harper_core/spell/
mutable_dictionary.rs1use super::{
2 FstDictionary, WordId,
3 rune::{self, AttributeList, parse_word_list},
4 word_map::{WordMap, WordMapEntry},
5};
6use crate::edit_distance::edit_distance_min_alloc;
7use itertools::Itertools;
8use lazy_static::lazy_static;
9use std::sync::Arc;
10
11use crate::{CharString, CharStringExt, WordMetadata};
12
13use super::FuzzyMatchResult;
14use super::dictionary::Dictionary;
15
16#[derive(Debug, Clone, Eq, PartialEq)]
25pub struct MutableDictionary {
26 word_map: WordMap,
28}
29
30fn uncached_inner_new() -> Arc<MutableDictionary> {
33 Arc::new(
34 MutableDictionary::from_rune_files(
35 include_str!("../../dictionary.dict"),
36 include_str!("../../affixes.json"),
37 )
38 .expect("Curated dictionary should be valid."),
39 )
40}
41
42lazy_static! {
43 static ref DICT: Arc<MutableDictionary> = uncached_inner_new();
44}
45
46impl MutableDictionary {
47 pub fn new() -> Self {
48 Self {
49 word_map: WordMap::default(),
50 }
51 }
52
53 pub fn from_rune_files(word_list: &str, attr_list: &str) -> Result<Self, rune::Error> {
54 let word_list = parse_word_list(word_list)?;
55 let attr_list = AttributeList::parse(attr_list)?;
56
57 let mut word_map = WordMap::default();
59
60 attr_list.expand_marked_words(word_list, &mut word_map);
61
62 Ok(Self { word_map })
63 }
64
65 pub fn curated() -> Arc<Self> {
69 (*DICT).clone()
70 }
71
72 pub fn extend_words(
76 &mut self,
77 words: impl IntoIterator<Item = (impl AsRef<[char]>, WordMetadata)>,
78 ) {
79 for (chars, metadata) in words.into_iter() {
80 self.word_map.insert(WordMapEntry {
81 metadata,
82 canonical_spelling: chars.as_ref().into(),
83 })
84 }
85 }
86
87 pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: WordMetadata) {
92 self.extend_words(std::iter::once((word.as_ref(), metadata)))
93 }
94
95 pub fn append_word_str(&mut self, word: &str, metadata: WordMetadata) {
100 self.append_word(word.chars().collect::<Vec<_>>(), metadata)
101 }
102}
103
104impl Default for MutableDictionary {
105 fn default() -> Self {
106 Self::new()
107 }
108}
109
110impl Dictionary for MutableDictionary {
111 fn get_word_metadata(&self, word: &[char]) -> Option<&WordMetadata> {
112 self.word_map.get_with_chars(word).map(|v| &v.metadata)
113 }
114
115 fn contains_word(&self, word: &[char]) -> bool {
116 self.word_map.contains_chars(word)
117 }
118
119 fn contains_word_str(&self, word: &str) -> bool {
120 let chars: CharString = word.chars().collect();
121 self.contains_word(&chars)
122 }
123
124 fn get_word_metadata_str(&self, word: &str) -> Option<&WordMetadata> {
125 let chars: CharString = word.chars().collect();
126 self.get_word_metadata(&chars)
127 }
128
129 fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
130 self.word_map
131 .get_with_chars(word)
132 .map(|v| v.canonical_spelling.as_slice())
133 }
134
135 fn fuzzy_match(
140 &self,
141 word: &[char],
142 max_distance: u8,
143 max_results: usize,
144 ) -> Vec<FuzzyMatchResult> {
145 let misspelled_charslice = word.normalized();
146 let misspelled_charslice_lower = misspelled_charslice.to_lower();
147
148 let shortest_word_len = if misspelled_charslice.len() <= max_distance as usize {
149 1
150 } else {
151 misspelled_charslice.len() - max_distance as usize
152 };
153 let longest_word_len = misspelled_charslice.len() + max_distance as usize;
154
155 let words_to_search = self
157 .words_iter()
158 .filter(|word| (shortest_word_len..=longest_word_len).contains(&word.len()));
159
160 let mut buf_a = Vec::with_capacity(53);
163 let mut buf_b = Vec::with_capacity(53);
164
165 words_to_search
167 .filter_map(|word| {
168 let dist =
169 edit_distance_min_alloc(&misspelled_charslice, word, &mut buf_a, &mut buf_b);
170 let lowercase_dist = edit_distance_min_alloc(
171 &misspelled_charslice_lower,
172 word,
173 &mut buf_a,
174 &mut buf_b,
175 );
176
177 let smaller_dist = dist.min(lowercase_dist);
178 if smaller_dist <= max_distance {
179 Some((word, smaller_dist))
180 } else {
181 None
182 }
183 })
184 .sorted_unstable_by_key(|a| a.1)
185 .take(max_results)
186 .map(|(word, edit_distance)| FuzzyMatchResult {
187 word,
188 edit_distance,
189 metadata: self.get_word_metadata(word).unwrap(),
190 })
191 .collect()
192 }
193
194 fn fuzzy_match_str(
195 &self,
196 word: &str,
197 max_distance: u8,
198 max_results: usize,
199 ) -> Vec<FuzzyMatchResult> {
200 let word: Vec<_> = word.chars().collect();
201 self.fuzzy_match(&word, max_distance, max_results)
202 }
203
204 fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
205 Box::new(
206 self.word_map
207 .iter()
208 .map(|v| v.canonical_spelling.as_slice()),
209 )
210 }
211
212 fn word_count(&self) -> usize {
213 self.word_map.len()
214 }
215
216 fn contains_exact_word(&self, word: &[char]) -> bool {
217 let normalized = word.normalized();
218
219 if let Some(found) = self.word_map.get_with_chars(normalized.as_ref()) {
220 if found.canonical_spelling.as_ref() == normalized.as_ref() {
221 return true;
222 }
223 }
224
225 false
226 }
227
228 fn contains_exact_word_str(&self, word: &str) -> bool {
229 let word: CharString = word.chars().collect();
230 self.contains_exact_word(word.as_ref())
231 }
232
233 fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
234 self.word_map.get(id).map(|w| w.canonical_spelling.as_ref())
235 }
236}
237
238impl From<MutableDictionary> for FstDictionary {
239 fn from(dict: MutableDictionary) -> Self {
240 let words = dict
241 .word_map
242 .into_iter()
243 .map(|entry| (entry.canonical_spelling, entry.metadata))
244 .collect();
245
246 FstDictionary::new(words)
247 }
248}
249
#[cfg(test)]
mod tests {
    use hashbrown::HashSet;
    use itertools::Itertools;

    use crate::{Dictionary, MutableDictionary};

    /// No spelling may appear twice in the curated dictionary.
    #[test]
    fn curated_contains_no_duplicates() {
        let curated = MutableDictionary::curated();
        let every_word_unique = curated.words_iter().all_unique();
        assert!(every_word_unique);
    }

    /// Lookups succeed for both the lowercase and the capitalized form.
    #[test]
    fn curated_matches_capitalized() {
        let curated = MutableDictionary::curated();
        for word in ["this", "This"] {
            assert!(curated.contains_word_str(word));
        }
    }

    #[test]
    fn than_is_conjunction() {
        let curated = MutableDictionary::curated();
        for word in ["than", "Than"] {
            assert!(curated.get_word_metadata_str(word).unwrap().is_conjunction());
        }
    }

    #[test]
    fn herself_is_pronoun() {
        let curated = MutableDictionary::curated();
        for word in ["herself", "Herself"] {
            assert!(curated.get_word_metadata_str(word).unwrap().is_pronoun());
        }
    }

    #[test]
    fn discussion_171() {
        assert!(MutableDictionary::curated().contains_word_str("natively"));
    }

    #[test]
    fn im_is_common() {
        let curated = MutableDictionary::curated();
        let metadata = curated.get_word_metadata_str("I'm").unwrap();
        assert!(metadata.common);
    }

    /// Fuzzy matches must come back in non-decreasing edit-distance order.
    #[test]
    fn fuzzy_result_sorted_by_edit_distance() {
        let curated = MutableDictionary::curated();

        let matches = curated.fuzzy_match_str("hello", 3, 100);
        let is_sorted_by_dist = matches
            .windows(2)
            .all(|pair| pair[0].edit_distance <= pair[1].edit_distance);

        assert!(is_sorted_by_dist)
    }

    #[test]
    fn there_is_not_a_pronoun() {
        let curated = MutableDictionary::curated();
        let there = curated.get_word_metadata_str("there").unwrap();

        assert!(!there.is_nominal());
        assert!(!there.is_pronoun());
    }

    #[test]
    fn expanded_contains_giants() {
        let curated = MutableDictionary::curated();
        assert!(curated.contains_word_str("giants"));
    }

    #[test]
    fn expanded_contains_deallocate() {
        let curated = MutableDictionary::curated();
        assert!(curated.contains_word_str("deallocate"));
    }

    #[test]
    fn curated_contains_repo() {
        let curated = MutableDictionary::curated();

        for word in ["repo", "repos", "repo's"] {
            assert!(curated.contains_word_str(word));
        }
    }

    #[test]
    fn curated_contains_possessive_abandonment() {
        let curated = MutableDictionary::curated();
        let metadata = curated.get_word_metadata_str("abandonment's").unwrap();

        assert!(metadata.is_possessive_noun())
    }

    /// "has" must be present and must not be classified as a nominal.
    #[test]
    fn has_is_not_a_nominal() {
        let curated = MutableDictionary::curated();

        assert!(
            curated
                .get_word_metadata_str("has")
                .is_some_and(|has| !has.is_nominal())
        )
    }

    /// "is" must be present and must be classified as a linking verb.
    #[test]
    fn is_is_linking_verb() {
        let curated = MutableDictionary::curated();

        assert!(
            curated
                .get_word_metadata_str("is")
                .is_some_and(|is| is.is_linking_verb())
        );
    }

    /// Attributes given on one line must expand to the same entries as the same
    /// attributes spread over several lines for the same word.
    #[test]
    fn are_merged_attrs_same_as_spread_attrs() {
        let curated_attr_list = include_str!("../../affixes.json");

        let merged = MutableDictionary::from_rune_files("1\nblork/DGS", curated_attr_list).unwrap();
        let spread =
            MutableDictionary::from_rune_files("2\nblork/DG\nblork/S", curated_attr_list).unwrap();

        let merged_entries: HashSet<_> = merged.word_map.into_iter().collect();
        let spread_entries: HashSet<_> = spread.word_map.into_iter().collect();

        assert_eq!(merged_entries, spread_entries);
    }
}