Skip to main content

harper_core/spell/
merged_dictionary.rs

1use std::borrow::Cow;
2use std::hash::{BuildHasher, Hasher};
3use std::sync::Arc;
4
5use foldhash::quality::FixedState;
6use itertools::Itertools;
7
8use super::{FstDictionary, FuzzyMatchResult, WordId, dictionary::Dictionary};
9use crate::{CharString, DictWordMetadata};
10
11/// A simple wrapper over [`Dictionary`] that allows
12/// one to merge multiple dictionaries without copying.
13///
14/// In cases where more than one dictionary contains a word, data in the first
15/// dictionary inserted will be returned.
16#[derive(Clone)]
17pub struct MergedDictionary {
18    children: Vec<Arc<dyn Dictionary>>,
19    hasher_builder: FixedState,
20    child_hashes: Vec<u64>,
21}
22
23impl MergedDictionary {
24    pub fn new() -> Self {
25        Self {
26            children: Vec::new(),
27            hasher_builder: FixedState::default(),
28            child_hashes: Vec::new(),
29        }
30    }
31
32    pub fn add_dictionary(&mut self, dictionary: Arc<dyn Dictionary>) {
33        self.child_hashes.push(self.hash_dictionary(&dictionary));
34        self.children.push(dictionary);
35    }
36
37    fn hash_dictionary(&self, dictionary: &Arc<dyn Dictionary>) -> u64 {
38        // Hashing the curated dictionary isn't super helpful and takes a long time.
39        if Arc::ptr_eq(
40            dictionary,
41            &(FstDictionary::curated() as Arc<dyn Dictionary>),
42        ) {
43            return 1;
44        }
45
46        let mut hasher = self.hasher_builder.build_hasher();
47
48        dictionary
49            .words_iter()
50            .for_each(|w| w.iter().for_each(|c| hasher.write_u32(*c as u32)));
51
52        hasher.finish()
53    }
54}
55
56impl PartialEq for MergedDictionary {
57    fn eq(&self, other: &Self) -> bool {
58        self.child_hashes == other.child_hashes
59    }
60}
61
62impl Default for MergedDictionary {
63    fn default() -> Self {
64        Self::new()
65    }
66}
67
68impl Dictionary for MergedDictionary {
69    fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
70        for child in &self.children {
71            if let Some(word) = child.get_correct_capitalization_of(word) {
72                return Some(word);
73            }
74        }
75        None
76    }
77
78    fn contains_word(&self, word: &[char]) -> bool {
79        for child in &self.children {
80            if child.contains_word(word) {
81                return true;
82            }
83        }
84        false
85    }
86
87    fn contains_exact_word(&self, word: &[char]) -> bool {
88        for child in &self.children {
89            if child.contains_exact_word(word) {
90                return true;
91            }
92        }
93        false
94    }
95
96    fn get_word_metadata(&self, word: &[char]) -> Option<Cow<'_, DictWordMetadata>> {
97        let mut meta_iter = self
98            .children
99            .iter()
100            .filter_map(|d| d.get_word_metadata(word));
101
102        let first = meta_iter.next()?;
103
104        // Check if multiple entries were found for the word.
105        if let Some(second) = meta_iter.next() {
106            // If so, merge them.
107            let mut first = first.into_owned();
108            first.merge(&second);
109            meta_iter.for_each(|additional_md| {
110                first.merge(&additional_md);
111            });
112
113            Some(Cow::Owned(first))
114        } else {
115            // If not, return the sole found entry.
116            Some(first)
117        }
118    }
119
120    fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
121        Box::new(self.children.iter().flat_map(|c| c.words_iter()))
122    }
123
124    fn contains_word_str(&self, word: &str) -> bool {
125        let chars: CharString = word.chars().collect();
126        self.contains_word(&chars)
127    }
128
129    fn contains_exact_word_str(&self, word: &str) -> bool {
130        let chars: CharString = word.chars().collect();
131        self.contains_exact_word(&chars)
132    }
133
134    fn get_word_metadata_str(&self, word: &str) -> Option<Cow<'_, DictWordMetadata>> {
135        let chars: CharString = word.chars().collect();
136        self.get_word_metadata(&chars)
137    }
138
139    fn fuzzy_match(
140        &'_ self,
141        word: &[char],
142        max_distance: u8,
143        max_results: usize,
144    ) -> Vec<FuzzyMatchResult<'_>> {
145        self.children
146            .iter()
147            .flat_map(|d| d.fuzzy_match(word, max_distance, max_results))
148            .sorted_by_key(|r| r.word)
149            .dedup_by(|a, b| a.word == b.word)
150            .sorted_by_key(|r| r.edit_distance)
151            .take(max_results)
152            .collect()
153    }
154
155    fn fuzzy_match_str(
156        &'_ self,
157        word: &str,
158        max_distance: u8,
159        max_results: usize,
160    ) -> Vec<FuzzyMatchResult<'_>> {
161        self.children
162            .iter()
163            .flat_map(|d| d.fuzzy_match_str(word, max_distance, max_results))
164            .sorted_by_key(|r| r.word)
165            .dedup_by(|a, b| a.word == b.word)
166            .sorted_by_key(|r| r.edit_distance)
167            .take(max_results)
168            .collect()
169    }
170
171    fn word_count(&self) -> usize {
172        self.children.iter().map(|d| d.word_count()).sum()
173    }
174
175    fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
176        self.children
177            .iter()
178            .find_map(|dict| dict.get_word_from_id(id))
179    }
180
181    fn find_words_with_prefix(&self, prefix: &[char]) -> Vec<Cow<'_, [char]>> {
182        self.children
183            .iter()
184            .flat_map(|dict| dict.find_words_with_prefix(prefix))
185            .sorted()
186            .dedup()
187            .collect()
188    }
189
190    fn find_words_with_common_prefix(&self, word: &[char]) -> Vec<Cow<'_, [char]>> {
191        self.children
192            .iter()
193            .flat_map(|dict| dict.find_words_with_common_prefix(word))
194            .sorted()
195            .dedup()
196            .collect()
197    }
198}
199
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use crate::DictWordMetadata;
    use crate::spell::{Dictionary, MergedDictionary, MutableDictionary};

    /// With "Foo" as the only known word, the plain `contains_word*` lookups
    /// accept either casing while the `contains_exact_word*` lookups accept
    /// only the stored casing.
    #[test]
    fn merged_contains_exact_word_str_is_case_sensitive() {
        let mut custom_words = MutableDictionary::new();
        custom_words.append_word_str("Foo", DictWordMetadata::default());

        let mut merged = MergedDictionary::new();
        merged.add_dictionary(Arc::new(custom_words));

        // Plain lookups succeed for both casings.
        assert!(merged.contains_word_str("Foo"));
        assert!(merged.contains_word_str("foo"));

        // Exact lookups on char slices only match the stored casing.
        let capitalized = ['F', 'o', 'o'];
        let lowercase = ['f', 'o', 'o'];
        assert!(merged.contains_exact_word(&capitalized));
        assert!(!merged.contains_exact_word(&lowercase));

        // The string variants behave the same way.
        assert!(merged.contains_exact_word_str("Foo"));
        assert!(!merged.contains_exact_word_str("foo"));
    }
}