basic_trie/trie_node/
data_node.rs

1use fxhash::FxHashMap;
2use std::cmp::Ordering;
3use std::ops;
4use thin_vec::ThinVec;
5
6#[cfg(feature = "serde")]
7use serde_crate::{Deserialize, Serialize};
8
9type WordEnd<D> = Option<ThinVec<D>>;
10
11/// Helper struct for returning multiple values for deleting data.
12/// It is needed because the 'must_keep' value will at some point change
13/// from false to true, but the data stays the same from the beginning of
14/// unwinding.
15pub(crate) struct RemoveData<D> {
16    must_keep: bool,
17    pub(crate) data: WordEnd<D>,
18}
19
20/// Singular trie node that represents its children and a marker for word ending.
21#[derive(Debug, Default, Clone)]
22#[cfg_attr(
23    feature = "serde",
24    derive(Serialize, Deserialize),
25    serde(crate = "serde_crate")
26)]
27pub struct TrieDataNode<D> {
28    #[cfg_attr(feature = "serde", serde(rename = "c"))]
29    pub(crate) children: Box<FxHashMap<arrayvec::ArrayString<4>, TrieDataNode<D>>>,
30    #[cfg_attr(feature = "serde", serde(rename = "wed"))]
31    word_end_data: WordEnd<D>,
32}
33
34/// Methods only on nodes that have data.
35impl<D> TrieDataNode<D> {
36    /// Returns a new instance of a TrieNode.
37    pub(crate) fn new() -> Self {
38        TrieDataNode {
39            children: Default::default(),
40            word_end_data: None,
41        }
42    }
43
44    /// Recursive function that drops all children maps and collects data
45    /// regardless of having multiple words branching from them or not.
46    pub(crate) fn remove_all_words_collect(&mut self, found_data: &mut Vec<D>) -> usize {
47        let num_removed = self
48            .children
49            .values_mut()
50            .map(|child| child.remove_all_words_collect(found_data))
51            .sum::<usize>()
52            + self.is_associated() as usize;
53
54        if let Some(data_vec) = self.disassociate() {
55            found_data.extend(data_vec);
56        }
57
58        self.clear_children();
59
60        num_removed
61    }
62
63    /// Recursive function that counts the number of words from a starting node.
64    pub(crate) fn count_words(&self) -> usize {
65        self.children
66            .values()
67            .map(|child| child.count_words())
68            .sum::<usize>()
69            + self.is_associated() as usize
70    }
71
72    /// Recursive function finds every node that is an end of a word and appends
73    /// its data as references to the passed vector.
74    pub(crate) fn generate_all_data<'a>(&'a self, found_data: &mut Vec<&'a D>) {
75        if let Some(data_vec) = &self.word_end_data {
76            found_data.extend(data_vec.iter());
77        }
78
79        self.children
80            .values()
81            .for_each(|x| x.generate_all_data(found_data));
82    }
83
84    /// Recursive function finds every node that is an end of a word and appends
85    /// its data as mutable references to the passed vector.
86    pub(crate) fn generate_all_data_mut<'a>(&'a mut self, found_data: &mut Vec<&'a mut D>) {
87        if let Some(data_vec) = &mut self.word_end_data {
88            found_data.extend(data_vec.iter_mut());
89        }
90
91        self.children
92            .values_mut()
93            .for_each(|x| x.generate_all_data_mut(found_data));
94    }
95
96    /// Function pushes data to the association vector.
97    pub(crate) fn push_data(&mut self, data: D) {
98        self.get_association_mut().as_mut().unwrap().push(data);
99    }
100
101    /// Recursive function for inserting found words from the given node and
102    /// given starting substring.
103    pub(crate) fn find_words(&self, substring: &str, found_words: &mut Vec<String>) {
104        if self.is_associated() {
105            found_words.push(substring.to_string());
106        }
107
108        self.children.iter().for_each(|(character, node)| {
109            node.find_words(&(substring.to_owned() + character), found_words)
110        });
111    }
112
113    /// The recursive function for finding a vector of shortest and longest words in the TrieNode consists of:
114    /// - the DFS tree traversal part for getting to every child node;
115    /// - matching lengths of found words in combination with the passed ordering.
116    pub(crate) fn words_min_max(
117        &self,
118        substring: &str,
119        found_words: &mut Vec<String>,
120        ord: Ordering,
121    ) {
122        'word: {
123            if self.is_associated() {
124                if let Some(found) = found_words.first() {
125                    match substring.len().cmp(&found.len()) {
126                        Ordering::Less if ord == Ordering::Less => {
127                            found_words.clear();
128                        }
129                        Ordering::Greater if ord == Ordering::Greater => {
130                            found_words.clear();
131                        }
132                        Ordering::Equal => (),
133                        _ => break 'word,
134                    }
135                }
136                found_words.push(substring.to_string());
137            }
138        }
139
140        self.children.iter().for_each(|(character, node)| {
141            node.words_min_max(&(substring.to_owned() + character), found_words, ord)
142        });
143    }
144
145    /// Function resets the association of a word and returns the
146    /// previous association. If 'keep_word' is true, the association is only
147    /// reset.
148    pub(crate) fn clear_word_end_association(&mut self, keep_word: bool) -> WordEnd<D> {
149        let return_data = self.disassociate();
150
151        if keep_word && return_data.is_some() {
152            self.associate();
153        }
154
155        return_data
156    }
157
158    /// Recursive function for removing and freeing memory of a word that is not needed anymore.
159    /// The algorithm first finds the last node of a word given in the form of a character iterator,
160    /// then it frees the maps and unwinds to the first node that should not be deleted.
161    /// The first node that should not be deleted is either:
162    /// - the root node
163    /// - the node that has multiple words branching from it
164    /// - the node that represents an end to some word with the same prefix
165    /// The last node's data is propagated all the way to the final return
166    /// with the help of auxiliary 'RemoveData<D>' struct.
167    pub(crate) fn remove_one_word<'b>(
168        &mut self,
169        mut characters: impl Iterator<Item = &'b str>,
170    ) -> RemoveData<D> {
171        let next_character = match characters.next() {
172            None => {
173                return RemoveData {
174                    must_keep: false,
175                    data: self.disassociate(),
176                }
177            }
178            Some(char) => char,
179        };
180
181        let next_node = self.children.get_mut(next_character).unwrap();
182        let must_keep = next_node.remove_one_word(characters);
183
184        if self.children.len() > 1 || must_keep.must_keep {
185            return RemoveData {
186                must_keep: true,
187                data: must_keep.data,
188            };
189        }
190        self.clear_children();
191
192        RemoveData {
193            must_keep: self.is_associated(),
194            data: must_keep.data,
195        }
196    }
197
198    /// Function marks the node as an end of a word.
199    pub(crate) fn associate(&mut self) {
200        self.word_end_data = Some(ThinVec::new());
201    }
202
203    /// Function unmarks the node as an end of a word and returns the data.
204    pub(crate) fn disassociate(&mut self) -> WordEnd<D> {
205        self.word_end_data.take()
206    }
207
208    /// Function returns true if an association is found for the word.
209    pub(crate) fn is_associated(&self) -> bool {
210        self.word_end_data.is_some()
211    }
212
213    /// Function returns the node association.
214    pub(crate) fn get_association(&self) -> &WordEnd<D> {
215        &self.word_end_data
216    }
217
218    /// Function returns the mutable node association.
219    pub(crate) fn get_association_mut(&mut self) -> &mut WordEnd<D> {
220        &mut self.word_end_data
221    }
222
223    /// Function removes all children of a node.
224    pub(crate) fn clear_children(&mut self) {
225        self.children = Default::default();
226    }
227}
228
229impl<D> ops::AddAssign for TrieDataNode<D> {
230    /// Overriding the += operator on nodes.
231    /// Function adds two nodes based on the principle:
232    /// for every child node and character in the 'rhs' node:
233    /// - if the self node doesn't have that character in its children map,
234    /// simply move the pointer to the self's children map without any extra cost;
235    /// - if the self node has that character, the node of that character (self's child)
236    /// is added with the 'rhc's' node.
237    /// An edge case exists when the 'rhc's' node has an association but self's node doesn't.
238    /// That association is handled based on the result of 'rhc_next_node.word_end_data'.
239    /// On Some(data), the self node vector is initialized with the 'rhc' node vector.
240    fn add_assign(&mut self, rhs: Self) {
241        for (char, mut rhs_next_node) in rhs.children.into_iter() {
242            // Does self contain the character?
243            match self.children.remove(&*char) {
244                // The whole node is removed, as owned, operated on and returned in self's children.
245                Some(mut self_next_node) => {
246                    // Edge case: associate self node if the other node is also associated
247                    // Example: when adding 'word' to 'word1', 'd' on 'word' needs to be associated
248                    if let Some(data_vec_rhs) = rhs_next_node.word_end_data.take() {
249                        if let Some(data_vec_self) = &mut self_next_node.word_end_data {
250                            data_vec_self.extend(data_vec_rhs);
251                        } else {
252                            self_next_node.word_end_data = Some(data_vec_rhs);
253                        }
254                    }
255
256                    self_next_node += rhs_next_node;
257                    self.children.insert(char, self_next_node);
258                }
259                // Self doesn't contain the character, no conflict arises.
260                // The whole 'rhs' node is just moved from 'rhs' into self.
261                None => {
262                    self.children.insert(char, rhs_next_node);
263                }
264            }
265        }
266    }
267}
268
269impl<D: PartialEq> PartialEq for TrieDataNode<D> {
270    /// Operation == can be applied only to TrieNodes whose data implements PartialEq.
271    fn eq(&self, other: &Self) -> bool {
272        // If keys aren't equal, nodes aren't equal.
273        if !(self.children.len() == other.children.len()
274            && self.children.keys().all(|k| other.children.contains_key(k)))
275        {
276            return false;
277        }
278
279        // If associations aren't equal, two nodes aren't equal.
280        if !match (&self.word_end_data, &other.word_end_data) {
281            (Some(self_vec), Some(other_vec)) => {
282                // If they both have an association, return true only if the data is identical
283                self_vec.len() == other_vec.len() && self_vec.iter().all(|k| other_vec.contains(k))
284            }
285            // If they both don't have an association, return true
286            (None, None) => true,
287            _ => false,
288        } {
289            return false;
290        }
291
292        // Every child node that has the same key (character) must be equal.
293        self.children
294            .iter()
295            .map(|(char, self_child)| (self_child, other.children.get(char).unwrap()))
296            .all(|(self_child, other_child)| other_child == self_child)
297    }
298}