generalized_suffix_tree/
suffix_tree.rs

1pub mod tree;
2
3use crate::data::tree_item::Character;
4use crate::suffix_tree::tree::*;
5use crate::suffix_node::node::*;
6use crate::suffix_node::*;
7use crate::data::TreeItem;
8use crate::data::tree_item::TreeItem as OtherTreeItem;
9use crate::iter::node_iter::*;
10use crate::iter::edge_iter::*;
11
12#[cfg(feature = "non_crypto_hash")]
13use fxhash::{FxHashMap as HashMap, FxHashSet as HashSet};
14#[cfg(not(feature = "non_crypto_hash"))]
15use std::collections::{HashMap, HashSet};
16
17use std::collections::LinkedList;
18use std::fmt::{Display, Debug};
19use std::hash::Hash;
20use std::cmp;
21use std::option::Option;
22use itertools::Itertools;
23use serde::ser::{Serialize, Serializer, SerializeStruct};
24    
25
26/// A Generalized Truncated Suffix Tree implemented with a variation of Ukkonen's Algorithm.  
27
28#[derive(Debug)]
29pub struct KGST<T, U>
30where
31    T: Display + Debug + Eq + PartialEq + Hash + Clone + PartialOrd,
32    U: Display + Debug + Eq + PartialEq + Hash + Clone,
33{
34    root: usize,
35    nodes: HashMap<NodeID, Node<T>>,
36    terminal_character: T,
37    strings: HashMap<StringID, (TreeItem<T, U>, usize)>,
38    leaves: Vec<NodeID>,
39    suffix_links: HashMap<NodeID, NodeID>,
40    node_data: HashMap<NodeID, HashMap<StringID, HashSet<usize>>>
41}
42
43impl<T, U> Serialize for KGST<T, U> 
44where
45    T: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize + PartialOrd,
46    U: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize,
47{
48    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
49    where
50        S: Serializer,
51    {
52        let mut state = serializer.serialize_struct("KGST", 7)?;
53        state.serialize_field("root", &self.root)?;
54        state.serialize_field("nodes", &self.nodes)?;
55        state.serialize_field("terminal_character", &self.terminal_character)?;
56        state.serialize_field("strings", &self.strings)?;
57        state.serialize_field("leaves", &self.leaves)?;
58        state.serialize_field("suffix_links", &self.suffix_links)?;
59        state.serialize_field("node_data", &self.node_data)?;
60        state.end()
61    }
62}
63
64
65impl<T, U> KGST<T, U> 
66where
67    T: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize + PartialOrd,
68    U: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize,
69{
70    /// Creates a new empty K-Truncated Generalized Suffix tree, with a constant end symbol. 
71    /// 
72    /// # Examples
73    /// 
74    /// ```
75    /// use generalized_suffix_tree::suffix_tree::KGST;
76    /// 
77    /// let tree: KGST<char, String> = KGST::new('$');
78    /// ```
79    pub fn new(terminal_character: T)->Self{
80        Self {
81            nodes: [(0, Node::new(
82                [].into_iter().collect(),
83                None,
84                None,
85                0,
86                0
87            ))].into_iter().collect(),
88            root: 0,
89            terminal_character,
90            strings: [].into_iter().collect(),
91            leaves: Vec::new(),
92            suffix_links: [(0,0)].into_iter().collect(),
93            node_data: [(0, [].into_iter().collect())].into_iter().collect(),
94        }
95    }
96
97    /// Empties the tree of all strings and nodes.
98    pub fn clear(&mut self){
99        self.root = 0;
100        self.nodes = [].into_iter().collect();
101        self.strings = [].into_iter().collect();
102        self.leaves = Vec::new();
103        self.node_data = [].into_iter().collect();
104        self.suffix_links = [].into_iter().collect();
105    }
106
107    fn leaves_of_node(&self, node_id:&NodeID, leaves:&mut Vec<NodeID>){
108        if !self.get_node(node_id).has_children(){
109            leaves.push(*node_id);
110        }
111
112        for child_node_id in self.get_node_children(node_id).values(){
113            self.leaves_of_node(child_node_id, leaves);
114        }   
115    }
116
117    pub fn num_nodes(&self)->usize{
118        self.nodes.len()
119    }
120
121    /// Returns a Hashmap of all the strings present in the tree along with their respective tree depth.
122    pub fn get_strings(&self)->&HashMap<StringID, (TreeItem<T, U>, usize)>{
123        &self.strings
124    }
125
126    pub fn get_nodes(&self)->&HashMap<NodeID, Node<T>>{
127        &self.nodes
128    }
129
130    /// Retrieves a node from the tree by node id
131    pub fn get_node(&self, node_id: &NodeID)->&Node<T>{
132        self.nodes.get(node_id).expect("Node ID does not exist!")
133    }
134
135    /// Returns the string represented by the incoming edge of the node.
136    pub fn get_node_label(&self, node_id: &NodeID)->&[Character<T>]{
137        &self.get_node_string(node_id)[*self.get_node_start(node_id)..self.get_node_start(node_id)+(self.get_node_edge_length(node_id))]
138    }
139
140    fn create_node(&mut self, children: HashMap<Character<T>, usize>,
141            string_id: Option<usize>,
142            data: HashMap<StringID, HashSet<usize>>,
143            parent: Option<usize>,
144            edge_length: usize,
145            start: usize) -> usize{
146                let node_id: usize = self.nodes.len();
147                let node: Node<T> = Node::new(
148                    children,
149                    string_id,
150                    parent,
151                    edge_length,
152                    start
153                );
154                self.suffix_links.insert(node_id, 0);
155                self.nodes.insert(node_id, node);
156                self.node_data.insert(node_id, data);
157                node_id
158            }
159
160    fn set_node_suffix_link(&mut self, node_id: &NodeID, suffix_link_node_id: &NodeID){
161        self.suffix_links.entry(*node_id).and_modify(|e| *e *= suffix_link_node_id).or_insert(*suffix_link_node_id);
162    }
163
164    fn get_string_by_treeitem_id(&self, treeitem_id: &StringID)->&[Character<T>]{
165        self.strings.get(treeitem_id).expect("TreeItem ID does not exist!").0.get_string()
166    }
167
168    fn get_node_edge_length(&self, node_id: &NodeID)->usize{
169        self.get_node(node_id).get_edge_length()
170    }
171
172    fn get_node_string(&self, node_id: &NodeID)->&[Character<T>]{
173        self.get_string_by_treeitem_id(self.get_node_string_id(node_id))
174    }
175
176    fn get_node_string_id(&self, node_id: &NodeID)->&usize{
177        self.get_node(node_id).get_string_id().expect("Node ID is root node")
178    }
179
180    fn get_node_mut(&mut self, node_id: &NodeID)->&mut Node<T>{
181        self.nodes.get_mut(node_id).expect("Node ID does not exist!")
182    }
183
184    fn add_seq_to_leaves(&mut self, node_id: &NodeID, string_id: &StringID, start: &usize){
185        let mut leaves:Vec<NodeID> = vec![];
186        self.leaves_of_node(node_id, &mut leaves);
187        for leaf in leaves.iter(){
188            self.add_seq_to_node(leaf, string_id, start);
189        }
190    }
191
192    fn get_treeitem_by_treeitem_id(&self, treeitem_id: &StringID)->&(TreeItem<T, U>, usize){
193        self.strings.get(treeitem_id).expect("TreeItem ID does not exist!")
194    }
195
196    fn get_pattern_node(&self, q_string:&[T])->Option<&NodeID>{
197        let mut node_id: Option<&NodeID> = Some(&self.root);
198        // dbg!(q_string);
199        let mut c: &T = &q_string[0];
200        let mut i = 0;
201        loop {
202            node_id = self.get_node_child(node_id.unwrap(), c);
203            match node_id{
204                None => return None,
205                Some(n) => {
206                    if q_string.len()>self.get_node_depth(n){
207                        i += self.get_node_edge_length(n);
208                        c = &q_string[i];
209                        node_id = Some(n);
210                    }
211                    else{
212                        return Some(n);
213                    }
214                },
215            }
216        }
217    }
218
219    /// Retrieves all strings that the input slice is a suffix of.
220    pub fn suffix_match(&self, s:&[T])-> HashMap<U, HashSet<usize>>{
221        let mut query_string: Vec<T> = s.to_vec();
222        query_string.push(self.terminal_character.clone());
223        self.substring_match(&query_string)
224    }
225
226    /// Retrieves all strings that contain the input slice as some substring.
227    pub fn substring_match(&self, s:&[T]) -> HashMap<U, HashSet<usize>>{
228        let node = self.get_pattern_node(s);
229        let mut leaves:Vec<usize> = vec![];
230        let mut ids_and_indexes: HashMap<StringID, HashSet<usize>> = [].into_iter().collect();
231        match node{
232            None => {},
233            Some(i) => {
234                if self.get_node_depth(i)<s.len(){
235                    match self.get_node_parent(i){
236                        None => {},
237                        Some(_parent_id) => {self.leaves_of_node(i, &mut leaves);}
238                    }
239                }
240                else{
241                    self.leaves_of_node(i, &mut leaves);
242                }
243            }
244        }
245        for leaf in leaves{
246            for (treeitem_id, idx) in self.get_node_data(&leaf){
247                match ids_and_indexes.get_mut(treeitem_id){
248                    None => {
249                        if self.get_treeitem_by_treeitem_id(treeitem_id).1>=s.len(){
250                            ids_and_indexes.insert(*treeitem_id, idx.clone());
251                        }
252                    },
253                    Some(idxs) => {
254                        if self.get_treeitem_by_treeitem_id(treeitem_id).1>=s.len(){
255                            for i in idx.iter(){
256                                idxs.insert(*i);
257                            }
258                        }
259                    },
260                }
261            }
262        }
263        ids_and_indexes.into_iter().map(|(k, v)| (self.strings.get(&k).cloned().unwrap().0.get_id().clone(), v)).collect::<HashMap<U, HashSet<usize>>>()
264    }
265
266    fn get_node_children(&self, node_id: &NodeID)-> &HashMap<Character<T>, usize>{
267        self.get_node(node_id).get_children()
268    }
269
270    fn set_node_child_id(&mut self, edge_label: &Character<T>, parent_node_id: &NodeID, child_node_id: &NodeID){
271        self.get_node_mut(parent_node_id).set_child(edge_label.clone(), *child_node_id)
272    }
273
274    fn get_mut_treeitem_by_treeitem_id(&mut self, treeitem_id: &StringID)-> &mut TreeItem<T, U>{
275        &mut self.strings.get_mut(treeitem_id).expect("TreeItem does not exist!").0
276    }
277
278    fn add_node_to_treeitem(&mut self, treeitem_id: &StringID, node_id: &NodeID){
279        self.get_mut_treeitem_by_treeitem_id(treeitem_id).add_data_to_node(node_id)
280    }
281
282    fn add_seq_to_node(&mut self, node_id: &NodeID , seq_id: &StringID, start: &usize){
283        self.node_data.entry(*node_id).or_default().entry(*seq_id).or_default().insert(*start);
284        self.add_node_to_treeitem(seq_id, node_id);
285    }
286
287    fn add_data_to_node(&mut self, node_id: &NodeID, data: HashMap<StringID, HashSet<usize>>){
288        for (seq_id, starts) in data.iter(){
289            for start in starts.iter(){
290                self.add_seq_to_node(node_id, seq_id, start);
291            }
292        }
293    }
294
295    pub fn get_node_data(&self, node_id: &NodeID)->&HashMap<StringID, HashSet<usize>>{
296        self.node_data.get(node_id).expect("Node ID does not exist!")
297    }
298
299    fn set_node_parent_id(&mut self, node_id: &NodeID, parent_id: &NodeID){
300        self.get_node_mut(node_id).set_parent(*parent_id)
301    }
302
303    fn node_depth(&self, node_id: &NodeID, depth: usize)->usize{
304        match self.get_node(node_id).get_parent(){
305            None => depth,
306            Some(i) => self.node_depth(i, depth+self.get_node_edge_length(node_id))
307        }
308    }
309
310    fn get_node_start(&self, node_id: &NodeID)->&usize{
311        self.get_node(node_id).get_start()
312    }
313
314    fn set_node_start(&mut self, node_id: &NodeID, start: usize){
315        self.get_node_mut(node_id).set_start(start)
316    }
317
318    fn add_suffix_link(&mut self, node_id: &NodeID, need_suffix_link: &mut Option<usize>){
319        match need_suffix_link{
320            None => (),
321            Some(i) => self.set_node_suffix_link(i, node_id),
322        };
323        *need_suffix_link = Some(*node_id)
324    }
325
326    /// inserts all suffixes of a string into the tree. If max_depth>0, all substrings of length==max_depth are inserted. 
327    pub fn insert(&mut self, k: U, v: Vec<T>, max_depth: &usize){
328        let seq_id: U = k.clone();
329        let mut seq: Vec<T> = v.clone();
330
331        seq.push(self.terminal_character.clone());
332
333        let max_depth: usize = match max_depth {
334            &0 => seq.len(),
335            _ => cmp::min(*max_depth, seq.len()),
336        };
337        
338        let new_string: TreeItem<T, U> = TreeItem::new(seq_id, seq.clone());
339        let new_string_id: StringID = self.strings.len();
340        self.strings.insert(new_string_id, (new_string, max_depth));
341
342        let mut curr_pos: usize = 0;
343        let mut start_idx: usize = 0;
344        let mut need_suffix_link: Option<NodeID>;
345        let mut remainder: usize = 0;
346        let mut active_node: NodeID = 0;
347        while curr_pos < seq.len() {
348            need_suffix_link = None;
349            remainder += 1;
350            while remainder > 0{
351                let active_edge = Character::Char(seq[start_idx+self.get_node_depth(&active_node)].clone());
352                let next_node = self.get_node(&active_node).get_child(&active_edge).cloned();
353                match next_node{
354                    None => {
355                        let new_leaf_node_id: usize = self.create_node(
356                            [].into_iter().collect(),
357                            Some(new_string_id),
358                            [(new_string_id, [start_idx].into_iter().collect())].into_iter().collect(),
359                            Some(active_node),
360                            cmp::min(seq.len()-curr_pos,max_depth-self.get_node_depth(&active_node)),
361                            curr_pos,
362                        );
363                        self.set_node_child_id(&active_edge, &active_node, &new_leaf_node_id);
364                        self.add_suffix_link(&active_node, &mut need_suffix_link);
365                        let active_node_data = self.get_node_data(&active_node).clone();
366                        self.add_data_to_node(&new_leaf_node_id, active_node_data);
367                        start_idx += 1;
368                    },
369                    Some(next_node_id) => {
370                        if self.get_node_edge_length(&next_node_id)<=curr_pos-start_idx-self.get_node_depth(&active_node){
371                            // Walk down to next node (skip count trick)
372                            active_node = next_node_id;
373                            continue;
374                        }
375                        else if self.get_node_string(&next_node_id)[self.get_node_start(&next_node_id) + curr_pos-start_idx-self.get_node_depth(&active_node)] == Character::Char(seq[curr_pos].clone()){   
376                            self.add_seq_to_leaves(&next_node_id, &new_string_id, &start_idx);
377                            if curr_pos==seq.len()-1{
378                                start_idx+=1;
379                            }
380                            else{
381                                self.add_suffix_link(&active_node, &mut need_suffix_link);
382                                break;    
383                            }
384                        }
385                        else{
386                            let split_node_id: usize = self.create_node(
387                                [
388                                            (self.get_node_string(&next_node_id)[self.get_node_start(&next_node_id) + curr_pos-start_idx-self.get_node_depth(&active_node)].clone(), next_node_id)
389                                            ].into_iter().collect(),
390                                            Some(*self.get_node_string_id(&next_node_id)),
391                                            [(new_string_id, [start_idx].into_iter().collect())].into_iter().collect(),
392                                            Some(active_node),
393                                            curr_pos-start_idx-self.get_node_depth(&active_node),
394                                            *self.get_node_start(&next_node_id),
395                            );
396                            self.set_node_child_id(&active_edge, &active_node, &split_node_id);
397                            let next_node_new_start = self.get_node_start(&next_node_id) + curr_pos-start_idx-self.get_node_depth(&active_node);
398                            self.set_node_start(&next_node_id, next_node_new_start);
399                            self.set_node_parent_id(&next_node_id, &split_node_id);
400                            let leaf_node_id: usize = self.create_node(
401                                [].into_iter().collect(),
402                                Some(new_string_id),
403                                [(new_string_id, [start_idx].into_iter().collect())].into_iter().collect(),
404                                Some(split_node_id),
405                                cmp::min(seq.len()-curr_pos, max_depth-self.get_node_depth(&split_node_id)),
406                                curr_pos,
407                            );
408                            self.set_node_child_id(&Character::Char(seq[curr_pos].clone()), &split_node_id, &leaf_node_id);
409                            self.add_suffix_link(&split_node_id, &mut need_suffix_link);
410                            start_idx += 1;
411                        }
412                    },
413                };
414                if active_node != self.root{
415                    active_node = *self.get_suffix_link(&active_node);
416                }
417                remainder -= 1
418            }
419            curr_pos +=1;
420        }
421        
422    }
423
424    //Checks if a string with string_id already exists in tree.
425    pub fn contains(&self, string_id: &U)->bool{
426        let string_ids: HashSet<&U> = self.strings.values().map(|x| x.0.get_id()).collect();
427        string_ids.contains(string_id)
428    }
429
430    /// Returns a string iterator of the tree
431    pub fn iter_strings(&self)-> std::collections::hash_map::Iter<'_, usize, (TreeItem<T, U>, usize)>{
432        self.strings.iter()
433    }
434
435    /// Prints tree as a string.
436    pub fn print_tree(&self){
437        todo!()
438    }
439
440    /// Returns a preorder node iterator of the tree
441    pub fn iter_nodes_pre(&self)->PreOrdNodes<T>{
442        PreOrdNodes::new(&self.root, &self.nodes)
443    }
444    
445    /// Returns a preorder node iterator of the tree
446    pub fn iter_nodes_post(&self)->PostOrdNodes<T>{
447        PostOrdNodes::new(&self.root, &self.nodes)
448    }
449
450    /// Returns the nodes in a path in preorder
451    pub fn iter_path_pre(&self, node_id: &NodeID)->std::collections::linked_list::IntoIter<usize>{
452        self.get_node_path_pre(node_id).into_iter()
453    }
454
455    /// Returns the nodes in a path in postorder
456    pub fn iter_path_post(&self, node_id: &NodeID)->std::collections::linked_list::IntoIter<usize>{
457        self.get_node_path_post(node_id).into_iter()
458    }
459
460    /// Returns a postorder edge iterator of the tree
461    pub fn iter_edges_post(&self)->PostOrdEdges<T>{
462        PostOrdEdges::new(&self.root, &self.nodes, self.suffix_links.clone(), self.nodes.iter()
463        .map(|(k, v)| (*k, *v.get_parent().unwrap_or(&0)))
464        .collect()
465        )
466
467    }
468}
469
470impl<T, U> SuffixTree<T> for KGST<T, U>
471where
472    T: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize + PartialOrd,
473    U: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize,
474{
475    fn root(&self)->&NodeID{
476        &self.root
477    }
478
479    fn is_leaf(&self, node_id: &NodeID)->bool{
480        self.get_node(node_id).is_leaf()
481    }
482
483    fn get_node_child(&self, node_id: &NodeID, edge_label: &T)->Option<&NodeID>{
484        self.get_node(node_id).get_child(&Character::Char(edge_label.clone()))
485    }
486    fn get_node_parent(&self, node_id: &NodeID)->Option<&NodeID>{
487        self.get_node(node_id).get_parent()
488    }
489    fn get_node_depth(&self, node_id: &NodeID)->usize{
490        self.node_depth(node_id, 0)
491    }
492    fn get_suffix_link(&self, node_id: &NodeID) -> &usize{
493        self.suffix_links.get(node_id).expect("Node id does not exist!")
494    }
495    fn get_node_label(&self, node_id: &NodeID)->Vec<T>{
496        let node_edge_length  = self.get_node_edge_length(node_id);
497        let node_start = *self.get_node_start(node_id);
498        let string  = self.get_string_by_treeitem_id(self.get_node_string_id(node_id))[node_start..node_start+node_edge_length].iter();
499        string.map(|x| x.into_inner().cloned().expect("Terminal Character cannot be unwrapped!")).collect_vec()
500    }
501    fn get_node_path_label(&self, _node_id: &NodeID)->&[T]{
502        todo!();
503    }
504
505    fn get_node_path_pre(&self, node_id: &NodeID)->LinkedList<NodeID>{
506        let mut node_path: LinkedList<NodeID> = LinkedList::new();
507        let mut curr_node_id: usize = *node_id;
508        while self.get_node_parent(&curr_node_id).expect("Invalid NodeID! Path is broken")!=&0{
509            node_path.push_front(curr_node_id);
510            curr_node_id = self.get_node_parent(&curr_node_id).cloned().expect("Invalid NodeID! Path is broken");
511        }
512        node_path.push_front(curr_node_id);
513        node_path.push_front(0);
514        node_path
515    }
516
517    fn get_node_path_post(&self, node_id: &NodeID)->LinkedList<NodeID>{
518        let mut node_path: LinkedList<NodeID> = LinkedList::new();
519        let mut curr_node_id: usize = *node_id;
520        while self.get_node_parent(&curr_node_id).expect("Invalid NodeID! Path is broken")!=&0{
521            node_path.push_front(curr_node_id);
522            curr_node_id = self.get_node_parent(&curr_node_id).cloned().expect("Invalid NodeID! Path is broken");
523        }
524        node_path.push_back(curr_node_id);
525        node_path.push_back(0);
526        node_path
527    }
528
529    fn is_suffix(&self, s:&[T])->bool{
530        let mut query_string: Vec<T> = s.to_vec();
531        query_string.push(self.terminal_character.clone());
532        self.get_pattern_node(&query_string).is_some()
533    }
534}