1pub mod tree;
2
3use crate::data::tree_item::Character;
4use crate::suffix_tree::tree::*;
5use crate::suffix_node::node::*;
6use crate::suffix_node::*;
7use crate::data::TreeItem;
8use crate::data::tree_item::TreeItem as OtherTreeItem;
9use crate::iter::node_iter::*;
10use crate::iter::edge_iter::*;
11
12#[cfg(feature = "non_crypto_hash")]
13use fxhash::{FxHashMap as HashMap, FxHashSet as HashSet};
14#[cfg(not(feature = "non_crypto_hash"))]
15use std::collections::{HashMap, HashSet};
16
17use std::collections::LinkedList;
18use std::fmt::{Display, Debug};
19use std::hash::Hash;
20use std::cmp;
21use std::option::Option;
22use itertools::Itertools;
23use serde::ser::{Serialize, Serializer, SerializeStruct};
24
25
26#[derive(Debug)]
29pub struct KGST<T, U>
30where
31 T: Display + Debug + Eq + PartialEq + Hash + Clone + PartialOrd,
32 U: Display + Debug + Eq + PartialEq + Hash + Clone,
33{
34 root: usize,
35 nodes: HashMap<NodeID, Node<T>>,
36 terminal_character: T,
37 strings: HashMap<StringID, (TreeItem<T, U>, usize)>,
38 leaves: Vec<NodeID>,
39 suffix_links: HashMap<NodeID, NodeID>,
40 node_data: HashMap<NodeID, HashMap<StringID, HashSet<usize>>>
41}
42
43impl<T, U> Serialize for KGST<T, U>
44where
45 T: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize + PartialOrd,
46 U: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize,
47{
48 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
49 where
50 S: Serializer,
51 {
52 let mut state = serializer.serialize_struct("KGST", 7)?;
53 state.serialize_field("root", &self.root)?;
54 state.serialize_field("nodes", &self.nodes)?;
55 state.serialize_field("terminal_character", &self.terminal_character)?;
56 state.serialize_field("strings", &self.strings)?;
57 state.serialize_field("leaves", &self.leaves)?;
58 state.serialize_field("suffix_links", &self.suffix_links)?;
59 state.serialize_field("node_data", &self.node_data)?;
60 state.end()
61 }
62}
63
64
65impl<T, U> KGST<T, U>
66where
67 T: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize + PartialOrd,
68 U: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize,
69{
70 pub fn new(terminal_character: T)->Self{
80 Self {
81 nodes: [(0, Node::new(
82 [].into_iter().collect(),
83 None,
84 None,
85 0,
86 0
87 ))].into_iter().collect(),
88 root: 0,
89 terminal_character,
90 strings: [].into_iter().collect(),
91 leaves: Vec::new(),
92 suffix_links: [(0,0)].into_iter().collect(),
93 node_data: [(0, [].into_iter().collect())].into_iter().collect(),
94 }
95 }
96
97 pub fn clear(&mut self){
99 self.root = 0;
100 self.nodes = [].into_iter().collect();
101 self.strings = [].into_iter().collect();
102 self.leaves = Vec::new();
103 self.node_data = [].into_iter().collect();
104 self.suffix_links = [].into_iter().collect();
105 }
106
107 fn leaves_of_node(&self, node_id:&NodeID, leaves:&mut Vec<NodeID>){
108 if !self.get_node(node_id).has_children(){
109 leaves.push(*node_id);
110 }
111
112 for child_node_id in self.get_node_children(node_id).values(){
113 self.leaves_of_node(child_node_id, leaves);
114 }
115 }
116
117 pub fn num_nodes(&self)->usize{
118 self.nodes.len()
119 }
120
121 pub fn get_strings(&self)->&HashMap<StringID, (TreeItem<T, U>, usize)>{
123 &self.strings
124 }
125
126 pub fn get_nodes(&self)->&HashMap<NodeID, Node<T>>{
127 &self.nodes
128 }
129
130 pub fn get_node(&self, node_id: &NodeID)->&Node<T>{
132 self.nodes.get(node_id).expect("Node ID does not exist!")
133 }
134
135 pub fn get_node_label(&self, node_id: &NodeID)->&[Character<T>]{
137 &self.get_node_string(node_id)[*self.get_node_start(node_id)..self.get_node_start(node_id)+(self.get_node_edge_length(node_id))]
138 }
139
140 fn create_node(&mut self, children: HashMap<Character<T>, usize>,
141 string_id: Option<usize>,
142 data: HashMap<StringID, HashSet<usize>>,
143 parent: Option<usize>,
144 edge_length: usize,
145 start: usize) -> usize{
146 let node_id: usize = self.nodes.len();
147 let node: Node<T> = Node::new(
148 children,
149 string_id,
150 parent,
151 edge_length,
152 start
153 );
154 self.suffix_links.insert(node_id, 0);
155 self.nodes.insert(node_id, node);
156 self.node_data.insert(node_id, data);
157 node_id
158 }
159
160 fn set_node_suffix_link(&mut self, node_id: &NodeID, suffix_link_node_id: &NodeID){
161 self.suffix_links.entry(*node_id).and_modify(|e| *e *= suffix_link_node_id).or_insert(*suffix_link_node_id);
162 }
163
164 fn get_string_by_treeitem_id(&self, treeitem_id: &StringID)->&[Character<T>]{
165 self.strings.get(treeitem_id).expect("TreeItem ID does not exist!").0.get_string()
166 }
167
168 fn get_node_edge_length(&self, node_id: &NodeID)->usize{
169 self.get_node(node_id).get_edge_length()
170 }
171
172 fn get_node_string(&self, node_id: &NodeID)->&[Character<T>]{
173 self.get_string_by_treeitem_id(self.get_node_string_id(node_id))
174 }
175
176 fn get_node_string_id(&self, node_id: &NodeID)->&usize{
177 self.get_node(node_id).get_string_id().expect("Node ID is root node")
178 }
179
180 fn get_node_mut(&mut self, node_id: &NodeID)->&mut Node<T>{
181 self.nodes.get_mut(node_id).expect("Node ID does not exist!")
182 }
183
184 fn add_seq_to_leaves(&mut self, node_id: &NodeID, string_id: &StringID, start: &usize){
185 let mut leaves:Vec<NodeID> = vec![];
186 self.leaves_of_node(node_id, &mut leaves);
187 for leaf in leaves.iter(){
188 self.add_seq_to_node(leaf, string_id, start);
189 }
190 }
191
192 fn get_treeitem_by_treeitem_id(&self, treeitem_id: &StringID)->&(TreeItem<T, U>, usize){
193 self.strings.get(treeitem_id).expect("TreeItem ID does not exist!")
194 }
195
196 fn get_pattern_node(&self, q_string:&[T])->Option<&NodeID>{
197 let mut node_id: Option<&NodeID> = Some(&self.root);
198 let mut c: &T = &q_string[0];
200 let mut i = 0;
201 loop {
202 node_id = self.get_node_child(node_id.unwrap(), c);
203 match node_id{
204 None => return None,
205 Some(n) => {
206 if q_string.len()>self.get_node_depth(n){
207 i += self.get_node_edge_length(n);
208 c = &q_string[i];
209 node_id = Some(n);
210 }
211 else{
212 return Some(n);
213 }
214 },
215 }
216 }
217 }
218
219 pub fn suffix_match(&self, s:&[T])-> HashMap<U, HashSet<usize>>{
221 let mut query_string: Vec<T> = s.to_vec();
222 query_string.push(self.terminal_character.clone());
223 self.substring_match(&query_string)
224 }
225
226 pub fn substring_match(&self, s:&[T]) -> HashMap<U, HashSet<usize>>{
228 let node = self.get_pattern_node(s);
229 let mut leaves:Vec<usize> = vec![];
230 let mut ids_and_indexes: HashMap<StringID, HashSet<usize>> = [].into_iter().collect();
231 match node{
232 None => {},
233 Some(i) => {
234 if self.get_node_depth(i)<s.len(){
235 match self.get_node_parent(i){
236 None => {},
237 Some(_parent_id) => {self.leaves_of_node(i, &mut leaves);}
238 }
239 }
240 else{
241 self.leaves_of_node(i, &mut leaves);
242 }
243 }
244 }
245 for leaf in leaves{
246 for (treeitem_id, idx) in self.get_node_data(&leaf){
247 match ids_and_indexes.get_mut(treeitem_id){
248 None => {
249 if self.get_treeitem_by_treeitem_id(treeitem_id).1>=s.len(){
250 ids_and_indexes.insert(*treeitem_id, idx.clone());
251 }
252 },
253 Some(idxs) => {
254 if self.get_treeitem_by_treeitem_id(treeitem_id).1>=s.len(){
255 for i in idx.iter(){
256 idxs.insert(*i);
257 }
258 }
259 },
260 }
261 }
262 }
263 ids_and_indexes.into_iter().map(|(k, v)| (self.strings.get(&k).cloned().unwrap().0.get_id().clone(), v)).collect::<HashMap<U, HashSet<usize>>>()
264 }
265
266 fn get_node_children(&self, node_id: &NodeID)-> &HashMap<Character<T>, usize>{
267 self.get_node(node_id).get_children()
268 }
269
270 fn set_node_child_id(&mut self, edge_label: &Character<T>, parent_node_id: &NodeID, child_node_id: &NodeID){
271 self.get_node_mut(parent_node_id).set_child(edge_label.clone(), *child_node_id)
272 }
273
274 fn get_mut_treeitem_by_treeitem_id(&mut self, treeitem_id: &StringID)-> &mut TreeItem<T, U>{
275 &mut self.strings.get_mut(treeitem_id).expect("TreeItem does not exist!").0
276 }
277
278 fn add_node_to_treeitem(&mut self, treeitem_id: &StringID, node_id: &NodeID){
279 self.get_mut_treeitem_by_treeitem_id(treeitem_id).add_data_to_node(node_id)
280 }
281
282 fn add_seq_to_node(&mut self, node_id: &NodeID , seq_id: &StringID, start: &usize){
283 self.node_data.entry(*node_id).or_default().entry(*seq_id).or_default().insert(*start);
284 self.add_node_to_treeitem(seq_id, node_id);
285 }
286
287 fn add_data_to_node(&mut self, node_id: &NodeID, data: HashMap<StringID, HashSet<usize>>){
288 for (seq_id, starts) in data.iter(){
289 for start in starts.iter(){
290 self.add_seq_to_node(node_id, seq_id, start);
291 }
292 }
293 }
294
295 pub fn get_node_data(&self, node_id: &NodeID)->&HashMap<StringID, HashSet<usize>>{
296 self.node_data.get(node_id).expect("Node ID does not exist!")
297 }
298
299 fn set_node_parent_id(&mut self, node_id: &NodeID, parent_id: &NodeID){
300 self.get_node_mut(node_id).set_parent(*parent_id)
301 }
302
303 fn node_depth(&self, node_id: &NodeID, depth: usize)->usize{
304 match self.get_node(node_id).get_parent(){
305 None => depth,
306 Some(i) => self.node_depth(i, depth+self.get_node_edge_length(node_id))
307 }
308 }
309
310 fn get_node_start(&self, node_id: &NodeID)->&usize{
311 self.get_node(node_id).get_start()
312 }
313
314 fn set_node_start(&mut self, node_id: &NodeID, start: usize){
315 self.get_node_mut(node_id).set_start(start)
316 }
317
318 fn add_suffix_link(&mut self, node_id: &NodeID, need_suffix_link: &mut Option<usize>){
319 match need_suffix_link{
320 None => (),
321 Some(i) => self.set_node_suffix_link(i, node_id),
322 };
323 *need_suffix_link = Some(*node_id)
324 }
325
326 pub fn insert(&mut self, k: U, v: Vec<T>, max_depth: &usize){
328 let seq_id: U = k.clone();
329 let mut seq: Vec<T> = v.clone();
330
331 seq.push(self.terminal_character.clone());
332
333 let max_depth: usize = match max_depth {
334 &0 => seq.len(),
335 _ => cmp::min(*max_depth, seq.len()),
336 };
337
338 let new_string: TreeItem<T, U> = TreeItem::new(seq_id, seq.clone());
339 let new_string_id: StringID = self.strings.len();
340 self.strings.insert(new_string_id, (new_string, max_depth));
341
342 let mut curr_pos: usize = 0;
343 let mut start_idx: usize = 0;
344 let mut need_suffix_link: Option<NodeID>;
345 let mut remainder: usize = 0;
346 let mut active_node: NodeID = 0;
347 while curr_pos < seq.len() {
348 need_suffix_link = None;
349 remainder += 1;
350 while remainder > 0{
351 let active_edge = Character::Char(seq[start_idx+self.get_node_depth(&active_node)].clone());
352 let next_node = self.get_node(&active_node).get_child(&active_edge).cloned();
353 match next_node{
354 None => {
355 let new_leaf_node_id: usize = self.create_node(
356 [].into_iter().collect(),
357 Some(new_string_id),
358 [(new_string_id, [start_idx].into_iter().collect())].into_iter().collect(),
359 Some(active_node),
360 cmp::min(seq.len()-curr_pos,max_depth-self.get_node_depth(&active_node)),
361 curr_pos,
362 );
363 self.set_node_child_id(&active_edge, &active_node, &new_leaf_node_id);
364 self.add_suffix_link(&active_node, &mut need_suffix_link);
365 let active_node_data = self.get_node_data(&active_node).clone();
366 self.add_data_to_node(&new_leaf_node_id, active_node_data);
367 start_idx += 1;
368 },
369 Some(next_node_id) => {
370 if self.get_node_edge_length(&next_node_id)<=curr_pos-start_idx-self.get_node_depth(&active_node){
371 active_node = next_node_id;
373 continue;
374 }
375 else if self.get_node_string(&next_node_id)[self.get_node_start(&next_node_id) + curr_pos-start_idx-self.get_node_depth(&active_node)] == Character::Char(seq[curr_pos].clone()){
376 self.add_seq_to_leaves(&next_node_id, &new_string_id, &start_idx);
377 if curr_pos==seq.len()-1{
378 start_idx+=1;
379 }
380 else{
381 self.add_suffix_link(&active_node, &mut need_suffix_link);
382 break;
383 }
384 }
385 else{
386 let split_node_id: usize = self.create_node(
387 [
388 (self.get_node_string(&next_node_id)[self.get_node_start(&next_node_id) + curr_pos-start_idx-self.get_node_depth(&active_node)].clone(), next_node_id)
389 ].into_iter().collect(),
390 Some(*self.get_node_string_id(&next_node_id)),
391 [(new_string_id, [start_idx].into_iter().collect())].into_iter().collect(),
392 Some(active_node),
393 curr_pos-start_idx-self.get_node_depth(&active_node),
394 *self.get_node_start(&next_node_id),
395 );
396 self.set_node_child_id(&active_edge, &active_node, &split_node_id);
397 let next_node_new_start = self.get_node_start(&next_node_id) + curr_pos-start_idx-self.get_node_depth(&active_node);
398 self.set_node_start(&next_node_id, next_node_new_start);
399 self.set_node_parent_id(&next_node_id, &split_node_id);
400 let leaf_node_id: usize = self.create_node(
401 [].into_iter().collect(),
402 Some(new_string_id),
403 [(new_string_id, [start_idx].into_iter().collect())].into_iter().collect(),
404 Some(split_node_id),
405 cmp::min(seq.len()-curr_pos, max_depth-self.get_node_depth(&split_node_id)),
406 curr_pos,
407 );
408 self.set_node_child_id(&Character::Char(seq[curr_pos].clone()), &split_node_id, &leaf_node_id);
409 self.add_suffix_link(&split_node_id, &mut need_suffix_link);
410 start_idx += 1;
411 }
412 },
413 };
414 if active_node != self.root{
415 active_node = *self.get_suffix_link(&active_node);
416 }
417 remainder -= 1
418 }
419 curr_pos +=1;
420 }
421
422 }
423
424 pub fn contains(&self, string_id: &U)->bool{
426 let string_ids: HashSet<&U> = self.strings.values().map(|x| x.0.get_id()).collect();
427 string_ids.contains(string_id)
428 }
429
430 pub fn iter_strings(&self)-> std::collections::hash_map::Iter<'_, usize, (TreeItem<T, U>, usize)>{
432 self.strings.iter()
433 }
434
435 pub fn print_tree(&self){
437 todo!()
438 }
439
440 pub fn iter_nodes_pre(&self)->PreOrdNodes<T>{
442 PreOrdNodes::new(&self.root, &self.nodes)
443 }
444
445 pub fn iter_nodes_post(&self)->PostOrdNodes<T>{
447 PostOrdNodes::new(&self.root, &self.nodes)
448 }
449
450 pub fn iter_path_pre(&self, node_id: &NodeID)->std::collections::linked_list::IntoIter<usize>{
452 self.get_node_path_pre(node_id).into_iter()
453 }
454
455 pub fn iter_path_post(&self, node_id: &NodeID)->std::collections::linked_list::IntoIter<usize>{
457 self.get_node_path_post(node_id).into_iter()
458 }
459
460 pub fn iter_edges_post(&self)->PostOrdEdges<T>{
462 PostOrdEdges::new(&self.root, &self.nodes, self.suffix_links.clone(), self.nodes.iter()
463 .map(|(k, v)| (*k, *v.get_parent().unwrap_or(&0)))
464 .collect()
465 )
466
467 }
468}
469
470impl<T, U> SuffixTree<T> for KGST<T, U>
471where
472 T: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize + PartialOrd,
473 U: Display + Debug + Eq + PartialEq + Hash + Clone + Serialize,
474{
475 fn root(&self)->&NodeID{
476 &self.root
477 }
478
479 fn is_leaf(&self, node_id: &NodeID)->bool{
480 self.get_node(node_id).is_leaf()
481 }
482
483 fn get_node_child(&self, node_id: &NodeID, edge_label: &T)->Option<&NodeID>{
484 self.get_node(node_id).get_child(&Character::Char(edge_label.clone()))
485 }
486 fn get_node_parent(&self, node_id: &NodeID)->Option<&NodeID>{
487 self.get_node(node_id).get_parent()
488 }
489 fn get_node_depth(&self, node_id: &NodeID)->usize{
490 self.node_depth(node_id, 0)
491 }
492 fn get_suffix_link(&self, node_id: &NodeID) -> &usize{
493 self.suffix_links.get(node_id).expect("Node id does not exist!")
494 }
495 fn get_node_label(&self, node_id: &NodeID)->Vec<T>{
496 let node_edge_length = self.get_node_edge_length(node_id);
497 let node_start = *self.get_node_start(node_id);
498 let string = self.get_string_by_treeitem_id(self.get_node_string_id(node_id))[node_start..node_start+node_edge_length].iter();
499 string.map(|x| x.into_inner().cloned().expect("Terminal Character cannot be unwrapped!")).collect_vec()
500 }
501 fn get_node_path_label(&self, _node_id: &NodeID)->&[T]{
502 todo!();
503 }
504
505 fn get_node_path_pre(&self, node_id: &NodeID)->LinkedList<NodeID>{
506 let mut node_path: LinkedList<NodeID> = LinkedList::new();
507 let mut curr_node_id: usize = *node_id;
508 while self.get_node_parent(&curr_node_id).expect("Invalid NodeID! Path is broken")!=&0{
509 node_path.push_front(curr_node_id);
510 curr_node_id = self.get_node_parent(&curr_node_id).cloned().expect("Invalid NodeID! Path is broken");
511 }
512 node_path.push_front(curr_node_id);
513 node_path.push_front(0);
514 node_path
515 }
516
517 fn get_node_path_post(&self, node_id: &NodeID)->LinkedList<NodeID>{
518 let mut node_path: LinkedList<NodeID> = LinkedList::new();
519 let mut curr_node_id: usize = *node_id;
520 while self.get_node_parent(&curr_node_id).expect("Invalid NodeID! Path is broken")!=&0{
521 node_path.push_front(curr_node_id);
522 curr_node_id = self.get_node_parent(&curr_node_id).cloned().expect("Invalid NodeID! Path is broken");
523 }
524 node_path.push_back(curr_node_id);
525 node_path.push_back(0);
526 node_path
527 }
528
529 fn is_suffix(&self, s:&[T])->bool{
530 let mut query_string: Vec<T> = s.to_vec();
531 query_string.push(self.terminal_character.clone());
532 self.get_pattern_node(&query_string).is_some()
533 }
534}