ab_radix_trie/
lib.rs

1use std::collections::{HashMap, HashSet};
2use std::collections::hash_map::DefaultHasher;
3use std::fmt::{Debug, Formatter};
4use std::hash::{Hash, Hasher};
5use std::sync::atomic::Ordering::Relaxed;
6use serde::Serialize;
7use serde::Deserialize;
8use log::trace;
9#[derive(Debug,  Serialize, Deserialize)]
10pub struct Trie<V> {
11    children: HashMap<char, Node<V>>,
12    #[serde(default)]
13    node_count: std::sync::atomic::AtomicU32, // TODO: this didn't need to be atomic - had mutability issues to contend with
14    #[serde(default)]
15    char_count: std::sync::atomic::AtomicU32,
16}
17
18impl<V> Trie<V> {
19    pub fn new() -> Trie<V> {
20        Trie {
21            children: Default::default(),
22            node_count: Default::default(),
23            char_count: Default::default()
24        }
25    }
26    pub fn insert(&mut self, text: &str,
27                  optional_associated_value: Option<V>) {
28        if text.is_empty() {
29            return
30        }
31        let c = text.chars().next().unwrap();
32        if let Some(child) = self.children.get_mut(&c) {
33            child.insert(text, optional_associated_value, &self.node_count, &self.char_count) ;
34        } else {
35            self.node_count.fetch_add(1, Relaxed);
36            self.char_count.fetch_add(text.len() as u32, Relaxed);
37            self.children.insert(c, Node {
38                text: text.to_string(),
39                terminal: true,
40                children: Default::default(),
41                value: optional_associated_value,
42                visit_count: Default::default(),
43                #[cfg(feature = "tracing")]
44                node_id: gen_id(),
45                weight: text.len()
46            });
47        }
48    }
49    /// removes the text from the trie and compresses nodes along the way
50    pub fn remove(&mut self, text: &str) {
51        let first = text.chars().next().unwrap();
52        if let Some(first) = self.children.get_mut(&first) {
53            first.remove(text, &self.node_count, &self.char_count);
54        }
55    }
56    /// returns the suffix tree root for a given prefix
57    pub fn suffix_tree(&self, prefix: &str) -> Option<&Node<V>> {
58        if prefix.is_empty() {
59            return None
60        }
61        let first = prefix.chars().next().unwrap();
62        if let Some(child) = self.children.get(&first) {
63            return child.suffix_root(prefix)
64        }
65        None
66    }
67    /// returns the suffix tree with the given matching options
68    pub fn suffix_tree_with_matching_options(&self, prefix: &str, options: &MatchingOptions) -> Option<&Node<V>> {
69        let first = prefix.chars().next().unwrap();
70        if let Some(child) = self.children.get(&first) {
71            let tagged  = options.tag(prefix);
72            return child.suffix_tree_with_options(tagged.chars.as_slice(), options);
73        }
74        None
75    }
76    pub fn get_string_suffixes(&self, prefix: &str) -> HashSet<String> {
77        let mut coll = Vec::new();
78        let mut emit = HashSet::new();
79        self.suffix_tree(prefix).map(|t| {
80            t.get_string_suffixes(true, prefix, &mut coll, &mut emit)
81        });
82        emit
83    }
84
85    pub fn get_suffixes_values(&self, prefix: &str) -> Option<Vec<Entry<V>>> {
86        let mut coll = Vec::new();
87        self.suffix_tree(prefix).map(|t| {
88            t.get_suffixes(true, prefix,  &mut coll)
89        })
90    }
91
92    pub fn get_suffixes_with_matching_options(&self, prefix: &str, options: &MatchingOptions) -> Option<Vec<Entry<V>>> {
93        let mut coll = Vec::new();
94        self.suffix_tree_with_matching_options(prefix, options).map(|t| {
95            t.get_suffixes(true, prefix,  &mut coll)
96        })
97    }
98}
99
100#[derive(Serialize, Deserialize)]
101pub struct Node<V> {
102    text: String,
103    terminal: bool,
104    children: HashMap<char, Node<V>>,
105    value: Option<V>,
106    // for pruning purposes
107    #[serde(default)]
108    visit_count: std::sync::atomic::AtomicU64, // TODO: this didn't need to be atomic
109    #[cfg(feature = "tracing")]
110    #[serde(default = "gen_id")]
111    node_id: Option<u8>, // TODO: u8 might not be enough ? consider bigger
112    #[serde(default)]
113    weight: usize,
114}
115
116#[cfg(feature = "tracing")]
117fn gen_id() -> Option<u8> {
118    use rand::Rng;
119    let mut rng = rand::thread_rng();
120    Some(rng.gen())
121}
122
123impl<V> Debug for Node<V> where V: Debug {
124    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
125        f.debug_struct("Node")
126            .field("text", &self.text)
127            .field("terminal", &self.terminal)
128            .field("value", &self.value).field("children", &self.children)
129            .field("nested_chars", &self.weight)
130            .finish()
131    }
132}
133
134impl<V> Node<V> {
135    pub fn new(string: &str, is_terminal: bool, value: Option<V>) -> Self {
136        Node {
137            text: string.to_string(),
138            terminal: is_terminal,
139            children: Default::default(),
140            value,
141            visit_count: Default::default(),
142            #[cfg(feature = "tracing")]
143            node_id: gen_id(),
144            weight: string.len()
145        }
146    }
147
148    pub fn remove(&mut self, text: &str,
149                  node_count: &std::sync::atomic::AtomicU32,
150                  char_count: &std::sync::atomic::AtomicU32) {
151        let mut position = 0;
152        let mut my_iter = self.text.chars();
153        let mut other_iter = text.chars();
154        loop {
155            let my_char = my_iter.next();
156            let other_char = other_iter.next();
157            let remaining = grapheme_slicer_until_end(text, position);
158            match (my_char, other_char) {
159                // this = abcd
160                // other = abce
161                (Some(x), Some(y)) if x != y => {
162                    // stop
163                    return
164                },
165                (Some(x), Some(y)) if x == y => {
166                    // more to come
167                },
168                (Some(_), None) => {
169                    return
170                },
171                (None, Some(y)) => {
172                    if let Some(child) = self.children.get_mut(&y) {
173                        child.remove(remaining.as_str(), node_count, char_count);
174                        if child.children.len() == 0 {
175                            node_count.fetch_sub(1, Relaxed);
176                            char_count.fetch_sub(child.text.len() as u32, Relaxed);
177                            self.weight -= child.weight;
178                            if !child.terminal {
179                                self.children.remove(&y); // removing dangling child
180                            }
181                        }
182                        break;
183                    }
184                    // this prefix never existed in this tree
185                    return
186                },
187                (None, None) => {
188                    // this case we need to remove this node possibly
189                    if self.children.len() == 1 {
190                        break
191                    }
192                    // make zombie to be removed
193                    self.terminal = false;
194                    return
195
196                },
197                _ => {}
198            }
199            position += 1;
200        }
201        if self.children.len() == 1 {
202            // merge this with the child
203            let (_, child) = self.children.drain().next().unwrap();
204            node_count.fetch_sub(1, Relaxed);
205            // same text which is merged back
206            self.text = format!("{}{}", self.text, child.text);
207            self.value = child.value;
208            self.terminal = child.terminal;
209            self.children = child.children;
210            return
211        }
212    }
213
214    pub fn char_weight_of_children(&self) -> usize {
215        self.children
216            .iter()
217            .fold(0, |x, (_,y) | x + y.weight)
218    }
219
220    pub fn insert(&mut self, text: &str,
221                  value: Option<V>,
222                  node_count: &std::sync::atomic::AtomicU32,
223                  char_count: &std::sync::atomic::AtomicU32) {
224        let mut position = 0;
225        let mut child_iter = text.chars();
226        let mut this_iter = self.text.chars();
227        loop {
228            let text_next_char = child_iter.next();
229            let this_next_char = this_iter.next();
230
231            match (text_next_char, this_next_char) {
232                (Some(x), Some(y)) if x == y => {
233                    // continue
234                },
235                (Some(input_next), Some(this_next)) if input_next != this_next => {
236                    // split
237                    let current_child_weight = self.char_weight_of_children();
238                    let existing_remainder = grapheme_slicer_until_end(self.text.as_str(), position);
239                    // let existing_remainder = self.text[position..].to_string();
240
241                    let mut new_node = Node {
242                        text: existing_remainder.clone(),
243                        terminal: self.terminal, // if I was terminal, then suffice to say my splitted up self is also terminal
244                        children: Default::default(),
245                        value: None,
246                        visit_count: std::sync::atomic::AtomicU64::new(self.visit_count()),
247                        #[cfg(feature = "std")]
248                        node_id: gen_id(),
249                        weight: current_child_weight + existing_remainder.len()
250                    };
251                    // exhange my children for the new node (I am empty and will add a new node back)
252                    std::mem::swap(&mut new_node.children, &mut self.children);
253                    std::mem::swap(&mut new_node.value, &mut self.value);
254                    let first_char_of_existing_remainder = existing_remainder.chars().next().unwrap();
255                    // new node was created but same num chars which was split between 2 nodes
256                    node_count.fetch_add(1, Relaxed);
257                    self.children.insert(first_char_of_existing_remainder, new_node);
258
259
260                    let common = grapheme_slicer_until_point(text, position);
261
262                    let remainder = grapheme_slicer_until_end(text, position);
263                    // let common = &text[0..position];
264                    // let remainder = &text[position..];
265
266                    let input_new_node = Node {
267                        text: remainder.to_string(),
268                        terminal: true,
269                        children: Default::default(),
270                        value,
271                        visit_count: std::sync::atomic::AtomicU64::new(self.visit_count()),
272                        #[cfg(feature = "tracing")]
273                        node_id: gen_id(),
274                        weight: remainder.len()
275                    };
276
277                    let c = remainder.chars().next().unwrap();
278                    // new node added
279                    node_count.fetch_add(1, Relaxed);
280                    // remainder chars added to tree
281                    char_count.fetch_add(remainder.len() as u32, Relaxed);
282                    self.children.insert(c, input_new_node);
283                    // let delta_weight = self.text.len() - common.len();
284                    self.text = common.to_string();
285                    self.terminal = false;
286                    self.weight = self.text.len() + self.char_weight_of_children();
287                    // self.weight += (delta_weight + remainder.len());
288                    return;
289
290
291
292                }
293                (None, Some(c)) => {
294                    // let prefix = self.text[0..position].to_string();
295                    // let remainder = self.text[position..].to_string();
296
297                    let prefix = grapheme_slicer_until_point(self.text.as_str(), position);
298
299                    let remainder = grapheme_slicer_until_end(self.text.as_str(), position);
300                    self.text = prefix.to_string();
301                    self.terminal = true;
302                    // self.nested_chars += remainder.len();
303                    let current_child_weight = self.char_weight_of_children();
304                    if let Some(child) = self.children.get_mut(&c) {
305                        let taken = std::mem::take(&mut self.value);
306                        self.value = value;
307
308                        child.insert(remainder.as_str(), taken, node_count, char_count);
309                        let new_char_weight = self.char_weight_of_children();
310                        let delta = new_char_weight - current_child_weight;
311                        self.weight += delta;
312                        return
313                    }
314                    let new_node = Node {
315                        text: remainder.to_string(),
316                        terminal: true,
317                        children: Default::default(),
318                        value,
319                        visit_count: std::sync::atomic::AtomicU64::new(self.visit_count()),
320                        #[cfg(feature = "tracing")]
321                        node_id: gen_id(),
322                        weight: remainder.len()
323                    };
324                    node_count.fetch_add(1, Relaxed);
325                    char_count.fetch_add(remainder.len() as u32, Relaxed);
326                    self.children.insert(c, new_node);
327                    self.terminal = true;
328                    return;
329                },
330                (Some(text_next), None) => {
331                    let remainder = grapheme_slicer_until_end(text, position);
332                    // let remainder = &text[position..];
333
334                    // self.nested_chars += remainder.len();
335                    let current_child_weight = self.char_weight_of_children();
336                    if let Some(next) = self.children.get_mut(&text_next) {
337
338                        next.insert(remainder.as_str(), value, node_count, char_count);
339                        let new_weight_of_children = self.char_weight_of_children();
340                        let delta = new_weight_of_children - current_child_weight;
341                        self.weight += delta;
342                        return
343                    }
344                    // make new child
345                    let new_node = Node {
346                        text: remainder.to_string(),
347                        terminal: true,
348                        children: Default::default(),
349                        value,
350                        visit_count: std::sync::atomic::AtomicU64::new(self.visit_count()),
351                        #[cfg(feature = "tracing")]
352                        node_id: gen_id(),
353                        weight: remainder.len()
354                    };
355                    self.weight += remainder.len();
356                    node_count.fetch_add(1, Relaxed);
357                    char_count.fetch_add(remainder.len() as u32, Relaxed);
358                    self.children.insert(text_next, new_node);
359                    return;
360
361                }
362                (None, None) => {
363                    self.terminal = true;
364                    if self.value.is_none() {
365                        self.value = value;
366                    }
367                    return;
368                }
369                _ => {panic!("Should never be here {} {}", self.text, text )} // compiler yells that it wants this case but I don't see how it could occur
370            }
371            position += 1;
372
373
374
375        }
376    }
377
378    fn match_on_treated_suffix_trees(&self, prefix: &[(Tagged, Offset)], options: &MatchingOptions) -> Vec<&Node<V>> {
379        if prefix.len() == 0 {
380            return vec![];
381        }
382        // i don't think this is necessarily correct - example if you have multiple whitespaces
383        let tagged_char = &prefix.first().unwrap().0.char();
384
385        let mut v = self.children.iter().filter(|(key, _node)| {
386            key == tagged_char ||
387            options.treatments.contains_key(key)
388        } ).map(|(_, n)| {
389            n.suffix_tree_with_options(prefix, options)
390        }).flatten().collect::<Vec<_>>();
391        v.sort_by(|x,y| y.weight.cmp(&x.weight));
392        v
393    }
394
395    fn suffix_tree_with_options(&self, prefix: &[(Tagged, Offset)], options: &MatchingOptions) -> Option<&Node<V>> {
396        // update visit count
397
398
399        self.visit_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
400        let self_tagged = options.tag(self.text.as_str());
401        #[cfg(feature = "tracing")]
402        if true {
403            trace!("this: {:?} {:?}", self.text, self.node_id);
404            trace!("self_tagged: {:?}", self_tagged);
405            trace!("prefix: {:?}", prefix);
406            trace!("children {:?}", self.children.keys());
407        }
408
409        let this_iter = self_tagged.chars.iter();
410        let taken = prefix.iter().zip(this_iter).take_while(|((x, _),(y, _))| {x == y}).collect::<Vec<_>>().len();
411        // trace!("taken: {}", taken);
412        if taken < prefix.len() {
413            // trace!("prefix token after: {:?}", &prefix[taken]);
414        }
415        if taken < self_tagged.chars.len() {
416            // trace!("this token after {:?}", &self_tagged.chars[taken]);
417        }
418        match taken {
419            x if x < prefix.len() && x < self_tagged.chars.len() => {
420                // we have a mismatch at some position and need to attempt to branch
421                let (_,offset) = self_tagged.chars.get(x).unwrap();
422                let continuation = grapheme_slicer_until_end(self.text.as_str(), *offset);
423                // let continuation = &self.text.as_str()[*offset..];
424                let child_ = continuation.chars().next().unwrap();
425                let new_prefix = &prefix[x..];
426                let best_attempt = self.match_on_treated_suffix_trees(new_prefix, options).into_iter().next();
427                if let Some(child) = self.children.get(&child_) {
428                    let c = child.suffix_tree_with_options(new_prefix, options);
429                    match (best_attempt, c) {
430                        (Some(best_attempt), Some(c)) if best_attempt.weight > c.weight => return Some(best_attempt),
431                        (Some(best_attempt), _) => return Some(best_attempt),
432                        (_, Some(c)) => return Some(c),
433                        _ => {}
434                    }
435                    // if let Some(res) = child.suffix_tree_with_options(new_prefix, options) {
436                    //     return Some(res)
437                    // }
438                    // return child.suffix_tree_with_options(new_prefix, options)
439                }
440                return best_attempt;
441                // return None
442            }
443            x if x == prefix.len() && x <= self_tagged.chars.len() => {
444                return Some(self) // this is the terminating node
445            }
446            x if x == self_tagged.chars.len() && x < prefix.len() => {
447                let (last,_o) = prefix.get(x).unwrap();
448                match last {
449                    Tagged::Char(c) | Tagged::Sentinel(_, c) => {
450                        // all this since choosing a single path
451                        // perhaps the better solution is to return both options
452                        // TODO: define the proper strategy here
453                        let new_prefix = &prefix[x..];
454                        let mut treated = self.match_on_treated_suffix_trees(new_prefix, options);
455                        treated.sort_by(|x,y| {
456                            y.weight.partial_cmp(&x.weight).unwrap()
457                        });
458                        let best = treated.into_iter().next();
459
460                        if let Some(child) = self.children.get(c) {
461                            let c = child.suffix_tree_with_options(new_prefix, options);
462                            match (best,c) {
463                                (Some(best), Some(c)) if best.weight > c.weight => {
464                                    trace!("choose fuzzy over exact match");
465                                    return Some(best)}
466                                (Some(best), _) => {
467                                    trace!("choose fuzzy (no exact match)");
468                                    return Some(best)
469                                },
470                                (_, Some(c)) => return Some(c),
471                                _ => {}
472                            }
473
474                        }
475                        return best
476                    }
477
478                }
479
480            }
481            _ => {
482                // should not get here
483                return None
484            }
485        }
486    }
487
488    /// returns the suffix tree for the given prefix
489    /// if you want to include partial results in the case that the node text contains the prefix text but possibly longer, then include_partial should be set to true
490    /// example: Node: abcdef, prefix: abc
491    /// include_partial: false => None
492    /// include_partial: true => Some(self) // overlaps on abc
493    pub fn suffix_root(&self, prefix: &str) -> Option<&Node<V>> {
494        // update visit count
495        self.visit_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
496        let mut position = 0;
497        let mut this_iter = self.text.chars();
498        let mut other_iter = prefix.chars();
499        loop {
500            let this_next_char = this_iter.next();
501            let other_next_char = other_iter.next();
502            match (this_next_char, other_next_char) {
503                (Some(x), Some(y)) => {
504                    if x != y {
505                        // return nothing
506                        if !x.is_whitespace() && !y.is_whitespace() {
507                            return None
508                        }
509
510                    }
511                },
512                (None, Some(x)) => {
513                    if let Some(next_child) = self.children.get(&x) {
514                        return next_child.suffix_root(grapheme_slicer_until_end(prefix, position).as_str())
515                    }
516                    return None
517                }
518                (Some(x), None) => {
519                    if let Some(child) = self.children.get(&x) {
520                        return child.suffix_root(prefix)
521                    }
522                    return Some(self)
523                }
524                (None, None) => {
525                    return Some(self);
526                }
527            }
528            position += 1;
529        }
530    }
531
532    /// returns all the suffixes below this node
533    /// notice that we pass the prefix as an input parameter because there may be some overlap between the node contents and the prefix which needs to be stripped away
534    /// for example a tree like "ab" -> "cde"
535    /// for a given input "abc" we would be lead to this node
536    /// so we want to return results like "abcde" and not "abccde" (notice the "c" appears twice if blindly appending)
537    /// notice that there is an edge case here which is not yet handled where the match might have been fuzzy with options and that overlap is not handled correctly
538    /// for instance if this node text ends with white space
539    fn get_suffixes<'a>(&'a self, is_root: bool, prefix: &str, collector: &mut Vec<String>) -> Vec<Entry<'a, V>> {
540        // update visit count
541        self.visit_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
542        //Entry
543        if !is_root {
544            collector.push(self.text.clone());
545        } else {
546            if let Some(pos) = find_common_overlap_of_prefix_with_node(prefix, self.text.as_str()) {
547                if pos < self.text.len() {
548                    collector.push(grapheme_slicer_until_end(self.text.as_str(), pos));
549                }
550            } else {
551                collector.push(self.text.clone());
552            }
553        }
554
555        let mut v: Vec<Entry<'a ,V>> = Vec::<Entry<V>>::new();
556        if self.terminal {
557            let jo = collector.join("");
558            let entry: Entry<'a, V> = Entry {
559                key: jo,
560                val: &self.value
561            };
562            v.push(entry);
563        }
564        let mut children = self.children.values();
565        while let Some(child) = children.next() {
566            let mut add = child.get_suffixes(false, prefix, collector);
567            v.append(&mut add);
568        }
569        collector.pop();
570        v
571    }
572
573    pub fn get_string_suffixes(&self, is_root: bool, prefix: &str, collector: &mut Vec<String>, emit: &mut HashSet<String>) {
574        // update visit count
575        self.visit_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
576        if !is_root {
577            collector.push(self.text.clone());
578        } else {
579            if let Some(pos) = find_common_overlap_of_prefix_with_node(prefix, self.text.as_str()) {
580                if pos < self.text.len() {
581                    collector.push(grapheme_slicer_until_end(self.text.as_str(), pos));
582                }
583            }
584
585        }
586
587        if self.terminal {
588            let jo = collector.join("");
589            emit.insert(jo);
590        }
591        let mut children = self.children.values();
592        while let Some(child) = children.next() {
593            child.get_string_suffixes(false, prefix, collector, emit);
594        }
595        collector.pop();
596    }
597
598    fn visit_count(&self) -> u64 {
599        self.visit_count.load(std::sync::atomic::Ordering::Relaxed)
600    }
601}
602
603#[derive(Debug)]
604pub struct Entry<'a, V> {
605    pub key: String,
606    pub val: &'a Option<V>
607}
608
609pub enum CharacterSet {
610    /// spaces and tabs
611    WhiteSpaces,
612    /// just \n
613    NewLines,
614    /// spaces and tabs and \b
615    WhiteSpacesAndNewLines,
616    /// Capitalized are treated like lower cased
617    CapitalizedLetters,
618
619    Char(HashSet<char>)
620
621}
622
623#[derive(Debug,Clone,PartialEq,Eq)]
624pub enum NormalizedChar {
625    // this char can be squashed since it i
626    Squash,
627    Char(char),
628    // sentinal hash value for this equivalnce set
629    Sentinal(u64,char),
630}
631
632#[derive(Clone,Debug)]
633enum Tagged {
634    Char(char),
635    /// for use when replacing sets of characters with a common code
636    /// i.e consider a,b,c,d,e,f as identical characters - will be a Sentinel(hash, real char)
637    Sentinel(u64, char)
638}
639
640impl Tagged {
641    fn char(&self) -> &char {
642        match self {
643            Tagged::Char(x) => {x}
644            Tagged::Sentinel(_, x) => {x}
645        }
646    }
647}
648
649impl PartialEq for Tagged {
650    fn eq(&self, other: &Self) -> bool {
651        match (self,other) {
652            (Tagged::Char(c1), Tagged::Char(c2)) => c1 == c2,
653            (Tagged::Sentinel(h1, _),Tagged::Sentinel(h2, _) ) => h1 == h2,
654            _ => false
655        }
656    }
657}
658
659
660/// the originating offset in that string
661type Offset = usize;
662/// the idea is that for a given string you map to the offset for a trimmed character set
663#[derive(Clone,Debug)]
664struct TaggedString {
665    chars: Vec<(Tagged, Offset)>
666}
667
668impl TaggedString {
669    /// tags this string
670    /// example: "abc" => < (a,0), (b,1), (b,2) >
671    #[allow(dead_code)]
672    fn new(str: &str) -> Self {
673        Self {
674            chars: str.chars().enumerate().map(|(offset, char)| {
675                (Tagged::Char(char), offset)
676            }).collect()
677        }
678    }
679}
680
681
682/// tags the strings with the offsets
683impl From<Vec<NormalizedChar>> for TaggedString {
684    fn from(chars: Vec<NormalizedChar>) -> Self {
685        let tagged  = chars.into_iter().enumerate().flat_map(|(offset,y)| {
686            match y {
687                NormalizedChar::Squash => {None}
688                NormalizedChar::Char(x) => {Some((Tagged::Char(x), offset))}
689                NormalizedChar::Sentinal(hash, char) => {Some((Tagged::Sentinel(hash,char), offset))}
690            }
691        }).collect();
692        Self {
693            chars: tagged
694        }
695    }
696}
697
698impl CharacterSet {
699    pub fn normalized_char(&self, char: char) -> NormalizedChar {
700        match self {
701            CharacterSet::WhiteSpaces if char.is_whitespace() => {
702                NormalizedChar::Squash
703            }
704            CharacterSet::NewLines if char == '\n' => { NormalizedChar::Squash}
705            CharacterSet::WhiteSpacesAndNewLines if char == '\n' || char == ' '=> { NormalizedChar::Squash}
706            CharacterSet::CapitalizedLetters => { NormalizedChar::Char(char.to_uppercase().next().unwrap_or(char))}
707            CharacterSet::Char(x) if x.contains(&char)=> {
708                let mut v = x.iter().collect::<Vec<_>>();
709                v.sort(); // not sure if hash is dependant on ordering
710                let mut s = DefaultHasher::new();
711
712                v.hash(&mut s);
713                NormalizedChar::Sentinal(s.finish(), char)
714            }
715            _ => NormalizedChar::Char(char)
716        }
717    }
718}
719
720fn encode(str: &str, treatments: &HashMap<char, CharacterSet>) -> Vec<NormalizedChar> {
721    str.chars().map(|c| {
722            treatments.get(&c).map(|t| t.normalized_char(c)).unwrap_or_else(|| NormalizedChar::Char(c))
723        }
724    ).collect::<Vec<_>>()
725}
726
727/// since slicing a string by bytes will not work when you have long unicode characters (wide grapheme cluster?)
728/// take chars and allocate a new string (unless you have a way to slice it by cluster???).
729/// TODO: figure out how to return a slice instead of new string
730fn grapheme_slicer_until_end(str: &str, from:usize) -> String {
731    // figure how to slice char boundary and not allocate a new string
732    let temp = str.chars().collect::<Vec<_>>();
733    let x = &temp.as_slice()[from..temp.len()];
734    x.iter().collect()
735}
736
737fn grapheme_slicer_until_point(str: &str, until: usize) -> String {
738
739    let temp = str.chars().collect::<Vec<_>>();
740    let x = &temp.as_slice()[0..until];
741    x.iter().collect()
742}
743
744#[test]
745fn test_shitty_slicer() {
746    let text = "🤡abcded🤡";
747    let pref = grapheme_slicer_until_point(text, 4);
748    assert_eq!(pref, "🤡abc");
749}
750/// describes matching options
751/// you supply a mapping of characters to the character set to match against
752/// for example * matches against all characters
753pub struct MatchingOptions {
754    treatments: HashMap<char, CharacterSet>, // TODO: need to check if "char" supports emoji and other wide characters
755}
756
757impl MatchingOptions {
758    /// exact match only
759    pub fn exact() -> Self {
760        Self {
761            treatments: Default::default()
762        }
763    }
764    /// match but accept white space differences
765    pub fn ignoring_white_space() -> Self {
766        let mut treatments = HashMap::new();
767        treatments.insert(' ', CharacterSet::WhiteSpaces);
768        treatments.insert('\t', CharacterSet::WhiteSpaces);
769        Self {
770            treatments
771        }
772    }
773
774    /// match but accept diff in new lines
775    pub fn ignoring_new_lines() -> Self {
776        let mut treatments = HashMap::new();
777        treatments.insert('\n', CharacterSet::WhiteSpaces);
778        Self {
779            treatments
780        }
781    }
782    /// match but accept new lines and whitespace differences
783    pub fn ignoring_white_space_and_new_lines() -> Self {
784        let mut treatments = HashMap::new();
785        treatments.insert(' ', CharacterSet::WhiteSpaces);
786        treatments.insert('\t', CharacterSet::WhiteSpaces);
787        treatments.insert('\n', CharacterSet::WhiteSpaces);
788        Self {
789            treatments
790        }
791    }
792
793    fn tag(&self, str: &str) -> TaggedString {
794        let encoded = encode(str, &self.treatments);
795        TaggedString::from(encoded)
796    }
797}
798
799fn find_common_overlap_of_prefix_with_node(prefix: &str, node: &str) -> Option<usize>{
800    // node - omeabcde
801    // prefix rome
802    // result o
803    for offset in (0..prefix.len()).rev() {
804        let str = grapheme_slicer_until_end(&prefix, offset);
805        if node.starts_with(str.as_str()) {
806            return Some(str.len())
807        }
808    }
809    None
810
811}
812
813
814#[test]
815fn test_squashing() {
816    let node_text = "   abc    def       iop \t\t\t qwe   ";
817    let input = "abc def iop     qwe";
818
819    let internal = MatchingOptions::ignoring_white_space();
820    let nt1 = internal.tag(node_text);
821    let nt2 = internal.tag(input);
822
823    println!("{:?}", nt1);
824    println!("{:?}", nt2);
825}
826
827#[test]
828fn test_empty() {
829    let mut trie: Trie<String> = Trie::new();
830    trie.insert("", None);
831    let results = trie.get_suffixes_values("");
832    assert!(results.is_none());
833    trie.insert("romanus", None);
834    let results = trie.get_suffixes_values("");
835    assert!(results.is_none());
836    trie.insert("romulus", None);
837    let results = trie.get_suffixes_values("");
838    assert!(results.is_none());
839    trie.insert("rubens", None);
840    let results = trie.get_suffixes_values("");
841    assert!(results.is_none());
842    trie.insert("ruber", None);
843    let results = trie.get_suffixes_values("");
844    assert!(results.is_none());
845    trie.insert("rubicon", None);
846    let results = trie.get_suffixes_values("");
847    assert!(results.is_none());
848    trie.insert("rubicundus", None);
849    let results = trie.get_suffixes_values("");
850    assert!(results.is_none());
851}
ab_radix_trie/lib.rs

ab_radix_trie/
lib.rs