cedarwood/
lib.rs

1//!  Efficiently-updatable double-array trie in Rust (ported from cedar).
2//!
3//! Add it to your `Cargo.toml`:
4//!
5//! ```toml
6//! [dependencies]
7//! cedarwood = "0.4"
8//! ```
9//!
10//! then you are good to go. If you are using Rust 2015 you have to `extern crate cedarwood` to your crate root as well.
11//!
12//! ## Example
13//!
14//! ```rust
15//! use cedarwood::Cedar;
16//!
17//! let dict = vec![
18//!     "a",
19//!     "ab",
20//!     "abc",
21//!     "アルゴリズム",
22//!     "データ",
23//!     "構造",
24//!     "网",
25//!     "网球",
26//!     "网球拍",
27//!     "中",
28//!     "中华",
29//!     "中华人民",
30//!     "中华人民共和国",
31//! ];
32//! let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
33//! let mut cedar = Cedar::new();
34//! cedar.build(&key_values);
35//!
36//! let result: Vec<i32> = cedar.common_prefix_search("abcdefg").unwrap().iter().map(|x| x.0).collect();
37//! assert_eq!(vec![0, 1, 2], result);
38//!
39//! let result: Vec<i32> = cedar
40//!     .common_prefix_search("网球拍卖会")
41//!     .unwrap()
42//!     .iter()
43//!     .map(|x| x.0)
44//!     .collect();
45//! assert_eq!(vec![6, 7, 8], result);
46//!
47//! let result: Vec<i32> = cedar
48//!     .common_prefix_search("中华人民共和国")
49//!     .unwrap()
50//!     .iter()
51//!     .map(|x| x.0)
52//!     .collect();
53//! assert_eq!(vec![9, 10, 11, 12], result);
54//!
55//! let result: Vec<i32> = cedar
56//!     .common_prefix_search("データ構造とアルゴリズム")
57//!     .unwrap()
58//!     .iter()
59//!     .map(|x| x.0)
60//!     .collect();
61//! assert_eq!(vec![4], result);
62//! ```
63
64use smallvec::SmallVec;
65use std::fmt;
66
67/// NInfo stores the information about the trie
68#[derive(Debug, Default, Clone)]
69struct NInfo {
70    sibling: u8, // the index of right sibling, it is 0 if it doesn't have a sibling.
71    child: u8,   // the index of the first child
72}
73
74/// Node contains the array of `base` and `check` as specified in the paper: "An efficient implementation of trie structures"
75/// https://dl.acm.org/citation.cfm?id=146691
76#[derive(Debug, Default, Clone)]
77struct Node {
78    base_: i32, // if it is a negative value, then it stores the value of previous index that is free.
79    check: i32, // if it is a negative value, then it stores the value of next index that is free.
80}
81
82impl Node {
83    #[inline]
84    fn base(&self) -> i32 {
85        #[cfg(feature = "reduced-trie")]
86        return -(self.base_ + 1);
87        #[cfg(not(feature = "reduced-trie"))]
88        return self.base_;
89    }
90}
91
92/// Block stores the linked-list pointers and the stats info for blocks.
93#[derive(Debug, Clone)]
94struct Block {
95    prev: i32,   // previous block's index, 3 bytes width
96    next: i32,   // next block's index, 3 bytes width
97    num: i16,    // the number of slots that is free, the range is 0-256
98    reject: i16, // a heuristic number to make the search for free space faster, it is the minimum number of iteration in each trie node it has to try before we can conclude that we can reject this block. If the number of kids for the block we are looking for is less than this number then this block is worthy of searching.
99    trial: i32,  // the number of times this block has been probed by `find_places` for the free block.
100    e_head: i32, // the index of the first empty elemenet in this block
101}
102
103impl Block {
104    pub fn new() -> Self {
105        Block {
106            prev: 0,
107            next: 0,
108            num: 256,    // each of block has 256 free slots at the beginning
109            reject: 257, // initially every block need to be fully iterated through so that we can reject it to be unusable.
110            trial: 0,
111            e_head: 0,
112        }
113    }
114}
115
116/// Blocks are marked as either of three categories, so that we can quickly decide if we can
117/// allocate it for use or not.
118enum BlockType {
119    Open,   // The block has spaces more than 1.
120    Closed, // The block is only left with one free slot
121    Full,   // The block's slots are fully used.
122}
123
124/// `Cedar` holds all of the information about double array trie.
125#[derive(Clone)]
126pub struct Cedar {
127    array: Vec<Node>, // storing the `base` and `check` info from the original paper.
128    n_infos: Vec<NInfo>,
129    blocks: Vec<Block>,
130    reject: Vec<i16>,
131    blocks_head_full: i32,   // the index of the first 'Full' block, 0 means no 'Full' block
132    blocks_head_closed: i32, // the index of the first 'Closed' block, 0 means no ' Closed' block
133    blocks_head_open: i32,   // the index of the first 'Open' block, 0 means no 'Open' block
134    capacity: usize,
135    size: usize,
136    ordered: bool,
137    max_trial: i32, // the parameter for cedar, it could be tuned for more, but the default is 1.
138}
139
140impl fmt::Debug for Cedar {
141    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
142        write!(f, "Cedar(size={}, ordered={})", self.size, self.ordered)
143    }
144}
145
146#[allow(dead_code)]
147const CEDAR_VALUE_LIMIT: i32 = std::i32::MAX - 1;
148const CEDAR_NO_VALUE: i32 = -1;
149
150/// Iterator for `common_prefix_search`
151#[derive(Clone)]
152pub struct PrefixIter<'a> {
153    cedar: &'a Cedar,
154    key: &'a [u8],
155    from: usize,
156    i: usize,
157}
158
159impl<'a> Iterator for PrefixIter<'a> {
160    type Item = (i32, usize);
161
162    fn size_hint(&self) -> (usize, Option<usize>) {
163        (0, Some(self.key.len()))
164    }
165
166    fn next(&mut self) -> Option<Self::Item> {
167        while self.i < self.key.len() {
168            if let Some(value) = self.cedar.find(&self.key[self.i..=self.i], &mut self.from) {
169                if value == CEDAR_NO_VALUE {
170                    self.i += 1;
171                    continue;
172                } else {
173                    let result = Some((value, self.i));
174                    self.i += 1;
175                    return result;
176                }
177            } else {
178                break;
179            }
180        }
181
182        None
183    }
184}
185
186/// Iterator for `common_prefix_predict`
187#[derive(Clone)]
188pub struct PrefixPredictIter<'a> {
189    cedar: &'a Cedar,
190    key: &'a [u8],
191    from: usize,
192    p: usize,
193    root: usize,
194    value: Option<i32>,
195}
196
197impl<'a> PrefixPredictIter<'a> {
198    fn next_until_none(&mut self) -> Option<(i32, usize)> {
199        #[allow(clippy::never_loop)]
200        while let Some(value) = self.value {
201            let result = (value, self.p);
202
203            let (v_, from_, p_) = self.cedar.next(self.from, self.p, self.root);
204            self.from = from_;
205            self.p = p_;
206            self.value = v_;
207
208            return Some(result);
209        }
210
211        None
212    }
213}
214
215impl<'a> Iterator for PrefixPredictIter<'a> {
216    type Item = (i32, usize);
217
218    fn next(&mut self) -> Option<Self::Item> {
219        if self.from == 0 && self.p == 0 {
220            // To locate the prefix's position first, if it doesn't exist then that means we
221            // don't have do anything. `from` would serve as the cursor.
222            if self.cedar.find(self.key, &mut self.from).is_some() {
223                self.root = self.from;
224
225                let (v_, from_, p_) = self.cedar.begin(self.from, self.p);
226                self.from = from_;
227                self.p = p_;
228                self.value = v_;
229
230                self.next_until_none()
231            } else {
232                None
233            }
234        } else {
235            self.next_until_none()
236        }
237    }
238}
239
240#[allow(clippy::cast_lossless)]
241impl Cedar {
242    /// Initialize the Cedar for further use.
243    #[allow(clippy::new_without_default)]
244    pub fn new() -> Self {
245        let mut array: Vec<Node> = Vec::with_capacity(256);
246        let n_infos: Vec<NInfo> = (0..256).map(|_| Default::default()).collect();
247        let mut blocks: Vec<Block> = vec![Block::new(); 1];
248        let reject: Vec<i16> = (0..=256).map(|i| i + 1).collect();
249
250        #[cfg(feature = "reduced-trie")]
251        array.push(Node { base_: -1, check: -1 });
252        #[cfg(not(feature = "reduced-trie"))]
253        array.push(Node { base_: 0, check: -1 });
254
255        for i in 1..256 {
256            // make `base_` point to the previous element, and make `check` point to the next element
257            array.push(Node {
258                base_: -(i - 1),
259                check: -(i + 1),
260            })
261        }
262
263        // make them link as a cyclic doubly-linked list
264        array[1].base_ = -255;
265        array[255].check = -1;
266
267        blocks[0].e_head = 1;
268
269        Cedar {
270            array,
271            n_infos,
272            blocks,
273            reject,
274            blocks_head_full: 0,
275            blocks_head_closed: 0,
276            blocks_head_open: 0,
277            capacity: 256,
278            size: 256,
279            ordered: true,
280            max_trial: 1,
281        }
282    }
283
284    /// Build the double array trie from the given key value pairs
285    #[allow(dead_code)]
286    pub fn build(&mut self, key_values: &[(&str, i32)]) {
287        for (key, value) in key_values {
288            self.update(key, *value);
289        }
290    }
291
292    /// Update the key for the value, it is public interface that works on &str
293    pub fn update(&mut self, key: &str, value: i32) {
294        let from = 0;
295        let pos = 0;
296        self.update_(key.as_bytes(), value, from, pos);
297    }
298
299    // Update the key for the value, it is internal interface that works on &[u8] and cursor.
300    fn update_(&mut self, key: &[u8], value: i32, mut from: usize, mut pos: usize) -> i32 {
301        if from == 0 && key.is_empty() {
302            panic!("failed to insert zero-length key");
303        }
304
305        while pos < key.len() {
306            #[cfg(feature = "reduced-trie")]
307            {
308                let val_ = self.array[from].base_;
309                if val_ >= 0 && val_ != CEDAR_VALUE_LIMIT {
310                    let to = self.follow(from, 0);
311                    self.array[to as usize].base_ = val_;
312                }
313            }
314
315            from = self.follow(from, key[pos]) as usize;
316            pos += 1;
317        }
318
319        #[cfg(feature = "reduced-trie")]
320        let to = if self.array[from].base_ >= 0 {
321            from as i32
322        } else {
323            self.follow(from, 0)
324        };
325
326        #[cfg(feature = "reduced-trie")]
327        {
328            if self.array[to as usize].base_ == CEDAR_VALUE_LIMIT {
329                self.array[to as usize].base_ = 0;
330            }
331        }
332
333        #[cfg(not(feature = "reduced-trie"))]
334        let to = self.follow(from, 0);
335
336        self.array[to as usize].base_ = value;
337        self.array[to as usize].base_
338    }
339
340    // To move in the trie by following the `label`, and insert the node if the node is not there,
341    // it is used by the `update` to populate the trie.
342    #[inline]
343    fn follow(&mut self, from: usize, label: u8) -> i32 {
344        let base = self.array[from].base();
345
346        #[allow(unused_assignments)]
347        let mut to = 0;
348
349        // the node is not there
350        if base < 0 || self.array[(base ^ (label as i32)) as usize].check < 0 {
351            // allocate a e node
352            to = self.pop_e_node(base, label, from as i32);
353            let branch: i32 = to ^ (label as i32);
354
355            // maintain the info in ninfo
356            self.push_sibling(from, branch, label, base >= 0);
357        } else {
358            // the node is already there and the ownership is not `from`, therefore a conflict.
359            to = base ^ (label as i32);
360            if self.array[to as usize].check != (from as i32) {
361                // call `resolve` to relocate.
362                to = self.resolve(from, base, label);
363            }
364        }
365
366        to
367    }
368
369    // Find key from double array trie, with `from` as the cursor to traverse the nodes.
370    fn find(&self, key: &[u8], from: &mut usize) -> Option<i32> {
371        #[allow(unused_assignments)]
372        let mut to: usize = 0;
373        let mut pos = 0;
374
375        // recursively matching the key.
376        while pos < key.len() {
377            #[cfg(feature = "reduced-trie")]
378            {
379                if self.array[*from].base_ >= 0 {
380                    break;
381                }
382            }
383
384            to = (self.array[*from].base() ^ (key[pos] as i32)) as usize;
385            if self.array[to as usize].check != (*from as i32) {
386                return None;
387            }
388
389            *from = to;
390            pos += 1;
391        }
392
393        #[cfg(feature = "reduced-trie")]
394        {
395            if self.array[*from].base_ >= 0 {
396                if pos == key.len() {
397                    return Some(self.array[*from].base_);
398                } else {
399                    return None;
400                }
401            }
402        }
403
404        // return the value of the node if `check` is correctly marked fpr the ownership, otherwise
405        // it means no value is stored.
406        let n = &self.array[(self.array[*from].base()) as usize];
407        if n.check != (*from as i32) {
408            Some(CEDAR_NO_VALUE)
409        } else {
410            Some(n.base_)
411        }
412    }
413
414    /// Delete the key from the trie, the public interface that works on &str
415    pub fn erase(&mut self, key: &str) {
416        self.erase_(key.as_bytes())
417    }
418
419    // Delete the key from the trie, the internal interface that works on &[u8]
420    fn erase_(&mut self, key: &[u8]) {
421        let mut from = 0;
422
423        // move the cursor to the right place and use erase__ to delete it.
424        if let Some(v) = self.find(key, &mut from) {
425            if v != CEDAR_NO_VALUE {
426                self.erase__(from);
427            }
428        }
429    }
430
431    fn erase__(&mut self, mut from: usize) {
432        #[cfg(feature = "reduced-trie")]
433        let mut e: i32 = if self.array[from].base_ >= 0 {
434            from as i32
435        } else {
436            self.array[from].base()
437        };
438
439        #[cfg(feature = "reduced-trie")]
440        {
441            from = self.array[e as usize].check as usize;
442        }
443
444        #[cfg(not(feature = "reduced-trie"))]
445        let mut e = self.array[from].base();
446
447        #[allow(unused_assignments)]
448        let mut has_sibling = false;
449        loop {
450            let n = self.array[from].clone();
451            has_sibling = self.n_infos[(n.base() ^ (self.n_infos[from].child as i32)) as usize].sibling != 0;
452
453            // if the node has siblings, then remove `e` from the sibling.
454            if has_sibling {
455                self.pop_sibling(from as i32, n.base(), (n.base() ^ e) as u8);
456            }
457
458            // maintain the data structures.
459            self.push_e_node(e);
460            e = from as i32;
461
462            // traverse to the parent.
463            from = self.array[from].check as usize;
464
465            // if it has sibling then this layer has more than one nodes, then we are done.
466            if has_sibling {
467                break;
468            }
469        }
470    }
471
472    /// To check if `key` is in the dictionary.
473    pub fn exact_match_search(&self, key: &str) -> Option<(i32, usize, usize)> {
474        let key = key.as_bytes();
475        let mut from = 0;
476
477        if let Some(value) = self.find(key, &mut from) {
478            if value == CEDAR_NO_VALUE {
479                return None;
480            }
481
482            Some((value, key.len(), from))
483        } else {
484            None
485        }
486    }
487
488    /// To return an iterator to iterate through the common prefix in the dictionary with the `key` passed in.
489    pub fn common_prefix_iter<'a>(&'a self, key: &'a str) -> PrefixIter<'a> {
490        let key = key.as_bytes();
491
492        PrefixIter {
493            cedar: self,
494            key,
495            from: 0,
496            i: 0,
497        }
498    }
499
500    /// To return the collection of the common prefix in the dictionary with the `key` passed in.
501    pub fn common_prefix_search(&self, key: &str) -> Option<Vec<(i32, usize)>> {
502        self.common_prefix_iter(key).map(Some).collect()
503    }
504
505    /// To return an iterator to iterate through the list of words in the dictionary that has `key` as their prefix.
506    pub fn common_prefix_predict_iter<'a>(&'a self, key: &'a str) -> PrefixPredictIter<'a> {
507        let key = key.as_bytes();
508
509        PrefixPredictIter {
510            cedar: self,
511            key,
512            from: 0,
513            p: 0,
514            root: 0,
515            value: None,
516        }
517    }
518
519    /// To return the list of words in the dictionary that has `key` as their prefix.
520    pub fn common_prefix_predict(&self, key: &str) -> Option<Vec<(i32, usize)>> {
521        self.common_prefix_predict_iter(key).map(Some).collect()
522    }
523
524    // To get the cursor of the first leaf node starting by `from`
525    fn begin(&self, mut from: usize, mut p: usize) -> (Option<i32>, usize, usize) {
526        let base = self.array[from].base();
527        let mut c = self.n_infos[from].child;
528
529        if from == 0 {
530            c = self.n_infos[(base ^ (c as i32)) as usize].sibling;
531
532            // if no sibling couldn be found from the virtual root, then we are done.
533            if c == 0 {
534                return (None, from, p);
535            }
536        }
537
538        // recursively traversing down to look for the first leaf.
539        while c != 0 {
540            from = (self.array[from].base() ^ (c as i32)) as usize;
541            c = self.n_infos[from].child;
542            p += 1;
543        }
544
545        #[cfg(feature = "reduced-trie")]
546        {
547            if self.array[from].base_ >= 0 {
548                return (Some(self.array[from].base_), from, p);
549            }
550        }
551
552        // To return the value of the leaf.
553        let v = self.array[(self.array[from].base() ^ (c as i32)) as usize].base_;
554        (Some(v), from, p)
555    }
556
557    // To move the cursor from one leaf to the next for the common_prefix_predict.
558    fn next(&self, mut from: usize, mut p: usize, root: usize) -> (Option<i32>, usize, usize) {
559        #[allow(unused_assignments)]
560        let mut c: u8 = 0;
561
562        #[cfg(feature = "reduced-trie")]
563        {
564            if self.array[from].base_ < 0 {
565                c = self.n_infos[(self.array[from].base()) as usize].sibling;
566            }
567        }
568        #[cfg(not(feature = "reduced-trie"))]
569        {
570            c = self.n_infos[(self.array[from].base()) as usize].sibling;
571        }
572
573        // traversing up until there is a sibling or it has reached the root.
574        while c == 0 && from != root {
575            c = self.n_infos[from as usize].sibling;
576            from = self.array[from as usize].check as usize;
577
578            p -= 1;
579        }
580
581        if c != 0 {
582            // it has a sibling so we leverage on `begin` to traverse the subtree down again.
583            from = (self.array[from].base() ^ (c as i32)) as usize;
584            let (v_, from_, p_) = self.begin(from, p + 1);
585            (v_, from_, p_)
586        } else {
587            // no more work since we couldn't find anything.
588            (None, from, p)
589        }
590    }
591
592    // pop a block at idx from the linked-list of type `from`, specially handled if it is the last
593    // one in the linked-list.
594    fn pop_block(&mut self, idx: i32, from: BlockType, last: bool) {
595        let head: &mut i32 = match from {
596            BlockType::Open => &mut self.blocks_head_open,
597            BlockType::Closed => &mut self.blocks_head_closed,
598            BlockType::Full => &mut self.blocks_head_full,
599        };
600
601        if last {
602            *head = 0;
603        } else {
604            let b = self.blocks[idx as usize].clone();
605            self.blocks[b.prev as usize].next = b.next;
606            self.blocks[b.next as usize].prev = b.prev;
607
608            if idx == *head {
609                *head = b.next;
610            }
611        }
612    }
613
614    // return the block at idx to the linked-list of `to`, specially handled if the linked-list is
615    // empty
616    fn push_block(&mut self, idx: i32, to: BlockType, empty: bool) {
617        let head: &mut i32 = match to {
618            BlockType::Open => &mut self.blocks_head_open,
619            BlockType::Closed => &mut self.blocks_head_closed,
620            BlockType::Full => &mut self.blocks_head_full,
621        };
622
623        if empty {
624            self.blocks[idx as usize].next = idx;
625            self.blocks[idx as usize].prev = idx;
626            *head = idx;
627        } else {
628            self.blocks[idx as usize].prev = self.blocks[*head as usize].prev;
629            self.blocks[idx as usize].next = *head;
630
631            let t = self.blocks[*head as usize].prev;
632            self.blocks[t as usize].next = idx;
633            self.blocks[*head as usize].prev = idx;
634            *head = idx;
635        }
636    }
637
638    /// Reallocate more spaces so that we have more free blocks.
639    fn add_block(&mut self) -> i32 {
640        if self.size == self.capacity {
641            self.capacity += self.capacity;
642
643            self.array.resize(self.capacity, Default::default());
644            self.n_infos.resize(self.capacity, Default::default());
645            self.blocks.resize(self.capacity >> 8, Block::new());
646        }
647
648        self.blocks[self.size >> 8].e_head = self.size as i32;
649
650        // make it a doubley linked list
651        self.array[self.size] = Node {
652            base_: -((self.size as i32) + 255),
653            check: -((self.size as i32) + 1),
654        };
655
656        for i in (self.size + 1)..(self.size + 255) {
657            self.array[i] = Node {
658                base_: -(i as i32 - 1),
659                check: -(i as i32 + 1),
660            };
661        }
662
663        self.array[self.size + 255] = Node {
664            base_: -((self.size as i32) + 254),
665            check: -(self.size as i32),
666        };
667
668        let is_empty = self.blocks_head_open == 0;
669        let idx = (self.size >> 8) as i32;
670        debug_assert!(self.blocks[idx as usize].num > 1);
671        self.push_block(idx, BlockType::Open, is_empty);
672
673        self.size += 256;
674
675        ((self.size >> 8) - 1) as i32
676    }
677
678    // transfer the block at idx from the linked-list of `from` to the linked-list of `to`,
679    // specially handle the case where the destination linked-list is empty.
680    fn transfer_block(&mut self, idx: i32, from: BlockType, to: BlockType, to_block_empty: bool) {
681        let is_last = idx == self.blocks[idx as usize].next; //it's the last one if the next points to itself
682        let is_empty = to_block_empty && (self.blocks[idx as usize].num != 0);
683
684        self.pop_block(idx, from, is_last);
685        self.push_block(idx, to, is_empty);
686    }
687
688    /// Mark an edge `e` as used in a trie node.
689    fn pop_e_node(&mut self, base: i32, label: u8, from: i32) -> i32 {
690        let e: i32 = if base < 0 {
691            self.find_place()
692        } else {
693            base ^ (label as i32)
694        };
695
696        let idx = e >> 8;
697        let n = self.array[e as usize].clone();
698
699        self.blocks[idx as usize].num -= 1;
700        // move the block at idx to the correct linked-list depending the free slots it still have.
701        if self.blocks[idx as usize].num == 0 {
702            if idx != 0 {
703                self.transfer_block(idx, BlockType::Closed, BlockType::Full, self.blocks_head_full == 0);
704            }
705        } else {
706            self.array[(-n.base_) as usize].check = n.check;
707            self.array[(-n.check) as usize].base_ = n.base_;
708
709            if e == self.blocks[idx as usize].e_head {
710                self.blocks[idx as usize].e_head = -n.check;
711            }
712
713            if idx != 0 && self.blocks[idx as usize].num == 1 && self.blocks[idx as usize].trial != self.max_trial {
714                self.transfer_block(idx, BlockType::Open, BlockType::Closed, self.blocks_head_closed == 0);
715            }
716        }
717
718        #[cfg(feature = "reduced-trie")]
719        {
720            self.array[e as usize].base_ = CEDAR_VALUE_LIMIT;
721            self.array[e as usize].check = from;
722            if base < 0 {
723                self.array[from as usize].base_ = -(e ^ (label as i32)) - 1;
724            }
725        }
726
727        #[cfg(not(feature = "reduced-trie"))]
728        {
729            if label != 0 {
730                self.array[e as usize].base_ = -1;
731            } else {
732                self.array[e as usize].base_ = 0;
733            }
734            self.array[e as usize].check = from;
735            if base < 0 {
736                self.array[from as usize].base_ = e ^ (label as i32);
737            }
738        }
739
740        e
741    }
742
743    /// Mark an edge `e` as free in a trie node.
744    fn push_e_node(&mut self, e: i32) {
745        let idx = e >> 8;
746        self.blocks[idx as usize].num += 1;
747
748        if self.blocks[idx as usize].num == 1 {
749            self.blocks[idx as usize].e_head = e;
750            self.array[e as usize] = Node { base_: -e, check: -e };
751
752            if idx != 0 {
753                // Move the block from 'Full' to 'Closed' since it has one free slot now.
754                self.transfer_block(idx, BlockType::Full, BlockType::Closed, self.blocks_head_closed == 0);
755            }
756        } else {
757            let prev = self.blocks[idx as usize].e_head;
758
759            let next = -self.array[prev as usize].check;
760
761            // Insert to the edge immediately after the e_head
762            self.array[e as usize] = Node {
763                base_: -prev,
764                check: -next,
765            };
766
767            self.array[prev as usize].check = -e;
768            self.array[next as usize].base_ = -e;
769
770            // Move the block from 'Closed' to 'Open' since it has more than one free slot now.
771            if self.blocks[idx as usize].num == 2 || self.blocks[idx as usize].trial == self.max_trial {
772                debug_assert!(self.blocks[idx as usize].num > 1);
773                if idx != 0 {
774                    self.transfer_block(idx, BlockType::Closed, BlockType::Open, self.blocks_head_open == 0);
775                }
776            }
777
778            // Reset the trial stats
779            self.blocks[idx as usize].trial = 0;
780        }
781
782        if self.blocks[idx as usize].reject < self.reject[self.blocks[idx as usize].num as usize] {
783            self.blocks[idx as usize].reject = self.reject[self.blocks[idx as usize].num as usize];
784        }
785
786        self.n_infos[e as usize] = Default::default();
787    }
788
789    // push the `label` into the sibling chain
790    fn push_sibling(&mut self, from: usize, base: i32, label: u8, has_child: bool) {
791        let keep_order: bool = if self.ordered {
792            label > self.n_infos[from].child
793        } else {
794            self.n_infos[from].child == 0
795        };
796
797        let sibling: u8;
798        {
799            let mut c: &mut u8 = &mut self.n_infos[from as usize].child;
800            if has_child && keep_order {
801                loop {
802                    let code = *c as i32;
803                    c = &mut self.n_infos[(base ^ code) as usize].sibling;
804
805                    if !(self.ordered && (*c != 0) && (*c < label)) {
806                        break;
807                    }
808                }
809            }
810            sibling = *c;
811
812            *c = label;
813        }
814
815        self.n_infos[(base ^ (label as i32)) as usize].sibling = sibling;
816    }
817
818    // remove the `label` from the sibling chain.
819    #[allow(dead_code)]
820    fn pop_sibling(&mut self, from: i32, base: i32, label: u8) {
821        let mut c: *mut u8 = &mut self.n_infos[from as usize].child;
822        unsafe {
823            while *c != label {
824                let code = *c as i32;
825                c = &mut self.n_infos[(base ^ code) as usize].sibling;
826            }
827
828            let code = label as i32;
829            *c = self.n_infos[(base ^ code) as usize].sibling;
830        }
831    }
832
833    // Loop through the siblings to see which one reached the end first, which means it is the one
834    // with smaller in children size, and we should try ti relocate the smaller one.
835    fn consult(&self, base_n: i32, base_p: i32, mut c_n: u8, mut c_p: u8) -> bool {
836        loop {
837            c_n = self.n_infos[(base_n ^ (c_n as i32)) as usize].sibling;
838            c_p = self.n_infos[(base_p ^ (c_p as i32)) as usize].sibling;
839
840            if !(c_n != 0 && c_p != 0) {
841                break;
842            }
843        }
844
845        c_p != 0
846    }
847
848    // Collect the list of the children, and push the label as well if it is not terminal node.
849    fn set_child(&self, base: i32, mut c: u8, label: u8, not_terminal: bool) -> SmallVec<[u8; 256]> {
850        let mut child: SmallVec<[u8; 256]> = SmallVec::new();
851
852        if c == 0 {
853            child.push(c);
854            c = self.n_infos[(base ^ (c as i32)) as usize].sibling;
855        }
856
857        if self.ordered {
858            while c != 0 && c <= label {
859                child.push(c);
860                c = self.n_infos[(base ^ (c as i32)) as usize].sibling;
861            }
862        }
863
864        if not_terminal {
865            child.push(label);
866        }
867
868        while c != 0 {
869            child.push(c);
870            c = self.n_infos[(base ^ (c as i32)) as usize].sibling;
871        }
872
873        child
874    }
875
876    // For the case where only one free slot is needed
877    fn find_place(&mut self) -> i32 {
878        if self.blocks_head_closed != 0 {
879            return self.blocks[self.blocks_head_closed as usize].e_head;
880        }
881
882        if self.blocks_head_open != 0 {
883            return self.blocks[self.blocks_head_open as usize].e_head;
884        }
885
886        // the block is not enough, resize it and allocate it.
887        self.add_block() << 8
888    }
889
890    // For the case where multiple free slots are needed.
891    fn find_places(&mut self, child: &[u8]) -> i32 {
892        let mut idx = self.blocks_head_open;
893
894        // we still have available 'Open' blocks.
895        if idx != 0 {
896            debug_assert!(self.blocks[idx as usize].num > 1);
897            let bz = self.blocks[self.blocks_head_open as usize].prev;
898            let nc = child.len() as i16;
899
900            loop {
901                // only proceed if the free slots are more than the number of children. Also, we
902                // save the minimal number of attempts to fail in the `reject`, it only worths to
903                // try out this block if the number of children is less than that number.
904                if self.blocks[idx as usize].num >= nc && nc < self.blocks[idx as usize].reject {
905                    let mut e = self.blocks[idx as usize].e_head;
906                    loop {
907                        let base = e ^ (child[0] as i32);
908
909                        let mut i = 1;
910                        // iterate through the children to see if they are available: (check < 0)
911                        while self.array[(base ^ (child[i] as i32)) as usize].check < 0 {
912                            if i == child.len() - 1 {
913                                // we have found the available block.
914                                self.blocks[idx as usize].e_head = e;
915                                return e;
916                            }
917                            i += 1;
918                        }
919
920                        // we save the next free block's information in `check`
921                        e = -self.array[e as usize].check;
922                        if e == self.blocks[idx as usize].e_head {
923                            break;
924                        }
925                    }
926                }
927
928                // we broke out of the loop, that means we failed. We save the information in
929                // `reject` for future pruning.
930                self.blocks[idx as usize].reject = nc;
931                if self.blocks[idx as usize].reject < self.reject[self.blocks[idx as usize].num as usize] {
932                    // put this stats into the global array of information as well.
933                    self.reject[self.blocks[idx as usize].num as usize] = self.blocks[idx as usize].reject;
934                }
935
936                let idx_ = self.blocks[idx as usize].next;
937
938                self.blocks[idx as usize].trial += 1;
939
940                // move this block to the 'Closed' block list since it has reached the max_trial
941                if self.blocks[idx as usize].trial == self.max_trial {
942                    self.transfer_block(idx, BlockType::Open, BlockType::Closed, self.blocks_head_closed == 0);
943                }
944
945                // we have finsihed one round of this cyclic doubly-linked-list.
946                if idx == bz {
947                    break;
948                }
949
950                // going to the next in this linked list group
951                idx = idx_;
952            }
953        }
954
955        self.add_block() << 8
956    }
957
958    // resolve the conflict by moving one of the the nodes to a free block.
959    fn resolve(&mut self, mut from_n: usize, base_n: i32, label_n: u8) -> i32 {
960        let to_pn = base_n ^ (label_n as i32);
961
962        // the `base` and `from` for the conflicting one.
963        let from_p = self.array[to_pn as usize].check;
964        let base_p = self.array[from_p as usize].base();
965
966        // whether to replace siblings of newly added
967        let flag = self.consult(
968            base_n,
969            base_p,
970            self.n_infos[from_n as usize].child,
971            self.n_infos[from_p as usize].child,
972        );
973
974        // collect the list of children for the block that we are going to relocate.
975        let children = if flag {
976            self.set_child(base_n, self.n_infos[from_n as usize].child, label_n, true)
977        } else {
978            self.set_child(base_p, self.n_infos[from_p as usize].child, 255, false)
979        };
980
981        // decide which algorithm to allocate free block depending on the number of children we
982        // have.
983        let mut base = if children.len() == 1 {
984            self.find_place()
985        } else {
986            self.find_places(&children)
987        };
988
989        base ^= children[0] as i32;
990
991        let (from, base_) = if flag {
992            (from_n as i32, base_n)
993        } else {
994            (from_p, base_p)
995        };
996
997        if flag && children[0] == label_n {
998            self.n_infos[from as usize].child = label_n;
999        }
1000
1001        #[cfg(feature = "reduced-trie")]
1002        {
1003            self.array[from as usize].base_ = -base - 1;
1004        }
1005
1006        #[cfg(not(feature = "reduced-trie"))]
1007        {
1008            self.array[from as usize].base_ = base;
1009        }
1010
1011        // the actual work for relocating the chilren
1012        for i in 0..(children.len()) {
1013            let to = self.pop_e_node(base, children[i], from);
1014            let to_ = base_ ^ (children[i] as i32);
1015
1016            if i == children.len() - 1 {
1017                self.n_infos[to as usize].sibling = 0;
1018            } else {
1019                self.n_infos[to as usize].sibling = children[i + 1];
1020            }
1021
1022            if flag && to_ == to_pn {
1023                continue;
1024            }
1025
1026            self.array[to as usize].base_ = self.array[to_ as usize].base_;
1027
1028            #[cfg(feature = "reduced-trie")]
1029            let condition = self.array[to as usize].base_ < 0 && children[i] != 0;
1030            #[cfg(not(feature = "reduced-trie"))]
1031            let condition = self.array[to as usize].base_ > 0 && children[i] != 0;
1032
1033            if condition {
1034                let mut c = self.n_infos[to_ as usize].child;
1035
1036                self.n_infos[to as usize].child = c;
1037
1038                loop {
1039                    let idx = (self.array[to as usize].base() ^ (c as i32)) as usize;
1040                    self.array[idx].check = to;
1041                    c = self.n_infos[idx].sibling;
1042
1043                    if c == 0 {
1044                        break;
1045                    }
1046                }
1047            }
1048
1049            if !flag && to_ == (from_n as i32) {
1050                from_n = to as usize;
1051            }
1052
1053            // clean up the space that was moved away from.
1054            if !flag && to_ == to_pn {
1055                self.push_sibling(from_n, to_pn ^ (label_n as i32), label_n, true);
1056                self.n_infos[to_ as usize].child = 0;
1057
1058                #[cfg(feature = "reduced-trie")]
1059                {
1060                    self.array[to_ as usize].base_ = CEDAR_VALUE_LIMIT;
1061                }
1062
1063                #[cfg(not(feature = "reduced-trie"))]
1064                {
1065                    if label_n != 0 {
1066                        self.array[to_ as usize].base_ = -1;
1067                    } else {
1068                        self.array[to_ as usize].base_ = 0;
1069                    }
1070                }
1071
1072                self.array[to_ as usize].check = from_n as i32;
1073            } else {
1074                self.push_e_node(to_);
1075            }
1076        }
1077
1078        // return the position that is free now.
1079        if flag {
1080            base ^ (label_n as i32)
1081        } else {
1082            to_pn
1083        }
1084    }
1085}
1086
1087#[cfg(test)]
1088mod tests {
1089    use super::*;
1090    use rand::distributions::Alphanumeric;
1091    use rand::{thread_rng, Rng};
1092    use std::iter;
1093
1094    #[test]
1095    fn test_insert_and_delete() {
1096        let dict = vec!["a"];
1097        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1098        let mut cedar = Cedar::new();
1099        cedar.build(&key_values);
1100
1101        let result = cedar.exact_match_search("ab").map(|x| x.0);
1102        assert_eq!(None, result);
1103
1104        cedar.update("ab", 1);
1105        let result = cedar.exact_match_search("ab").map(|x| x.0);
1106        assert_eq!(Some(1), result);
1107
1108        cedar.erase("ab");
1109        let result = cedar.exact_match_search("ab").map(|x| x.0);
1110        assert_eq!(None, result);
1111
1112        cedar.update("abc", 2);
1113        let result = cedar.exact_match_search("abc").map(|x| x.0);
1114        assert_eq!(Some(2), result);
1115
1116        cedar.erase("abc");
1117        let result = cedar.exact_match_search("abc").map(|x| x.0);
1118        assert_eq!(None, result);
1119    }
1120
1121    #[test]
1122    fn test_common_prefix_search() {
1123        let dict = vec![
1124            "a",
1125            "ab",
1126            "abc",
1127            "アルゴリズム",
1128            "データ",
1129            "構造",
1130            "网",
1131            "网球",
1132            "网球拍",
1133            "中",
1134            "中华",
1135            "中华人民",
1136            "中华人民共和国",
1137        ];
1138        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1139        let mut cedar = Cedar::new();
1140        cedar.build(&key_values);
1141
1142        let result: Vec<i32> = cedar
1143            .common_prefix_search("abcdefg")
1144            .unwrap()
1145            .iter()
1146            .map(|x| x.0)
1147            .collect();
1148        assert_eq!(vec![0, 1, 2], result);
1149
1150        let result: Vec<i32> = cedar
1151            .common_prefix_search("网球拍卖会")
1152            .unwrap()
1153            .iter()
1154            .map(|x| x.0)
1155            .collect();
1156        assert_eq!(vec![6, 7, 8], result);
1157
1158        let result: Vec<i32> = cedar
1159            .common_prefix_search("中华人民共和国")
1160            .unwrap()
1161            .iter()
1162            .map(|x| x.0)
1163            .collect();
1164        assert_eq!(vec![9, 10, 11, 12], result);
1165
1166        let result: Vec<i32> = cedar
1167            .common_prefix_search("データ構造とアルゴリズム")
1168            .unwrap()
1169            .iter()
1170            .map(|x| x.0)
1171            .collect();
1172        assert_eq!(vec![4], result);
1173    }
1174
1175    #[test]
1176    fn test_common_prefix_iter() {
1177        let dict = vec![
1178            "a",
1179            "ab",
1180            "abc",
1181            "アルゴリズム",
1182            "データ",
1183            "構造",
1184            "网",
1185            "网球",
1186            "网球拍",
1187            "中",
1188            "中华",
1189            "中华人民",
1190            "中华人民共和国",
1191        ];
1192
1193        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1194        let mut cedar = Cedar::new();
1195        cedar.build(&key_values);
1196
1197        let result: Vec<i32> = cedar.common_prefix_iter("abcdefg").map(|x| x.0).collect();
1198        assert_eq!(vec![0, 1, 2], result);
1199
1200        let result: Vec<i32> = cedar.common_prefix_iter("网球拍卖会").map(|x| x.0).collect();
1201        assert_eq!(vec![6, 7, 8], result);
1202
1203        let result: Vec<i32> = cedar.common_prefix_iter("中华人民共和国").map(|x| x.0).collect();
1204        assert_eq!(vec![9, 10, 11, 12], result);
1205
1206        let result: Vec<i32> = cedar
1207            .common_prefix_iter("データ構造とアルゴリズム")
1208            .map(|x| x.0)
1209            .collect();
1210        assert_eq!(vec![4], result);
1211    }
1212
1213    #[test]
1214    fn test_common_prefix_predict() {
1215        let dict = vec!["a", "ab", "abc"];
1216        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1217        let mut cedar = Cedar::new();
1218        cedar.build(&key_values);
1219
1220        let result: Vec<i32> = cedar.common_prefix_predict("a").unwrap().iter().map(|x| x.0).collect();
1221        assert_eq!(vec![0, 1, 2], result);
1222    }
1223
1224    #[test]
1225    fn test_exact_match_search() {
1226        let dict = vec!["a", "ab", "abc"];
1227        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1228        let mut cedar = Cedar::new();
1229        cedar.build(&key_values);
1230
1231        let result = cedar.exact_match_search("abc").map(|x| x.0);
1232        assert_eq!(Some(2), result);
1233    }
1234
1235    #[test]
1236    fn test_unicode_han_sip() {
1237        let dict = vec!["讥䶯䶰", "讥䶯䶰䶱䶲", "讥䶯䶰䶱䶲䶳䶴䶵𦡦"];
1238
1239        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1240        let mut cedar = Cedar::new();
1241        cedar.build(&key_values);
1242
1243        let result: Vec<i32> = cedar.common_prefix_iter("讥䶯䶰䶱䶲䶳䶴䶵𦡦").map(|x| x.0).collect();
1244        assert_eq!(vec![0, 1, 2], result);
1245    }
1246
1247    #[test]
1248    fn test_unicode_grapheme_cluster() {
1249        let dict = vec!["a", "abc", "abcde\u{0301}"];
1250
1251        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1252        let mut cedar = Cedar::new();
1253        cedar.build(&key_values);
1254
1255        let result: Vec<i32> = cedar
1256            .common_prefix_iter("abcde\u{0301}\u{1100}\u{1161}\u{AC00}")
1257            .map(|x| x.0)
1258            .collect();
1259        assert_eq!(vec![0, 1, 2], result);
1260    }
1261
1262    #[test]
1263    fn test_erase() {
1264        let dict = vec!["a", "ab", "abc"];
1265        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1266        let mut cedar = Cedar::new();
1267        cedar.build(&key_values);
1268
1269        cedar.erase("abc");
1270        assert!(cedar.exact_match_search("abc").is_none());
1271        assert!(cedar.exact_match_search("ab").is_some());
1272        assert!(cedar.exact_match_search("a").is_some());
1273
1274        cedar.erase("ab");
1275        assert!(cedar.exact_match_search("ab").is_none());
1276        assert!(cedar.exact_match_search("a").is_some());
1277
1278        cedar.erase("a");
1279        assert!(cedar.exact_match_search("a").is_none());
1280    }
1281
1282    #[test]
1283    fn test_erase_on_internal_key() {
1284        let mut cedar = Cedar::new();
1285
1286        cedar.update("aa", 0);
1287        assert!(cedar.exact_match_search("aa").is_some());
1288        cedar.update("ab", 1);
1289        assert!(cedar.exact_match_search("ab").is_some());
1290
1291        cedar.erase("a");
1292        assert!(cedar.exact_match_search("a").is_none());
1293        cedar.erase("aa");
1294        assert!(cedar.exact_match_search("aa").is_none());
1295        cedar.erase("ab");
1296        assert!(cedar.exact_match_search("ab").is_none());
1297    }
1298
1299    #[test]
1300    fn test_update() {
1301        let dict = vec!["a", "ab", "abc"];
1302        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1303        let mut cedar = Cedar::new();
1304        cedar.build(&key_values);
1305
1306        cedar.update("abcd", 3);
1307
1308        assert!(cedar.exact_match_search("a").is_some());
1309        assert!(cedar.exact_match_search("ab").is_some());
1310        assert!(cedar.exact_match_search("abc").is_some());
1311        assert!(cedar.exact_match_search("abcd").is_some());
1312        assert!(cedar.exact_match_search("abcde").is_none());
1313
1314        let dict = vec!["a", "ab", "abc"];
1315        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1316        let mut cedar = Cedar::new();
1317        cedar.build(&key_values);
1318        cedar.update("bachelor", 1);
1319        cedar.update("jar", 2);
1320        cedar.update("badge", 3);
1321        cedar.update("baby", 4);
1322
1323        assert!(cedar.exact_match_search("bachelor").is_some());
1324        assert!(cedar.exact_match_search("jar").is_some());
1325        assert!(cedar.exact_match_search("badge").is_some());
1326        assert!(cedar.exact_match_search("baby").is_some());
1327        assert!(cedar.exact_match_search("abcde").is_none());
1328
1329        let dict = vec!["a", "ab", "abc"];
1330        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1331        let mut cedar = Cedar::new();
1332        cedar.build(&key_values);
1333        cedar.update("中", 1);
1334        cedar.update("中华", 2);
1335        cedar.update("中华人民", 3);
1336        cedar.update("中华人民共和国", 4);
1337
1338        assert!(cedar.exact_match_search("中").is_some());
1339        assert!(cedar.exact_match_search("中华").is_some());
1340        assert!(cedar.exact_match_search("中华人民").is_some());
1341        assert!(cedar.exact_match_search("中华人民共和国").is_some());
1342    }
1343
1344    #[test]
1345    fn test_quickcheck_like() {
1346        let mut rng = thread_rng();
1347        let mut dict: Vec<String> = Vec::with_capacity(1000);
1348        for _ in 0..1000 {
1349            let chars: Vec<u8> = iter::repeat(()).map(|()| rng.sample(Alphanumeric)).take(30).collect();
1350            let s = String::from_utf8(chars).unwrap();
1351            dict.push(s);
1352        }
1353
1354        let key_values: Vec<(&str, i32)> = dict.iter().enumerate().map(|(k, s)| (s.as_ref(), k as i32)).collect();
1355        let mut cedar = Cedar::new();
1356        cedar.build(&key_values);
1357
1358        for (k, s) in dict.iter().enumerate() {
1359            assert_eq!(cedar.exact_match_search(s).map(|x| x.0), Some(k as i32));
1360        }
1361    }
1362
1363    #[test]
1364    fn test_quickcheck_like_with_deep_trie() {
1365        let mut rng = thread_rng();
1366        let mut dict: Vec<String> = Vec::with_capacity(1000);
1367        let mut s = String::new();
1368        for _ in 0..1000 {
1369            let c: char = rng.sample(Alphanumeric) as char;
1370            s.push(c);
1371            dict.push(s.clone());
1372        }
1373
1374        let key_values: Vec<(&str, i32)> = dict.iter().enumerate().map(|(k, s)| (s.as_ref(), k as i32)).collect();
1375        let mut cedar = Cedar::new();
1376        cedar.build(&key_values);
1377
1378        for (k, s) in dict.iter().enumerate() {
1379            assert_eq!(cedar.exact_match_search(s).map(|x| x.0), Some(k as i32));
1380        }
1381    }
1382
1383    #[test]
1384    fn test_mass_erase() {
1385        let mut rng = thread_rng();
1386        let mut dict: Vec<String> = Vec::with_capacity(1000);
1387        for _ in 0..1000 {
1388            let chars: Vec<u8> = iter::repeat(()).map(|()| rng.sample(Alphanumeric)).take(30).collect();
1389            let s = String::from_utf8(chars).unwrap();
1390
1391            dict.push(s);
1392        }
1393
1394        let key_values: Vec<(&str, i32)> = dict.iter().enumerate().map(|(k, s)| (s.as_ref(), k as i32)).collect();
1395        let mut cedar = Cedar::new();
1396        cedar.build(&key_values);
1397
1398        for s in dict.iter() {
1399            cedar.erase(s);
1400            assert!(cedar.exact_match_search(s).is_none());
1401        }
1402    }
1403
1404    #[test]
1405    fn test_duplication() {
1406        let dict = vec!["些许端", "些須", "些须", "亜", "亝", "亞", "亞", "亞丁", "亞丁港"];
1407        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
1408        let mut cedar = Cedar::new();
1409        cedar.build(&key_values);
1410
1411        assert_eq!(cedar.exact_match_search("亞").map(|t| t.0), Some(6));
1412        assert_eq!(cedar.exact_match_search("亞丁港").map(|t| t.0), Some(8));
1413        assert_eq!(cedar.exact_match_search("亝").map(|t| t.0), Some(4));
1414        assert_eq!(cedar.exact_match_search("些須").map(|t| t.0), Some(1));
1415    }
1416}