1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
use super::*;

use num_traits::{bounds::Bounded, AsPrimitive, FromPrimitive, Unsigned};

/// Find a element `K` that will split the given `bucket` in half at specified element index of key.
/// For example: if key is ["a", "b", "c"] and ["a", "c", "d"], given index is 1, the returned `K` will be
/// "c". If given idex is 2, the return `K` will be "d". This is because when index is 1, it look at second
/// element of both key thus it found "b" and "c". When index is 2, it look at third element which is
/// "c" and "d".
fn find_half_point<K, V>(bucket: &ArrayHash<twox_hash::XxHash64, Box<[K]>, V>, index: usize, start: usize, end: usize) -> K 
where K: AsPrimitive<usize> + core::hash::Hash + FromPrimitive + Bounded + PartialEq + PartialOrd + Unsigned,
        Box<[K]>: Clone + core::cmp::PartialEq, 
        V: Clone {
    // Find split point
    let mut count = vec![0; end - start + 1]; // Save some memory by allocate only necessary size
    let mut split_portion = 0;

    for (k, _) in bucket.iter() {
        if k.len() > index {
            count[k[index].as_() - start] += 1; // Accumulate each individual key
            split_portion += 1; // Accumulate total number of entry
        }
    }

    split_portion /= 2; // Now we got split portion

    let mut split_point = 0;
    let mut first_portion = 0;

    while first_portion < split_portion {
        first_portion += count[split_point];
        split_point += 1;
    }

    split_point += start;

    K::from_usize(split_point).unwrap()
}

/// Construct a hat-trie hybrid container by using raw pointer to skip
/// lookup time.
/// This is possible for "dense" type only as each key element can be converted into
/// usize to directly index into child node.
/// 
/// By convention, this is safe. It is because it use raw pointer only by internal
/// operation which when mutation occur, it occur under `&mut self` of outer type thus
/// make borrow checker enforce borrow rule as a whole on outer object.
#[derive(Clone, Debug)]
struct ChildsContainer<K, T, V> 
where K: AsPrimitive<usize> + core::hash::Hash + PartialEq + PartialOrd,
      Box<[K]>: Clone + core::cmp::PartialEq, 
      T: Clone + TrieNode<K, V>,
      V: Clone {
    /// Owned childs of a node. It need to be pre-allocated to prevent re-allocation which
    /// if happen, will invalidate all pointers that point to it.
    childs: Vec<NodeType<K, T, V>>,
    _internal_pointer: Box<[*mut NodeType<K, T, V>]>
}

impl<K, T, V> ChildsContainer<K, T, V> 
where K: AsPrimitive<usize> + Bounded + core::fmt::Debug + core::hash::Hash + FromPrimitive + PartialEq + PartialOrd + Unsigned,
      Box<[K]>: Clone + core::cmp::PartialEq, 
      T: Clone + TrieNode<K, V>,
      V: Clone + Default {
    /// Construct an empty childs container which initialize to Single "hybrid" node and
    /// all pointer in parent point to this single child.
    pub fn new(size: usize) -> Self {
        let mut childs = Vec::with_capacity(size);
        let start = K::min_value().as_();
        let end = K::max_value().as_();
        let bucket = NodeType::Hybrid((ArrayHashBuilder::default().build(), K::from_usize(start).unwrap()..=K::from_usize(end).unwrap()));
        childs.push(bucket);
        let ptrs = (0..size).map(|_| (&mut childs[0]) as *mut NodeType<K, T, V>).collect();
        ChildsContainer {
            childs,
            _internal_pointer: ptrs
        }
    }
    
    /// Attempt to split a child node at given key.
    /// 
    /// If the child node have element >= threshold, it will
    /// split the child according to hat-trie paper.
    /// 
    /// Since splitting child effect it parent pointer, we need to 
    /// perform it in this level.
    /// 
    /// # Return
    /// True if it was split. Otherwise, it return false.
    fn maybe_split(&mut self, key: K) -> bool {
        // This is a new trie which will be parent of new hybrid node yield from pure node split.
        let mut pure_trie = None;
        // This is a result of hybrid node split.
        let mut split_result = None;
        match &mut self[key] {
            NodeType::Pure(bucket) => {
                if bucket.len() >= super::BURST_THRESHOLD {
                    // flag that current key need to be burst/split
                    pure_trie = Some(key);
                } else {
                    return false
                }
            },
            NodeType::Hybrid((bucket, range)) => {
                if bucket.len() >= super::BURST_THRESHOLD {
                    // Need to split
                    let start = range.start().as_();
                    let end = range.end().as_();

                    // Find split point
                    let split_point = find_half_point(bucket, 0, start, end + 1);
                    // Now we got split point

                    // Make a new bucket to store half of old bucket
                    let new_bucket = bucket.split_by(|(k, _)| {
                        k[0] >= split_point
                    });

                    *range = K::from_usize(start).unwrap()..=K::from_usize(split_point.as_() - 1).unwrap();

                    split_result = Some((new_bucket, [start, split_point.as_(), end]));
                } else {
                    return false
                }
            },
            // Other type doesn't have to be split
            _ => ()
        };

        if let Some(key) = pure_trie {
            // Post processing for Pure bucket
            let k = key.as_();

            // Need unsafe way to get access to short circuit scanning for an owned child.
            // This should be safe as the child is owned by itself in here.
            // It can also be done in completely safe code by scanning `self.childs` looking
            // for a child with a key having k as prefix but it going to be significantly slower
            // than de-reference the pointer.

            // Temporary take out child at K
            unsafe {
                let old_child = std::mem::take(&mut *self._internal_pointer[k]);

                // It is only possible to be "Pure" node type to reach here
                if let NodeType::Pure(bucket) = old_child {
                    // Consume bucket and make a trie out of it
                    let trie = NodeType::Trie(T::new_split(bucket));
                    // Place new child at K
                    std::mem::replace(&mut *self._internal_pointer[k], trie);
                }
            }
            return true
        } else if let Some((new_bucket, [start, split_point, end])) = split_result {
            // Post processing for Hybrid split.
            // 1. change existing bucket to pure if needed. 2. update parent pointers for new bucket
            if start == split_point {
                // The only case where we need to make node Pure
                let old = std::mem::take(&mut self[key]);
                // The only possible type in here is Hybrid
                if let NodeType::Hybrid((table, _)) = old {
                    // Range can only be one here so we ignore it.

                    // Now replace the old Hybrid with Pure type
                    std::mem::replace(&mut self[key], NodeType::Pure(table));
                }
            }

            if split_point < end {
                self.childs.push(NodeType::Hybrid((new_bucket, K::from_usize(split_point).unwrap()..=K::from_usize(end).unwrap())));
            } else {
                // Only single possible parent
                self.childs.push(NodeType::Pure(new_bucket));
            }

            // Last element of self.childs hold a new bucket, we need to update all remain pointer
            let last = self.childs.len() - 1;

            // Update all remaining pointer to point to new half
            for ptr in self._internal_pointer[split_point..].iter_mut() {
                *ptr = (&mut self.childs[last]) as *mut NodeType<K, T, V>;
            }
            
            return true
        }
        false
    }
}

impl<K, T, V> core::ops::Index<K> for ChildsContainer<K, T, V> 
where K: AsPrimitive<usize> + core::hash::Hash + PartialEq + PartialOrd + Unsigned,
      Box<[K]>: Clone + core::cmp::PartialEq, 
      T: Clone + TrieNode<K, V>,
      V: Clone {
    type Output=NodeType<K, T, V>;

    fn index(&self, idx: K) -> &Self::Output {
        unsafe {
            &*self._internal_pointer[idx.as_()]
        }
    }
}

impl<K, T, V> core::ops::IndexMut<K> for ChildsContainer<K, T, V> 
where K: AsPrimitive<usize> + core::hash::Hash + PartialEq + PartialOrd + Unsigned,
      Box<[K]>: Clone + core::cmp::PartialEq, 
      T: Clone + TrieNode<K, V>,
      V: Clone {

    fn index_mut(&mut self, idx: K) -> &mut Self::Output {
        unsafe {
            &mut *self._internal_pointer[idx.as_()]
        }
    }
}

#[derive(Clone, Debug)]
pub struct DenseVecTrieNode<K, V> 
where K: AsPrimitive<usize> + Bounded + Copy + core::fmt::Debug + core::hash::Hash + FromPrimitive + PartialEq + PartialOrd + Sized + Unsigned,
      Box<[K]>: Clone + PartialEq,
      V: Clone + Default {
    childs: ChildsContainer<K, Self, V>,
    value: Option<V>,
}

impl<K, V> TrieNode<K, V> for DenseVecTrieNode<K, V> 
where K: AsPrimitive<usize> + Bounded + Copy + core::fmt::Debug + core::hash::Hash + FromPrimitive + PartialEq + PartialOrd + Sized + Unsigned,
      Box<[K]>: Clone + PartialEq,
      V: Clone + Default {
    fn new_split(mut bucket: ArrayHash<twox_hash::XxHash64, Box<[K]>, V>) -> Self {
        let start = K::min_value().as_();
        let end = K::max_value().as_();
        let mut old_size = bucket.len();
        let split_point = find_half_point(&bucket, 1, start, end);
        let mut node_value = None;
        let mut left = ArrayHashBuilder::default().build();
        let mut right = ArrayHashBuilder::default().build();

        for (key, value) in bucket.drain() {
            if key.len() == 1 {
                // There's only single character in key.
                // It value must be present on new Trie node and remove itself from bucket.
                node_value = Some(value);
                old_size -= 1;
                continue;
            }
            if key[1] >= split_point {
                assert!(right.put(key[1..].into(), value).is_none());
            } else {
                assert!(left.put(key[1..].into(), value).is_none());
            }
        }
        assert_eq!(old_size, left.len() + right.len());

        // Construct a child container manually as we need to properly align each
        // K with correct side of bucket.
        // In future, if other struct also need this, we shall move this to ChildsContainer struct
        let mut childs = vec![
            NodeType::Hybrid((left, K::from_usize(start).unwrap()..=K::from_usize(split_point.as_() - 1).unwrap())), 
            NodeType::Hybrid((right, split_point..=K::from_usize(end).unwrap())) ];

        let split_point_usize = split_point.as_();
        let ptr = (start..=end).map(|key| {
            if key >= split_point_usize {
                (&mut childs[1]) as *mut NodeType<K, DenseVecTrieNode<K, V>, V>
            } else {
                (&mut childs[0]) as *mut NodeType<K, DenseVecTrieNode<K, V>, V>
            }
        }).collect::<Vec<*mut NodeType<K, DenseVecTrieNode<K, V>, V>>>().into_boxed_slice();
        
        DenseVecTrieNode {
            childs: ChildsContainer {
                childs,
                _internal_pointer: ptr
            },
            value: node_value,
        }
    }

    fn child<'a>(&'a self, key: &K) -> &'a NodeType<K, Self, V> {
        &self.childs[*key]
    }

    fn value(&self) -> Option<&V> {
        self.value.as_ref()
    }

    fn value_mut(&mut self) -> &mut Option<V> {
        &mut self.value
    }

    fn put(&mut self, key: &[K], value: V) -> Option<V> {
        if key.len() == 0 {return None} // Cannot put empty key into trie
        let mut offset = 0; // Progress of key being process
        // let mut nodes = &mut self.childs; // Set root childs as nodes to be search for a key
        let mut parent = self;
        let mut nodes = &mut parent.childs; // Node context, all siblings are here
        nodes.maybe_split(key[0]); // Check if root node need to be split
        let mut node; // Current node being processed
        loop {
            // Dense Trie can use byte as index into each possible slot
            node = &mut nodes[key[offset]];

            match node {
                NodeType::None => {
                    // The byte is not part of trie. Add remain key and value to this trie here.
                    // Make a new container node to store value.
                    let mut bucket = ArrayHashBuilder::default().build();
                    bucket.put(key[offset..].into(), value);
                    *node = NodeType::Pure(bucket);

                    return None
                }
                NodeType::Trie(t) => {
                    // For simple trie, we just add all childs back to node waiting to be process
                    parent = t;

                    offset += 1;

                    if offset >= key.len() {
                        // We exhausted the key, this trie node hold a value
                        return parent.value.replace(value)
                    }
                    nodes = &mut parent.childs;
                    nodes.maybe_split(key[offset]);
                },
                NodeType::Pure(childs) => {
                    // For Pure type, it's `ArrayHash` that contains whole remaining key.
                    // We can simply leverage `put` method using remaining key and value.
                    let old = childs.put(key[offset..].into(), value);
                    
                    if let Some((_, v)) = old {
                        return Some(v)
                    } else {
                        return None
                    }
                },
                NodeType::Hybrid(bucket) => {
                    // For Hybrid type, it has left and right and split point to determine
                    // which `ArrayHash` to put the remaining key and value.
                    let old = bucket.0.put(key[offset..].into(), value);
                    if let Some((_, v)) = old {
                        return Some(v)
                    } else {
                        return None
                    }
                }
            }
        }
    }

    fn try_put<'a>(&'a mut self, key: &[K], value: V) -> Option<&'a V> where K: 'a {
        if key.len() == 0 {return None} // Cannot put empty key into trie
        let mut offset = 0; // Progress of key being process
        let mut parent = self;
        let mut nodes = &mut parent.childs; // Set root childs as nodes to be search for a key
        let mut node; // Current node being processed
        loop {
            // Dense Trie can use byte as index into each possible slot
            node = &mut nodes[key[offset]];

            if offset < key.len() {
                // if there's some remain key to be process, we analyze child of current node
                match node {
                    NodeType::None => {
                        // The byte is not part of trie. Add remain key and value to this trie here.
                        // Make a new container node to store value.
                        let mut bucket = ArrayHashBuilder::default().build();
                        bucket.put(key[offset..].into(), value);
                        *node = NodeType::Pure(bucket);

                        return None
                    },
                    NodeType::Trie(t) => {
                        // For simple trie, we just add all childs back to node waiting to be process
                        parent = t;
                        nodes = &mut parent.childs;
                    },
                    NodeType::Pure(childs) => {
                        // For Pure type, it's `ArrayHash` that contains whole remaining key.
                        // We can simply leverage `try_put` method using remaining key and value.
                        return childs.try_put(key[offset..].into(), value)
                    },
                    NodeType::Hybrid(bucket) => {
                        // For Hybrid type, it has left and right and split point to determine
                        // which `ArrayHash` to try put the remaining key and value.
                        return bucket.0.try_put(key[offset..].into(), value)
                    }
                }
            } else {
                match node {
                    NodeType::Trie(ref mut t) => {
                        // all bytes in key are consumed, return last child value
                        if t.value.is_none() {
                            t.value.replace(value);
                            return None
                        } else {
                            return t.value.as_ref()
                        }
                    },
                    _ => {
                        // shall never reach this block as either it is still trying to process trie
                        // or it is in a case where Pure/Hybrid container was met
                        return None
                    }
                }
            }
            offset += 1; // Move offset so next time key being use will be those that is not yet processed.
        }
    }
}

impl<K, V> DenseVecTrieNode<K, V> 
where K: Copy + AsPrimitive<usize> + Bounded + core::fmt::Debug + core::hash::Hash + FromPrimitive + PartialEq + PartialOrd + Sized + Unsigned,
      Box<[K]>: Clone + PartialEq,
      V: Clone + Default {

    pub fn new() -> DenseVecTrieNode<K, V> {
        DenseVecTrieNode {
                // root always has no value
            value: None,
            // Allocate Trie with length equals to Trie size. Since K is not `Clone`, we need to manually create all of it.
            childs: ChildsContainer::new(2usize.pow(std::mem::size_of::<K>() as u32 * 8u32))
        }
    }
}

#[cfg(test)]
mod tests;