1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
extern crate arrayvec;
use arrayvec::ArrayVec;
use crate::run_block_av_flat::{VC_LEN, MAX_BLOCK_SIZE, RLEBlock}; // current best
const MAX_NODE_SIZE: usize = 64; // must be a power of 2
const NODE_MIDPOINT: usize = MAX_NODE_SIZE / 2;
/// A B+ tree structure that has special functionality to encode runs of the same symbol in a defined alphabet
#[derive(Clone)]
pub struct RLEBPlusTree {
/// the current height of the tree
height: usize,
/// the collection of nodes in the tree, root is always at index 0
nodes: Vec<RLEBPlusNode>,
/// the collection of data blocks referenced by the tree nodes, these are the actual symbols
data_arena: Vec<RLEBlock>,
/// for the RLEBlocks, this says which block is next for an in-order traversal
next_child: Vec<usize>
}
/// A node in the B+ tree
#[derive(Clone,Debug)]
struct RLEBPlusNode {
/// stores if this is a leaf node or internal node
is_leaf: bool,
/// the parent node ID
parent: usize,
/// stores the total symbol counts for that child node and all its children
total_counts: ArrayVec<u64, MAX_NODE_SIZE>,
/// stores the individual symbol counts for that child node and all its children
total_symbols: ArrayVec<[u64; VC_LEN], MAX_NODE_SIZE>,
/// the indices of any child nodes; if it's a leaf this refers to RLEBlocks, otherwise other RLEBPlusNodes
children: ArrayVec<usize, MAX_NODE_SIZE>
}
impl RLEBPlusNode {
#[inline]
fn count(&self, relative_index: u64, symbol: u8) -> (usize, u64, u64) {
let mut total_count: u64 = 0;
let bs_index: usize = self.total_counts.iter().position(|&v| {
total_count += v;
total_count >= relative_index
}).unwrap();
total_count -= self.total_counts[bs_index];
let symbol_count: u64 = self.total_symbols[..bs_index].iter().fold(0, |acc, v| acc + v[symbol as usize]);
(bs_index, symbol_count, total_count)
}
//for now, this is a clone except for two minor increments on the found node
#[inline]
fn insert_and_count(&mut self, relative_index: u64, symbol: u8) -> (usize, u64, u64) {
let mut total_count: u64 = 0;
let bs_index: usize = self.total_counts.iter().position(|&v| {
total_count += v;
total_count >= relative_index
}).unwrap();
total_count -= self.total_counts[bs_index];
let symbol_count: u64 = self.total_symbols[..bs_index].iter().fold(0, |acc, v| acc + v[symbol as usize]);
//increment total counts and symbols for this node
self.total_counts[bs_index] += 1;
self.total_symbols[bs_index][symbol as usize] += 1;
(bs_index, symbol_count, total_count)
}
#[inline]
fn build_indices(&mut self) {
//place-holder
}
}
impl Default for RLEBPlusTree {
#[inline]
fn default() -> Self {
let children: Vec<RLEBlock> = vec![Default::default()];
let mut child_indices: ArrayVec<usize, MAX_NODE_SIZE> = Default::default();
child_indices.push(0);
let mut total_counts = ArrayVec::<u64, MAX_NODE_SIZE>::new();
total_counts.push(0);
let mut total_symbols = ArrayVec::<[u64; VC_LEN], MAX_NODE_SIZE>::new();
total_symbols.push([0; VC_LEN]);
let root: RLEBPlusNode = RLEBPlusNode {
is_leaf: true,
parent: 0,
total_counts,
total_symbols,
children: child_indices
};
RLEBPlusTree {
height: 0,
nodes: vec![root],
data_arena: children,
next_child: vec![0]
}
}
}
impl<'a> IntoIterator for &'a RLEBPlusTree {
type Item = u8;
type IntoIter = RLEBPlusTreeIterator<'a>;
fn into_iter(self) -> Self::IntoIter {
let next_child_index = self.next_child[0];
let current_block_iter = self.data_arena[0].to_vec().into_iter();
RLEBPlusTreeIterator {
tree: self,
next_child_index,
current_block_iter
}
}
}
impl RLEBPlusTree {
/// Returns the current height of the B+ tree
#[inline]
pub fn get_height(&self) -> usize {
self.height
}
/// Returns the total number of data nodes (i.e. leaf nodes) in the B+ tree
#[inline]
pub fn get_node_count(&self) -> usize {
self.data_arena.len()
}
/// Performs a rank/count operation.
/// For a given data index, this will count all occurences of symbol `value` up to that index.
/// # Arguments
/// * `index` - the index to count to
/// * `value` - the symbol to count
/// # Examples
/// ```rust
/// use msbwt2::rle_bplus_tree::RLEBPlusTree;
/// let mut tree: RLEBPlusTree = Default::default();
/// let data: Vec<u8> = vec![0, 1, 1, 1, 2, 0, 2, 3, 4, 1, 1, 1, 0];
/// let expected_counts: Vec<u64> = vec![0, 0, 1, 2, 0, 1, 1, 0, 0, 3, 4, 5, 2];
/// for (i, v) in data.iter().enumerate() {
/// let pre_count = tree.count(i as u64, *v);
/// assert_eq!(pre_count, expected_counts[i]);
/// let count = tree.insert_and_count(i as u64, *v);
/// println!("{} {:?}", i, tree.to_vec());
/// assert_eq!(count, expected_counts[i]);
/// }
/// ```
#[inline]
pub fn count(&self, index: u64, value: u8) -> u64 {
//start with root node 0
let mut current_node_index: usize = 0;
let mut current_node: &RLEBPlusNode = &self.nodes[current_node_index];
let mut relative_index: u64 = index;
let mut total_count: u64 = 0;
//iterate downwards until we find a leaf node point to run blocks
while !current_node.is_leaf {
//linear search tended to be faster in practice
let (bs_index, sc, tc) = current_node.count(relative_index, value);
//add symbol count to the running total
total_count += sc;
//adjust our relative index by how much we went through
relative_index -= tc;
//go down to the indexing node
current_node_index = current_node.children[bs_index];
current_node = &self.nodes[current_node_index];
}
//we're in a leaf, still need to search for the data node
let (bs_index, sc, tc) = current_node.count(relative_index, value);
//add symbol count to the running total
total_count += sc;
//adjust our relative index by how much we went through
relative_index -= tc;
//get the data node from the search index
let arena_index: usize = current_node.children[bs_index];
//add in the final counts and return
total_count += self.data_arena[arena_index].count(relative_index, value);
total_count
}
/// Performs a rank/count operation while also inserting that symbol at the provided index.
/// For a given data index, this will count all occurences of symbol `value` up to that index, and then insert an addition `value` at that index.
/// # Arguments
/// * `index` - the index to count to
/// * `value` - the symbol to count
/// # Examples
/// ```rust
/// use msbwt2::rle_bplus_tree::RLEBPlusTree;
/// let mut tree: RLEBPlusTree = Default::default();
/// let data: Vec<u8> = vec![0, 1, 1, 1, 2, 0, 2, 3, 4, 1, 1, 1, 0];
/// let expected_counts: Vec<u64> = vec![0, 0, 1, 2, 0, 1, 1, 0, 0, 3, 4, 5, 2];
/// for (i, v) in data.iter().enumerate() {
/// let pre_count = tree.count(i as u64, *v);
/// assert_eq!(pre_count, expected_counts[i]);
/// let count = tree.insert_and_count(i as u64, *v);
/// println!("{} {:?}", i, tree.to_vec());
/// assert_eq!(count, expected_counts[i]);
/// }
/// ```
#[inline]
pub fn insert_and_count(&mut self, index: u64, value: u8) -> u64{
//start with root node 0
let mut current_node_index: usize = 0;
let mut current_node = &mut self.nodes[current_node_index];
let mut relative_index: u64 = index;
let mut total_count: u64 = 0;
//iterate downwards until we find a leaf node point to run blocks
while !current_node.is_leaf {
//linear search tended to be faster in practice
let (bs_index, sc, tc) = current_node.insert_and_count(relative_index, value);
//add symbol count to the running total
total_count += sc;
//adjust our relative index by how much we went through
relative_index -= tc;
//update current node
current_node_index = current_node.children[bs_index];
current_node = &mut self.nodes[current_node_index];
}
//we're in a leaf, get the leaf index and then the arena index for the data node
let (bs_index, sc, tc) = current_node.insert_and_count(relative_index, value);
//add symbol count to the running total
total_count += sc;
//adjust our relative index by how much we went through
relative_index -= tc;
//increment the total count by what comes back from the block insertion
let arena_index: usize = current_node.children[bs_index];
total_count += self.data_arena[arena_index].insert_and_count(relative_index, value);
//handle any internal splitting that needs to occur due to nodes filling up
self.split_internal_nodes(arena_index, current_node_index, bs_index);
total_count
}
/// This is a helper function for splitting internal nodes to maintain the structure.
/// It's currently only used in the insert function, but it's useful to have it separated out for readability.
#[inline]
fn split_internal_nodes(&mut self, arena_index: usize, node_index: usize, bs_index: usize) {
//check if the block is big enough to get split
let mut current_node_index = node_index;
let current_node = &mut self.nodes[current_node_index];
if self.data_arena[arena_index].block_len() >= MAX_BLOCK_SIZE {
//first we need to actually split the block
let new_block: RLEBlock = self.data_arena[arena_index].split();
let new_arena_index = self.data_arena.len();
self.data_arena.push(new_block);
//correct the next child options
self.next_child.push(self.next_child[arena_index]);
self.next_child[arena_index] = new_arena_index;
//insert into the before counts
current_node.total_counts[bs_index] = self.data_arena[arena_index].get_values_contained();
current_node.total_counts.insert(bs_index+1, self.data_arena[new_arena_index].get_values_contained());
//insert into the before symbols also
current_node.total_symbols[bs_index] = self.data_arena[arena_index].get_symbol_counts();
current_node.total_symbols.insert(bs_index+1, self.data_arena[new_arena_index].get_symbol_counts());
//insert the child into the list
current_node.children.insert(bs_index+1, new_arena_index);
current_node.build_indices();
//recursively push as needed into internal nodes
while self.nodes[current_node_index].children.len() == MAX_NODE_SIZE {
assert!(self.nodes[current_node_index].total_counts.len() == MAX_NODE_SIZE);
let current_node = &mut self.nodes[current_node_index];
//changes to left child mostly occur in-place by removing the right side data
let rtc = current_node.total_counts.drain(NODE_MIDPOINT..).collect();
let rts = current_node.total_symbols.drain(NODE_MIDPOINT..).collect();
let rtch = current_node.children.drain(NODE_MIDPOINT..).collect();
//get the total counts for the new left node
let left_total_count: u64 = current_node.total_counts.iter().sum();
let mut left_total_symbols: [u64; VC_LEN] = [0; VC_LEN];
for ts in current_node.total_symbols.iter() {
for i in 0..VC_LEN {
left_total_symbols[i] += ts[i];
}
}
//right child is create from the removed data
let mut right_child = RLEBPlusNode {
is_leaf: current_node.is_leaf,
parent: current_node.parent,
total_counts: rtc,
total_symbols: rts,
children: rtch
};
//all node changes have been made to current & right at this point
current_node.build_indices();
right_child.build_indices();
//get the total counts for the new right node
let right_total_count: u64 = right_child.total_counts.iter().sum();
let mut right_total_symbols: [u64; VC_LEN] = [0; VC_LEN];
for ts in right_child.total_symbols.iter() {
for i in 0..VC_LEN {
right_total_symbols[i] += ts[i];
}
}
if current_node_index == 0 {
//special things because it's root:
//we increase the height and we actually create a full new left child because root needs to be a new node
self.height += 1;
let left_child = current_node.clone();
//parent currently does not need to be set because root's parent is 0 (e.g. also root)
//push the two new children
let left_child_index = self.nodes.len();
let right_child_index = left_child_index+1;
if !left_child.is_leaf {
for &cindex in left_child.children.iter() {
self.nodes[cindex].parent = left_child_index;
}
for &cindex in right_child.children.iter() {
self.nodes[cindex].parent = right_child_index;
}
}
self.nodes.push(left_child);
self.nodes.push(right_child);
//update root with new info
self.nodes[0].is_leaf = false;
self.nodes[0].total_counts[0] = left_total_count;
self.nodes[0].total_counts[1] = right_total_count;
self.nodes[0].total_counts.truncate(2);
self.nodes[0].total_symbols[0] = left_total_symbols;
self.nodes[0].total_symbols[1] = right_total_symbols;
self.nodes[0].total_symbols.truncate(2);
self.nodes[0].children[0] = left_child_index;
self.nodes[0].children[1] = right_child_index;
self.nodes[0].children.truncate(2);
//root node changes are complete
self.nodes[0].build_indices();
} else {
//this is not root, so left child is completely done at this point; we need to insert right child though
//push the midpoint before counts into the parent and add the new (right) child
let right_child_index = self.nodes.len();
if !right_child.is_leaf {
for cindex in right_child.children.iter() {
self.nodes[*cindex].parent = right_child_index;
}
}
//calculate the midpoint information
let parent_index = right_child.parent;
let parent_node = &mut self.nodes[parent_index];
let child_index = parent_node.children.iter().position(|&cindex| cindex == current_node_index).unwrap();
parent_node.children.insert(child_index+1, right_child_index);
//update the before symbols & counts
parent_node.total_counts[child_index] = left_total_count;
parent_node.total_counts.insert(child_index+1, right_total_count);
parent_node.total_symbols[child_index] = left_total_symbols;
parent_node.total_symbols.insert(child_index+1, right_total_symbols);
//parent node changes are complete
parent_node.build_indices();
//finally, push the right node as a new node
self.nodes.push(right_child);
//now go up a level to make sure it isn't too big
current_node_index = parent_index;
}
}
}
}
/// This will convert the stored data into a plain Vec<u8> for easy manipulation.
/// This is really most useful for debugging and testing.
/// # Example
/// ```rust
/// use msbwt2::rle_bplus_tree::RLEBPlusTree;
/// let mut tree: RLEBPlusTree = Default::default();
/// let data: Vec<u8> = vec![0, 1, 1, 1, 2, 0, 2, 3, 4, 1, 1, 1, 0];
/// let expected_counts: Vec<u64> = vec![0, 0, 1, 2, 0, 1, 1, 0, 0, 3, 4, 5, 2];
/// for (i, v) in data.iter().enumerate() {
/// let _count = tree.insert_and_count(i as u64, *v);
/// }
/// assert_eq!(tree.to_vec(), data);
/// ```
pub fn to_vec(&self) -> Vec<u8> {
let mut ret: Vec<u8> = vec![];
let mut level_vec: Vec<usize>;
let mut next_level_vec: Vec<usize> = vec![0];
while !self.nodes[next_level_vec[0]].is_leaf {
level_vec = next_level_vec;
next_level_vec = vec![];
for lv in level_vec.iter() {
next_level_vec.extend(&self.nodes[*lv].children);
}
}
for lv in next_level_vec.iter() {
for block_index in self.nodes[*lv].children.iter() {
ret.extend_from_slice(&self.data_arena[*block_index].to_vec());
}
}
ret
/*
TODO: functionally, the below statement can replace the entire above code, but I would guess it
is less efficient, do we care? I don't think we plan to use to_vec in practice anywhere, just for
testing currently
benchmarks have the current to_vec about 40 usecs vs 100 usecs in the iter method
*/
//self.into_iter().collect()
}
/// This will return a run iterator over the data stored in the tree.
/// # Example
/// ```rust
/// use msbwt2::rle_bplus_tree::RLEBPlusTree;
/// let mut tree: RLEBPlusTree = Default::default();
/// let data: Vec<u8> = vec![0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0];
/// for (i, v) in data.iter().enumerate() {
/// let _count = tree.insert_and_count(i as u64, *v);
/// }
/// let expected_runs: Vec<(u8, u64)> = vec![(0, 1), (1, 3), (2, 6), (0, 1)];
/// assert_eq!(tree.run_iter().collect::<Vec<(u8, u64)>>(), expected_runs);
/// ```
pub fn run_iter(&self) -> RLEBPlusTreeRunIterator<'_> {
let next_child_index = self.next_child[0];
let current_block_iter = Box::new(self.data_arena[0].run_iter());
RLEBPlusTreeRunIterator {
tree: self,
next_child_index,
current_block_iter,
next_sym: 0,
next_count: 0
}
}
}
/// An iterator over the data nodes of the B+ tree that will perform an in-order traversal of the contained characters.
/// Any runs are automatically decompressed into single symbols.
pub struct RLEBPlusTreeIterator<'a> {
tree: &'a RLEBPlusTree,
next_child_index: usize,
current_block_iter: std::vec::IntoIter<u8>
}
impl<'a> Iterator for RLEBPlusTreeIterator<'a> {
type Item = u8;
/// Will return the next symbol contained by the compressed B+ tree data
fn next(&mut self) -> Option<u8> {
match self.current_block_iter.next() {
//current block has data, so return it
Some(x) => Some(x),
None => {
//current block does not have data
if self.next_child_index == 0 {
//no more blocks left (we've circled around), so no more data
None
} else {
//get the next block iterator and update the next child index before returning a value
self.current_block_iter = self.tree.data_arena[self.next_child_index].to_vec().into_iter();
self.next_child_index = self.tree.next_child[self.next_child_index];
//this *should* always work given our current setup, there shouldn't be any empty
//blocks unless the entire tree is empty
self.current_block_iter.next()
}
}
}
}
}
/// An iterator over the data nodes of the B+ tree that will perform an in-order traversal of the contained characters.
/// Each run is returned as a tuple (symbol (u8), count (u64)) instead of an individual symbol at a time.
pub struct RLEBPlusTreeRunIterator<'a> {
tree: &'a RLEBPlusTree,
next_child_index: usize,
current_block_iter: Box<dyn Iterator<Item=(u8, u64)> + 'a>, //std::slice::Iter<'a, (u8, u64)>,
next_sym: u8,
next_count: u64
}
impl<'a> Iterator for RLEBPlusTreeRunIterator<'a> {
type Item = (u8, u64);
/// Will return the next run contained by the compressed B+ tree data
fn next(&mut self) -> Option<(u8, u64)> {
//None
loop {
let next_pair = match self.current_block_iter.next() {
//current block has data, so return it
Some(x) => Some(x),
None => {
//current block does not have data
if self.next_child_index == 0 {
//no more blocks left (we've circled around), so no more data
None
} else {
//get the next block iterator and update the next child index before returning a value
self.current_block_iter = Box::new(self.tree.data_arena[self.next_child_index].run_iter());
self.next_child_index = self.tree.next_child[self.next_child_index];
//this *should* always work given our current setup, there shouldn't be any empty
//blocks unless the entire tree is empty
self.current_block_iter.next()
}
}
};
match next_pair {
Some(pair_values) => {
//check if this run matches the ongoing run
if pair_values.0 == self.next_sym {
//part of the same run, increment and loop back
self.next_count += pair_values.1;
} else {
//not part of the same run, store the new run values and return the current one
let ret_value = (self.next_sym, self.next_count);
self.next_sym = pair_values.0;
self.next_count = pair_values.1;
if ret_value.1 > 0 {
return Some(ret_value);
}
}
},
None => {
//we're at the end of the iterators
if self.next_count > 0 {
//one last pair to send
let ret_value = (self.next_sym, self.next_count);
self.next_count = 0;
return Some(ret_value);
} else {
//nothing remains
return None;
}
}
}
}
}
}
#[cfg(test)]
mod tests {
extern crate rand;
use super::*;
use rand::Rng;
#[test]
fn test_init() {
let tree: RLEBPlusTree = Default::default();
assert_eq!(tree.to_vec(), Vec::<u8>::new());
assert_eq!(tree.into_iter().collect::<Vec<u8>>(), Vec::<u8>::new());
let full_pairing = tree.run_iter().collect::<Vec<(u8, u64)>>();
assert_eq!(full_pairing, vec![]);
}
#[test]
fn test_simple_inserts() {
let mut tree: RLEBPlusTree = Default::default();
let data: Vec<u8> = vec![0, 1, 1, 1, 2, 0, 2, 3, 4, 1, 1, 1, 0];
let expected_counts: Vec<u64> = vec![0, 0, 1, 2, 0, 1, 1, 0, 0, 3, 4, 5, 2];
for (i, v) in data.iter().enumerate() {
let pre_count = tree.count(i as u64, *v);
assert_eq!(pre_count, expected_counts[i]);
let count = tree.insert_and_count(i as u64, *v);
println!("{} {:?}", i, tree.to_vec());
assert_eq!(count, expected_counts[i]);
}
assert_eq!(tree.to_vec(), data);
assert_eq!(tree.into_iter().collect::<Vec<u8>>(), data);
//check the pairs since this is in-order
let runs: Vec<(u8, u64)> = vec![(0, 1), (1, 3), (2, 1), (0, 1), (2, 1), (3, 1), (4, 1), (1, 3), (0, 1)];
let full_pairing = tree.run_iter().collect::<Vec<(u8, u64)>>();
assert_eq!(full_pairing, runs);
}
#[test]
fn test_failure_case_001() {
let mut tree: RLEBPlusTree = Default::default();
let data: Vec<u8> = vec![3, 0, 3, 0, 5, 2, 5, 2, 3, 1, 2];
let expected_counts: Vec<u64> = vec![0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 2];
for (i, v) in data.iter().enumerate() {
let pre_count = tree.count(i as u64, *v);
assert_eq!(pre_count, expected_counts[i]);
let count = tree.insert_and_count(i as u64, *v);
println!("{} {:?}", i, tree.to_vec());
assert_eq!(tree.to_vec(), data[..i+1].to_vec());
assert_eq!(count, expected_counts[i]);
}
assert_eq!(tree.to_vec(), data);
//check the pairs since this is in-order
let runs: Vec<(u8, u64)> = vec![(3, 1), (0, 1), (3, 1), (0, 1), (5, 1), (2, 1), (5, 1), (2, 1), (3, 1), (1, 1), (2, 1)];
let full_pairing = tree.run_iter().collect::<Vec<(u8, u64)>>();
assert_eq!(full_pairing, runs);
}
#[test]
fn test_failure_case_002() {
/*
//this one was cause by used binary search on the children array, silly me
[2, 3, 3, 1, 3, 2, 4, 2, 3, 4, 5, 1, 2, 5]
[2, 3, 3, 1, 3, 2, 4, 2, 3, 4, 5, 1, 2, 1, 5]
*/
let mut tree: RLEBPlusTree = Default::default();
let mut data: Vec<u8> = vec![];
let inserted: Vec<u8> = vec![4, 5, 5, 0, 0, 0, 2, 3, 0, 1, 5, 3, 3, 5];
let positions: Vec<usize> = vec![0, 0, 2, 1, 1, 1, 5, 0, 5, 6, 3, 9, 0, 10];
for i in 0..inserted.len() {
data.insert(positions[i], inserted[i]);
let mut expected_count = 0;
for j in 0..positions[i] {
if data[j] == inserted[i] {
expected_count += 1;
}
}
let pre_count = tree.count(positions[i] as u64, inserted[i]);
assert_eq!(pre_count, expected_count);
let count = tree.insert_and_count(positions[i] as u64, inserted[i]);
println!("{} {:?}", i, tree.to_vec());
assert_eq!(tree.to_vec(), data);
assert_eq!(count, expected_count);
}
}
#[test]
fn test_failure_case_003() {
/*
caught by when I added an addition loop to fix a problem that actually wasn't the solution,
instead causing this problem
*/
let mut tree: RLEBPlusTree = Default::default();
let mut data: Vec<u8> = vec![];
let inserted: Vec<u8> = vec![2, 3, 1, 1, 1, 4, 3, 0, 2, 2, 1, 4, 3];
let positions: Vec<usize> = vec![0, 1, 0, 3, 2, 5, 1, 7, 3, 7, 2, 2, 9];
for i in 0..inserted.len() {
data.insert(positions[i], inserted[i]);
let mut expected_count = 0;
for j in 0..positions[i] {
if data[j] == inserted[i] {
expected_count += 1;
}
}
let pre_count = tree.count(positions[i] as u64, inserted[i]);
assert_eq!(pre_count, expected_count);
let count = tree.insert_and_count(positions[i] as u64, inserted[i]);
println!("{} {:?}", i, tree.to_vec());
assert_eq!(tree.to_vec(), data);
assert_eq!(count, expected_count);
}
}
#[test]
fn test_10krandom_insert() {
let mut tree: RLEBPlusTree = Default::default();
let mut data: Vec<u8> = vec![];
let mut rng = rand::thread_rng();
let mut inserted: Vec<u8> = vec![];
let mut positions: Vec<usize> = vec![];
for _ in 0..10000 {
let symbol: u8 = rng.gen_range(0, 6);
let position: usize = rng.gen_range(0, data.len()+1);
inserted.push(symbol);
positions.push(position);
data.insert(position, symbol);
println!("RANDOM_DATA: {:?}", data);
println!("inserted: {:?}", inserted);
println!("positions: {:?}", positions);
let pre_count = tree.count(position as u64, symbol);
let post_count = tree.insert_and_count(position as u64, symbol);
assert_eq!(pre_count, post_count);
assert_eq!(tree.to_vec(), data);
assert_eq!(tree.into_iter().collect::<Vec<u8>>(), data);
}
}
#[test]
fn test_run_iter() {
//this test is needed to make sure we are testing the run iterations across data blocks
let mut tree: RLEBPlusTree = Default::default();
let total_symbols = MAX_BLOCK_SIZE*256*MAX_NODE_SIZE;
for _ in 0..total_symbols {
let _post_count = tree.insert_and_count(0, 0);
}
//we expect 1 really big run of 0s
let full_pairing = tree.run_iter().collect::<Vec<(u8, u64)>>();
assert_eq!(full_pairing, vec![(0, total_symbols as u64)]);
assert!(tree.get_node_count() > 1);
//now let's break the run somewhere and just make sure things are still fine
let break_point: u64 = (MAX_BLOCK_SIZE*256) as u64;
tree.insert_and_count(break_point, 1);
let full_pairing = tree.run_iter().collect::<Vec<(u8, u64)>>();
assert_eq!(full_pairing, vec![(0, break_point), (1, 1), (0, total_symbols as u64-break_point)]);
}
}