jammdb/
node.rs

1use std::{cell::RefCell, mem::size_of, rc::Rc};
2
3use crate::{
4    bucket::{BucketMeta, InnerBucket, META_SIZE},
5    bytes::Bytes,
6    errors::Result,
7    freelist::TxFreelist,
8    page::{BranchElement, LeafElement, Page, PageID, PageType},
9};
10
11pub(crate) type NodeID = u64;
12
13const HEADER_SIZE: u64 = size_of::<Page>() as u64;
14const LEAF_SIZE: u64 = size_of::<LeafElement>() as u64;
15const BRANCH_SIZE: u64 = size_of::<BranchElement>() as u64;
16const MIN_KEYS_PER_NODE: usize = 2;
17const FILL_PERCENT: f32 = 0.5;
18
19pub(crate) struct Node<'n> {
20    pub(crate) id: NodeID,
21    pub(crate) page_id: PageID,
22    pub(crate) num_pages: u64,
23    pub(crate) children: Vec<NodeID>,
24    pub(crate) data: NodeData<'n>,
25    pub(crate) deleted: bool,
26    pub(crate) original_key: Option<Bytes<'n>>,
27    pub(crate) parent: Option<u64>,
28    pagesize: u64,
29    spilled: bool,
30}
31
32impl<'n> Node<'n> {
33    // This is only used when creating a root node for a new bucket
34    // So the parent is always going to be None
35    pub(crate) fn new(id: NodeID, t: PageType, pagesize: u64) -> Node<'n> {
36        let data: NodeData = match t {
37            Page::TYPE_BRANCH => NodeData::Branches(Vec::new()),
38            Page::TYPE_LEAF => NodeData::Leaves(Vec::new()),
39            _ => panic!("INVALID PAGE TYPE FOR NEW NODE"),
40        };
41        Node {
42            id,
43            page_id: 0,
44            num_pages: 0,
45            children: Vec::new(),
46            data,
47            deleted: false,
48            original_key: None,
49            pagesize,
50            spilled: false,
51            parent: None,
52        }
53    }
54
55    // This is used to initialize nodes for pages that are being modified.
56    // The parent value needs to be set afterwards!
57    pub(crate) fn from_page(id: NodeID, p: &Page, pagesize: u64) -> Node<'n> {
58        let data: NodeData = match p.page_type {
59            Page::TYPE_BRANCH => {
60                let mut data = Vec::with_capacity(p.count as usize);
61                for branch in p.branch_elements() {
62                    data.push(Branch {
63                        key: Bytes::Slice(branch.key()),
64                        page: branch.page,
65                    });
66                }
67                NodeData::Branches(data)
68            }
69            Page::TYPE_LEAF => {
70                let mut data = Vec::with_capacity(p.count as usize);
71                for leaf in p.leaf_elements() {
72                    data.push(Leaf::from_leaf(leaf));
73                }
74                NodeData::Leaves(data)
75            }
76            _ => panic!("INVALID PAGE TYPE FOR FROM_PAGE"),
77        };
78        let original_key = if data.len() > 0 {
79            Some(data.first_key())
80        } else {
81            None
82        };
83        Node {
84            id,
85            page_id: p.id,
86            num_pages: p.overflow + 1,
87            children: Vec::new(),
88            data,
89            deleted: false,
90            original_key,
91            pagesize,
92            spilled: false,
93            parent: None,
94        }
95    }
96
97    // This is used to create new nodes created by splitting existing nodes.
98    // They don't need to have their parent set since we no longer care about parent/child
99    // relationships once we're splitting.
100    pub(crate) fn with_data(id: NodeID, data: NodeData<'n>, pagesize: u64) -> Node<'n> {
101        let original_key = Some(data.first_key());
102        Node {
103            id,
104            page_id: 0,
105            num_pages: 0,
106            children: Vec::new(),
107            data,
108            deleted: false,
109            original_key,
110            pagesize,
111            spilled: false,
112            parent: None,
113        }
114    }
115
116    pub(crate) fn insert_child(&mut self, id: NodeID, key: Bytes) {
117        match &mut self.data {
118            NodeData::Branches(branches) => {
119                debug_assert!(!self.children.contains(&id));
120                debug_assert!(branches
121                    .binary_search_by_key(&key.as_ref(), |b| b.key())
122                    .is_ok());
123                self.children.push(id);
124            }
125            NodeData::Leaves(_) => panic!("CANNOT INSERT BRANCH INTO A LEAF NODE"),
126        }
127    }
128
129    pub(crate) fn insert_data<'a>(&'a mut self, leaf: Leaf<'n>) {
130        match &mut self.data {
131            NodeData::Branches(_) => panic!("CANNOT INSERT DATA INTO A BRANCH NODE"),
132            NodeData::Leaves(leaves) => {
133                match leaves.binary_search_by_key(&leaf.key(), |l| l.key()) {
134                    Ok(i) => leaves[i] = leaf,
135                    Err(i) => leaves.insert(i, leaf),
136                };
137            }
138        }
139    }
140
141    pub(crate) fn insert_branch<'a>(
142        &'a mut self,
143        original_key: &Option<Bytes<'n>>,
144        branch: Branch<'n>,
145    ) {
146        let search_key = match original_key {
147            Some(k) => k.as_ref(),
148            None => branch.key(),
149        };
150        match &mut self.data {
151            NodeData::Leaves(_) => panic!("CANNOT INSERT BRANCH INTO A LEAF NODE"),
152            NodeData::Branches(branches) => {
153                match branches.binary_search_by_key(&search_key, |b| b.key()) {
154                    Ok(i) => {
155                        assert!(original_key.is_some());
156                        branches[i] = branch
157                    }
158                    Err(i) => {
159                        assert!(original_key.is_none());
160                        branches.insert(i, branch)
161                    }
162                };
163            }
164        }
165    }
166
167    pub(crate) fn delete<'a>(&'a mut self, index: usize) -> Leaf<'n> {
168        match &mut self.data {
169            NodeData::Branches(_) => panic!("CANNOT DELETE DATA FROM A BRANCH NODE"),
170            NodeData::Leaves(leaves) => leaves.remove(index),
171        }
172    }
173
174    pub(crate) fn leaf(&self) -> bool {
175        match &self.data {
176            NodeData::Branches(_) => false,
177            NodeData::Leaves(_) => true,
178        }
179    }
180
181    fn size(&self) -> u64 {
182        HEADER_SIZE + self.data.size()
183    }
184
185    pub(crate) fn needs_merging(&self) -> bool {
186        self.data.len() < MIN_KEYS_PER_NODE || self.size() < (self.pagesize / 4)
187    }
188
189    pub(crate) fn spill<'a>(
190        &'a mut self,
191        bucket: &'a mut InnerBucket<'n>,
192        tx_freelist: &'a mut TxFreelist,
193        parent: Option<&'a mut Self>,
194    ) -> Result<Option<PageID>> {
195        let mut root_page_id: Option<PageID> = None;
196        if self.spilled {
197            return Ok(root_page_id);
198        }
199        // Sort the children so we iterate over them in order
200        self.children
201            .sort_by_cached_key(|id| bucket.nodes[*id as usize].borrow().data.first_key());
202
203        // spill all of the children nodes
204        let mut i = 0_usize;
205        // Becuase spilling a child can result in that child being split into more children
206        // we iterate using an index checking the length of the children vector at each iteration.
207        while i < self.children.len() {
208            let child_id = self.children[i];
209            let child = bucket.nodes[child_id as usize].clone();
210            let mut child = child.borrow_mut();
211            child.spill(bucket, tx_freelist, Some(self))?;
212            i += 1;
213        }
214
215        let new_siblings = self.split(bucket);
216        // We now have this node's final data, so write it to some dirty pages.
217        self.write(tx_freelist)?;
218        if let Some(new_siblings) = &new_siblings {
219            // We have some new siblings to welcome into the world!
220            // Get all of them spilled onto some dirty pages.
221            self.write(tx_freelist)?;
222            for s in new_siblings.iter() {
223                let mut s = s.borrow_mut();
224                s.write(tx_freelist)?;
225            }
226        }
227        // Check if we have a parent...
228        match parent {
229            Some(parent) => {
230                // If we do, update all of it's branches!
231                // Note that this means self is not the root node and we will return None.
232                // Tell our parent about our new page_id and key.
233
234                parent.insert_branch(&self.original_key, Branch::from_node(self));
235                if let Some(new_siblings) = new_siblings {
236                    // Tell the parent about our new siblings
237                    for s in new_siblings.iter() {
238                        let s = s.borrow();
239                        // All of these nodes are new, so the key will always be the first key in the dataset
240                        let branch = Branch::from_node(&s);
241                        parent.insert_branch(&None, branch);
242                    }
243                }
244            }
245            None => {
246                // If we don't, we are currently the root node.
247                match new_siblings {
248                    // If we're currently the root node but we just spawned siblings,
249                    // Then create a new root node to be our parent.
250                    Some(new_siblings) => {
251                        // Create branches for all of the children (ourselves included as the first child)
252                        let mut branches: Vec<Branch> = Vec::with_capacity(new_siblings.len() + 1);
253                        branches.push(Branch::from_node(self));
254                        for s in new_siblings {
255                            let s = s.borrow();
256                            branches.push(Branch::from_node(&s));
257                        }
258                        // Create parent from those branches
259                        let new_parent = bucket.new_node(NodeData::Branches(branches));
260                        let mut new_parent = new_parent.borrow_mut();
261                        // Spill the parent, potentially splitting it, and writing it's data to dirty pages
262                        match new_parent.spill(bucket, tx_freelist, None)? {
263                            // The new parent must return a new page_id, so we can update the bucket's
264                            // root page.
265                            Some(page_id) => root_page_id = Some(page_id),
266                            None => panic!("New parent did not return a new root_page_id"),
267                        };
268                    }
269
270                    None => {
271                        // No siblings means that self is still the root node.
272                        // Set the root_page_id to our new page.
273                        root_page_id = Some(self.page_id);
274                    }
275                }
276            }
277        }
278
279        Ok(root_page_id)
280    }
281
282    pub(crate) fn split<'a>(
283        &'a mut self,
284        bucket: &'a mut InnerBucket<'n>,
285    ) -> Option<Vec<Rc<RefCell<Node<'n>>>>> {
286        if self.data.len() <= (MIN_KEYS_PER_NODE * 2) || self.size() < self.pagesize {
287            return None;
288        }
289        let threshold = ((self.pagesize as f32) * FILL_PERCENT) as u64;
290        let mut split_indexes = Vec::<usize>::new();
291        let mut current_size = HEADER_SIZE;
292        let mut count = 0;
293        match &self.data {
294            NodeData::Branches(b) => {
295                let len = b.len();
296                for (i, b) in b[..len - 2].iter().enumerate() {
297                    if i > len - 3 {
298                        break;
299                    }
300                    count += 1;
301                    let size = BRANCH_SIZE + (b.key_size() as u64);
302                    let new_size = current_size + size;
303                    if count >= MIN_KEYS_PER_NODE && new_size > threshold {
304                        split_indexes.push(i + 1);
305                        current_size = HEADER_SIZE + size;
306                        count = 0;
307                    } else {
308                        current_size = new_size;
309                    }
310                }
311            }
312            NodeData::Leaves(leaves) => {
313                let len = leaves.len();
314                for (i, l) in leaves[..len - 2].iter().enumerate() {
315                    // if i > len - 3 {
316                    //     break;
317                    // }
318                    count += 1;
319                    let size = LEAF_SIZE + (l.size() as u64);
320                    let new_size = current_size + size;
321                    if count >= MIN_KEYS_PER_NODE && new_size > threshold {
322                        split_indexes.push(i + 1);
323                        current_size = HEADER_SIZE + size;
324                        count = 0;
325                    } else {
326                        current_size = new_size;
327                    }
328                }
329            }
330        };
331        // for some reason we didn't find a place to split
332        if split_indexes.is_empty() {
333            return None;
334        }
335
336        // split all of the data on the split indexes
337        // Create new vector of data to go on it's own pages
338        #[allow(clippy::needless_collect)]
339        let new_data: Vec<NodeData> = split_indexes
340            .into_iter()
341            // Split from the right size so we only break off small chunks at a time.
342            .rev()
343            // Split the data.
344            .map(|i| self.data.split_at(i))
345            .collect();
346
347        // Create nodes for each bit of data we split apart.
348        Some(
349            new_data
350                .into_iter()
351                // Reverse again so the nodes are in the correct order
352                .rev()
353                .map(|data| bucket.new_node(data))
354                .collect(),
355        )
356    }
357
358    // Write this node to a new (in-memory) page.
359    pub(crate) fn write(&mut self, tx_freelist: &mut TxFreelist) -> Result<()> {
360        if self.deleted {
361            return Ok(());
362        }
363        self.spilled = true;
364        let page = self.allocate(tx_freelist)?;
365        page.write_node(self, self.num_pages)
366    }
367
368    // Free our old page (if we have one) and get a new page for ourselves.
369    fn allocate<'a>(&'a mut self, tx_freelist: &'a mut TxFreelist) -> Result<&'n mut Page> {
370        self.free_page(tx_freelist);
371        let size = self.size();
372        let page = tx_freelist.allocate(size)?;
373        self.page_id = page.id;
374        self.num_pages = page.overflow + 1;
375        Ok(page)
376    }
377
378    // Give our current page back to the freelist (if we have one)
379    pub(crate) fn free_page(&mut self, tx_freelist: &mut TxFreelist) {
380        if self.page_id != 0 {
381            tx_freelist.free(self.page_id, self.num_pages);
382            self.page_id = 0;
383        }
384    }
385}
386
387pub(crate) enum NodeData<'a> {
388    Branches(Vec<Branch<'a>>),
389    Leaves(Vec<Leaf<'a>>),
390}
391
392impl<'a> NodeData<'a> {
393    pub(crate) fn len(&self) -> usize {
394        match self {
395            NodeData::Branches(b) => b.len(),
396            NodeData::Leaves(l) => l.len(),
397        }
398    }
399
400    fn size(&self) -> u64 {
401        match self {
402            NodeData::Branches(b) => b.iter().fold(BRANCH_SIZE * b.len() as u64, |acc, b| {
403                acc + b.key_size() as u64
404            }),
405            NodeData::Leaves(l) => l
406                .iter()
407                .fold(LEAF_SIZE * l.len() as u64, |acc, l| acc + l.size() as u64),
408        }
409    }
410
411    pub(crate) fn first_key<'b>(&'b self) -> Bytes<'a> {
412        debug_assert!(self.len() > 0, "Cannot get key parts of empty data");
413        match self {
414            NodeData::Branches(b) => b[0].key.clone(),
415            NodeData::Leaves(l) => l[0].key_bytes(),
416        }
417    }
418
419    pub(crate) fn merge(&mut self, other_data: &mut Self) {
420        match (self, other_data) {
421            (NodeData::Branches(b1), NodeData::Branches(b2)) => {
422                b1.append(b2);
423                b1.sort_unstable_by_key(|b| b.key.clone());
424            }
425            (NodeData::Leaves(l1), NodeData::Leaves(l2)) => {
426                l1.append(l2);
427                l1.sort_unstable_by_key(|l| l.key_bytes());
428                let mut last = l1[0].key();
429                for l in l1[1..].iter() {
430                    if last >= l.key() {
431                        println!("HA. GOT 'EM!");
432                    }
433                    last = l.key();
434                }
435            }
436            _ => panic!("incompatible data types"),
437        }
438    }
439
440    fn split_at<'b>(&'b mut self, index: usize) -> NodeData<'a> {
441        match self {
442            NodeData::Branches(b) => NodeData::Branches(b.split_off(index)),
443            NodeData::Leaves(l) => NodeData::Leaves(l.split_off(index)),
444        }
445    }
446}
447
448pub(crate) struct Branch<'a> {
449    key: Bytes<'a>,
450    pub(crate) page: PageID,
451}
452
453impl<'a> Branch<'a> {
454    pub(crate) fn from_node<'b>(node: &'b Node<'a>) -> Branch<'a> {
455        Branch {
456            key: node.data.first_key(),
457            page: node.page_id,
458        }
459    }
460
461    pub(crate) fn key(&self) -> &[u8] {
462        self.key.as_ref()
463    }
464
465    pub(crate) fn key_size(&self) -> usize {
466        self.key.size()
467    }
468}
469
470#[derive(Clone)]
471pub(crate) enum Leaf<'a> {
472    Bucket(Bytes<'a>, BucketMeta),
473    Kv(Bytes<'a>, Bytes<'a>),
474}
475
476impl<'a> Leaf<'a> {
477    pub(crate) fn from_leaf<'b>(l: &'b LeafElement) -> Leaf<'a> {
478        match l.node_type {
479            Node::TYPE_DATA => Leaf::Kv(Bytes::Slice(l.key()), Bytes::Slice(l.value())),
480            Node::TYPE_BUCKET => Leaf::Bucket(Bytes::Slice(l.key()), l.value().into()),
481            _ => panic!("INVALID NODE TYPE"),
482        }
483    }
484
485    pub(crate) fn node_type(&self) -> NodeType {
486        match self {
487            Self::Bucket(_, _) => Node::TYPE_BUCKET,
488            Self::Kv(_, _) => Node::TYPE_DATA,
489        }
490    }
491
492    pub(crate) fn key_bytes<'b>(&'b self) -> Bytes<'a> {
493        match self {
494            Self::Bucket(name, _) => name.clone(),
495            Self::Kv(k, _) => k.clone(),
496        }
497    }
498
499    pub(crate) fn key(&self) -> &[u8] {
500        match self {
501            Self::Bucket(b, _) => b.as_ref(),
502            Self::Kv(k, _) => k.as_ref(),
503        }
504    }
505
506    pub(crate) fn value(&self) -> &[u8] {
507        match self {
508            Self::Bucket(_, meta) => meta.as_ref(),
509            Self::Kv(_, v) => v.as_ref(),
510        }
511    }
512
513    pub(crate) fn size(&self) -> usize {
514        match self {
515            Self::Bucket(b, _) => b.size() + META_SIZE,
516            Self::Kv(k, v) => k.size() + v.size(),
517        }
518    }
519
520    pub(crate) fn is_kv(&self) -> bool {
521        match self {
522            Self::Bucket(_, _) => false,
523            Self::Kv(_, _) => true,
524        }
525    }
526}
527
528// Change to DataType
529pub(crate) type NodeType = u8;
530
531impl<'n> Node<'n> {
532    pub(crate) const TYPE_DATA: NodeType = 0x00;
533    pub(crate) const TYPE_BUCKET: NodeType = 0x01;
534}
535
536#[cfg(test)]
537mod test {
538    use std::collections::HashMap;
539
540    use super::*;
541    use crate::{
542        testutil::{rand_bytes, RandomFile},
543        OpenOptions,
544    };
545
546    #[test]
547    fn test_split() -> Result<()> {
548        let random_file = RandomFile::new();
549        let db = OpenOptions::new().pagesize(1024).open(&random_file)?;
550        // Test split
551        {
552            let tx = db.tx(true)?;
553            let b = tx.create_bucket("a")?;
554            let mut data = HashMap::new();
555            // Insert six nodes, each the size of a page.
556            for key in ["a", "b", "c", "d", "e", "f"] {
557                let value = rand_bytes(512);
558                b.put(key, value.clone())?;
559                data.insert(key, value);
560            }
561            {
562                // Since this bucket was just created, there should be one node.
563                let mut b = b.inner.borrow_mut();
564                assert!(b.nodes.len() == 1);
565
566                let tx_freelist = tx.inner.borrow().freelist.clone();
567                let mut tx_freelist = tx_freelist.borrow_mut();
568                b.spill(&mut tx_freelist)?;
569                // Since everything is spilled, there should be two key / value pairs to a list.
570                // That means we should have three leaf nodes and one branch node at the root.
571                assert!(b.nodes.len() == 4);
572                // Make sure the branch has the right data
573                let branch_node = &b.nodes[3];
574                let branch_node = branch_node.borrow();
575                if let NodeData::Branches(branches) = &branch_node.data {
576                    assert!(branches.len() == 3);
577
578                    assert!(branches[0].key() == b"a");
579                    assert!(branches[0].page == 7);
580
581                    assert!(branches[1].key() == b"c");
582                    assert!(branches[1].page == 10);
583
584                    assert!(branches[2].key() == b"e");
585                    assert!(branches[2].page == 13);
586                } else {
587                    panic!("Node 3 should have been a branch node")
588                }
589                // Make sure each node has the right data
590                for (n, keys) in b.nodes[0..=2]
591                    .iter()
592                    .zip([["a", "b"], ["c", "d"], ["e", "f"]])
593                {
594                    let n = n.borrow();
595                    assert!(n.data.len() == 2);
596                    match &n.data {
597                        NodeData::Leaves(leaves) => {
598                            for (kv, key) in leaves.iter().zip(keys) {
599                                assert!(kv.key() == key.as_bytes());
600                                assert!(kv.value() == data[key]);
601                            }
602                        }
603                        _ => panic!("Must be a leaf node"),
604                    }
605                }
606            }
607        }
608        Ok(())
609    }
610}