// stacker/expull.rs — exponential unrolled linked list.

use std::mem;

use common::serialize_vint_u32;

use crate::memory_arena::{load, store};
use crate::{Addr, MemoryArena};
7
/// Blocks stop doubling once they reach this size (in bytes).
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
/// Number of data bytes stored inline inside `ExpUnrolledLinkedList` itself.
const FIRST_BLOCK: usize = 16;
/// Size of the inline buffer: `FIRST_BLOCK` data bytes followed by the
/// serialized `Addr` of the first arena-allocated block.
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
11
/// Result of `len_to_capacity`: how many bytes may still be written at the
/// tail of a list of the given length before a new block is required.
enum CapacityResult {
    /// This many bytes are still free in the current block.
    Available(u32),
    /// The current block is full; a new block of this many bytes is needed.
    NeedAlloc(u32),
}
16
17fn len_to_capacity(len: u32) -> CapacityResult {
18    match len {
19        0..=15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
20        16..=MAX_BLOCK_LEN => {
21            let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
22            let available = cap - len;
23            if available == 0 {
24                CapacityResult::NeedAlloc(len)
25            } else {
26                CapacityResult::Available(available)
27            }
28        }
29        n => {
30            let available = n % MAX_BLOCK_LEN;
31            if available == 0 {
32                CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
33            } else {
34                CapacityResult::Available(MAX_BLOCK_LEN - available)
35            }
36        }
37    }
38}
39
40/// An exponential unrolled link.
41///
42/// The use case is as follows. Tantivy's indexer conceptually acts like a
43/// `HashMap<Term, Vec<u32>>`. As we come across a given term in document
44/// `D`, we lookup the term in the map and append the document id to its vector.
45///
46/// The vector is then only read when it is serialized.
47///
48/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
49/// problem.
50///
51/// It combines the idea of the unrolled linked list and tries to address the
52/// problem of selecting an adequate block size using a strategy similar to
53/// that of the `Vec` amortized resize strategy.
54///
55/// Data is stored in a linked list of blocks. The first block has a size of `4`
56/// and each block has a length of twice that of the previous block up to
57/// `MAX_BLOCK_LEN = 32768`.
58///
59/// This strategy is a good trade off to handle numerous very rare terms
60/// and avoid wasting half of the memory for very frequent terms.
61#[derive(Debug, Clone, Copy)]
62pub struct ExpUnrolledLinkedList {
63    len: u32,
64    tail: Addr,
65    inlined_data: [u8; INLINED_BLOCK_LEN],
66}
67
/// Short-lived writer handle pairing an `ExpUnrolledLinkedList` with the
/// `MemoryArena` in which its blocks are allocated.
pub struct ExpUnrolledLinkedListWriter<'a> {
    eull: &'a mut ExpUnrolledLinkedList,
    arena: &'a mut MemoryArena,
}
72
/// Returns a mutable slice of the bytes that can currently be written at the
/// tail of `eull`, allocating a fresh block in `arena` first if the inline
/// buffer or the current block is exactly full.
///
/// The returned slice may be shorter than what the caller wants to write;
/// the caller is expected to loop (see `extend_from_slice`).
fn ensure_capacity<'a>(
    eull: &'a mut ExpUnrolledLinkedList,
    arena: &'a mut MemoryArena,
) -> &'a mut [u8] {
    if eull.len <= FIRST_BLOCK as u32 {
        // We are still hitting the inline block.
        if eull.len < FIRST_BLOCK as u32 {
            return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
        }
        // Inline buffer exactly full: we need to allocate a new block!
        // Every block carries `size_of::<Addr>()` trailing bytes reserved
        // for the address of the next block.
        let new_block_addr: Addr = arena.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
        // Link the first arena block from the tail of the inline buffer.
        store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
        eull.tail = new_block_addr;
        return arena.slice_mut(eull.tail, FIRST_BLOCK);
    }
    let len = match len_to_capacity(eull.len) {
        CapacityResult::NeedAlloc(new_block_len) => {
            // Current block full: allocate the next block and link it by
            // writing its address at `tail`, which points at the link slot
            // right past the current block's data bytes.
            let new_block_addr: Addr =
                arena.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
            arena.write_at(eull.tail, new_block_addr);
            eull.tail = new_block_addr;
            new_block_len
        }
        CapacityResult::Available(available) => available,
    };
    arena.slice_mut(eull.tail, len as usize)
}
100
101impl<'a> ExpUnrolledLinkedListWriter<'a> {
102    pub fn write_u32_vint(&mut self, val: u32) {
103        let mut buf = [0u8; 8];
104        let data = serialize_vint_u32(val, &mut buf);
105        self.extend_from_slice(data);
106    }
107
108    pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
109        while !buf.is_empty() {
110            let add_len: usize;
111            {
112                let output_buf = ensure_capacity(self.eull, self.arena);
113                add_len = buf.len().min(output_buf.len());
114                output_buf[..add_len].copy_from_slice(&buf[..add_len]);
115            }
116            self.eull.len += add_len as u32;
117            self.eull.tail = self.eull.tail.offset(add_len as u32);
118            buf = &buf[add_len..];
119        }
120    }
121}
122
123impl Default for ExpUnrolledLinkedList {
124    fn default() -> ExpUnrolledLinkedList {
125        ExpUnrolledLinkedList {
126            len: 0u32,
127            tail: Addr::null_pointer(),
128            inlined_data: [0u8; INLINED_BLOCK_LEN],
129        }
130    }
131}
132
133impl ExpUnrolledLinkedList {
134    #[inline]
135    pub fn writer<'a>(&'a mut self, arena: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
136        ExpUnrolledLinkedListWriter { eull: self, arena }
137    }
138
139    pub fn read_to_end(&self, arena: &MemoryArena, output: &mut Vec<u8>) {
140        let len = self.len as usize;
141        if len <= FIRST_BLOCK {
142            output.extend_from_slice(&self.inlined_data[..len]);
143            return;
144        }
145        output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
146        let mut cur = FIRST_BLOCK;
147        let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
148        loop {
149            let cap = match len_to_capacity(cur as u32) {
150                CapacityResult::Available(capacity) => capacity,
151                CapacityResult::NeedAlloc(capacity) => capacity,
152            } as usize;
153            let data = arena.slice(addr, cap);
154            if cur + cap >= len {
155                output.extend_from_slice(&data[..(len - cur)]);
156                return;
157            }
158            output.extend_from_slice(data);
159            cur += cap;
160            addr = arena.read(addr.offset(cap as u32));
161        }
162    }
163}
164
#[cfg(test)]
mod tests {
    use common::{read_u32_vint, write_u32_vint};

    use super::super::MemoryArena;
    use super::{len_to_capacity, *};

    /// Small writes through several short-lived writers read back in order.
    #[test]
    fn test_eull() {
        let mut arena = MemoryArena::default();
        let mut stack = ExpUnrolledLinkedList::default();
        stack.writer(&mut arena).extend_from_slice(&[1u8]);
        stack.writer(&mut arena).extend_from_slice(&[2u8]);
        stack.writer(&mut arena).extend_from_slice(&[3u8, 4u8]);
        stack.writer(&mut arena).extend_from_slice(&[5u8]);
        {
            let mut buffer = Vec::new();
            stack.read_to_end(&arena, &mut buffer);
            assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
        }
    }

    /// Enough vints to spill from the inline buffer into arena blocks.
    #[test]
    fn test_eull_long() {
        let mut arena = MemoryArena::default();
        let mut eull = ExpUnrolledLinkedList::default();
        let data: Vec<u32> = (0..100).collect();
        for &el in &data {
            eull.writer(&mut arena).write_u32_vint(el);
        }
        let mut buffer = Vec::new();
        eull.read_to_end(&arena, &mut buffer);
        let mut result = vec![];
        let mut remaining = &buffer[..];
        while !remaining.is_empty() {
            result.push(read_u32_vint(&mut remaining));
        }
        assert_eq!(&result[..], &data[..]);
    }

    /// Interleaves writes to two lists sharing one arena, checking that
    /// their block chains do not interfere with each other.
    #[test]
    fn test_eull_interlaced() {
        // NOTE: this arena local was previously named `eull`, which clashed
        // with the list type's name; renamed for clarity.
        let mut arena = MemoryArena::default();
        let mut stack = ExpUnrolledLinkedList::default();
        let mut stack2 = ExpUnrolledLinkedList::default();

        let mut vec1: Vec<u8> = vec![];
        let mut vec2: Vec<u8> = vec![];

        for i in 0..9 {
            stack.writer(&mut arena).write_u32_vint(i);
            assert!(write_u32_vint(i, &mut vec1).is_ok());
            if i % 2 == 0 {
                stack2.writer(&mut arena).write_u32_vint(i);
                assert!(write_u32_vint(i, &mut vec2).is_ok());
            }
        }
        let mut res1 = vec![];
        let mut res2 = vec![];
        stack.read_to_end(&arena, &mut res1);
        stack2.read_to_end(&arena, &mut res2);
        assert_eq!(&vec1[..], &res1[..]);
        assert_eq!(&vec2[..], &res2[..]);
    }

    /// Available capacity must count down by one per written byte and reach
    /// exactly 0 right before every `NeedAlloc`.
    #[test]
    fn test_jump_if_needed() {
        let mut available = 16u32;
        for i in 0..10_000_000 {
            match len_to_capacity(i) {
                CapacityResult::NeedAlloc(cap) => {
                    assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
                    available = cap;
                }
                CapacityResult::Available(cap) => {
                    assert_eq!(
                        available, cap,
                        "Failed len={}: Expected {} Got {}",
                        i, available, cap
                    );
                }
            }
            available -= 1;
        }
    }

    /// Pins the doubling schedule of the first ten block allocations.
    #[test]
    fn test_jump_if_needed_progression() {
        let mut v = vec![];
        for i in 0.. {
            if v.len() >= 10 {
                break;
            }
            if let CapacityResult::NeedAlloc(cap) = len_to_capacity(i) {
                v.push((i, cap));
            }
        }
        assert_eq!(
            &v[..],
            &[
                (16, 16),
                (32, 32),
                (64, 64),
                (128, 128),
                (256, 256),
                (512, 512),
                (1024, 1024),
                (2048, 2048),
                (4096, 4096),
                (8192, 8192)
            ]
        );
    }
}
279
#[cfg(all(test, feature = "unstable"))]
mod bench {
    use std::iter;

    use test::Bencher;

    use super::super::MemoryArena;
    use super::ExpUnrolledLinkedList;

    const NUM_STACK: usize = 10_000;
    const STACK_SIZE: u32 = 1000;

    /// Baseline: push u32s into plain `Vec`s, visiting the stacks in a
    /// pseudo-random order.
    #[bench]
    fn bench_push_vec(bench: &mut Bencher) {
        bench.iter(|| {
            // Fix: capacity hint was 100 while NUM_STACK (10_000) elements
            // are pushed, causing reallocations inside the measured closure.
            let mut vecs = Vec::with_capacity(NUM_STACK);
            for _ in 0..NUM_STACK {
                vecs.push(Vec::new());
            }
            for s in 0..NUM_STACK {
                for i in 0u32..STACK_SIZE {
                    // Pseudo-random permutation of the stack index.
                    let t = s * 392017 % NUM_STACK;
                    vecs[t].push(i);
                }
            }
        });
    }

    /// Same workload through `ExpUnrolledLinkedList` + `MemoryArena`.
    #[bench]
    fn bench_push_stack(bench: &mut Bencher) {
        bench.iter(|| {
            let mut arena = MemoryArena::default();
            let mut stacks: Vec<ExpUnrolledLinkedList> =
                iter::repeat_with(ExpUnrolledLinkedList::default)
                    .take(NUM_STACK)
                    .collect();
            for s in 0..NUM_STACK {
                for i in 0u32..STACK_SIZE {
                    // Pseudo-random permutation of the stack index.
                    let t = s * 392017 % NUM_STACK;
                    stacks[t]
                        .writer(&mut arena)
                        .extend_from_slice(&i.to_ne_bytes());
                }
            }
        });
    }
}
326}