insertion_set/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
//! Performs a set of batched insertions on a vector.
//!
//! [`Vec::insert(index, value)`][Vec::insert] takes `O(n)` time to move internal memory,
//! so calling it in a loop can cause quadratic blowup.
//!
//! If you batch multiple values together with an [`InsertionSet`]
//! you can defer the expensive movement of the vector's
//! memory till the of the loop.
//!
//! This code was originally copied from the first prototype compiler for [DuckLogic].
//! It was inspired by the way the [B3 JIT] handles insertions.
//!
//! [DuckLogic]: https://ducklogic.org/
//! [B3 JIT]: https://webkit.org/blog/5852/introducing-the-b3-jit-compiler/
use std::fmt::Debug;
use std::iter::{ExactSizeIterator, FromIterator};
use std::ops::Range;

use self::sorting::insertion_sort_by_key;

mod shift;
mod sorting;

use self::shift::BulkShifter;

/// A value that is pending insertion
#[derive(Debug)]
pub struct Insertion<T> {
    /// Where in the original vector to insert this value.
    ///
    /// This is equivelant to the index argument in `Vec::insert`
    pub index: usize,
    /// The value to be inserted
    pub element: T,
}
impl<T> Insertion<T> {
    /// Create a new Insertion
    #[inline]
    pub fn new(index: usize, element: T) -> Self {
        Insertion { index, element }
    }
}
impl<T> From<(usize, T)> for Insertion<T> {
    #[inline]
    fn from(tuple: (usize, T)) -> Self {
        Insertion::new(tuple.0, tuple.1)
    }
}

/// A set of pending insertions on a Vec
///
/// When multiple insertions at a
///
/// See module documentation for an overview.
pub struct InsertionSet<T> {
    insertions: Vec<Insertion<T>>,
}
impl<T> InsertionSet<T> {
    /// Create a new InsertionSet
    #[inline]
    pub fn new() -> Self {
        InsertionSet {
            insertions: Vec::new(),
        }
    }
    /// Queue the specified insertion
    ///
    /// If there are multiple insertions at the same index,
    /// they will be applied in the order queued.
    #[inline]
    pub fn push(&mut self, insertion: Insertion<T>) {
        self.insertions.push(insertion)
    }
    /// Insert the element to be inserted before the given index
    ///
    /// If multiple elements are queued to be inserted at the same index,
    /// they will be applied in the original order queued.
    #[inline]
    pub fn insert(&mut self, index: usize, element: T) {
        self.push(Insertion { index, element })
    }
    /// Apply all of the pending insertions against the specified vector,
    /// returning the result
    #[inline]
    pub fn applied(mut self, mut target: Vec<T>) -> Vec<T> {
        self.apply(&mut target);
        target
    }
    /// The number of insertions that are currently queued
    #[inline]
    pub fn desired_insertions(&self) -> usize {
        self.insertions.len()
    }
    /// List the updated locations of all the elements (both original and newly inserted).
    ///
    /// See [Self::compute_updated_locations] for details
    pub fn list_updated_locations(&mut self, target: &[T]) -> Vec<(OriginalLocation, usize)> {
        let mut result = Vec::with_capacity(target.len() + self.desired_insertions());
        self.compute_updated_locations(target, |original, updated| {
            result.push((original, updated))
        });
        result.sort_by_key(|&(_, updated)| updated);
        result
    }
    /// Compute the updated locations of all the elements (both original and newly inserted).
    ///
    /// Assumes this set of insertions are being applied against the specified slice,
    /// invoking the callback on each element (even if the location is unchanged).
    ///
    /// If any of the insertion indexes are out of bounds of the original vec,
    /// then this function will panic.
    pub fn compute_updated_locations<F>(&mut self, target: &[T], mut func: F)
    where
        F: FnMut(OriginalLocation, usize),
    {
        self.sort();
        compute_updated_locations(
            target,
            self.insertions
                .iter()
                .rev()
                .map(|insertion| insertion.index),
            |original, updated| {
                func(
                    match original {
                        OriginalLocation::Original(_) => original,
                        OriginalLocation::Insertion(reversed_index) => {
                            // Convert the reversed insertion index back to the original one
                            OriginalLocation::Insertion(
                                self.insertions.len() - (reversed_index + 1),
                            )
                        }
                    },
                    updated,
                )
            },
        )
    }
    /// Applies all the insertions to the specified target vector.
    ///
    /// This reuses the Vector's existing memory if possible,
    /// but may require a reallocation (due to new values)
    ///
    /// The average runtime of this function is `O(n + m)`,
    /// where `n` is the number of existing elements and `m` is the number of insertions.
    pub fn apply(&mut self, target: &mut Vec<T>) {
        self.sort();
        apply_bulk_insertions(target, PoppingIter(&mut self.insertions));
    }
    fn sort(&mut self) {
        /*
         * Why would we possibly want to use insertion sort here?
         * First of all,
         * we need to maintain a stable sort to preserve the original order of the `Insertion`s.
         * Insertion sort has many other advantages over mergesort and quicksort,
         * and can be significantly faster in some scenarios.
         *
         * When the array is already mostly sorted, insertion sort has average running time `O(nk)`,
         * where `k` is the average distance of each element from its proper position.
         * In a randomly sorted array `k == n` giving `O(n^2)` worst case performance,
         * this isn't true in all scenarios as `k` may be significantly smaller.
         * We expect the `InsertionSet` to be mostly sorted already,
         * with only a few slightly out of place elements,
         * giving a very low average `k` value and very good running time.
         *
         * This is inspired by WebKit's choice to use bubble sort for their insertion set,
         * except that bubble sort is a terrible algorithm and insertion sort is much better.
         */
        insertion_sort_by_key(&mut *self.insertions, |insertion| insertion.index);
    }
}
impl<T> FromIterator<Insertion<T>> for InsertionSet<T> {
    #[inline]
    fn from_iter<I: IntoIterator<Item = Insertion<T>>>(iter: I) -> Self {
        InsertionSet {
            insertions: iter.into_iter().collect(),
        }
    }
}
impl<T> FromIterator<(usize, T)> for InsertionSet<T> {
    #[inline]
    fn from_iter<I: IntoIterator<Item = (usize, T)>>(iter: I) -> Self {
        iter.into_iter().map(Insertion::from).collect()
    }
}
impl<T> Default for InsertionSet<T> {
    #[inline]
    fn default() -> Self {
        InsertionSet::new()
    }
}

struct PoppingIter<'a, T: 'a>(&'a mut Vec<T>);
impl<'a, T> Iterator for PoppingIter<'a, T> {
    type Item = T;

    #[inline]
    fn next(&mut self) -> Option<T> {
        self.0.pop()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.0.len(), Some(self.0.len()))
    }
}
impl<'a, T> ExactSizeIterator for PoppingIter<'a, T> {}

/// Applies all the specified insertions into the target vector.
///
/// The insertion iterator must be sorted in reverse order and give the proper size for its `ExactSizeIterator`.
/// Violating these constraints will never cause undefined behavior,
/// since internally we use the completely safe `BulkShifter` abstraction.
pub fn apply_bulk_insertions<T, I>(target: &mut Vec<T>, mut insertions: I)
where
    I: Iterator<Item = Insertion<T>>,
    I: ExactSizeIterator,
{
    let mut shifter = BulkShifter::new(target, insertions.len());
    /*
     * We perform insertions in reverse order to reduce moving memory,
     * and ensure that the function is panic safe.
     *
     * For example, given the vector
     * and the InsertionSet `[(0, 0), (1, 2), (1, 3) (4, 9)]`:
     *
     * Since the first (working backwards) insertion is `(4, 9)`,
     * we need to to shift all elements after our first insertion
     * to the left 4 places:
     * `[1, 4, 5, 7, undef, undef, undef, undef, 11]`.
     * The element `11` will never need to be moved again,
     * since we've already made room for all future insertions.
     *
     * Next, we perform our first insertion (4, 9) at the last `undef` element:
     * `[1, 4, 5, 7, undef, undef, undef, 9, 11]`.
     * We only have 3 insertions left to perform,
     * so all future shifts will only need to move over two.
     * Then, we handle the group of insertions `[(1, 2), [(1, 3)]`,
     * and shift all elements past index 1 to the left 3 spaces:
     * [1, undef, undef, undef, 4, 5, 7, 9, 11].
     * Then we perform our desired insertions at index 1:
     * [1, undef, 2, 3, 4, 9, 11].
     * Finally, we perform the same process for the final insertion (0, 0),
     * resulting in the desired result: [0, 1, 2, 3, 4, 9, 11].
     */
    while !shifter.is_finished() {
        let Insertion { index, element } = insertions.next().expect("Expected more insertions!");
        shifter.shift_original(index);
        shifter.push_shifted(element);
    }
    shifter.finish();
    assert_eq!(insertions.len(), 0, "Unexpected insertions");
}

/// The original location of an element (before a set of insertions are applied)
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum OriginalLocation {
    /// The element was a queued insertion with the specified index
    Insertion(usize),
    /// The element was originally part of the vector
    Original(usize),
}

/// Compute the updated locations of all elements (original + inserted).
///
/// See [InsertionSet::compute_updated_locations] for details
pub fn compute_updated_locations<T, I, F>(target: &[T], mut insertions: I, mut updated: F)
where
    I: Iterator<Item = usize>,
    I: ExactSizeIterator,
    F: FnMut(OriginalLocation, usize),
{
    // This mirrors `apply_bulk_insertions` without actually shifting memory
    let mut original_len = target.len();
    let shifted_end = original_len + insertions.len();
    let mut shifted_start = shifted_end;
    let mut insertion_id = 0;
    while original_len != shifted_start {
        let insertion_index = insertions.next().expect("Expected more insertions!");
        assert!(
            insertion_index <= original_len,
            "Invalid insertion index {} > len {}",
            insertion_index,
            original_len
        );
        let moved_memory = original_len - insertion_index;
        if moved_memory > 0 {
            assert!(
                shifted_start >= moved_memory && insertion_index <= shifted_start - moved_memory
            );
            update_range(
                insertion_index..original_len,
                shifted_start - moved_memory,
                &mut updated,
            );
            shifted_start -= moved_memory;
            original_len = insertion_index;
        }
        assert!(shifted_start > original_len);
        shifted_start -= 1;
        updated(OriginalLocation::Insertion(insertion_id), shifted_start);
        insertion_id += 1;
    }
    for original_index in 0..original_len {
        updated(OriginalLocation::Original(original_index), original_index);
    }
    assert_eq!(insertions.len(), 0, "Unexpected insertions");
}
#[inline]
fn update_range<F: FnMut(OriginalLocation, usize)>(
    original: Range<usize>,
    updated_start: usize,
    func: &mut F,
) {
    let mut updated = updated_start;
    for original_index in original {
        func(OriginalLocation::Original(original_index), updated);
        updated += 1;
    }
}

#[cfg(test)]
mod test {
    use super::*;
    #[test]
    fn basic() {
        /*
         * For example, given the vector `[1, 4, 5, 7, 11]`
         * and the InsertionSet `[(0, 0), (1, 2), (1, 3) (4, 9)]`:
         */
        let vector = vec![1, 4, 5, 7, 11];
        let insertions = [(0, 0), (1, 2), (1, 3), (4, 9)]
            .iter()
            .cloned()
            .collect::<InsertionSet<u32>>();
        assert_eq!(insertions.applied(vector), vec![0, 1, 2, 3, 4, 5, 7, 9, 11]);
    }
    #[test]
    fn updated_locations() {
        /*
         * For example, given the vector `[1, 4, 5, 7, 11]`
         * and the InsertionSet `[(0, 0), (1, 2), (1, 3) (4, 9)]`:
         */
        let vector = vec![1, 4, 5, 7, 11];
        let mut insertions = [(0, 0), (1, 2), (1, 3), (4, 9)]
            .iter()
            .cloned()
            .collect::<InsertionSet<u32>>();
        assert_eq!(
            insertions.list_updated_locations(&vector),
            vec![
                (OriginalLocation::Insertion(0), 0),
                (OriginalLocation::Original(0), 1),
                (OriginalLocation::Insertion(1), 2),
                (OriginalLocation::Insertion(2), 3),
                (OriginalLocation::Original(1), 4),
                (OriginalLocation::Original(2), 5),
                (OriginalLocation::Original(3), 6),
                (OriginalLocation::Insertion(3), 7),
                (OriginalLocation::Original(4), 8),
            ]
        );
    }
    #[test]
    fn empty_updated_locations() {
        let vector = vec![1, 4, 5, 7, 11];
        assert_eq!(
            InsertionSet::new().list_updated_locations(&vector),
            vec![
                (OriginalLocation::Original(0), 0),
                (OriginalLocation::Original(1), 1),
                (OriginalLocation::Original(2), 2),
                (OriginalLocation::Original(3), 3),
                (OriginalLocation::Original(4), 4),
            ]
        );
    }
}