// pagination_packing — lib.rs
1#![deny(missing_docs)]
2#![doc = include_str!("../README.md")]
3
4use std::{
5    collections::{HashSet, VecDeque},
6    hash::Hash,
7    ops::Index,
8};
9
10/// Packs `Tile`s (composed of [comparable](Eq) `Symbol`s) into pages (essentially `[Symbol; page_size]`).
11/// Returns a vector of “assignments”:
12///
13/// ```rust
14/// # use std::collections::HashSet;
15/// # use pagination_packing::overload_and_remove;
16/// #
17/// // Each lesson focuses on a single sentence, and refers to one page of the vocabulary booklet.
18/// let vocabulary: [&[_]; 5] = [
19///     &[], // The introductory lesson actually does not contain a sentence.
20///     &["little", "lamb"], // "Mary had a little lamb."
21///     &["tailor", "rich"], // "My tailor is rich."
22///     &["rich", "little", "people"], // "Those rich little people!"
23///     &["send", "dress", "tailor"], // "Send your dress to my tailor."
24/// ];
25/// // Each page of the booklet can contain 4 words of vocabulary.
26/// let assignments = overload_and_remove::<_, _, [_]>(&vocabulary, 4);
27///
28/// let booklet: Vec<HashSet<_>> = assignments
29///     .into_iter()
30///     .map(|page_assignments| {
31///        page_assignments.referenced_tiles(&vocabulary).copied().flatten().collect()
32///     })
33///     .collect();
34/// for (i, page) in booklet.iter().enumerate() {
35///     println!("Page {}: {page:?}", i + 1);
36/// }
37///
38/// // Each element of `vocabulary` is fully contained within one of the pages.
39/// assert_eq!( // As you can see, we can't find a lesson...
40///     vocabulary.iter().find(|&&lesson| {
41///         let set = HashSet::from_iter(lesson);
42///         // ...such that there isn't a page that the lesson is a subset of.
43///         !booklet.iter().any(|page| set.is_subset(page))
44///     }),
45///     None,
46/// );
47/// ```
48///
49/// This algorithm has quadratic complexity (compared to the linear complexity of gluttonous heuristics like “first-fit”), but performs better on many more inputs.
50pub fn overload_and_remove<
51    Symbol: Eq + Hash,
52    Tile: AsRef<[Symbol]> + ?Sized,
53    C: Index<usize, Output = Tile> + ?Sized,
54>(
55    tiles: &C,
56    page_size: usize,
57) -> Vec<AssignedTiles>
58where
59    for<'a> &'a C: IntoIterator,
60    for<'a> <&'a C as IntoIterator>::IntoIter: ExactSizeIterator, // This gives the C(ollection)'s length.
61{
62    // Sort the tiles by size, which improves the packing algorithm's efficiency.
63    let mut queue = VecDeque::with_capacity(tiles.into_iter().len());
64    for i in 0..tiles.into_iter().len() {
65        queue.insert(
66            // Note that we want them sorted *largest first*!
67            // (Ties are broken so that the new element is inserted as late as possible, to avoid moving data too much.)
68            queue.partition_point(|tile_ref: &TileRef| {
69                tiles[tile_ref.tile_idx].as_ref().len() >= tiles[i].as_ref().len()
70            }),
71            TileRef::new(i),
72        );
73    }
74
75    let mut assignments: Vec<AssignedTiles> = Vec::new();
76    while let Some(tile_ref) = queue.pop_front() {
77        let tile = &tiles[tile_ref.tile_idx];
78
79        let best_page_index = assignments
80            .iter()
81            .enumerate()
82            .filter(|(i, _assignment)| tile_ref.is_banned_from(*i))
83            .fold(
84                (assignments.len(), tile.as_ref().len() as f64),
85                |(best_idx, best_rel_size), (i, assignment)| {
86                    let relative_size = assignment.relative_size_of(tile, tiles);
87                    if relative_size < best_rel_size {
88                        (i, relative_size)
89                    } else {
90                        (best_idx, best_rel_size)
91                    }
92                },
93            )
94            .0;
95
96        if let Some(best_page) = assignments.get_mut(best_page_index) {
97            // Add the tile to that page.
98            best_page.assigned.push(tile_ref);
99
100            // If this overloads the page, get it back to normal (if possible).
101            while best_page.volume(tiles) > page_size {
102                // Look for a tile minimising "efficiency", i.e. size divided by relative size.
103                let efficiency = |tile: &Tile| {
104                    (tile.as_ref().len()) as f64 / best_page.relative_size_of(tile, tiles)
105                };
106                let [(min_efficiency, (min_idx, _min_tile)), (max_efficiency, (_max_idx, _max_tile))] =
107                    minmax_by_key(
108                        best_page.referenced_tiles(tiles).enumerate(),
109                        |(_i, tile_ref)| efficiency(tile_ref),
110                    )
111                    .expect("Pages should not be empty!");
112
113                // Give up if all tiles have the same efficiency.
114                // All efficiencies are identical if and only if the min is equal to the max.
115                // The estimated inaccuracy is 2^(-50), so we use a threshold somewhat higher than that: 2^(-40).
116                let two_pow_m40 = f64::from_bits(0x3d70_0000_0000_0000);
117                if max_efficiency - min_efficiency < two_pow_m40 {
118                    break;
119                }
120
121                // Remove the tile with minimal efficiency.
122                let mut min_tile = best_page.assigned.swap_remove(min_idx);
123                min_tile.ban_from(best_page_index); // Prevent it from going back into this page, to ensure that the algorithm terminates.
124                queue.push_back(min_tile); // Push it back into the queue.
125            }
126        } else {
127            // Found nowhere to put this tile, create a new page for it.
128            assignments.push(AssignedTiles {
129                assigned: vec![tile_ref],
130            });
131        }
132    }
133
134    // Deal with pages still overloaded, by emptying them and re-distributing their tiles.
135    for page in &mut assignments {
136        if page.volume(tiles) > page_size {
137            for tile_ref in page.assigned.drain(..) {
138                // We are about to re-insert them via first-fit, so sorting the queue by size improves its efficiency.
139                queue.insert(
140                    queue.partition_point(|other_ref| {
141                        tiles[other_ref.tile_idx].as_ref().len()
142                            >= tiles[tile_ref.tile_idx].as_ref().len()
143                    }),
144                    tile_ref,
145                );
146            }
147        }
148    }
149    // Place back any tiles now in the queue via first-fit.
150    for tile_ref in queue {
151        let tile = &tiles[tile_ref.tile_idx];
152        if let Some(page) = assignments
153            .iter_mut()
154            .find(|page| page.can_fit(tile, page_size, tiles))
155        {
156            page.assigned.push(tile_ref);
157        } else {
158            // No page can accomodate this tile? Create a new page.
159            assignments.push(AssignedTiles {
160                assigned: vec![tile_ref],
161            });
162        }
163    }
164
165    debug_assert_eq!(
166        assignments
167            .iter()
168            .position(|assignment| assignment.unique_symbols(tiles).len() > page_size),
169        None
170    );
171
172    // "Decant" the result.
173    decant(&mut assignments, tiles, page_size);
174    // Note that the result does not contain any empty pages.
175    debug_assert_eq!(
176        assignments
177            .iter()
178            .position(|assignment| assignment.assigned.is_empty()),
179        None
180    );
181
182    debug_assert_eq!(
183        assignments
184            .iter()
185            .position(|assignment| assignment.unique_symbols(tiles).len() > page_size),
186        None
187    );
188
189    assignments
190}
191
/// Compacts `assignments` by moving tiles into lower-indexed pages wherever they still fit,
/// at successively finer granularity: whole pages, then "components" (groups of tiles linked
/// by shared symbols), then individual tiles. Pages emptied along the way are removed.
fn decant<
    Symbol: Eq + Hash,
    Tile: AsRef<[Symbol]> + ?Sized,
    C: Index<usize, Output = Tile> + ?Sized,
>(
    assignments: &mut Vec<AssignedTiles>,
    tiles: &C,
    page_size: usize,
) {
    // "Decanting" is the process of moving all *things* that can fit in a lower index there.
    // `try_decanting(to, from)` attempts to move some of `from`'s contents into `to`.
    fn decant_on<F: FnMut(&mut AssignedTiles, &mut AssignedTiles)>(
        mut try_decanting: F,
        assignments: &mut Vec<AssignedTiles>,
    ) {
        // No need to attempt decanting on page #0, as there are no pages to decant to.
        for from_idx in (1..assignments.len()).rev() {
            // `split_at_mut` lets us mutably borrow the source page and all candidate
            // target pages (those before it) simultaneously.
            let (write_half, read_half) = assignments.split_at_mut(from_idx);
            let from = &mut read_half[0];
            // Scan all pages before this one.
            for to in write_half {
                try_decanting(to, from);

                // If the source page is now empty, remove it.
                // Doing this now reduces the number of iterations performed by later steps.
                // NB: order is intentionally preserved so as not to alter the "decantation"'s properties.
                // NB: this does mean that the first step might get empty pages in its input!
                // NB: this is safe to do because we go towards the beginning of the vector, thereby not
                //     invalidating our iteration (thus, iterators should not be used to drive the outer loop).
                if from.assigned.is_empty() {
                    assignments.remove(from_idx);
                    break;
                }
            }
        }
    }

    // Decant on pages.
    decant_on(
        |to, from| {
            // If the entire pages can be merged, move all of `from`'s tiles.
            if to.combined_volume(
                from.assigned
                    .iter()
                    .flat_map(|tile_ref| tiles[tile_ref.tile_idx].as_ref()),
                tiles,
            ) <= page_size
            {
                to.assigned.append(&mut from.assigned);
            }
        },
        assignments,
    );

    // Sanity check: merging pages must not overload any page.
    debug_assert_eq!(
        assignments
            .iter()
            .position(|assignment| assignment.unique_symbols(tiles).len() > page_size),
        None
    );

    // Decant on "components" (= tiles sharing symbols).
    decant_on(
        |to, from| {
            // We need to iterate on all the "components", which are groups of tiles sharing at least
            // one symbol with another tile in the group.
            // (But they need not all share one common symbol!)
            // We do this by adding the first available tile, and then looking for tiles with common
            // symbols. (As an optimisation, we know we can skip tiles already scanned.)
            // Note that, after adding a tile, we need to re-iterate on all of them!
            // Some tiles that have been iterated past may be "connected" to the one that has been
            // newly added (but none other in the set).
            // `processed` is a bitset with one bit per tile ref in `from`.
            // Note that we have to round the number of bytes up, not down.
            let mut processed = vec![0u8; (from.assigned.len() + 7) / 8];
            let mut symbols = HashSet::new();
            let mut members = Vec::new();
            // Find the first byte that reports an un-processed tile.
            while let Some(byte_idx) = processed.iter().position(|&byte| byte != 0xFF) {
                // `trailing_ones` finds the first clear (= un-processed) bit in that byte.
                let bit_idx = processed[byte_idx].trailing_ones() as usize;
                let idx = byte_idx * 8 + bit_idx;
                // Due to the rounding-up, it's possible that the resulting index is actually higher
                // than the number of tile refs.
                let Some(slice) = from.assigned.get(idx..) else {
                    break;
                };
                let Some(first) = slice.first() else {
                    // It is possible for the slice to be empty.
                    break;
                };

                // Seed the "component" with the tile that was just found.
                symbols.extend(tiles[first.tile_idx].as_ref());
                members.clear();
                members.push(idx);
                processed[byte_idx] |= 1 << bit_idx;
                let mut rescan = true;
                while rescan {
                    rescan = false;
                    for (i, tile_ref) in std::iter::zip(idx.., slice) {
                        let processed_byte = &mut processed[i / 8];
                        let processed_mask = 1 << (i % 8);
                        if *processed_byte & processed_mask != 0 {
                            continue;
                        }
                        let tile = &tiles[tile_ref.tile_idx];

                        // If this is the first tile, or if at least one of its symbols matches, add it.
                        let tile_symbols: HashSet<_> = tile.as_ref().iter().collect();
                        if !symbols.is_disjoint(&tile_symbols) {
                            symbols.extend(tile_symbols);
                            // We need to keep the array sorted for the later removal to work.
                            members.insert(members.partition_point(|&j| j <= i), i);
                            *processed_byte |= processed_mask;
                            rescan = true;
                        }
                    }
                }

                // So, we have the list of refs in `members`, and the set of symbols they contain in
                // `symbols`.
                // The former has to be sorted, because removing an element alters the indices of at
                // least one of the subsequent ones, which could mess up subsequent removals.
                debug_assert_eq!(
                    members.windows(2).position(|pair| pair[0] > pair[1]),
                    None,
                    "Members array {members:?} is not sorted",
                );
                // NB: `drain` empties `symbols` whether or not the component fits, readying it
                // for the next component's seeding.
                if to.combined_volume(symbols.drain(), tiles) <= page_size {
                    // Extract the "component" into the target page.
                    // Iterating in descending index order keeps the remaining indices valid
                    // across `swap_remove` calls.
                    // NOTE(review): `swap_remove` reorders `from.assigned` while `processed`
                    // still tracks the old positions; the displaced tile inherits a "processed"
                    // bit and may be skipped for the rest of this pass. That looks benign for
                    // correctness (it merely stays in `from`) — confirm it is intentional.
                    for idx in members.iter().rev() {
                        let symbol = from.assigned.swap_remove(*idx);
                        to.assigned.push(symbol);
                    }
                }
            }
        },
        assignments,
    );

    // Sanity check: moving components must not overload any page.
    debug_assert_eq!(
        assignments
            .iter()
            .position(|assignment| assignment.unique_symbols(tiles).len() > page_size),
        None
    );

    // And finally, decant on individual tiles.
    decant_on(
        |to, from| {
            // Descending order keeps indices valid across `swap_remove` calls.
            for i in (0..from.assigned.len()).rev() {
                if to.combined_volume(tiles[from.assigned[i].tile_idx].as_ref(), tiles) <= page_size
                {
                    let symbol = from.assigned.swap_remove(i);
                    to.assigned.push(symbol);
                }
            }
        },
        assignments,
    );
}
351
/// A reference to one of the user-supplied tiles, plus bookkeeping for the packing algorithm.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct TileRef {
    // Index of the referenced tile within the user-provided collection.
    tile_idx: usize,
    // Bitset of page indices this tile may no longer be placed on
    // (bit `page_idx % 8` of byte `page_idx / 8`); grown on demand by `ban_from`.
    banned_from: Vec<u8>,
}
357
/// A collection of references to tiles.
///
/// Each instance represents one page of the packing produced by [`overload_and_remove`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct AssignedTiles {
    // The tiles assigned to this page, in no particular order.
    assigned: Vec<TileRef>,
}
363
364impl TileRef {
365    fn new(tile_idx: usize) -> Self {
366        Self {
367            tile_idx,
368            banned_from: vec![],
369        }
370    }
371    fn is_banned_from(&self, page_idx: usize) -> bool {
372        let byte_idx = page_idx / 8;
373        let bit_idx = page_idx % 8;
374        self.banned_from
375            .get(byte_idx)
376            .map_or(false, |byte| byte & (1 << bit_idx) != 0)
377    }
378
379    fn ban_from(&mut self, page_idx: usize) {
380        let byte_idx = page_idx / 8;
381        let bit_idx = page_idx % 8;
382        if self.banned_from.len() <= byte_idx {
383            self.banned_from.resize(byte_idx + 1, 0);
384        }
385        self.banned_from[byte_idx] |= 1 << bit_idx;
386    }
387}
388
389impl AssignedTiles {
390    /// Returns an iterator through the IDs of the tiles referenced by this page.
391    pub fn referenced_tile_ids(&self) -> impl Iterator<Item = usize> + '_ {
392        self.assigned.iter().map(|tile_ref| tile_ref.tile_idx)
393    }
394
395    /// Returns an iterator through the tiles referenced by this page.
396    pub fn referenced_tiles<
397        'this,
398        'tiles: 'this,
399        Tile: 'tiles + ?Sized,
400        C: Index<usize, Output = Tile> + ?Sized,
401    >(
402        &'this self,
403        tiles: &'tiles C,
404    ) -> impl Iterator<Item = &'tiles Tile> + 'this {
405        self.assigned
406            .iter()
407            .map(|tile_ref| &tiles[tile_ref.tile_idx])
408    }
409
410    /// Gather a set of all the symbols contained in this page's tiles.
411    ///
412    /// Due to [the nature of a set][HashSet], each symbol is only referenced a single time.
413    pub fn unique_symbols<
414        'tiles,
415        Symbol: Eq + Hash,
416        Tile: AsRef<[Symbol]> + ?Sized + 'tiles,
417        C: Index<usize, Output = Tile> + ?Sized,
418    >(
419        &self,
420        tiles: &'tiles C,
421    ) -> HashSet<&'tiles Symbol> {
422        self.referenced_tiles(tiles)
423            .flat_map(|tile| tile.as_ref().iter())
424            .collect()
425    }
426
427    fn volume<
428        Symbol: Eq + Hash,
429        Tile: AsRef<[Symbol]> + ?Sized,
430        C: Index<usize, Output = Tile> + ?Sized,
431    >(
432        &self,
433        tiles: &C,
434    ) -> usize {
435        self.unique_symbols(tiles).len()
436    }
437
438    fn can_fit<
439        Symbol: Eq + Hash,
440        Tile: AsRef<[Symbol]> + ?Sized,
441        C: Index<usize, Output = Tile> + ?Sized,
442    >(
443        &self,
444        tile: &Tile,
445        page_size: usize,
446        tiles: &C,
447    ) -> bool {
448        let mut unique_symbols = self.unique_symbols(tiles);
449        unique_symbols.extend(tile.as_ref().iter());
450        unique_symbols.len() <= page_size
451    }
452
453    fn relative_size_of<
454        Symbol: Eq + Hash,
455        Tile: AsRef<[Symbol]> + ?Sized,
456        C: Index<usize, Output = Tile> + ?Sized,
457    >(
458        &self,
459        tile: &Tile,
460        tiles: &C,
461    ) -> f64 {
462        tile.as_ref().iter().fold(0., |acc, symbol| {
463            let n = self
464                .referenced_tiles(tiles)
465                .filter(|referenced_tile| referenced_tile.as_ref().contains(symbol))
466                .count();
467            acc + 1. / (1 + n) as f64
468        })
469    }
470
471    fn combined_volume<
472        'tiles,
473        Symbol: Eq + Hash + 'tiles,
474        Tile: AsRef<[Symbol]> + ?Sized + 'tiles,
475        C: Index<usize, Output = Tile> + ?Sized,
476        It: IntoIterator<Item = &'tiles Symbol>,
477    >(
478        &self,
479        iter: It,
480        tiles: &'tiles C,
481    ) -> usize {
482        let mut symbols = self.unique_symbols(tiles);
483        symbols.extend(iter);
484        symbols.len()
485    }
486}
487
/// Finds both the minimum and maximum of `it`, as keyed by `key_func`, in a single pass.
///
/// Returns `None` for an empty iterator, and `[(min_key, min_item), (max_key, max_item)]`
/// otherwise. `key_func` is invoked exactly once per item, and ties are resolved in favour
/// of the earliest item (strict comparisons never displace an equal-keyed incumbent).
fn minmax_by_key<K: PartialOrd + Copy, T: Copy, It: IntoIterator<Item = T>, F: FnMut(T) -> K>(
    it: It,
    mut key_func: F,
) -> Option<[(K, T); 2]> {
    let mut keyed = it.into_iter().map(|item| (key_func(item), item));
    // An empty iterator has neither a minimum nor a maximum.
    let seed = keyed.next()?;

    Some(keyed.fold([seed, seed], |[min, max], candidate| {
        if candidate.0 < min.0 {
            [candidate, max]
        } else if candidate.0 > max.0 {
            [min, candidate]
        } else {
            [min, max]
        }
    }))
}