swh_graph/utils/
shuffle.rs

1// Copyright (C) 2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::ops::{Add, Range};
7
8use rand::prelude::*;
9use rayon::prelude::*;
10
11/// Returns a parallel iterator on the range where items are pseudo-shuffled
12///
13/// When running an operation on all nodes of the graph, the naive approach is
14/// to use `(0..graph.num_nodes()).into_par_iter().map(f)`.
15///
16/// However, nodes are not uniformly distributed over the range of ids, meaning that
17/// some sections of the range of ids can be harder for `f` than others.
18/// This can be an issue when some instances are harder than others, or simply because
19/// it makes ETAs unreliable.
20///
21/// Instead, this function returns the same set of nodes as the naive approach would,
22/// but in a somewhat-random order.
23///
24/// # Example
25///
26/// ```
27/// # #[cfg(not(miri))] // very slow to run on Miri
28/// # {
29/// use rayon::prelude::*;
30/// use swh_graph::utils::shuffle::par_iter_shuffled_range;
31///
32/// let mut integers: Vec<usize> = par_iter_shuffled_range(0..10000).collect();
33/// assert_ne!(integers, (0..10000).collect::<Vec<_>>(), "integers are still sorted");
34/// integers.sort();
35/// assert_eq!(integers, (0..10000).collect::<Vec<_>>(), "integers do not match the input range");
36/// # }
37/// ```
38pub fn par_iter_shuffled_range<Item>(
39    range: Range<Item>,
40) -> impl ParallelIterator<Item = <Range<Item> as IntoParallelIterator>::Item>
41where
42    Range<Item>: ExactSizeIterator,
43    usize: Add<Item, Output = Item>,
44    Item: Ord + Sync + Send + Copy,
45    std::ops::Range<Item>: rayon::iter::IntoParallelIterator,
46{
47    let num_chunks = 100_000; // Arbitrary value
48    let chunk_size = range.len().div_ceil(num_chunks);
49    let mut chunks: Vec<usize> = (0..num_chunks).collect();
50
51    chunks.shuffle(&mut rand::thread_rng());
52
53    chunks
54        .into_par_iter()
55        // Lazily rebuild the list of nodes from the shuffled chunks
56        .flat_map(move |chunk_id| {
57            ((chunk_id * chunk_size) + range.start)
58                ..Item::min(
59                    (chunk_id.checked_add(1).unwrap()) * chunk_size + range.start,
60                    range.end,
61                )
62        })
63}