swh_graph/utils/shuffle.rs
1// Copyright (C) 2024 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::ops::{Add, Range};
7
8use rand::prelude::*;
9use rayon::prelude::*;
10
11/// Returns a parallel iterator on the range where items are pseudo-shuffled
12///
13/// When running an operation on all nodes of the graph, the naive approach is
14/// to use `(0..graph.num_nodes()).into_par_iter().map(f)`.
15///
16/// However, nodes are not uniformly distributed over the range of ids, meaning that
17/// some sections of the range of ids can be harder for `f` than others.
18/// This can be an issue when some instances are harder than others, or simply because
19/// it makes ETAs unreliable.
20///
21/// Instead, this function returns the same set of nodes as the naive approach would,
22/// but in a somewhat-random order.
23///
24/// # Example
25///
26/// ```
27/// # #[cfg(not(miri))] // very slow to run on Miri
28/// # {
29/// use rayon::prelude::*;
30/// use swh_graph::utils::shuffle::par_iter_shuffled_range;
31///
32/// let mut integers: Vec<usize> = par_iter_shuffled_range(0..10000).collect();
33/// assert_ne!(integers, (0..10000).collect::<Vec<_>>(), "integers are still sorted");
34/// integers.sort();
35/// assert_eq!(integers, (0..10000).collect::<Vec<_>>(), "integers do not match the input range");
36/// # }
37/// ```
38pub fn par_iter_shuffled_range<Item>(
39 range: Range<Item>,
40) -> impl ParallelIterator<Item = <Range<Item> as IntoParallelIterator>::Item>
41where
42 Range<Item>: ExactSizeIterator,
43 usize: Add<Item, Output = Item>,
44 Item: Ord + Sync + Send + Copy,
45 std::ops::Range<Item>: rayon::iter::IntoParallelIterator,
46{
47 let num_chunks = 100_000; // Arbitrary value
48 let chunk_size = range.len().div_ceil(num_chunks);
49 let mut chunks: Vec<usize> = (0..num_chunks).collect();
50
51 chunks.shuffle(&mut rand::thread_rng());
52
53 chunks
54 .into_par_iter()
55 // Lazily rebuild the list of nodes from the shuffled chunks
56 .flat_map(move |chunk_id| {
57 ((chunk_id * chunk_size) + range.start)
58 ..Item::min(
59 (chunk_id.checked_add(1).unwrap()) * chunk_size + range.start,
60 range.end,
61 )
62 })
63}