Skip to main content

webgraph/utils/
mod.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Inria
3 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7
8//! Miscellaneous utilities.
9
10use rand::RngExt;
11use std::path::PathBuf;
12
13/// Creates a new random dir inside the given folder
14pub fn temp_dir<P: AsRef<std::path::Path>>(base: P) -> anyhow::Result<PathBuf> {
15    let mut base = base.as_ref().to_owned();
16    const ALPHABET: &[u8] = b"0123456789abcdef";
17    let mut rnd = rand::rng();
18    let mut random_str = String::new();
19    loop {
20        random_str.clear();
21        for _ in 0..16 {
22            let idx = rnd.random_range(0..ALPHABET.len());
23            random_str.push(ALPHABET[idx] as char);
24        }
25        base.push(&random_str);
26
27        if !base.exists() {
28            std::fs::create_dir(&base)?;
29            return Ok(base);
30        }
31        base.pop();
32    }
33}
34
35mod batch_codec;
36pub use batch_codec::*;
37
38mod circular_buffer;
39pub(crate) use circular_buffer::*;
40
41mod ragged_array;
42pub use ragged_array::RaggedArray;
43
44mod mmap_helper;
45pub use mmap_helper::*;
46
47mod java_perm;
48pub use java_perm::*;
49
50mod granularity;
51pub use granularity::*;
52
53pub mod matrix;
54pub use matrix::Matrix;
55
56pub mod sort_pairs;
57pub use sort_pairs::SortPairs;
58
59pub mod par_sort_pairs;
60pub use par_sort_pairs::ParSortPairs;
61
62pub mod par_sort_iters;
63pub use par_sort_iters::ParSortIters;
64
65use crate::graphs::{
66    arc_list_graph::NodeLabels,
67    bvgraph::{Decode, Encode},
68};
69
70/// A decoder that encodes the read values using the given encoder.
71/// This is commonly used to change the codes of a graph without decoding and
72/// re-encoding it but by changing the codes.
73pub struct Converter<D: Decode, E: Encode> {
74    pub decoder: D,
75    pub encoder: E,
76    pub offset: usize,
77}
78
79impl<D: Decode, E: Encode> Decode for Converter<D, E> {
80    // TODO: implement correctly start_node/end_node
81    #[inline(always)]
82    fn read_outdegree(&mut self) -> u64 {
83        let res = self.decoder.read_outdegree();
84        self.offset += self.encoder.write_outdegree(res).unwrap();
85        res
86    }
87    #[inline(always)]
88    fn read_reference_offset(&mut self) -> u64 {
89        let res = self.decoder.read_reference_offset();
90        self.offset += self.encoder.write_reference_offset(res).unwrap();
91        res
92    }
93    #[inline(always)]
94    fn read_block_count(&mut self) -> u64 {
95        let res = self.decoder.read_block_count();
96        self.offset += self.encoder.write_block_count(res).unwrap();
97        res
98    }
99    #[inline(always)]
100    fn read_block(&mut self) -> u64 {
101        let res = self.decoder.read_block();
102        self.offset += self.encoder.write_block(res).unwrap();
103        res
104    }
105    #[inline(always)]
106    fn read_interval_count(&mut self) -> u64 {
107        let res = self.decoder.read_interval_count();
108        self.offset += self.encoder.write_interval_count(res).unwrap();
109        res
110    }
111    #[inline(always)]
112    fn read_interval_start(&mut self) -> u64 {
113        let res = self.decoder.read_interval_start();
114        self.offset += self.encoder.write_interval_start(res).unwrap();
115        res
116    }
117    #[inline(always)]
118    fn read_interval_len(&mut self) -> u64 {
119        let res = self.decoder.read_interval_len();
120        self.offset += self.encoder.write_interval_len(res).unwrap();
121        res
122    }
123    #[inline(always)]
124    fn read_first_residual(&mut self) -> u64 {
125        let res = self.decoder.read_first_residual();
126        self.offset += self.encoder.write_first_residual(res).unwrap();
127        res
128    }
129    #[inline(always)]
130    fn read_residual(&mut self) -> u64 {
131        let res = self.decoder.read_residual();
132        self.offset += self.encoder.write_residual(res).unwrap();
133        res
134    }
135    #[inline(always)]
136    fn num_of_residuals(&mut self, num_of_residuals: usize) {
137        self.encoder.num_of_residuals(num_of_residuals);
138    }
139}
140
141/// An enum expressing the memory requirements for batched algorithms
142/// such as [`SortPairs`], [`ParSortPairs`], and [`ParSortIters`].
143///
144/// This type implements [`Mul`](core::ops::Mul) and [`Div`](core::ops::Div) to
145/// scale the memory usage requirements by a given factor, independently of the
146/// variant.
147#[derive(Clone, Copy, Debug)]
148pub enum MemoryUsage {
149    /// The target overall memory usage in bytes.
150    ///
151    /// This is the more user-friendly option. The actual number of elements
152    /// can be computed using [`batch_size`](MemoryUsage::batch_size).
153    MemorySize(usize),
154    /// The number of elements used in all batches.
155    ///
156    /// This is a more low-level option that gives more control to the user, but
157    /// the actual memory usage will depend on the size of labels (if any).
158    BatchSize(usize),
159}
160
161/// Default implementation, returning half of the physical RAM.
162impl Default for MemoryUsage {
163    fn default() -> Self {
164        Self::from_perc(50.0)
165    }
166}
167
168impl core::fmt::Display for MemoryUsage {
169    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170        match self {
171            MemoryUsage::MemorySize(size) => write!(f, "{} bytes", size),
172            MemoryUsage::BatchSize(size) => write!(f, "{} elements", size),
173        }
174    }
175}
176
177impl core::ops::Mul<usize> for MemoryUsage {
178    type Output = MemoryUsage;
179
180    fn mul(self, rhs: usize) -> Self::Output {
181        match self {
182            MemoryUsage::MemorySize(size) => MemoryUsage::MemorySize(size * rhs),
183            MemoryUsage::BatchSize(size) => MemoryUsage::BatchSize(size * rhs),
184        }
185    }
186}
187
188impl core::ops::Div<usize> for MemoryUsage {
189    type Output = MemoryUsage;
190
191    fn div(self, rhs: usize) -> Self::Output {
192        match self {
193            MemoryUsage::MemorySize(size) => MemoryUsage::MemorySize(size / rhs),
194            MemoryUsage::BatchSize(size) => MemoryUsage::BatchSize(size / rhs),
195        }
196    }
197}
198
199impl MemoryUsage {
200    /// Creates a new memory usage expressed as a percentage of the
201    /// physical RAM.
202    pub fn from_perc(perc: f64) -> Self {
203        let system = sysinfo::System::new_with_specifics(
204            sysinfo::RefreshKind::nothing()
205                .with_memory(sysinfo::MemoryRefreshKind::nothing().with_ram()),
206        );
207        MemoryUsage::MemorySize(
208            usize::try_from((system.total_memory() as f64 * perc / 100.0) as u64)
209                .expect("System memory overflows usize"),
210        )
211    }
212
213    /// Returns the batch size for elements of type `T`.
214    ///
215    /// If the [memory usage is expressed as a number of
216    /// bytes](MemoryUsage::MemorySize), this method divides the number of bytes
217    /// by the size of `T` to obtain the number of elements. Otherwise, [it just
218    /// returns specified batch size](MemoryUsage::BatchSize).
219    pub fn batch_size<T>(&self) -> usize {
220        match &self {
221            MemoryUsage::MemorySize(memory_size) => {
222                let elem_size = std::mem::size_of::<T>();
223                assert!(elem_size > 0, "Element size cannot be zero");
224                memory_size / elem_size
225            }
226            MemoryUsage::BatchSize(batch_size) => *batch_size,
227        }
228    }
229}
230
231/// Writes a human-readable representation of a large number using SI prefixes units.
232pub fn humanize(value: f64) -> String {
233    const UNITS: &[&str] = &["", "K", "M", "G", "T", "P", "E"];
234    let mut v = value;
235    let mut unit: usize = 0;
236    while v >= 1000.0 && unit + 1 < UNITS.len() {
237        v /= 1000.0;
238        unit += 1;
239    }
240    if unit == 0 {
241        format!("{:.0}{}", v, UNITS[unit])
242    } else {
243        format!("{:.3}{}", v, UNITS[unit])
244    }
245}
246
247/// A structure holding partition iterators and their boundaries.
248///
249/// This type holds a list of iterators and a list of boundaries, one more
250/// than the number of iterators. The implied semantics is that each iterator
251/// returns (labelled) pairs of nodes, and that the first element of
252/// each pair sits between the boundaries associated with the iterator.
253///
254/// This structures is returned by [`ParSortPairs`] and [`ParSortIters`] and can
255/// easily be converted into lenders for use with
256/// [`BvCompConfig::par_comp_lenders`](crate::graphs::bvgraph::BvCompConfig::par_comp_lenders)
257/// using a convenient implementation of the [`From`] trait.
258///
259/// Note that it sufficient to write `let lenders: Vec<_> = split_iters.into()`
260/// to perform the conversion. Type inference might not work properly if the
261/// call is embedded in a larger expression, in which case an explicit type
262/// annotation might be necessary.
263pub struct SplitIters<I> {
264    pub boundaries: Box<[usize]>,
265    pub iters: Box<[I]>,
266}
267
268impl<I> SplitIters<I> {
269    pub fn new(boundaries: Box<[usize]>, iters: Box<[I]>) -> Self {
270        Self { boundaries, iters }
271    }
272}
273
274impl<I> From<(Box<[usize]>, Box<[I]>)> for SplitIters<I> {
275    fn from((boundaries, iters): (Box<[usize]>, Box<[I]>)) -> Self {
276        Self::new(boundaries, iters)
277    }
278}
279
280/// Conversion of a [`SplitIters`] of iterators on unlabeled pairs into a
281/// sequence of pairs of starting points and associated lenders.
282///
283/// This is useful for converting the output of sorting utilities like
284/// [`ParSortPairs`] or [`ParSortIters`] into a form suitable for
285/// [`BvCompConfig::par_comp_lenders`](crate::graphs::bvgraph::BvCompConfig::par_comp_lenders)
286/// when working with unlabeled graphs.
287///
288/// The pairs `(src, dst)` are automatically converted to labeled form with unit
289/// labels, and the resulting lenders are wrapped with
290/// [`LeftIterator`](crate::labels::proj::LeftIterator) to project out just the
291/// successor nodes.
292///
293/// Note that it sufficient to write `let lenders: Vec<_> = split_iters.into()`
294/// to perform the conversion. Type inference might not work properly if the
295/// call is embedded in a larger expression, in which case an explicit type
296/// annotation might be necessary.
297impl<
298    I: Iterator<Item = (usize, usize)> + Send + Sync,
299    IT: IntoIterator<Item = (usize, usize), IntoIter = I>,
300> From<SplitIters<IT>>
301    for Vec<
302        crate::labels::proj::LeftIterator<
303            NodeLabels<(), std::iter::Map<I, fn((usize, usize)) -> ((usize, usize), ())>>,
304        >,
305    >
306{
307    fn from(split: SplitIters<IT>) -> Self {
308        Box::into_iter(split.iters)
309            .enumerate()
310            .map(|(i, iter)| {
311                let start_node = split.boundaries[i];
312                let end_node = split.boundaries[i + 1];
313                let num_partition_nodes = end_node - start_node;
314                // Map pairs to labeled pairs with unit labels
315                #[allow(clippy::type_complexity)]
316                let map_fn: fn((usize, usize)) -> ((usize, usize), ()) = |pair| (pair, ());
317                let labeled_iter = iter.into_iter().map(map_fn);
318                let lender =
319                    NodeLabels::try_new_from(num_partition_nodes, labeled_iter, start_node)
320                        .expect("Iterator should start from the expected first node");
321                // Wrap with LeftIterator to project out just the successor
322                crate::labels::proj::LeftIterator(lender)
323            })
324            .collect()
325    }
326}
327
328/// Conversion of a [`SplitIters`] of iterators on labelled pairs into a
329/// sequences of pairs of starting points and associated lenders.
330///
331/// This is useful for converting the output of sorting utilities like
332/// [`ParSortPairs`] or [`ParSortIters`] into a form suitable for
333/// [`BvCompConfig::par_comp_lenders`](crate::graphs::bvgraph::BvCompConfig::par_comp_lenders).
334///
335/// Note that it sufficient to write `let lenders: Vec<_> = split_iters.into()`
336/// to perform the conversion. Type inference might not work properly if the
337/// call is embedded in a larger expression, in which case an explicit type
338/// annotation might be necessary.
339impl<
340    L: Clone + Copy + 'static,
341    I: Iterator<Item = ((usize, usize), L)> + Send + Sync,
342    IT: IntoIterator<Item = ((usize, usize), L), IntoIter = I>,
343> From<SplitIters<IT>> for Vec<NodeLabels<L, I>>
344{
345    fn from(split: SplitIters<IT>) -> Self {
346        Box::into_iter(split.iters)
347            .enumerate()
348            .map(|(i, iter)| {
349                let start_node = split.boundaries[i];
350                let end_node = split.boundaries[i + 1];
351                let num_partition_nodes = end_node - start_node;
352
353                NodeLabels::try_new_from(num_partition_nodes, iter.into_iter(), start_node)
354                    .expect("Iterator should start from the expected first node")
355            })
356            .collect()
357    }
358}