Skip to main content

swh_graph/map/
permutation.rs

1// Copyright (C) 2023-2026  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::io::{Read, Seek, Write};
7use std::mem::MaybeUninit;
8use std::path::Path;
9use std::sync::atomic::{AtomicBool, Ordering};
10
11use anyhow::{bail, Context, Result};
12use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
13use dsi_progress_logger::{progress_logger, ProgressLog};
14use mmap_rs::{Mmap, MmapFlags};
15use rayon::prelude::*;
16
17/// An array of `n` unique integers in the `0..n` range.
18#[allow(clippy::len_without_is_empty)]
19pub trait Permutation {
20    /// Returns the number of items
21    fn len(&self) -> usize;
22    /// Returns an item
23    fn get(&self, old_node: usize) -> Option<usize>;
24    /// Returns an item without checking it is within the bounds
25    ///
26    /// # Safety
27    ///
28    /// Undefined behavior if `old_node >= len()`
29    unsafe fn get_unchecked(&self, old_node: usize) -> usize;
30}
31
32/// A [`Permutation`] backed by an `usize` vector
33pub struct OwnedPermutation<T: Sync + AsRef<[usize]>>(T);
34
35impl<T: Sync + AsRef<[usize]>> OwnedPermutation<T> {
36    /// Creates a permutation
37    #[inline(always)]
38    pub fn new_unchecked(perm: T) -> Self {
39        OwnedPermutation(perm)
40    }
41
42    /// Creates a permutation, or returns an error in case the permutation is
43    /// incorrect
44    #[inline]
45    pub fn new(perm: T) -> Result<Self> {
46        // Check the permutation's image has the same size as the preimage
47        if let Some((old, new)) = perm
48            .as_ref()
49            .par_iter()
50            .enumerate()
51            .find_any(|&(_old, &new)| new >= perm.as_ref().len())
52        {
53            bail!(
54                "Found node {} with id {} in permutation, graph size is {}",
55                old,
56                new,
57                perm.as_ref().len()
58            );
59        }
60
61        // Check the permutation is injective
62        let mut seen = Vec::with_capacity(perm.as_ref().len() as _);
63        seen.extend((0..perm.as_ref().len()).map(|_| AtomicBool::new(false)));
64        if let Some((old, _)) = perm
65            .as_ref()
66            .par_iter()
67            .map(|&node| seen[node].fetch_or(true, Ordering::SeqCst))
68            .enumerate()
69            .find_any(|&(_node, is_duplicated)| is_duplicated)
70        {
71            let new = perm.as_ref()[old];
72            bail!(
73                "At least two nodes are mapped to {}; one of them is {}",
74                new,
75                old,
76            );
77        }
78
79        // Therefore, the permutation is bijective.
80
81        Ok(OwnedPermutation(perm))
82    }
83
84    pub fn dump<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
85        let mut pl = progress_logger!(
86            display_memory = true,
87            item_name = "byte",
88            local_speed = true,
89            expected_updates = Some(self.len() * 8),
90        );
91        pl.start("Writing permutation");
92
93        let chunk_size = 1_000_000; // 1M of u64 -> 8MB
94        let mut buf = vec![0u8; chunk_size * 8];
95        for chunk in self.0.as_ref().chunks(chunk_size) {
96            let buf_slice = &mut buf[..chunk.len() * 8]; // no-op except for the last chunk
97            assert_eq!(
98                usize::BITS,
99                u64::BITS,
100                "Only 64-bits architectures are supported"
101            );
102            let chunk: &[u64] =
103                bytemuck::try_cast_slice(chunk).expect("Could not cast &[usize] to &[u64]");
104            BigEndian::write_u64_into(chunk, buf_slice);
105            file.write_all(buf_slice)?;
106            pl.update_with_count(chunk.len() * 8);
107        }
108        pl.done();
109        Ok(())
110    }
111}
112
113impl<T: Sync + AsRef<[usize]> + AsMut<[usize]>> OwnedPermutation<T> {
114    /// Applies the other permutation to `self`, so `self` becomes a new permutation.
115    pub fn compose_in_place<T2: Permutation + Sync>(&mut self, other: T2) -> Result<()> {
116        if self.as_mut().len() != other.len() {
117            bail!(
118                "Cannot compose permutation of size {} with permutation of size {}",
119                self.as_ref().len(),
120                other.len()
121            );
122        }
123        self.as_mut()
124            .par_iter_mut()
125            .for_each(|x| *x = unsafe { other.get_unchecked(*x) });
126
127        Ok(())
128    }
129}
130
131impl OwnedPermutation<Vec<usize>> {
132    /// Loads a permutation from disk and returns IO errors if any.
133    #[inline]
134    pub fn load_unchecked(num_nodes: usize, path: &Path) -> Result<Self> {
135        assert_eq!(
136            usize::BITS,
137            u64::BITS,
138            "Only 64-bits architectures are supported"
139        );
140
141        let mut file = std::fs::File::open(path).context("Could not open permutation")?;
142
143        let mut buf = [0u8; 8];
144        file.read_exact(&mut buf)?;
145        let epserde = &buf == b"epserde ";
146        if epserde {
147            use epserde::prelude::*;
148
149            // SAFETY: it's a plain vector and deserializing its values (usize) from any bytes is
150            // always valid
151            let perm = unsafe { <Vec<usize>>::load_full(path) }?;
152            Ok(OwnedPermutation(perm))
153        } else {
154            let mut perm = Vec::with_capacity(num_nodes);
155            file.rewind()?;
156            file.read_u64_into::<BigEndian>(unsafe {
157                std::mem::transmute::<&mut [MaybeUninit<usize>], &mut [u64]>(
158                    perm.spare_capacity_mut(),
159                )
160            })
161            .context("Could not read permutation")?;
162
163            // read_u64_into() called read_exact(), which checked the length
164            unsafe { perm.set_len(num_nodes) };
165
166            Ok(OwnedPermutation(perm))
167        }
168    }
169
170    /// Loads a permutation from disk, and returns errors in case of IO errors
171    /// or incorrect permutations.
172    #[inline]
173    pub fn load(num_nodes: usize, path: &Path) -> Result<Self> {
174        let perm = Self::load_unchecked(num_nodes, path)?;
175        Self::new(perm.0)
176    }
177}
178
179impl<T: Sync + AsRef<[usize]>> AsRef<[usize]> for OwnedPermutation<T> {
180    #[inline(always)]
181    fn as_ref(&self) -> &[usize] {
182        self.0.as_ref()
183    }
184}
185
186impl<T: Sync + AsRef<[usize]> + AsMut<[usize]>> AsMut<[usize]> for OwnedPermutation<T> {
187    #[inline(always)]
188    fn as_mut(&mut self) -> &mut [usize] {
189        self.0.as_mut()
190    }
191}
192
193impl<T: Sync + AsRef<[usize]>> std::ops::Index<usize> for OwnedPermutation<T> {
194    type Output = usize;
195
196    #[inline(always)]
197    fn index(&self, old_node: usize) -> &Self::Output {
198        &self.0.as_ref()[old_node]
199    }
200}
201
202impl<T: Sync + AsRef<[usize]>> Permutation for OwnedPermutation<T> {
203    #[inline(always)]
204    fn len(&self) -> usize {
205        self.as_ref().len()
206    }
207
208    #[inline(always)]
209    fn get(&self, old_node: usize) -> Option<usize> {
210        self.0.as_ref().get(old_node).copied()
211    }
212
213    #[inline(always)]
214    unsafe fn get_unchecked(&self, old_node: usize) -> usize {
215        *self.0.as_ref().get_unchecked(old_node)
216    }
217}
218
219/// A [`Permutation`] backed by a big-endian mmapped file
220pub struct MappedPermutation(Mmap);
221
222impl MappedPermutation {
223    /// Creates a permutation
224    #[inline(always)]
225    pub fn new_unchecked(perm: Mmap) -> Self {
226        MappedPermutation(perm)
227    }
228
229    /// Creates a permutation, or returns an error in case the permutation is
230    /// incorrect
231    #[inline]
232    pub fn new(perm: Mmap) -> Result<Self> {
233        assert_eq!(
234            perm.size() % 8,
235            0,
236            "mmap has size {}, which is not a multiple of 8",
237            perm.size()
238        );
239        let num_nodes = perm.len() / 8;
240
241        // Check the permutation's image has the same size as the preimage
242        if let Some((old, new)) = perm
243            .par_iter()
244            .chunks(8)
245            .map(|bytes| {
246                BigEndian::read_u64(bytes.into_iter().cloned().collect::<Vec<_>>().as_slice())
247                    as usize
248            })
249            .enumerate()
250            .find_any(|&(_old, new)| new >= num_nodes)
251        {
252            bail!(
253                "Found node {} has id {} in permutation, graph size is {}",
254                old,
255                new,
256                perm.len()
257            );
258        }
259
260        // Check the permutation is injective
261        let mut seen = Vec::with_capacity(perm.len() as _);
262        seen.extend((0..num_nodes).map(|_| AtomicBool::new(false)));
263        if let Some((old, _)) = perm
264            .par_iter()
265            .chunks(8)
266            .map(|bytes| {
267                BigEndian::read_u64(bytes.into_iter().cloned().collect::<Vec<_>>().as_slice())
268                    as usize
269            })
270            .map(|node| seen[node].fetch_or(true, Ordering::SeqCst))
271            .enumerate()
272            .find_any(|&(_node, is_duplicated)| is_duplicated)
273        {
274            let new = perm[old];
275            bail!(
276                "At least two nodes are mapped to {}; one of them is {}",
277                new,
278                old,
279            );
280        }
281
282        // Therefore, the permutation is bijective.
283
284        Ok(MappedPermutation(perm))
285    }
286
287    /// Loads a permutation from disk and returns IO errors if any.
288    #[inline]
289    pub fn load_unchecked(path: &Path) -> Result<Self> {
290        assert_eq!(
291            usize::BITS,
292            u64::BITS,
293            "Only 64-bits architectures are supported"
294        );
295
296        let file_len = path.metadata()?.len();
297        assert_eq!(
298            file_len % 8,
299            0,
300            "{} has size {}, which is not a multiple of 8",
301            path.display(),
302            file_len
303        );
304
305        let file = std::fs::File::open(path).context("Could not open permutation")?;
306        let perm = unsafe {
307            mmap_rs::MmapOptions::new(file_len as _)
308                .context("Could not initialize permutation mmap")?
309                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
310                .with_file(&file, 0)
311        }
312        .map()
313        .context("Could not mmap permutation")?;
314
315        Ok(MappedPermutation(perm))
316    }
317
318    /// Loads a permutation from disk, and returns errors in case of IO errors
319    /// or incorrect permutations.
320    #[inline]
321    pub fn load(num_nodes: usize, path: &Path) -> Result<Self> {
322        let perm = Self::load_unchecked(path)?;
323        assert_eq!(
324            perm.len(),
325            num_nodes,
326            "Expected permutation to have length {}, got {}",
327            num_nodes,
328            perm.len()
329        );
330        Self::new(perm.0)
331    }
332}
333
334impl Permutation for MappedPermutation {
335    #[inline(always)]
336    fn len(&self) -> usize {
337        self.0.size() / 8
338    }
339
340    #[inline(always)]
341    fn get(&self, old_node: usize) -> Option<usize> {
342        let range = (old_node * 8)..((old_node + 1) * 8);
343        Some(BigEndian::read_u64(self.0.get(range)?) as usize)
344    }
345
346    #[inline(always)]
347    unsafe fn get_unchecked(&self, old_node: usize) -> usize {
348        let range = (old_node * 8)..((old_node + 1) * 8);
349        BigEndian::read_u64(self.0.get_unchecked(range)) as usize
350    }
351}