swh_graph/map/
permutation.rs

1// Copyright (C) 2023  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::io::{Read, Seek, Write};
7use std::mem::MaybeUninit;
8use std::path::Path;
9use std::sync::atomic::{AtomicBool, Ordering};
10
11use anyhow::{bail, Context, Result};
12use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
13use dsi_progress_logger::{progress_logger, ProgressLog};
14use mmap_rs::{Mmap, MmapFlags};
15use rayon::prelude::*;
16
17/// An array of `n` unique integers in the `0..n` range.
18#[allow(clippy::len_without_is_empty)]
19pub trait Permutation {
20    /// Returns the number of items
21    fn len(&self) -> usize;
22    /// Returns an item
23    fn get(&self, old_node: usize) -> Option<usize>;
24    /// Returns an item without checking it is within the bounds
25    ///
26    /// # Safety
27    ///
28    /// Undefined behavior if `old_node >= len()`
29    unsafe fn get_unchecked(&self, old_node: usize) -> usize;
30}
31
32/// A [`Permutation`] backed by an `usize` vector
33pub struct OwnedPermutation<T: Sync + AsRef<[usize]>>(T);
34
35impl<T: Sync + AsRef<[usize]>> OwnedPermutation<T> {
36    /// Creates a permutation
37    #[inline(always)]
38    pub fn new_unchecked(perm: T) -> Self {
39        OwnedPermutation(perm)
40    }
41
42    /// Creates a permutation, or returns an error in case the permutation is
43    /// incorrect
44    #[inline]
45    pub fn new(perm: T) -> Result<Self> {
46        // Check the permutation's image has the same size as the preimage
47        if let Some((old, new)) = perm
48            .as_ref()
49            .par_iter()
50            .enumerate()
51            .find_any(|&(_old, &new)| new >= perm.as_ref().len())
52        {
53            bail!(
54                "Found node {} has id {} in permutation, graph size is {}",
55                old,
56                new,
57                perm.as_ref().len()
58            );
59        }
60
61        // Check the permutation is injective
62        let mut seen = Vec::with_capacity(perm.as_ref().len() as _);
63        seen.extend((0..perm.as_ref().len()).map(|_| AtomicBool::new(false)));
64        if let Some((old, _)) = perm
65            .as_ref()
66            .par_iter()
67            .map(|&node| seen[node].fetch_or(true, Ordering::SeqCst))
68            .enumerate()
69            .find_any(|&(_node, is_duplicated)| is_duplicated)
70        {
71            let new = perm.as_ref()[old];
72            bail!(
73                "At least two nodes are mapped to {}; one of them is {}",
74                new,
75                old,
76            );
77        }
78
79        // Therefore, the permutation is bijective.
80
81        Ok(OwnedPermutation(perm))
82    }
83
84    pub fn dump<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
85        let mut pl = progress_logger!(
86            display_memory = true,
87            item_name = "byte",
88            local_speed = true,
89            expected_updates = Some(self.len() * 8),
90        );
91        pl.start("Writing permutation");
92
93        let chunk_size = 1_000_000; // 1M of u64 -> 8MB
94        let mut buf = vec![0u8; chunk_size * 8];
95        for chunk in self.0.as_ref().chunks(chunk_size) {
96            let buf_slice = &mut buf[..chunk.len() * 8]; // no-op except for the last chunk
97            assert_eq!(
98                usize::BITS,
99                u64::BITS,
100                "Only 64-bits architectures are supported"
101            );
102            let chunk: &[u64] =
103                bytemuck::try_cast_slice(chunk).expect("Could not cast &[usize] to &[u64]");
104            BigEndian::write_u64_into(chunk, buf_slice);
105            file.write_all(buf_slice)?;
106            pl.update_with_count(chunk.len() * 8);
107        }
108        pl.done();
109        Ok(())
110    }
111}
112
113impl<T: Sync + AsRef<[usize]> + AsMut<[usize]>> OwnedPermutation<T> {
114    /// Applies the other permutation to `self`, so `self` becomes a new permutation.
115    pub fn compose_in_place<T2: Permutation + Sync>(&mut self, other: T2) -> Result<()> {
116        if self.as_mut().len() != other.len() {
117            bail!(
118                "Cannot compose permutation of size {} with permutation of size {}",
119                self.as_ref().len(),
120                other.len()
121            );
122        }
123        self.as_mut()
124            .par_iter_mut()
125            .for_each(|x| *x = unsafe { other.get_unchecked(*x) });
126
127        Ok(())
128    }
129}
130
131impl OwnedPermutation<Vec<usize>> {
132    /// Loads a permutation from disk and returns IO errors if any.
133    #[inline]
134    pub fn load_unchecked(num_nodes: usize, path: &Path) -> Result<Self> {
135        assert_eq!(
136            usize::BITS,
137            u64::BITS,
138            "Only 64-bits architectures are supported"
139        );
140
141        let mut file = std::fs::File::open(path).context("Could not open permutation")?;
142
143        let mut buf = [0u8; 8];
144        file.read_exact(&mut buf)?;
145        let epserde = &buf == b"epserde ";
146        if epserde {
147            use epserde::prelude::*;
148
149            let perm = <Vec<usize>>::load_full(path)?;
150            Ok(OwnedPermutation(perm))
151        } else {
152            let mut perm = Vec::with_capacity(num_nodes);
153            file.rewind()?;
154            file.read_u64_into::<BigEndian>(unsafe {
155                std::mem::transmute::<&mut [MaybeUninit<usize>], &mut [u64]>(
156                    perm.spare_capacity_mut(),
157                )
158            })
159            .context("Could not read permutation")?;
160
161            // read_u64_into() called read_exact(), which checked the length
162            unsafe { perm.set_len(num_nodes) };
163
164            Ok(OwnedPermutation(perm))
165        }
166    }
167
168    /// Loads a permutation from disk, and returns errors in case of IO errors
169    /// or incorrect permutations.
170    #[inline]
171    pub fn load(num_nodes: usize, path: &Path) -> Result<Self> {
172        let perm = Self::load_unchecked(num_nodes, path)?;
173        Self::new(perm.0)
174    }
175}
176
177impl<T: Sync + AsRef<[usize]>> AsRef<[usize]> for OwnedPermutation<T> {
178    #[inline(always)]
179    fn as_ref(&self) -> &[usize] {
180        self.0.as_ref()
181    }
182}
183
184impl<T: Sync + AsRef<[usize]> + AsMut<[usize]>> AsMut<[usize]> for OwnedPermutation<T> {
185    #[inline(always)]
186    fn as_mut(&mut self) -> &mut [usize] {
187        self.0.as_mut()
188    }
189}
190
191impl<T: Sync + AsRef<[usize]>> std::ops::Index<usize> for OwnedPermutation<T> {
192    type Output = usize;
193
194    #[inline(always)]
195    fn index(&self, old_node: usize) -> &Self::Output {
196        &self.0.as_ref()[old_node]
197    }
198}
199
200impl<T: Sync + AsRef<[usize]>> Permutation for OwnedPermutation<T> {
201    fn len(&self) -> usize {
202        self.as_ref().len()
203    }
204
205    fn get(&self, old_node: usize) -> Option<usize> {
206        self.0.as_ref().get(old_node).copied()
207    }
208
209    unsafe fn get_unchecked(&self, old_node: usize) -> usize {
210        *self.0.as_ref().get_unchecked(old_node)
211    }
212}
213
214/// A [`Permutation`] backed by a big-endian mmapped file
215pub struct MappedPermutation(Mmap);
216
217impl MappedPermutation {
218    /// Creates a permutation
219    #[inline(always)]
220    pub fn new_unchecked(perm: Mmap) -> Self {
221        MappedPermutation(perm)
222    }
223
224    /// Creates a permutation, or returns an error in case the permutation is
225    /// incorrect
226    #[inline]
227    pub fn new(perm: Mmap) -> Result<Self> {
228        assert_eq!(
229            perm.size() % 8,
230            0,
231            "mmap has size {}, which is not a multiple of 8",
232            perm.size()
233        );
234
235        // Check the permutation's image has the same size as the preimage
236        if let Some((old, new)) = perm
237            .par_iter()
238            .chunks(8)
239            .map(|bytes| {
240                BigEndian::read_u64(bytes.into_iter().cloned().collect::<Vec<_>>().as_slice())
241                    as usize
242            })
243            .enumerate()
244            .find_any(|&(_old, new)| new >= perm.len())
245        {
246            bail!(
247                "Found node {} has id {} in permutation, graph size is {}",
248                old,
249                new,
250                perm.len()
251            );
252        }
253
254        // Check the permutation is injective
255        let mut seen = Vec::with_capacity(perm.len() as _);
256        seen.extend((0..perm.len()).map(|_| AtomicBool::new(false)));
257        if let Some((old, _)) = perm
258            .par_iter()
259            .chunks(8)
260            .map(|bytes| {
261                BigEndian::read_u64(bytes.into_iter().cloned().collect::<Vec<_>>().as_slice())
262                    as usize
263            })
264            .map(|node| seen[node].fetch_or(true, Ordering::SeqCst))
265            .enumerate()
266            .find_any(|&(_node, is_duplicated)| is_duplicated)
267        {
268            let new = perm[old];
269            bail!(
270                "At least two nodes are mapped to {}; one of them is {}",
271                new,
272                old,
273            );
274        }
275
276        // Therefore, the permutation is bijective.
277
278        Ok(MappedPermutation(perm))
279    }
280
281    /// Loads a permutation from disk and returns IO errors if any.
282    #[inline]
283    pub fn load_unchecked(path: &Path) -> Result<Self> {
284        assert_eq!(
285            usize::BITS,
286            u64::BITS,
287            "Only 64-bits architectures are supported"
288        );
289
290        let file_len = path.metadata()?.len();
291        assert_eq!(
292            file_len % 8,
293            0,
294            "{} has size {}, which is not a multiple of 8",
295            path.display(),
296            file_len
297        );
298
299        let file = std::fs::File::open(path).context("Could not open permutation")?;
300        let perm = unsafe {
301            mmap_rs::MmapOptions::new(file_len as _)
302                .context("Could not initialize permutation mmap")?
303                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
304                .with_file(&file, 0)
305        }
306        .map()
307        .context("Could not mmap permutation")?;
308
309        Ok(MappedPermutation(perm))
310    }
311
312    /// Loads a permutation from disk, and returns errors in case of IO errors
313    /// or incorrect permutations.
314    #[inline]
315    pub fn load(num_nodes: usize, path: &Path) -> Result<Self> {
316        let perm = Self::load_unchecked(path)?;
317        assert_eq!(
318            perm.len(),
319            num_nodes,
320            "Expected permutation to have length {}, got {}",
321            num_nodes,
322            perm.len()
323        );
324        Self::new(perm.0)
325    }
326}
327
328impl Permutation for MappedPermutation {
329    fn len(&self) -> usize {
330        self.0.size() / 8
331    }
332
333    fn get(&self, old_node: usize) -> Option<usize> {
334        let range = (old_node * 8)..((old_node + 1) * 8);
335        Some(BigEndian::read_u64(self.0.get(range)?) as usize)
336    }
337
338    unsafe fn get_unchecked(&self, old_node: usize) -> usize {
339        let range = (old_node * 8)..((old_node + 1) * 8);
340        BigEndian::read_u64(self.0.get_unchecked(range)) as usize
341    }
342}