swh_graph/map/
permutation.rs

1// Copyright (C) 2023  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::io::{Read, Seek, Write};
7use std::mem::MaybeUninit;
8use std::path::Path;
9use std::sync::atomic::{AtomicBool, Ordering};
10
11use anyhow::{bail, Context, Result};
12use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
13use dsi_progress_logger::{progress_logger, ProgressLog};
14use mmap_rs::{Mmap, MmapFlags};
15use rayon::prelude::*;
16
17/// An array of `n` unique integers in the `0..n` range.
18#[allow(clippy::len_without_is_empty)]
19pub trait Permutation {
20    /// Returns the number of items
21    fn len(&self) -> usize;
22    /// Returns an item
23    fn get(&self, old_node: usize) -> Option<usize>;
24    /// Returns an item without checking it is within the bounds
25    ///
26    /// # Safety
27    ///
28    /// Undefined behavior if `old_node >= len()`
29    unsafe fn get_unchecked(&self, old_node: usize) -> usize;
30}
31
32/// A [`Permutation`] backed by an `usize` vector
33pub struct OwnedPermutation<T: Sync + AsRef<[usize]>>(T);
34
35impl<T: Sync + AsRef<[usize]>> OwnedPermutation<T> {
36    /// Creates a permutation
37    #[inline(always)]
38    pub fn new_unchecked(perm: T) -> Self {
39        OwnedPermutation(perm)
40    }
41
42    /// Creates a permutation, or returns an error in case the permutation is
43    /// incorrect
44    #[inline]
45    pub fn new(perm: T) -> Result<Self> {
46        // Check the permutation's image has the same size as the preimage
47        if let Some((old, new)) = perm
48            .as_ref()
49            .par_iter()
50            .enumerate()
51            .find_any(|&(_old, &new)| new >= perm.as_ref().len())
52        {
53            bail!(
54                "Found node {} has id {} in permutation, graph size is {}",
55                old,
56                new,
57                perm.as_ref().len()
58            );
59        }
60
61        // Check the permutation is injective
62        let mut seen = Vec::with_capacity(perm.as_ref().len() as _);
63        seen.extend((0..perm.as_ref().len()).map(|_| AtomicBool::new(false)));
64        if let Some((old, _)) = perm
65            .as_ref()
66            .par_iter()
67            .map(|&node| seen[node].fetch_or(true, Ordering::SeqCst))
68            .enumerate()
69            .find_any(|&(_node, is_duplicated)| is_duplicated)
70        {
71            let new = perm.as_ref()[old];
72            bail!(
73                "At least two nodes are mapped to {}; one of them is {}",
74                new,
75                old,
76            );
77        }
78
79        // Therefore, the permutation is bijective.
80
81        Ok(OwnedPermutation(perm))
82    }
83
84    pub fn dump<W: Write>(&self, file: &mut W) -> std::io::Result<()> {
85        let mut pl = progress_logger!(
86            display_memory = true,
87            item_name = "byte",
88            local_speed = true,
89            expected_updates = Some(self.len() * 8),
90        );
91        pl.start("Writing permutation");
92
93        let chunk_size = 1_000_000; // 1M of u64 -> 8MB
94        let mut buf = vec![0u8; chunk_size * 8];
95        for chunk in self.0.as_ref().chunks(chunk_size) {
96            let buf_slice = &mut buf[..chunk.len() * 8]; // no-op except for the last chunk
97            assert_eq!(
98                usize::BITS,
99                u64::BITS,
100                "Only 64-bits architectures are supported"
101            );
102            let chunk: &[u64] =
103                bytemuck::try_cast_slice(chunk).expect("Could not cast &[usize] to &[u64]");
104            BigEndian::write_u64_into(chunk, buf_slice);
105            file.write_all(buf_slice)?;
106            pl.update_with_count(chunk.len() * 8);
107        }
108        pl.done();
109        Ok(())
110    }
111}
112
113impl<T: Sync + AsRef<[usize]> + AsMut<[usize]>> OwnedPermutation<T> {
114    /// Applies the other permutation to `self`, so `self` becomes a new permutation.
115    pub fn compose_in_place<T2: Permutation + Sync>(&mut self, other: T2) -> Result<()> {
116        if self.as_mut().len() != other.len() {
117            bail!(
118                "Cannot compose permutation of size {} with permutation of size {}",
119                self.as_ref().len(),
120                other.len()
121            );
122        }
123        self.as_mut()
124            .par_iter_mut()
125            .for_each(|x| *x = unsafe { other.get_unchecked(*x) });
126
127        Ok(())
128    }
129}
130
131impl OwnedPermutation<Vec<usize>> {
132    /// Loads a permutation from disk and returns IO errors if any.
133    #[inline]
134    pub fn load_unchecked(num_nodes: usize, path: &Path) -> Result<Self> {
135        assert_eq!(
136            usize::BITS,
137            u64::BITS,
138            "Only 64-bits architectures are supported"
139        );
140
141        let mut file = std::fs::File::open(path).context("Could not open permutation")?;
142
143        let mut buf = [0u8; 8];
144        file.read_exact(&mut buf)?;
145        let epserde = &buf == b"epserde ";
146        if epserde {
147            use epserde::prelude::*;
148
149            // SAFETY: it's a plain vector and deserializing its values (usize) from any bytes is
150            // always valid
151            let perm = unsafe { <Vec<usize>>::load_full(path) }?;
152            Ok(OwnedPermutation(perm))
153        } else {
154            let mut perm = Vec::with_capacity(num_nodes);
155            file.rewind()?;
156            file.read_u64_into::<BigEndian>(unsafe {
157                std::mem::transmute::<&mut [MaybeUninit<usize>], &mut [u64]>(
158                    perm.spare_capacity_mut(),
159                )
160            })
161            .context("Could not read permutation")?;
162
163            // read_u64_into() called read_exact(), which checked the length
164            unsafe { perm.set_len(num_nodes) };
165
166            Ok(OwnedPermutation(perm))
167        }
168    }
169
170    /// Loads a permutation from disk, and returns errors in case of IO errors
171    /// or incorrect permutations.
172    #[inline]
173    pub fn load(num_nodes: usize, path: &Path) -> Result<Self> {
174        let perm = Self::load_unchecked(num_nodes, path)?;
175        Self::new(perm.0)
176    }
177}
178
179impl<T: Sync + AsRef<[usize]>> AsRef<[usize]> for OwnedPermutation<T> {
180    #[inline(always)]
181    fn as_ref(&self) -> &[usize] {
182        self.0.as_ref()
183    }
184}
185
186impl<T: Sync + AsRef<[usize]> + AsMut<[usize]>> AsMut<[usize]> for OwnedPermutation<T> {
187    #[inline(always)]
188    fn as_mut(&mut self) -> &mut [usize] {
189        self.0.as_mut()
190    }
191}
192
193impl<T: Sync + AsRef<[usize]>> std::ops::Index<usize> for OwnedPermutation<T> {
194    type Output = usize;
195
196    #[inline(always)]
197    fn index(&self, old_node: usize) -> &Self::Output {
198        &self.0.as_ref()[old_node]
199    }
200}
201
202impl<T: Sync + AsRef<[usize]>> Permutation for OwnedPermutation<T> {
203    fn len(&self) -> usize {
204        self.as_ref().len()
205    }
206
207    fn get(&self, old_node: usize) -> Option<usize> {
208        self.0.as_ref().get(old_node).copied()
209    }
210
211    unsafe fn get_unchecked(&self, old_node: usize) -> usize {
212        *self.0.as_ref().get_unchecked(old_node)
213    }
214}
215
216/// A [`Permutation`] backed by a big-endian mmapped file
217pub struct MappedPermutation(Mmap);
218
219impl MappedPermutation {
220    /// Creates a permutation
221    #[inline(always)]
222    pub fn new_unchecked(perm: Mmap) -> Self {
223        MappedPermutation(perm)
224    }
225
226    /// Creates a permutation, or returns an error in case the permutation is
227    /// incorrect
228    #[inline]
229    pub fn new(perm: Mmap) -> Result<Self> {
230        assert_eq!(
231            perm.size() % 8,
232            0,
233            "mmap has size {}, which is not a multiple of 8",
234            perm.size()
235        );
236
237        // Check the permutation's image has the same size as the preimage
238        if let Some((old, new)) = perm
239            .par_iter()
240            .chunks(8)
241            .map(|bytes| {
242                BigEndian::read_u64(bytes.into_iter().cloned().collect::<Vec<_>>().as_slice())
243                    as usize
244            })
245            .enumerate()
246            .find_any(|&(_old, new)| new >= perm.len())
247        {
248            bail!(
249                "Found node {} has id {} in permutation, graph size is {}",
250                old,
251                new,
252                perm.len()
253            );
254        }
255
256        // Check the permutation is injective
257        let mut seen = Vec::with_capacity(perm.len() as _);
258        seen.extend((0..perm.len()).map(|_| AtomicBool::new(false)));
259        if let Some((old, _)) = perm
260            .par_iter()
261            .chunks(8)
262            .map(|bytes| {
263                BigEndian::read_u64(bytes.into_iter().cloned().collect::<Vec<_>>().as_slice())
264                    as usize
265            })
266            .map(|node| seen[node].fetch_or(true, Ordering::SeqCst))
267            .enumerate()
268            .find_any(|&(_node, is_duplicated)| is_duplicated)
269        {
270            let new = perm[old];
271            bail!(
272                "At least two nodes are mapped to {}; one of them is {}",
273                new,
274                old,
275            );
276        }
277
278        // Therefore, the permutation is bijective.
279
280        Ok(MappedPermutation(perm))
281    }
282
283    /// Loads a permutation from disk and returns IO errors if any.
284    #[inline]
285    pub fn load_unchecked(path: &Path) -> Result<Self> {
286        assert_eq!(
287            usize::BITS,
288            u64::BITS,
289            "Only 64-bits architectures are supported"
290        );
291
292        let file_len = path.metadata()?.len();
293        assert_eq!(
294            file_len % 8,
295            0,
296            "{} has size {}, which is not a multiple of 8",
297            path.display(),
298            file_len
299        );
300
301        let file = std::fs::File::open(path).context("Could not open permutation")?;
302        let perm = unsafe {
303            mmap_rs::MmapOptions::new(file_len as _)
304                .context("Could not initialize permutation mmap")?
305                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
306                .with_file(&file, 0)
307        }
308        .map()
309        .context("Could not mmap permutation")?;
310
311        Ok(MappedPermutation(perm))
312    }
313
314    /// Loads a permutation from disk, and returns errors in case of IO errors
315    /// or incorrect permutations.
316    #[inline]
317    pub fn load(num_nodes: usize, path: &Path) -> Result<Self> {
318        let perm = Self::load_unchecked(path)?;
319        assert_eq!(
320            perm.len(),
321            num_nodes,
322            "Expected permutation to have length {}, got {}",
323            num_nodes,
324            perm.len()
325        );
326        Self::new(perm.0)
327    }
328}
329
330impl Permutation for MappedPermutation {
331    fn len(&self) -> usize {
332        self.0.size() / 8
333    }
334
335    fn get(&self, old_node: usize) -> Option<usize> {
336        let range = (old_node * 8)..((old_node + 1) * 8);
337        Some(BigEndian::read_u64(self.0.get(range)?) as usize)
338    }
339
340    unsafe fn get_unchecked(&self, old_node: usize) -> usize {
341        let range = (old_node * 8)..((old_node + 1) * 8);
342        BigEndian::read_u64(self.0.get_unchecked(range)) as usize
343    }
344}