swh_graph/map/
node2swhid.rs

// Copyright (C) 2023-2024  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information
5
6use crate::{OutOfBoundError, SWHID};
7use anyhow::{Context, Result};
8use mmap_rs::{Mmap, MmapFlags, MmapMut};
9
/// Struct to load a `.node2swhid.bin` file and convert node ids to SWHIDs.
///
/// `B` is the backing storage holding the raw bytes of the map. The
/// constructors in this file build it over a `Mmap`, a `MmapMut`, or an
/// in-memory `Vec<u8>`.
///
/// The bytes are treated as a flat array of fixed-size records: the SWHID of
/// node `i` occupies the `SWHID::BYTES_SIZE` bytes starting at byte offset
/// `i * SWHID::BYTES_SIZE`.
pub struct Node2SWHID<B> {
    /// Raw bytes of the map, one `SWHID::BYTES_SIZE`-byte record per node.
    data: B,
}
14
15impl Node2SWHID<Mmap> {
16    /// Load a `.node2swhid.bin` file
17    pub fn load<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
18        let path = path.as_ref();
19        let file_len = path
20            .metadata()
21            .with_context(|| format!("Could not stat {}", path.display()))?
22            .len();
23        let file = std::fs::File::open(path)
24            .with_context(|| format!("Could not open {}", path.display()))?;
25        let data = unsafe {
26            mmap_rs::MmapOptions::new(file_len as _)
27                .context("Could not initialize mmap")?
28                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
29                .with_file(&file, 0)
30                .map()
31                .with_context(|| format!("Could not mmap {}", path.display()))?
32        };
33        Ok(Self { data })
34    }
35}
36
37impl Node2SWHID<MmapMut> {
38    /// Create a new `.node2swhid.bin` file
39    pub fn new<P: AsRef<std::path::Path>>(path: P, num_nodes: usize) -> Result<Self> {
40        let path = path.as_ref();
41        let file_len = (num_nodes * SWHID::BYTES_SIZE)
42            .try_into()
43            .context("File size overflowed u64")?;
44        let file = std::fs::File::options()
45            .read(true)
46            .write(true)
47            .create_new(true)
48            .open(path)
49            .with_context(|| format!("Could not create {}", path.display()))?;
50
51        // fallocate the file with zeros so we can fill it without ever resizing it
52        file.set_len(file_len)
53            .with_context(|| format!("Could not fallocate {} with zeros", path.display()))?;
54
55        let data = unsafe {
56            mmap_rs::MmapOptions::new(file_len as _)
57                .context("Could not initialize mmap")?
58                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::SHARED)
59                .with_file(&file, 0)
60                .map_mut()
61                .with_context(|| format!("Could not mmap {}", path.display()))?
62        };
63        Ok(Self { data })
64    }
65}
66
67impl Node2SWHID<Vec<u8>> {
68    pub fn new_from_iter(swhids: impl ExactSizeIterator<Item = SWHID>) -> Self {
69        let file_len = swhids.len() * SWHID::BYTES_SIZE;
70        let data = vec![0; file_len];
71        let mut node2swhid = Node2SWHID { data };
72        for (i, swhid) in swhids.enumerate() {
73            node2swhid.set(i, swhid);
74        }
75        node2swhid
76    }
77}
78
79impl<B: AsRef<[u8]>> Node2SWHID<B> {
80    /// Convert a node_id to a SWHID
81    ///
82    /// # Safety
83    /// This function is unsafe because it does not check that `node_id` is
84    /// within bounds of the array if debug asserts are disabled
85    #[inline]
86    pub unsafe fn get_unchecked(&self, node_id: usize) -> SWHID {
87        let offset = node_id * SWHID::BYTES_SIZE;
88        let bytes = self
89            .data
90            .as_ref()
91            .get_unchecked(offset..offset + SWHID::BYTES_SIZE);
92        // this unwrap is always safe because we use the same const
93        let bytes: [u8; SWHID::BYTES_SIZE] = bytes.try_into().unwrap();
94        // this unwrap can only fail on a corrupted file, so it's ok to panic
95        SWHID::try_from(bytes).unwrap()
96    }
97
98    /// Convert a node_id to a SWHID
99    #[inline]
100    pub fn get(&self, node_id: usize) -> Result<SWHID, OutOfBoundError> {
101        let offset = node_id * SWHID::BYTES_SIZE;
102        let bytes = self
103            .data
104            .as_ref()
105            .get(offset..offset + SWHID::BYTES_SIZE)
106            .ok_or(OutOfBoundError {
107                index: node_id,
108                len: self.data.as_ref().len() / SWHID::BYTES_SIZE,
109            })?;
110        // this unwrap is always safe because we use the same const
111        let bytes: [u8; SWHID::BYTES_SIZE] = bytes.try_into().unwrap();
112        // this unwrap can only fail on a corrupted file, so it's ok to panic
113        Ok(SWHID::try_from(bytes).unwrap())
114    }
115
116    /// Return how many node_ids are in this map
117    #[allow(clippy::len_without_is_empty)] // rationale: we don't care about empty maps
118    #[inline]
119    pub fn len(&self) -> usize {
120        self.data.as_ref().len() / SWHID::BYTES_SIZE
121    }
122}
123
124impl<B: AsMut<[u8]> + AsRef<[u8]>> Node2SWHID<B> {
125    /// Set a node_id to map to a given SWHID, without checking bounds
126    ///
127    /// # Safety
128    /// This function is unsafe because it does not check that `node_id` is
129    /// within bounds of the array if debug asserts are disabled
130    #[inline]
131    pub unsafe fn set_unchecked(&mut self, node_id: usize, swhid: SWHID) {
132        let bytes: [u8; SWHID::BYTES_SIZE] = swhid.into();
133        let offset = node_id * SWHID::BYTES_SIZE;
134        self.data
135            .as_mut()
136            .get_unchecked_mut(offset..offset + SWHID::BYTES_SIZE)
137            .copy_from_slice(&bytes[..]);
138    }
139
140    /// Set a node_id to map to a given SWHID
141    #[inline]
142    pub fn set(&mut self, node_id: usize, swhid: SWHID) {
143        let bytes: [u8; SWHID::BYTES_SIZE] = swhid.into();
144        let offset = node_id * SWHID::BYTES_SIZE;
145        self.data
146            .as_mut()
147            .get_mut(offset..offset + SWHID::BYTES_SIZE)
148            .expect("Tried to write past the end of Node2SWHID map")
149            .copy_from_slice(&bytes[..]);
150    }
151}
152
153impl<B: AsRef<[u8]>> core::ops::Index<usize> for Node2SWHID<B> {
154    type Output = SWHID;
155    fn index(&self, index: usize) -> &Self::Output {
156        let offset = index * SWHID::BYTES_SIZE;
157        let bytes = &self.data.as_ref()[offset..offset + SWHID::BYTES_SIZE];
158        debug_assert!(core::mem::size_of::<SWHID>() == SWHID::BYTES_SIZE);
159        // unsafe :( but it's ok because SWHID does not depends on endianness
160        // also TODO!: check for version
161        unsafe { &*(bytes.as_ptr() as *const SWHID) }
162    }
163}