swh_graph/properties/
maps.rs

1// Copyright (C) 2023-2025  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use anyhow::{Context, Result};
7use mmap_rs::Mmap;
8use thiserror::Error;
9
10use super::suffixes::*;
11use super::*;
12use crate::graph::NodeId;
13use crate::map::{Node2SWHID, Node2Type, UsizeMmap};
14use crate::mph::{LoadableSwhidMphf, SwhidMphf, VecMphf};
15use crate::utils::suffix_path;
16use crate::{swhid::StrSWHIDDeserializationError, NodeType, SWHID};
17
/// Marker trait implemented by both [`NoMaps`] and all implementors of [`Maps`],
/// to allow loading maps only if needed.
///
/// It declares no methods: it is only used as a bound on the maps type
/// parameter (see `with_maps`), so that a properties object can exist both
/// with and without maps loaded.
pub trait MaybeMaps {}
21
22pub struct MappedMaps<MPHF: LoadableSwhidMphf> {
23    mphf: <MPHF as LoadableSwhidMphf>::WithMappedPermutation,
24    node2swhid: Node2SWHID<Mmap>,
25    node2type: Node2Type<UsizeMmap<Mmap>>,
26}
27impl<M: Maps> MaybeMaps for M {}
28
/// Placeholder for when maps are not loaded.
///
/// It implements [`MaybeMaps`] but not [`Maps`], so the SWHID <-> node id
/// functions (which require `MAPS: Maps`) are not available on a properties
/// object that uses it.
pub struct NoMaps;
impl MaybeMaps for NoMaps {}
32
#[diagnostic::on_unimplemented(
    label = "does not have NodeId<->SWHID mappings loaded",
    note = "Use `let graph = graph.load_properties(|props| props.load_maps::<DynMphf>()).unwrap()` to load them",
    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
)]
/// Trait for backend storage of maps (either in-memory, or loaded from disk and memory-mapped)
///
/// The `on_unimplemented` notes above are shown by the compiler when a type
/// that does not implement this trait is used where maps are required.
pub trait Maps {
    /// Type of the hash function used to turn a SWHID into a node id
    type MPHF: SwhidMphf;

    /// Returns the hash function used to turn a SWHID into a node id
    fn mphf(&self) -> &Self::MPHF;
    /// Returns the SWHID of the given node, or an [`OutOfBoundError`] if the
    /// lookup fails
    fn node2swhid(&self, node: NodeId) -> Result<SWHID, OutOfBoundError>;
    /// Returns the type of the given node, or an [`OutOfBoundError`] if the
    /// lookup fails
    fn node2type(&self, node: NodeId) -> Result<NodeType, OutOfBoundError>;
}
46
47impl<MPHF: LoadableSwhidMphf> Maps for MappedMaps<MPHF> {
48    type MPHF = <MPHF as LoadableSwhidMphf>::WithMappedPermutation;
49
50    #[inline(always)]
51    fn mphf(&self) -> &Self::MPHF {
52        &self.mphf
53    }
54    #[inline(always)]
55    fn node2swhid(&self, node: NodeId) -> Result<SWHID, OutOfBoundError> {
56        self.node2swhid.get(node)
57    }
58    #[inline(always)]
59    fn node2type(&self, node: NodeId) -> Result<NodeType, OutOfBoundError> {
60        self.node2type.get(node)
61    }
62}
63
/// Trivial implementation of [`Maps`] that stores everything in a vector,
/// instead of mmapping from disk
///
/// Built with [`VecMaps::new`] from a list of SWHIDs.
pub struct VecMaps {
    // Hash function from SWHID to node id; owns the list of SWHIDs
    mphf: VecMphf,
    // Node id -> SWHID table, backed by a byte vector
    node2swhid: Node2SWHID<Vec<u8>>,
    // Node id -> node type table, backed by a vector of `usize`
    node2type: Node2Type<Vec<usize>>,
}
71
72impl VecMaps {
73    pub fn new(swhids: Vec<SWHID>) -> Self {
74        let node2swhid = Node2SWHID::new_from_iter(swhids.iter().cloned());
75        let node2type = Node2Type::new_from_iter(swhids.iter().map(|swhid| swhid.node_type));
76        VecMaps {
77            node2type,
78            node2swhid,
79            mphf: VecMphf { swhids },
80        }
81    }
82}
83
84impl Maps for VecMaps {
85    type MPHF = VecMphf;
86
87    #[inline(always)]
88    fn mphf(&self) -> &Self::MPHF {
89        &self.mphf
90    }
91    #[inline(always)]
92    fn node2swhid(&self, node: NodeId) -> Result<SWHID, OutOfBoundError> {
93        self.node2swhid.get(node)
94    }
95    #[inline(always)]
96    fn node2type(&self, node: NodeId) -> Result<NodeType, OutOfBoundError> {
97        self.node2type.get(node)
98    }
99}
100
101impl<
102        TIMESTAMPS: MaybeTimestamps,
103        PERSONS: MaybePersons,
104        CONTENTS: MaybeContents,
105        STRINGS: MaybeStrings,
106        LABELNAMES: MaybeLabelNames,
107    > SwhGraphProperties<NoMaps, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
108{
109    /// Consumes a [`SwhGraphProperties`] and returns a new one with these methods
110    /// available:
111    ///
112    /// * [`SwhGraphProperties::node_id_unchecked`]
113    /// * [`SwhGraphProperties::node_id`]
114    /// * [`SwhGraphProperties::swhid`]
115    /// * [`SwhGraphProperties::node_type`]
116    pub fn load_maps<MPHF: LoadableSwhidMphf>(
117        self,
118    ) -> Result<
119        SwhGraphProperties<MappedMaps<MPHF>, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
120    > {
121        let mphf = MPHF::load(&self.path)
122            .context("Could not load MPHF")?
123            .with_mapped_permutation(&self.path)
124            .context("Could not load permutation")?;
125        let maps = MappedMaps {
126            mphf,
127            node2swhid: Node2SWHID::load(suffix_path(&self.path, NODE2SWHID))
128                .context("Could not load node2swhid")?,
129            node2type: Node2Type::load(suffix_path(&self.path, NODE2TYPE), self.num_nodes)
130                .context("Could not load node2type")?,
131        };
132        self.with_maps(maps)
133    }
134
135    /// Alternative to [`load_maps`](Self::load_maps) that allows using arbitrary maps
136    /// implementations
137    pub fn with_maps<MAPS: MaybeMaps>(
138        self,
139        maps: MAPS,
140    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>> {
141        Ok(SwhGraphProperties {
142            maps,
143            timestamps: self.timestamps,
144            persons: self.persons,
145            contents: self.contents,
146            strings: self.strings,
147            label_names: self.label_names,
148            path: self.path,
149            num_nodes: self.num_nodes,
150            label_names_are_in_base64_order: self.label_names_are_in_base64_order,
151        })
152    }
153}
154
/// Error returned when a SWHID cannot be resolved to a node id.
///
/// `E` is the error type of the conversion from the caller's input to a [`SWHID`].
#[derive(Error, Debug, PartialEq, Eq, Hash)]
pub enum NodeIdFromSwhidError<E> {
    /// The input could not be converted to a [`SWHID`]; holds the conversion error.
    /// Note that the wrapped error is not included in the displayed message.
    #[error("invalid SWHID")]
    InvalidSwhid(E),
    /// The SWHID was parsed but no node with that SWHID was found.
    #[error("unknown SWHID: {0}")]
    UnknownSwhid(SWHID),
    /// The lookup could not be completed, e.g. the hash function returned a
    /// node id that the `node2swhid` map could not resolve.
    #[error("internal error: {0}")]
    InternalError(&'static str),
}
164
165/// Functions to map between SWHID and node id.
166///
167/// Only available after calling [`load_contents`](SwhGraphProperties::load_contents)
168/// or [`load_all_properties`](crate::graph::SwhBidirectionalGraph::load_all_properties)
169impl<
170        MAPS: Maps,
171        TIMESTAMPS: MaybeTimestamps,
172        PERSONS: MaybePersons,
173        CONTENTS: MaybeContents,
174        STRINGS: MaybeStrings,
175        LABELNAMES: MaybeLabelNames,
176    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
177{
178    /// Returns the node id of the given SWHID
179    ///
180    /// May return the id of a random node if the SWHID does not exist in the graph.
181    ///
182    /// # Safety
183    ///
184    /// Undefined behavior if the swhid does not exist.
185    #[inline]
186    pub unsafe fn node_id_unchecked(&self, swhid: &SWHID) -> NodeId {
187        self.maps
188            .mphf()
189            .hash_swhid(swhid)
190            .unwrap_or_else(|| panic!("Unknown SWHID {swhid}"))
191    }
192
193    /// Returns the node id of the given SWHID, or `None` if it does not exist.
194    #[inline]
195    pub fn node_id<T: TryInto<SWHID>>(
196        &self,
197        swhid: T,
198    ) -> Result<NodeId, NodeIdFromSwhidError<<T as TryInto<SWHID>>::Error>> {
199        use NodeIdFromSwhidError::*;
200
201        let swhid = swhid.try_into().map_err(InvalidSwhid)?;
202        let node_id = self
203            .maps
204            .mphf()
205            .hash_swhid(&swhid)
206            .ok_or(UnknownSwhid(swhid))?;
207        let actual_swhid = self
208            .maps
209            .node2swhid(node_id)
210            .map_err(|_| InternalError("node2swhid map is shorter than SWHID hash value"))?;
211        if actual_swhid == swhid {
212            Ok(node_id)
213        } else {
214            Err(UnknownSwhid(swhid))
215        }
216    }
217
218    /// Specialized version of `node_id` when the SWHID is a string
219    ///
220    /// Under the hood, when using [`GOVMPH`](crate::java_compat::mph::gov::GOVMPH),
221    /// `node_id` serializes the SWHID to a string, which can be bottleneck.
222    /// This function skips the serialization by working directly on the string.
223    #[inline]
224    pub fn node_id_from_string_swhid<T: AsRef<str>>(
225        &self,
226        swhid: T,
227    ) -> Result<NodeId, NodeIdFromSwhidError<StrSWHIDDeserializationError>> {
228        use NodeIdFromSwhidError::*;
229
230        let swhid = swhid.as_ref();
231        let node_id = self
232            .maps
233            .mphf()
234            .hash_str(swhid)
235            .ok_or_else(|| match swhid.try_into() {
236                Ok(swhid) => UnknownSwhid(swhid),
237                Err(e) => InvalidSwhid(e),
238            })?;
239        let actual_swhid = self
240            .maps
241            .node2swhid(node_id)
242            .map_err(|_| InternalError("node2swhid map is shorter than SWHID hash value"))?;
243        let swhid = SWHID::try_from(swhid).map_err(InvalidSwhid)?;
244        if actual_swhid == swhid {
245            Ok(node_id)
246        } else {
247            Err(UnknownSwhid(swhid))
248        }
249    }
250
251    /// Returns the SWHID of a given node
252    ///
253    /// # Panics
254    ///
255    /// If the node id does not exist.
256    #[inline]
257    pub fn swhid(&self, node_id: NodeId) -> SWHID {
258        self.try_swhid(node_id)
259            .unwrap_or_else(|e| panic!("Cannot get node SWHID: {e}"))
260    }
261
262    /// Returns the SWHID of a given node, or `None` if the node id does not exist
263    #[inline]
264    pub fn try_swhid(&self, node_id: NodeId) -> Result<SWHID, OutOfBoundError> {
265        self.maps.node2swhid(node_id)
266    }
267
268    /// Returns the type of a given node
269    ///
270    /// # Panics
271    ///
272    /// If the node id does not exist.
273    #[inline]
274    pub fn node_type(&self, node_id: NodeId) -> NodeType {
275        self.try_node_type(node_id)
276            .unwrap_or_else(|e| panic!("Cannot get node type: {e}"))
277    }
278
279    /// Returns the type of a given node, or `None` if the node id does not exist
280    #[inline]
281    pub fn try_node_type(&self, node_id: NodeId) -> Result<NodeType, OutOfBoundError> {
282        self.maps.node2type(node_id)
283    }
284}