Skip to main content

swh_graph/
mph.rs

1// Copyright (C) 2024-2026  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Abstraction over possible Minimal-perfect hash functions
7
8use std::path::Path;
9
10use anyhow::{bail, Context, Result};
11use pthash::{DictionaryDictionary, Minimal, MurmurHash2_128, PartitionedPhf, Phf};
12
13use crate::graph::NodeId;
14use crate::java_compat::mph::gov::GOVMPH;
15use crate::map::{MappedPermutation, Permutation};
16use crate::utils::suffix_path;
17use crate::SWHID;
18
19/// Minimal-perfect hash function over [`SWHID`].
20///
21/// See [`DynMphf`] which wraps all implementer structs in an enum to dynamically choose
22/// which MPH algorithm to use with less overhead than `dyn SwhidMphf`.
23pub trait SwhidMphf {
24    /// Hashes a SWHID's binary representation
25    #[inline(always)]
26    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
27        self.hash_swhid(&(*swhid).try_into().ok()?)
28    }
29
30    /// Hashes a SWHID's textual representation
31    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId>;
32
33    /// Hashes a SWHID's textual representation
34    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId>;
35
36    /// Hashes a [`SWHID`]
37    #[inline(always)]
38    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
39        self.hash_str(swhid.to_string())
40    }
41}
42
43#[diagnostic::on_unimplemented(
44    label = "requires LoadableSwhidMphf (not just SwhidMphf)",
45    note = "swh-graph v7.0.0 split some methods of the SwhidMphf trait into a new LoadableSwhidMphf trait."
46)]
47/// Sub-trait of [`SwhidMphf`] for "concrete" MPHF implementations
48pub trait LoadableSwhidMphf: SwhidMphf {
49    type WithMappedPermutation: SwhidMphf;
50
51    fn load(basepath: impl AsRef<Path>) -> Result<Self>
52    where
53        Self: Sized;
54
55    /// Given the base path of the MPH, mmaps the associated .order file and returns it
56    fn with_mapped_permutation(
57        self,
58        basepath: impl AsRef<Path>,
59    ) -> Result<Self::WithMappedPermutation>;
60}
61
62/// Composition of a MPHF and a permutation
63pub struct PermutedMphf<MPHF: SwhidMphf, P: Permutation> {
64    mphf: MPHF,
65    permutation: P,
66}
67
68impl<MPHF: SwhidMphf, P: Permutation> SwhidMphf for PermutedMphf<MPHF, P> {
69    #[inline(always)]
70    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
71        self.mphf
72            .hash_array(swhid)
73            .and_then(|id| self.permutation.get(id))
74    }
75
76    #[inline(always)]
77    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
78        self.mphf
79            .hash_str(swhid)
80            .and_then(|id| self.permutation.get(id))
81    }
82
83    #[inline(always)]
84    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
85        self.mphf
86            .hash_str_array(swhid)
87            .and_then(|id| self.permutation.get(id))
88    }
89
90    #[inline(always)]
91    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
92        self.mphf
93            .hash_swhid(swhid)
94            .and_then(|id| self.permutation.get(id))
95    }
96}
97
98/// Trivial implementation of [`SwhidMphf`] that stores the list of items in a vector
99pub struct VecMphf {
100    pub swhids: Vec<SWHID>,
101}
102
103impl SwhidMphf for VecMphf {
104    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
105        swhid
106            .as_ref()
107            .try_into()
108            .ok()
109            .and_then(|swhid: SWHID| self.hash_swhid(&swhid))
110    }
111
112    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
113        String::from_utf8(swhid.to_vec())
114            .ok()
115            .and_then(|swhid| self.hash_str(swhid))
116    }
117
118    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
119        self.swhids.iter().position(|item| item == swhid)
120    }
121}
122
123impl LoadableSwhidMphf for GOVMPH {
124    type WithMappedPermutation = PermutedMphf<Self, MappedPermutation>;
125
126    fn load(basepath: impl AsRef<Path>) -> Result<Self>
127    where
128        Self: Sized,
129    {
130        let path = suffix_path(basepath, ".cmph");
131        GOVMPH::load(&path).with_context(|| format!("Could not load {}", path.display()))
132    }
133
134    fn with_mapped_permutation(
135        self,
136        basepath: impl AsRef<Path>,
137    ) -> Result<Self::WithMappedPermutation> {
138        let suffix = ".order"; // not .cmph.order because .order predates "modular" MPH functions
139        let permutation = MappedPermutation::load_unchecked(&suffix_path(&basepath, suffix))?;
140        Ok(PermutedMphf {
141            mphf: self,
142            permutation,
143        })
144    }
145}
146
147impl SwhidMphf for GOVMPH {
148    #[inline(always)]
149    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
150        Some(self.get_byte_array(swhid.as_ref().as_bytes()) as usize)
151    }
152
153    #[inline(always)]
154    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
155        Some(self.get_byte_array(swhid) as usize)
156    }
157}
158
159pub struct SwhidPthash(pub PartitionedPhf<Minimal, MurmurHash2_128, DictionaryDictionary>);
160
161impl SwhidPthash {
162    pub fn load(path: impl AsRef<Path>) -> Result<Self>
163    where
164        Self: Sized,
165    {
166        let path = path.as_ref();
167        PartitionedPhf::load(path)
168            .with_context(|| format!("Could not load {}", path.display()))
169            .map(SwhidPthash)
170    }
171}
172
173/// Newtype to make SWHID hashable by PTHash
174pub(crate) struct HashableSWHID<T: AsRef<[u8]>>(pub T);
175
176impl<T: AsRef<[u8]>> pthash::Hashable for HashableSWHID<T> {
177    type Bytes<'a>
178        = &'a T
179    where
180        T: 'a;
181    fn as_bytes(&self) -> Self::Bytes<'_> {
182        &self.0
183    }
184}
185
186impl LoadableSwhidMphf for SwhidPthash {
187    type WithMappedPermutation = PermutedMphf<Self, MappedPermutation>;
188
189    fn load(basepath: impl AsRef<Path>) -> Result<Self>
190    where
191        Self: Sized,
192    {
193        let path = suffix_path(basepath, ".pthash");
194        SwhidPthash::load(path)
195    }
196
197    fn with_mapped_permutation(
198        self,
199        basepath: impl AsRef<Path>,
200    ) -> Result<Self::WithMappedPermutation> {
201        let suffix = ".pthash.order";
202        let permutation = MappedPermutation::load_unchecked(&suffix_path(&basepath, suffix))?;
203        Ok(PermutedMphf {
204            mphf: self,
205            permutation,
206        })
207    }
208}
209
210impl SwhidMphf for SwhidPthash {
211    #[inline(always)]
212    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
213        Some(self.0.hash(HashableSWHID(swhid.as_ref().as_bytes())) as usize)
214    }
215
216    #[inline(always)]
217    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
218        Some(self.0.hash(HashableSWHID(swhid)) as usize)
219    }
220}
221
222/// Enum of possible implementations of [`SwhidMphf`].
223///
224/// Loads either [`SwhidPthash`] or [`GOVMPH`]
225/// depending on which file is available at the given path.
226pub enum DynMphf {
227    Pthash(SwhidPthash),
228    GOV(GOVMPH),
229}
230
231impl std::fmt::Debug for DynMphf {
232    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
233        match self {
234            DynMphf::Pthash(_) => write!(f, "DynMphf::Pthash(_)"),
235            DynMphf::GOV(_) => write!(f, "DynMphf::GOV(_)"),
236        }
237    }
238}
239
240impl From<GOVMPH> for DynMphf {
241    #[inline(always)]
242    fn from(value: GOVMPH) -> DynMphf {
243        DynMphf::GOV(value)
244    }
245}
246
247impl From<SwhidPthash> for DynMphf {
248    #[inline(always)]
249    fn from(value: SwhidPthash) -> DynMphf {
250        DynMphf::Pthash(value)
251    }
252}
253
254impl LoadableSwhidMphf for DynMphf {
255    type WithMappedPermutation = PermutedDynMphf;
256
257    fn load(basepath: impl AsRef<Path>) -> Result<Self> {
258        let basepath = basepath.as_ref();
259
260        let pthash_path = suffix_path(basepath, ".pthash");
261        if pthash_path.exists() {
262            return LoadableSwhidMphf::load(basepath)
263                .map(Self::Pthash)
264                .with_context(|| format!("Could not load {}", pthash_path.display()));
265        }
266
267        let gov_path = suffix_path(basepath, ".cmph");
268        if gov_path.exists() {
269            return LoadableSwhidMphf::load(basepath)
270                .map(Self::GOV)
271                .with_context(|| format!("Could not load {}", gov_path.display()));
272        }
273
274        bail!(
275            "Cannot load MPH function, neither {} nor {} exists.",
276            pthash_path.display(),
277            gov_path.display()
278        );
279    }
280
281    fn with_mapped_permutation(
282        self,
283        basepath: impl AsRef<Path>,
284    ) -> Result<Self::WithMappedPermutation> {
285        match self {
286            Self::Pthash(mphf) => mphf
287                .with_mapped_permutation(basepath)
288                .map(PermutedDynMphf::Pthash),
289            Self::GOV(mphf) => mphf
290                .with_mapped_permutation(basepath)
291                .map(PermutedDynMphf::GOV),
292        }
293    }
294}
295
296impl SwhidMphf for DynMphf {
297    #[inline(always)]
298    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
299        match self {
300            Self::Pthash(mphf) => mphf.hash_array(swhid),
301            Self::GOV(mphf) => mphf.hash_array(swhid),
302        }
303    }
304
305    #[inline(always)]
306    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
307        match self {
308            Self::Pthash(mphf) => mphf.hash_str(swhid),
309            Self::GOV(mphf) => mphf.hash_str(swhid),
310        }
311    }
312
313    #[inline(always)]
314    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
315        match self {
316            Self::Pthash(mphf) => mphf.hash_str_array(swhid),
317            Self::GOV(mphf) => mphf.hash_str_array(swhid),
318        }
319    }
320
321    #[inline(always)]
322    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
323        match self {
324            Self::Pthash(mphf) => mphf.hash_swhid(swhid),
325            Self::GOV(mphf) => mphf.hash_swhid(swhid),
326        }
327    }
328}
329
330/// [`DynMphf`] composed with a permutation
331pub enum PermutedDynMphf {
332    Pthash(<SwhidPthash as LoadableSwhidMphf>::WithMappedPermutation),
333    GOV(<GOVMPH as LoadableSwhidMphf>::WithMappedPermutation),
334}
335
336impl SwhidMphf for PermutedDynMphf {
337    #[inline(always)]
338    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
339        match self {
340            Self::Pthash(mphf) => mphf.hash_array(swhid),
341            Self::GOV(mphf) => mphf.hash_array(swhid),
342        }
343    }
344
345    #[inline(always)]
346    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
347        match self {
348            Self::Pthash(mphf) => mphf.hash_str(swhid),
349            Self::GOV(mphf) => mphf.hash_str(swhid),
350        }
351    }
352
353    #[inline(always)]
354    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
355        match self {
356            Self::Pthash(mphf) => mphf.hash_str_array(swhid),
357            Self::GOV(mphf) => mphf.hash_str_array(swhid),
358        }
359    }
360
361    #[inline(always)]
362    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
363        match self {
364            Self::Pthash(mphf) => mphf.hash_swhid(swhid),
365            Self::GOV(mphf) => mphf.hash_swhid(swhid),
366        }
367    }
368}