swh_graph/
mph.rs

1// Copyright (C) 2024-2025  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Abstraction over possible Minimal-perfect hash functions
7
8use std::path::Path;
9
10use anyhow::{bail, Context, Result};
11use pthash::{DictionaryDictionary, Minimal, MurmurHash2_128, PartitionedPhf, Phf};
12
13use crate::graph::NodeId;
14use crate::java_compat::mph::gov::GOVMPH;
15use crate::map::{MappedPermutation, Permutation};
16use crate::utils::suffix_path;
17use crate::SWHID;
18
19/// Minimal-perfect hash function over [`SWHID`].
20///
21/// See [`DynMphf`] which wraps all implementer structs in an enum to dynamically choose
22/// which MPH algorithm to use with less overhead than `dyn SwhidMphf`.
23pub trait SwhidMphf {
24    /// Hashes a SWHID's binary representation
25    #[inline(always)]
26    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
27        self.hash_swhid(&(*swhid).try_into().ok()?)
28    }
29
30    /// Hashes a SWHID's textual representation
31    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId>;
32
33    /// Hashes a SWHID's textual representation
34    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId>;
35
36    /// Hashes a [`SWHID`]
37    #[inline(always)]
38    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
39        self.hash_str(swhid.to_string())
40    }
41}
42
43#[diagnostic::on_unimplemented(
44    label = "requires LoadableSwhidMphf (not just SwhidMphf)",
45    note = "swh-graph v7.0.0 split some methods of the SwhidMphf trait into a new LoadableSwhidMphf trait."
46)]
47/// Sub-trait of [`SwhidMphf`] for "concrete" MPHF implementations
48pub trait LoadableSwhidMphf: SwhidMphf {
49    type WithMappedPermutation: SwhidMphf;
50
51    fn load(basepath: impl AsRef<Path>) -> Result<Self>
52    where
53        Self: Sized;
54
55    /// Given the base path of the MPH, mmaps the associated .order file and returns it
56    fn with_mapped_permutation(
57        self,
58        basepath: impl AsRef<Path>,
59    ) -> Result<Self::WithMappedPermutation>;
60}
61
62/// Composition of a MPHF and a permutation
63pub struct PermutedMphf<MPHF: SwhidMphf, P: Permutation> {
64    mphf: MPHF,
65    permutation: P,
66}
67
68impl<MPHF: SwhidMphf, P: Permutation> SwhidMphf for PermutedMphf<MPHF, P> {
69    #[inline(always)]
70    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
71        self.mphf
72            .hash_array(swhid)
73            .and_then(|id| self.permutation.get(id))
74    }
75
76    #[inline(always)]
77    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
78        self.mphf
79            .hash_str(swhid)
80            .and_then(|id| self.permutation.get(id))
81    }
82
83    #[inline(always)]
84    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
85        self.mphf
86            .hash_str_array(swhid)
87            .and_then(|id| self.permutation.get(id))
88    }
89
90    #[inline(always)]
91    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
92        self.mphf
93            .hash_swhid(swhid)
94            .and_then(|id| self.permutation.get(id))
95    }
96}
97
98/// Trivial implementation of [`SwhidMphf`] that stores the list of items in a vector
99pub struct VecMphf {
100    pub swhids: Vec<SWHID>,
101}
102
103impl SwhidMphf for VecMphf {
104    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
105        swhid
106            .as_ref()
107            .try_into()
108            .ok()
109            .and_then(|swhid: SWHID| self.hash_swhid(&swhid))
110    }
111
112    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
113        String::from_utf8(swhid.to_vec())
114            .ok()
115            .and_then(|swhid| self.hash_str(swhid))
116    }
117
118    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
119        self.swhids.iter().position(|item| item == swhid)
120    }
121}
122
123impl LoadableSwhidMphf for GOVMPH {
124    type WithMappedPermutation = PermutedMphf<Self, MappedPermutation>;
125
126    fn load(basepath: impl AsRef<Path>) -> Result<Self>
127    where
128        Self: Sized,
129    {
130        let path = suffix_path(basepath, ".cmph");
131        GOVMPH::load(&path).with_context(|| format!("Could not load {}", path.display()))
132    }
133
134    fn with_mapped_permutation(
135        self,
136        basepath: impl AsRef<Path>,
137    ) -> Result<Self::WithMappedPermutation> {
138        let suffix = ".order"; // not .cmph.order because .order predates "modular" MPH functions
139        let permutation = MappedPermutation::load_unchecked(&suffix_path(&basepath, suffix))?;
140        Ok(PermutedMphf {
141            mphf: self,
142            permutation,
143        })
144    }
145}
146
147impl SwhidMphf for GOVMPH {
148    #[inline(always)]
149    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
150        Some(self.get_byte_array(swhid.as_ref().as_bytes()) as usize)
151    }
152
153    #[inline(always)]
154    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
155        Some(self.get_byte_array(swhid) as usize)
156    }
157}
158
159pub struct SwhidPthash(pub PartitionedPhf<Minimal, MurmurHash2_128, DictionaryDictionary>);
160
161impl SwhidPthash {
162    pub fn load(path: impl AsRef<Path>) -> Result<Self>
163    where
164        Self: Sized,
165    {
166        let path = path.as_ref();
167        PartitionedPhf::load(path)
168            .with_context(|| format!("Could not load {}", path.display()))
169            .map(SwhidPthash)
170    }
171}
172
173/// Newtype to make SWHID hashable by PTHash
174pub(crate) struct HashableSWHID<T: AsRef<[u8]>>(pub T);
175
176impl<T: AsRef<[u8]>> pthash::Hashable for HashableSWHID<T> {
177    type Bytes<'a>
178        = &'a T
179    where
180        T: 'a;
181    fn as_bytes(&self) -> Self::Bytes<'_> {
182        &self.0
183    }
184}
185
186impl LoadableSwhidMphf for SwhidPthash {
187    type WithMappedPermutation = PermutedMphf<Self, MappedPermutation>;
188
189    fn load(basepath: impl AsRef<Path>) -> Result<Self>
190    where
191        Self: Sized,
192    {
193        let path = suffix_path(basepath, ".pthash");
194        SwhidPthash::load(path)
195    }
196
197    fn with_mapped_permutation(
198        self,
199        basepath: impl AsRef<Path>,
200    ) -> Result<Self::WithMappedPermutation> {
201        let suffix = ".pthash.order";
202        let permutation = MappedPermutation::load_unchecked(&suffix_path(&basepath, suffix))?;
203        Ok(PermutedMphf {
204            mphf: self,
205            permutation,
206        })
207    }
208}
209
210impl SwhidMphf for SwhidPthash {
211    #[inline(always)]
212    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
213        Some(self.0.hash(HashableSWHID(swhid.as_ref().as_bytes())) as usize)
214    }
215
216    #[inline(always)]
217    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
218        Some(self.0.hash(HashableSWHID(swhid)) as usize)
219    }
220}
221
222/// Enum of possible implementations of [`SwhidMphf`].
223///
224/// Loads either [`SwhidPthash`] or [`GOVMPH`]
225/// depending on which file is available at the given path.
226pub enum DynMphf {
227    Pthash(SwhidPthash),
228    GOV(GOVMPH),
229}
230
231impl std::fmt::Debug for DynMphf {
232    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
233        match self {
234            DynMphf::Pthash(_) => write!(f, "DynMphf::Pthash(_)"),
235            DynMphf::GOV(_) => write!(f, "DynMphf::GOV(_)"),
236        }
237    }
238}
239
240impl From<GOVMPH> for DynMphf {
241    fn from(value: GOVMPH) -> DynMphf {
242        DynMphf::GOV(value)
243    }
244}
245
246impl From<SwhidPthash> for DynMphf {
247    fn from(value: SwhidPthash) -> DynMphf {
248        DynMphf::Pthash(value)
249    }
250}
251
252impl LoadableSwhidMphf for DynMphf {
253    type WithMappedPermutation = PermutedDynMphf;
254
255    fn load(basepath: impl AsRef<Path>) -> Result<Self> {
256        let basepath = basepath.as_ref();
257
258        let pthash_path = suffix_path(basepath, ".pthash");
259        if pthash_path.exists() {
260            return LoadableSwhidMphf::load(basepath)
261                .map(Self::Pthash)
262                .with_context(|| format!("Could not load {}", pthash_path.display()));
263        }
264
265        let gov_path = suffix_path(basepath, ".cmph");
266        if gov_path.exists() {
267            return LoadableSwhidMphf::load(basepath)
268                .map(Self::GOV)
269                .with_context(|| format!("Could not load {}", gov_path.display()));
270        }
271
272        bail!(
273            "Cannot load MPH function, neither {} nor {} exists.",
274            pthash_path.display(),
275            gov_path.display()
276        );
277    }
278
279    fn with_mapped_permutation(
280        self,
281        basepath: impl AsRef<Path>,
282    ) -> Result<Self::WithMappedPermutation> {
283        match self {
284            Self::Pthash(mphf) => mphf
285                .with_mapped_permutation(basepath)
286                .map(PermutedDynMphf::Pthash),
287            Self::GOV(mphf) => mphf
288                .with_mapped_permutation(basepath)
289                .map(PermutedDynMphf::GOV),
290        }
291    }
292}
293
294impl SwhidMphf for DynMphf {
295    #[inline(always)]
296    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
297        match self {
298            Self::Pthash(mphf) => mphf.hash_array(swhid),
299            Self::GOV(mphf) => mphf.hash_array(swhid),
300        }
301    }
302
303    #[inline(always)]
304    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
305        match self {
306            Self::Pthash(mphf) => mphf.hash_str(swhid),
307            Self::GOV(mphf) => mphf.hash_str(swhid),
308        }
309    }
310
311    #[inline(always)]
312    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
313        match self {
314            Self::Pthash(mphf) => mphf.hash_str_array(swhid),
315            Self::GOV(mphf) => mphf.hash_str_array(swhid),
316        }
317    }
318
319    #[inline(always)]
320    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
321        match self {
322            Self::Pthash(mphf) => mphf.hash_swhid(swhid),
323            Self::GOV(mphf) => mphf.hash_swhid(swhid),
324        }
325    }
326}
327
328/// [`DynMphf`] composed with a permutation
329pub enum PermutedDynMphf {
330    Pthash(<SwhidPthash as LoadableSwhidMphf>::WithMappedPermutation),
331    GOV(<GOVMPH as LoadableSwhidMphf>::WithMappedPermutation),
332}
333
334impl SwhidMphf for PermutedDynMphf {
335    #[inline(always)]
336    fn hash_array(&self, swhid: &[u8; SWHID::BYTES_SIZE]) -> Option<NodeId> {
337        match self {
338            Self::Pthash(mphf) => mphf.hash_array(swhid),
339            Self::GOV(mphf) => mphf.hash_array(swhid),
340        }
341    }
342
343    #[inline(always)]
344    fn hash_str(&self, swhid: impl AsRef<str>) -> Option<NodeId> {
345        match self {
346            Self::Pthash(mphf) => mphf.hash_str(swhid),
347            Self::GOV(mphf) => mphf.hash_str(swhid),
348        }
349    }
350
351    #[inline(always)]
352    fn hash_str_array(&self, swhid: &[u8; 50]) -> Option<NodeId> {
353        match self {
354            Self::Pthash(mphf) => mphf.hash_str_array(swhid),
355            Self::GOV(mphf) => mphf.hash_str_array(swhid),
356        }
357    }
358
359    #[inline(always)]
360    fn hash_swhid(&self, swhid: &SWHID) -> Option<NodeId> {
361        match self {
362            Self::Pthash(mphf) => mphf.hash_swhid(swhid),
363            Self::GOV(mphf) => mphf.hash_swhid(swhid),
364        }
365    }
366}