swh_graph/compress/
label_names.rs

1// Copyright (C) 2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::fs::File;
7use std::io::{BufRead, BufReader};
8use std::path::{Path, PathBuf};
9
10use anyhow::{ensure, Context, Result};
11use dsi_progress_logger::{concurrent_progress_logger, progress_logger, ProgressLog};
12use pthash::{
13    BuildConfiguration, DictionaryDictionary, Hashable, Minimal, MurmurHash2_128, PartitionedPhf,
14    Phf,
15};
16use rayon::prelude::*;
17
18use crate::labels::LabelNameId;
19use crate::map::{MappedPermutation, OwnedPermutation, Permutation};
20
21pub struct LabelName<T: AsRef<[u8]>>(pub T);
22
23impl<T: AsRef<[u8]>> Hashable for LabelName<T> {
24    type Bytes<'a>
25        = &'a [u8]
26    where
27        T: 'a;
28    fn as_bytes(&self) -> Self::Bytes<'_> {
29        self.0.as_ref()
30    }
31}
32
33// pthash requires 128-bits hash when using over 2^30 keys, and the 2024-05-16 production
34// graph has just over 2^32 keys
35pub type LabelNameMphf = PartitionedPhf<Minimal, MurmurHash2_128, DictionaryDictionary>;
36
37fn iter_labels(path: &Path) -> Result<impl Iterator<Item = LabelName<Box<[u8]>>>> {
38    let base64 = base64_simd::STANDARD;
39    let labels_file =
40        File::open(path).with_context(|| format!("Could not open {}", path.display()))?;
41    Ok(BufReader::new(labels_file)
42        .lines()
43        .map(move |label_base64| {
44            let label_base64 = label_base64.expect("Could not read line");
45            LabelName(
46                base64
47                    .decode_to_vec(&label_base64)
48                    .unwrap_or_else(|_| panic!("Label {label_base64}, could not be base64-decoded"))
49                    .into_boxed_slice(),
50            )
51        }))
52}
53
54/// Reads base64-encoded labels from the path and return a MPH function for them.
55pub fn build_mphf(path: PathBuf, num_labels: usize) -> Result<LabelNameMphf> {
56    let mut pass_counter = 0;
57    let iter_labels = || {
58        pass_counter += 1;
59        let mut pl = concurrent_progress_logger!(
60            display_memory = true,
61            item_name = "label",
62            local_speed = true,
63            expected_updates = Some(num_labels),
64        );
65        pl.start(format!("Reading labels (pass #{pass_counter})"));
66        iter_labels(&path)
67            .expect("Could not read labels")
68            .inspect(move |_| pl.light_update())
69    };
70    let temp_dir = tempfile::tempdir().unwrap();
71
72    // From zack's benchmarks on the 2023-09-06 graph (4 billion label names)
73    let mut config = BuildConfiguration::new(temp_dir.path().to_owned());
74    config.c = 4.000000;
75    config.alpha = 0.940000;
76    config.num_partitions = (num_labels as u64).div_ceil(40000000);
77    config.num_threads = num_cpus::get() as u64;
78
79    log::info!("Building MPH with parameters: {:?}", config);
80
81    assert_ne!(
82        config.num_partitions, 0,
83        "Cannot build MPHF for empty label list"
84    );
85
86    let mut f = LabelNameMphf::new();
87    f.build_in_internal_memory_from_bytes(iter_labels, &config)
88        .context("Failed to build MPH")?;
89    Ok(f)
90}
91
92/// Reads base64-encoded labels from the path and a MPH function for these labels
93/// (as returned by [`build_mphf`]) and returns a permutation that maps their hashes
94/// to their position in the (sorted) list.
95pub fn build_order(
96    path: PathBuf,
97    mphf_path: PathBuf,
98    num_labels: usize,
99) -> Result<OwnedPermutation<Vec<usize>>> {
100    assert_eq!(
101        usize::BITS,
102        u64::BITS,
103        "Only 64-bits architectures are supported"
104    );
105
106    let mphf = LabelNameMphf::load(&mphf_path)
107        .with_context(|| format!("Could not load MPH from {}", mphf_path.display()))?;
108    ensure!(
109        mphf.num_keys() == num_labels as u64,
110        "mphf.num_keys() == {} does not match expected number of keys ({})",
111        mphf.num_keys(),
112        num_labels
113    );
114
115    let mut pl = progress_logger!(
116        display_memory = true,
117        item_name = "label",
118        local_speed = true,
119        expected_updates = Some(num_labels),
120    );
121    pl.start("Reading labels");
122
123    let mut order: Vec<_> = (0..num_labels).map(|_| usize::MAX).collect();
124    for (i, label) in iter_labels(&path)?.enumerate() {
125        pl.light_update();
126        let hash = mphf.hash(&label) as usize;
127        ensure!(hash < num_labels, "{} is not minimal", mphf_path.display());
128        ensure!(
129            order[hash] == usize::MAX,
130            "hash collision involving {}",
131            String::from_utf8_lossy(&label.0)
132        );
133        order[hash] = i;
134    }
135
136    log::info!("Checking permutation...");
137    order.par_iter().enumerate().try_for_each(|(i, value)| {
138        ensure!(*value != usize::MAX, "no label hash equals {}", i);
139        Ok(())
140    })?;
141
142    Ok(OwnedPermutation::new_unchecked(order))
143}
144
145#[derive(Clone, Copy)]
146pub struct LabelNameHasher<'a> {
147    mphf: &'a LabelNameMphf,
148    order: &'a MappedPermutation,
149}
150
151impl<'a> LabelNameHasher<'a> {
152    pub fn new(mphf: &'a LabelNameMphf, order: &'a MappedPermutation) -> Result<Self> {
153        ensure!(
154            mphf.num_keys() == order.len() as u64,
155            "Number of MPHF keys ({}) does not match permutation length ({})",
156            mphf.num_keys(),
157            order.len()
158        );
159
160        Ok(LabelNameHasher { mphf, order })
161    }
162
163    pub fn mphf(&self) -> &'a LabelNameMphf {
164        self.mphf
165    }
166
167    pub fn hash<T: AsRef<[u8]>>(&self, label_name: T) -> Result<LabelNameId> {
168        Ok(LabelNameId(
169            self.order
170                .get(
171                    self.mphf
172                        .hash(LabelName(label_name))
173                        .try_into()
174                        .expect("label MPH overflowed"),
175                )
176                .context("out of bound dir entry name hash")?
177                .try_into()
178                .context("label permutation overflowed")?,
179        ))
180    }
181}