swh_graph/compress/
mph.rs

1// Copyright (C) 2023  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::path::PathBuf;
7
8use anyhow::{Context, Result};
9use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
10use pthash::{BuildConfiguration, PartitionedPhf, Phf};
11use rayon::prelude::*;
12
13use crate::compress::zst_dir::*;
14use crate::mph::{HashableSWHID, SwhidPthash};
15
16/// Reads textual SWHIDs from the path and return a MPH function for them.
17pub fn build_swhids_mphf(swhids_dir: PathBuf, num_nodes: usize) -> Result<SwhidPthash> {
18    let mut pass_counter = 0;
19    let iter_swhids = || {
20        pass_counter += 1;
21        let mut pl = concurrent_progress_logger!(
22            display_memory = true,
23            item_name = "SWHID",
24            local_speed = true,
25            expected_updates = Some(num_nodes),
26        );
27        pl.start(format!("Reading SWHIDs (pass #{pass_counter})"));
28        par_iter_lines_from_dir(&swhids_dir, pl).map(HashableSWHID::<Vec<u8>>)
29    };
30    let temp_dir = tempfile::tempdir().unwrap();
31
32    // Tuned by zack on the 2023-09-06 graph on a machine with two Intel Xeon Gold 6342 CPUs
33    let mut config = BuildConfiguration::new(temp_dir.path().to_owned());
34    config.c = 5.;
35    config.alpha = 0.94;
36    config.num_partitions = num_nodes.div_ceil(10000000) as u64;
37    config.num_threads = num_cpus::get() as u64;
38
39    log::info!("Building MPH with parameters: {:?}", config);
40
41    let mut f = PartitionedPhf::new();
42    f.par_build_in_internal_memory_from_bytes(iter_swhids, &config)
43        .context("Failed to build MPH")?;
44    Ok(SwhidPthash(f))
45}