swh_graph/compress/
label_names.rs1use std::fs::File;
7use std::io::{BufRead, BufReader};
8use std::path::{Path, PathBuf};
9
10use anyhow::{ensure, Context, Result};
11use dsi_progress_logger::{concurrent_progress_logger, progress_logger, ProgressLog};
12use pthash::{
13 BuildConfiguration, DictionaryDictionary, Hashable, Minimal, MurmurHash2_128, PartitionedPhf,
14 Phf,
15};
16use rayon::prelude::*;
17
18use crate::labels::LabelNameId;
19use crate::map::{MappedPermutation, OwnedPermutation, Permutation};
20
21pub struct LabelName<T: AsRef<[u8]>>(pub T);
22
23impl<T: AsRef<[u8]>> Hashable for LabelName<T> {
24 type Bytes<'a>
25 = &'a [u8]
26 where
27 T: 'a;
28 fn as_bytes(&self) -> Self::Bytes<'_> {
29 self.0.as_ref()
30 }
31}
32
33pub type LabelNameMphf = PartitionedPhf<Minimal, MurmurHash2_128, DictionaryDictionary>;
36
37fn iter_labels(path: &Path) -> Result<impl Iterator<Item = LabelName<Box<[u8]>>>> {
38 let base64 = base64_simd::STANDARD;
39 let labels_file =
40 File::open(path).with_context(|| format!("Could not open {}", path.display()))?;
41 Ok(BufReader::new(labels_file)
42 .lines()
43 .map(move |label_base64| {
44 let label_base64 = label_base64.expect("Could not read line");
45 LabelName(
46 base64
47 .decode_to_vec(&label_base64)
48 .unwrap_or_else(|_| panic!("Label {label_base64}, could not be base64-decoded"))
49 .into_boxed_slice(),
50 )
51 }))
52}
53
54pub fn build_mphf(path: PathBuf, num_labels: usize) -> Result<LabelNameMphf> {
56 let mut pass_counter = 0;
57 let iter_labels = || {
58 pass_counter += 1;
59 let mut pl = concurrent_progress_logger!(
60 display_memory = true,
61 item_name = "label",
62 local_speed = true,
63 expected_updates = Some(num_labels),
64 );
65 pl.start(format!("Reading labels (pass #{pass_counter})"));
66 iter_labels(&path)
67 .expect("Could not read labels")
68 .inspect(move |_| pl.light_update())
69 };
70 let temp_dir = tempfile::tempdir().unwrap();
71
72 let mut config = BuildConfiguration::new(temp_dir.path().to_owned());
74 config.c = 4.000000;
75 config.alpha = 0.940000;
76 config.num_partitions = (num_labels as u64).div_ceil(40000000);
77 config.num_threads = num_cpus::get() as u64;
78
79 log::info!("Building MPH with parameters: {:?}", config);
80
81 assert_ne!(
82 config.num_partitions, 0,
83 "Cannot build MPHF for empty label list"
84 );
85
86 let mut f = LabelNameMphf::new();
87 f.build_in_internal_memory_from_bytes(iter_labels, &config)
88 .context("Failed to build MPH")?;
89 Ok(f)
90}
91
92pub fn build_order(
96 path: PathBuf,
97 mphf_path: PathBuf,
98 num_labels: usize,
99) -> Result<OwnedPermutation<Vec<usize>>> {
100 assert_eq!(
101 usize::BITS,
102 u64::BITS,
103 "Only 64-bits architectures are supported"
104 );
105
106 let mphf = LabelNameMphf::load(&mphf_path)
107 .with_context(|| format!("Could not load MPH from {}", mphf_path.display()))?;
108 ensure!(
109 mphf.num_keys() == num_labels as u64,
110 "mphf.num_keys() == {} does not match expected number of keys ({})",
111 mphf.num_keys(),
112 num_labels
113 );
114
115 let mut pl = progress_logger!(
116 display_memory = true,
117 item_name = "label",
118 local_speed = true,
119 expected_updates = Some(num_labels),
120 );
121 pl.start("Reading labels");
122
123 let mut order: Vec<_> = (0..num_labels).map(|_| usize::MAX).collect();
124 for (i, label) in iter_labels(&path)?.enumerate() {
125 pl.light_update();
126 let hash = mphf.hash(&label) as usize;
127 ensure!(hash < num_labels, "{} is not minimal", mphf_path.display());
128 ensure!(
129 order[hash] == usize::MAX,
130 "hash collision involving {}",
131 String::from_utf8_lossy(&label.0)
132 );
133 order[hash] = i;
134 }
135
136 log::info!("Checking permutation...");
137 order.par_iter().enumerate().try_for_each(|(i, value)| {
138 ensure!(*value != usize::MAX, "no label hash equals {}", i);
139 Ok(())
140 })?;
141
142 Ok(OwnedPermutation::new_unchecked(order))
143}
144
145#[derive(Clone, Copy)]
146pub struct LabelNameHasher<'a> {
147 mphf: &'a LabelNameMphf,
148 order: &'a MappedPermutation,
149}
150
151impl<'a> LabelNameHasher<'a> {
152 pub fn new(mphf: &'a LabelNameMphf, order: &'a MappedPermutation) -> Result<Self> {
153 ensure!(
154 mphf.num_keys() == order.len() as u64,
155 "Number of MPHF keys ({}) does not match permutation length ({})",
156 mphf.num_keys(),
157 order.len()
158 );
159
160 Ok(LabelNameHasher { mphf, order })
161 }
162
163 pub fn mphf(&self) -> &'a LabelNameMphf {
164 self.mphf
165 }
166
167 pub fn hash<T: AsRef<[u8]>>(&self, label_name: T) -> Result<LabelNameId> {
168 Ok(LabelNameId(
169 self.order
170 .get(
171 self.mphf
172 .hash(LabelName(label_name))
173 .try_into()
174 .expect("label MPH overflowed"),
175 )
176 .context("out of bound dir entry name hash")?
177 .try_into()
178 .context("label permutation overflowed")?,
179 ))
180 }
181}