swh_graph/java_compat/mph/
gov.rs

1/*
2 *
3 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
4 * SPDX-FileCopyrightText: 2023 Inria
5 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
6 *
7 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
8 */
9
10//! Ported from <https://github.com/vigna/Sux4J/blob/master/c/mph.c>
11
12use super::spooky::{spooky_short, spooky_short_rehash};
13use anyhow::Result;
14use std::fs::File;
15use std::io::BufReader;
16use std::io::Read;
17use std::path::Path;
18
19/// A stub to load and access Genuzio-Ottaviano-Vigna minimal perfect hash functions.
20///
21/// To generate the structure you must use the Java version:
22/// ```bash
23/// java it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction --byte-array SOURCE test.mph
24/// ```
25///
26/// To obtain a file that can be read by this structure, load
27/// the serialized Java instance of the minimal perfect hash function and
28/// dump it in C-compatible format:
29/// ```shell
30/// echo '((it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction)it.unimi.dsi.fastutil.io.BinIO.loadObject("test.mph")).dump("test.cmph");' | jshell
31/// ```
32///
33/// You can now load the dumped file with the [`load`](GOVMPH::load) method.
34///
35/// # Reference:
36/// - [Marco Genuzio, Giuseppe Ottaviano, and Sebastiano Vigna, Fast Scalable Construction of (Minimal Perfect Hash) Functions](https://arxiv.org/pdf/1603.04330.pdf)
37/// - [Java version with `dump` method](https://github.com/vigna/Sux4J/blob/master/src/it/unimi/dsi/sux4j/mph/GOVMinimalPerfectHashFunction.java)
38#[derive(Debug)]
39pub struct GOVMPH {
40    pub size: u64,
41    pub multiplier: u64,
42    pub global_seed: u64,
43    pub edge_offset_and_seed: Vec<u64>,
44    pub array: Vec<u64>,
45}
46
47impl GOVMPH {
48    pub fn size(&self) -> u64 {
49        self.size
50    }
51
52    pub fn get_byte_array(&self, key: &[u8]) -> u64 {
53        let signature = spooky_short(key, self.global_seed);
54        let bucket = ((((signature[0] as u128) >> 1) * (self.multiplier as u128)) >> 64) as u64;
55        let edge_offset_seed = self.edge_offset_and_seed[bucket as usize];
56        let bucket_offset = vertex_offset(edge_offset_seed);
57        let num_variables =
58            vertex_offset(self.edge_offset_and_seed[bucket as usize + 1]) - bucket_offset;
59        let e = signature_to_equation(&signature, edge_offset_seed & (!OFFSET_MASK), num_variables);
60        let eq_idx = (get_2bit_value(&self.array, e[0] + bucket_offset)
61            + get_2bit_value(&self.array, e[1] + bucket_offset)
62            + get_2bit_value(&self.array, e[2] + bucket_offset))
63            % 3;
64        let offset = count_nonzero_pairs(
65            bucket_offset,
66            bucket_offset + e[eq_idx as usize],
67            &self.array,
68        );
69        (edge_offset_seed & OFFSET_MASK) + offset
70    }
71}
72
73#[inline(always)]
74#[must_use]
75/// Count the number of pairs of bits that are both set in a word.
76const fn count_non_zero_pairs_in_word(x: u64) -> u64 {
77    ((x | (x >> 1)) & 0x5555555555555555).count_ones() as u64
78}
79
80/// Count the number of pairs of bits that are both set in a slice of words from
81/// bit offset `start` to bit offset `end`.
82fn count_nonzero_pairs(start: u64, end: u64, array: &[u64]) -> u64 {
83    let mut block = start / 32;
84    let end_block = end / 32;
85    let start_offset = start % 32;
86    let end_offset = end % 32;
87
88    if block == end_block {
89        return count_non_zero_pairs_in_word(
90            (array[block as usize] & ((1 << (end_offset * 2)) - 1)) >> (start_offset * 2),
91        );
92    }
93
94    let mut pairs = 0;
95    if start_offset != 0 {
96        pairs += count_non_zero_pairs_in_word(array[block as usize] >> (start_offset * 2));
97        block += 1;
98    }
99    while block < end_block {
100        pairs += count_non_zero_pairs_in_word(array[block as usize]);
101        block += 1;
102    }
103    if end_offset != 0 {
104        pairs +=
105            count_non_zero_pairs_in_word(array[block as usize] & ((1 << (end_offset * 2)) - 1));
106    }
107    pairs
108}
109
110const OFFSET_MASK: u64 = u64::MAX >> 8;
111const C_TIMES_256: u64 = 281; // floor((1.09 + 0.01) * 256.0)
112
113#[inline(always)]
114#[must_use]
115fn signature_to_equation(signature: &[u64; 4], seed: u64, num_variables: u64) -> [u64; 3] {
116    let hash = spooky_short_rehash(signature, seed);
117    let shift = num_variables.leading_zeros();
118    let mask = (1_u64 << shift) - 1;
119    [
120        ((hash[0] & mask) * num_variables) >> shift,
121        ((hash[1] & mask) * num_variables) >> shift,
122        ((hash[2] & mask) * num_variables) >> shift,
123    ]
124}
125
126#[inline(always)]
127#[must_use]
128const fn vertex_offset(edge_offset_seed: u64) -> u64 {
129    ((edge_offset_seed & OFFSET_MASK) * C_TIMES_256) >> 8
130}
131
132#[inline(always)]
133#[must_use]
134const fn get_2bit_value(data: &[u64], mut pos: u64) -> u64 {
135    pos *= 2;
136    (data[(pos / 64) as usize] >> (pos % 64)) & 3
137}
138
139impl GOVMPH {
140    /// Given a path to a file `.cmph` generated from the java version,
141    /// load a GOVMPH structure from a file
142    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
143        Self::load_reader(BufReader::new(File::open(path.as_ref())?))
144    }
145
146    /// Given a generic `Read` implementor, load a GOVMPH structure from a file.
147    pub fn load_reader<F: Read>(mut file: F) -> Result<Self> {
148        macro_rules! read {
149            ($file:expr, $type:ty) => {{
150                let mut buffer: [u8; core::mem::size_of::<$type>()] =
151                    [0; core::mem::size_of::<$type>()];
152                $file.read_exact(&mut buffer)?;
153                <$type>::from_le_bytes(buffer)
154            }};
155        }
156
157        macro_rules! read_array {
158            ($file:expr, $type:ty) => {{
159                // create a bytes buffer big enough for $len elements of type $type
160                let len = read!($file, u64) as usize;
161                let mut buffer: Vec<u64> = vec![0; len];
162                // read the file in the buffer
163                $file.read_exact(bytemuck::cast_slice_mut(&mut buffer))?;
164                buffer
165            }};
166        }
167        // actually load the data :)
168        let size = read!(file, u64);
169        let multiplier = read!(file, u64);
170        let global_seed = read!(file, u64);
171        let edge_offset_and_seed = read_array!(file, u64);
172        let array = read_array!(file, u64);
173
174        Ok(Self {
175            size,
176            multiplier,
177            global_seed,
178            edge_offset_and_seed,
179            array,
180        })
181    }
182}