swh_graph/java_compat/sf/
gov3.rs

1/*
2 *
3 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
4 * SPDX-FileCopyrightText: 2023 Inria
5 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
6 *
7 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
8 */
9
10//! Ported from <https://github.com/vigna/Sux4J/blob/master/c/sf3.c>
11
12use crate::java_compat::mph::spooky::{spooky_short, spooky_short_rehash};
13use anyhow::Result;
14use std::fs::File;
15use std::io::BufReader;
16use std::io::Read;
17use std::path::Path;
18
19/// A stub to load and access Genuzio-Ottaviano-Vigna static functions.
20///
21/// To generate the structure you must use the Java version:
22/// ```bash
23/// java it.unimi.dsi.sux4j.mph.GOV3Function --byte-array SOURCE test.sf
24/// ```
25///
26/// To obtain a file that can be read by this structure, load
27/// the serialized Java instance of the static function and
28/// dump it in C-compatible format:
29/// ```shell
30/// echo '((it.unimi.dsi.sux4j.mph.GOV3Function)it.unimi.dsi.fastutil.io.BinIO.loadObject("test.sf")).dump("test.csf");' | jshell
31/// ```
32///
33/// You can now load the dumped file with the [`load`](GOV3::load) method.
34///
35/// # Reference:
36/// - [Marco Genuzio, Giuseppe Ottaviano, and Sebastiano Vigna, Fast Scalable Construction of (Minimal Perfect Hash) Functions](https://arxiv.org/pdf/1603.04330.pdf)
37/// - [Java version with `dump` method](https://github.com/vigna/Sux4J/blob/master/src/it/unimi/dsi/sux4j/mph/GOV3Function.java)
38
39#[derive(Debug)]
40pub struct GOV3 {
41    pub size: u64,
42    pub width: u64,
43    pub multiplier: u64,
44    pub global_seed: u64,
45    pub offset_and_seed: Vec<u64>,
46    pub array: Vec<u64>,
47}
48
49impl GOV3 {
50    /// Given a path to a file `.csf3` generated from the java version,
51    /// load a GOV3 structure from a file
52    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
53        Self::load_reader(BufReader::new(File::open(path.as_ref())?))
54    }
55
56    /// Given a generic `Read` implementor, load a GOV3 structure from a file.
57    pub fn load_reader<F: Read>(mut file: F) -> Result<Self> {
58        macro_rules! read {
59            ($file:expr, $type:ty) => {{
60                let mut buffer: [u8; core::mem::size_of::<$type>()] =
61                    [0; core::mem::size_of::<$type>()];
62                $file.read_exact(&mut buffer)?;
63                <$type>::from_le_bytes(buffer)
64            }};
65        }
66
67        macro_rules! read_array {
68            ($file:expr, $type:ty) => {{
69                // create a bytes buffer big enough for $len elements of type $type
70                let len = read!($file, u64) as usize;
71                let bytes = len * core::mem::size_of::<$type>();
72                let mut buffer: Vec<u8> = vec![0; bytes];
73                // read the file in the buffer
74                $file.read_exact(&mut buffer)?;
75                // convert the buffer Vec<u8> into a Vec<$type>
76                let ptr = buffer.as_mut_ptr();
77                core::mem::forget(buffer);
78                unsafe { Vec::from_raw_parts(ptr as *mut $type, len, len) }
79            }};
80        }
81        // actually load the data :)
82        let size = read!(file, u64);
83        let width = read!(file, u64);
84        let multiplier = read!(file, u64);
85        let global_seed = read!(file, u64);
86        let offset_and_seed = read_array!(file, u64);
87        let array = read_array!(file, u64);
88
89        Ok(Self {
90            size,
91            width,
92            multiplier,
93            global_seed,
94            offset_and_seed,
95            array,
96        })
97    }
98}
99
100impl GOV3 {
101    pub fn size(&self) -> u64 {
102        self.size
103    }
104
105    pub fn get_byte_array(&self, key: &[u8]) -> u64 {
106        let signature = spooky_short(key, self.global_seed);
107        let bucket = ((((signature[0] as u128) >> 1) * (self.multiplier as u128)) >> 64) as u64;
108        let offset_seed = self.offset_and_seed[bucket as usize];
109        let bucket_offset = offset_seed & OFFSET_MASK;
110        let num_variables =
111            (self.offset_and_seed[bucket as usize + 1] & OFFSET_MASK) - bucket_offset;
112        let e = signature_to_equation(&signature, offset_seed & (!OFFSET_MASK), num_variables);
113        get_value(&self.array, e[0] + bucket_offset, self.width)
114            ^ get_value(&self.array, e[1] + bucket_offset, self.width)
115            ^ get_value(&self.array, e[2] + bucket_offset, self.width)
116    }
117}
118
119const OFFSET_MASK: u64 = u64::MAX >> 8;
120
121#[inline(always)]
122#[must_use]
123fn signature_to_equation(signature: &[u64; 4], seed: u64, num_variables: u64) -> [u64; 3] {
124    let hash = spooky_short_rehash(signature, seed);
125    let shift = num_variables.leading_zeros();
126    let mask = (1_u64 << shift) - 1;
127    [
128        ((hash[0] & mask) * num_variables) >> shift,
129        ((hash[1] & mask) * num_variables) >> shift,
130        ((hash[2] & mask) * num_variables) >> shift,
131    ]
132}
133
134#[inline(always)]
135#[must_use]
136fn get_value(array: &[u64], mut pos: u64, width: u64) -> u64 {
137    pos *= width;
138    let l = 64 - width;
139    let start_word = pos / 64;
140    let start_bit = pos % 64;
141    if start_bit <= l {
142        (array[start_word as usize] << (l - start_bit)) >> l
143    } else {
144        (array[start_word as usize] >> start_bit)
145            | ((array[start_word as usize + 1] << (64 + l - start_bit)) >> l)
146    }
147}