Skip to main content

swh_graph/front_coded_list/
read.rs

1// Copyright (C) 2023-2026  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::fs::File;
7use std::io::BufReader;
8use std::path::Path;
9
10use anyhow::{bail, Context, Result};
11use mmap_rs::{Mmap, MmapFlags};
12use value_traits::slices::SliceByValue;
13
14use crate::utils::suffix_path;
15
16#[derive(Debug, Clone)]
17/// Front coded list, it takes a list of strings and encode them in a way that
18/// the common prefix between strings is encoded only once.
19///
20/// The encoding is done in blocks of k strings, the first string is encoded
21/// without compression, the other strings are encoded with the common prefix
22/// removed.
23///
24/// See the
25/// [`it.unimi.dsi.fastutil.bytes.ByteArrayFrontCodedBigList` documentation](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedBigList.html)
26/// or the
27/// [implementation](https://archive.softwareheritage.org/swh:1:cnt:2fc1092d2f792fcfcbf6ff9baf849f6d22e41486;origin=https://repo1.maven.org/maven2/it/unimi/dsi/fastutil;visit=swh:1:snp:8007412c404cf39fa38e3db600bdf93700410741;anchor=swh:1:rel:1ec2b63253f642eae54f1a3e5ddd20178867bc7d;path=/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedList.java) for details
28pub struct FrontCodedList<D: AsRef<[u8]>, P: AsRef<[u8]>> {
29    /// The number of strings in a block, this regulates the compression vs
30    /// decompression speed tradeoff
31    k: usize,
32    /// Number of encoded strings
33    len: usize,
34    /// The encoded bytestrings
35    data: D,
36    /// The pointer to which byte the k-th string starts, in big endian
37    pointers: P,
38}
39
40impl FrontCodedList<Mmap, Mmap> {
41    pub fn load<P: AsRef<Path>>(base_path: P) -> Result<Self> {
42        let properties_path = suffix_path(&base_path, ".properties");
43        let bytearray_path = suffix_path(&base_path, ".bytearray");
44        let pointers_path = suffix_path(&base_path, ".pointers");
45
46        // Parse properties
47        let properties_file = File::open(&properties_path)
48            .with_context(|| format!("Could not open {}", properties_path.display()))?;
49        let map = java_properties::read(BufReader::new(properties_file)).with_context(|| {
50            format!(
51                "Could not parse properties from {}",
52                properties_path.display()
53            )
54        })?;
55        let len =
56            map.get("n").unwrap().parse::<usize>().with_context(|| {
57                format!("Could not parse 'n' from {}", properties_path.display())
58            })?;
59        let k = map
60            .get("ratio")
61            .unwrap()
62            .parse::<usize>()
63            .with_context(|| {
64                format!("Could not parse 'ratio' from {}", properties_path.display())
65            })?;
66
67        // mmap data
68        let bytearray_len = bytearray_path
69            .metadata()
70            .with_context(|| format!("Could not read {} stats", bytearray_path.display()))?
71            .len();
72        let bytearray_file = std::fs::File::open(&bytearray_path)
73            .with_context(|| format!("Could not open {}", bytearray_path.display()))?;
74        let data = unsafe {
75            mmap_rs::MmapOptions::new(bytearray_len as _)
76                .context("Could not initialize mmap")?
77                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
78                .with_file(&bytearray_file, 0)
79                .map()
80                .with_context(|| format!("Could not mmap {}", bytearray_path.display()))?
81        };
82
83        // mmap pointers
84        let pointers_len = pointers_path
85            .metadata()
86            .with_context(|| format!("Could not read {} stats", pointers_path.display()))?
87            .len();
88        let expected_pointers_len = ((len.div_ceil(k)) * 8) as u64;
89        if pointers_len != expected_pointers_len {
90            bail!(
91                "FCL at {} has length {} and ratio {} so {} should have length {}, but it has length {}",
92                base_path.as_ref().display(),
93                len,
94                k,
95                pointers_path.display(),
96                expected_pointers_len,
97                pointers_len
98            );
99        }
100        let pointers_file = std::fs::File::open(&pointers_path)
101            .with_context(|| format!("Could not open {}", pointers_path.display()))?;
102        let pointers = unsafe {
103            mmap_rs::MmapOptions::new(pointers_len as _)
104                .context("Could not initialize mmap")?
105                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
106                .with_file(&pointers_file, 0)
107                .map()
108                .with_context(|| format!("Could not mmap {}", pointers_path.display()))?
109        };
110
111        Ok(FrontCodedList {
112            k,
113            len,
114            data,
115            pointers,
116        })
117    }
118}
119
120// Adapted from https://archive.softwareheritage.org/swh:1:cnt:08cf9306577d3948360afebfa77ee623edec7f1a;origin=https://github.com/vigna/sux-rs;visit=swh:1:snp:bed7ce7510f76c0b1e8fb995778028614bfff354;anchor=swh:1:rev:fac0e742d7a404237abca48e4aeffcde34f41e58;path=/src/dict/rear_coded_list.rs;lines=304-329
121impl<D: AsRef<[u8]>, P: AsRef<[u8]>> FrontCodedList<D, P> {
122    /// Write the index-th string to `result` as bytes. This is done to avoid
123    /// allocating a new string for every query.
124    #[inline(always)]
125    pub fn get_inplace(&self, index: usize, result: &mut Vec<u8>) {
126        result.clear();
127        let block = index / self.k;
128        let offset = index % self.k;
129
130        let start = u64::from_be_bytes(
131            self.pointers.as_ref()[block * 8..(block + 1) * 8]
132                .try_into()
133                .unwrap(),
134        )
135        .try_into()
136        .expect("FCL pointer overflowed usize");
137        let data = &self.data.as_ref()[start..];
138
139        // decode the first string in the block
140        let (len, mut data) = decode_int(data);
141        result.extend(&data[..len as usize]);
142        data = &data[len as usize..];
143
144        for _ in 0..offset {
145            let (new_suffix_len, tmp) = decode_int(data);
146            let (reused_prefix_len, tmp) = decode_int(tmp);
147
148            result.resize(reused_prefix_len as usize, 0);
149
150            result.extend(&tmp[..new_suffix_len as usize]);
151
152            data = &tmp[new_suffix_len as usize..];
153        }
154    }
155}
156
157impl<D: AsRef<[u8]>, P: AsRef<[u8]>> SliceByValue for &FrontCodedList<D, P> {
158    type Value = Vec<u8>;
159
160    fn len(&self) -> usize {
161        self.len
162    }
163
164    /// Returns the n-th bytestring, or `None` if `index` is larger than the length
165    fn get_value(&self, index: usize) -> Option<Self::Value> {
166        if index >= self.len {
167            None
168        } else {
169            let mut result = Vec::with_capacity(128);
170            self.get_inplace(index, &mut result);
171            Some(result)
172        }
173    }
174
175    /// Returns the n-th bytestring
176    ///
177    /// # Panics
178    ///
179    /// If `index` is out of bound
180    unsafe fn get_value_unchecked(&self, index: usize) -> Self::Value {
181        let mut result = Vec::with_capacity(128);
182        self.get_inplace(index, &mut result);
183        result
184    }
185}
186
187#[inline(always)]
188// Reads a varint at the beginning of the array, then returns the varint and the rest
189// of the array.
190//
191// Adapted from https://archive.softwareheritage.org/swh:1:cnt:66c21893f9cd9686456b0127df0b9b48a0fe153d;origin=https://repo1.maven.org/maven2/it/unimi/dsi/fastutil;visit=swh:1:snp:8007412c404cf39fa38e3db600bdf93700410741;anchor=swh:1:rel:1ec2b63253f642eae54f1a3e5ddd20178867bc7d;path=/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedList.java;lines=142-159
192pub(crate) fn decode_int(data: &[u8]) -> (u32, &[u8]) {
193    let high_bit_mask = 0b1000_0000u8;
194    let invert = |n: u8| (!n) as u32;
195
196    if data[0] & high_bit_mask == 0 {
197        (data[0] as u32, &data[1..])
198    } else if data[1] & high_bit_mask == 0 {
199        ((invert(data[0]) << 7) | (data[1] as u32), &data[2..])
200    } else if data[2] & high_bit_mask == 0 {
201        (
202            ((invert(data[0])) << 14) | (invert(data[1]) << 7) | (data[2] as u32),
203            &data[3..],
204        )
205    } else if data[3] & high_bit_mask == 0 {
206        (
207            (invert(data[0]) << 21)
208                | (invert(data[1]) << 14)
209                | (invert(data[2]) << 7)
210                | (data[3] as u32),
211            &data[4..],
212        )
213    } else {
214        (
215            ((invert(data[0])) << 28)
216                | (invert(data[1]) << 21)
217                | (invert(data[2]) << 14)
218                | (invert(data[3]) << 7)
219                | (data[4] as u32),
220            &data[5..],
221        )
222    }
223}