swh_graph/front_coded_list/
read.rs

1// Copyright (C) 2023  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::fs::File;
7use std::io::BufReader;
8use std::path::Path;
9
10use anyhow::{bail, Context, Result};
11use mmap_rs::{Mmap, MmapFlags};
12
13use crate::utils::{suffix_path, GetIndex};
14
15#[derive(Debug, Clone)]
16/// Front coded list, it takes a list of strings and encode them in a way that
17/// the common prefix between strings is encoded only once.
18///
19/// The encoding is done in blocks of k strings, the first string is encoded
20/// without compression, the other strings are encoded with the common prefix
21/// removed.
22///
23/// See the
24/// [`it.unimi.dsi.fastutil.bytes.ByteArrayFrontCodedBigList` documentation](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedBigList.html)
25/// or the
26/// [implementation](https://archive.softwareheritage.org/swh:1:cnt:2fc1092d2f792fcfcbf6ff9baf849f6d22e41486;origin=https://repo1.maven.org/maven2/it/unimi/dsi/fastutil;visit=swh:1:snp:8007412c404cf39fa38e3db600bdf93700410741;anchor=swh:1:rel:1ec2b63253f642eae54f1a3e5ddd20178867bc7d;path=/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedList.java) for details
27pub struct FrontCodedList<D: AsRef<[u8]>, P: AsRef<[u8]>> {
28    /// The number of strings in a block, this regulates the compression vs
29    /// decompression speed tradeoff
30    k: usize,
31    /// Number of encoded strings
32    len: usize,
33    /// The encoded bytestrings
34    data: D,
35    /// The pointer to in which byte the k-th string start, in big endian
36    pointers: P,
37}
38
39impl FrontCodedList<Mmap, Mmap> {
40    pub fn load<P: AsRef<Path>>(base_path: P) -> Result<Self> {
41        let properties_path = suffix_path(&base_path, ".properties");
42        let bytearray_path = suffix_path(&base_path, ".bytearray");
43        let pointers_path = suffix_path(&base_path, ".pointers");
44
45        // Parse properties
46        let properties_file = File::open(&properties_path)
47            .with_context(|| format!("Could not open {}", properties_path.display()))?;
48        let map = java_properties::read(BufReader::new(properties_file)).with_context(|| {
49            format!(
50                "Could not parse properties from {}",
51                properties_path.display()
52            )
53        })?;
54        let len =
55            map.get("n").unwrap().parse::<usize>().with_context(|| {
56                format!("Could not parse 'n' from {}", properties_path.display())
57            })?;
58        let k = map
59            .get("ratio")
60            .unwrap()
61            .parse::<usize>()
62            .with_context(|| {
63                format!("Could not parse 'ratio' from {}", properties_path.display())
64            })?;
65
66        // mmap data
67        let bytearray_len = bytearray_path
68            .metadata()
69            .with_context(|| format!("Could not read {} stats", bytearray_path.display()))?
70            .len();
71        let bytearray_file = std::fs::File::open(&bytearray_path)
72            .with_context(|| format!("Could not open {}", bytearray_path.display()))?;
73        let data = unsafe {
74            mmap_rs::MmapOptions::new(bytearray_len as _)
75                .context("Could not initialize mmap")?
76                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
77                .with_file(&bytearray_file, 0)
78                .map()
79                .with_context(|| format!("Could not mmap {}", bytearray_path.display()))?
80        };
81
82        // mmap pointers
83        let pointers_len = pointers_path
84            .metadata()
85            .with_context(|| format!("Could not read {} stats", pointers_path.display()))?
86            .len();
87        let expected_pointers_len = ((len.div_ceil(k)) * 8) as u64;
88        if pointers_len != expected_pointers_len {
89            bail!(
90                "FCL at {} has length {} and ratio {} so {} should have length {}, but it has length {}",
91                base_path.as_ref().display(),
92                len,
93                k,
94                pointers_path.display(),
95                expected_pointers_len,
96                pointers_len
97            );
98        }
99        let pointers_file = std::fs::File::open(&pointers_path)
100            .with_context(|| format!("Could not open {}", pointers_path.display()))?;
101        let pointers = unsafe {
102            mmap_rs::MmapOptions::new(pointers_len as _)
103                .context("Could not initialize mmap")?
104                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::RANDOM_ACCESS)
105                .with_file(&pointers_file, 0)
106                .map()
107                .with_context(|| format!("Could not mmap {}", pointers_path.display()))?
108        };
109
110        Ok(FrontCodedList {
111            k,
112            len,
113            data,
114            pointers,
115        })
116    }
117}
118
119// Adapted from https://archive.softwareheritage.org/swh:1:cnt:08cf9306577d3948360afebfa77ee623edec7f1a;origin=https://github.com/vigna/sux-rs;visit=swh:1:snp:bed7ce7510f76c0b1e8fb995778028614bfff354;anchor=swh:1:rev:fac0e742d7a404237abca48e4aeffcde34f41e58;path=/src/dict/rear_coded_list.rs;lines=304-329
120impl<D: AsRef<[u8]>, P: AsRef<[u8]>> FrontCodedList<D, P> {
121    /// Write the index-th string to `result` as bytes. This is done to avoid
122    /// allocating a new string for every query.
123    #[inline(always)]
124    pub fn get_inplace(&self, index: usize, result: &mut Vec<u8>) {
125        result.clear();
126        let block = index / self.k;
127        let offset = index % self.k;
128
129        let start = u64::from_be_bytes(
130            self.pointers.as_ref()[block * 8..(block + 1) * 8]
131                .try_into()
132                .unwrap(),
133        )
134        .try_into()
135        .expect("FCL pointer overflowed usize");
136        let data = &self.data.as_ref()[start..];
137
138        // decode the first string in the block
139        let (len, mut data) = decode_int(data);
140        result.extend(&data[..len as usize]);
141        data = &data[len as usize..];
142
143        for _ in 0..offset {
144            let (new_suffix_len, tmp) = decode_int(data);
145            let (reused_prefix_len, tmp) = decode_int(tmp);
146
147            result.resize(reused_prefix_len as usize, 0);
148
149            result.extend(&tmp[..new_suffix_len as usize]);
150
151            data = &tmp[new_suffix_len as usize..];
152        }
153    }
154}
155
156impl<D: AsRef<[u8]>, P: AsRef<[u8]>> GetIndex for &FrontCodedList<D, P> {
157    type Output = Vec<u8>;
158
159    fn len(&self) -> usize {
160        self.len
161    }
162
163    /// Returns the n-th bytestring, or `None` if `index` is larger than the length
164    fn get(&self, index: usize) -> Option<Self::Output> {
165        if index >= self.len {
166            None
167        } else {
168            let mut result = Vec::with_capacity(128);
169            self.get_inplace(index, &mut result);
170            Some(result)
171        }
172    }
173
174    /// Returns the n-th bytestring
175    ///
176    /// # Panics
177    ///
178    /// If `index` is out of bound
179    unsafe fn get_unchecked(&self, index: usize) -> Self::Output {
180        let mut result = Vec::with_capacity(128);
181        self.get_inplace(index, &mut result);
182        result
183    }
184}
185
186#[inline(always)]
187// Reads a varint at the beginning of the array, then returns the varint and the rest
188// of the array.
189//
190// Adapted from https://archive.softwareheritage.org/swh:1:cnt:66c21893f9cd9686456b0127df0b9b48a0fe153d;origin=https://repo1.maven.org/maven2/it/unimi/dsi/fastutil;visit=swh:1:snp:8007412c404cf39fa38e3db600bdf93700410741;anchor=swh:1:rel:1ec2b63253f642eae54f1a3e5ddd20178867bc7d;path=/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedList.java;lines=142-159
191pub(crate) fn decode_int(data: &[u8]) -> (u32, &[u8]) {
192    let high_bit_mask = 0b1000_0000u8;
193    let invert = |n: u8| (!n) as u32;
194
195    if data[0] & high_bit_mask == 0 {
196        (data[0] as u32, &data[1..])
197    } else if data[1] & high_bit_mask == 0 {
198        ((invert(data[0]) << 7) | (data[1] as u32), &data[2..])
199    } else if data[2] & high_bit_mask == 0 {
200        (
201            ((invert(data[0])) << 14) | (invert(data[1]) << 7) | (data[2] as u32),
202            &data[3..],
203        )
204    } else if data[3] & high_bit_mask == 0 {
205        (
206            (invert(data[0]) << 21)
207                | (invert(data[1]) << 14)
208                | (invert(data[2]) << 7)
209                | (data[3] as u32),
210            &data[4..],
211        )
212    } else {
213        (
214            ((invert(data[0])) << 28)
215                | (invert(data[1]) << 21)
216                | (invert(data[2]) << 14)
217                | (invert(data[3]) << 7)
218                | (data[4] as u32),
219            &data[5..],
220        )
221    }
222}