swh_graph/front_coded_list/
write.rs

1// Copyright (C) 2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::fs::File;
8use std::io::{BufWriter, Write};
9use std::num::NonZeroUsize;
10use std::path::Path;
11
12use anyhow::{Context, Result};
13
14use crate::utils::suffix_path;
15
16#[derive(thiserror::Error, Debug, PartialEq, Eq)]
17pub enum PushError {
18    #[error("Found the same string twice in a row")]
19    Duplicate,
20    #[error("String length is 2^28 or larger")]
21    TooLarge,
22}
23
24pub struct FrontCodedListBuilder {
25    /// The number of strings in a block, this regulates the compression vs
26    /// decompression speed tradeoff
27    block_size: NonZeroUsize,
28    /// Number of encoded strings
29    len: usize,
30    /// The encoded bytestrings
31    data: Vec<u8>,
32    /// The pointer to in which byte the k-th string start, in big endian
33    pointers: Vec<u8>,
34    last_string: Option<Vec<u8>>,
35}
36
37impl FrontCodedListBuilder {
38    /// Creates a new [`FrontCodedListBuilder] with blocks of the given size
39    pub fn new(block_size: NonZeroUsize) -> Self {
40        Self {
41            block_size,
42            len: 0,
43            data: Vec::new(),
44            pointers: Vec::new(),
45            last_string: None,
46        }
47    }
48
49    /// Adds a string at the end of the Front-Coded List
50    ///
51    /// Returns `Err` if the string is not strictly greater than the previous one.
52    pub fn push(&mut self, s: Vec<u8>) -> Result<(), PushError> {
53        if self.len % self.block_size == 0 {
54            // This is the first string of the block, decode all of it.
55            self.pointers.extend(
56                u64::try_from(self.data.len())
57                    .expect("String length overflowed u64")
58                    .to_be_bytes(),
59            );
60            push_int(&mut self.data, s.len())?;
61            self.data.extend(&s);
62        } else {
63            // Omit the prefix
64            let last_string = self
65                .last_string
66                .as_ref()
67                .expect("Not at start of block, but last_string is unset");
68            let prefix_len = longest_common_prefix(last_string, &s)?;
69            let suffix_len = s.len() - prefix_len;
70
71            push_int(&mut self.data, suffix_len)?;
72            push_int(&mut self.data, prefix_len)?;
73            self.data.extend(&s[prefix_len..]);
74        }
75
76        self.len += 1;
77        self.last_string = Some(s);
78
79        Ok(())
80    }
81
82    /// Writes the FCL to disk
83    pub fn dump(&self, base_path: impl AsRef<Path>) -> Result<()> {
84        let properties_path = suffix_path(&base_path, ".properties");
85        let bytearray_path = suffix_path(&base_path, ".bytearray");
86        let pointers_path = suffix_path(&base_path, ".pointers");
87
88        let (res1, res2) = rayon::join(
89            || -> Result<()> {
90                let file = File::create_new(&bytearray_path)
91                    .with_context(|| format!("Could not create {}", bytearray_path.display()))?;
92                let mut writer = BufWriter::new(file);
93                writer
94                    .write_all(&self.data)
95                    .context("Could not write bytearray")?;
96                writer.flush().context("Could not write bytearray")?;
97                Ok(())
98            },
99            || -> Result<()> {
100                let file = File::create_new(&pointers_path)
101                    .with_context(|| format!("Could not create {}", pointers_path.display()))?;
102                let mut writer = BufWriter::new(file);
103                writer
104                    .write_all(&self.pointers)
105                    .context("Could not write pointers")?;
106                writer.flush().context("Could not write bytearray")?;
107                Ok(())
108            },
109        );
110        res1?;
111        res2?;
112
113        let properties_file = File::create_new(&properties_path)
114            .with_context(|| format!("Could not create {}", properties_path.display()))?;
115        let mut writer = BufWriter::new(properties_file);
116        java_properties::write(
117            &mut writer,
118            &HashMap::from_iter(vec![
119                ("ratio".to_string(), self.block_size.to_string()),
120                ("n".to_string(), self.len.to_string()),
121            ]),
122        )
123        .context("Could not write properties")?;
124        writer.flush().context("Could not flush properties")?;
125
126        Ok(())
127    }
128}
129
130#[inline(always)]
131/// Computes the longest common prefix between two strings as bytes.
132///
133/// Returns `Err` if the second string is not strictly greater than the first.
134fn longest_common_prefix(a: &[u8], b: &[u8]) -> Result<usize, PushError> {
135    let min_len = usize::min(a.len(), b.len());
136    for i in 0..min_len {
137        if a[i] != b[i] {
138            return Ok(i);
139        }
140    }
141
142    if a.len() == b.len() {
143        // Both strings are equal
144        return Err(PushError::Duplicate);
145    }
146
147    // a is a prefix of b
148    Ok(min_len)
149}
150
151#[inline(always)]
152/// Writes a varint at the end of the given vector
153///
154/// Returns `Err` if the given integer is 2^28 or greater
155pub(crate) fn push_int(v: &mut Vec<u8>, i: usize) -> Result<(), PushError> {
156    if i < 1 << 7 {
157        v.push(i as u8);
158        Ok(())
159    } else if i < 1 << 14 {
160        v.push(!(i >> 7) as u8);
161        v.push((i & 0x7F) as u8);
162        Ok(())
163    } else if i < 1 << 21 {
164        v.push(!(i >> 14) as u8);
165        v.push(!((i >> 7) & 0x7F) as u8);
166        v.push((i & 0x7F) as u8);
167        Ok(())
168    } else if i < 1 << 28 {
169        v.push(!(i >> 21) as u8);
170        v.push(!((i >> 14) & 0x7F) as u8);
171        v.push(!((i >> 7) & 0x7F) as u8);
172        v.push((i & 0x7F) as u8);
173        Ok(())
174    } else {
175        Err(PushError::TooLarge)
176    }
177}
178
179#[test]
180fn test_longest_common_prefix() {
181    assert_eq!(
182        longest_common_prefix(b"foo", b"foo"),
183        Err(PushError::Duplicate)
184    );
185    assert_eq!(longest_common_prefix(b"bar", b"foo"), Ok(0));
186    assert_eq!(longest_common_prefix(b"", b"foo"), Ok(0));
187    assert_eq!(longest_common_prefix(b"bar", b"baz"), Ok(2));
188    assert_eq!(longest_common_prefix(b"baz", b"bar"), Ok(2));
189    assert_eq!(longest_common_prefix(b"quux", b"quxx"), Ok(2));
190    assert_eq!(longest_common_prefix(b"qux", b"quux"), Ok(2));
191}