swh_graph/java_compat/
bit_vector.rs

1// Copyright (C) 2023  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Structure to read and write a Java `it.unimi.dsi.bits.LongArrayBitVector` object
7//! in Java's native serialization format
8
9use std::io::Write;
10use std::path::Path;
11
12use anyhow::{Context, Result};
13use byteorder::ByteOrder;
14use rayon::prelude::*;
15
/// First bytes of a serialized `it.unimi.dsi.bits.LongArrayBitVector`;
/// in the stream they are immediately followed by the number of bits.
///
/// Layout: stream magic and version, `TC_OBJECT` (`s`), `TC_CLASSDESC` (`r`),
/// the class name prefixed by its length (`\x00$`, i.e. 36), the 8-byte
/// `serialVersionUID`, the class flags, and the description of the single
/// `long` field named `length`.
const HEADER: &[u8] = b"\xac\xed\x00\x05sr\x00$it.unimi.dsi.bits.LongArrayBitVector\x00\x00\x00\x00\x00\x00\x00\x01\x03\x00\x01\x4a\x00\x06lengthxp";

/// Separator between number of bits and data in
/// `it.unimi.dsi.bits.LongArrayBitVector` serialization, which contains information
/// about the following data, see Java's serialization protocol's
/// [Terminal Symbols and Constants](https://docs.oracle.com/javase/6/docs/platform/serialization/spec/protocol.html#10152)
///
/// This tag introduces a "short" block, whose length is a single unsigned byte.
const TC_BLOCKDATA: u8 = 0x77;
/// Same role as [`TC_BLOCKDATA`], but introduces a "long" block whose length
/// is a 4-byte big-endian signed integer.
const TC_BLOCKDATALONG: u8 = 0x7A;

/// Last byte of `it.unimi.dsi.bits.LongArrayBitVector` serialization
const TC_ENDBLOCKDATA: u8 = 0x78;

/// Maximum size (in bytes) of a BLOCKDATA, as its length is an unsigned byte; larger
/// blocks use BLOCKDATALONG whose length is a signed integer.
///
/// This must be 255 and not 256: a 256-byte payload cannot be described by a
/// one-byte length field.
const MAX_BLOCKDATA_SIZE: usize = u8::MAX as usize;

/// Size (in bytes) of the blocks the data is split into when writing.
/// It is divided by 8 to get a number of 64-bit words per block.
const DEFAULT_BLOCKDATALONG_SIZE: usize = 0x100000; // Arbitrary; OpenJDK defaults to 0x400
35
/// In-memory counterpart of a Java `it.unimi.dsi.bits.LongArrayBitVector`,
/// which can be written out in Java's native serialization format.
///
/// `B` is the backing storage holding the 64-bit words of the vector.
pub struct LongArrayBitVector<B> {
    /// Words of the bit vector
    data: B,
    /// Length of the bit vector, in bits
    num_bits: usize,
}
41
42impl LongArrayBitVector<Vec<u64>> {
43    pub fn new_from_bitvec(bitvec: sux::bits::bit_vec::BitVec<Vec<usize>>) -> Self {
44        let (vec, num_bits) = bitvec.into_raw_parts();
45
46        assert_eq!(usize::BITS, u64::BITS);
47        // Sound because sux interprets values in the vector as being in big endian,
48        // so this works no matter the architecture.
49        let mut vec: Vec<u64> =
50            bytemuck::allocation::try_cast_vec(vec).expect("Could not cast Vec<usize> to Vec<u64>");
51
52        vec.par_iter_mut()
53            .for_each(|cell| *cell = u64::from_be(*cell).to_le());
54
55        LongArrayBitVector {
56            data: vec,
57            num_bits,
58        }
59    }
60
61    pub fn dump<P: AsRef<Path>>(&self, path: P) -> Result<()> {
62        let path = path.as_ref();
63        let file = std::fs::File::create(path)
64            .with_context(|| format!("Could not create {}", path.display()))?;
65        let mut file = std::io::BufWriter::new(file);
66
67        file.write_all(HEADER)?;
68
69        let mut length_buf = [0u8; 8];
70        byteorder::BigEndian::write_u64(&mut length_buf, self.num_bits as u64);
71        file.write_all(&length_buf)?;
72
73        for chunk in self.data.chunks(DEFAULT_BLOCKDATALONG_SIZE / 8) {
74            let chunk_size = chunk.len() * 8; // In bytes
75
76            if chunk_size > MAX_BLOCKDATA_SIZE {
77                file.write_all(&[TC_BLOCKDATALONG])?;
78                let chunk_size: i32 = chunk_size.try_into().expect("Chunk size overflows i32");
79                file.write_all(&chunk_size.to_be_bytes())
80                    .with_context(|| format!("Could not write to {}", path.display()))?;
81
82                file.write_all(bytemuck::cast_slice(chunk))
83                    .with_context(|| format!("Could not write to {}", path.display()))?;
84            } else {
85                file.write_all(&[TC_BLOCKDATA])?;
86                let chunk_size: u8 = chunk_size.try_into().expect("Chunk size overflows u8");
87                file.write_all(&chunk_size.to_be_bytes())
88                    .with_context(|| format!("Could not write to {}", path.display()))?;
89
90                file.write_all(bytemuck::cast_slice(chunk))
91                    .with_context(|| format!("Could not write to {}", path.display()))?;
92            }
93        }
94
95        file.write_all(&[TC_ENDBLOCKDATA])?;
96
97        Ok(())
98    }
99}