swh_graph/front_coded_list/
write.rs1use std::collections::HashMap;
7use std::fs::File;
8use std::io::{BufWriter, Write};
9use std::num::NonZeroUsize;
10use std::path::Path;
11
12use anyhow::{Context, Result};
13
14use crate::utils::suffix_path;
15
16#[derive(thiserror::Error, Debug, PartialEq, Eq)]
17pub enum PushError {
18 #[error("Found the same string twice in a row")]
19 Duplicate,
20 #[error("String length is 2^28 or larger")]
21 TooLarge,
22}
23
24pub struct FrontCodedListBuilder {
25 block_size: NonZeroUsize,
28 len: usize,
30 data: Vec<u8>,
32 pointers: Vec<u8>,
34 last_string: Option<Vec<u8>>,
35}
36
37impl FrontCodedListBuilder {
38 pub fn new(block_size: NonZeroUsize) -> Self {
40 Self {
41 block_size,
42 len: 0,
43 data: Vec::new(),
44 pointers: Vec::new(),
45 last_string: None,
46 }
47 }
48
49 pub fn push(&mut self, s: Vec<u8>) -> Result<(), PushError> {
53 if self.len % self.block_size == 0 {
54 self.pointers.extend(
56 u64::try_from(self.data.len())
57 .expect("String length overflowed u64")
58 .to_be_bytes(),
59 );
60 push_int(&mut self.data, s.len())?;
61 self.data.extend(&s);
62 } else {
63 let last_string = self
65 .last_string
66 .as_ref()
67 .expect("Not at start of block, but last_string is unset");
68 let prefix_len = longest_common_prefix(last_string, &s)?;
69 let suffix_len = s.len() - prefix_len;
70
71 push_int(&mut self.data, suffix_len)?;
72 push_int(&mut self.data, prefix_len)?;
73 self.data.extend(&s[prefix_len..]);
74 }
75
76 self.len += 1;
77 self.last_string = Some(s);
78
79 Ok(())
80 }
81
82 pub fn dump(&self, base_path: impl AsRef<Path>) -> Result<()> {
84 let properties_path = suffix_path(&base_path, ".properties");
85 let bytearray_path = suffix_path(&base_path, ".bytearray");
86 let pointers_path = suffix_path(&base_path, ".pointers");
87
88 let (res1, res2) = rayon::join(
89 || -> Result<()> {
90 let file = File::create_new(&bytearray_path)
91 .with_context(|| format!("Could not create {}", bytearray_path.display()))?;
92 let mut writer = BufWriter::new(file);
93 writer
94 .write_all(&self.data)
95 .context("Could not write bytearray")?;
96 writer.flush().context("Could not write bytearray")?;
97 Ok(())
98 },
99 || -> Result<()> {
100 let file = File::create_new(&pointers_path)
101 .with_context(|| format!("Could not create {}", pointers_path.display()))?;
102 let mut writer = BufWriter::new(file);
103 writer
104 .write_all(&self.pointers)
105 .context("Could not write pointers")?;
106 writer.flush().context("Could not write bytearray")?;
107 Ok(())
108 },
109 );
110 res1?;
111 res2?;
112
113 let properties_file = File::create_new(&properties_path)
114 .with_context(|| format!("Could not create {}", properties_path.display()))?;
115 let mut writer = BufWriter::new(properties_file);
116 java_properties::write(
117 &mut writer,
118 &HashMap::from_iter(vec![
119 ("ratio".to_string(), self.block_size.to_string()),
120 ("n".to_string(), self.len.to_string()),
121 ]),
122 )
123 .context("Could not write properties")?;
124 writer.flush().context("Could not flush properties")?;
125
126 Ok(())
127 }
128}
129
130#[inline(always)]
131fn longest_common_prefix(a: &[u8], b: &[u8]) -> Result<usize, PushError> {
135 let min_len = usize::min(a.len(), b.len());
136 for i in 0..min_len {
137 if a[i] != b[i] {
138 return Ok(i);
139 }
140 }
141
142 if a.len() == b.len() {
143 return Err(PushError::Duplicate);
145 }
146
147 Ok(min_len)
149}
150
151#[inline(always)]
152pub(crate) fn push_int(v: &mut Vec<u8>, i: usize) -> Result<(), PushError> {
156 if i < 1 << 7 {
157 v.push(i as u8);
158 Ok(())
159 } else if i < 1 << 14 {
160 v.push(!(i >> 7) as u8);
161 v.push((i & 0x7F) as u8);
162 Ok(())
163 } else if i < 1 << 21 {
164 v.push(!(i >> 14) as u8);
165 v.push(!((i >> 7) & 0x7F) as u8);
166 v.push((i & 0x7F) as u8);
167 Ok(())
168 } else if i < 1 << 28 {
169 v.push(!(i >> 21) as u8);
170 v.push(!((i >> 14) & 0x7F) as u8);
171 v.push(!((i >> 7) & 0x7F) as u8);
172 v.push((i & 0x7F) as u8);
173 Ok(())
174 } else {
175 Err(PushError::TooLarge)
176 }
177}
178
179#[test]
180fn test_longest_common_prefix() {
181 assert_eq!(
182 longest_common_prefix(b"foo", b"foo"),
183 Err(PushError::Duplicate)
184 );
185 assert_eq!(longest_common_prefix(b"bar", b"foo"), Ok(0));
186 assert_eq!(longest_common_prefix(b"", b"foo"), Ok(0));
187 assert_eq!(longest_common_prefix(b"bar", b"baz"), Ok(2));
188 assert_eq!(longest_common_prefix(b"baz", b"bar"), Ok(2));
189 assert_eq!(longest_common_prefix(b"quux", b"quxx"), Ok(2));
190 assert_eq!(longest_common_prefix(b"qux", b"quux"), Ok(2));
191}