use std::collections::HashMap;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::num::NonZeroUsize;
use std::path::Path;
use anyhow::{Context, Result};
use crate::utils::suffix_path;
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum PushError {
#[error("Found the same string twice in a row")]
Duplicate,
#[error("String length is 2^28 or larger")]
TooLarge,
}
pub struct FrontCodedListBuilder {
block_size: NonZeroUsize,
len: usize,
data: Vec<u8>,
pointers: Vec<u8>,
last_string: Option<Vec<u8>>,
}
impl FrontCodedListBuilder {
pub fn new(block_size: NonZeroUsize) -> Self {
Self {
block_size,
len: 0,
data: Vec::new(),
pointers: Vec::new(),
last_string: None,
}
}
pub fn push(&mut self, s: Vec<u8>) -> Result<(), PushError> {
if self.len % self.block_size == 0 {
self.pointers.extend(
u64::try_from(self.data.len())
.expect("String length overflowed u64")
.to_be_bytes(),
);
push_int(&mut self.data, s.len())?;
self.data.extend(&s);
} else {
let last_string = self
.last_string
.as_ref()
.expect("Not at start of block, but last_string is unset");
let prefix_len = longest_common_prefix(last_string, &s)?;
let suffix_len = s.len() - prefix_len;
push_int(&mut self.data, suffix_len)?;
push_int(&mut self.data, prefix_len)?;
self.data.extend(&s[prefix_len..]);
}
self.len += 1;
self.last_string = Some(s);
Ok(())
}
pub fn dump(&self, base_path: impl AsRef<Path>) -> Result<()> {
let properties_path = suffix_path(&base_path, ".properties");
let bytearray_path = suffix_path(&base_path, ".bytearray");
let pointers_path = suffix_path(&base_path, ".pointers");
let (res1, res2) = rayon::join(
|| -> Result<()> {
let file = File::create_new(&bytearray_path)
.with_context(|| format!("Could not create {}", bytearray_path.display()))?;
let mut writer = BufWriter::new(file);
writer
.write_all(&self.data)
.context("Could not write bytearray")?;
writer.flush().context("Could not write bytearray")?;
Ok(())
},
|| -> Result<()> {
let file = File::create_new(&pointers_path)
.with_context(|| format!("Could not create {}", pointers_path.display()))?;
let mut writer = BufWriter::new(file);
writer
.write_all(&self.pointers)
.context("Could not write pointers")?;
writer.flush().context("Could not flush pointers")?;
Ok(())
},
);
res1?;
res2?;
let properties_file = File::create_new(&properties_path)
.with_context(|| format!("Could not create {}", properties_path.display()))?;
let mut writer = BufWriter::new(properties_file);
java_properties::write(
&mut writer,
&HashMap::from_iter(vec![
("ratio".to_string(), self.block_size.to_string()),
("n".to_string(), self.len.to_string()),
]),
)
.context("Could not write properties")?;
writer.flush().context("Could not flush properties")?;
Ok(())
}
}
#[inline(always)]
fn longest_common_prefix(a: &[u8], b: &[u8]) -> Result<usize, PushError> {
let min_len = usize::min(a.len(), b.len());
for i in 0..min_len {
if a[i] != b[i] {
return Ok(i);
}
}
if a.len() == b.len() {
return Err(PushError::Duplicate);
}
Ok(min_len)
}
#[inline(always)]
pub(crate) fn push_int(v: &mut Vec<u8>, i: usize) -> Result<(), PushError> {
if i < 1 << 7 {
v.push(i as u8);
Ok(())
} else if i < 1 << 14 {
v.push(!(i >> 7) as u8);
v.push((i & 0x7F) as u8);
Ok(())
} else if i < 1 << 21 {
v.push(!(i >> 14) as u8);
v.push(!((i >> 7) & 0x7F) as u8);
v.push((i & 0x7F) as u8);
Ok(())
} else if i < 1 << 28 {
v.push(!(i >> 21) as u8);
v.push(!((i >> 14) & 0x7F) as u8);
v.push(!((i >> 7) & 0x7F) as u8);
v.push((i & 0x7F) as u8);
Ok(())
} else {
Err(PushError::TooLarge)
}
}
#[test]
fn test_longest_common_prefix() {
assert_eq!(
longest_common_prefix(b"foo", b"foo"),
Err(PushError::Duplicate)
);
assert_eq!(longest_common_prefix(b"bar", b"foo"), Ok(0));
assert_eq!(longest_common_prefix(b"", b"foo"), Ok(0));
assert_eq!(longest_common_prefix(b"bar", b"baz"), Ok(2));
assert_eq!(longest_common_prefix(b"baz", b"bar"), Ok(2));
assert_eq!(longest_common_prefix(b"quux", b"quxx"), Ok(2));
assert_eq!(longest_common_prefix(b"qux", b"quux"), Ok(2));
}