use crate::traits::{IndexedDict, IndexedSeq, IntoIteratorFrom, Types};
use core::marker::PhantomData;
use lender::{ExactSizeLender, IntoLender, Lender, Lending, check_covariance};
use lender::{FusedLender, for_};
use mem_dbg::*;
use std::borrow::Borrow;
#[derive(Debug, Clone, MemSize, MemDbg, Default)]
#[mem_size(flat)]
struct Stats {
pub max_block_bytes: usize,
pub sum_block_bytes: usize,
pub max_lcp: usize,
pub sum_lcp: usize,
pub max_str_len: usize,
pub sum_str_len: usize,
pub code_bytes: usize,
pub suffixes_bytes: usize,
pub redundancy: isize,
}
#[derive(Debug, Clone, MemSize, MemDbg)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RearCodedList<I: ?Sized, O, D = Box<[u8]>, P = Box<[usize]>, const SORTED: bool = true> {
ratio: usize,
len: usize,
data: D,
pointers: P,
_marker_i: PhantomData<I>,
_marker_o: PhantomData<O>,
}
pub type RearCodedListSliceU8<const SORTED: bool = true> =
RearCodedList<[u8], Vec<u8>, Box<[u8]>, Box<[usize]>, SORTED>;
pub type RearCodedListStr<const SORTED: bool = true> =
RearCodedList<str, String, Box<[u8]>, Box<[usize]>, SORTED>;
impl<
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> RearCodedList<I, O, D, P, SORTED>
{
#[inline]
pub const fn len(&self) -> usize {
self.len
}
#[inline]
pub(super) fn get_in_place_impl(&self, index: usize, result: &mut Vec<u8>) {
result.clear();
let block = index / self.ratio;
let offset = index % self.ratio;
let start = self.pointers.as_ref()[block];
let mut data = &self.data.as_ref()[start..];
let (mut rear_len, mut suffix_len);
(suffix_len, data) = decode_int(data);
result.extend_from_slice(&data[..suffix_len]);
data = &data[suffix_len..];
for _ in 0..offset {
(suffix_len, data) = decode_int(data);
(rear_len, data) = decode_int(data);
let lcp = result.len() - rear_len;
result.truncate(lcp);
result.extend_from_slice(&data[..suffix_len]);
data = &data[suffix_len..];
}
}
}
impl<
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> RearCodedList<I, O, D, P, SORTED>
where
for<'a> Lend<'a, I, O, D, P, SORTED>: Lender,
{
#[inline(always)]
pub fn lender(&self) -> Lend<'_, I, O, D, P, SORTED> {
Lend::new(self)
}
#[inline(always)]
pub fn lender_from(&self, from: usize) -> Lend<'_, I, O, D, P, SORTED> {
Lend::new_from(self, from)
}
#[inline(always)]
pub fn iter(&self) -> Iter<'_, I, O, D, P, SORTED> {
Iter(self.lender())
}
#[inline(always)]
pub fn iter_from(&self, from: usize) -> Iter<'_, I, O, D, P, SORTED> {
Iter(self.lender_from(from))
}
}
impl<I: ?Sized, O, D: AsRef<[u8]>, P: AsRef<[usize]>> RearCodedList<I, O, D, P, true> {
fn index_of(&self, value: impl AsRef<[u8]>) -> Option<usize> {
let string = value.as_ref();
let block_idx = self.pointers.as_ref().binary_search_by(|block_ptr| {
let data = &self.data.as_ref()[*block_ptr..];
let (len, tmp) = decode_int(data);
memcmp(string, &tmp[..len]).reverse()
});
if let Ok(block_idx) = block_idx {
return Some(block_idx * self.ratio);
}
let mut block_idx = block_idx.unwrap_err();
if block_idx == 0 || block_idx > self.pointers.as_ref().len() {
return None;
}
block_idx -= 1;
let mut result = Vec::with_capacity(128);
let start = self.pointers.as_ref()[block_idx];
let mut data = &self.data.as_ref()[start..];
let (mut to_copy, mut rear_length);
(to_copy, data) = decode_int(data);
result.extend_from_slice(&data[..to_copy]);
data = &data[to_copy..];
let in_block = (self.ratio - 1).min(self.len - block_idx * self.ratio - 1);
for idx in 0..in_block {
(to_copy, data) = decode_int(data);
(rear_length, data) = decode_int(data);
let lcp = result.len() - rear_length;
result.truncate(lcp);
result.extend_from_slice(&data[..to_copy]);
data = &data[to_copy..];
match memcmp(string, &result) {
core::cmp::Ordering::Greater => {}
core::cmp::Ordering::Equal => return Some(block_idx * self.ratio + idx + 1),
core::cmp::Ordering::Less => return None,
}
}
None
}
}
impl<
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> Types for RearCodedList<I, O, D, P, SORTED>
{
type Output<'a> = O;
type Input = I;
}
impl<D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool> IndexedSeq
for RearCodedList<[u8], Vec<u8>, D, P, SORTED>
{
#[inline(always)]
unsafe fn get_unchecked(&self, index: usize) -> Self::Output<'_> {
let mut result = Vec::with_capacity(128);
self.get_in_place_impl(index, &mut result);
result
}
#[inline(always)]
fn len(&self) -> usize {
self.len
}
}
impl<D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool>
RearCodedList<[u8], Vec<u8>, D, P, SORTED>
{
pub fn get_in_place(&self, index: usize, result: &mut Vec<u8>) {
self.get_in_place_impl(index, result);
}
}
impl<D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool> IndexedSeq
for RearCodedList<str, String, D, P, SORTED>
{
#[inline(always)]
unsafe fn get_unchecked(&self, index: usize) -> Self::Output<'_> {
let mut result = Vec::with_capacity(128);
self.get_in_place_impl(index, &mut result);
debug_assert!(std::str::from_utf8(&result).is_ok());
String::from_utf8(result).unwrap()
}
#[inline(always)]
fn len(&self) -> usize {
self.len
}
}
impl<D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool>
RearCodedList<str, String, D, P, SORTED>
{
pub fn get_in_place(&self, index: usize, result: &mut String) {
let mut buffer = Vec::with_capacity(64);
self.get_in_place_impl(index, &mut buffer);
result.clear();
debug_assert!(std::str::from_utf8(&buffer).is_ok());
result.push_str(std::str::from_utf8(&buffer).unwrap());
}
#[inline]
pub fn get_bytes(&self, index: usize) -> Vec<u8> {
let mut buf = Vec::with_capacity(64);
self.get_in_place_impl(index, &mut buf);
buf
}
#[inline(always)]
pub fn get_bytes_in_place(&self, index: usize, result: &mut Vec<u8>) {
self.get_in_place_impl(index, result);
}
}
impl<
I: PartialEq<O> + PartialEq + ?Sized + AsRef<[u8]>,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
> IndexedDict for RearCodedList<I, O, D, P, true>
{
fn index_of(&self, value: impl Borrow<Self::Input>) -> Option<usize> {
self.index_of(value.borrow())
}
}
#[derive(Debug, Clone, MemSize, MemDbg)]
pub struct Lend<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> {
rcl: &'a RearCodedList<I, O, D, P, SORTED>,
data: &'a [u8],
buffer: Vec<u8>,
index: usize,
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> Lend<'a, I, O, D, P, SORTED>
where
Self: Lender,
{
pub fn new(rcl: &'a RearCodedList<I, O, D, P, SORTED>) -> Self {
Self {
rcl,
data: rcl.data.as_ref(),
buffer: Vec::with_capacity(128),
index: 0,
}
}
pub fn new_from(rcl: &'a RearCodedList<I, O, D, P, SORTED>, from: usize) -> Self {
if from >= rcl.len() {
assert!(
from == rcl.len(),
"Index out of bounds: {} > {}",
from,
rcl.len()
);
return Lend {
rcl,
data: &rcl.data.as_ref()[..0],
buffer: Vec::new(),
index: from,
};
}
let block = from / rcl.ratio;
let offset = from % rcl.ratio;
let start = rcl.pointers.as_ref()[block];
let mut res = Lend {
rcl,
index: block * rcl.ratio,
data: &rcl.data.as_ref()[start..],
buffer: Vec::with_capacity(128),
};
for _ in 0..offset {
res.next();
}
res
}
fn next_impl(&mut self) -> Option<&[u8]> {
if self.index >= self.rcl.len() {
return None;
}
let (to_copy, mut tmp) = decode_int(self.data);
let lcp = if self.index % self.rcl.ratio == 0 {
0
} else {
let rear_length;
(rear_length, tmp) = decode_int(tmp);
self.buffer.len() - rear_length
};
self.buffer.truncate(lcp);
self.buffer.extend_from_slice(&tmp[..to_copy]);
self.data = &tmp[to_copy..];
self.index += 1;
Some(&self.buffer)
}
}
impl<
'a,
'b,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> Lending<'b> for Lend<'a, I, O, D, P, SORTED>
{
type Lend = &'b I;
}
impl<O: PartialEq<str> + PartialEq, D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool> Lender
for Lend<'_, str, O, D, P, SORTED>
where
str: PartialEq<O> + PartialEq,
{
check_covariance!();
fn next(&mut self) -> Option<&'_ str> {
self.next_impl().map(|s| std::str::from_utf8(s).unwrap())
}
#[inline(always)]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.len(), Some(self.len()))
}
}
impl<O: PartialEq<[u8]> + PartialEq, D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool> Lender
for Lend<'_, [u8], O, D, P, SORTED>
where
[u8]: PartialEq<O> + PartialEq,
{
check_covariance!();
fn next(&mut self) -> Option<&[u8]> {
self.next_impl()
}
#[inline(always)]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.len(), Some(self.len()))
}
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> ExactSizeLender for Lend<'a, I, O, D, P, SORTED>
where
Lend<'a, I, O, D, P, SORTED>: Lender,
{
#[inline(always)]
fn len(&self) -> usize {
self.rcl.len() - self.index
}
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> FusedLender for Lend<'a, I, O, D, P, SORTED>
where
Lend<'a, I, O, D, P, SORTED>: Lender,
{
}
#[derive(Debug, Clone, MemSize, MemDbg)]
pub struct Iter<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
>(Lend<'a, I, O, D, P, SORTED>);
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> std::iter::ExactSizeIterator for Iter<'a, I, O, D, P, SORTED>
where
Iter<'a, I, O, D, P, SORTED>: std::iter::Iterator,
Lend<'a, I, O, D, P, SORTED>: ExactSizeLender,
{
#[inline(always)]
fn len(&self) -> usize {
self.0.len()
}
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> std::iter::FusedIterator for Iter<'a, I, O, D, P, SORTED>
where
Iter<'a, I, O, D, P, SORTED>: std::iter::Iterator,
Lend<'a, I, O, D, P, SORTED>: FusedLender,
{
}
impl<'a, D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool> std::iter::Iterator
for Iter<'a, str, String, D, P, SORTED>
where
Lend<'a, str, String, D, P, SORTED>: Lender,
{
type Item = String;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
self.0
.next_impl()
.map(|v| String::from_utf8(Vec::from(v)).unwrap())
}
#[inline(always)]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.len(), Some(self.len()))
}
}
impl<'a, D: AsRef<[u8]>, P: AsRef<[usize]>, const SORTED: bool> std::iter::Iterator
for Iter<'a, [u8], Vec<u8>, D, P, SORTED>
where
Lend<'a, [u8], Vec<u8>, D, P, SORTED>: ExactSizeLender,
{
type Item = Vec<u8>;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
self.0.next_impl().map(|v| v.into())
}
#[inline(always)]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.len(), Some(self.len()))
}
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> IntoLender for &'a RearCodedList<I, O, D, P, SORTED>
where
Lend<'a, I, O, D, P, SORTED>: Lender,
{
type Lender = Lend<'a, I, O, D, P, SORTED>;
#[inline(always)]
fn into_lender(self) -> Lend<'a, I, O, D, P, SORTED> {
Lend::new(self)
}
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> IntoIterator for &'a RearCodedList<I, O, D, P, SORTED>
where
for<'b> Lend<'b, I, O, D, P, SORTED>: Lender,
Iter<'a, I, O, D, P, SORTED>: std::iter::Iterator,
{
type Item = <Iter<'a, I, O, D, P, SORTED> as Iterator>::Item;
type IntoIter = Iter<'a, I, O, D, P, SORTED>;
#[inline(always)]
fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}
impl<
'a,
I: PartialEq<O> + PartialEq + ?Sized,
O: PartialEq<I> + PartialEq,
D: AsRef<[u8]>,
P: AsRef<[usize]>,
const SORTED: bool,
> IntoIteratorFrom for &'a RearCodedList<I, O, D, P, SORTED>
where
for<'b> Lend<'b, I, O, D, P, SORTED>: Lender,
Iter<'a, I, O, D, P, SORTED>: std::iter::Iterator,
{
type IntoIterFrom = Iter<'a, I, O, D, P, SORTED>;
#[inline(always)]
fn into_iter_from(self, from: usize) -> Self::IntoIter {
self.iter_from(from)
}
}
#[derive(Debug, Clone, MemSize, MemDbg)]
pub struct RearCodedListBuilder<I: ?Sized, const SORTED: bool = true> {
ratio: usize,
len: usize,
data: Vec<u8>,
written_bytes: usize,
pointers: Vec<usize>,
stats: Stats,
last_str: Vec<u8>,
_marker: PhantomData<I>,
}
#[inline(always)]
fn memcmp(string: &[u8], data: &[u8]) -> core::cmp::Ordering {
for (a, b) in string.iter().zip(data.iter()) {
let ord = a.cmp(b);
if ord != core::cmp::Ordering::Equal {
return ord;
}
}
string.len().cmp(&data.len())
}
impl<I: ?Sized, const SORTED: bool> RearCodedListBuilder<I, SORTED> {
#[must_use]
pub fn new(ratio: usize) -> Self {
Self {
data: Vec::with_capacity(1024),
last_str: Vec::with_capacity(1024),
pointers: Vec::new(),
len: 0,
written_bytes: 0,
ratio,
stats: Stats::default(),
_marker: PhantomData,
}
}
pub const fn len(&self) -> usize {
self.len
}
pub const fn is_empty(&self) -> bool {
self.len == 0
}
pub fn print_stats(&self) {
println!(
"{:>20}: {:>10}",
"max_block_bytes", self.stats.max_block_bytes
);
println!(
"{:>20}: {:>10.3}",
"avg_block_bytes",
self.stats.sum_block_bytes as f64 / self.len as f64
);
println!("{:>20}: {:>10}", "max_lcp", self.stats.max_lcp);
println!(
"{:>20}: {:>10.3}",
"avg_lcp",
self.stats.sum_lcp as f64 / self.len as f64
);
println!("{:>20}: {:>10}", "max_str_len", self.stats.max_str_len);
println!(
"{:>20}: {:>10.3}",
"avg_str_len",
self.stats.sum_str_len as f64 / self.len as f64
);
let ptr_size: usize = self.pointers.len() * core::mem::size_of::<usize>();
fn human(key: &str, x: isize) {
const UOM: &[&str] = &["B", "KB", "MB", "GB", "TB"];
let sign = x.signum();
let mut y = x.abs() as f64;
let mut uom_idx = 0;
while y > 1000.0 {
uom_idx += 1;
y /= 1000.0;
}
println!(
"{:>20}:{:>10.3}{}{:>20} ",
key,
sign as f64 * y,
UOM[uom_idx],
x
);
}
let total_size = ptr_size + self.data.len() + core::mem::size_of::<Self>();
human("data_bytes", self.data.len() as isize);
human("codes_bytes", self.stats.code_bytes as isize);
human("suffixes_bytes", self.stats.suffixes_bytes as isize);
human("ptrs_bytes", ptr_size as isize);
human("uncompressed_size", self.stats.sum_str_len as isize);
human("total_size", total_size as isize);
human(
"optimal_size",
self.data.len() as isize - self.stats.redundancy,
);
human("redundancy", self.stats.redundancy);
let overhead = self.stats.redundancy + ptr_size as isize;
println!(
"overhead_ratio: {:>10}",
overhead as f64 / (overhead + self.data.len() as isize) as f64
);
println!(
"no_overhead_compression_ratio: {:.3}",
(self.data.len() as isize - self.stats.redundancy) as f64
/ self.stats.sum_str_len as f64
);
println!(
"compression_ratio: {:.3}",
(ptr_size + self.data.len()) as f64 / self.stats.sum_str_len as f64
);
}
fn push_impl(&mut self, string: impl AsRef<[u8]>) {
let string = string.as_ref();
self.stats.max_str_len = self.stats.max_str_len.max(string.len());
self.stats.sum_str_len += string.len();
let (lcp, order) = longest_common_prefix(&self.last_str, string);
if SORTED && order == core::cmp::Ordering::Greater {
panic!(
"Strings must be sorted in ascending order when building a sorted RearCodedList. Got '{:?}' after '{}'",
string,
String::from_utf8_lossy(&self.last_str)
);
}
let rear_length = self.last_str.len() - lcp;
let suffix_len = string.len() - lcp;
let length_before = self.data.len();
let to_encode = if self.len % self.ratio == 0 {
let last_ptr = self.pointers.last().copied().unwrap_or(0);
let block_bytes = self.written_bytes - last_ptr;
self.stats.max_block_bytes = self.stats.max_block_bytes.max(block_bytes);
self.stats.sum_block_bytes += block_bytes;
self.pointers.push(self.written_bytes);
let prev_len = self.data.len();
encode_int(string.len(), &mut self.data);
self.stats.code_bytes += self.data.len() - prev_len;
if self.len != 0 {
self.stats.redundancy += lcp as isize;
self.stats.redundancy += encode_int_len(string.len()) as isize;
self.stats.redundancy -= encode_int_len(rear_length) as isize;
self.stats.redundancy -= encode_int_len(suffix_len) as isize;
}
string
} else {
self.stats.max_lcp = self.stats.max_lcp.max(lcp);
self.stats.sum_lcp += lcp;
let prev_len = self.data.len();
encode_int(suffix_len, &mut self.data);
encode_int(rear_length, &mut self.data);
self.stats.code_bytes += self.data.len() - prev_len;
&string[lcp..]
};
self.data.extend_from_slice(to_encode);
self.written_bytes += self.data.len() - length_before;
self.stats.suffixes_bytes += to_encode.len();
self.last_str.truncate(lcp);
self.last_str.extend_from_slice(&string[lcp..]);
self.len += 1;
}
}
impl<const SORTED: bool> RearCodedListBuilder<str, SORTED> {
#[must_use]
pub fn build(self) -> RearCodedListStr<SORTED> {
RearCodedList {
data: self.data.into(),
pointers: self.pointers.into(),
len: self.len,
ratio: self.ratio,
_marker_i: PhantomData,
_marker_o: PhantomData,
}
}
}
impl<const SORTED: bool> RearCodedListBuilder<[u8], SORTED> {
#[must_use]
pub fn build(self) -> RearCodedListSliceU8<SORTED> {
RearCodedList {
data: self.data.into(),
pointers: self.pointers.into(),
len: self.len,
ratio: self.ratio,
_marker_i: PhantomData,
_marker_o: PhantomData,
}
}
}
impl<I: ?Sized + AsRef<[u8]>, const SORTED: bool> RearCodedListBuilder<I, SORTED> {
pub fn push(&mut self, string: impl Borrow<I>) {
self.push_impl(string.borrow().as_ref());
}
pub fn extend<L: IntoLender>(&mut self, into_lender: L)
where
for<'lend> lender::Lend<'lend, <L as IntoLender>::Lender>: AsRef<I>,
{
for_!(elem in into_lender {
self.push(elem.as_ref());
});
}
}
#[inline(always)]
fn longest_common_prefix(a: &[u8], b: &[u8]) -> (usize, core::cmp::Ordering) {
let min_len = a.len().min(b.len());
let i = crate::utils::lcp_len(&a[..min_len], &b[..min_len]);
if i < min_len {
(i, a[i].cmp(&b[i]))
} else {
(i, a.len().cmp(&b.len()))
}
}
#[inline(always)]
fn encode_int_len(mut value: usize) -> usize {
let mut len = 1;
loop {
value >>= 7;
if value == 0 {
return len;
}
value -= 1;
len += 1;
}
}
#[inline(always)]
fn encode_int(mut value: usize, data: &mut Vec<u8>) {
let mut buf = [0u8; 10];
let mut pos = buf.len() - 1;
buf[pos] = (value & 0x7F) as u8;
value >>= 7;
while value != 0 {
value -= 1;
pos -= 1;
buf[pos] = 0x80 | (value & 0x7F) as u8;
value >>= 7;
}
for &byte in &buf[pos..] {
data.push(byte);
}
}
#[inline(always)]
fn decode_int(mut data: &[u8]) -> (usize, &[u8]) {
let mut byte = data[0];
data = &data[1..];
let mut value = (byte & 0x7F) as usize;
while (byte >> 7) != 0 {
value += 1;
byte = data[0];
data = &data[1..];
value = (value << 7) | (byte & 0x7F) as usize;
}
(value, data)
}
#[cfg(feature = "epserde")]
mod epserde_impl {
use super::{RearCodedList, RearCodedListBuilder};
use crate::utils::FallibleRewindableLender;
#[cfg(feature = "epserde")]
use epserde::{prelude::SerIter, ser::Serialize};
use lender::FallibleLending;
use std::{borrow::Borrow, cell::RefCell, marker::PhantomData};
fn serialize_impl<
T: ?Sized + Borrow<I>,
I: PartialEq<O> + PartialEq + ?Sized + AsRef<[u8]>,
O: PartialEq<I> + PartialEq,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
>(
ratio: usize,
mut lender: L,
mut writer: impl std::io::Write,
) -> anyhow::Result<usize>
where
for<'a> super::RearCodedList<
I,
O,
SerIter<u8, BytesIter<'a, T, I, L, SORTED>>,
SerIter<usize, PointersIter<'a, I, SORTED>>,
SORTED,
>: Serialize,
{
use log::info;
let mut len = 0;
let mut byte_len = 0;
let mut builder = RearCodedListBuilder::<I, SORTED>::new(ratio);
info!("Counting strings...");
while let Some(s) = lender.next()? {
builder.push(s.borrow());
len += 1;
byte_len += builder.data.len();
builder.data.truncate(0);
}
info!("Counted {} strings, compressed in {} bytes", len, byte_len);
lender = lender.rewind()?;
let builder = RefCell::new(RearCodedListBuilder::<I, SORTED>::new(ratio));
let rear_coded_list = RearCodedList::<I, O, _, _, SORTED> {
ratio,
len,
data: SerIter::new(BytesIter::<T, I, L, SORTED> {
builder: &builder,
lender,
byte_len,
pos: 0,
_marker_i: PhantomData,
}),
pointers: SerIter::new(PointersIter::<I, SORTED> {
builder: &builder,
pos: 0,
}),
_marker_i: PhantomData,
_marker_o: PhantomData,
};
info!("Serializing...");
let written = unsafe { rear_coded_list.serialize(&mut writer)? };
info!("Completed.");
Ok(written)
}
#[cfg(feature = "epserde")]
pub fn serialize_str<
T: ?Sized + Borrow<str>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
>(
ratio: usize,
lender: L,
writer: impl std::io::Write,
) -> anyhow::Result<usize> {
serialize_impl::<T, str, String, L, SORTED>(ratio, lender, writer)
}
#[cfg(feature = "epserde")]
pub fn serialize_slice_u8<
T: ?Sized + Borrow<[u8]>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
>(
ratio: usize,
lender: L,
writer: impl std::io::Write,
) -> anyhow::Result<usize> {
serialize_impl::<T, [u8], Vec<u8>, L, SORTED>(ratio, lender, writer)
}
#[cfg(feature = "epserde")]
pub fn store_str<
T: ?Sized + Borrow<str>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
>(
ratio: usize,
lender: L,
filename: impl AsRef<std::path::Path>,
) -> anyhow::Result<usize> {
let dst_file =
std::fs::File::create(filename.as_ref()).expect("Cannot create destination file");
let mut buf_writer = std::io::BufWriter::new(dst_file);
serialize_impl::<T, str, String, L, SORTED>(ratio, lender, &mut buf_writer)
}
#[cfg(feature = "epserde")]
pub fn store_slice_u8<
T: ?Sized + Borrow<[u8]>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
>(
ratio: usize,
lender: L,
filename: impl AsRef<std::path::Path>,
) -> anyhow::Result<usize> {
let dst_file =
std::fs::File::create(filename.as_ref()).expect("Cannot create destination file");
let mut buf_writer = std::io::BufWriter::new(dst_file);
serialize_impl::<T, [u8], Vec<u8>, L, SORTED>(ratio, lender, &mut buf_writer)
}
struct BytesIter<
'a,
T: ?Sized + Borrow<I>,
I: ?Sized + AsRef<[u8]>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
> {
builder: &'a RefCell<RearCodedListBuilder<I, SORTED>>,
lender: L,
byte_len: usize,
pos: usize,
_marker_i: PhantomData<I>,
}
impl<
'a,
T: ?Sized + Borrow<I>,
I: ?Sized + AsRef<[u8]>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
> Iterator for BytesIter<'a, T, I, L, SORTED>
{
type Item = u8;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
let mut builder = self.builder.borrow_mut();
if self.pos < builder.data.len() {
let byte = builder.data[self.pos];
self.pos += 1;
self.byte_len -= 1;
Some(byte)
} else {
match self.lender.next() {
Ok(None) => None,
Ok(Some(s)) => {
builder.data.truncate(0);
builder.push(s.borrow());
let byte = builder.data[0];
self.pos = 1;
self.byte_len -= 1;
Some(byte)
}
Err(e) => {
panic!(
"Error while iterating the underlying lender \
during RearCodedList serialization: {e}"
)
}
}
}
}
}
impl<
'a,
T: ?Sized + Borrow<I>,
I: ?Sized + AsRef<[u8]>,
L: FallibleRewindableLender<
RewindError: std::error::Error + Send + Sync + 'static,
Error: std::error::Error + Send + Sync + 'static,
> + for<'lend> FallibleLending<'lend, Lend = &'lend T>,
const SORTED: bool,
> ExactSizeIterator for BytesIter<'a, T, I, L, SORTED>
{
fn len(&self) -> usize {
self.byte_len
}
}
struct PointersIter<'a, I: ?Sized + AsRef<[u8]>, const SORTED: bool> {
builder: &'a RefCell<RearCodedListBuilder<I, SORTED>>,
pos: usize,
}
impl<'a, I: ?Sized + AsRef<[u8]>, const SORTED: bool> Iterator for PointersIter<'a, I, SORTED> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
let builder = self.builder.borrow();
if self.pos == builder.pointers.len() {
None
} else {
let ptr = builder.pointers[self.pos];
self.pos += 1;
Some(ptr)
}
}
}
impl<'a, I: ?Sized + AsRef<[u8]>, const SORTED: bool> ExactSizeIterator
for PointersIter<'a, I, SORTED>
{
fn len(&self) -> usize {
let builder = self.builder.borrow();
builder.pointers.len() - self.pos
}
}
}
#[cfg(feature = "epserde")]
pub use epserde_impl::{serialize_slice_u8, serialize_str, store_slice_u8, store_str};
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_memcmp() {
assert_eq!(memcmp(b"abcd", b"abcd"), core::cmp::Ordering::Equal);
assert_eq!(memcmp(b"abcd", b"abbd"), core::cmp::Ordering::Greater);
assert_eq!(memcmp(b"abcd", b"abdd"), core::cmp::Ordering::Less);
assert_eq!(memcmp(b"a", b"b\0"), core::cmp::Ordering::Less);
assert_eq!(memcmp(b"b", b"a\0"), core::cmp::Ordering::Greater);
assert_eq!(memcmp(b"abc", b"abc\0"), core::cmp::Ordering::Less);
assert_eq!(memcmp(b"abc", b"ab\0"), core::cmp::Ordering::Greater);
assert_eq!(memcmp(b"ab\0", b"abc"), core::cmp::Ordering::Less);
}
const UPPER_BOUND_1: usize = 128;
const UPPER_BOUND_2: usize = 128_usize.pow(2) + UPPER_BOUND_1;
const UPPER_BOUND_3: usize = 128_usize.pow(3) + UPPER_BOUND_2;
const UPPER_BOUND_4: usize = 128_usize.pow(4) + UPPER_BOUND_3;
#[cfg(target_pointer_width = "64")]
const UPPER_BOUND_5: usize = 128_usize.pow(5) + UPPER_BOUND_4;
#[cfg(target_pointer_width = "64")]
const UPPER_BOUND_6: usize = 128_usize.pow(6) + UPPER_BOUND_5;
#[cfg(target_pointer_width = "64")]
const UPPER_BOUND_7: usize = 128_usize.pow(7) + UPPER_BOUND_6;
#[cfg(target_pointer_width = "64")]
const UPPER_BOUND_8: usize = 128_usize.pow(8) + UPPER_BOUND_7;
#[test]
fn test_encode_decode_int() {
let values = [
0,
1,
2,
3,
4,
5,
6,
7,
8,
UPPER_BOUND_1 - 1,
UPPER_BOUND_1,
UPPER_BOUND_1 + 1,
UPPER_BOUND_2 - 1,
UPPER_BOUND_2,
UPPER_BOUND_2 + 1,
UPPER_BOUND_3 - 1,
UPPER_BOUND_3,
UPPER_BOUND_3 + 1,
UPPER_BOUND_4 - 1,
UPPER_BOUND_4,
UPPER_BOUND_4 + 1,
];
let mut buffer = Vec::with_capacity(128);
for i in &values {
encode_int(*i, &mut buffer);
}
let mut data = &buffer[..];
for i in &values {
let (j, tmp) = decode_int(data);
assert_eq!(data.len() - tmp.len(), encode_int_len(*i));
data = tmp;
assert_eq!(*i, j);
}
}
#[cfg(target_pointer_width = "64")]
#[test]
fn test_encode_decode_int_large() {
let values = [
UPPER_BOUND_5 - 1,
UPPER_BOUND_5,
UPPER_BOUND_5 + 1,
UPPER_BOUND_6 - 1,
UPPER_BOUND_6,
UPPER_BOUND_6 + 1,
UPPER_BOUND_7 - 1,
UPPER_BOUND_7,
UPPER_BOUND_7 + 1,
UPPER_BOUND_8 - 1,
UPPER_BOUND_8,
UPPER_BOUND_8 + 1,
];
let mut buffer = Vec::with_capacity(128);
for i in &values {
encode_int(*i, &mut buffer);
}
let mut data = &buffer[..];
for i in &values {
let (j, tmp) = decode_int(data);
assert_eq!(data.len() - tmp.len(), encode_int_len(*i));
data = tmp;
assert_eq!(*i, j);
}
}
#[test]
fn test_longest_common_prefix() {
let str1 = b"absolutely";
let str2 = b"absorption";
assert_eq!(
longest_common_prefix(str1, str2),
(4, core::cmp::Ordering::Less),
);
assert_eq!(
longest_common_prefix(str1, str1),
(str1.len(), core::cmp::Ordering::Equal)
);
assert_eq!(
longest_common_prefix(str2, str2),
(str2.len(), core::cmp::Ordering::Equal)
);
}
#[test]
fn test_builder_basic_str() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
assert!(builder.is_empty());
assert_eq!(builder.len(), 0);
builder.push("apple");
builder.push("application");
builder.push("banana");
assert!(!builder.is_empty());
assert_eq!(builder.len(), 3);
let rcl = builder.build();
assert_eq!(rcl.len(), 3);
assert_eq!(rcl.get(0), "apple");
assert_eq!(rcl.get(1), "application");
assert_eq!(rcl.get(2), "banana");
}
#[test]
fn test_builder_basic_bytes() {
let mut builder = RearCodedListBuilder::<[u8], true>::new(4);
builder.push(b"hello".as_slice());
builder.push(b"world".as_slice());
let rcl = builder.build();
assert_eq!(rcl.len(), 2);
assert_eq!(rcl.get(0), b"hello".to_vec());
assert_eq!(rcl.get(1), b"world".to_vec());
}
#[test]
fn test_builder_unsorted() {
let mut builder = RearCodedListBuilder::<str, false>::new(4);
builder.push("zebra");
builder.push("apple");
builder.push("banana");
builder.push("aardvark");
let rcl = builder.build();
assert_eq!(rcl.len(), 4);
assert_eq!(rcl.get(0), "zebra");
assert_eq!(rcl.get(1), "apple");
assert_eq!(rcl.get(2), "banana");
assert_eq!(rcl.get(3), "aardvark");
}
#[test]
fn test_builder_with_blocks() {
let mut builder = RearCodedListBuilder::<str, true>::new(2);
builder.push("aa");
builder.push("aab");
builder.push("abc");
builder.push("abcd");
builder.push("abcde");
builder.push("abcdef");
let rcl = builder.build();
assert_eq!(rcl.len(), 6);
for i in 0..6 {
assert!(!rcl.get(i).is_empty());
}
}
#[test]
fn test_get_in_place_str() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("hello");
builder.push("help");
builder.push("helper");
let rcl = builder.build();
let mut result = String::new();
rcl.get_in_place(0, &mut result);
assert_eq!(result, "hello");
rcl.get_in_place(1, &mut result);
assert_eq!(result, "help");
rcl.get_in_place(2, &mut result);
assert_eq!(result, "helper");
}
#[test]
fn test_get_in_place_bytes() {
let mut builder = RearCodedListBuilder::<[u8], true>::new(4);
builder.push(b"test1".as_slice());
builder.push(b"test2".as_slice());
let rcl = builder.build();
let mut result = Vec::new();
rcl.get_in_place(0, &mut result);
assert_eq!(result, b"test1");
rcl.get_in_place(1, &mut result);
assert_eq!(result, b"test2");
}
#[test]
fn test_get_bytes() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("hello");
builder.push("world");
let rcl = builder.build();
let bytes = rcl.get_bytes(0);
assert_eq!(bytes, b"hello");
let bytes = rcl.get_bytes(1);
assert_eq!(bytes, b"world");
}
#[test]
fn test_get_bytes_in_place() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("bar");
builder.push("foo");
let rcl = builder.build();
let mut result = Vec::new();
rcl.get_bytes_in_place(0, &mut result);
assert_eq!(result, b"bar");
rcl.get_bytes_in_place(1, &mut result);
assert_eq!(result, b"foo");
}
#[test]
fn test_index_of() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("aa");
builder.push("aab");
builder.push("abc");
builder.push("abdd");
builder.push("abde");
builder.push("abdf");
let rcl = builder.build();
assert_eq!(rcl.index_of("aa"), Some(0));
assert_eq!(rcl.index_of("aab"), Some(1));
assert_eq!(rcl.index_of("abc"), Some(2));
assert_eq!(rcl.index_of("abdd"), Some(3));
assert_eq!(rcl.index_of("abde"), Some(4));
assert_eq!(rcl.index_of("abdf"), Some(5));
assert_eq!(rcl.index_of("not_found"), None);
assert_eq!(rcl.index_of("a"), None);
assert_eq!(rcl.index_of("zzz"), None);
}
#[test]
fn test_contains() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("apple");
builder.push("banana");
builder.push("cherry");
let rcl = builder.build();
assert!(rcl.contains("apple"));
assert!(rcl.contains("banana"));
assert!(rcl.contains("cherry"));
assert!(!rcl.contains("orange"));
}
#[test]
fn test_index_of_with_multiple_blocks() {
let mut builder = RearCodedListBuilder::<str, true>::new(3);
let strings: Vec<String> = (0..20).map(|i| format!("str{:03}", i)).collect();
for s in strings.iter() {
builder.push(s.as_str());
}
let rcl = builder.build();
for (i, s) in strings.iter().enumerate() {
assert_eq!(rcl.index_of(s.as_str()), Some(i));
}
assert_eq!(rcl.index_of("not_found"), None);
}
#[test]
fn test_iter_str() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("a");
builder.push("b");
builder.push("c");
let rcl = builder.build();
let collected: Vec<String> = rcl.iter().collect();
assert_eq!(collected, vec!["a", "b", "c"]);
}
#[test]
fn test_iter_bytes() {
let mut builder = RearCodedListBuilder::<[u8], true>::new(4);
builder.push(b"x".as_slice());
builder.push(b"y".as_slice());
builder.push(b"z".as_slice());
let rcl = builder.build();
let collected: Vec<Vec<u8>> = rcl.iter().collect();
assert_eq!(collected, vec![b"x".to_vec(), b"y".to_vec(), b"z".to_vec()]);
}
#[test]
fn test_iter_from() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("a");
builder.push("b");
builder.push("c");
builder.push("d");
builder.push("e");
let rcl = builder.build();
let collected: Vec<String> = rcl.iter_from(2).collect();
assert_eq!(collected, vec!["c", "d", "e"]);
}
#[test]
fn test_iter_exact_size() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("one");
builder.push("three");
builder.push("two");
let rcl = builder.build();
let iter = rcl.iter();
assert_eq!(iter.len(), 3);
}
#[test]
fn test_lender_str() {
use lender::Lender;
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("hello");
builder.push("help");
builder.push("world");
let rcl = builder.build();
let mut lender = rcl.lender();
assert_eq!(lender.next(), Some("hello"));
assert_eq!(lender.next(), Some("help"));
assert_eq!(lender.next(), Some("world"));
assert_eq!(lender.next(), None);
}
#[test]
fn test_lender_from() {
use lender::Lender;
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("a");
builder.push("b");
builder.push("c");
builder.push("d");
let rcl = builder.build();
let mut lender = rcl.lender_from(2);
assert_eq!(lender.next(), Some("c"));
assert_eq!(lender.next(), Some("d"));
assert_eq!(lender.next(), None);
}
#[test]
fn test_lender_exact_size() {
use lender::{ExactSizeLender, Lender};
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("one");
builder.push("three");
builder.push("two");
let rcl = builder.build();
let mut lender = rcl.lender();
assert_eq!(lender.len(), 3);
lender.next();
assert_eq!(lender.len(), 2);
lender.next();
assert_eq!(lender.len(), 1);
lender.next();
assert_eq!(lender.len(), 0);
}
#[test]
fn test_into_iterator() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("x");
builder.push("y");
builder.push("z");
let rcl = builder.build();
let collected: Vec<String> = (&rcl).into_iter().collect();
assert_eq!(collected, vec!["x", "y", "z"]);
}
#[test]
fn test_into_iterator_from() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("a");
builder.push("b");
builder.push("c");
let rcl = builder.build();
let collected: Vec<String> = (&rcl).into_iter_from(1).collect();
assert_eq!(collected, vec!["b", "c"]);
}
#[test]
fn test_single_element() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("only_one");
let rcl = builder.build();
assert_eq!(rcl.len(), 1);
assert_eq!(rcl.get(0), "only_one");
assert_eq!(rcl.index_of("only_one"), Some(0));
assert_eq!(rcl.index_of("not_found"), None);
}
#[test]
fn test_empty_strings() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("");
builder.push("a");
builder.push("b");
let rcl = builder.build();
assert_eq!(rcl.len(), 3);
assert_eq!(rcl.get(0), "");
assert_eq!(rcl.get(1), "a");
assert_eq!(rcl.index_of(""), Some(0));
}
#[test]
fn test_strings_with_null_bytes() {
let mut builder = RearCodedListBuilder::<str, true>::new(4);
builder.push("ab\0cd");
builder.push("ab\0de");
builder.push("ab\0df");
let rcl = builder.build();
assert_eq!(rcl.len(), 3);
assert_eq!(rcl.get(0), "ab\0cd");
assert_eq!(rcl.get(1), "ab\0de");
assert_eq!(rcl.index_of("ab\0cd"), Some(0));
assert_eq!(rcl.index_of("ab\0de"), Some(1));
}
#[test]
fn test_large_compression_ratio() {
let mut builder = RearCodedListBuilder::<str, true>::new(8);
let prefix = "this_is_a_very_long_common_prefix_";
for i in 0..100 {
let s = format!("{}{:03}", prefix, i);
builder.push(s.as_str());
}
let rcl = builder.build();
assert_eq!(rcl.len(), 100);
for i in 0..100 {
let expected = format!("{}{:03}", prefix, i);
assert_eq!(rcl.get(i), expected);
}
}
#[test]
fn test_memcmp_rust() {
assert_eq!(memcmp(b"abc", b"abc"), core::cmp::Ordering::Equal);
assert_eq!(memcmp(b"abc", b"abd"), core::cmp::Ordering::Less);
assert_eq!(memcmp(b"abd", b"abc"), core::cmp::Ordering::Greater);
assert_eq!(memcmp(b"ab", b"abc"), core::cmp::Ordering::Less);
assert_eq!(memcmp(b"abc", b"ab"), core::cmp::Ordering::Greater);
}
#[test]
fn test_index_of_boundary_cases() {
let mut builder = RearCodedListBuilder::<str, true>::new(3);
builder.push("aaa");
builder.push("bbb");
builder.push("ccc");
builder.push("ddd");
builder.push("eee");
let rcl = builder.build();
assert_eq!(rcl.index_of(""), None);
assert_eq!(rcl.index_of("zzz"), None);
assert_eq!(rcl.index_of("aaa"), Some(0));
assert_eq!(rcl.index_of("ddd"), Some(3)); }
}