use std::mem::MaybeUninit;
use crate::column::Parts;
use crate::types::MAX_TOKEN_SIZE;
mod scalar;
pub(crate) mod fat;
/// Returns the total number of bytes that `parts.codes` decodes to.
///
/// Every code contributes the length of its dictionary entry as reported by
/// `code_len`; an empty code sequence yields `0`.
///
/// # Panics
///
/// Panics if a code does not index a pair of dictionary offsets, or if an
/// entry's end offset is smaller than its start offset.
pub fn decompressed_len(parts: Parts<'_>) -> usize {
    let mut total = 0usize;
    for &code in parts.codes.iter() {
        total += code_len(parts, code);
    }
    total
}
/// Decodes `parts` into the caller-supplied buffer `out` and returns the byte
/// count reported by the decoder.
///
/// `out` is accepted when it can hold either the worst case of
/// `MAX_TOKEN_SIZE` bytes per code, or at least `decompressed_len(parts)`
/// bytes. The exact length is only computed when the cheaper worst-case bound
/// does not already hold.
///
/// # Panics
///
/// Panics with "output buffer too small for decompressed bytes" when neither
/// bound holds. The checked decode path that follows also panics on an
/// invalid dictionary.
pub fn decompress_into(parts: Parts<'_>, out: &mut [MaybeUninit<u8>]) -> usize {
    let capacity = out.len();
    let worst_case = parts.codes.len().saturating_mul(MAX_TOKEN_SIZE);
    if capacity < worst_case {
        let exact = decompressed_len(parts);
        assert!(
            capacity >= exact,
            "output buffer too small for decompressed bytes"
        );
    }
    // SAFETY: `out` was just shown to be large enough for the decoded bytes,
    // and `CHECK = true` makes `decode_fat` validate the dictionary itself.
    // NOTE(review): the decoder's full contract lives in `fat` — confirm that
    // these two conditions are all it needs.
    unsafe { decode_fat::<true>(parts, out) }
}
/// Panics with the crate's "code index out of range for dictionary" message.
///
/// Never returns. The function is marked `#[cold]` and `#[inline(never)]`,
/// so the panic stays out of line at its call sites.
#[cold]
#[inline(never)]
pub(crate) fn code_out_of_range() -> ! {
panic!("onpair: code index out of range for dictionary")
}
/// Reasons validation of a `Parts` value can fail.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum InvalidParts {
/// Two consecutive dictionary offsets are equal or decreasing, so a token
/// would be empty or have a negative length.
NonIncreasingOffsets,
/// A dictionary token spans more than `MAX_TOKEN_SIZE` bytes.
TokenTooLarge,
/// `dict_bytes` ends fewer than `MAX_TOKEN_SIZE` bytes after the start of
/// the last token.
MissingDecoderPadding,
/// A code does not index any dictionary token.
CodeOutOfRange,
}
/// Renders each variant as a fixed, human-readable description.
impl std::fmt::Display for InvalidParts {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::NonIncreasingOffsets => {
                "dictionary offsets must be increasing (non-empty tokens)"
            }
            Self::TokenTooLarge => "dictionary token exceeds MAX_TOKEN_SIZE",
            Self::MissingDecoderPadding => {
                "dict_bytes lacks the required trailing decoder padding"
            }
            Self::CodeOutOfRange => "code index out of range for dictionary",
        };
        // Written verbatim: formatter width/fill flags are not applied.
        f.write_str(message)
    }
}
// Lets `InvalidParts` be used wherever a `std::error::Error` is expected;
// every trait method keeps its default implementation.
impl std::error::Error for InvalidParts {}
impl Parts<'_> {
    /// Checks that the dictionary offsets describe well-formed tokens and
    /// that `dict_bytes` extends far enough past the start of the last token.
    ///
    /// With fewer than two offsets there are no tokens and the check succeeds.
    ///
    /// # Errors
    ///
    /// * [`InvalidParts::NonIncreasingOffsets`] if two consecutive offsets
    ///   are equal or decreasing.
    /// * [`InvalidParts::TokenTooLarge`] if a token spans more than
    ///   `MAX_TOKEN_SIZE` bytes.
    /// * [`InvalidParts::MissingDecoderPadding`] if fewer than
    ///   `MAX_TOKEN_SIZE` bytes of `dict_bytes` are available from the start
    ///   of the last token.
    ///
    /// Offsets are examined in order and the first violation is reported.
    pub fn validate_dictionary(&self) -> Result<(), InvalidParts> {
        for pair in self.dict_offsets.windows(2) {
            let (start, end) = (pair[0], pair[1]);
            if start >= end {
                return Err(InvalidParts::NonIncreasingOffsets);
            }
            if (end - start) as usize > MAX_TOKEN_SIZE {
                return Err(InvalidParts::TokenTooLarge);
            }
        }

        // Offsets are strictly increasing at this point, so the last token
        // has the largest start; checking it covers every other token.
        if let [.., last_start, _] = self.dict_offsets {
            // Saturate instead of `+`: on 32-bit targets an offset close to
            // `u32::MAX` would otherwise wrap in release builds and slip
            // past this bound.
            let required_len = (*last_start as usize).saturating_add(MAX_TOKEN_SIZE);
            if required_len > self.dict_bytes.len() {
                return Err(InvalidParts::MissingDecoderPadding);
            }
        }
        Ok(())
    }

    /// Runs [`Parts::validate_dictionary`] and then checks that every code
    /// indexes a dictionary token.
    ///
    /// # Errors
    ///
    /// Returns any error from `validate_dictionary`, or
    /// [`InvalidParts::CodeOutOfRange`] if a code is greater than or equal to
    /// the number of tokens (`dict_offsets.len() - 1`, or zero when there are
    /// no offsets).
    pub fn validate(&self) -> Result<(), InvalidParts> {
        self.validate_dictionary()?;
        let token_count = self.dict_offsets.len().saturating_sub(1);
        let all_in_range = self
            .codes
            .iter()
            .all(|&code| (code as usize) < token_count);
        if all_in_range {
            Ok(())
        } else {
            Err(InvalidParts::CodeOutOfRange)
        }
    }
}
/// Panics with `onpair: <reason>` unless `parts.validate_dictionary()`
/// succeeds; returns normally otherwise.
#[inline]
fn assert_valid_dictionary(parts: Parts<'_>) {
    match parts.validate_dictionary() {
        Ok(()) => {}
        Err(e) => panic!("onpair: {e}"),
    }
}
/// Returns the `(start, end)` byte offsets of the dictionary entry for `code`.
///
/// The bounds are read from `parts.dict_offsets[code]` and
/// `parts.dict_offsets[code + 1]`.
///
/// # Panics
///
/// * With "onpair: code index out of range for dictionary" if either offset
///   is missing, i.e. `code` does not name a dictionary entry.
/// * With "dictionary offsets must be nondecreasing" if the end offset is
///   smaller than the start offset.
#[inline]
fn code_byte_range(parts: Parts<'_>, code: u16) -> (usize, usize) {
    let index = code as usize;
    let offsets = parts.dict_offsets;
    // `index + 1` cannot overflow: `index` comes from a `u16`.
    let (Some(&start), Some(&end)) = (offsets.get(index), offsets.get(index + 1)) else {
        // Same wording as `code_out_of_range`, so a bad code is reported
        // identically here and on the decode path instead of surfacing as a
        // generic slice "index out of bounds" panic.
        panic!("onpair: code index out of range for dictionary");
    };
    let (start, end) = (start as usize, end as usize);
    assert!(end >= start, "dictionary offsets must be nondecreasing");
    (start, end)
}
/// Returns the byte length of the dictionary entry for `code`, computed as
/// the distance between the bounds given by `code_byte_range`.
///
/// Panics under the same conditions as `code_byte_range`.
#[inline]
fn code_len(parts: Parts<'_>, code: u16) -> usize {
    let bounds = code_byte_range(parts, code);
    bounds.1 - bounds.0
}
/// Shared decode entry point behind the public `decompress*` wrappers.
///
/// With `CHECK = true` the dictionary is validated first, panicking with an
/// `onpair: ...` message on failure; with `CHECK = false` that step is
/// skipped. In both cases the value produced by `fat::build(parts)` is handed
/// to `fat::decode_loop::<CHECK>` together with `parts.codes` and `out`, and
/// the loop's return value is passed through.
///
/// # Safety
///
/// The caller must uphold whatever `fat::decode_loop::<CHECK>` requires of
/// `parts` and `out`.
/// NOTE(review): those preconditions are defined in the `fat` module and are
/// not visible from this file. The wrappers here suggest that with
/// `CHECK = true` the caller only has to guarantee a large enough `out` —
/// confirm.
#[inline]
unsafe fn decode_fat<const CHECK: bool>(parts: Parts<'_>, out: &mut [MaybeUninit<u8>]) -> usize {
    if CHECK {
        assert_valid_dictionary(parts);
    }
    let prepared = fat::build(parts);
    // SAFETY: delegated to the caller through this function's own contract;
    // when `CHECK` is set, the dictionary has additionally been validated
    // just above.
    unsafe { fat::decode_loop::<CHECK>(parts.codes, &prepared, out) }
}
/// Decodes `parts` into `out` without the checks done by `decompress_into`.
///
/// Forwards to `decode_fat::<false>`: the dictionary is not validated and the
/// size of `out` is not verified here. Returns the value reported by the
/// decoder.
///
/// # Safety
///
/// The caller must guarantee everything the unchecked decoder relies on.
/// NOTE(review): the precise contract lives in `fat::decode_loop` and is not
/// visible in this file. Presumably `parts` must pass `Parts::validate` and
/// `out` must be large enough for the decoded bytes — confirm.
pub unsafe fn decompress_into_unchecked(parts: Parts<'_>, out: &mut [MaybeUninit<u8>]) -> usize {
// SAFETY: delegated to the caller through this function's own contract.
unsafe { decode_fat::<false>(parts, out) }
}
/// Decodes `parts` into a freshly allocated `Vec<u8>`.
///
/// The vector is allocated with capacity `decompressed_len(parts)`, the
/// checked decoder writes into its spare capacity, and the vector's length is
/// then set to the byte count the decoder reports.
///
/// # Panics
///
/// Panics if computing the decoded length fails (see `decompressed_len`) or
/// if the dictionary does not pass validation in the checked decode path.
pub fn decompress(parts: Parts<'_>) -> Vec<u8> {
    let mut bytes = Vec::<u8>::with_capacity(decompressed_len(parts));
    // SAFETY: the spare capacity holds at least `decompressed_len(parts)`
    // bytes, and `CHECK = true` makes `decode_fat` validate the dictionary.
    // NOTE(review): confirm against `fat` that this is all the decoder needs.
    let written = unsafe { decode_fat::<true>(parts, bytes.spare_capacity_mut()) };
    // SAFETY: relies on the decoder having initialised the first `written`
    // bytes of the spare capacity and never reporting more than it was given.
    // NOTE(review): that guarantee comes from `fat::decode_loop` — confirm.
    unsafe { bytes.set_len(written) };
    bytes
}
#[cfg(test)]
mod tests {
    use crate::{Bits, Config, DEFAULT_CONFIG, Parts, compress};

    use super::*;

    /// Returns `len` uninitialised bytes to use as a decode target.
    fn uninit_buf(len: usize) -> Vec<MaybeUninit<u8>> {
        vec![MaybeUninit::uninit(); len]
    }

    /// Copies the first `n` bytes out of `buf`.
    ///
    /// `n` must be the count returned by a decode call that wrote into `buf`.
    fn decoded_prefix(buf: &[MaybeUninit<u8>], n: usize) -> Vec<u8> {
        // SAFETY: callers pass the length the decoder reported for this
        // buffer, which is the number of leading bytes it wrote.
        buf[..n].iter().map(|b| unsafe { b.assume_init() }).collect()
    }

    /// Round-trips a small column through `compress` and `decompress_into`,
    /// decoding into a caller-owned `Vec`'s spare capacity.
    #[test]
    fn decompress_into_uses_caller_buffer() {
        let rows: &[&[u8]] = &[b"alpha", b"", b"beta beta", b"gamma"];
        let mut bytes = Vec::new();
        let mut offsets = vec![0u32];
        for row in rows {
            bytes.extend_from_slice(row);
            offsets.push(bytes.len() as u32);
        }
        let col = compress(&bytes, &offsets, DEFAULT_CONFIG).unwrap();
        assert_eq!(
            col.as_parts().validate_dictionary(),
            Ok(()),
            "compressed columns include the required decoder padding"
        );
        let mut decoded = Vec::with_capacity(bytes.len());
        let len = decompress_into(col.as_parts(), decoded.spare_capacity_mut());
        unsafe { decoded.set_len(len) };
        assert_eq!(decoded, bytes);
    }

    /// Builds a dictionary from `tokens`, followed by `MAX_TOKEN_SIZE - 1`
    /// zero bytes of padding, together with its offsets and a copy of
    /// `code_seq`.
    fn valid_padded(tokens: &[&[u8]], code_seq: &[u16]) -> (Vec<u8>, Vec<u32>, Vec<u16>) {
        let mut dict = Vec::new();
        let mut offsets = vec![0u32];
        for t in tokens {
            dict.extend_from_slice(t);
            offsets.push(dict.len() as u32);
        }
        dict.resize(dict.len() + MAX_TOKEN_SIZE - 1, 0);
        let codes = code_seq.to_vec();
        (dict, offsets, codes)
    }

    /// Assembles a `Parts` over the given slices with a fixed `bits` value.
    fn parts<'a>(dict: &'a [u8], offsets: &'a [u32], codes: &'a [u16]) -> Parts<'a> {
        Parts {
            dict_bytes: dict,
            dict_offsets: offsets,
            bits: 3,
            codes,
        }
    }

    /// Decodes a hand-built dictionary through every entry point and with
    /// both a worst-case-sized and an exactly-sized output buffer.
    #[test]
    fn decode_valid_padded_roundtrip() {
        let tokens: &[&[u8]] = &[b"a", b"bc", b"def", b"ghij"];
        let seq: Vec<u16> = (0..40).map(|i| (i % 4) as u16).collect();
        let (dict, offsets, codes) = valid_padded(tokens, &seq);
        let p = parts(&dict, &offsets, &codes);
        let expected: Vec<u8> = seq
            .iter()
            .flat_map(|&c| tokens[c as usize].iter().copied())
            .collect();

        assert_eq!(decompress(p), expected);

        // Worst-case-sized buffer.
        let mut out = uninit_buf(codes.len() * MAX_TOKEN_SIZE);
        let n = decompress_into(p, &mut out);
        assert_eq!(decoded_prefix(&out, n), expected);

        // Exactly-sized buffer: check the bytes as well as the count.
        let mut tight = uninit_buf(expected.len());
        let n = decompress_into(p, &mut tight);
        assert_eq!(n, expected.len());
        assert_eq!(decoded_prefix(&tight, n), expected);

        // Unchecked entry point.
        let mut ue = uninit_buf(codes.len() * MAX_TOKEN_SIZE);
        let n = unsafe { decompress_into_unchecked(p, &mut ue) };
        assert_eq!(decoded_prefix(&ue, n), expected);
    }

    /// Runs `decompress_into` on the given parts with a generously sized
    /// buffer; used by the `should_panic` tests below.
    fn assert_decode_panics(dict: &[u8], offsets: &[u32], codes: &[u16]) {
        let p = parts(dict, offsets, codes);
        let mut out = uninit_buf(codes.len() * MAX_TOKEN_SIZE + 16);
        decompress_into(p, &mut out);
    }

    #[test]
    #[should_panic(expected = "offsets must be increasing")]
    fn checked_panics_on_non_monotonic_offsets() {
        let mut dict = b"ab".to_vec();
        dict.resize(2 + MAX_TOKEN_SIZE - 1, 0);
        assert_decode_panics(&dict, &[0, 2, 1], &[0, 1]);
    }

    #[test]
    #[should_panic(expected = "offsets must be increasing")]
    fn checked_panics_on_zero_length_token() {
        let mut dict = b"ab".to_vec();
        dict.resize(2 + MAX_TOKEN_SIZE - 1, 0);
        assert_decode_panics(&dict, &[0, 1, 1, 2], &[0, 2, 3]);
    }

    // Relies on a 20-byte token being larger than `MAX_TOKEN_SIZE`.
    #[test]
    #[should_panic(expected = "exceeds MAX_TOKEN_SIZE")]
    fn checked_panics_on_oversize_token() {
        let mut dict = vec![b'x'; 21];
        dict.resize(21 + MAX_TOKEN_SIZE - 1, 0);
        assert_decode_panics(&dict, &[0, 20, 21], &[0, 1]);
    }

    #[test]
    #[should_panic(expected = "decoder padding")]
    fn checked_panics_on_missing_padding() {
        assert_decode_panics(b"abcdef", &[0, 4, 6], &[0, 1]);
    }

    /// Each kind of corruption maps to its own `InvalidParts` variant.
    #[test]
    fn parts_validate_classifies_corruption() {
        let tokens: &[&[u8]] = &[b"a", b"bc", b"def"];
        let (dict, offsets, codes) = valid_padded(tokens, &[0, 1, 2, 0]);
        let p = parts(&dict, &offsets, &codes);
        assert_eq!(p.validate_dictionary(), Ok(()));
        assert_eq!(p.validate(), Ok(()));

        let pad = |dict: &mut Vec<u8>| dict.resize(dict.len() + MAX_TOKEN_SIZE - 1, 0);

        let mut d = b"ab".to_vec();
        pad(&mut d);
        assert_eq!(
            parts(&d, &[0, 2, 1], &[0]).validate_dictionary(),
            Err(InvalidParts::NonIncreasingOffsets)
        );

        let mut d = vec![b'x'; 21];
        pad(&mut d);
        assert_eq!(
            parts(&d, &[0, 20, 21], &[0]).validate_dictionary(),
            Err(InvalidParts::TokenTooLarge)
        );

        assert_eq!(
            parts(b"abcdef", &[0, 4, 6], &[0]).validate_dictionary(),
            Err(InvalidParts::MissingDecoderPadding)
        );

        // The dictionary itself is fine; only the code sequence is bad.
        let mut d = b"ab".to_vec();
        pad(&mut d);
        let p = parts(&d, &[0, 1, 2], &[0, 5]);
        assert_eq!(p.validate_dictionary(), Ok(()));
        assert_eq!(p.validate(), Err(InvalidParts::CodeOutOfRange));
    }

    /// A last token that is exactly `MAX_TOKEN_SIZE` wide needs no extra
    /// padding bytes after it.
    #[test]
    fn decode_zero_padding_full_width_last_token() {
        let mut dict = vec![b'x'];
        dict.extend(std::iter::repeat_n(b'y', MAX_TOKEN_SIZE));
        let offsets = [0u32, 1, 1 + MAX_TOKEN_SIZE as u32];
        let seq: Vec<u16> = (0..40).map(|i| (i % 2) as u16).collect();
        let p = parts(&dict, &offsets, &seq);
        assert_eq!(p.validate_dictionary(), Ok(()));
        let expected: Vec<u8> = seq
            .iter()
            .flat_map(|&c| {
                dict[offsets[c as usize] as usize..offsets[c as usize + 1] as usize].to_vec()
            })
            .collect();
        assert_eq!(decompress(p), expected);
    }

    /// The padding requirement depends on where the last token starts, not
    /// on a fixed number of trailing bytes.
    #[test]
    fn validate_padding_requirement_is_variable() {
        let full = vec![b'z'; MAX_TOKEN_SIZE];
        let offs = [0u32, MAX_TOKEN_SIZE as u32];
        assert_eq!(parts(&full, &offs, &[0]).validate_dictionary(), Ok(()));

        let short = vec![b'z'; MAX_TOKEN_SIZE - 1];
        assert_eq!(
            parts(&short, &offs, &[0]).validate_dictionary(),
            Err(InvalidParts::MissingDecoderPadding)
        );

        let mut buf = b"ab".to_vec();
        buf.resize(1 + MAX_TOKEN_SIZE, 0);
        assert_eq!(
            parts(&buf, &[0, 1, 2], &[0, 1]).validate_dictionary(),
            Ok(())
        );

        // One byte fewer and the last token no longer has enough room.
        buf.pop();
        assert_eq!(
            parts(&buf, &[0, 1, 2], &[0, 1]).validate_dictionary(),
            Err(InvalidParts::MissingDecoderPadding)
        );
    }

    #[test]
    #[should_panic(expected = "code index out of range")]
    fn decompress_into_panics_on_out_of_range_code() {
        let mut dict = b"ab".to_vec();
        dict.resize(2 + MAX_TOKEN_SIZE - 1, 0);
        let offsets = [0u32, 1, 2];
        let codes = [0u16, 5];
        let parts = Parts {
            dict_bytes: &dict,
            dict_offsets: &offsets,
            bits: 3,
            codes: &codes,
        };
        assert_eq!(parts.validate_dictionary(), Ok(()));
        let mut out = uninit_buf(codes.len() * MAX_TOKEN_SIZE);
        decompress_into(parts, &mut out);
    }

    #[test]
    #[should_panic(expected = "decoder padding")]
    fn decompress_panics_on_unpadded_parts() {
        let offsets = [0u32, 1, 2];
        let codes = [0u16, 1];
        let parts = Parts {
            dict_bytes: b"ab",
            dict_offsets: &offsets,
            bits: 1,
            codes: &codes,
        };
        decompress(parts);
    }

    /// Round-trips a larger column at every code width from 9 to 16 bits.
    #[test]
    fn decompress_matches_input_across_widths() {
        let mut bytes = Vec::new();
        let mut offsets = vec![0u32];
        for i in 0..5000u32 {
            let row = format!("row-{i:04}-https://example.com/path/{}", i % 37);
            bytes.extend_from_slice(row.as_bytes());
            offsets.push(bytes.len() as u32);
        }
        for bits in 9..=16u8 {
            let cfg = Config {
                bits: Bits::new(bits).unwrap(),
                ..DEFAULT_CONFIG
            };
            let col = compress(&bytes, &offsets, cfg).unwrap();
            assert_eq!(
                decompress(col.as_parts()),
                bytes,
                "decompress @ bits={bits}"
            );
            let mut decoded = Vec::with_capacity(bytes.len());
            let len = decompress_into(col.as_parts(), decoded.spare_capacity_mut());
            unsafe { decoded.set_len(len) };
            assert_eq!(decoded, bytes, "decompress_into @ bits={bits}");
        }
    }
}