use std::cmp::Ordering;
use crate::cpc::CpcSketch;
use crate::cpc::Flavor;
use crate::cpc::compression_data::COLUMN_PERMUTATIONS_FOR_DECODING;
use crate::cpc::compression_data::COLUMN_PERMUTATIONS_FOR_ENCODING;
use crate::cpc::compression_data::DECODING_TABLES_FOR_HIGH_ENTROPY_BYTE;
use crate::cpc::compression_data::ENCODING_TABLES_FOR_HIGH_ENTROPY_BYTE;
use crate::cpc::compression_data::LENGTH_LIMITED_UNARY_DECODING_TABLE65;
use crate::cpc::compression_data::LENGTH_LIMITED_UNARY_ENCODING_TABLE65;
use crate::cpc::determine_correct_offset;
use crate::cpc::determine_flavor;
use crate::cpc::pair_table::PairTable;
#[derive(Default)]
pub(super) struct CompressedState {
pub(super) table_data: Vec<u32>,
pub(super) table_data_words: usize,
pub(super) table_num_entries: u32,
pub(super) window_data: Vec<u32>,
pub(super) window_data_words: usize,
}
impl CompressedState {
pub fn compress(&mut self, source: &CpcSketch) {
match source.flavor() {
Flavor::Empty => {
}
Flavor::Sparse => {
self.compress_sparse_flavor(source);
debug_assert!(self.window_data.is_empty(), "window is not expected");
debug_assert!(!self.table_data.is_empty(), "table is expected");
}
Flavor::Hybrid => {
self.compress_hybrid_flavor(source);
debug_assert!(self.window_data.is_empty(), "window is not expected");
debug_assert!(!self.table_data.is_empty(), "table is expected");
}
Flavor::Pinned => {
self.compress_pinned_flavor(source);
debug_assert!(!self.window_data.is_empty(), "window is expected");
}
Flavor::Sliding => {
self.compress_sliding_flavor(source);
debug_assert!(!self.window_data.is_empty(), "window is expected");
}
}
}
fn compress_sparse_flavor(&mut self, source: &CpcSketch) {
debug_assert!(source.sliding_window.is_empty());
let mut pairs = source.surprising_value_table().unwrapping_get_items();
pairs.sort_unstable();
self.compress_surprising_values(&pairs, source.lg_k());
}
fn compress_hybrid_flavor(&mut self, source: &CpcSketch) {
debug_assert!(!source.sliding_window.is_empty());
debug_assert_eq!(source.window_offset, 0);
let k = 1 << source.lg_k();
let mut pairs = source.surprising_value_table().unwrapping_get_items();
pairs.sort_unstable();
let num_pairs_from_table = pairs.len();
let num_pairs_from_window = (source.num_coupons() as usize) - num_pairs_from_table;
let all_pairs_len = num_pairs_from_table + num_pairs_from_window;
let mut all_pairs = vec![0; all_pairs_len];
{
let mut idx = num_pairs_from_table;
for row_index in 0..k {
let mut byte = source.sliding_window[row_index];
while byte != 0 {
let col_index = byte.trailing_zeros();
byte ^= 1 << col_index; all_pairs[idx] = ((row_index << 6) as u32) | col_index;
idx += 1;
}
}
assert_eq!(idx, all_pairs_len);
}
{
let mut final_idx = 0;
let mut table_idx = 0;
let mut window_idx = num_pairs_from_table;
while final_idx < all_pairs_len {
if table_idx < num_pairs_from_table
&& (window_idx >= all_pairs_len || pairs[table_idx] <= all_pairs[window_idx])
{
all_pairs[final_idx] = pairs[table_idx];
table_idx += 1;
} else {
all_pairs[final_idx] = all_pairs[window_idx];
window_idx += 1;
}
final_idx += 1;
}
}
self.compress_surprising_values(&all_pairs, source.lg_k());
}
fn compress_pinned_flavor(&mut self, source: &CpcSketch) {
self.compress_sliding_window(&source.sliding_window, source.lg_k(), source.num_coupons());
let mut pairs = source.surprising_value_table().unwrapping_get_items();
if !pairs.is_empty() {
for pair in &mut pairs {
assert!(*pair & 63 >= 8, "pair column index is less than 8: {pair}");
*pair -= 8;
}
pairs.sort_unstable();
self.compress_surprising_values(&pairs, source.lg_k());
}
}
fn compress_sliding_flavor(&mut self, source: &CpcSketch) {
self.compress_sliding_window(&source.sliding_window, source.lg_k(), source.num_coupons());
let mut pairs = source.surprising_value_table().unwrapping_get_items();
if !pairs.is_empty() {
let pseudo_phase = determine_pseudo_phase(source.lg_k(), source.num_coupons());
let permutation = &COLUMN_PERMUTATIONS_FOR_ENCODING[pseudo_phase as usize];
let offset = source.window_offset;
debug_assert!(offset <= 56);
for pair in &mut pairs {
let row_col = *pair;
let row = row_col >> 6;
let mut col = (row_col & 63) as u8;
col = (col + 56 - offset) & 63;
debug_assert!(col < 56);
col = permutation[col as usize];
*pair = (row << 6) | (col as u32);
}
pairs.sort_unstable();
self.compress_surprising_values(&pairs, source.lg_k());
}
}
fn compress_surprising_values(&mut self, pairs: &[u32], lg_k: u8) {
let k = 1 << lg_k;
let num_pairs = pairs.len() as u32;
let num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs as u64);
let table_len = safe_length_for_compressed_pair_buf(k, num_pairs, num_base_bits);
self.table_data.resize(table_len, 0);
let compressed_surprising_values = self.low_level_compress_pairs(pairs, num_base_bits);
self.table_data_words = compressed_surprising_values;
self.table_num_entries = num_pairs;
}
fn compress_sliding_window(&mut self, window: &[u8], lg_k: u8, num_coupons: u32) {
let k = 1 << lg_k;
let window_buf_len = safe_length_for_compressed_window_buf(k);
self.window_data.resize(window_buf_len, 0);
let pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
let data_words = self.low_level_compress_bytes(
window,
k,
&ENCODING_TABLES_FOR_HIGH_ENTROPY_BYTE[pseudo_phase as usize],
);
self.window_data_words = data_words;
}
fn low_level_compress_bytes(
&mut self,
byte_array: &[u8],
num_bytes_to_encode: u32,
encoding_table: &[u16],
) -> usize {
let mut bitbuf = 0;
let mut bufbits = 0;
let mut next_word_index = 0;
for byte_index in 0..num_bytes_to_encode {
let code_info = encoding_table[byte_array[byte_index as usize] as usize];
let code_val = (code_info & 0xfff) as u64;
let code_len = (code_info >> 12) as u8;
bitbuf |= code_val << bufbits;
bufbits += code_len;
maybe_flush_bitbuf(
&mut bitbuf,
&mut bufbits,
&mut self.window_data,
&mut next_word_index,
);
}
bufbits += 11;
maybe_flush_bitbuf(
&mut bitbuf,
&mut bufbits,
&mut self.window_data,
&mut next_word_index,
);
if bufbits > 0 {
debug_assert!(bufbits < 32);
self.window_data[next_word_index] = (bitbuf & 0xffffffff) as u32;
next_word_index += 1;
}
next_word_index
}
fn low_level_compress_pairs(&mut self, pairs: &[u32], num_base_bits: u8) -> usize {
let mut bitbuf = 0;
let mut bufbits = 0;
let mut next_word_index = 0;
let golomb_lo_mask = ((1 << num_base_bits) - 1) as u64;
let mut predicted_row_index = 0;
let mut predicted_col_index = 0;
for pair_index in 0..pairs.len() {
let row_col = pairs[pair_index];
let row_index = row_col >> 6;
let col_index = row_col & 63;
if row_index != predicted_row_index {
predicted_col_index = 0;
}
assert!(row_index >= predicted_row_index);
assert!(col_index >= predicted_col_index);
let y_delta = row_index - predicted_row_index;
let x_delta = col_index - predicted_col_index;
predicted_row_index = row_index;
predicted_col_index = col_index + 1;
let code_info = LENGTH_LIMITED_UNARY_ENCODING_TABLE65[x_delta as usize];
let code_val = (code_info & 0xfff) as u64;
let code_len = (code_info >> 12) as u8;
bitbuf |= code_val << bufbits;
bufbits += code_len;
maybe_flush_bitbuf(
&mut bitbuf,
&mut bufbits,
&mut self.table_data,
&mut next_word_index,
);
let golomb_lo = (y_delta as u64) & golomb_lo_mask;
let golomb_hi = (y_delta as u64) >> num_base_bits;
write_unary(
&mut self.table_data,
&mut next_word_index,
&mut bitbuf,
&mut bufbits,
golomb_hi,
);
bitbuf |= golomb_lo << bufbits;
bufbits += num_base_bits;
maybe_flush_bitbuf(
&mut bitbuf,
&mut bufbits,
&mut self.table_data,
&mut next_word_index,
);
}
let padding = 10u8.saturating_sub(num_base_bits);
bufbits += padding;
maybe_flush_bitbuf(
&mut bitbuf,
&mut bufbits,
&mut self.table_data,
&mut next_word_index,
);
if bufbits > 0 {
assert!(bufbits < 32);
self.table_data[next_word_index] = (bitbuf & 0xffffffff) as u32;
next_word_index += 1;
}
next_word_index
}
}
pub(super) struct UncompressedState {
pub(super) table: PairTable,
pub(super) window: Vec<u8>,
}
impl CompressedState {
pub fn uncompress(&self, lg_k: u8, num_coupons: u32) -> UncompressedState {
match determine_flavor(lg_k, num_coupons) {
Flavor::Empty => UncompressedState {
table: PairTable::new(2, lg_k + 6),
window: vec![],
},
Flavor::Sparse => self.uncompress_sparse_flavor(lg_k),
Flavor::Hybrid => self.uncompress_hybrid_flavor(lg_k),
Flavor::Pinned => self.uncompress_pinned_flavor(lg_k, num_coupons),
Flavor::Sliding => self.uncompress_sliding_flavor(lg_k, num_coupons),
}
}
fn uncompress_sparse_flavor(&self, lg_k: u8) -> UncompressedState {
debug_assert!(self.window_data.is_empty(), "window is not expected");
debug_assert!(!self.table_data.is_empty(), "table is expected");
let pairs = uncompress_surprising_values(
&self.table_data,
self.table_data_words,
self.table_num_entries,
lg_k,
);
UncompressedState {
table: PairTable::from_slots(lg_k, self.table_num_entries, pairs),
window: vec![],
}
}
fn uncompress_hybrid_flavor(&self, lg_k: u8) -> UncompressedState {
debug_assert!(self.window_data.is_empty(), "window is not expected");
debug_assert!(!self.table_data.is_empty(), "table is expected");
let mut pairs = uncompress_surprising_values(
&self.table_data,
self.table_data_words,
self.table_num_entries,
lg_k,
);
let k = 1 << lg_k;
let mut window = vec![0u8; k]; let mut next_true_pair = 0;
for i in 0..self.table_num_entries {
let row_col = pairs[i as usize];
assert_ne!(row_col, u32::MAX);
let col = row_col & 63;
if col < 8 {
let row = row_col >> 6;
window[row as usize] |= 1 << col; } else {
pairs[next_true_pair as usize] = row_col;
next_true_pair += 1;
}
}
UncompressedState {
table: PairTable::from_slots(lg_k, next_true_pair, pairs),
window,
}
}
fn uncompress_pinned_flavor(&self, lg_k: u8, num_coupons: u32) -> UncompressedState {
debug_assert!(!self.window_data.is_empty(), "window is expected");
let mut window = vec![];
uncompress_sliding_window(
&self.window_data,
self.window_data_words,
&mut window,
lg_k,
num_coupons,
);
let num_pairs = self.table_num_entries;
let table = if num_pairs == 0 {
PairTable::new(2, lg_k + 6)
} else {
debug_assert!(!self.table_data.is_empty(), "table is expected");
let mut pairs = uncompress_surprising_values(
&self.table_data,
self.table_data_words,
num_pairs,
lg_k,
);
for i in 0..num_pairs {
let i = i as usize;
assert!(
(pairs[i] & 63) < 56,
"pair column index is invalid: {}",
pairs[i]
);
pairs[i] += 8;
}
PairTable::from_slots(lg_k, num_pairs, pairs)
};
UncompressedState { table, window }
}
fn uncompress_sliding_flavor(&self, lg_k: u8, num_coupons: u32) -> UncompressedState {
debug_assert!(!self.window_data.is_empty(), "window is expected");
let mut window = vec![];
uncompress_sliding_window(
&self.window_data,
self.window_data_words,
&mut window,
lg_k,
num_coupons,
);
let num_pairs = self.table_num_entries;
let table = if num_pairs == 0 {
PairTable::new(2, lg_k + 6)
} else {
debug_assert!(!self.table_data.is_empty(), "table is expected");
let mut pairs = uncompress_surprising_values(
&self.table_data,
self.table_data_words,
num_pairs,
lg_k,
);
let pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
let permutation = &COLUMN_PERMUTATIONS_FOR_DECODING[pseudo_phase as usize];
let offset = determine_correct_offset(lg_k, num_coupons);
assert!(offset <= 56, "offset is invalid: {offset}");
for i in 0..num_pairs {
let i = i as usize;
let row_col = pairs[i];
let row = row_col >> 6;
let mut col = (row_col & 63) as u8;
col = permutation[col as usize];
col = (col + (offset + 8)) & 63;
pairs[i] = (row << 6) | (col as u32);
}
PairTable::from_slots(lg_k, num_pairs, pairs)
};
UncompressedState { table, window }
}
}
fn uncompress_surprising_values(
data: &[u32],
data_words: usize,
num_pairs: u32,
lg_k: u8,
) -> Vec<u32> {
let k = 1 << lg_k;
let mut pairs = vec![0; num_pairs as usize];
let num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs as u64);
low_level_uncompress_pairs(&mut pairs, num_pairs, num_base_bits, data, data_words);
pairs
}
fn uncompress_sliding_window(
data: &[u32],
data_words: usize,
window: &mut Vec<u8>,
lg_k: u8,
num_coupons: u32,
) {
let k = 1 << lg_k;
window.resize(k, 0);
let pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
low_level_uncompress_bytes(
window,
k as u32,
data,
data_words,
&DECODING_TABLES_FOR_HIGH_ENTROPY_BYTE[pseudo_phase as usize],
);
}
fn low_level_uncompress_pairs(
pairs: &mut [u32],
num_pairs_to_decode: u32,
num_base_bits: u8,
compressed_words: &[u32],
num_compressed_words: usize,
) {
let mut word_index = 0;
let mut bitbuf = 0;
let mut bufbits = 0;
let golomb_lo_mask = (1 << num_base_bits) - 1;
let mut predicted_row_index = 0u32;
let mut predicted_col_index = 0u8;
for pair_index in 0..num_pairs_to_decode {
maybe_fill_bitbuf(
&mut bitbuf,
&mut bufbits,
compressed_words,
&mut word_index,
12,
);
let peek12 = bitbuf & 0xfff;
let lookup = LENGTH_LIMITED_UNARY_DECODING_TABLE65[peek12 as usize];
let code_word_length = (lookup >> 8) as u8;
let x_delta = (lookup & 0xff) as u8;
bitbuf >>= code_word_length;
bufbits -= code_word_length;
let golomb_hi = read_unary(compressed_words, &mut word_index, &mut bitbuf, &mut bufbits);
maybe_fill_bitbuf(
&mut bitbuf,
&mut bufbits,
compressed_words,
&mut word_index,
num_base_bits,
);
let golomb_lo = bitbuf & golomb_lo_mask;
bitbuf >>= num_base_bits;
bufbits -= num_base_bits;
let y_delta = ((golomb_hi << num_base_bits) | golomb_lo) as u32;
if y_delta > 0 {
predicted_col_index = 0;
}
let row_index = predicted_row_index + y_delta;
let col_index = predicted_col_index + x_delta;
let row_col = (row_index << 6) | (col_index as u32);
pairs[pair_index as usize] = row_col;
predicted_row_index = row_index;
predicted_col_index = col_index + 1;
}
debug_assert!(
word_index <= num_compressed_words,
"word_index: {word_index}, num_compressed_words: {num_compressed_words}",
);
}
fn low_level_uncompress_bytes(
byte_array: &mut [u8],
num_bytes_to_decode: u32,
compressed_words: &[u32],
num_compressed_words: usize,
decoding_table: &[u16],
) {
let mut word_index = 0;
let mut bitbuf = 0;
let mut bufbits = 0;
for byte_index in 0..num_bytes_to_decode {
maybe_fill_bitbuf(
&mut bitbuf,
&mut bufbits,
compressed_words,
&mut word_index,
12,
);
let peek12 = bitbuf & 0xfff;
let lookup = decoding_table[peek12 as usize];
let code_word_length = (lookup >> 8) as u8;
let decoded_byte = (lookup & 0xff) as u8;
byte_array[byte_index as usize] = decoded_byte;
bitbuf >>= code_word_length;
bufbits -= code_word_length;
}
debug_assert!(
word_index <= num_compressed_words,
"word_index: {word_index}, num_compressed_words: {num_compressed_words}",
);
}
fn determine_pseudo_phase(lg_k: u8, num_coupons: u32) -> u8 {
let k = 1 << lg_k;
if 1000 * num_coupons < 2375 * k {
if 4 * num_coupons < 3 * k {
16
} else if 10 * num_coupons < 11 * k {
16 + 1
} else if 100 * num_coupons < 132 * k {
16 + 2
} else if 3 * num_coupons < 5 * k {
16 + 3
} else if 1000 * num_coupons < 1965 * k {
16 + 4
} else if 1000 * num_coupons < 2275 * k {
16 + 5
} else {
6
}
} else {
debug_assert!(lg_k >= 4);
let tmp = num_coupons >> (lg_k - 4);
(tmp & 15) as u8 }
}
fn write_unary(
compressed_words: &mut [u32],
next_word_index: &mut usize,
bitbuf: &mut u64,
bufbits: &mut u8,
value: u64,
) {
assert!(*bufbits <= 31);
let mut remaining = value;
while remaining >= 16 {
remaining -= 16;
*bufbits += 16; maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
}
let the_unary_code = 1 << remaining;
*bitbuf |= the_unary_code << *bufbits;
*bufbits += (remaining + 1) as u8;
maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
}
fn read_unary(
compressed_words: &[u32],
next_word_index: &mut usize,
bitbuf: &mut u64,
bufbits: &mut u8,
) -> u64 {
let mut subtotal = 0u64;
loop {
maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, next_word_index, 8);
let peek8 = *bitbuf & 0xff;
let trailing_zeros = peek8.trailing_zeros() as u8;
if trailing_zeros < 8 {
*bufbits -= 1 + trailing_zeros;
*bitbuf >>= 1 + trailing_zeros;
return subtotal + trailing_zeros as u64;
}
subtotal += 8;
*bufbits -= 8;
*bitbuf >>= 8;
}
}
fn maybe_flush_bitbuf(
bitbuf: &mut u64,
bufbits: &mut u8,
word: &mut [u32],
word_index: &mut usize,
) {
if *bufbits >= 32 {
word[*word_index] = (*bitbuf & 0xffffffff) as u32;
*word_index += 1;
*bitbuf >>= 32;
*bufbits -= 32;
}
}
fn maybe_fill_bitbuf(
bitbuf: &mut u64,
bufbits: &mut u8,
words: &[u32],
word_index: &mut usize,
minbits: u8,
) {
if *bufbits < minbits {
*bitbuf |= (words[*word_index] as u64) << *bufbits;
*word_index += 1;
*bufbits += 32;
}
}
fn safe_length_for_compressed_window_buf(k: u32) -> usize {
let bits = 12 * k + 11;
divide_longs_rounding_up(bits as usize, 32)
}
fn safe_length_for_compressed_pair_buf(k: u32, num_pairs: u32, num_base_bits: u8) -> usize {
let k = k as usize;
let num_pairs = num_pairs as usize;
let num_base_bits = num_base_bits as usize;
let ybits = num_pairs * (1 + num_base_bits) + (k >> num_base_bits);
let xbits = 12 * (num_pairs);
let padding = 10usize.saturating_sub(num_base_bits);
divide_longs_rounding_up(xbits + ybits + padding, 32)
}
fn divide_longs_rounding_up(x: usize, y: usize) -> usize {
debug_assert_ne!(y, 0);
let quotient = x / y;
if quotient * y == x {
quotient
} else {
quotient + 1
}
}
fn golomb_choose_number_of_base_bits(k: u32, count: u64) -> u8 {
debug_assert!(k > 0);
debug_assert!(count > 0);
let quotient = ((k as u64) - count) / count; if quotient == 0 {
0
} else {
floor_log2_of_long(quotient)
}
}
fn floor_log2_of_long(x: u64) -> u8 {
debug_assert!(x > 0);
let mut p = 0u8;
let mut y = 1u64;
loop {
match u64::cmp(&y, &x) {
Ordering::Equal => return p,
Ordering::Greater => return p - 1,
Ordering::Less => {
p += 1;
y <<= 1;
}
}
}
}