use std::hash::Hash;
use binout::{VByte, AsIs, Serializer};
use minimum_redundancy::DecodingResult;
use bitm::{BitAccess, BitVec, Rank};
use crate::fp::level_size_chooser::LevelSizeChooser;
use ph::utils::{ArrayWithRank, read_bits};
use ph::{BuildDefaultSeededHasher, BuildSeededHasher, stats, utils};
use std::collections::HashMap;
use std::io;
mod conf;
pub use conf::CMapConf;
use crate::fp::collision_solver::{CollisionSolver, CollisionSolverBuilder, IsLossless};
use crate::fp::common::{encode_all, encode_all_from_map};
use dyn_size_of::GetSize;
use crate::coding::{Coding, Decoder, SerializableCoding, BuildCoding};
pub struct CMap<C, S = BuildDefaultSeededHasher> {
array: ArrayWithRank,
value_fragments: Box<[u64]>, level_sizes: Box<[u64]>,
value_coding: C,
hash_builder: S
}
impl<C: GetSize, S> GetSize for CMap<C, S> {
fn size_bytes_dyn(&self) -> usize {
self.array.size_bytes_dyn()
+ self.value_fragments.size_bytes_dyn()
+ self.level_sizes.size_bytes_dyn()
+ self.value_coding.size_bytes_dyn()
}
const USES_DYN_MEM: bool = true;
}
impl<C, S: BuildSeededHasher> CMap<C, S> {
#[inline(always)] fn index<K: Hash + ?Sized>(&self, k: &K, level_nr: u32, size: usize) -> usize {
utils::map64_to_64(self.hash_builder.hash_one(k, level_nr), size as u64) as usize
}
}
impl<C: Coding, S: BuildSeededHasher> CMap<C, S> {
pub fn get_stats<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, k: &K, access_stats: &mut A) -> Option<<<C as Coding>::Decoder<'_> as Decoder>::Decoded> {
let mut result_decoder = self.value_coding.decoder();
let mut array_begin_index = 0usize;
let mut level = 0u32;
loop {
let level_size = (*self.level_sizes.get(level as usize)? as usize) << 6usize;
let i = array_begin_index + self.index(k, level, level_size);
if self.array.content.get_bit(i) {
match result_decoder.consume(self.value_fragments.get_fragment(self.array.rank(i), self.value_coding.bits_per_fragment()) as u8) {
DecodingResult::Value(v) => {
access_stats.found_on_level(level);
return Some(v)
},
DecodingResult::Invalid => {
access_stats.fail_on_level(level);
return None
},
DecodingResult::Incomplete => {}
}
}
array_begin_index += level_size;
level += 1;
}
}
#[inline(always)]
pub fn get<K: Hash + ?Sized>(&self, k: &K) -> Option<<<C as Coding>::Decoder<'_> as Decoder>::Decoded> {
self.get_stats(k, &mut ())
}
fn with_fragments<K, LSC, CSB, BS, BC>(
keys: &mut [K], values: &mut [C::Codeword],
value_coding: C, conf: CMapConf<BC, LSC, CSB, S>,
stats: &mut BS)
-> Self
where K: Hash,
LSC: LevelSizeChooser,
CSB: CollisionSolverBuilder + IsLossless,
BS: stats::BuildStatsCollector
{
let mut levels = Vec::<u64>::new();
let mut arrays = Vec::<Box<[u64]>>::new();
let mut input_size = keys.len();
let mut value_rev_indices: Box<[u8]> = values.iter().map(|c| value_coding.len_of(*c)-1).collect();
let mut level_nr = 0u32;
while input_size != 0 {
let level_size_segments = conf.level_size_chooser.size_segments(
&value_coding,
&values[0..input_size], &value_rev_indices[0..input_size]);
let level_size = level_size_segments * 64;
stats.level(input_size, level_size);
let mut collision_solver = conf.collision_solver.new(level_size_segments, value_coding.bits_per_fragment());
for i in 0..input_size {
let a_index = utils::map64_to_64(conf.hash.hash_one(&keys[i], level_nr), level_size as u64) as usize;
if collision_solver.is_under_collision(a_index) { continue }
collision_solver.process_fragment(a_index,
value_coding.rev_fragment_of(values[i], value_rev_indices[i]),
value_coding.bits_per_fragment());
}
let current_array = collision_solver.to_collision_array();
let mut i = 0usize;
while i < input_size {
let a_index = utils::map64_to_64(conf.hash.hash_one(&keys[i], level_nr), level_size as u64) as usize;
if current_array.get_bit(a_index) { let rev_index = &mut value_rev_indices[i];
if *rev_index == 0 { input_size -= 1;
keys.swap(i, input_size);
values.swap(i, input_size);
value_rev_indices.swap(i, input_size);
} else { *rev_index -= 1;
i += 1;
}
} else { i += 1;
}
}
arrays.push(current_array);
levels.push(level_size_segments as u64);
level_nr += 1;
}
let (array, out_fragments_num) = ArrayWithRank::build(arrays.concat().into_boxed_slice());
let mut output_value_fragments = Box::<[u64]>::with_zeroed_bits(out_fragments_num as usize * value_coding.bits_per_fragment() as usize);
for input_index in 0..keys.len() {
let mut array_begin_index = 0usize;
let mut level = 0u32;
loop {
let level_size = (levels[level as usize] as usize) << 6usize;
let i = array_begin_index + utils::map64_to_64(conf.hash.hash_one(&keys[input_index], level), level_size as u64) as usize;
if array.content.get_bit(i) {
let code = &mut values[input_index];
output_value_fragments.init_fragment( array.rank(i),
value_coding.first_fragment_of(*code) as _,
value_coding.bits_per_fragment());
if value_coding.remove_first_fragment_of(code) {
break;
}
}
array_begin_index += level_size;
level += 1;
}
}
stats.end(0);
Self {
array,
value_fragments: output_value_fragments,
level_sizes: levels.into_boxed_slice(),
value_coding,
hash_builder: conf.hash
}
}
}
impl<C: SerializableCoding, S: BuildSeededHasher> CMap<C, S> {
pub fn write_bytes(&self, bytes_per_value: usize) -> usize {
VByte::array_size(&self.level_sizes)
+ AsIs::array_content_size(&self.array.content)
+ self.value_coding.write_bytes(bytes_per_value)
+ AsIs::array_content_size(&self.value_fragments)
}
pub fn write<F>(&self, output: &mut dyn io::Write, write_value: F) -> io::Result<()>
where F: FnMut(&mut dyn io::Write, &C::Value) -> io::Result<()>
{
VByte::write_array(output, &self.level_sizes)?;
AsIs::write_all(output, self.array.content.iter())?;
self.value_coding.write(output, write_value)?;
AsIs::write_all(output, self.value_fragments.iter())
}
pub fn read_with_hasher<F>(input: &mut dyn io::Read, read_value: F, hasher: S) -> io::Result<Self>
where F: FnMut(&mut dyn io::Read) -> io::Result<C::Value>
{
let level_sizes = VByte::read_array(input)?;
let array_content = AsIs::read_n(input, level_sizes.iter().map(|v|*v as usize).sum::<usize>())?;
let (array_with_rank, number_of_ones) = ArrayWithRank::build(array_content);
let value_coding = C::read(input, read_value)?;
let value_fragments = read_bits(input, number_of_ones as usize * value_coding.bits_per_fragment() as usize)?;
Ok(Self {
array: array_with_rank,
value_fragments: value_fragments,
level_sizes: level_sizes,
value_coding,
hash_builder: hasher
})
}
}
impl<C: SerializableCoding> CMap<C> {
pub fn read<F>(input: &mut dyn io::Read, read_value: F) -> io::Result<Self>
where F: FnMut(&mut dyn io::Read) -> io::Result<C::Value>
{
Self::read_with_hasher(input, read_value, Default::default())
}
}
impl<C: Coding, S: BuildSeededHasher> CMap<C, S> {
pub fn from_slices_with_coding_conf<K, LSC, CSB, BS, BC>(
keys: &mut [K], values: &[C::Value],
value_coding: C, conf: CMapConf<BC, LSC, CSB, S>,
stats: &mut BS
) -> Self
where K: Hash,
LSC: LevelSizeChooser,
CSB: CollisionSolverBuilder + IsLossless,
BS: stats::BuildStatsCollector
{
Self::with_fragments(keys, &mut encode_all(&value_coding, values), value_coding, conf, stats)
}
pub fn from_slices_with_conf<K, LSC, CSB, BS, BC>(
keys: &mut [K], values: &[C::Value], conf: CMapConf<BC, LSC, CSB, S>, stats: &mut BS
) -> Self
where K: Hash,
LSC: LevelSizeChooser,
CSB: CollisionSolverBuilder + IsLossless,
BS: stats::BuildStatsCollector,
BC: BuildCoding<C::Value, Coding=C>
{
Self::from_slices_with_coding_conf(keys, values, conf.coding.build_from_iter(values, 0), conf, stats)
}
pub fn from_map_with_coding_conf<K, H, LSC, CSB, BS, BC>(
map: &HashMap<K, C::Value, H>, value_coding: C, conf: CMapConf<BC, LSC, CSB, S>, stats: &mut BS
) -> Self
where K: Hash + Clone,
LSC: LevelSizeChooser,
CSB: CollisionSolverBuilder+IsLossless,
BS: stats::BuildStatsCollector
{
let (mut keys, mut values) = encode_all_from_map(&value_coding, map);
Self::with_fragments(&mut keys, &mut values, value_coding, conf, stats)
}
pub fn from_map_with_conf<K, H, LSC, CSB, BS, BC>(
map: &HashMap<K, C::Value, H>, conf: CMapConf<BC, LSC, CSB, S>, stats: &mut BS
) -> Self
where K: Hash + Clone,
LSC: LevelSizeChooser,
CSB: CollisionSolverBuilder+IsLossless,
BS: stats::BuildStatsCollector,
BC: BuildCoding<C::Value, Coding=C>
{
Self::from_map_with_coding_conf(map, conf.coding.build_from_iter(map.values(), 0), conf, stats)
}
}
impl<C: Coding> CMap<C> {
pub fn from_slices_with_coding<K: Hash, BS: stats::BuildStatsCollector>(keys: &mut [K], values: &[C::Value], value_coding: C, stats: &mut BS) -> Self {
Self::from_slices_with_coding_conf(keys, values, value_coding, CMapConf::default(), stats)
}
}
impl<V: Hash + Eq + Clone> CMap<minimum_redundancy::Coding<V>> {
pub fn from_slices<K: Hash, BS: stats::BuildStatsCollector>(keys: &mut [K], values: &[V], stats: &mut BS) -> Self {
Self::from_slices_with_conf(keys, values, Default::default(), stats)
}
pub fn from_map<K: Hash + Clone, H, BS: stats::BuildStatsCollector>(map: &HashMap<K, V, H>, stats: &mut BS) -> Self {
Self::from_map_with_conf(map, Default::default(), stats)
}
}
impl<K: Hash + Clone, V: Hash + Eq + Clone, H> From<&HashMap<K, V, H>> for CMap<minimum_redundancy::Coding<V>> {
fn from(map: &HashMap<K, V, H>) -> Self {
Self::from_map(map, &mut ())
}
}
impl<K: Hash + Clone, V: Hash + Eq + Clone, H> From<HashMap<K, V, H>> for CMap<minimum_redundancy::Coding<V>> {
fn from(map: HashMap<K, V, H>) -> Self {
Self::from_map(&map, &mut ())
}
}
#[cfg(test)]
mod tests {
use super::*;
use binout::Serializer;
use maplit::hashmap;
use bitm::ceiling_div;
use crate::coding::BuildMinimumRedundancy;
fn test_read_write<C: SerializableCoding<Value=u8>>(bbmap: &CMap<C>) {
let mut buff = Vec::new();
bbmap.write(&mut buff, |b, v| AsIs::write(b, *v)).unwrap();
assert_eq!(buff.len(), bbmap.write_bytes(1));
let read = CMap::<C>::read(&mut &buff[..], |b| AsIs::read(b)).unwrap();
assert_eq!(bbmap.array.content, read.array.content);
assert_eq!(bbmap.level_sizes, read.level_sizes);
}
fn test_bbmap_invariants<C: Coding>(bbmap: &CMap<C>) {
assert_eq!(bbmap.level_sizes.iter().map(|v|*v as usize).sum::<usize>(), bbmap.array.content.len());
assert_eq!(
ceiling_div(bbmap.array.content.iter().map(|v|v.count_ones()).sum::<u32>() as usize * bbmap.value_coding.bits_per_fragment() as usize, 64),
bbmap.value_fragments.len()
);
}
fn test_4pairs<LSC: LevelSizeChooser>(conf: CMapConf<BuildMinimumRedundancy, LSC>) {
let bbmap = CMap::from_map_with_conf(&hashmap!('a'=>1u8, 'b'=>2u8, 'c'=>1u8, 'd'=>3u8), conf, &mut ());
assert_eq!(bbmap.get(&'a'), Some(&1));
assert_eq!(bbmap.get(&'b'), Some(&2));
assert_eq!(bbmap.get(&'c'), Some(&1));
assert_eq!(bbmap.get(&'d'), Some(&3));
test_bbmap_invariants(&bbmap);
test_read_write(&bbmap);
}
#[test]
fn with_hashmap_bpf1() {
test_4pairs(CMapConf::bpf(1));
}
fn test_8pairs<LSC: LevelSizeChooser>(conf: CMapConf<BuildMinimumRedundancy, LSC>) {
let bbmap = CMap::from_map_with_conf(&hashmap!(
'a' => 1, 'b' => 2, 'c' => 1, 'd' => 3,
'e' => 4, 'f' => 1, 'g' => 5, 'h' => 6), conf, &mut ());
assert_eq!(bbmap.get(&'a'), Some(&1));
assert_eq!(bbmap.get(&'b'), Some(&2));
assert_eq!(bbmap.get(&'c'), Some(&1));
assert_eq!(bbmap.get(&'d'), Some(&3));
assert_eq!(bbmap.get(&'e'), Some(&4));
assert_eq!(bbmap.get(&'f'), Some(&1));
assert_eq!(bbmap.get(&'g'), Some(&5));
assert_eq!(bbmap.get(&'h'), Some(&6));
test_bbmap_invariants(&bbmap);
test_read_write(&bbmap);
}
#[test]
fn with_hashmap_bpf2() {
test_8pairs(CMapConf::bpf(2));
}
}