use std::{hash::Hash, io, usize};
use crate::{phast::Params, seeds::{Bits8, SeedSize}};
use super::{bits_per_seed_to_100_bucket_size, builder::{build_mt, build_st}, conf::Conf, seed_chooser::{SeedChooser, SeedOnly}, CompressedArray, DefaultCompressedArray, WINDOW_SIZE};
use binout::{Serializer, VByte};
use bitm::BitAccess;
use dyn_size_of::GetSize;
use seedable_hash::{BuildDefaultSeededHasher, BuildSeededHasher};
use voracious_radix_sort::RadixSort;
use rayon::prelude::*;
pub(crate) struct SeedEx<SSVecElement> {
pub(crate) seeds: Box<[SSVecElement]>,
pub(crate) conf: Conf,
}
impl<SSVecElement> SeedEx<SSVecElement> {
#[inline(always)]
pub(crate) fn bucket_for(&self, key: u64) -> usize { self.conf.bucket_for(key) }
#[inline(always)]
pub(crate) unsafe fn seed_for<SS>(&self, seed_size: SS, key: u64) -> u16 where SS: SeedSize<VecElement=SSVecElement> {
seed_size.get_seed(&self.seeds, self.bucket_for(key))
}
pub fn write<SS: SeedSize<VecElement = SSVecElement>>(&self, output: &mut dyn io::Write, seed_size: SS) -> io::Result<()>
{
self.conf.write(output)?;
seed_size.write_seed_vec(output, &self.seeds)
}
pub fn read<SS: SeedSize<VecElement = SSVecElement>>(input: &mut dyn io::Read) -> io::Result<(SS, Self)> {
let conf = Conf::read(input)?;
let (seed_size, seeds) = SS::read_seed_vec(input, conf.buckets_num)?;
Ok((seed_size, Self{ seeds, conf }))
}
}
impl<SSVecElement: GetSize> SeedEx<SSVecElement> {
pub fn write_bytes(&self) -> usize {
self.conf.write_bytes() + GetSize::size_bytes_dyn(&self.seeds)
}
}
impl<SSVecElement: GetSize> GetSize for SeedEx<SSVecElement> {
fn size_bytes_dyn(&self) -> usize { self.seeds.size_bytes_dyn() }
fn size_bytes_content_dyn(&self) -> usize { self.seeds.size_bytes_content_dyn() }
const USES_DYN_MEM: bool = true;
}
pub(crate) struct Level<SSVecElement> {
pub(crate) seeds: SeedEx<SSVecElement>,
pub(crate) shift: usize
}
impl<SSVecElement: GetSize> GetSize for Level<SSVecElement> {
fn size_bytes_dyn(&self) -> usize { self.seeds.size_bytes_dyn() }
fn size_bytes_content_dyn(&self) -> usize { self.seeds.size_bytes_content_dyn() }
const USES_DYN_MEM: bool = true;
}
impl<SSVecElement: GetSize> Level<SSVecElement> {
pub fn write_bytes(&self) -> usize {
VByte::size(self.shift) + self.seeds.write_bytes()
}
}
impl<SSVecElement> Level<SSVecElement> {
pub fn write<SS: SeedSize<VecElement = SSVecElement>>(&self, output: &mut dyn io::Write, seed_size: SS) -> io::Result<()>
{
VByte::write(output, self.shift)?;
self.seeds.write(output, seed_size)
}
pub fn read<SS: SeedSize<VecElement = SSVecElement>>(input: &mut dyn io::Read) -> io::Result<(SS, Self)> {
let shift = VByte::read(input)?;
SeedEx::<SSVecElement>::read::<SS>(input).map(|(ss, seeds)| (ss, Self{ seeds, shift }))
}
}
#[inline]
pub(crate) fn build_level_from_slice_st<K, SS, SC, S>(keys: &[K], params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64)
-> (Vec<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize)
where K: Hash+Clone, SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher
{
let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
hashes.voracious_sort();
let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
let (seeds, builder) =
build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
drop(builder);
let mut keys_vec = Vec::with_capacity(unassigned_len);
keys_vec.extend(keys.into_iter().filter(|key| {
unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
}).cloned());
(keys_vec, SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
}
#[inline]
pub(crate) fn build_level_from_slice_mt<K, SS, SC, S>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
-> (Vec<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize)
where K: Hash+Sync+Send+Clone, SS: SeedSize, SC: SeedChooser+Sync, S: BuildSeededHasher+Sync
{
let mut hashes: Box<[_]> = if keys.len() > 4*2048 { keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
} else {
keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
};
hashes.voracious_mt_sort(threads_num);
let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
let (seeds, builder) =
build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
drop(builder);
let mut keys_vec = Vec::with_capacity(unassigned_len);
keys_vec.par_extend(keys.into_par_iter().filter(|key| {
unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
}).cloned());
(keys_vec, SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
}
#[inline(always)]
pub(crate) fn build_level_st<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64)
-> (SeedEx<SS::VecElement>, Box<[u64]>, usize)
where K: Hash, SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher
{
let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
hashes.voracious_sort();
let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
let (seeds, builder) =
build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
drop(builder);
keys.retain(|key| {
unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
});
(SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
}
#[inline]
pub(crate) fn build_level_mt<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
-> (SeedEx<SS::VecElement>, Box<[u64]>, usize)
where K: Hash+Sync+Send, SS: SeedSize, SC: SeedChooser+Sync, S: BuildSeededHasher+Sync
{
let mut hashes: Box<[_]> = if keys.len() > 4*2048 { keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
} else {
keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
};
hashes.voracious_mt_sort(threads_num);
let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
let (seeds, builder) =
build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
drop(builder);
let mut result = Vec::with_capacity(unassigned_len);
std::mem::swap(keys, &mut result);
keys.par_extend(result.into_par_iter().filter(|key| {
unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
}));
(SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
}
pub struct Function<SS, SC = SeedOnly, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
where SS: SeedSize
{
level0: SeedEx<SS::VecElement>,
unassigned: CA,
levels: Box<[Level<SS::VecElement>]>,
hasher: S,
seed_chooser: SC,
seed_size: SS, }
impl<SC, SS: SeedSize, CA, S> GetSize for Function<SS, SC, CA, S> where CA: GetSize {
fn size_bytes_dyn(&self) -> usize {
self.level0.size_bytes_dyn() +
self.unassigned.size_bytes_dyn() +
self.levels.size_bytes_dyn()
}
fn size_bytes_content_dyn(&self) -> usize {
self.level0.size_bytes_content_dyn() +
self.unassigned.size_bytes_content_dyn() +
self.levels.size_bytes_content_dyn()
}
const USES_DYN_MEM: bool = true;
}
impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> Function<SS, SC, CA, S> {
#[inline(always)] pub fn get<K>(&self, key: &K) -> usize where K: Hash + ?Sized {
let key_hash = self.hasher.hash_one(key, 0);
let seed = unsafe{ self.level0.seed_for(self.seed_size, key_hash) };
if seed != 0 { return self.seed_chooser.f(key_hash, seed, &self.level0.conf); }
for level_nr in 0..self.levels.len() {
let l = &self.levels[level_nr];
let key_hash = self.hasher.hash_one(key, level_nr as u64 + 1);
let seed = unsafe { l.seeds.seed_for(self.seed_size, key_hash) };
if seed != 0 {
return self.unassigned.get(self.seed_chooser.f(key_hash, seed, &l.seeds.conf) + l.shift)
}
}
unreachable!()
}
pub fn with_vec_p_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash {
let number_of_keys = keys.len();
Self::_new(|h| {
let (level0, unassigned_values, unassigned_len) =
build_level_st(&mut keys, params, h, seed_chooser, 0);
(keys, level0, unassigned_values, unassigned_len)
}, |keys, level_nr, h| {
build_level_st(keys, params, h, seed_chooser, level_nr)
}, hasher, seed_chooser, params.seed_size, number_of_keys)
}
pub fn with_vec_p_threads_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
where K: Hash+Sync+Send, S: Sync, SC: Sync
{
if threads_num == 1 { return Self::with_vec_p_hash_sc(keys, params, hasher, seed_chooser); }
let number_of_keys = keys.len();
Self::_new(|h| {
let (level0, unassigned_values, unassigned_len) =
build_level_mt(&mut keys, params, threads_num, h, seed_chooser, 0);
(keys, level0, unassigned_values, unassigned_len)
}, |keys, level_nr, h| {
build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
}, hasher, seed_chooser, params.seed_size, number_of_keys)
}
pub fn with_slice_p_hash_sc<K>(keys: &[K], params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash+Clone {
Self::_new(|h| {
build_level_from_slice_st(keys, params, h, seed_chooser, 0)
}, |keys, level_nr, h| {
build_level_st(keys, params, h, seed_chooser, level_nr)
}, hasher, seed_chooser, params.seed_size, keys.len())
}
pub fn with_slice_p_threads_hash_sc<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync {
if threads_num == 1 { return Self::with_slice_p_hash_sc(keys, params, hasher, seed_chooser); }
Self::_new(|h| {
build_level_from_slice_mt(keys, params, threads_num, h, seed_chooser, 0)
}, |keys, level_nr, h| {
build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
}, hasher, seed_chooser, params.seed_size, keys.len())
}
#[inline]
fn _new<K, BF, BL>(build_first: BF, build_level: BL, hasher: S, seed_chooser: SC, seed_size: SS, number_of_keys: usize) -> Self
where BF: FnOnce(&S) -> (Vec::<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize),
BL: Fn(&mut Vec::<K>, u64, &S) -> (SeedEx<SS::VecElement>, Box<[u64]>, usize),
K: Hash
{
let (mut keys, level0, unassigned_values, unassigned_len) = build_first(&hasher);
debug_assert_eq!(unassigned_len, unassigned_values.bit_ones().count());
let mut level0_unassigned = unassigned_values.bit_ones();
let mut unassigned = Vec::with_capacity(unassigned_len * 3 / 2);
let mut levels = Vec::with_capacity(16);
let mut last = 0;
while !keys.is_empty() {
let keys_len = keys.len();
let (seeds, unassigned_values, _unassigned_len) =
build_level(&mut keys, levels.len() as u64+1, &hasher);
let shift = unassigned.len();
for i in 0..keys_len {
if CA::MAX_FOR_UNUSED {
if !unsafe{unassigned_values.get_bit_unchecked(i)} {
last = level0_unassigned.next().unwrap();
unassigned.push(last);
} else {
unassigned.push(usize::MAX);
}
} else {
if !unsafe{unassigned_values.get_bit_unchecked(i)} {
last = level0_unassigned.next().unwrap();
}
unassigned.push(last);
}
}
levels.push(Level { seeds, shift });
}
debug_assert!(level0_unassigned.next().is_none());
drop(level0_unassigned);
Self {
level0,
unassigned: CA::new(unassigned, last, number_of_keys),
levels: levels.into_boxed_slice(),
hasher,
seed_chooser,
seed_size
}
}
#[inline(always)] pub fn minimal_output_range(&self, num_of_keys: usize) -> usize { self.seed_chooser.minimal_output_range(num_of_keys) }
pub fn output_range(&self) -> usize {
self.level0.conf.output_range(self.seed_chooser, self.seed_size.into())
}
pub fn write_bytes(&self) -> usize {
self.level0.write_bytes() +
self.unassigned.write_bytes() +
VByte::size(self.levels.len()) +
self.levels.iter().map(|l| l.size_bytes()).sum::<usize>()
}
pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
{
self.level0.write(output, self.seed_size)?;
self.unassigned.write(output)?;
VByte::write(output, self.levels.len())?;
for level in &self.levels {
level.write(output, self.seed_size)?;
}
Ok(())
}
pub fn read_with_hasher_sc(input: &mut dyn io::Read, hasher: S, seed_chooser: SC) -> io::Result<Self> {
let (seed_size, level0) = SeedEx::read(input)?;
let unassigned = CA::read(input)?;
let levels_num: usize = VByte::read(input)?;
let mut levels = Vec::with_capacity(levels_num);
for _ in 0..levels_num {
levels.push(Level::read::<SS>(input)?.1);
}
Ok(Self { level0, unassigned, levels: levels.into_boxed_slice(), hasher, seed_chooser, seed_size })
}
}
impl<SS: SeedSize> Function<SS, SeedOnly, DefaultCompressedArray, BuildDefaultSeededHasher> {
pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
Self::read_with_hasher_sc(input, BuildDefaultSeededHasher::default(), SeedOnly)
}
}
impl Function<Bits8, SeedOnly, DefaultCompressedArray, BuildDefaultSeededHasher> {
pub fn from_vec_st<K>(keys: Vec::<K>) -> Self where K: Hash {
Self::with_vec_p_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
BuildDefaultSeededHasher::default(), SeedOnly)
}
pub fn from_vec_mt<K>(keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
}
pub fn from_slice_st<K>(keys: &[K]) -> Self where K: Hash+Clone {
Self::with_slice_p_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
BuildDefaultSeededHasher::default(), SeedOnly)
}
pub fn from_slice_mt<K>(keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
}
}
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::utils::tests::test_mphf;
fn test_read_write<SS: SeedSize>(h: &Function::<SS>) where SS::VecElement: std::fmt::Debug+PartialEq {
let mut buff = Vec::new();
h.write(&mut buff).unwrap();
let read = Function::<SS>::read(&mut &buff[..]).unwrap();
assert_eq!(h.level0.conf, read.level0.conf);
assert_eq!(h.levels.len(), read.levels.len());
for (hl, rl) in h.levels.iter().zip(&read.levels) {
assert_eq!(hl.shift, rl.shift);
assert_eq!(hl.seeds.conf, rl.seeds.conf);
assert_eq!(hl.seeds.seeds, rl.seeds.seeds);
}
}
#[test]
fn test_small() {
let input = [1, 2, 3, 4, 5];
let f = Function::from_slice_st(&input);
test_mphf(&input, |key| Some(f.get(key)));
test_read_write(&f);
}
}