#![allow(clippy::type_complexity)]
use crate::bits::BitFieldVec;
use crate::func::VFunc;
use crate::func::shard_edge::{Fuse3NoShards, FuseLge3Shards, ShardEdge};
use crate::traits::TryIntoUnaligned;
use crate::traits::Unaligned;
use crate::utils::*;
use mem_dbg::*;
use num_primitive::PrimitiveInteger;
use std::borrow::Borrow;
use value_traits::slices::SliceByValue;
use xxhash_rust::xxh3;
/// A key representing the first `bit_length` most-significant bits of an
/// integer `value`.
///
/// Used as the key type of the LCP-prefix → bucket [`VFunc`]: two prefixes
/// produce the same signature iff their masked bits and their lengths
/// coincide (see `pack_int_bit_prefix`).
#[derive(Debug, Clone, Copy, MemSize, MemDbg)]
#[mem_size(flat)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct IntBitPrefix<K> {
    /// The integer whose high-order `bit_length` bits form the prefix.
    value: K,
    /// Number of significant most-significant bits (0..=`K::BITS`).
    bit_length: usize,
}
impl<K: PrimitiveInteger> IntBitPrefix<K> {
    /// Creates a prefix made of the `bit_length` most-significant bits of
    /// `value`.
    ///
    /// `bit_length` must not exceed the width of `K` (checked in debug
    /// builds only).
    #[inline]
    pub fn new(value: K, bit_length: usize) -> Self {
        debug_assert!(bit_length <= K::BITS as usize);
        Self { value, bit_length }
    }
    /// Returns `value` with every bit beyond the first `bit_length`
    /// most-significant bits cleared, so equal prefixes of equal length
    /// compare (and hash) equal regardless of the discarded low bits.
    #[inline]
    fn masked_value(&self) -> K {
        // An all-ones pattern valid for both signed and unsigned `K`:
        // unsigned: MIN | MAX = 0 | !0; signed: MIN | MAX sets the sign
        // bit plus all value bits.
        let all_ones = K::MIN | K::MAX;
        // Shifting left by `K::BITS - bit_length` keeps exactly the top
        // `bit_length` bits. `checked_shl` returns `None` when the shift
        // amount equals the full width, i.e. when `bit_length == 0`; the
        // empty prefix is then canonicalized to the constant `K::MIN`
        // (0 for unsigned types). NOTE(review): for signed `K` that
        // constant is nonzero, but callers only rely on build-time and
        // query-time consistency, which this provides.
        match all_ones.checked_shl(K::BITS - self.bit_length as u32) {
            Some(m) => self.value & m,
            None => K::MIN,
        }
    }
}
/// Serializes an [`IntBitPrefix`] into `buf` as the masked value followed by
/// the bit length, both in native byte order; returns the number of bytes
/// written.
///
/// `buf` must be large enough to hold `size_of::<K>() + size_of::<usize>()`
/// bytes; callers typically pass a 24-byte scratch buffer.
#[inline]
pub(crate) fn pack_int_bit_prefix<K: PrimitiveInteger>(
    bp: &IntBitPrefix<K>,
    buf: &mut [u8],
) -> usize {
    // Masked value first so that bits past `bit_length` never influence
    // the serialized form.
    let value_bytes: K::Bytes = bp.masked_value().to_ne_bytes();
    let value_bytes: &[u8] = value_bytes.borrow();
    let length_bytes = bp.bit_length.to_ne_bytes();
    let split = value_bytes.len();
    let total = split + length_bytes.len();
    buf[..split].copy_from_slice(value_bytes);
    buf[split..total].copy_from_slice(&length_bytes);
    total
}
/// One-word signature for integer bit prefixes: xxh3-hash the packed
/// (masked value, bit length) representation.
impl<K: PrimitiveInteger> ToSig<[u64; 1]> for IntBitPrefix<K> {
    #[inline]
    fn to_sig(key: impl Borrow<Self>, seed: u64) -> [u64; 1] {
        // 24 bytes covers the widest supported integer (16 bytes) plus the
        // 8-byte bit length.
        let mut scratch = [0u8; 24];
        let used = pack_int_bit_prefix(key.borrow(), &mut scratch);
        let mut state = xxh3::Xxh3::with_seed(seed);
        state.update(&scratch[..used]);
        <[u64; 1]>::from_hasher(&state)
    }
}
#[cfg(feature = "rayon")]
mod build {
use super::*;
use crate::func::VBuilder;
use anyhow::{Result, bail};
use dsi_progress_logger::ProgressLog;
use lender::*;
use rdst::RadixKey;
/// Returns the length, in bits, of the longest common prefix of `a` and `b`.
///
/// For distinct values this is strictly less than `K::BITS`; for equal
/// values `a ^ b` is zero and the result is exactly `K::BITS`.
#[inline(always)]
pub(crate) fn lcp_bits<K: PrimitiveInteger>(a: K, b: K) -> usize {
    (a ^ b).leading_zeros() as usize
}
/// Computes the base-2 logarithm of the bucket size used for `n` keys.
///
/// Evaluates the theoretical optimum ≈ 1 + 1.11·ln 2 + ln n − ln(1 + ln n),
/// rounds it up (to at least 1), and then rounds up to a power of two so
/// that bucket arithmetic can be done with shifts and masks; the result is
/// the exponent of that power of two. Returns 0 for `n <= 1`.
pub(crate) fn log2_bucket_size(n: usize) -> usize {
    match n {
        0 | 1 => 0,
        _ => {
            let ln_n = f64::ln(n as f64);
            let optimum = 1.0 + 1.11 * f64::ln(2.0) + ln_n - f64::ln(1.0 + ln_n);
            let rounded = optimum.ceil().max(1.0) as usize;
            rounded.next_power_of_two().ilog2() as usize
        }
    }
}
impl<
    K: PrimitiveInteger + ToSig<S0> + std::fmt::Debug + Send + Sync + Copy + Ord,
    S0: Sig + Send + Sync,
    E0: ShardEdge<S0, 3> + MemSize + mem_dbg::FlatType,
    S1: Sig + Send + Sync,
    E1: ShardEdge<S1, 3> + MemSize + mem_dbg::FlatType,
> LcpMmphfInt<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>
where
    IntBitPrefix<K>: ToSig<S1>,
    SigVal<S0, usize>: RadixKey,
    SigVal<E0::LocalSig, usize>: std::ops::BitXor + std::ops::BitXorAssign,
    SigVal<S1, usize>: RadixKey,
    SigVal<E1::LocalSig, usize>: std::ops::BitXor + std::ops::BitXorAssign,
{
    /// Builds the structure over `n` strictly increasing integer keys
    /// yielded by `keys`, using a default [`VBuilder`].
    pub fn try_new(
        keys: impl FallibleRewindableLender<
            RewindError: std::error::Error + Send + Sync + 'static,
            Error: std::error::Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend K>,
        n: usize,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        Self::try_new_with_builder(keys, n, VBuilder::default(), pl)
    }
    /// Like [`Self::try_new`], but with a caller-supplied [`VBuilder`] for
    /// the key → (LCP length, offset) function. The lender returned by the
    /// inner builder is discarded.
    pub fn try_new_with_builder(
        keys: impl FallibleRewindableLender<
            RewindError: std::error::Error + Send + Sync + 'static,
            Error: std::error::Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend K>,
        n: usize,
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        Self::try_new_inner(keys, n, builder, pl).map(|(mmphf, _)| mmphf)
    }
    /// Core sequential construction; returns the structure together with
    /// the lender so callers can keep using it.
    ///
    /// Construction scans the keys twice: the first pass verifies strict
    /// ordering and records, for each bucket of `2^log2_bucket_size`
    /// consecutive keys, its first key and the minimum LCP (in bits)
    /// between consecutive keys; the second pass (after `rewind`) builds a
    /// static function mapping each key to its packed
    /// `(LCP length << log2_bucket_size) | offset` value.
    pub(crate) fn try_new_inner<
        L: FallibleRewindableLender<
            RewindError: std::error::Error + Send + Sync + 'static,
            Error: std::error::Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend K>,
    >(
        mut keys: L,
        n: usize,
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<(Self, L)> {
        // Degenerate case: an empty key set uses empty inner functions.
        if n == 0 {
            return Ok((
                Self {
                    n: 0,
                    log2_bucket_size: 0,
                    offset_lcp_length: VFunc::empty(),
                    lcp2bucket: VFunc::empty(),
                },
                keys,
            ));
        }
        // Bucket geometry: power-of-two bucket size so offsets can be
        // extracted with shifts and masks.
        let log2_bs = log2_bucket_size(n);
        let bucket_size = 1usize << log2_bs;
        let bucket_mask = bucket_size - 1;
        let num_buckets = n.div_ceil(bucket_size);
        pl.info(format_args!(
            "Bucket size: 2^{log2_bs} = {bucket_size} ({num_buckets} buckets for {n} keys)"
        ));
        // First pass: per bucket, the first key and the minimal pairwise
        // LCP between consecutive keys of the bucket.
        let mut lcp_bit_lengths: Vec<usize> = Vec::with_capacity(num_buckets);
        let mut bucket_first_keys: Vec<K> = Vec::with_capacity(num_buckets);
        let mut prev_key: Option<K> = None;
        let mut curr_lcp_bits: usize = 0;
        let mut i = 0usize;
        while let Some(key) = keys.next()? {
            let key: K = *key;
            if let Some(prev) = prev_key {
                if key <= prev {
                    bail!(
                        "Keys are not in strictly increasing order at \
                         position {i}: {prev:?} >= {key:?}"
                    );
                }
            }
            let offset = i & bucket_mask;
            if offset == 0 {
                // Start of a new bucket: flush the previous bucket's LCP
                // and seed the new one with the maximum possible value
                // (kept as-is for singleton buckets).
                if i > 0 {
                    lcp_bit_lengths.push(curr_lcp_bits);
                }
                bucket_first_keys.push(key);
                curr_lcp_bits = K::BITS as usize;
            } else {
                // Inside a bucket the common prefix can only shrink.
                curr_lcp_bits = curr_lcp_bits.min(lcp_bits(key, prev_key.unwrap()));
            }
            prev_key = Some(key);
            i += 1;
        }
        assert_eq!(i, n, "Expected {n} keys but got {i}");
        // Flush the last (possibly partial) bucket.
        lcp_bit_lengths.push(curr_lcp_bits);
        assert_eq!(lcp_bit_lengths.len(), num_buckets);
        pl.info(format_args!("Building key → (LCP length, offset) map..."));
        let keys = keys.rewind()?;
        // Second pass: map every key to
        // (bucket LCP length << log2_bs) | (rank within bucket).
        let (offset_lcp_length, keys) =
            builder.expected_num_keys(n).try_build_func::<K, K, _, _>(
                keys,
                FromCloneableIntoIterator::new((0..n).map(|idx| {
                    (lcp_bit_lengths[idx >> log2_bs] << log2_bs) | (idx & bucket_mask)
                })),
                BitFieldVec::<Box<[usize]>>::new_padded,
                pl,
            )?;
        pl.info(format_args!(
            "Building LCP prefix → bucket map ({num_buckets} buckets)..."
        ));
        // Map each bucket's LCP-length prefix of its first key to the
        // bucket index. XOR with `K::MIN` maps signed keys to their
        // unsigned order (a no-op for unsigned types), matching the
        // transformation applied at query time in `get`.
        let lcp2bucket =
            <VFunc<IntBitPrefix<K>, BitFieldVec<Box<[usize]>>, S1, E1>>::try_new_with_builder(
                FromCloneableIntoIterator::new((0..num_buckets).map(|b| {
                    IntBitPrefix::new(bucket_first_keys[b] ^ K::MIN, lcp_bit_lengths[b])
                })),
                FromCloneableIntoIterator::new(0..num_buckets),
                num_buckets,
                VBuilder::default(),
                pl,
            )?;
        let result = Self {
            n,
            log2_bucket_size: log2_bs,
            offset_lcp_length,
            lcp2bucket,
        };
        // Report the space cost of the finished structure.
        let total_bits = result.mem_size(SizeFlags::default()) * 8;
        pl.info(format_args!(
            "Actual bit cost per key: {:.2} ({total_bits} bits for {n} keys)",
            total_bits as f64 / n as f64
        ));
        Ok((result, keys))
    }
    /// Parallel construction from a key slice, using a default
    /// [`VBuilder`].
    pub fn try_par_new(
        keys: &[K],
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        Self::try_par_new_with_builder(keys, VBuilder::default(), pl)
    }
    /// Like [`Self::try_par_new`], but with a caller-supplied [`VBuilder`].
    pub fn try_par_new_with_builder(
        keys: &[K],
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        Self::try_par_new_inner(keys, builder, pl)
    }
    /// Core parallel construction. The bucket scan is sequential (it needs
    /// consecutive-key LCPs); only the population of the key →
    /// (LCP length, offset) function is parallel.
    pub(crate) fn try_par_new_inner(
        keys: &[K],
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        let n = keys.len();
        // Degenerate case: an empty key set uses empty inner functions.
        if n == 0 {
            return Ok(Self {
                n: 0,
                log2_bucket_size: 0,
                offset_lcp_length: VFunc::empty(),
                lcp2bucket: VFunc::empty(),
            });
        }
        // Bucket geometry, identical to the sequential path.
        let log2_bs = log2_bucket_size(n);
        let bucket_size = 1usize << log2_bs;
        let bucket_mask = bucket_size - 1;
        let num_buckets = n.div_ceil(bucket_size);
        pl.info(format_args!(
            "Bucket size: 2^{log2_bs} = {bucket_size} ({num_buckets} buckets for {n} keys)"
        ));
        // Sequential scan: ordering check plus per-bucket first key and
        // minimal LCP, exactly as in `try_new_inner`.
        let mut lcp_bit_lengths: Vec<usize> = Vec::with_capacity(num_buckets);
        let mut bucket_first_keys: Vec<K> = Vec::with_capacity(num_buckets);
        let mut prev_key: Option<K> = None;
        let mut curr_lcp_bits: usize = 0;
        for (i, &key) in keys.iter().enumerate() {
            if let Some(prev) = prev_key {
                if key <= prev {
                    bail!(
                        "Keys are not in strictly increasing order at \
                         position {i}: {prev:?} >= {key:?}"
                    );
                }
            }
            let offset = i & bucket_mask;
            if offset == 0 {
                if i > 0 {
                    lcp_bit_lengths.push(curr_lcp_bits);
                }
                bucket_first_keys.push(key);
                curr_lcp_bits = K::BITS as usize;
            } else {
                curr_lcp_bits = curr_lcp_bits.min(lcp_bits(key, prev_key.unwrap()));
            }
            prev_key = Some(key);
        }
        // Flush the last (possibly partial) bucket.
        lcp_bit_lengths.push(curr_lcp_bits);
        assert_eq!(lcp_bit_lengths.len(), num_buckets);
        pl.info(format_args!(
            "Building key → (LCP length, offset) map (parallel)..."
        ));
        // Parallel population: each key's value is its packed
        // (LCP length, offset). The closure finalizes one seed attempt by
        // sizing the backing bit-field vector from the shard geometry and
        // filling it from the drained signature store.
        let offset_lcp_length = builder.expected_num_keys(n).try_par_populate_and_build(
            keys,
            &|i| (lcp_bit_lengths[i >> log2_bs] << log2_bs) | (i & bucket_mask),
            &mut |builder, seed, mut store, max_value, _num_keys, pl, _state: &mut ()| {
                // Width just large enough for the largest packed value.
                builder.bit_width = max_value.bit_len() as usize;
                let data = BitFieldVec::<Box<[usize]>>::new_padded(
                    builder.bit_width,
                    builder.shard_edge.num_vertices() * builder.shard_edge.num_shards(),
                );
                let func = builder.try_build_from_shard_iter(
                    seed,
                    data,
                    store.drain(),
                    &|_, sv| sv.val,
                    &|_| {},
                    pl,
                )?;
                Ok(func)
            },
            pl,
            (),
        )?;
        pl.info(format_args!(
            "Building LCP prefix → bucket map ({num_buckets} buckets)..."
        ));
        // Same prefix → bucket function as the sequential path; the XOR
        // with `K::MIN` normalizes signed keys to unsigned order.
        let lcp2bucket =
            <VFunc<IntBitPrefix<K>, BitFieldVec<Box<[usize]>>, S1, E1>>::try_new_with_builder(
                FromCloneableIntoIterator::new((0..num_buckets).map(|b| {
                    IntBitPrefix::new(bucket_first_keys[b] ^ K::MIN, lcp_bit_lengths[b])
                })),
                FromCloneableIntoIterator::new(0..num_buckets),
                num_buckets,
                VBuilder::default(),
                pl,
            )?;
        let result = Self {
            n,
            log2_bucket_size: log2_bs,
            offset_lcp_length,
            lcp2bucket,
        };
        // Report the space cost of the finished structure.
        let total_bits = result.mem_size(SizeFlags::default()) * 8;
        pl.info(format_args!(
            "Actual bit cost per key: {:.2} ({total_bits} bits for {n} keys)",
            total_bits as f64 / n as f64
        ));
        Ok(result)
    }
}
use crate::utils::lcp_len;
/// Returns the length in bits of the longest common prefix of `a` and `b`,
/// treating each slice as if followed by a virtual NUL (0x00) terminator.
///
/// When `DISTINCT` is `true` the caller guarantees `a != b`, so the
/// equal-slices case is not checked. When `DISTINCT` is `false`, equal
/// slices yield `len * 8 + 8`, i.e. the prefix extends through the whole
/// shared virtual terminator.
pub(crate) fn lcp_bits_nul<const DISTINCT: bool>(a: &[u8], b: &[u8]) -> usize {
    let min_len = a.len().min(b.len());
    // Byte position of the first mismatch within the shared length
    // (`lcp_len` returns `min_len` when the slices agree that far).
    let pos = lcp_len(&a[..min_len], &b[..min_len]);
    if pos < min_len {
        // Mismatch inside both slices: whole equal bytes plus the equal
        // high-order bits of the first differing byte.
        return pos * 8 + (a[pos] ^ b[pos]).leading_zeros() as usize;
    }
    if !DISTINCT && a.len() == b.len() {
        // Identical slices also share the 8 bits of the virtual NUL.
        return min_len * 8 + 8;
    }
    // One slice is a proper prefix of the other: the shorter one's virtual
    // NUL (0x00) is compared with the longer one's next byte, so the
    // shared bits are that byte's leading zeros.
    let longer = if a.len() > b.len() { a } else { b };
    debug_assert!(longer.len() > min_len);
    min_len * 8 + longer[min_len].leading_zeros() as usize
}
impl<
    K: ?Sized + AsRef<[u8]> + ToSig<S0> + std::fmt::Debug,
    S0: Sig + Send + Sync,
    E0: ShardEdge<S0, 3> + MemSize + mem_dbg::FlatType,
    S1: Sig + Send + Sync,
    E1: ShardEdge<S1, 3> + MemSize + mem_dbg::FlatType,
> LcpMmphf<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>
where
    BitPrefix: ToSig<S1>,
    SigVal<S0, usize>: RadixKey,
    SigVal<E0::LocalSig, usize>: std::ops::BitXor + std::ops::BitXorAssign,
    SigVal<S1, usize>: RadixKey,
    SigVal<E1::LocalSig, usize>: std::ops::BitXor + std::ops::BitXorAssign,
{
    /// Builds the structure over `n` strictly lexicographically increasing
    /// byte-string keys yielded by `keys`, using a default [`VBuilder`].
    pub fn try_new<B: ?Sized + AsRef<[u8]> + Borrow<K>>(
        keys: impl FallibleRewindableLender<
            RewindError: std::error::Error + Send + Sync + 'static,
            Error: std::error::Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend B>,
        n: usize,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        Self::try_new_with_builder(keys, n, VBuilder::default(), pl)
    }
    /// Like [`Self::try_new`], but with a caller-supplied [`VBuilder`] for
    /// the key → (LCP length, offset) function. The lender returned by the
    /// inner builder is discarded.
    pub fn try_new_with_builder<B: ?Sized + AsRef<[u8]> + Borrow<K>>(
        keys: impl FallibleRewindableLender<
            RewindError: std::error::Error + Send + Sync + 'static,
            Error: std::error::Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend B>,
        n: usize,
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self> {
        Self::try_new_inner(keys, n, builder, pl).map(|(mmphf, _)| mmphf)
    }
    /// Core sequential construction; returns the structure together with
    /// the lender so callers can keep using it.
    ///
    /// Mirrors the integer variant, but keys are byte strings with a
    /// virtual NUL (0x00) terminator: a singleton bucket's LCP is
    /// `(len + 1) * 8` bits and intra-bucket LCPs come from
    /// [`lcp_bits_nul`], so a bucket's prefix may extend one byte past its
    /// first key.
    pub(crate) fn try_new_inner<
        B: ?Sized + AsRef<[u8]> + Borrow<K>,
        L: FallibleRewindableLender<
            RewindError: std::error::Error + Send + Sync + 'static,
            Error: std::error::Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend B>,
    >(
        mut keys: L,
        n: usize,
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<(Self, L)> {
        // Degenerate case: an empty key set uses empty inner functions.
        if n == 0 {
            return Ok((
                Self {
                    n: 0,
                    log2_bucket_size: 0,
                    offset_lcp_length: VFunc::empty(),
                    lcp2bucket: VFunc::empty(),
                },
                keys,
            ));
        }
        // Bucket geometry: power-of-two bucket size so offsets can be
        // extracted with shifts and masks.
        let log2_bs = log2_bucket_size(n);
        let bucket_size = 1usize << log2_bs;
        let bucket_mask = bucket_size - 1;
        let num_buckets = n.div_ceil(bucket_size);
        pl.info(format_args!(
            "Bucket size: 2^{log2_bs} = {bucket_size} ({num_buckets} buckets for {n} keys)"
        ));
        // First pass: ordering check plus per-bucket first key (copied,
        // since the lender only lends) and minimal consecutive-key LCP.
        let mut lcp_bit_lengths: Vec<usize> = Vec::with_capacity(num_buckets);
        let mut bucket_first_keys: Vec<Vec<u8>> = Vec::with_capacity(num_buckets);
        let mut prev_key: Vec<u8> = Vec::new();
        let mut curr_lcp_bits: usize = 0;
        let mut i = 0usize;
        while let Some(key) = keys.next()? {
            let key_bytes: &[u8] = key.as_ref();
            if i > 0 && key_bytes <= prev_key.as_slice() {
                bail!(
                    "Keys are not in strictly increasing lexicographic \
                     order at position {i}"
                );
            }
            let offset = i & bucket_mask;
            if offset == 0 {
                // Start of a new bucket: flush the previous bucket's LCP
                // and seed the new one with the key's full length plus the
                // virtual NUL terminator.
                if i > 0 {
                    lcp_bit_lengths.push(curr_lcp_bits);
                }
                bucket_first_keys.push(key_bytes.to_vec());
                curr_lcp_bits = (key_bytes.len() + 1) * 8;
            } else {
                // `DISTINCT = true`: strict ordering guarantees the keys
                // differ, so the equal-slices case cannot occur.
                curr_lcp_bits = curr_lcp_bits.min(lcp_bits_nul::<true>(key_bytes, &prev_key));
            }
            // Reuse the same buffer for the previous key across iterations.
            prev_key.clear();
            prev_key.extend_from_slice(key_bytes);
            i += 1;
        }
        assert_eq!(i, n, "Expected {n} keys but got {i}");
        // Flush the last (possibly partial) bucket.
        lcp_bit_lengths.push(curr_lcp_bits);
        assert_eq!(lcp_bit_lengths.len(), num_buckets);
        pl.info(format_args!("Building key → (LCP length, offset) map..."));
        let keys = keys.rewind()?;
        // Second pass: map every key to
        // (bucket LCP length << log2_bs) | (rank within bucket).
        let (offset_lcp_length, keys) =
            builder.expected_num_keys(n).try_build_func::<K, B, _, _>(
                keys,
                FromCloneableIntoIterator::new((0..n).map(|idx| {
                    (lcp_bit_lengths[idx >> log2_bs] << log2_bs) | (idx & bucket_mask)
                })),
                BitFieldVec::<Box<[usize]>>::new_padded,
                pl,
            )?;
        pl.info(format_args!(
            "Building LCP prefix → bucket map ({num_buckets} buckets)..."
        ));
        // Append the 0x00 terminator so that `BitPrefix::new` can cover
        // LCP lengths up to `(len + 1) * 8` bits.
        let extended_first_keys: Vec<Vec<u8>> = bucket_first_keys
            .iter()
            .map(|k| {
                let mut v = Vec::with_capacity(k.len() + 1);
                v.extend_from_slice(k);
                v.push(0x00);
                v
            })
            .collect();
        // Map each bucket's LCP-length prefix of its (extended) first key
        // to the bucket index.
        let lcp2bucket =
            <VFunc<BitPrefix, BitFieldVec<Box<[usize]>>, S1, E1>>::try_new_with_builder(
                FromCloneableIntoIterator::new(
                    (0..num_buckets)
                        .map(|b| BitPrefix::new(&extended_first_keys[b], lcp_bit_lengths[b])),
                ),
                FromCloneableIntoIterator::new(0..num_buckets),
                num_buckets,
                VBuilder::default(),
                pl,
            )?;
        let result = Self {
            n,
            log2_bucket_size: log2_bs,
            offset_lcp_length,
            lcp2bucket,
        };
        // Report the space cost of the finished structure.
        let total_bits = result.mem_size(SizeFlags::default()) * 8;
        pl.info(format_args!(
            "Actual bit cost per key: {:.2} ({total_bits} bits for {n} keys)",
            total_bits as f64 / n as f64
        ));
        Ok((result, keys))
    }
    /// Parallel construction from a key slice, using a default
    /// [`VBuilder`].
    pub fn try_par_new<B: AsRef<[u8]> + Borrow<K> + Sync>(
        keys: &[B],
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self>
    where
        K: Sync,
    {
        Self::try_par_new_with_builder(keys, VBuilder::default(), pl)
    }
    /// Like [`Self::try_par_new`], but with a caller-supplied [`VBuilder`].
    pub fn try_par_new_with_builder<B: AsRef<[u8]> + Borrow<K> + Sync>(
        keys: &[B],
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self>
    where
        K: Sync,
    {
        Self::try_par_new_inner(keys, builder, pl)
    }
    /// Core parallel construction. The bucket scan is sequential (it needs
    /// consecutive-key LCPs); only the population of the key →
    /// (LCP length, offset) function is parallel.
    pub(crate) fn try_par_new_inner<B: AsRef<[u8]> + Borrow<K> + Sync>(
        keys: &[B],
        builder: VBuilder<BitFieldVec<Box<[usize]>>, S0, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> Result<Self>
    where
        K: Sync,
    {
        let n = keys.len();
        // Degenerate case: an empty key set uses empty inner functions.
        if n == 0 {
            return Ok(Self {
                n: 0,
                log2_bucket_size: 0,
                offset_lcp_length: VFunc::empty(),
                lcp2bucket: VFunc::empty(),
            });
        }
        // Bucket geometry, identical to the sequential path.
        let log2_bs = log2_bucket_size(n);
        let bucket_size = 1usize << log2_bs;
        let bucket_mask = bucket_size - 1;
        let num_buckets = n.div_ceil(bucket_size);
        pl.info(format_args!(
            "Bucket size: 2^{log2_bs} = {bucket_size} ({num_buckets} buckets for {n} keys)"
        ));
        // Sequential scan: ordering check plus per-bucket first key and
        // minimal LCP, exactly as in `try_new_inner`.
        let mut lcp_bit_lengths: Vec<usize> = Vec::with_capacity(num_buckets);
        let mut bucket_first_keys: Vec<Vec<u8>> = Vec::with_capacity(num_buckets);
        let mut prev_key: Vec<u8> = Vec::new();
        let mut curr_lcp_bits: usize = 0;
        for (i, key) in keys.iter().enumerate() {
            let key_bytes: &[u8] = key.as_ref();
            if i > 0 && key_bytes <= prev_key.as_slice() {
                bail!(
                    "Keys are not in strictly increasing lexicographic \
                     order at position {i}"
                );
            }
            let offset = i & bucket_mask;
            if offset == 0 {
                if i > 0 {
                    lcp_bit_lengths.push(curr_lcp_bits);
                }
                bucket_first_keys.push(key_bytes.to_vec());
                // Full key length plus the virtual NUL terminator.
                curr_lcp_bits = (key_bytes.len() + 1) * 8;
            } else {
                curr_lcp_bits = curr_lcp_bits.min(lcp_bits_nul::<true>(key_bytes, &prev_key));
            }
            prev_key.clear();
            prev_key.extend_from_slice(key_bytes);
        }
        // Flush the last (possibly partial) bucket.
        lcp_bit_lengths.push(curr_lcp_bits);
        assert_eq!(lcp_bit_lengths.len(), num_buckets);
        pl.info(format_args!(
            "Building key → (LCP length, offset) map (parallel)..."
        ));
        // Parallel population: each key's value is its packed
        // (LCP length, offset). The closure finalizes one seed attempt by
        // sizing the backing bit-field vector from the shard geometry and
        // filling it from the drained signature store.
        let offset_lcp_length = builder.expected_num_keys(n).try_par_populate_and_build(
            keys,
            &|i| (lcp_bit_lengths[i >> log2_bs] << log2_bs) | (i & bucket_mask),
            &mut |builder, seed, mut store, max_value, _num_keys, pl, _state: &mut ()| {
                // Width just large enough for the largest packed value.
                builder.bit_width = max_value.bit_len() as usize;
                let data = BitFieldVec::<Box<[usize]>>::new_padded(
                    builder.bit_width,
                    builder.shard_edge.num_vertices() * builder.shard_edge.num_shards(),
                );
                let func = builder.try_build_from_shard_iter(
                    seed,
                    data,
                    store.drain(),
                    &|_, sv| sv.val,
                    &|_| {},
                    pl,
                )?;
                Ok(func)
            },
            pl,
            (),
        )?;
        pl.info(format_args!(
            "Building LCP prefix → bucket map ({num_buckets} buckets)..."
        ));
        // Append the 0x00 terminator so that `BitPrefix::new` can cover
        // LCP lengths up to `(len + 1) * 8` bits.
        let extended_first_keys: Vec<Vec<u8>> = bucket_first_keys
            .iter()
            .map(|k| {
                let mut v = Vec::with_capacity(k.len() + 1);
                v.extend_from_slice(k);
                v.push(0x00);
                v
            })
            .collect();
        // Map each bucket's LCP-length prefix of its (extended) first key
        // to the bucket index.
        let lcp2bucket =
            <VFunc<BitPrefix, BitFieldVec<Box<[usize]>>, S1, E1>>::try_new_with_builder(
                FromCloneableIntoIterator::new(
                    (0..num_buckets)
                        .map(|b| BitPrefix::new(&extended_first_keys[b], lcp_bit_lengths[b])),
                ),
                FromCloneableIntoIterator::new(0..num_buckets),
                num_buckets,
                VBuilder::default(),
                pl,
            )?;
        let result = Self {
            n,
            log2_bucket_size: log2_bs,
            offset_lcp_length,
            lcp2bucket,
        };
        // Report the space cost of the finished structure.
        let total_bits = result.mem_size(SizeFlags::default()) * 8;
        pl.info(format_args!(
            "Actual bit cost per key: {:.2} ({total_bits} bits for {n} keys)",
            total_bits as f64 / n as f64
        ));
        Ok(result)
    }
}
}
#[cfg(feature = "rayon")]
pub(crate) use build::{lcp_bits, lcp_bits_nul, log2_bucket_size};
/// A monotone minimal perfect hash function for integer keys: maps each key
/// of the (strictly increasing) build set to its rank.
///
/// Composed of two static functions: `offset_lcp_length` maps a key to its
/// packed (bucket LCP length, offset in bucket) pair, and `lcp2bucket`
/// maps the LCP-length bit prefix of a bucket's first key to the bucket
/// index. Queries with keys outside the build set return arbitrary values.
#[derive(Debug, Clone, MemSize, MemDbg)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct LcpMmphfInt<
    K,
    D = BitFieldVec<Box<[usize]>>,
    S0 = [u64; 2],
    E0 = FuseLge3Shards,
    S1 = [u64; 1],
    E1 = Fuse3NoShards,
> {
    /// Number of keys in the build set.
    pub(crate) n: usize,
    /// Base-2 logarithm of the (power-of-two) bucket size.
    pub(crate) log2_bucket_size: usize,
    /// Maps a key to `(lcp_bit_length << log2_bucket_size) | offset`.
    pub(crate) offset_lcp_length: VFunc<K, D, S0, E0>,
    /// Maps a bucket's LCP prefix to the bucket index.
    pub(crate) lcp2bucket: VFunc<IntBitPrefix<K>, D, S1, E1>,
}
impl<
    K: PrimitiveInteger + ToSig<S0>,
    D: SliceByValue<Value = usize>,
    S0: Sig,
    E0: ShardEdge<S0, 3>,
    S1: Sig,
    E1: ShardEdge<S1, 3>,
> LcpMmphfInt<K, D, S0, E0, S1, E1>
where
    IntBitPrefix<K>: ToSig<S1>,
{
    /// Returns the rank of `key` among the keys the structure was built
    /// on; the result for keys outside the build set is arbitrary.
    #[inline]
    pub fn get(&self, key: K) -> usize {
        let shift = self.log2_bucket_size;
        let mask = (1usize << shift) - 1;
        // The first function packs (LCP length, offset within bucket).
        let encoded = self.offset_lcp_length.get(key);
        let lcp_len = encoded >> shift;
        let in_bucket = encoded & mask;
        // XOR with `K::MIN` maps signed keys to their unsigned order,
        // matching the transformation applied at construction time.
        let bucket = self.lcp2bucket.get(IntBitPrefix::new(key ^ K::MIN, lcp_len));
        (bucket << shift) + in_bucket
    }
}
impl<
    K: PrimitiveInteger,
    D: SliceByValue<Value = usize>,
    S0: Sig,
    E0: ShardEdge<S0, 3>,
    S1: Sig,
    E1: ShardEdge<S1, 3>,
> LcpMmphfInt<K, D, S0, E0, S1, E1>
{
    /// Returns the number of keys the function was built on.
    pub const fn len(&self) -> usize {
        self.n
    }
    /// Returns `true` if the function was built on no keys.
    pub const fn is_empty(&self) -> bool {
        self.n == 0
    }
}
/// A key representing the first `bit_length` bits of a byte string.
///
/// Hashed via [`bit_prefix_sig`], which masks the trailing bits of the
/// final partial byte and mixes in the bit length, so equal-length
/// prefixes hash identically regardless of the discarded bits.
#[derive(Debug, Clone, MemSize, MemDbg)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct BitPrefix {
    /// The bytes covering the prefix (`bit_length.div_ceil(8)` of them).
    bytes: Vec<u8>,
    /// Number of significant leading bits.
    bit_length: usize,
}
impl BitPrefix {
    /// Creates a prefix of the first `bit_length` bits of `bytes`.
    ///
    /// Only the bytes actually covered by the prefix are stored; `bytes`
    /// must contain at least `bit_length` bits (checked in debug builds
    /// only).
    #[inline]
    pub fn new(bytes: &[u8], bit_length: usize) -> Self {
        debug_assert!(bytes.len() * 8 >= bit_length);
        // Keep just enough bytes to cover `bit_length` bits.
        let stored = bytes[..bit_length.div_ceil(8)].to_vec();
        Self {
            bytes: stored,
            bit_length,
        }
    }
}
/// Feeds the first `bit_length` bits of `bytes`, followed by the bit length
/// itself, into `hasher`.
///
/// Bits past `bit_length` in the final partial byte are masked to zero, so
/// two equal-length prefixes hash identically regardless of trailing bits;
/// appending the length distinguishes prefixes of different lengths that
/// share the same masked bytes.
#[inline]
pub(crate) fn hash_bit_prefix_raw(hasher: &mut xxh3::Xxh3, bytes: &[u8], bit_length: usize) {
    let whole = bit_length / 8;
    let partial = bit_length % 8;
    hasher.update(&bytes[..whole]);
    if partial != 0 {
        // Keep only the top `partial` bits of the next byte.
        let keep_high = !(0xFFu8 >> partial);
        hasher.update(&[bytes[whole] & keep_high]);
    }
    hasher.update(&bit_length.to_ne_bytes());
}
/// Computes the signature of the first `bit_length` bits of `bytes` with
/// the given `seed`, using the byte stream produced by
/// [`hash_bit_prefix_raw`].
///
/// Shared by [`BitPrefix`]'s [`ToSig`] impl and by [`LcpMmphf::get`] so
/// that build-time and query-time hashing agree.
#[inline]
pub(crate) fn bit_prefix_sig<S: Sig>(bytes: &[u8], bit_length: usize, seed: u64) -> S {
    let mut hasher = xxh3::Xxh3::with_seed(seed);
    hash_bit_prefix_raw(&mut hasher, bytes, bit_length);
    S::from_hasher(&hasher)
}
/// One-word signature for byte-string bit prefixes, delegating to
/// [`bit_prefix_sig`].
impl ToSig<[u64; 1]> for BitPrefix {
    #[inline]
    fn to_sig(key: impl Borrow<Self>, seed: u64) -> [u64; 1] {
        let prefix = key.borrow();
        bit_prefix_sig(prefix.bytes.as_slice(), prefix.bit_length, seed)
    }
}
/// A monotone minimal perfect hash function for byte-string keys: maps each
/// key of the (strictly lexicographically increasing) build set to its
/// rank.
///
/// Composed of two static functions: `offset_lcp_length` maps a key to its
/// packed (bucket LCP length, offset in bucket) pair, and `lcp2bucket`
/// maps the LCP-length bit prefix of a bucket's (NUL-extended) first key
/// to the bucket index. Queries with keys outside the build set return
/// arbitrary values.
#[derive(Debug, Clone, MemSize, MemDbg)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct LcpMmphf<
    K: ?Sized,
    D = BitFieldVec<Box<[usize]>>,
    S0 = [u64; 2],
    E0 = FuseLge3Shards,
    S1 = [u64; 1],
    E1 = Fuse3NoShards,
> {
    /// Number of keys in the build set.
    pub(crate) n: usize,
    /// Base-2 logarithm of the (power-of-two) bucket size.
    pub(crate) log2_bucket_size: usize,
    /// Maps a key to `(lcp_bit_length << log2_bucket_size) | offset`.
    pub(crate) offset_lcp_length: VFunc<K, D, S0, E0>,
    /// Maps a bucket's LCP prefix to the bucket index.
    pub(crate) lcp2bucket: VFunc<BitPrefix, D, S1, E1>,
}
/// Convenience alias of [`LcpMmphf`] for `str` keys.
pub type LcpMmphfStr<
    D = BitFieldVec<Box<[usize]>>,
    S0 = [u64; 2],
    E0 = FuseLge3Shards,
    S1 = [u64; 1],
    E1 = Fuse3NoShards,
> = LcpMmphf<str, D, S0, E0, S1, E1>;
/// Convenience alias of [`LcpMmphf`] for `[u8]` keys.
pub type LcpMmphfSliceU8<
    D = BitFieldVec<Box<[usize]>>,
    S0 = [u64; 2],
    E0 = FuseLge3Shards,
    S1 = [u64; 1],
    E1 = Fuse3NoShards,
> = LcpMmphf<[u8], D, S0, E0, S1, E1>;
impl<
    K: ?Sized + AsRef<[u8]> + ToSig<S0>,
    D: SliceByValue<Value = usize>,
    S0: Sig,
    E0: ShardEdge<S0, 3>,
    S1: Sig,
    E1: ShardEdge<S1, 3>,
> LcpMmphf<K, D, S0, E0, S1, E1>
where
    BitPrefix: ToSig<S1>,
{
    /// Returns the rank of `key` among the keys the structure was built
    /// on; the result for keys outside the build set is arbitrary.
    #[inline]
    pub fn get(&self, key: &K) -> usize {
        // The first function packs (LCP length, offset within bucket).
        let packed = self.offset_lcp_length.get(key);
        let lcp_bit_length = packed >> self.log2_bucket_size;
        let offset = packed & ((1 << self.log2_bucket_size) - 1);
        let key_bytes: &[u8] = key.as_ref();
        let seed = self.lcp2bucket.seed;
        // Hash the first `lcp_bit_length` bits of the key with the exact
        // byte stream used at build time, where each bucket's first key
        // was extended with a 0x00 terminator before prefix hashing.
        let sig: S1 = if lcp_bit_length <= key_bytes.len() * 8 {
            // The prefix fits inside the key itself: regular prefix hash.
            bit_prefix_sig(key_bytes, lcp_bit_length, seed)
        } else {
            // The prefix extends into the virtual NUL terminator: emit the
            // whole key, one 0x00 byte, and the bit length. This matches
            // `hash_bit_prefix_raw` on the NUL-extended key byte-for-byte,
            // because the bits past the key are all zero after masking.
            let mut hasher = xxh3::Xxh3::with_seed(seed);
            hasher.update(key_bytes);
            hasher.update(&[0u8]);
            hasher.update(&lcp_bit_length.to_ne_bytes());
            S1::from_hasher(&hasher)
        };
        let bucket = self.lcp2bucket.get_by_sig(sig);
        (bucket << self.log2_bucket_size) + offset
    }
}
impl<K: ?Sized, D: SliceByValue, S0: Sig, E0: ShardEdge<S0, 3>, S1: Sig, E1: ShardEdge<S1, 3>>
    LcpMmphf<K, D, S0, E0, S1, E1>
{
    /// Returns the number of keys the function was built on.
    pub const fn len(&self) -> usize {
        self.n
    }
    /// Returns `true` if the function was built on no keys.
    pub const fn is_empty(&self) -> bool {
        self.n == 0
    }
}
/// Rebuilds an aligned [`LcpMmphfInt`] from its unaligned (deserialized)
/// counterpart by converting each inner function.
impl<K, S0: Sig, E0: ShardEdge<S0, 3>, S1: Sig, E1: ShardEdge<S1, 3>>
    From<Unaligned<LcpMmphfInt<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>>>
    for LcpMmphfInt<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>
{
    fn from(
        unaligned: Unaligned<LcpMmphfInt<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>>,
    ) -> Self {
        // Scalars copy over directly; the two inner functions go through
        // their own `From<Unaligned<_>>` conversions.
        let n = unaligned.n;
        let log2_bucket_size = unaligned.log2_bucket_size;
        let offset_lcp_length = unaligned.offset_lcp_length.into();
        let lcp2bucket = unaligned.lcp2bucket.into();
        LcpMmphfInt {
            n,
            log2_bucket_size,
            offset_lcp_length,
            lcp2bucket,
        }
    }
}
/// Converts an aligned [`LcpMmphfInt`] into its unaligned form, failing if
/// either inner function cannot be converted.
impl<K, S0: Sig, E0: ShardEdge<S0, 3>, S1: Sig, E1: ShardEdge<S1, 3>> TryIntoUnaligned
    for LcpMmphfInt<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>
{
    type Unaligned = LcpMmphfInt<K, Unaligned<BitFieldVec<Box<[usize]>>>, S0, E0, S1, E1>;
    fn try_into_unaligned(
        self,
    ) -> Result<Self::Unaligned, crate::traits::UnalignedConversionError> {
        // Convert both inner functions first so any error propagates
        // before the result is assembled.
        let offset_lcp_length = self.offset_lcp_length.try_into_unaligned()?;
        let lcp2bucket = self.lcp2bucket.try_into_unaligned()?;
        Ok(LcpMmphfInt {
            n: self.n,
            log2_bucket_size: self.log2_bucket_size,
            offset_lcp_length,
            lcp2bucket,
        })
    }
}
/// Rebuilds an aligned [`LcpMmphf`] from its unaligned (deserialized)
/// counterpart by converting each inner function.
impl<K: ?Sized, S0: Sig, E0: ShardEdge<S0, 3>, S1: Sig, E1: ShardEdge<S1, 3>>
    From<Unaligned<LcpMmphf<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>>>
    for LcpMmphf<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>
{
    fn from(
        unaligned: Unaligned<LcpMmphf<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>>,
    ) -> Self {
        // Scalars copy over directly; the two inner functions go through
        // their own `From<Unaligned<_>>` conversions.
        let n = unaligned.n;
        let log2_bucket_size = unaligned.log2_bucket_size;
        let offset_lcp_length = unaligned.offset_lcp_length.into();
        let lcp2bucket = unaligned.lcp2bucket.into();
        LcpMmphf {
            n,
            log2_bucket_size,
            offset_lcp_length,
            lcp2bucket,
        }
    }
}
/// Converts an aligned [`LcpMmphf`] into its unaligned form, failing if
/// either inner function cannot be converted.
impl<K: ?Sized, S0: Sig, E0: ShardEdge<S0, 3>, S1: Sig, E1: ShardEdge<S1, 3>> TryIntoUnaligned
    for LcpMmphf<K, BitFieldVec<Box<[usize]>>, S0, E0, S1, E1>
{
    type Unaligned = LcpMmphf<K, Unaligned<BitFieldVec<Box<[usize]>>>, S0, E0, S1, E1>;
    fn try_into_unaligned(
        self,
    ) -> Result<Self::Unaligned, crate::traits::UnalignedConversionError> {
        // Convert both inner functions first so any error propagates
        // before the result is assembled.
        let offset_lcp_length = self.offset_lcp_length.try_into_unaligned()?;
        let lcp2bucket = self.lcp2bucket.try_into_unaligned()?;
        Ok(LcpMmphf {
            n: self.n,
            log2_bucket_size: self.log2_bucket_size,
            offset_lcp_length,
            lcp2bucket,
        })
    }
}