use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use std::cmp::Ordering;
use std::num::NonZeroU32;
use std::ops::Index;
/// An opaque handle to an interned value.
///
/// Backed by a `NonZeroU32` so that `Option<Interned>` occupies the same
/// space as `Interned` itself (keys handed out start at 1 — see
/// `InternBuffer::append`).
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Interned(NonZeroU32);
/// Interns byte values, assigning each distinct value an [`Interned`] key
/// whose "normalized" byte form (see [`OrderPreservingInterner::normalized_key`])
/// compares lexicographically in the same order as the original values.
#[derive(Debug, Default)]
pub struct OrderPreservingInterner {
    /// Normalized key bytes; `keys[interned]` is the order-preserving key
    keys: InternBuffer,
    /// Original value bytes; `values[interned]` is the interned value
    values: InternBuffer,
    /// Root of the bucket trie that assigns order-preserving key bytes
    bucket: Box<Bucket>,
    /// Hash state for `lookup`; hashes are always computed explicitly with
    /// this, since the map itself has no build-hasher
    hasher: ahash::RandomState,
    /// Deduplication map from value bytes to key, probed via the raw-entry
    /// API with precomputed hashes — hence the `()` build-hasher
    lookup: HashMap<Interned, (), ()>,
}
impl OrderPreservingInterner {
    /// Interns the provided values, returning one key per input element
    /// (`None` inputs yield `None` outputs, in position).
    ///
    /// Duplicate values — within this call or across calls — receive the same
    /// key, and keys' normalized bytes compare in the same order as the
    /// values they were interned from.
    pub fn intern<I, V>(&mut self, input: I) -> Vec<Option<Interned>>
    where
        I: IntoIterator<Item = Option<V>>,
        V: AsRef<[u8]>,
    {
        let iter = input.into_iter();
        let capacity = iter.size_hint().0;
        let mut out = Vec::with_capacity(capacity);

        // Values not yet interned: (index into `out`, precomputed hash, value)
        let mut to_intern: Vec<(usize, u64, V)> = Vec::with_capacity(capacity);
        // Total byte length of the values queued in `to_intern`
        let mut to_intern_len = 0;

        // First pass: resolve values that are already interned, queue the rest
        for (idx, item) in iter.enumerate() {
            let value: V = match item {
                Some(value) => value,
                None => {
                    out.push(None);
                    continue;
                }
            };

            let v = value.as_ref();
            let hash = self.hasher.hash_one(v);
            // Raw-entry probe by precomputed hash, comparing stored value bytes
            let entry = self
                .lookup
                .raw_entry_mut()
                .from_hash(hash, |a| &self.values[*a] == v);

            match entry {
                RawEntryMut::Occupied(o) => out.push(Some(*o.key())),
                RawEntryMut::Vacant(_) => {
                    // Placeholder — overwritten by the second pass below
                    out.push(None);
                    to_intern_len += v.len();
                    to_intern.push((idx, hash, value));
                }
            };
        }

        // Insert new values in ascending order: `Bucket::insert_pos` skews its
        // probes so that sorted insertions leave gaps for later values
        to_intern.sort_unstable_by(|(_, _, a), (_, _, b)| a.as_ref().cmp(b.as_ref()));

        // Reserve up front; key length per value is an approximation
        self.keys.offsets.reserve(to_intern.len());
        self.keys.values.reserve(to_intern.len()); self.values.offsets.reserve(to_intern.len());
        self.values.values.reserve(to_intern_len);

        // Second pass: intern the queued values
        for (idx, hash, value) in to_intern {
            let val = value.as_ref();

            // Re-probe the map: an earlier iteration of this loop may already
            // have interned a duplicate of this value
            let entry = self
                .lookup
                .raw_entry_mut()
                .from_hash(hash, |a| &self.values[*a] == val);

            match entry {
                RawEntryMut::Occupied(o) => {
                    out[idx] = Some(*o.key());
                }
                RawEntryMut::Vacant(v) => {
                    let val = value.as_ref();
                    // Appends the value bytes to `self.values` and the
                    // order-preserving key bytes to `self.keys.values`
                    self.bucket
                        .insert(&mut self.values, val, &mut self.keys.values);
                    // Terminate the key with 0: bucket bytes are all >= 1, so
                    // a key sorts before any longer key it is a prefix of
                    self.keys.values.push(0);
                    let interned = self.keys.append();

                    let hasher = &mut self.hasher;
                    let values = &self.values;
                    // The map has no build-hasher — provide a closure so it
                    // can re-hash stored keys when the table resizes
                    v.insert_with_hasher(hash, interned, (), |key| {
                        hasher.hash_one(&values[*key])
                    });
                    out[idx] = Some(interned);
                }
            }
        }

        out
    }

    /// Returns the order-preserving normalized bytes for `key`.
    ///
    /// Comparing two normalized keys lexicographically gives the same
    /// ordering as comparing the values they were interned from.
    pub fn normalized_key(&self, key: Interned) -> &[u8] {
        &self.keys[key]
    }
}
/// An append-only buffer of byte slices addressed by [`Interned`] keys.
#[derive(Debug)]
struct InternBuffer {
    /// Raw bytes of all entries, concatenated
    values: Vec<u8>,
    /// Entry with (1-based) key `i` spans `values[offsets[i - 1]..offsets[i]]`;
    /// seeded with a leading `0` by `Default`
    offsets: Vec<usize>,
}
impl Default for InternBuffer {
fn default() -> Self {
Self {
values: Default::default(),
offsets: vec![0],
}
}
}
impl InternBuffer {
    /// Copies `data` into the buffer as a new entry and returns its key.
    fn insert(&mut self, data: &[u8]) -> Interned {
        self.values.extend_from_slice(data);
        self.append()
    }

    /// Seals everything written to `values` since the last recorded offset as
    /// a new entry, returning its (1-based) key.
    fn append(&mut self) -> Interned {
        // `offsets` always holds at least the seed `0`, so this is never zero
        let next_id = u32::try_from(self.offsets.len()).unwrap();
        let key = Interned(NonZeroU32::new(next_id).unwrap());
        self.offsets.push(self.values.len());
        key
    }
}
impl Index<Interned> for InternBuffer {
    type Output = [u8];

    /// Returns the bytes of the entry identified by `key`.
    fn index(&self, key: Interned) -> &Self::Output {
        // Keys are 1-based: entry `key` spans `offsets[key - 1]..offsets[key]`
        let index = key.0.get() as usize;
        let end = self.offsets[index];
        let start = self.offsets[index - 1];
        // SAFETY: `offsets` starts at 0 and `append` only ever pushes the
        // current `values.len()`, which never shrinks, so every stored pair
        // satisfies `start <= end <= values.len()` and the range is in bounds.
        // An out-of-range `key` still panics safely on the `offsets` lookups.
        unsafe { self.values.get_unchecked(start..end) }
    }
}
/// A single position within a [`Bucket`].
#[derive(Debug, Default, Clone)]
struct Slot {
    /// The value stored directly at this slot, if any
    value: Option<Interned>,
    /// Overflow bucket for values that sort between this slot's value and
    /// the previous slot's (see the byte encoding in `Bucket::insert`)
    child: Option<Box<Bucket>>,
}
/// A node in the trie used to assign order-preserving key bytes.
#[derive(Debug, Clone)]
struct Bucket {
    /// Always 255 slots, so a slot index encoded as `idx + 1` or `idx + 2`
    /// (see `Bucket::insert`) fits in a non-zero byte
    slots: Box<[Slot]>,
}
impl Default for Bucket {
    /// Creates a bucket of 255 empty slots.
    fn default() -> Self {
        Self {
            slots: vec![Slot::default(); 255].into_boxed_slice(),
        }
    }
}
impl Bucket {
    /// Locates `data` within this bucket's slots.
    ///
    /// Returns `Ok(idx)` if slot `idx` already holds exactly `data`,
    /// otherwise `Err(idx)` naming the slot at which to insert it.
    ///
    /// This is a binary search over the occupied slots (kept in ascending
    /// value order by `insert`) with two twists:
    /// - hitting an empty slot ends the search immediately (`Err(mid)`)
    /// - the probe is skewed towards the low end (`(size / 2).min(3)`), so a
    ///   run of ascending insertions lands at slots 3, 7, 11, ... leaving
    ///   gaps for values that arrive later in between
    fn insert_pos(&self, values_buf: &InternBuffer, data: &[u8]) -> Result<usize, usize> {
        // Search over `[0, len - 1)`; the final slot (254) remains reachable
        // as an insertion target via `Err(left)` when the loop falls through
        let mut size = self.slots.len() - 1;
        let mut left = 0;
        let mut right = size;
        while left < right {
            // Skewed probe: at most 3 past `left`, and always < `right`
            let mid = left + (size / 2).min(3);
            let slot = &self.slots[mid];
            let val = match slot.value {
                Some(val) => val,
                None => return Err(mid),
            };
            let cmp = values_buf[val].cmp(data);
            if cmp == Ordering::Less {
                left = mid + 1;
            } else if cmp == Ordering::Greater {
                right = mid;
            } else {
                return Ok(mid);
            }
            // Maintain the `size == right - left` invariant for the next probe
            size = right - left;
        }
        Err(left)
    }

    /// Inserts `data`, appending the bytes of its order-preserving key to
    /// `out`. Panics (`unreachable!`) if `data` was already inserted.
    ///
    /// Byte encoding for slot `idx`:
    /// - `idx + 2` — value stored directly in this bucket's slot
    /// - `idx + 1` — value descends into the child bucket at `idx`, followed
    ///   by the child's own bytes
    ///
    /// Child keys at `idx` share the leading byte `idx + 1` with the direct
    /// value at `idx - 1`, but the caller terminates direct values with a
    /// `0` byte, so children still sort strictly between slots `idx - 1`
    /// and `idx`.
    fn insert(&mut self, values_buf: &mut InternBuffer, data: &[u8], out: &mut Vec<u8>) {
        match self.insert_pos(values_buf, data) {
            Ok(_) => unreachable!("value already exists"),
            Err(idx) => {
                let slot = &mut self.slots[idx];
                // Slot 254 can never hold a value directly: its direct
                // encoding `idx + 2` would be 256, overflowing a byte —
                // always recurse into its child instead
                if idx != 254 && slot.value.is_none() {
                    out.push(idx as u8 + 2);
                    slot.value = Some(values_buf.insert(data))
                } else {
                    out.push(idx as u8 + 1);
                    slot.child
                        .get_or_insert_with(Default::default)
                        .insert(values_buf, data, out);
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use rand::prelude::*;

    /// Interns each value individually, then checks that every pair of
    /// normalized keys compares exactly like the source integers.
    #[allow(clippy::needless_collect)]
    fn test_intern_values(values: &[u64]) {
        let mut interner = OrderPreservingInterner::default();

        // Intern first (requires `&mut`), then resolve the normalized keys
        // (requires `&`) — hence the intermediate collect
        let keys: Vec<_> = values
            .iter()
            .flat_map(|v| interner.intern([Some(v.to_be_bytes())]))
            .map(Option::unwrap)
            .collect();
        let normalized: Vec<_> = keys
            .iter()
            .map(|&key| interner.normalized_key(key))
            .collect();

        for (i, a) in normalized.iter().enumerate() {
            for (j, b) in normalized.iter().enumerate() {
                assert_eq!(
                    a.cmp(b),
                    values[i].cmp(&values[j]),
                    "({:?} vs {:?}) vs ({} vs {})",
                    a, b, values[i], values[j]
                )
            }
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_interner() {
        test_intern_values(&[8, 6, 5, 7]);

        let mut values: Vec<_> = (0_u64..2000).collect();
        test_intern_values(&values);

        values.shuffle(&mut thread_rng());
        test_intern_values(&values);
    }

    #[test]
    fn test_intern_duplicates() {
        // Each value is a single byte; duplicates must map to the same key
        let values = vec![0_u8, 1, 8, 4, 1, 0];
        let mut interner = OrderPreservingInterner::default();
        let interned: Vec<_> = interner
            .intern(values.iter().map(std::slice::from_ref).map(Some))
            .into_iter()
            .map(Option::unwrap)
            .collect();

        assert_eq!(interned[0], interned[5]);
        assert_eq!(interned[1], interned[4]);

        let key = |i: usize| interner.normalized_key(interned[i]);
        assert!(key(0) < key(1));
        assert!(key(1) < key(2));
        assert!(key(1) < key(3));
        assert!(key(3) < key(2));
    }
}