1use super::{hash_bytes, splitmix64};
36use std::collections::BTreeMap;
37
38pub const NSHARDS: usize = 256;
41const SHARD_MASK: u64 = (NSHARDS - 1) as u64;
42
43pub const MICROBUCKET_CAPACITY: usize = 16;
46
47#[derive(Debug, Clone, Copy, PartialEq, Eq)]
49pub enum LookupProfile {
50 Memory,
51 Speed,
52}
53
54impl Default for LookupProfile {
55 fn default() -> Self {
56 LookupProfile::Memory
57 }
58}
59
60pub const INLINE_KEY_LEN: usize = 16;
66
67#[derive(Debug, Clone, Copy, PartialEq, Eq)]
72struct KeyHandle {
73 offset: u32,
77 len: u32,
79 inline: [u8; INLINE_KEY_LEN],
81}
82
83impl KeyHandle {
84 #[inline(always)]
85 fn is_inline(self) -> bool {
86 (self.len as usize) <= INLINE_KEY_LEN
87 }
88
89 #[inline]
92 fn bytes<'a>(&'a self, slab: &'a [u8]) -> &'a [u8] {
93 if self.is_inline() {
94 &self.inline[..self.len as usize]
95 } else {
96 &slab[self.offset as usize..(self.offset + self.len) as usize]
97 }
98 }
99
100 fn intern(slab: &mut Vec<u8>, key: &[u8]) -> Self {
103 if key.len() <= INLINE_KEY_LEN {
104 let mut inline = [0u8; INLINE_KEY_LEN];
105 inline[..key.len()].copy_from_slice(key);
106 KeyHandle {
107 offset: 0,
108 len: key.len() as u32,
109 inline,
110 }
111 } else {
112 let offset = slab.len() as u32;
113 slab.extend_from_slice(key);
114 KeyHandle {
115 offset,
116 len: key.len() as u32,
117 inline: [0u8; INLINE_KEY_LEN],
118 }
119 }
120 }
121}
122
123pub const FRONT_SLOTS: usize = 256;
130const FRONT_MASK: u64 = (FRONT_SLOTS - 1) as u64;
131
132#[derive(Debug, Clone)]
139enum FrontEntry<V: Clone> {
140 Empty,
141 Singleton { handle: KeyHandle, value: V },
142 Bucket(u32),
143}
144
145impl<V: Clone> Default for FrontEntry<V> {
146 fn default() -> Self {
147 FrontEntry::Empty
148 }
149}
150
151#[derive(Debug, Clone)]
154pub struct MicroBucket<V: Clone> {
155 handles: Vec<KeyHandle>,
156 values: Vec<V>,
157}
158
159impl<V: Clone> MicroBucket<V> {
160 fn new() -> Self {
161 Self {
162 handles: Vec::new(),
163 values: Vec::new(),
164 }
165 }
166
167 fn len(&self) -> usize {
168 self.handles.len()
169 }
170
171 fn full(&self) -> bool {
172 self.handles.len() >= MICROBUCKET_CAPACITY
173 }
174
175 #[inline]
178 fn get<'a>(&'a self, slab: &[u8], key: &[u8]) -> Option<&'a V> {
179 for (i, h) in self.handles.iter().enumerate() {
180 if h.bytes(slab) == key {
181 return Some(&self.values[i]);
182 }
183 }
184 None
185 }
186
187 fn upsert_existing(&mut self, slab: &[u8], key: &[u8], value: V) -> Option<V> {
188 for (i, h) in self.handles.iter().enumerate() {
189 if h.bytes(slab) == key {
190 return Some(std::mem::replace(&mut self.values[i], value));
191 }
192 }
193 None
194 }
195
196 fn push(&mut self, handle: KeyHandle, value: V) {
197 debug_assert!(!self.full());
198 self.handles.push(handle);
199 self.values.push(value);
200 }
201}
202
203#[derive(Debug, Clone)]
204struct Shard<V: Clone> {
205 key_pool: Vec<u8>,
208 front: Box<[FrontEntry<V>; FRONT_SLOTS]>,
211 buckets: Vec<MicroBucket<V>>,
213 overflow: BTreeMap<Vec<u8>, V>,
216 overflow_count: u64,
217 max_bucket_size: u32,
218}
219
220impl<V: Clone> Shard<V> {
221 fn new() -> Self {
222 let arr: [FrontEntry<V>; FRONT_SLOTS] =
225 std::array::from_fn(|_| FrontEntry::Empty);
226 Self {
227 key_pool: Vec::new(),
228 front: Box::new(arr),
229 buckets: Vec::new(),
230 overflow: BTreeMap::new(),
231 overflow_count: 0,
232 max_bucket_size: 0,
233 }
234 }
235
236 #[inline]
238 fn intern_key(&mut self, bytes: &[u8]) -> KeyHandle {
239 KeyHandle::intern(&mut self.key_pool, bytes)
240 }
241
242 #[inline]
244 fn slab_bytes<'a>(&'a self, h: &'a KeyHandle) -> &'a [u8] {
245 h.bytes(&self.key_pool)
246 }
247}
248
249#[inline(always)]
253fn h_eq(handle: &KeyHandle, slab: &[u8], probe: &[u8]) -> bool {
254 handle.bytes(slab) == probe
255}
256
257#[derive(Debug, Clone)]
259pub struct DHarht<V: Clone> {
260 shards: Vec<Shard<V>>,
261 sealed: bool,
262 profile: LookupProfile,
263 total_entries: u64,
264}
265
266impl<V: Clone> DHarht<V> {
267 pub fn new() -> Self {
268 Self::with_profile(LookupProfile::Memory)
269 }
270
271 pub fn with_profile(profile: LookupProfile) -> Self {
272 Self {
273 shards: (0..NSHARDS).map(|_| Shard::new()).collect(),
274 sealed: false,
275 profile,
276 total_entries: 0,
277 }
278 }
279
280 pub fn profile(&self) -> LookupProfile {
281 self.profile
282 }
283
284 pub fn is_sealed(&self) -> bool {
285 self.sealed
286 }
287
288 pub fn len(&self) -> u64 {
289 self.total_entries
290 }
291
292 pub fn is_empty(&self) -> bool {
293 self.total_entries == 0
294 }
295
296 pub fn overflow_count(&self) -> u64 {
297 self.shards.iter().map(|s| s.overflow_count).sum()
298 }
299
300 pub fn max_bucket_size(&self) -> u32 {
301 self.shards
302 .iter()
303 .map(|s| s.max_bucket_size)
304 .max()
305 .unwrap_or(0)
306 }
307
308 pub fn singleton_count(&self) -> u64 {
311 self.shards
312 .iter()
313 .flat_map(|s| s.front.iter())
314 .filter(|e| matches!(e, FrontEntry::Singleton { .. }))
315 .count() as u64
316 }
317
318 #[inline]
322 fn locate(key: &[u8]) -> (usize, usize) {
323 let h = hash_bytes(key);
324 let shard = ((h >> 56) & SHARD_MASK) as usize;
325 let slot = ((h >> 48) & FRONT_MASK) as usize;
326 (shard, slot)
327 }
328
329 pub fn insert_bytes(&mut self, key: &[u8], value: V) -> Option<V> {
330 debug_assert!(!self.sealed, "DHarht: insert_bytes after seal_for_lookup");
331 let (s, slot) = Self::locate(key);
332 let shard = &mut self.shards[s];
333
334 if let Some(v) = shard.overflow.get_mut(key) {
337 return Some(std::mem::replace(v, value));
338 }
339
340 match &shard.front[slot] {
342 FrontEntry::Empty => {
343 let handle = shard.intern_key(key);
344 shard.front[slot] = FrontEntry::Singleton { handle, value };
345 self.total_entries += 1;
346 shard.max_bucket_size = shard.max_bucket_size.max(1);
347 None
348 }
349 FrontEntry::Singleton { handle, .. } => {
350 let existing_bytes_match = shard.slab_bytes(handle) == key;
352 if existing_bytes_match {
353 if let FrontEntry::Singleton { value: ref mut v, .. } = shard.front[slot] {
354 return Some(std::mem::replace(v, value));
355 }
356 unreachable!()
357 }
358 let old = std::mem::replace(
360 &mut shard.front[slot],
361 FrontEntry::Bucket(u32::MAX),
362 );
363 let (old_handle, old_value) = match old {
364 FrontEntry::Singleton { handle, value } => (handle, value),
365 _ => unreachable!(),
366 };
367 let new_handle = shard.intern_key(key);
368 let mut bucket = MicroBucket::new();
369 bucket.push(old_handle, old_value);
370 bucket.push(new_handle, value);
371 let bid = shard.buckets.len() as u32;
372 shard.buckets.push(bucket);
373 shard.front[slot] = FrontEntry::Bucket(bid);
374 self.total_entries += 1;
375 shard.max_bucket_size = shard.max_bucket_size.max(2);
376 None
377 }
378 FrontEntry::Bucket(bid) => {
379 let bid = *bid as usize;
380 let key_pool = &shard.key_pool;
381 let bucket = &mut shard.buckets[bid];
382 if let Some(prev) = bucket.upsert_existing(key_pool, key, value.clone()) {
383 return Some(prev);
384 }
385 if !bucket.full() {
386 let handle = shard.intern_key(key);
387 let bucket = &mut shard.buckets[bid];
388 bucket.push(handle, value);
389 self.total_entries += 1;
390 shard.max_bucket_size =
391 shard.max_bucket_size.max(bucket.len() as u32);
392 return None;
393 }
394 let prev = shard.overflow.insert(key.to_vec(), value);
395 if prev.is_none() {
396 self.total_entries += 1;
397 shard.overflow_count += 1;
398 }
399 prev
400 }
401 }
402 }
403
404 #[inline]
405 pub fn get_bytes(&self, key: &[u8]) -> Option<&V> {
406 let (s, slot) = Self::locate(key);
407 let shard = &self.shards[s];
408 match &shard.front[slot] {
413 FrontEntry::Singleton { handle, value }
414 if h_eq(handle, &shard.key_pool, key) =>
415 {
416 Some(value)
417 }
418 FrontEntry::Bucket(bid) => {
419 let bucket = &shard.buckets[*bid as usize];
420 bucket.get(&shard.key_pool, key).or_else(|| {
421 if shard.overflow.is_empty() {
422 None
423 } else {
424 shard.overflow.get(key)
425 }
426 })
427 }
428 _ => {
429 if shard.overflow.is_empty() {
433 None
434 } else {
435 shard.overflow.get(key)
436 }
437 }
438 }
439 }
440
441 pub fn contains_bytes(&self, key: &[u8]) -> bool {
442 self.get_bytes(key).is_some()
443 }
444
445 pub fn seal_for_lookup(&mut self) {
446 for shard in self.shards.iter_mut() {
447 shard.key_pool.shrink_to_fit();
448 shard.buckets.shrink_to_fit();
450 for bucket in shard.buckets.iter_mut() {
451 bucket.handles.shrink_to_fit();
452 bucket.values.shrink_to_fit();
453 }
454 }
455 self.sealed = true;
456 }
457
458 pub fn iter(&self) -> impl Iterator<Item = (&[u8], &V)> + '_ {
461 self.shards.iter().flat_map(|shard| {
462 let from_front = shard.front.iter().flat_map(move |entry| {
463 let pairs: Vec<(&[u8], &V)> = match entry {
464 FrontEntry::Empty => Vec::new(),
465 FrontEntry::Singleton { handle, value } => {
466 vec![(shard.slab_bytes(handle), value)]
467 }
468 FrontEntry::Bucket(bid) => {
469 let bucket = &shard.buckets[*bid as usize];
470 bucket
471 .handles
472 .iter()
473 .zip(bucket.values.iter())
474 .map(|(h, v)| (shard.slab_bytes(h), v))
475 .collect()
476 }
477 };
478 pairs.into_iter()
479 });
480 let from_overflow = shard.overflow.iter().map(|(k, v)| (k.as_slice(), v));
481 from_front.chain(from_overflow)
482 })
483 }
484
485 pub fn iter_sorted(&self) -> Vec<(&[u8], &V)> {
486 let mut all: Vec<(&[u8], &V)> = self.iter().collect();
487 all.sort_by(|a, b| a.0.cmp(b.0));
488 all
489 }
490}
491
492impl<V: Clone> Default for DHarht<V> {
493 fn default() -> Self {
494 Self::new()
495 }
496}
497
498pub fn deterministic_shape_hash<V: Clone + std::hash::Hash>(t: &DHarht<V>) -> u64 {
500 use std::hash::{Hash, Hasher};
501 let mut h = std::collections::hash_map::DefaultHasher::new();
502 let entries = t.iter_sorted();
503 h.write_u64(entries.len() as u64);
504 for (k, v) in entries {
505 h.write_usize(k.len());
506 h.write(k);
507 v.hash(&mut h);
508 }
509 for shard in &t.shards {
510 h.write_u64(shard.overflow_count);
511 h.write_u32(shard.max_bucket_size);
512 }
513 splitmix64(h.finish())
514}
515
516#[cfg(test)]
517mod tests {
518 use super::*;
519
520 #[test]
521 fn insert_get_roundtrip() {
522 let mut t: DHarht<i32> = DHarht::new();
523 t.insert_bytes(b"alpha", 1);
524 t.insert_bytes(b"beta", 2);
525 t.insert_bytes(b"gamma", 3);
526 assert_eq!(t.get_bytes(b"alpha"), Some(&1));
527 assert_eq!(t.get_bytes(b"beta"), Some(&2));
528 assert_eq!(t.get_bytes(b"gamma"), Some(&3));
529 assert_eq!(t.get_bytes(b"delta"), None);
530 assert_eq!(t.len(), 3);
531 }
532
533 #[test]
534 fn insert_overwrites_returns_old() {
535 let mut t: DHarht<i32> = DHarht::new();
536 t.insert_bytes(b"k", 1);
537 let prev = t.insert_bytes(b"k", 2);
538 assert_eq!(prev, Some(1));
539 assert_eq!(t.get_bytes(b"k"), Some(&2));
540 assert_eq!(t.len(), 1);
541 }
542
543 #[test]
544 fn singleton_then_bucket_promotion() {
545 let mut t: DHarht<i32> = DHarht::new();
546 t.insert_bytes(b"a", 1);
548 assert!(t.singleton_count() >= 1);
549 for i in 0..10_000u32 {
551 t.insert_bytes(&i.to_le_bytes(), i as i32);
552 }
553 assert!(t.max_bucket_size() >= 2);
558 }
559
560 #[test]
561 fn seal_preserves_all_entries() {
562 let mut t: DHarht<u32> = DHarht::new();
563 for i in 0..1000u32 {
564 t.insert_bytes(&i.to_le_bytes(), i);
565 }
566 let len_before = t.len();
567 t.seal_for_lookup();
568 assert!(t.is_sealed());
569 assert_eq!(t.len(), len_before);
570 for i in 0..1000u32 {
571 assert_eq!(t.get_bytes(&i.to_le_bytes()), Some(&i));
572 }
573 }
574
575 #[test]
576 fn overflow_to_btreemap_fallback_no_data_loss() {
577 let mut t: DHarht<u32> = DHarht::new();
578 for i in 0..50_000u32 {
579 t.insert_bytes(&i.to_le_bytes(), i);
580 }
581 for i in 0..50_000u32 {
582 assert_eq!(t.get_bytes(&i.to_le_bytes()), Some(&i));
583 }
584 assert_eq!(t.len(), 50_000);
585 }
586
587 #[test]
588 fn deterministic_double_build_same_shape_hash() {
589 fn build() -> DHarht<u32> {
590 let mut t: DHarht<u32> = DHarht::new();
591 for i in 0..500u32 {
592 t.insert_bytes(&i.to_be_bytes(), i);
593 }
594 t.seal_for_lookup();
595 t
596 }
597 let h1 = deterministic_shape_hash(&build());
598 let h2 = deterministic_shape_hash(&build());
599 assert_eq!(h1, h2);
600 }
601
602 #[test]
603 fn iter_sorted_is_canonical_regardless_of_insertion_order() {
604 let mut a: DHarht<u32> = DHarht::new();
605 let mut b: DHarht<u32> = DHarht::new();
606 for i in 0..100u32 {
607 a.insert_bytes(&i.to_be_bytes(), i);
608 }
609 for i in (0..100u32).rev() {
610 b.insert_bytes(&i.to_be_bytes(), i);
611 }
612 let av: Vec<_> = a.iter_sorted().into_iter().map(|(k, v)| (k.to_vec(), *v)).collect();
613 let bv: Vec<_> = b.iter_sorted().into_iter().map(|(k, v)| (k.to_vec(), *v)).collect();
614 assert_eq!(av, bv);
615 }
616
617 #[test]
618 fn microbucket_capacity_respected() {
619 let mut t: DHarht<u32> = DHarht::new();
620 for i in 0..10_000u32 {
621 t.insert_bytes(&i.to_le_bytes(), i);
622 }
623 assert!(
624 t.max_bucket_size() as usize <= MICROBUCKET_CAPACITY,
625 "max bucket size {} exceeded MICROBUCKET_CAPACITY {}",
626 t.max_bucket_size(),
627 MICROBUCKET_CAPACITY
628 );
629 }
630
631 #[test]
632 fn matches_btreemap_oracle_on_random_workload() {
633 let mut h: DHarht<u32> = DHarht::new();
634 let mut oracle: BTreeMap<Vec<u8>, u32> = BTreeMap::new();
635 let mut x: u64 = 0xCAFEBABE;
636 for _ in 0..2000 {
637 x = splitmix64(x);
638 let key_kind = x % 100;
639 let mut key = Vec::new();
640 key.extend_from_slice(&key_kind.to_le_bytes());
641 let v = (x >> 8) as u32;
642 h.insert_bytes(&key, v);
643 oracle.insert(key, v);
644 }
645 for (k, v) in &oracle {
646 assert_eq!(h.get_bytes(k), Some(v));
647 }
648 assert_eq!(h.len() as usize, oracle.len());
649 }
650
651 #[test]
652 fn sealed_lookup_after_compaction_preserves_values() {
653 let mut t: DHarht<Vec<u8>> = DHarht::new();
654 for i in 0..500u32 {
655 t.insert_bytes(&i.to_be_bytes(), i.to_be_bytes().to_vec());
656 }
657 t.seal_for_lookup();
658 for i in 0..500u32 {
659 assert_eq!(t.get_bytes(&i.to_be_bytes()), Some(&i.to_be_bytes().to_vec()));
660 }
661 }
662
663 #[test]
664 fn empty_table_lookup_returns_none() {
665 let t: DHarht<u32> = DHarht::new();
666 assert_eq!(t.get_bytes(b"any"), None);
667 }
668
669 #[test]
670 fn zero_length_key_works() {
671 let mut t: DHarht<u32> = DHarht::new();
672 t.insert_bytes(b"", 42);
673 assert_eq!(t.get_bytes(b""), Some(&42));
674 }
675}