hexz_core/algo/dedup/hash_table/mod.rs
//! High-performance hash tables for deduplication workloads.
//!
//! This module provides specialized hash table implementations optimized for
//! Hexz's block deduplication during snapshot packing. It includes:
//!
//! - **Trait abstraction** (`DedupHashTable`) for swappable implementations
//! - **Standard wrapper** around `std::collections::HashMap` as a baseline
//! - **Elastic Hash Table** implementing Krapivin et al.'s 2025 breakthrough
//!
//! # Background: The Krapivin Breakthrough (2025)
//!
//! In January 2025, Andrew Krapivin (then an undergraduate at Rutgers) published
//! a paper with Farach-Colton and Kuszmaul that disproved a 40-year-old conjecture
//! by Andrew Yao about the optimality of uniform hashing in open-addressed hash tables.
//! **Paper:** [Optimal Bounds for Open Addressing Without Reordering](https://arxiv.org/abs/2501.02305)
//!
//! **Key contributions:**
//! - Achieves O(1) amortized and O(log 1/δ) worst-case expected probe complexity,
//!   where δ is the fraction of free slots (1 − load factor)
//! - Enables load factors >0.9 with good performance (15-20% memory savings)
//! - No element reordering needed (unlike cuckoo hashing)
//! - Disproves Yao's conjecture that uniform probing is optimal
//!
//! # Why This Matters for Hexz
//!
//! Deduplication maps can consume gigabytes of RAM when packing large datasets:
//! - **1M unique blocks** = ~56 MB with std HashMap (at 0.875 load factor)
//! - **1M unique blocks** = ~48 MB with Elastic Hash (at 0.9 load factor)
//! - **10M unique blocks** = 560 MB vs 480 MB (80 MB saved!)
//!
//! Additionally, Elastic Hashing provides better worst-case performance when
//! tables are nearly full, preventing slowdowns during large pack operations.
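//!
//! The arithmetic behind these figures, as a quick sketch (the ~56- and
//! ~48-byte per-entry costs are this module's own rough estimates, not
//! measured values):
//!
//! ```rust,ignore
//! let blocks: u64 = 10_000_000;
//! let std_mb     = blocks * 56 / 1_000_000; // 560 MB at ~56 bytes/entry
//! let elastic_mb = blocks * 48 / 1_000_000; // 480 MB at ~48 bytes/entry
//! assert_eq!(std_mb - elastic_mb, 80);      // 80 MB saved per 10M blocks
//! ```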
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │                    DedupHashTable Trait                     │
//! │    (insert, get, len, load_factor, memory_bytes, stats)     │
//! └──────────────────────────────┬──────────────────────────────┘
//!                                │
//!                     ┌──────────┴──────────┐
//!                     │                     │
//!              ┌──────▼──────┐       ┌──────▼──────────┐
//!              │  Standard   │       │     Elastic     │
//!              │   HashMap   │       │   Hash Table    │
//!              │ (baseline)  │       │ (Krapivin 2025) │
//!              └─────────────┘       └─────────────────┘
//! ```
//!
//! # Usage
//!
//! The `DedupHashTable` trait allows swapping implementations via feature flags:
//!
//! ```rust,ignore
//! use hexz_core::algo::dedup::hash_table::{DedupHashTable, ElasticHashTable};
//!
//! let mut table = ElasticHashTable::with_capacity(1_000_000);
//!
//! // Insert hash -> offset mapping
//! let hash = blake3::hash(b"compressed block data");
//! table.insert(*hash.as_bytes(), 12345);
//!
//! // Lookup
//! if let Some(offset) = table.get(hash.as_bytes()) {
//!     println!("Block at offset: {}", offset);
//! }
//!
//! // Check memory usage (1_048_576 bytes per MiB)
//! println!("Memory: {} MiB", table.memory_bytes() / 1_048_576);
//! ```
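//!
//! A hypothetical sketch of the feature-flag selection mentioned above (the
//! `elastic-hash` feature name and `DefaultDedupTable` alias are illustrative,
//! not actual Hexz identifiers):
//!
//! ```rust,ignore
//! #[cfg(feature = "elastic-hash")]
//! type DefaultDedupTable = ElasticHashTable;
//! #[cfg(not(feature = "elastic-hash"))]
//! type DefaultDedupTable = StandardHashTable;
//!
//! // Call sites stay unchanged regardless of which table is compiled in.
//! let mut table = DefaultDedupTable::with_capacity(1_000_000);
//! ```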

// Note: Result will be used when implementing error handling in insert/get

/// High-performance hash table for block deduplication.
///
/// This trait abstracts over different hash table implementations, allowing
/// Hexz to compare and swap between standard HashMap and cutting-edge algorithms
/// like Elastic Hashing without changing call sites.
///
/// # Design Constraints
///
/// This trait is specialized for Hexz's deduplication use case:
/// - **Fixed key type**: `[u8; 32]` (BLAKE3 hash)
/// - **Fixed value type**: `u64` (physical offset in snapshot)
/// - **Insert-only**: No `remove()` — dedup is write-once during packing
/// - **No iteration**: We never enumerate entries
///
/// These constraints allow implementations to optimize for the specific workload.
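///
/// # Example
///
/// A hypothetical call-site sketch (the `dedup_offset` helper is illustrative,
/// not part of this module): packing code can stay generic over the concrete
/// table by taking `&mut dyn DedupHashTable`.
///
/// ```rust,ignore
/// fn dedup_offset(table: &mut dyn DedupHashTable, hash: [u8; 32], next_offset: u64) -> u64 {
///     match table.get(&hash) {
///         // Duplicate block: reuse the offset recorded when it was first seen.
///         Some(existing) => existing,
///         // New block: record the offset it is about to be written at.
///         None => {
///             table.insert(hash, next_offset);
///             next_offset
///         }
///     }
/// }
/// ```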
pub trait DedupHashTable: Send + Sync {
    /// Inserts a hash->offset mapping.
    ///
    /// If the hash already exists, updates the offset and returns the old value.
    /// Otherwise, returns `None`.
    ///
    /// # Performance
    ///
    /// Expected: O(1) amortized
    /// Worst-case: Implementation-dependent
    fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64>;

    /// Looks up a hash and returns the associated offset.
    ///
    /// Returns `None` if the hash is not in the table.
    ///
    /// # Performance
    ///
    /// Expected: O(1)
    /// Worst-case: Implementation-dependent
    fn get(&self, hash: &[u8; 32]) -> Option<u64>;

    /// Returns the number of entries in the table.
    fn len(&self) -> usize;

    /// Returns true if the table is empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the current load factor (entries / capacity).
    ///
    /// Load factor indicates how full the table is:
    /// - 0.0 = completely empty
    /// - 0.5 = half full
    /// - 0.9 = 90% full (high load)
    /// - 1.0 = completely full (should never happen)
    fn load_factor(&self) -> f64;

    /// Returns the total memory usage in bytes.
    ///
    /// Includes:
    /// - Entry storage (keys + values)
    /// - Metadata (state flags, level info, etc.)
    /// - Overhead (Vec capacity, allocator metadata)
    fn memory_bytes(&self) -> usize;

    /// Returns performance statistics for analysis and tuning.
    fn stats(&self) -> TableStats;
}

/// Performance metrics for hash table operations.
///
/// Used for benchmarking, profiling, and comparing implementations.
#[derive(Debug, Clone, Default)]
pub struct TableStats {
    /// Total number of insert operations performed
    pub total_inserts: u64,

    /// Total number of lookup operations performed
    pub total_lookups: u64,

    /// Total number of probe steps across all operations
    ///
    /// A probe is a single slot check during insertion or lookup.
    /// Lower is better — indicates fewer collisions.
    pub total_probes: u64,

    /// Maximum probe length seen in any single operation
    ///
    /// Indicates worst-case behavior. Should be kept low (<10).
    pub max_probe_length: u32,

    /// Distribution of entries across levels (Elastic Hash only)
    ///
    /// - Index 0 = Level 0 (largest, 50% of capacity)
    /// - Index 1 = Level 1 (25% of capacity)
    /// - Index 2 = Level 2 (12.5% of capacity)
    /// - ...
    ///
    /// Empty for non-leveled implementations like StandardHashTable.
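    ///
    /// As an example, a table with 1024 slots of total capacity would have
    /// levels sized roughly 512, 256, 128, and so on.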
    pub level_usage: Vec<usize>,
}

impl TableStats {
    /// Returns the average number of probes per operation.
    ///
    /// Ideally close to 1.0 (one probe per operation).
    /// Higher values indicate more collisions.
    pub fn avg_probes_per_op(&self) -> f64 {
        let total_ops = self.total_inserts + self.total_lookups;
        if total_ops == 0 {
            0.0
        } else {
            self.total_probes as f64 / total_ops as f64
        }
    }
}
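
// A quick sanity check of the averaging math above (illustrative figures).
#[cfg(test)]
mod stats_tests {
    use super::TableStats;

    #[test]
    fn avg_probes_counts_inserts_and_lookups() {
        let stats = TableStats {
            total_inserts: 60,
            total_lookups: 40,
            total_probes: 150,
            ..TableStats::default()
        };
        // 150 probes over 100 total operations => 1.5 probes per op.
        assert!((stats.avg_probes_per_op() - 1.5).abs() < f64::EPSILON);
    }
}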

// Submodules
pub mod elastic;

// Re-exports for convenience
pub use elastic::ElasticHashTable;

/// Direct `DedupHashTable` impl for `HashMap<[u8; 32], u64>` so that
/// `write_block` can accept either a plain HashMap or an ElasticHashTable via
/// `dyn DedupHashTable`.
impl DedupHashTable for std::collections::HashMap<[u8; 32], u64> {
    fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64> {
        std::collections::HashMap::insert(self, hash, offset)
    }

    fn get(&self, hash: &[u8; 32]) -> Option<u64> {
        std::collections::HashMap::get(self, hash).copied()
    }

    fn len(&self) -> usize {
        std::collections::HashMap::len(self)
    }

    fn load_factor(&self) -> f64 {
        let cap = std::collections::HashMap::capacity(self);
        if cap == 0 {
            0.0
        } else {
            std::collections::HashMap::len(self) as f64 / cap as f64
        }
    }

    fn memory_bytes(&self) -> usize {
        // Same ~56-bytes-per-slot estimate as StandardHashTable below.
        std::collections::HashMap::capacity(self) * 56
    }

    fn stats(&self) -> TableStats {
        // A bare HashMap carries no instrumentation, so stats are empty.
        TableStats::default()
    }
}

/// Baseline hash table wrapping `std::collections::HashMap`.
///
/// Used for comparison benchmarks against [`ElasticHashTable`].
pub struct StandardHashTable {
    map: std::collections::HashMap<[u8; 32], u64>,
    /// Insert count; only updated through `&mut self`, so a plain u64 suffices.
    total_inserts: u64,
    /// Lookup/probe counters are updated from `get(&self)` and therefore need
    /// interior mutability; atomics keep the type soundly `Sync`.
    total_lookups: std::sync::atomic::AtomicU64,
    total_probes: std::sync::atomic::AtomicU64,
}

impl StandardHashTable {
    pub fn new() -> Self {
        Self {
            map: std::collections::HashMap::new(),
            total_inserts: 0,
            total_lookups: std::sync::atomic::AtomicU64::new(0),
            total_probes: std::sync::atomic::AtomicU64::new(0),
        }
    }

    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            map: std::collections::HashMap::with_capacity(capacity),
            total_inserts: 0,
            total_lookups: std::sync::atomic::AtomicU64::new(0),
            total_probes: std::sync::atomic::AtomicU64::new(0),
        }
    }
}

impl Default for StandardHashTable {
    fn default() -> Self {
        Self::new()
    }
}

impl DedupHashTable for StandardHashTable {
    fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64> {
        self.total_inserts += 1;
        self.map.insert(hash, offset)
    }

    fn get(&self, hash: &[u8; 32]) -> Option<u64> {
        use std::sync::atomic::Ordering;
        self.total_lookups.fetch_add(1, Ordering::Relaxed);
        // std's HashMap does not expose probe counts; count one probe per
        // lookup as a lower bound.
        self.total_probes.fetch_add(1, Ordering::Relaxed);
        self.map.get(hash).copied()
    }

    fn len(&self) -> usize {
        self.map.len()
    }

    fn load_factor(&self) -> f64 {
        let cap = self.map.capacity();
        if cap == 0 {
            0.0
        } else {
            self.map.len() as f64 / cap as f64
        }
    }

    fn memory_bytes(&self) -> usize {
        // Rough model: 40 bytes of payload per slot (32-byte key + 8-byte value),
        // rounded up to 56 to cover control bytes, padding, and allocator overhead.
        self.map.capacity() * 56
    }

    fn stats(&self) -> TableStats {
        use std::sync::atomic::Ordering;
        TableStats {
            total_inserts: self.total_inserts,
            total_lookups: self.total_lookups.load(Ordering::Relaxed),
            total_probes: self.total_probes.load(Ordering::Relaxed),
            max_probe_length: 1, // HashMap doesn't expose this
            level_usage: Vec::new(),
        }
    }
}

// No `unsafe impl Sync` is needed here: the statistics counters are atomics,
// so `StandardHashTable` is `Send + Sync` automatically, satisfying the
// `DedupHashTable` supertrait bounds.
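
// Illustrative smoke test: the direct `HashMap` impl and `StandardHashTable`
// should behave identically when driven through `dyn DedupHashTable`.
#[cfg(test)]
mod table_tests {
    use super::{DedupHashTable, StandardHashTable};

    #[test]
    fn hashmap_and_standard_agree_via_trait() {
        let mut std_map: std::collections::HashMap<[u8; 32], u64> =
            std::collections::HashMap::new();
        let mut wrapped = StandardHashTable::new();
        let hash: [u8; 32] = [0xAB; 32];

        let tables: [&mut dyn DedupHashTable; 2] = [&mut std_map, &mut wrapped];
        for table in tables {
            // First insert returns None; re-insert returns the old offset.
            assert_eq!(table.insert(hash, 7), None);
            assert_eq!(table.insert(hash, 9), Some(7));
            assert_eq!(table.get(&hash), Some(9));
            assert_eq!(table.len(), 1);
        }
    }
}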