// hexz_core/algo/dedup/hash_table/mod.rs

1//! High-performance hash tables for deduplication workloads.
2//!
3//! This module provides specialized hash table implementations optimized for
4//! Hexz's block deduplication during snapshot packing. It includes:
5//!
6//! - **Trait abstraction** (`DedupHashTable`) for swappable implementations
7//! - **Standard wrapper** around `std::collections::HashMap` as baseline
8//! - **Elastic Hash Table** implementing Krapivin et al.'s 2025 breakthrough
9//!
10//! # Background: The Krapivin Breakthrough (2025)
11//!
12//! In January 2025, Andrew Krapivin (then an undergraduate at Rutgers) published
13//! a paper with Farach-Colton and Kuszmaul that disproved a 40-year-old conjecture
14//! by Andrew Yao about the optimality of uniform hashing in open-addressed hash tables.
15//!
16//! **Paper:** [Optimal Bounds for Open Addressing Without Reordering](https://arxiv.org/abs/2501.02305)
17//!
18//! **Key contributions:**
19//! - Achieves O(1) amortized and O(log 1/δ) worst-case expected probe complexity
20//! - Enables load factors >0.9 with good performance (15-20% memory savings)
21//! - No element reordering needed (unlike cuckoo hashing)
22//! - Disproves Yao's conjecture that uniform probing is optimal
23//!
24//! # Why This Matters for Hexz
25//!
26//! Deduplication maps can consume gigabytes of RAM when packing large datasets:
27//! - **1M unique blocks** = ~56 MB with std HashMap (at 0.875 load factor)
28//! - **1M unique blocks** = ~48 MB with Elastic Hash (at 0.9 load factor)
29//! - **10M unique blocks** = 560 MB vs 480 MB (80 MB saved!)
30//!
31//! Additionally, Elastic Hashing provides better worst-case performance when
32//! tables are nearly full, preventing slowdowns during large pack operations.
33//!
34//! # Architecture
35//!
36//! ```text
37//! ┌─────────────────────────────────────────────────────────────┐
38//! │                     DedupHashTable Trait                     │
39//! │  (insert, get, len, load_factor, memory_bytes, stats)        │
40//! └───────────────────┬─────────────────────────────────────────┘
41//!                     │
42//!          ┌──────────┴──────────┐
43//!          │                     │
44//!   ┌──────▼──────┐      ┌──────▼──────────┐
45//!   │  Standard   │      │     Elastic      │
46//!   │  HashMap    │      │   Hash Table     │
47//!   │  (baseline) │      │ (Krapivin 2025)  │
48//!   └─────────────┘      └──────────────────┘
49//! ```
50//!
51//! # Usage
52//!
53//! The `DedupHashTable` trait allows swapping implementations via feature flags:
54//!
55//! ```rust,ignore
56//! use hexz_core::algo::dedup::hash_table::{DedupHashTable, ElasticHashTable};
57//!
58//! let mut table = ElasticHashTable::with_capacity(1_000_000);
59//!
60//! // Insert hash -> offset mapping
61//! let hash = blake3::hash(b"compressed block data");
62//! table.insert(*hash.as_bytes(), 12345);
63//!
64//! // Lookup
65//! if let Some(offset) = table.get(hash.as_bytes()) {
66//!     println!("Block at offset: {}", offset);
67//! }
68//!
69//! // Check memory usage
70//! println!("Memory: {} MB", table.memory_bytes() / 1_048_576);
71//! ```
72
// NOTE: `insert`/`get` are currently infallible; switch their signatures to
// `Result` only if error handling is ever added to those operations.
74
/// High-performance hash table for block deduplication.
///
/// This trait abstracts over different hash table implementations, allowing
/// Hexz to compare and swap between standard HashMap and cutting-edge algorithms
/// like Elastic Hashing without changing call sites.
///
/// Implementors must be `Send + Sync` so a table can be shared behind
/// `dyn DedupHashTable` across packing threads.
///
/// # Design Constraints
///
/// This trait is specialized for Hexz's deduplication use case:
/// - **Fixed key type**: `[u8; 32]` (BLAKE3 hash)
/// - **Fixed value type**: `u64` (physical offset in snapshot)
/// - **Insert-only**: No `remove()` — dedup is write-once during packing
/// - **No iteration**: We never enumerate entries
///
/// These constraints allow implementations to optimize for the specific workload.
pub trait DedupHashTable: Send + Sync {
    /// Inserts a hash->offset mapping.
    ///
    /// If the hash already exists, updates the offset and returns the old value.
    /// Otherwise, returns `None`.
    ///
    /// # Performance
    ///
    /// Expected: O(1) amortized
    /// Worst-case: Implementation-dependent
    fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64>;

    /// Looks up a hash and returns the associated offset.
    ///
    /// Returns `None` if the hash is not in the table.
    ///
    /// # Performance
    ///
    /// Expected: O(1)
    /// Worst-case: Implementation-dependent
    fn get(&self, hash: &[u8; 32]) -> Option<u64>;

    /// Returns the number of entries in the table.
    fn len(&self) -> usize;

    /// Returns true if the table is empty.
    ///
    /// Default implementation in terms of [`DedupHashTable::len`];
    /// implementors rarely need to override it.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the current load factor (entries / capacity).
    ///
    /// Load factor indicates how full the table is:
    /// - 0.0 = completely empty (or capacity 0)
    /// - 0.5 = half full
    /// - 0.9 = 90% full (high load)
    /// - 1.0 = completely full (should never happen)
    fn load_factor(&self) -> f64;

    /// Returns the total memory usage in bytes.
    ///
    /// Implementations report an estimate that includes:
    /// - Entry storage (keys + values)
    /// - Metadata (state flags, level info, etc.)
    /// - Overhead (Vec capacity, allocator metadata)
    fn memory_bytes(&self) -> usize;

    /// Returns performance statistics for analysis and tuning.
    fn stats(&self) -> TableStats;
}
140
/// Performance metrics for hash table operations.
///
/// Collected by `DedupHashTable` implementations; used for benchmarking,
/// profiling, and comparing implementations against each other.
#[derive(Debug, Clone, Default)]
pub struct TableStats {
    /// Total number of insert operations performed
    pub total_inserts: u64,

    /// Total number of lookup operations performed
    pub total_lookups: u64,

    /// Total number of probe steps across all operations
    ///
    /// A probe is a single slot check during insertion or lookup.
    /// Lower is better — it indicates fewer collisions.
    pub total_probes: u64,

    /// Maximum probe length seen in any single operation
    ///
    /// Captures worst-case behavior; a healthy table keeps this low (<10).
    pub max_probe_length: u32,

    /// Distribution of entries across levels (Elastic Hash only)
    ///
    /// - Index 0 = Level 0 (largest, 50% of capacity)
    /// - Index 1 = Level 1 (25% of capacity)
    /// - Index 2 = Level 2 (12.5% of capacity)
    /// - ...
    ///
    /// Empty for non-leveled implementations like StandardHashTable.
    pub level_usage: Vec<usize>,
}

impl TableStats {
    /// Returns the mean number of probes per operation.
    ///
    /// Values near 1.0 are ideal (one probe per operation); larger values
    /// indicate collision pressure. Yields 0.0 when no operations have
    /// been recorded, avoiding a 0/0 division.
    pub fn avg_probes_per_op(&self) -> f64 {
        match self.total_inserts + self.total_lookups {
            0 => 0.0,
            ops => self.total_probes as f64 / ops as f64,
        }
    }
}
188
189// Submodules
190pub mod elastic;
191
192// Re-exports for convenience
193pub use elastic::ElasticHashTable;
194
195/// Blanket `DedupHashTable` impl for `HashMap<[u8; 32], u64>` so that
196/// `write_block` can accept either HashMap or ElasticHashTable via `dyn DedupHashTable`.
197impl DedupHashTable for std::collections::HashMap<[u8; 32], u64> {
198    fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64> {
199        std::collections::HashMap::insert(self, hash, offset)
200    }
201
202    fn get(&self, hash: &[u8; 32]) -> Option<u64> {
203        std::collections::HashMap::get(self, hash).copied()
204    }
205
206    fn len(&self) -> usize {
207        std::collections::HashMap::len(self)
208    }
209
210    fn load_factor(&self) -> f64 {
211        let cap = std::collections::HashMap::capacity(self);
212        if cap == 0 {
213            0.0
214        } else {
215            std::collections::HashMap::len(self) as f64 / cap as f64
216        }
217    }
218
219    fn memory_bytes(&self) -> usize {
220        std::collections::HashMap::capacity(self) * 56
221    }
222
223    fn stats(&self) -> TableStats {
224        TableStats::default()
225    }
226}
227
228/// Baseline hash table wrapping `std::collections::HashMap`.
229///
230/// Used for comparison benchmarks against [`ElasticHashTable`].
231pub struct StandardHashTable {
232    map: std::collections::HashMap<[u8; 32], u64>,
233    total_inserts: u64,
234    total_lookups: std::cell::Cell<u64>,
235    total_probes: std::cell::Cell<u64>,
236}
237
238impl StandardHashTable {
239    pub fn new() -> Self {
240        Self {
241            map: std::collections::HashMap::new(),
242            total_inserts: 0,
243            total_lookups: std::cell::Cell::new(0),
244            total_probes: std::cell::Cell::new(0),
245        }
246    }
247
248    pub fn with_capacity(capacity: usize) -> Self {
249        Self {
250            map: std::collections::HashMap::with_capacity(capacity),
251            total_inserts: 0,
252            total_lookups: std::cell::Cell::new(0),
253            total_probes: std::cell::Cell::new(0),
254        }
255    }
256}
257
258impl Default for StandardHashTable {
259    fn default() -> Self {
260        Self::new()
261    }
262}
263
264impl DedupHashTable for StandardHashTable {
265    fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64> {
266        self.total_inserts += 1;
267        self.map.insert(hash, offset)
268    }
269
270    fn get(&self, hash: &[u8; 32]) -> Option<u64> {
271        self.total_lookups.set(self.total_lookups.get() + 1);
272        self.total_probes.set(self.total_probes.get() + 1);
273        self.map.get(hash).copied()
274    }
275
276    fn len(&self) -> usize {
277        self.map.len()
278    }
279
280    fn load_factor(&self) -> f64 {
281        let cap = self.map.capacity();
282        if cap == 0 {
283            0.0
284        } else {
285            self.map.len() as f64 / cap as f64
286        }
287    }
288
289    fn memory_bytes(&self) -> usize {
290        // Approximate: each entry is key (32) + value (8) + hash (8) + pointer (8) = 56 bytes
291        self.map.capacity() * 56
292    }
293
294    fn stats(&self) -> TableStats {
295        TableStats {
296            total_inserts: self.total_inserts,
297            total_lookups: self.total_lookups.get(),
298            total_probes: self.total_probes.get(),
299            max_probe_length: 1, // HashMap doesn't expose this
300            level_usage: Vec::new(),
301        }
302    }
303}
304
305// SAFETY: Same justification as ElasticHashTable — Cell fields are only
306// used for statistics counters and are not shared across threads simultaneously.
307unsafe impl Sync for StandardHashTable {}