hexz_core/algo/dedup/hash_table/mod.rs

//! Hash table for deduplication workloads.
//!
//! This module provides a specialized hash table optimized for Hexz's block
//! deduplication during snapshot packing. It wraps `std::collections::HashMap`
//! with an identity hasher since keys are BLAKE3 hashes (already uniformly
//! distributed), eliminating redundant SipHash overhead.
//!
//! # Usage
//!
//! ```rust,ignore
//! use hexz_core::algo::dedup::hash_table::StandardHashTable;
//!
//! let mut table = StandardHashTable::with_capacity(1_000_000);
//!
//! // Insert hash -> offset mapping
//! let hash = blake3::hash(b"compressed block data");
//! table.insert(*hash.as_bytes(), 12345);
//!
//! // Lookup
//! if let Some(offset) = table.get(hash.as_bytes()) {
//!     println!("Block at offset: {}", offset);
//! }
//!
//! // Check memory usage
//! println!("Memory: {} MB", table.memory_bytes() / 1_048_576);
//! ```

/// Performance metrics for hash table operations.
///
/// Used for benchmarking and profiling.
#[derive(Debug, Clone, Default)]
pub struct TableStats {
    /// Total number of insert operations performed
    pub total_inserts: u64,

    /// Total number of lookup operations performed
    pub total_lookups: u64,

    /// Total number of probe steps across all operations
    pub total_probes: u64,

    /// Maximum probe length seen in any single operation
    pub max_probe_length: u32,

    /// Distribution of entries across levels (reserved for future use)
    pub level_usage: Vec<usize>,
}

impl TableStats {
    /// Returns the average number of probes per operation.
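    ///
    /// A small worked illustration: 400 total probes across 100 inserts and
    /// 300 lookups averages to exactly 1.0 probes per operation.
    ///
    /// ```rust,ignore
    /// let stats = TableStats {
    ///     total_inserts: 100,
    ///     total_lookups: 300,
    ///     total_probes: 400,
    ///     ..Default::default()
    /// };
    /// assert_eq!(stats.avg_probes_per_op(), 1.0);
    /// ```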
    pub fn avg_probes_per_op(&self) -> f64 {
        let total_ops = self.total_inserts + self.total_lookups;
        if total_ops == 0 {
            0.0
        } else {
            self.total_probes as f64 / total_ops as f64
        }
    }
}

/// Identity hasher that uses the first 8 bytes of a BLAKE3 key directly.
///
/// Since keys in the dedup table are BLAKE3 hashes (already uniformly distributed),
/// there is no need for the default SipHash to re-hash them.
struct IdentityHasher(u64);

impl std::hash::Hasher for IdentityHasher {
    #[inline]
    fn write(&mut self, bytes: &[u8]) {
        if bytes.len() >= 8 {
            self.0 = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
        } else {
            self.0 = bytes
                .iter()
                .fold(0u64, |acc, &b| acc.wrapping_mul(256).wrapping_add(b as u64));
        }
    }

    #[inline]
    fn finish(&self) -> u64 {
        self.0
    }
}
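
// How `HashMap` exercises this hasher for a `[u8; 32]` key: the standard
// library hashes byte arrays through the slice `Hash` impl, which writes a
// length prefix before writing the key bytes themselves. Each `write` call
// above overwrites `self.0`, so the final call (the 32 key bytes) wins and
// `finish` returns the first 8 bytes of the BLAKE3 hash, as intended. Note
// that this depends on the standard library's prefix-then-bytes write order.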

#[derive(Clone, Default)]
struct BuildIdentityHasher;

impl std::hash::BuildHasher for BuildIdentityHasher {
    type Hasher = IdentityHasher;

    #[inline]
    fn build_hasher(&self) -> IdentityHasher {
        IdentityHasher(0)
    }
}

/// Hash table for block deduplication with identity hashing.
///
/// Uses an identity hasher since keys are BLAKE3 hashes (already uniformly
/// distributed), giving ~3-6x speedup over the default SipHash.
pub struct StandardHashTable {
    map: std::collections::HashMap<[u8; 32], u64, BuildIdentityHasher>,
    total_inserts: u64,
    // Atomic counters let `&self` methods record lookup statistics while
    // keeping the table soundly `Sync`.
    total_lookups: std::sync::atomic::AtomicU64,
    total_probes: std::sync::atomic::AtomicU64,
}

impl StandardHashTable {
    /// Creates an empty table.
    pub fn new() -> Self {
        Self {
            map: std::collections::HashMap::with_hasher(BuildIdentityHasher),
            total_inserts: 0,
            total_lookups: std::sync::atomic::AtomicU64::new(0),
            total_probes: std::sync::atomic::AtomicU64::new(0),
        }
    }

    /// Creates an empty table pre-sized for at least `capacity` entries.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            map: std::collections::HashMap::with_capacity_and_hasher(capacity, BuildIdentityHasher),
            total_inserts: 0,
            total_lookups: std::sync::atomic::AtomicU64::new(0),
            total_probes: std::sync::atomic::AtomicU64::new(0),
        }
    }

    /// Returns the number of entries in the table.
    pub fn len(&self) -> usize {
        self.map.len()
    }

    /// Returns `true` if the table contains no entries.
    pub fn is_empty(&self) -> bool {
        self.map.is_empty()
    }

    /// Inserts a hash -> offset mapping, returning the previous offset
    /// stored for this hash, if any.
    pub fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64> {
        self.total_inserts += 1;
        self.map.insert(hash, offset)
    }

    /// Looks up the offset stored for `hash`.
    pub fn get(&self, hash: &[u8; 32]) -> Option<u64> {
        self.total_lookups
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        // `HashMap` does not expose its probe sequence, so each lookup is
        // counted as a single probe.
        self.total_probes
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        self.map.get(hash).copied()
    }

    /// Returns the fraction of allocated capacity currently occupied.
    pub fn load_factor(&self) -> f64 {
        let cap = self.map.capacity();
        if cap == 0 {
            0.0
        } else {
            self.map.len() as f64 / cap as f64
        }
    }

    /// Returns an estimate of the table's memory footprint in bytes.
    ///
    /// Each slot holds a 32-byte key and an 8-byte value; the extra 16 bytes
    /// per slot approximate hashbrown's control bytes and load-factor slack.
    pub fn memory_bytes(&self) -> usize {
        self.map.capacity() * 56
    }

    /// Returns a snapshot of the accumulated operation counters.
    pub fn stats(&self) -> TableStats {
        TableStats {
            total_inserts: self.total_inserts,
            total_lookups: self.total_lookups.load(std::sync::atomic::Ordering::Relaxed),
            total_probes: self.total_probes.load(std::sync::atomic::Ordering::Relaxed),
            // Probe lengths are not observable through `HashMap`, so the
            // maximum is reported as a constant 1.
            max_probe_length: 1,
            level_usage: Vec::new(),
        }
    }
}

impl Default for StandardHashTable {
    fn default() -> Self {
        Self::new()
    }
}

// `StandardHashTable` is `Sync` without any `unsafe impl`: the map is only
// mutated through `&mut self`, and the statistics counters are atomics, so
// shared references may safely perform lookups from multiple threads.
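
// A minimal smoke test sketching the intended insert/lookup/stats flow; it
// assumes the `blake3` crate is available, as the module docs suggest.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn insert_get_roundtrip_and_stats() {
        let mut table = StandardHashTable::with_capacity(16);
        assert!(table.is_empty());

        let hash = blake3::hash(b"compressed block data");
        // First insert: no previous offset for this hash.
        assert_eq!(table.insert(*hash.as_bytes(), 12345), None);
        // Re-inserting the same hash returns the old offset.
        assert_eq!(table.insert(*hash.as_bytes(), 67890), Some(12345));

        assert_eq!(table.len(), 1);
        assert_eq!(table.get(hash.as_bytes()), Some(67890));

        let stats = table.stats();
        assert_eq!(stats.total_inserts, 2);
        assert_eq!(stats.total_lookups, 1);
    }
}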