Skip to main content

hexz_core/algo/dedup/hash_table/
mod.rs

1//! Hash table for deduplication workloads.
2//!
3//! This module provides a specialized hash table optimized for Hexz's block
4//! deduplication during archive packing. It wraps `std::collections::HashMap`
5//! with an identity hasher since keys are BLAKE3 hashes (already uniformly
6//! distributed), eliminating redundant `SipHash` overhead.
7//!
8//! # Usage
9//!
10//! ```rust,ignore
11//! use hexz_core::algo::dedup::hash_table::StandardHashTable;
12//!
13//! let mut table = StandardHashTable::with_capacity(1_000_000);
14//!
15//! // Insert hash -> offset mapping
16//! let hash = blake3::hash(b"compressed block data");
17//! table.insert(*hash.as_bytes(), 12345);
18//!
19//! // Lookup
20//! if let Some(offset) = table.get(hash.as_bytes()) {
21//!     println!("Block at offset: {}", offset);
22//! }
23//!
24//! // Check memory usage
25//! println!("Memory: {} MB", table.memory_bytes() / 1_048_576);
26//! ```
27
28/// Performance metrics for hash table operations.
29///
30/// Used for benchmarking and profiling.
31#[derive(Debug, Clone, Default)]
32pub struct TableStats {
33    /// Total number of insert operations performed
34    pub total_inserts: u64,
35
36    /// Total number of lookup operations performed
37    pub total_lookups: u64,
38
39    /// Total number of probe steps across all operations
40    pub total_probes: u64,
41
42    /// Maximum probe length seen in any single operation
43    pub max_probe_length: u32,
44
45    /// Distribution of entries across levels (reserved for future use)
46    pub level_usage: Vec<usize>,
47}
48
49impl TableStats {
50    /// Returns the average number of probes per operation.
51    pub fn avg_probes_per_op(&self) -> f64 {
52        let total_ops = self.total_inserts + self.total_lookups;
53        if total_ops == 0 {
54            0.0
55        } else {
56            self.total_probes as f64 / total_ops as f64
57        }
58    }
59}
60
61/// Identity hasher that uses the first 8 bytes of a BLAKE3 key directly.
62///
63/// Since keys in the dedup table are BLAKE3 hashes (already uniformly distributed),
64/// there is no need for the default `SipHash` to re-hash them.
65struct IdentityHasher(u64);
66
67impl std::hash::Hasher for IdentityHasher {
68    #[inline]
69    fn write(&mut self, bytes: &[u8]) {
70        if bytes.len() >= 8 {
71            // The length check above guarantees bytes.len() >= 8, so this slice
72            // is exactly 8 bytes and try_into always succeeds.
73            let Ok(arr) = <[u8; 8]>::try_from(&bytes[0..8]) else {
74                unreachable!("length checked above")
75            };
76            self.0 = u64::from_le_bytes(arr);
77        } else {
78            self.0 = bytes
79                .iter()
80                .fold(0u64, |acc, &b| acc.wrapping_mul(256).wrapping_add(b as u64));
81        }
82    }
83
84    #[inline]
85    fn finish(&self) -> u64 {
86        self.0
87    }
88}
89
90#[derive(Clone, Default)]
91struct BuildIdentityHasher;
92
93impl std::hash::BuildHasher for BuildIdentityHasher {
94    type Hasher = IdentityHasher;
95
96    #[inline]
97    fn build_hasher(&self) -> IdentityHasher {
98        IdentityHasher(0)
99    }
100}
101
102/// Hash table for block deduplication with identity hashing.
103///
104/// Uses an identity hasher since keys are BLAKE3 hashes (already uniformly
105/// distributed), giving ~3-6x speedup over the default `SipHash`.
106pub struct StandardHashTable {
107    map: std::collections::HashMap<[u8; 32], u64, BuildIdentityHasher>,
108    total_inserts: u64,
109    total_lookups: std::cell::Cell<u64>,
110    total_probes: std::cell::Cell<u64>,
111}
112
113impl std::fmt::Debug for StandardHashTable {
114    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
115        f.debug_struct("StandardHashTable")
116            .field("len", &self.map.len())
117            .field("capacity", &self.map.capacity())
118            .field("total_inserts", &self.total_inserts)
119            .finish_non_exhaustive()
120    }
121}
122
123impl StandardHashTable {
124    /// Creates an empty hash table.
125    pub const fn new() -> Self {
126        Self {
127            map: std::collections::HashMap::with_hasher(BuildIdentityHasher),
128            total_inserts: 0,
129            total_lookups: std::cell::Cell::new(0),
130            total_probes: std::cell::Cell::new(0),
131        }
132    }
133
134    /// Creates a hash table with pre-allocated capacity for `capacity` entries.
135    pub fn with_capacity(capacity: usize) -> Self {
136        Self {
137            map: std::collections::HashMap::with_capacity_and_hasher(capacity, BuildIdentityHasher),
138            total_inserts: 0,
139            total_lookups: std::cell::Cell::new(0),
140            total_probes: std::cell::Cell::new(0),
141        }
142    }
143
144    /// Returns the number of entries in the table.
145    pub fn len(&self) -> usize {
146        self.map.len()
147    }
148
149    /// Returns `true` if the table contains no entries.
150    pub fn is_empty(&self) -> bool {
151        self.map.is_empty()
152    }
153
154    /// Inserts a hash-to-offset mapping, returning the previous offset if the hash was already present.
155    pub fn insert(&mut self, hash: [u8; 32], offset: u64) -> Option<u64> {
156        self.total_inserts += 1;
157        self.map.insert(hash, offset)
158    }
159
160    /// Looks up the offset for a given hash, returning `None` if not found.
161    pub fn get(&self, hash: &[u8; 32]) -> Option<u64> {
162        self.total_lookups.set(self.total_lookups.get() + 1);
163        self.total_probes.set(self.total_probes.get() + 1);
164        self.map.get(hash).copied()
165    }
166
167    /// Returns the ratio of entries to capacity (0.0 when empty).
168    pub fn load_factor(&self) -> f64 {
169        let cap = self.map.capacity();
170        if cap == 0 {
171            0.0
172        } else {
173            self.map.len() as f64 / cap as f64
174        }
175    }
176
177    /// Returns the estimated memory usage of the table in bytes.
178    pub fn memory_bytes(&self) -> usize {
179        self.map.capacity() * 56
180    }
181
182    /// Returns performance statistics for this table.
183    pub fn stats(&self) -> TableStats {
184        TableStats {
185            total_inserts: self.total_inserts,
186            total_lookups: self.total_lookups.get(),
187            total_probes: self.total_probes.get(),
188            max_probe_length: 1,
189            level_usage: Vec::new(),
190        }
191    }
192}
193
194impl Default for StandardHashTable {
195    fn default() -> Self {
196        Self::new()
197    }
198}
199
200// SAFETY: Cell fields are only used for statistics counters and are not
201// shared across threads simultaneously.
202unsafe impl Sync for StandardHashTable {}