// infiniloom_engine/embedding/hasher.rs

1//! BLAKE3-based content hashing for deterministic chunk IDs
2//!
3//! This module provides fast, cryptographically secure hashing for:
4//! - Chunk ID generation (128-bit truncated, collision-resistant)
5//! - Content verification (full 256-bit hash)
6//! - Manifest integrity checksums
7//!
8//! # Hash Format
9//!
10//! - **Short ID**: `ec_` + 32 hex chars (128 bits of BLAKE3)
11//!   - Collision-resistant for ~2^64 chunks (enterprise scale)
12//!   - Human-readable prefix identifies embedding chunks
13//!
14//! - **Full hash**: 64 hex chars (256 bits of BLAKE3)
15//!   - Used for collision verification
16//!   - Stored in manifest for integrity checking
17//!
18//! # Performance
19//!
20//! BLAKE3 is extremely fast:
21//! - ~3x faster than SHA-256
22//! - ~6x faster than SHA-512
23//! - Parallelizable for large inputs
24//! - SIMD-accelerated on modern CPUs
25
26use super::error::EmbedError;
27use super::normalizer::normalize_for_hash;
28
/// Pair of identifiers derived from a single BLAKE3 hash of chunk content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HashResult {
    /// Display/index identifier: the literal prefix "ec_" followed by the
    /// first 32 hex digits (128 bits) of the BLAKE3 hash.
    /// Collision-resistant for roughly 2^64 distinct chunks.
    pub short_id: String,

    /// The complete 256-bit BLAKE3 hash as 64 hex digits. Compared when two
    /// chunks share a `short_id` to confirm they really hold the same content.
    pub full_hash: String,
}
40
41impl HashResult {
42    /// Create a new HashResult from raw hash bytes
43    #[inline]
44    fn from_hash(hash: blake3::Hash) -> Self {
45        let hex = hash.to_hex();
46
47        Self {
48            // 128 bits = 32 hex chars (collision resistant for 2^64 chunks)
49            short_id: format!("ec_{}", &hex[..32]),
50            // Full 256-bit hash for verification
51            full_hash: hex.to_string(),
52        }
53    }
54}
55
56/// Generate deterministic hashes from content
57///
58/// This is the primary hashing function. It:
59/// 1. Normalizes content for cross-platform consistency
60/// 2. Computes BLAKE3 hash of normalized content
61/// 3. Returns both short ID and full hash
62///
63/// # Example
64///
65/// ```
66/// use infiniloom_engine::embedding::hash_content;
67///
68/// let result = hash_content("fn foo() { bar(); }");
69/// assert!(result.short_id.starts_with("ec_"));
70/// assert_eq!(result.short_id.len(), 3 + 32); // "ec_" + 32 hex chars
71/// assert_eq!(result.full_hash.len(), 64);    // 256 bits = 64 hex chars
72/// ```
73#[inline]
74pub fn hash_content(content: &str) -> HashResult {
75    let normalized = normalize_for_hash(content);
76    let hash = blake3::hash(normalized.as_bytes());
77    HashResult::from_hash(hash)
78}
79
80/// Hash content that is already normalized
81///
82/// Use this when you've already called `normalize_for_hash` on the content.
83/// Skips redundant normalization for better performance.
84///
85/// # Safety
86///
87/// The caller must ensure the content is already normalized. If not,
88/// the hash will be different from `hash_content()` for the same original content.
89#[inline]
90pub fn hash_normalized(normalized_content: &str) -> HashResult {
91    let hash = blake3::hash(normalized_content.as_bytes());
92    HashResult::from_hash(hash)
93}
94
95/// Hash raw bytes without normalization
96///
97/// Use for non-text content or when you need raw byte hashing.
98#[inline]
99pub fn hash_bytes(bytes: &[u8]) -> HashResult {
100    let hash = blake3::hash(bytes);
101    HashResult::from_hash(hash)
102}
103
104/// Verify that two chunks with the same short ID have the same content
105///
106/// This detects hash collisions (extremely rare but possible).
107/// Call this when you encounter a chunk with an existing short ID.
108///
109/// # Returns
110///
111/// - `Ok(())` if hashes match (no collision)
112/// - `Err(HashCollision)` if hashes differ (collision detected)
113pub fn verify_no_collision(id: &str, hash1: &str, hash2: &str) -> Result<(), EmbedError> {
114    if hash1 != hash2 {
115        return Err(EmbedError::HashCollision {
116            id: id.to_string(),
117            hash1: hash1.to_string(),
118            hash2: hash2.to_string(),
119        });
120    }
121    Ok(())
122}
123
124/// Compute a hash for manifest integrity verification
125///
126/// Used to detect tampering with the manifest file.
127pub fn compute_integrity_hash(data: &[u8]) -> String {
128    blake3::hash(data).to_hex().to_string()
129}
130
131/// Incrementally hash multiple pieces of data
132///
133/// More efficient than concatenating strings when hashing multiple items.
134///
135/// # Example
136///
137/// ```ignore
138/// let mut hasher = IncrementalHasher::new();
139/// hasher.update(b"settings json");
140/// hasher.update(b"chunk1");
141/// hasher.update(b"chunk2");
142/// let result = hasher.finalize();
143/// ```
144pub struct IncrementalHasher {
145    hasher: blake3::Hasher,
146}
147
148impl IncrementalHasher {
149    /// Create a new incremental hasher
150    #[inline]
151    pub fn new() -> Self {
152        Self {
153            hasher: blake3::Hasher::new(),
154        }
155    }
156
157    /// Update the hash with additional data
158    #[inline]
159    pub fn update(&mut self, data: &[u8]) {
160        self.hasher.update(data);
161    }
162
163    /// Update the hash with a string
164    #[inline]
165    pub fn update_str(&mut self, s: &str) {
166        self.hasher.update(s.as_bytes());
167    }
168
169    /// Update with a u32 value (little-endian)
170    #[inline]
171    pub fn update_u32(&mut self, n: u32) {
172        self.hasher.update(&n.to_le_bytes());
173    }
174
175    /// Update with a u64 value (little-endian)
176    #[inline]
177    pub fn update_u64(&mut self, n: u64) {
178        self.hasher.update(&n.to_le_bytes());
179    }
180
181    /// Finalize and return the hash result
182    #[inline]
183    pub fn finalize(self) -> HashResult {
184        HashResult::from_hash(self.hasher.finalize())
185    }
186
187    /// Finalize and return just the hex string (256 bits)
188    #[inline]
189    pub fn finalize_hex(self) -> String {
190        self.hasher.finalize().to_hex().to_string()
191    }
192}
193
194impl Default for IncrementalHasher {
195    fn default() -> Self {
196        Self::new()
197    }
198}
199
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_deterministic() {
        let source = "fn foo() { bar(); }";
        let first = hash_content(source);
        let second = hash_content(source);

        assert_eq!(first.short_id, second.short_id);
        assert_eq!(first.full_hash, second.full_hash);
    }

    #[test]
    fn test_format() {
        let hashed = hash_content("test");

        assert!(hashed.short_id.starts_with("ec_"));
        assert_eq!(hashed.short_id.len(), 3 + 32); // "ec_" + 32 hex
        assert_eq!(hashed.full_hash.len(), 64); // 256 bits = 64 hex
    }

    #[test]
    fn test_different_content() {
        let foo = hash_content("fn foo() {}");
        let bar = hash_content("fn bar() {}");

        assert_ne!(foo.short_id, bar.short_id);
        assert_ne!(foo.full_hash, bar.full_hash);
    }

    #[test]
    fn test_cross_platform_consistency() {
        // Every variant should normalize to the same bytes, so all
        // hashes must agree.
        let variants = [
            "fn foo() {\n    bar();\n}",
            "fn foo() {\r\n    bar();\r\n}",
            "fn foo() {\r    bar();\r}",
            "fn foo() {   \n    bar();   \n}",
        ];

        let reference = hash_content(variants[0]);
        for (i, variant) in variants.iter().enumerate().skip(1) {
            assert_eq!(
                reference.short_id,
                hash_content(variant).short_id,
                "Hash mismatch for variant {i}"
            );
        }
    }

    #[test]
    fn test_unicode_consistency() {
        // NFD: 'e' followed by a combining acute accent
        let decomposed = "cafe\u{0301}";
        // NFC: the single precomposed character é
        let precomposed = "caf\u{00E9}";

        assert_eq!(
            hash_content(decomposed).short_id,
            hash_content(precomposed).short_id
        );
    }

    #[test]
    fn test_verify_no_collision_ok() {
        assert!(verify_no_collision("ec_test", "abc123", "abc123").is_ok());
    }

    #[test]
    fn test_verify_no_collision_detected() {
        let outcome = verify_no_collision("ec_test", "abc123", "def456");
        assert!(outcome.is_err());
        assert!(matches!(outcome, Err(EmbedError::HashCollision { .. })));
    }

    #[test]
    fn test_hash_normalized() {
        let source = "fn foo() { bar(); }";
        let pre_normalized = normalize_for_hash(source);

        let via_content = hash_content(source);
        let via_normalized = hash_normalized(&pre_normalized);

        assert_eq!(via_content.short_id, via_normalized.short_id);
        assert_eq!(via_content.full_hash, via_normalized.full_hash);
    }

    #[test]
    fn test_hash_bytes() {
        let hashed = hash_bytes(b"hello world");

        assert!(hashed.short_id.starts_with("ec_"));
        assert_eq!(hashed.full_hash.len(), 64);
    }

    #[test]
    fn test_incremental_hasher() {
        // Hash of the whole buffer at once...
        let whole = hash_bytes("part1part2part3".as_bytes());

        // ...must match the hash of the same bytes fed piecewise.
        let mut hasher = IncrementalHasher::new();
        for piece in [&b"part1"[..], b"part2", b"part3"] {
            hasher.update(piece);
        }
        let piecewise = hasher.finalize();

        assert_eq!(whole.short_id, piecewise.short_id);
    }

    #[test]
    fn test_incremental_with_numbers() {
        let mut hasher = IncrementalHasher::new();
        hasher.update_u32(42);
        hasher.update_u64(123456789);
        hasher.update_str("test");

        assert_eq!(hasher.finalize_hex().len(), 64);
    }

    #[test]
    fn test_compute_integrity_hash() {
        assert_eq!(compute_integrity_hash(b"manifest data here").len(), 64);
    }

    #[test]
    fn test_empty_content() {
        let empty = hash_content("");
        let newlines = hash_content("\n\n\n"); // Normalizes to empty

        assert_eq!(empty.short_id, newlines.short_id);
    }

    #[test]
    fn test_whitespace_only() {
        // Both inputs normalize to the empty string.
        let spaces = hash_content("   ");
        let mixed = hash_content("  \n  \n  ");

        assert_eq!(spaces.short_id, mixed.short_id);
    }

    #[test]
    fn test_hash_result_clone() {
        let original = hash_content("test");
        let cloned = original.clone();

        assert_eq!(original, cloned);
    }
}