// infiniloom_engine/embedding/hasher.rs

1//! BLAKE3-based content hashing for deterministic chunk IDs
2//!
3//! This module provides fast, cryptographically secure hashing for:
4//! - Chunk ID generation (128-bit truncated, collision-resistant)
5//! - Content verification (full 256-bit hash)
6//! - Manifest integrity checksums
7//!
8//! # Hash Format
9//!
10//! - **Short ID**: `ec_` + 32 hex chars (128 bits of BLAKE3)
11//!   - Collision-resistant for ~2^64 chunks (enterprise scale)
12//!   - Human-readable prefix identifies embedding chunks
13//!
14//! - **Full hash**: 64 hex chars (256 bits of BLAKE3)
15//!   - Used for collision verification
16//!   - Stored in manifest for integrity checking
17//!
18//! # Performance
19//!
20//! BLAKE3 is extremely fast:
21//! - ~3x faster than SHA-256
22//! - ~6x faster than SHA-512
23//! - Parallelizable for large inputs
24//! - SIMD-accelerated on modern CPUs
25
26use super::error::EmbedError;
27use super::normalizer::normalize_for_hash;
28
/// Output of a hashing operation: a display-friendly short ID plus the full digest
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HashResult {
    /// Display/index identifier: `"ec_"` followed by 32 hex chars (128 bits).
    /// Collision-resistant up to roughly 2^64 distinct chunks.
    pub short_id: String,

    /// Complete 256-bit BLAKE3 digest rendered as 64 hex chars.
    /// Compared when two chunks share a `short_id` to rule out collisions.
    pub full_hash: String,
}
40
41impl HashResult {
42    /// Create a new HashResult from raw hash bytes
43    #[inline]
44    fn from_hash(hash: blake3::Hash) -> Self {
45        let hex = hash.to_hex();
46
47        Self {
48            // 128 bits = 32 hex chars (collision resistant for 2^64 chunks)
49            short_id: format!("ec_{}", &hex[..32]),
50            // Full 256-bit hash for verification
51            full_hash: hex.to_string(),
52        }
53    }
54}
55
56/// Generate deterministic hashes from content
57///
58/// This is the primary hashing function. It:
59/// 1. Normalizes content for cross-platform consistency
60/// 2. Computes BLAKE3 hash of normalized content
61/// 3. Returns both short ID and full hash
62///
63/// # Example
64///
65/// ```
66/// use infiniloom_engine::embedding::hash_content;
67///
68/// let result = hash_content("fn foo() { bar(); }");
69/// assert!(result.short_id.starts_with("ec_"));
70/// assert_eq!(result.short_id.len(), 3 + 32); // "ec_" + 32 hex chars
71/// assert_eq!(result.full_hash.len(), 64);    // 256 bits = 64 hex chars
72/// ```
73#[inline]
74pub fn hash_content(content: &str) -> HashResult {
75    let normalized = normalize_for_hash(content);
76    let hash = blake3::hash(normalized.as_bytes());
77    HashResult::from_hash(hash)
78}
79
80/// Hash content that is already normalized
81///
82/// Use this when you've already called `normalize_for_hash` on the content.
83/// Skips redundant normalization for better performance.
84///
85/// # Safety
86///
87/// The caller must ensure the content is already normalized. If not,
88/// the hash will be different from `hash_content()` for the same original content.
89#[inline]
90pub(super) fn hash_normalized(normalized_content: &str) -> HashResult {
91    let hash = blake3::hash(normalized_content.as_bytes());
92    HashResult::from_hash(hash)
93}
94
95/// Hash raw bytes without normalization
96///
97/// Use for non-text content or when you need raw byte hashing.
98#[inline]
99pub(super) fn hash_bytes(bytes: &[u8]) -> HashResult {
100    let hash = blake3::hash(bytes);
101    HashResult::from_hash(hash)
102}
103
104/// Verify that two chunks with the same short ID have the same content
105///
106/// This detects hash collisions (extremely rare but possible).
107/// Call this when you encounter a chunk with an existing short ID.
108///
109/// # Returns
110///
111/// - `Ok(())` if hashes match (no collision)
112/// - `Err(HashCollision)` if hashes differ (collision detected)
113pub(super) fn verify_no_collision(id: &str, hash1: &str, hash2: &str) -> Result<(), EmbedError> {
114    if hash1 != hash2 {
115        return Err(EmbedError::HashCollision {
116            id: id.to_owned(),
117            hash1: hash1.to_owned(),
118            hash2: hash2.to_owned(),
119        });
120    }
121    Ok(())
122}
123
124/// Compute a hash for manifest integrity verification
125///
126/// Used to detect tampering with the manifest file.
127pub(super) fn compute_integrity_hash(data: &[u8]) -> String {
128    blake3::hash(data).to_hex().to_string()
129}
130
131/// Incrementally hash multiple pieces of data
132///
133/// More efficient than concatenating strings when hashing multiple items.
134///
135/// # Example
136///
137/// ```ignore
138/// let mut hasher = IncrementalHasher::new();
139/// hasher.update(b"settings json");
140/// hasher.update(b"chunk1");
141/// hasher.update(b"chunk2");
142/// let result = hasher.finalize();
143/// ```
144pub(super) struct IncrementalHasher {
145    hasher: blake3::Hasher,
146}
147
148impl IncrementalHasher {
149    /// Create a new incremental hasher
150    #[inline]
151    pub(super) fn new() -> Self {
152        Self { hasher: blake3::Hasher::new() }
153    }
154
155    /// Update the hash with additional data
156    #[inline]
157    pub(super) fn update(&mut self, data: &[u8]) {
158        self.hasher.update(data);
159    }
160
161    /// Update the hash with a string
162    #[inline]
163    pub(super) fn update_str(&mut self, s: &str) {
164        self.hasher.update(s.as_bytes());
165    }
166
167    /// Update with a u32 value (little-endian)
168    #[inline]
169    pub(super) fn update_u32(&mut self, n: u32) {
170        self.hasher.update(&n.to_le_bytes());
171    }
172
173    /// Update with a u64 value (little-endian)
174    #[inline]
175    pub(super) fn update_u64(&mut self, n: u64) {
176        self.hasher.update(&n.to_le_bytes());
177    }
178
179    /// Finalize and return the hash result
180    #[inline]
181    pub(super) fn finalize(self) -> HashResult {
182        HashResult::from_hash(self.hasher.finalize())
183    }
184
185    /// Finalize and return just the hex string (256 bits)
186    #[inline]
187    pub(super) fn finalize_hex(self) -> String {
188        self.hasher.finalize().to_hex().to_string()
189    }
190}
191
192impl Default for IncrementalHasher {
193    fn default() -> Self {
194        Self::new()
195    }
196}
197
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_deterministic() {
        let src = "fn foo() { bar(); }";
        let (a, b) = (hash_content(src), hash_content(src));

        assert_eq!(a.short_id, b.short_id);
        assert_eq!(a.full_hash, b.full_hash);
    }

    #[test]
    fn test_format() {
        let result = hash_content("test");

        assert!(result.short_id.starts_with("ec_"));
        assert_eq!(result.short_id.len(), 35); // "ec_" prefix + 32 hex digits
        assert_eq!(result.full_hash.len(), 64); // full 256-bit digest
    }

    #[test]
    fn test_different_content() {
        let a = hash_content("fn foo() {}");
        let b = hash_content("fn bar() {}");

        assert_ne!(a.short_id, b.short_id);
        assert_ne!(a.full_hash, b.full_hash);
    }

    #[test]
    fn test_cross_platform_consistency() {
        // LF, CRLF, CR, and trailing-whitespace variants must all hash the same
        let baseline = hash_content("fn foo() {\n    bar();\n}");
        let variants = [
            "fn foo() {\r\n    bar();\r\n}",
            "fn foo() {\r    bar();\r}",
            "fn foo() {   \n    bar();   \n}",
        ];

        for (i, variant) in variants.iter().enumerate() {
            let hashed = hash_content(variant);
            assert_eq!(baseline.short_id, hashed.short_id, "Hash mismatch for variant {}", i + 1);
        }
    }

    #[test]
    fn test_unicode_consistency() {
        // Decomposed form: 'e' + combining acute accent
        let decomposed = hash_content("cafe\u{0301}");
        // Precomposed form: single 'é' codepoint
        let precomposed = hash_content("caf\u{00E9}");

        assert_eq!(decomposed.short_id, precomposed.short_id);
    }

    #[test]
    fn test_verify_no_collision_ok() {
        assert!(verify_no_collision("ec_test", "abc123", "abc123").is_ok());
    }

    #[test]
    fn test_verify_no_collision_detected() {
        let outcome = verify_no_collision("ec_test", "abc123", "def456");
        assert!(outcome.is_err());
        assert!(matches!(outcome, Err(EmbedError::HashCollision { .. })));
    }

    #[test]
    fn test_hash_normalized() {
        let raw = "fn foo() { bar(); }";
        let pre_normalized = normalize_for_hash(raw);

        let via_content = hash_content(raw);
        let via_normalized = hash_normalized(&pre_normalized);

        assert_eq!(via_content.short_id, via_normalized.short_id);
        assert_eq!(via_content.full_hash, via_normalized.full_hash);
    }

    #[test]
    fn test_hash_bytes() {
        let result = hash_bytes(b"hello world");

        assert!(result.short_id.starts_with("ec_"));
        assert_eq!(result.full_hash.len(), 64);
    }

    #[test]
    fn test_incremental_hasher() {
        // Hash of the pre-concatenated input
        let whole = hash_bytes(b"part1part2part3");

        // Same bytes fed piecewise must yield the same digest
        let mut incremental = IncrementalHasher::new();
        for piece in [&b"part1"[..], b"part2", b"part3"] {
            incremental.update(piece);
        }
        let streamed = incremental.finalize();

        assert_eq!(whole.short_id, streamed.short_id);
    }

    #[test]
    fn test_incremental_with_numbers() {
        let mut hasher = IncrementalHasher::new();
        hasher.update_u32(42);
        hasher.update_u64(123456789);
        hasher.update_str("test");

        assert_eq!(hasher.finalize_hex().len(), 64);
    }

    #[test]
    fn test_compute_integrity_hash() {
        assert_eq!(compute_integrity_hash(b"manifest data here").len(), 64);
    }

    #[test]
    fn test_empty_content() {
        // Newline-only input normalizes down to the empty string
        assert_eq!(hash_content("").short_id, hash_content("\n\n\n").short_id);
    }

    #[test]
    fn test_whitespace_only() {
        // Both inputs are pure whitespace and normalize to empty
        assert_eq!(hash_content("   ").short_id, hash_content("  \n  \n  ").short_id);
    }

    #[test]
    fn test_hash_result_clone() {
        let original = hash_content("test");
        assert_eq!(original, original.clone());
    }
}