Skip to main content

zccache_hash/
lib.rs

1//! Hashing utilities for zccache.
2//!
3//! Provides blake3-based content hashing and cache key computation.
4
5pub mod cache_key;
6pub mod link_cache_key;
7
8use std::io::Read;
9use std::path::Path;
10
/// A 32-byte blake3 hash digest.
///
/// Newtype over the raw digest bytes; the field is private, so values are
/// constructed via [`ContentHash::from_bytes`] or the hashing helpers in
/// this module. Derives serde `Serialize`/`Deserialize` so digests can be
/// embedded in persisted structures.
/// NOTE(review): the serde representation is the raw 32-byte array, not the
/// hex string — confirm consumers expect that.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub struct ContentHash([u8; 32]);
14
15impl ContentHash {
16    /// Create a `ContentHash` from raw bytes.
17    #[must_use]
18    pub fn from_bytes(bytes: [u8; 32]) -> Self {
19        Self(bytes)
20    }
21
22    /// Returns the hash as a hex string.
23    #[must_use]
24    pub fn to_hex(&self) -> String {
25        hex_encode(&self.0)
26    }
27
28    /// Returns the raw bytes.
29    #[must_use]
30    pub fn as_bytes(&self) -> &[u8; 32] {
31        &self.0
32    }
33
34    /// Returns the first N bytes for directory sharding.
35    ///
36    /// # Panics
37    ///
38    /// Panics if `levels * bytes_per_level > 32` (exceeds hash size).
39    #[must_use]
40    pub fn shard_prefix(&self, levels: usize, bytes_per_level: usize) -> Vec<String> {
41        let hex = self.to_hex();
42        let chars_per_level = bytes_per_level * 2;
43        let required = levels * chars_per_level;
44        assert!(
45            required <= hex.len(),
46            "shard_prefix: levels={levels} * bytes_per_level={bytes_per_level} \
47             requires {required} hex chars but hash is only {} chars",
48            hex.len()
49        );
50        (0..levels)
51            .map(|i| {
52                let start = i * chars_per_level;
53                let end = start + chars_per_level;
54                hex[start..end].to_string()
55            })
56            .collect()
57    }
58}
59
60impl std::fmt::Display for ContentHash {
61    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62        write!(f, "{}", self.to_hex())
63    }
64}
65
66/// Hash the contents of a byte slice.
67#[must_use]
68pub fn hash_bytes(data: &[u8]) -> ContentHash {
69    let hash = blake3::hash(data);
70    ContentHash(*hash.as_bytes())
71}
72
73/// Incremental hasher for building a `ContentHash` from multiple updates.
74///
75/// Avoids allocating an intermediate buffer when the input is spread across
76/// multiple slices (e.g., request fingerprinting).
77pub struct StreamHasher(blake3::Hasher);
78
79impl StreamHasher {
80    /// Create a new streaming hasher.
81    #[must_use]
82    pub fn new() -> Self {
83        Self(blake3::Hasher::new())
84    }
85
86    /// Feed bytes into the hasher.
87    pub fn update(&mut self, data: &[u8]) -> &mut Self {
88        self.0.update(data);
89        self
90    }
91
92    /// Finalize and return the hash.
93    #[must_use]
94    pub fn finalize(self) -> ContentHash {
95        ContentHash(*self.0.finalize().as_bytes())
96    }
97}
98
99impl Default for StreamHasher {
100    fn default() -> Self {
101        Self::new()
102    }
103}
104
105/// Hash the contents of a reader.
106///
107/// # Errors
108///
109/// Returns an error if reading from the reader fails.
110pub fn hash_reader<R: Read>(mut reader: R) -> std::io::Result<ContentHash> {
111    let mut hasher = blake3::Hasher::new();
112    let mut buf = [0u8; 16384];
113    loop {
114        let n = reader.read(&mut buf)?;
115        if n == 0 {
116            break;
117        }
118        hasher.update(&buf[..n]);
119    }
120    Ok(ContentHash(*hasher.finalize().as_bytes()))
121}
122
123/// Hash the contents of a file using memory mapping.
124///
125/// Uses `memmap2` for zero-copy file access. The OS page cache ensures
126/// files recently read (e.g., during compilation) are hashed from memory,
127/// not disk. Falls back to buffered reading for empty files.
128///
129/// # Errors
130///
131/// Returns an error if the file cannot be read.
132///
133/// # Safety
134///
135/// Memory mapping is technically unsafe if another process modifies the file
136/// concurrently. The TOCTOU check in `MetadataCache::hash_and_insert` detects
137/// this by comparing stat before and after hashing.
138pub fn hash_file(path: &Path) -> std::io::Result<ContentHash> {
139    let file = std::fs::File::open(path)?;
140    let meta = file.metadata()?;
141
142    if meta.len() == 0 {
143        return Ok(hash_bytes(b""));
144    }
145
146    // SAFETY: The caller (MetadataCache::hash_and_insert) stats before and
147    // after hashing to detect concurrent modification.
148    let mmap = unsafe { memmap2::Mmap::map(&file)? };
149    Ok(hash_bytes(&mmap))
150}
151
/// Lowercase hex encoding of a byte slice, without separators.
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        out.push(DIGITS[usize::from(byte >> 4)] as char);
        out.push(DIGITS[usize::from(byte & 0x0f)] as char);
    }
    out
}
160
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hash_deterministic() {
        // Same input must always yield the same digest.
        assert_eq!(hash_bytes(b"hello world"), hash_bytes(b"hello world"));
    }

    #[test]
    fn hash_different_inputs() {
        assert_ne!(hash_bytes(b"hello"), hash_bytes(b"world"));
    }

    #[test]
    fn hex_roundtrip() {
        // 32 digest bytes -> 64 hex characters.
        assert_eq!(hash_bytes(b"test").to_hex().len(), 64);
    }

    #[test]
    fn shard_prefix_works() {
        let shards = hash_bytes(b"test").shard_prefix(2, 1);
        assert_eq!(shards.len(), 2);
        assert!(shards.iter().all(|s| s.len() == 2));
    }

    #[test]
    fn shard_prefix_max_valid() {
        // 32 bytes = 64 hex chars. 32 levels of 1 byte each uses all 64 chars.
        assert_eq!(hash_bytes(b"test").shard_prefix(32, 1).len(), 32);
    }

    #[test]
    #[should_panic(expected = "shard_prefix")]
    fn shard_prefix_overflow_panics() {
        // Bug: shard_prefix(33, 1) would index past the 64-char hex string,
        // causing an opaque "index out of bounds" panic. Now panics with a
        // descriptive message.
        let _ = hash_bytes(b"test").shard_prefix(33, 1);
    }

    #[test]
    #[should_panic(expected = "shard_prefix")]
    fn shard_prefix_large_bytes_per_level_panics() {
        // 2 levels of 17 bytes each = 34 bytes > 32 hash bytes.
        let _ = hash_bytes(b"test").shard_prefix(2, 17);
    }
}