Skip to main content

batuta/playbook/
hasher.rs

1//! BLAKE3 hashing for playbook cache keys (PB-003)
2//!
3//! Provides deterministic hashing for files, directories, parameters, and commands.
4//! All hashes are formatted as `"blake3:{hex}"`.
5//! Uses streaming I/O to avoid OOM on large files.
6
7use super::types::yaml_value_to_string;
8use anyhow::{Context, Result};
9use std::collections::HashMap;
10use std::io::Read;
11use std::path::Path;
12
13/// Hash a single file's contents via BLAKE3 (streaming)
14pub fn hash_file(path: &Path) -> Result<String> {
15    let mut file = std::fs::File::open(path)
16        .with_context(|| format!("failed to open file for hashing: {}", path.display()))?;
17    let mut hasher = blake3::Hasher::new();
18    let mut buf = [0u8; 65536];
19    loop {
20        let n = file
21            .read(&mut buf)
22            .with_context(|| format!("failed to read file: {}", path.display()))?;
23        if n == 0 {
24            break;
25        }
26        hasher.update(&buf[..n]);
27    }
28    let hash = hasher.finalize();
29    Ok(format!("blake3:{}", hash.to_hex()))
30}
31
/// Result of hashing a directory (or a single file treated as one).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DirHashResult {
    /// Digest formatted as `"blake3:{hex}"`.
    pub hash: String,
    /// Number of files that contributed to the digest.
    pub file_count: u64,
    /// Sum of the byte sizes of the files that contributed to the digest.
    pub total_bytes: u64,
}
39
40/// Stream a file's contents into a hasher, returning its byte size.
41fn stream_into_hasher(path: &Path, hasher: &mut blake3::Hasher) -> Result<u64> {
42    let meta =
43        std::fs::metadata(path).with_context(|| format!("failed to stat: {}", path.display()))?;
44    let mut file =
45        std::fs::File::open(path).with_context(|| format!("failed to open: {}", path.display()))?;
46    let mut buf = [0u8; 65536];
47    loop {
48        let n = file.read(&mut buf)?;
49        if n == 0 {
50            break;
51        }
52        hasher.update(&buf[..n]);
53    }
54    Ok(meta.len())
55}
56
57/// Hash a directory by walking files in sorted order (streaming I/O)
58pub fn hash_directory(path: &Path) -> Result<DirHashResult> {
59    if !path.is_dir() {
60        let mut hasher = blake3::Hasher::new();
61        let size = stream_into_hasher(path, &mut hasher)?;
62        let hash = hasher.finalize();
63        return Ok(DirHashResult {
64            hash: format!("blake3:{}", hash.to_hex()),
65            file_count: 1,
66            total_bytes: size,
67        });
68    }
69
70    let mut entries: Vec<std::path::PathBuf> = Vec::new();
71    collect_files_sorted(path, &mut entries)?;
72
73    let mut hasher = blake3::Hasher::new();
74    let mut file_count = 0u64;
75    let mut total_bytes = 0u64;
76
77    for entry in &entries {
78        let rel = entry.strip_prefix(path).unwrap_or(entry);
79        hasher.update(rel.to_string_lossy().as_bytes());
80        total_bytes += stream_into_hasher(entry, &mut hasher)?;
81        file_count += 1;
82    }
83
84    let hash = hasher.finalize();
85    Ok(DirHashResult { hash: format!("blake3:{}", hash.to_hex()), file_count, total_bytes })
86}
87
88fn collect_files_sorted(dir: &Path, out: &mut Vec<std::path::PathBuf>) -> Result<()> {
89    let mut entries: Vec<std::path::PathBuf> = Vec::new();
90
91    for entry in
92        std::fs::read_dir(dir).with_context(|| format!("failed to read dir: {}", dir.display()))?
93    {
94        let entry = entry?;
95        let ft = entry.file_type()?;
96        // Skip symlinks to avoid circular references and symlink attacks
97        if ft.is_symlink() {
98            continue;
99        }
100        entries.push(entry.path());
101    }
102
103    // Sort for deterministic ordering
104    entries.sort();
105
106    for entry in entries {
107        if entry.is_dir() {
108            collect_files_sorted(&entry, out)?;
109        } else {
110            out.push(entry);
111        }
112    }
113
114    Ok(())
115}
116
/// Hash a dependency (file or directory).
///
/// Delegates to [`hash_directory`], which handles both cases: a path that is
/// not a directory is hashed as a single file.
///
/// # Errors
///
/// Propagates any error returned by [`hash_directory`].
pub fn hash_dep(path: &Path) -> Result<DirHashResult> {
    hash_directory(path)
}
121
122/// Hash the parameter set relevant to a stage
123///
124/// Uses the union of declared param keys and template-extracted refs.
125/// Sorted by key for determinism.
126pub fn hash_params(
127    global_params: &HashMap<String, serde_yaml_ng::Value>,
128    referenced_keys: &[String],
129) -> Result<String> {
130    let mut pairs: Vec<(String, String)> = Vec::new();
131
132    for key in referenced_keys {
133        if let Some(val) = global_params.get(key) {
134            pairs.push((key.clone(), yaml_value_to_string(val)));
135        }
136    }
137
138    pairs.sort_by(|a, b| a.0.cmp(&b.0));
139
140    let mut hasher = blake3::Hasher::new();
141    for (k, v) in &pairs {
142        hasher.update(k.as_bytes());
143        hasher.update(b"=");
144        hasher.update(v.as_bytes());
145        hasher.update(b"\n");
146    }
147
148    let hash = hasher.finalize();
149    Ok(format!("blake3:{}", hash.to_hex()))
150}
151
/// Collect the parameter names referenced by `{{ params.NAME }}` placeholders.
///
/// The command is scanned left to right for `{{ ... }}` pairs. A placeholder
/// counts when its whitespace-trimmed content starts with `params.`; the key
/// is everything after that prefix. Keys are returned in order of first
/// appearance, without duplicates. Slicing only happens at the ASCII
/// delimiters, so multi-byte UTF-8 text around placeholders is safe.
pub fn extract_param_refs(cmd: &str) -> Vec<String> {
    let mut found: Vec<String> = Vec::new();
    let mut rest = cmd;

    while let Some(open) = rest.find("{{") {
        let after_open = &rest[open + 2..];
        match after_open.find("}}") {
            // No closer anywhere in the remainder, so no later placeholder
            // can be complete either.
            None => break,
            Some(close) => {
                let inner = after_open[..close].trim();
                if let Some(name) = inner.strip_prefix("params.") {
                    if !found.iter().any(|seen| seen == name) {
                        found.push(name.to_owned());
                    }
                }
                // Resume scanning right after the closing delimiter.
                rest = &after_open[close + 2..];
            }
        }
    }

    found
}
179
180/// Compute the effective param keys for a stage.
181///
182/// Union of explicitly declared `stage.params` keys and template-extracted refs.
183/// This implements spec §2.3 granular param invalidation.
184pub fn effective_param_keys(declared: &Option<Vec<String>>, cmd: &str) -> Vec<String> {
185    let mut keys = extract_param_refs(cmd);
186    if let Some(declared_keys) = declared {
187        for k in declared_keys {
188            if !keys.contains(k) {
189                keys.push(k.clone());
190            }
191        }
192    }
193    keys
194}
195
196/// Hash a resolved command string
197pub fn hash_cmd(resolved_cmd: &str) -> String {
198    let hash = blake3::hash(resolved_cmd.as_bytes());
199    format!("blake3:{}", hash.to_hex())
200}
201
202/// Compute composite cache key from component hashes
203///
204/// `cache_key = BLAKE3(cmd_hash || deps_hash || params_hash)`
205pub fn compute_cache_key(cmd_hash: &str, deps_hash: &str, params_hash: &str) -> String {
206    let mut hasher = blake3::Hasher::new();
207    hasher.update(cmd_hash.as_bytes());
208    hasher.update(deps_hash.as_bytes());
209    hasher.update(params_hash.as_bytes());
210    let hash = hasher.finalize();
211    format!("blake3:{}", hash.to_hex())
212}
213
214/// Compute the combined deps hash from individual dependency hashes
215pub fn combine_deps_hashes(hashes: &[String]) -> String {
216    let mut hasher = blake3::Hasher::new();
217    for h in hashes {
218        hasher.update(h.as_bytes());
219    }
220    let hash = hasher.finalize();
221    format!("blake3:{}", hash.to_hex())
222}
223
#[cfg(test)]
// Test names embed the upper-case ticket id `PB003`, which is not snake case.
#[allow(non_snake_case)]
mod tests {
    use super::*;

    /// Build a params map whose values are all YAML strings.
    fn make_params(pairs: &[(&str, &str)]) -> HashMap<String, serde_yaml_ng::Value> {
        pairs
            .iter()
            .map(|(k, v)| (k.to_string(), serde_yaml_ng::Value::String(v.to_string())))
            .collect()
    }

    /// Hashing the same file twice yields the same prefixed digest.
    #[test]
    fn test_PB003_hash_file_deterministic() {
        let dir = tempfile::tempdir().expect("tempdir creation failed");
        let file = dir.path().join("test.txt");
        std::fs::write(&file, b"hello world").expect("fs write failed");

        let h1 = hash_file(&file).expect("unexpected failure");
        let h2 = hash_file(&file).expect("unexpected failure");
        assert_eq!(h1, h2);
        assert!(h1.starts_with("blake3:"));
    }

    /// Rewriting a file with different content changes its hash.
    #[test]
    fn test_PB003_hash_file_changes_with_content() {
        let dir = tempfile::tempdir().expect("tempdir creation failed");
        let file = dir.path().join("test.txt");

        std::fs::write(&file, b"hello").expect("fs write failed");
        let h1 = hash_file(&file).expect("unexpected failure");

        std::fs::write(&file, b"world").expect("fs write failed");
        let h2 = hash_file(&file).expect("unexpected failure");

        assert_ne!(h1, h2);
    }

    /// A flat directory reports the right counts and hashes reproducibly.
    #[test]
    fn test_PB003_hash_directory_sorted_walk() {
        let dir = tempfile::tempdir().expect("tempdir creation failed");
        std::fs::write(dir.path().join("b.txt"), b"content-b").expect("fs write failed");
        std::fs::write(dir.path().join("a.txt"), b"content-a").expect("fs write failed");

        let r1 = hash_directory(dir.path()).expect("unexpected failure");
        assert!(r1.hash.starts_with("blake3:"));
        assert_eq!(r1.file_count, 2);
        assert_eq!(r1.total_bytes, 18);

        let r2 = hash_directory(dir.path()).expect("unexpected failure");
        assert_eq!(r1.hash, r2.hash);
    }

    /// Passing a regular file to `hash_directory` counts it as one file.
    #[test]
    fn test_PB003_hash_directory_single_file() {
        let dir = tempfile::tempdir().expect("tempdir creation failed");
        let file = dir.path().join("only.txt");
        std::fs::write(&file, b"data").expect("fs write failed");

        let result = hash_directory(&file).expect("unexpected failure");
        assert_eq!(result.file_count, 1);
        assert_eq!(result.total_bytes, 4);
    }

    /// The order of referenced keys does not affect the params hash.
    #[test]
    fn test_PB003_hash_params_sorted() {
        let global = make_params(&[("b", "2"), ("a", "1")]);
        let refs = vec!["a".to_string(), "b".to_string()];

        let h1 = hash_params(&global, &refs).expect("unexpected failure");

        // Different reference order, same result
        let refs2 = vec!["b".to_string(), "a".to_string()];
        let h2 = hash_params(&global, &refs2).expect("unexpected failure");

        assert_eq!(h1, h2);
        assert!(h1.starts_with("blake3:"));
    }

    /// Equal commands hash equally; different commands hash differently.
    #[test]
    fn test_PB003_hash_cmd() {
        let h1 = hash_cmd("echo hello");
        let h2 = hash_cmd("echo hello");
        let h3 = hash_cmd("echo world");

        assert_eq!(h1, h2);
        assert_ne!(h1, h3);
        assert!(h1.starts_with("blake3:"));
    }

    /// The cache key is deterministic and changes when a component changes.
    #[test]
    fn test_PB003_compute_cache_key() {
        let key1 = compute_cache_key("blake3:aaa", "blake3:bbb", "blake3:ccc");
        let key2 = compute_cache_key("blake3:aaa", "blake3:bbb", "blake3:ccc");
        let key3 = compute_cache_key("blake3:xxx", "blake3:bbb", "blake3:ccc");

        assert_eq!(key1, key2);
        assert_ne!(key1, key3);
    }

    /// Param refs are extracted in order of appearance.
    #[test]
    fn test_PB003_extract_param_refs() {
        let refs = extract_param_refs("run --model {{params.model}} --lang {{params.lang}} plain");
        assert_eq!(refs, vec!["model", "lang"]);
    }

    /// A command without placeholders yields no refs.
    #[test]
    fn test_PB003_extract_param_refs_no_refs() {
        let refs = extract_param_refs("echo hello world");
        assert!(refs.is_empty());
    }

    /// A key referenced twice is reported once.
    #[test]
    fn test_PB003_extract_param_refs_dedup() {
        let refs = extract_param_refs("{{params.x}} and {{params.x}} again");
        assert_eq!(refs, vec!["x"]);
    }

    /// Effective keys are template refs followed by extra declared keys.
    #[test]
    fn test_PB003_effective_param_keys() {
        // Template refs only
        let keys = effective_param_keys(&None, "echo {{params.model}}");
        assert_eq!(keys, vec!["model"]);

        // Declared + template: union
        let declared = Some(vec!["chunk_size".to_string()]);
        let keys = effective_param_keys(&declared, "echo {{params.model}}");
        assert_eq!(keys, vec!["model", "chunk_size"]);
    }

    /// Combining deps hashes is deterministic and order-sensitive.
    #[test]
    fn test_PB003_combine_deps_hashes() {
        let h1 = combine_deps_hashes(&["blake3:aaa".to_string(), "blake3:bbb".to_string()]);
        let h2 = combine_deps_hashes(&["blake3:aaa".to_string(), "blake3:bbb".to_string()]);
        assert_eq!(h1, h2);

        // Order matters
        let h3 = combine_deps_hashes(&["blake3:bbb".to_string(), "blake3:aaa".to_string()]);
        assert_ne!(h1, h3);
    }

    /// Hashing a missing file reports an open failure.
    #[test]
    fn test_PB003_hash_file_missing() {
        let err = hash_file(Path::new("/nonexistent/file.txt")).unwrap_err();
        assert!(err.to_string().contains("failed to open"));
    }

    /// Files in subdirectories are included in the counts.
    #[test]
    fn test_PB003_hash_directory_nested() {
        let dir = tempfile::tempdir().expect("tempdir creation failed");
        std::fs::create_dir(dir.path().join("sub")).expect("unexpected failure");
        std::fs::write(dir.path().join("top.txt"), b"top").expect("fs write failed");
        std::fs::write(dir.path().join("sub").join("nested.txt"), b"nested")
            .expect("fs write failed");

        let result = hash_directory(dir.path()).expect("unexpected failure");
        assert_eq!(result.file_count, 2);
        assert_eq!(result.total_bytes, 9);
    }

    /// Multi-byte characters around placeholders do not break extraction.
    #[test]
    fn test_PB003_extract_param_refs_unicode_safe() {
        let refs = extract_param_refs("echo {{params.model}} — résumé {{params.lang}}");
        assert_eq!(refs, vec!["model", "lang"]);
    }
}