Skip to main content

alef_core/
hash.rs

1//! Content hashing and generated-file headers.
2//!
3//! Every file produced by alef gets a standard header that identifies it as
4//! generated, tells agents/developers how to fix issues, and embeds a blake3
5//! hash so `alef verify` can detect staleness without external state.
6//!
7//! # Hash semantics
8//!
9//! As of alef v0.10.1, the embedded `alef:hash:<hex>` value is a **per-file
10//! source+output fingerprint** produced by [`compute_file_hash`]:
11//!
//! ```text
//! blake3("sources\0" || sources_hash || "\0content\0" || file_content_without_hash_line)
//! ```
15//!
16//! Where `sources_hash` is [`compute_sources_hash`] over the sorted Rust source
17//! files alef parses to build the IR. The hash deliberately does **not**
18//! include the alef version or `alef.toml`: any input change that affects the
19//! generated bytes is already reflected by hashing the file content itself,
20//! and excluding the alef version makes `alef verify` idempotent across
21//! `alef` upgrades — a CI run on a tagged repo continues to pass after the
22//! alef CLI is bumped, as long as the rust sources and emitted file contents
23//! are unchanged.
24//!
//! `alef generate` finalises the embedded hash *after* downstream formatters
//! (rustfmt, rubocop, dotnet format, spotless, biome, mix format, php-cs-fixer,
//! …) have run, so the embedded hash describes the actual on-disk
//! byte-content. `alef verify` reads the file, strips the `alef:hash:` line,
//! recomputes the same hash, and compares — no regeneration, no writes.
31//!
32//! Pre-v0.10.1 alef used a single input-deterministic hash that incorporated
33//! the alef CLI version, which forced every consumer repo to re-run
34//! `alef generate` after every alef bump even when nothing else changed.
35
/// Token that introduces the embedded hash inside a generated file's header
/// comment. The hex digest follows it directly (`alef:hash:<hex>`):
/// [`inject_hash_line`] writes it, [`extract_hash`] searches for it, and
/// [`strip_hash_line`] drops every line that contains it.
const HASH_PREFIX: &str = "alef:hash:";
37
/// Text of the standard generated-file header, one logical line per `\n`,
/// with no comment delimiters and no trailing newline.
///
/// [`header`] wraps each line in the comment syntax of the target language.
const HEADER_BODY: &str = concat!(
    "This file is auto-generated by alef — DO NOT EDIT.\n",
    "To regenerate: alef generate\n",
    "To verify freshness: alef verify --exit-code\n",
    "Issues & docs: https://github.com/kreuzberg-dev/alef",
);
45
/// Comment style for the generated header.
///
/// Selects the delimiters [`header`] wraps around each header line.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CommentStyle {
    /// `// line comment`  (Rust, Go, Java, C#, TypeScript, C, PHP)
    ///
    /// Every header line is emitted with a `// ` prefix.
    DoubleSlash,
    /// `# line comment`   (Python, Ruby, Elixir, R, TOML, Shell, Makefile)
    ///
    /// Every header line is emitted with a `# ` prefix.
    Hash,
    /// `/* block comment */` (C headers)
    ///
    /// The header is emitted as a `/*` line, the body lines prefixed with
    /// ` * `, and a closing ` */` line.
    Block,
}
56
57/// Return the standard alef header as a comment block.
58///
59/// ```text
60/// // This file is auto-generated by alef — DO NOT EDIT.
61/// // To regenerate: alef generate
62/// // To verify freshness: alef verify --exit-code
63/// // Issues & docs: https://github.com/kreuzberg-dev/alef
64/// ```
65pub fn header(style: CommentStyle) -> String {
66    match style {
67        CommentStyle::DoubleSlash => HEADER_BODY.lines().map(|l| format!("// {l}\n")).collect(),
68        CommentStyle::Hash => HEADER_BODY.lines().map(|l| format!("# {l}\n")).collect(),
69        CommentStyle::Block => {
70            let mut out = String::from("/*\n");
71            for line in HEADER_BODY.lines() {
72                out.push_str(&format!(" * {line}\n"));
73            }
74            out.push_str(" */\n");
75            out
76        }
77    }
78}
79
/// The marker string that `inject_hash_line` searches for to decide where the
/// hash line goes. (`extract_hash` does not use it; it looks for the
/// `alef:hash:` token instead.)
///
/// It is part of the first line of the standard header text, so it appears on
/// the first line of `//` and `#` style headers and on the second line of a
/// block-style header (after the `/*` opener).
const HEADER_MARKER: &str = "auto-generated by alef";
83
84/// Blake3 hash of a content string, returned as hex.
85///
86/// Used by the IR / language caches and any caller that needs a hash of an
87/// in-memory string. **Not used for the embedded `alef:hash:` header** — that
88/// is computed by [`compute_file_hash`].
89pub fn hash_content(content: &str) -> String {
90    blake3::hash(content.as_bytes()).to_hex().to_string()
91}
92
93/// Compute a stable hash over the Rust source files that alef extracts.
94///
95/// This is the "source side" of the per-file verify hash. Sources are sorted
96/// by path so the hash is stable regardless of ordering in
97/// `alef.toml`'s `[crate].sources`. The path is mixed in alongside the
98/// content because the same byte-content at a different path produces
99/// different IR (the `rust_path` on extracted types differs).
100///
101/// Used by [`compute_file_hash`]; not by itself the value embedded in any
102/// file header.
103///
104/// # Errors
105/// Returns an error if any source file is missing or unreadable.
106pub fn compute_sources_hash(sources: &[std::path::PathBuf]) -> std::io::Result<String> {
107    let mut hasher = blake3::Hasher::new();
108    let mut sorted: Vec<&std::path::PathBuf> = sources.iter().collect();
109    sorted.sort();
110    for source in sorted {
111        let content = std::fs::read(source)?;
112        hasher.update(b"src\0");
113        hasher.update(source.to_string_lossy().as_bytes());
114        hasher.update(b"\0");
115        hasher.update(&content);
116    }
117    Ok(hasher.finalize().to_hex().to_string())
118}
119
120/// Compute the per-file verify hash that alef embeds in each generated file.
121///
122/// `sources_hash` comes from [`compute_sources_hash`]. `content` is the file
123/// content; any pre-existing `alef:hash:` line is stripped before hashing so
124/// the function is idempotent — calling it on file content that already has a
125/// hash line returns the same value as calling it on the same content with no
126/// hash line. This makes the verify path symmetric with the generate path:
127///
128/// - **Generate**: write the file, run formatters, then call this with the
129///   on-disk content and inject the result.
130/// - **Verify**: read the file, extract the existing hash line, call this
131///   with the on-disk content, compare.
132pub fn compute_file_hash(sources_hash: &str, content: &str) -> String {
133    let stripped = strip_hash_line(content);
134    let mut hasher = blake3::Hasher::new();
135    hasher.update(b"sources\0");
136    hasher.update(sources_hash.as_bytes());
137    hasher.update(b"\0content\0");
138    hasher.update(stripped.as_bytes());
139    hasher.finalize().to_hex().to_string()
140}
141
142/// Inject an `alef:hash:<hex>` line immediately after the first header marker
143/// line found in the first 10 lines.  The comment syntax is inferred from the
144/// marker line itself.
145///
146/// If no marker line is found, the content is returned unchanged.
147pub fn inject_hash_line(content: &str, hash: &str) -> String {
148    let mut result = String::with_capacity(content.len() + 80);
149    let mut injected = false;
150
151    for (i, line) in content.lines().enumerate() {
152        result.push_str(line);
153        result.push('\n');
154
155        if !injected && i < 10 && line.contains(HEADER_MARKER) {
156            let trimmed = line.trim();
157            let hash_line = if trimmed.starts_with("<!--") {
158                // XML comment: inject hash line as XML comment
159                format!("<!-- {HASH_PREFIX}{hash} -->")
160            } else if trimmed.starts_with("//") {
161                format!("// {HASH_PREFIX}{hash}")
162            } else if trimmed.starts_with('#') {
163                format!("# {HASH_PREFIX}{hash}")
164            } else if trimmed.starts_with("/*") || trimmed.starts_with(" *") || trimmed.ends_with("*/") {
165                format!(" * {HASH_PREFIX}{hash}")
166            } else {
167                format!("// {HASH_PREFIX}{hash}")
168            };
169            result.push_str(&hash_line);
170            result.push('\n');
171            injected = true;
172        }
173    }
174
175    // Preserve original trailing-newline behavior.
176    if !content.ends_with('\n') && result.ends_with('\n') {
177        result.pop();
178    }
179
180    result
181}
182
183/// Extract the hash from an `alef:hash:<hex>` token in the first 10 lines.
184pub fn extract_hash(content: &str) -> Option<String> {
185    for (i, line) in content.lines().enumerate() {
186        if i >= 10 {
187            break;
188        }
189        if let Some(pos) = line.find(HASH_PREFIX) {
190            let rest = &line[pos + HASH_PREFIX.len()..];
191            // Trim trailing comment closers and whitespace.
192            let hex = rest.trim().trim_end_matches("*/").trim_end_matches("-->").trim();
193            if !hex.is_empty() {
194                return Some(hex.to_string());
195            }
196        }
197    }
198    None
199}
200
201/// Strip the `alef:hash:` line from content (for fallback comparison).
202pub fn strip_hash_line(content: &str) -> String {
203    let mut result = String::with_capacity(content.len());
204    for line in content.lines() {
205        if line.contains(HASH_PREFIX) {
206            continue;
207        }
208        result.push_str(line);
209        result.push('\n');
210    }
211    // Preserve original trailing-newline behavior.
212    if !content.ends_with('\n') && result.ends_with('\n') {
213        result.pop();
214    }
215    result
216}
217
// Unit tests for header rendering, hash-line injection/extraction/stripping,
// and the source / per-file hash functions.
#[cfg(test)]
mod tests {
    use super::*;

    // `//`-style header: checks that the first and last body lines carry the `// ` prefix.
    #[test]
    fn test_header_double_slash() {
        let h = header(CommentStyle::DoubleSlash);
        assert!(h.contains("// This file is auto-generated by alef"));
        assert!(h.contains("// Issues & docs: https://github.com/kreuzberg-dev/alef"));
    }

    // `#`-style header: checks that the first body line carries the `# ` prefix.
    #[test]
    fn test_header_hash() {
        let h = header(CommentStyle::Hash);
        assert!(h.contains("# This file is auto-generated by alef"));
    }

    // Block-style header: opens with `/*`, prefixes body lines with ` * `, closes with ` */`.
    #[test]
    fn test_header_block() {
        let h = header(CommentStyle::Block);
        assert!(h.starts_with("/*\n"));
        assert!(h.contains(" * This file is auto-generated by alef"));
        assert!(h.ends_with(" */\n"));
    }

    // Inject into a `//` header, then read the same hash back with `extract_hash`.
    #[test]
    fn test_inject_and_extract_rust() {
        let h = header(CommentStyle::DoubleSlash);
        let content = format!("{h}use foo;\n");
        let hash = hash_content(&content);
        let injected = inject_hash_line(&content, &hash);
        assert!(injected.contains(HASH_PREFIX));
        assert_eq!(extract_hash(&injected), Some(hash));
    }

    // Same round trip for a `#` header; also checks the injected line uses a `# ` prefix.
    #[test]
    fn test_inject_and_extract_python() {
        let h = header(CommentStyle::Hash);
        let content = format!("{h}import foo\n");
        let hash = hash_content(&content);
        let injected = inject_hash_line(&content, &hash);
        assert!(injected.contains(&format!("# {HASH_PREFIX}")));
        assert_eq!(extract_hash(&injected), Some(hash));
    }

    // Same round trip for a block header. Only the presence of the hash token is
    // asserted, not the comment prefix of the injected line.
    #[test]
    fn test_inject_and_extract_c_block() {
        let h = header(CommentStyle::Block);
        let content = format!("{h}#include <stdio.h>\n");
        let hash = hash_content(&content);
        let injected = inject_hash_line(&content, &hash);
        assert!(injected.contains(HASH_PREFIX));
        assert_eq!(extract_hash(&injected), Some(hash));
    }

    // The marker does not have to be on the first line: here `<?php` precedes the
    // header, and the hash must still be injected and extractable.
    #[test]
    fn test_inject_php_line2() {
        let h = header(CommentStyle::DoubleSlash);
        let content = format!("<?php\n{h}namespace Foo;\n");
        let hash = hash_content(&content);
        let injected = inject_hash_line(&content, &hash);
        let lines: Vec<&str> = injected.lines().collect();
        assert_eq!(lines[0], "<?php");
        assert!(lines[1].contains(HEADER_MARKER));
        assert!(lines.iter().any(|l| l.contains(HASH_PREFIX)));
        assert_eq!(extract_hash(&injected), Some(hash));
    }

    // Content without the header marker passes through `inject_hash_line` untouched.
    #[test]
    fn test_no_header_returns_unchanged() {
        let content = "fn main() {}\n";
        let injected = inject_hash_line(content, "abc123");
        assert_eq!(injected, content);
        assert_eq!(extract_hash(&injected), None);
    }

    // `strip_hash_line` removes exactly the hash line and keeps the trailing newline.
    #[test]
    fn test_strip_hash_line() {
        let content = "// auto-generated by alef\n// alef:hash:abc123\nuse foo;\n";
        let stripped = strip_hash_line(content);
        assert_eq!(stripped, "// auto-generated by alef\nuse foo;\n");
    }

    // inject followed by strip must give back the original content byte-for-byte.
    #[test]
    fn test_roundtrip() {
        let h = header(CommentStyle::Hash);
        let original = format!("{h}import sys\n");
        let hash = hash_content(&original);
        let injected = inject_hash_line(&original, &hash);
        let stripped = strip_hash_line(&injected);
        assert_eq!(stripped, original);
        assert_eq!(hash_content(&stripped), hash);
    }

    // ----- compute_sources_hash / compute_file_hash --------------------------

    use std::path::{Path, PathBuf};
    use tempfile::tempdir;

    // Test helper: write `content` to `dir/name` and return the resulting path.
    // Panics (via `unwrap`) if the write fails.
    fn write_file(dir: &Path, name: &str, content: &str) -> PathBuf {
        let path = dir.join(name);
        std::fs::write(&path, content).unwrap();
        path
    }

    // The path is part of the hashed input, so identical bytes at two different
    // paths must hash differently.
    #[test]
    fn sources_hash_changes_when_path_changes_even_if_content_same() {
        let dir = tempdir().unwrap();
        let s_a = write_file(dir.path(), "a.rs", "fn a() {}");
        std::fs::create_dir_all(dir.path().join("moved")).unwrap();
        let s_b = write_file(dir.path(), "moved/a.rs", "fn a() {}");
        let h_a = compute_sources_hash(&[s_a]).unwrap();
        let h_b = compute_sources_hash(&[s_b]).unwrap();
        assert_ne!(
            h_a, h_b,
            "same content at a different path can produce different IR (rust_path differs)"
        );
    }

    // A source path that does not exist must surface as an `Err`, not be skipped.
    #[test]
    fn sources_hash_errors_on_missing_source() {
        let dir = tempdir().unwrap();
        let bogus = dir.path().join("does-not-exist.rs");
        assert!(compute_sources_hash(&[bogus]).is_err());
    }

    // Hashing the same unchanged files twice yields the same value.
    #[test]
    fn sources_hash_stable_across_runs() {
        let dir = tempdir().unwrap();
        let s1 = write_file(dir.path(), "a.rs", "fn a() {}");
        let s2 = write_file(dir.path(), "b.rs", "fn b() {}");
        let sources = vec![s1, s2];
        let h1 = compute_sources_hash(&sources).unwrap();
        let h2 = compute_sources_hash(&sources).unwrap();
        assert_eq!(h1, h2);
    }

    // The order of the input slice does not matter (paths are sorted before hashing).
    #[test]
    fn sources_hash_path_order_independent() {
        let dir = tempdir().unwrap();
        let s1 = write_file(dir.path(), "a.rs", "fn a() {}");
        let s2 = write_file(dir.path(), "b.rs", "fn b() {}");
        let h_forward = compute_sources_hash(&[s1.clone(), s2.clone()]).unwrap();
        let h_reverse = compute_sources_hash(&[s2, s1]).unwrap();
        assert_eq!(h_forward, h_reverse);
    }

    // Rewriting a source file with different content changes the hash.
    #[test]
    fn sources_hash_changes_with_content() {
        let dir = tempdir().unwrap();
        let s = write_file(dir.path(), "a.rs", "fn a() {}");
        let h_before = compute_sources_hash(std::slice::from_ref(&s)).unwrap();
        std::fs::write(&s, "fn a() { let _ = 1; }").unwrap();
        let h_after = compute_sources_hash(&[s]).unwrap();
        assert_ne!(h_before, h_after);
    }

    #[test]
    fn file_hash_idempotent_under_strip_hash_line() {
        // The defining property: hash(content with hash line) == hash(content without hash line).
        // This is what makes the verify path symmetric with the generate path.
        let sources_hash = "abc123";
        let bare = "// auto-generated by alef\nfn body() {}\n";
        let with_line = "// auto-generated by alef\n// alef:hash:deadbeef\nfn body() {}\n";

        let h1 = compute_file_hash(sources_hash, bare);
        let h2 = compute_file_hash(sources_hash, with_line);
        assert_eq!(h1, h2, "hash must ignore an existing alef:hash: line");
    }

    // Same file content with a different sources hash must give a different file hash.
    #[test]
    fn file_hash_changes_when_sources_change() {
        let content = "// auto-generated by alef\nfn body() {}\n";
        let h_a = compute_file_hash("sources_a", content);
        let h_b = compute_file_hash("sources_b", content);
        assert_ne!(h_a, h_b);
    }

    // Same sources hash with different file content must give a different file hash.
    #[test]
    fn file_hash_changes_when_content_changes() {
        let sources_hash = "abc123";
        let h_a = compute_file_hash(sources_hash, "fn a() {}\n");
        let h_b = compute_file_hash(sources_hash, "fn b() {}\n");
        assert_ne!(h_a, h_b);
    }

    // NOTE(review): the only runtime assertion here is the 64-character length of
    // the hex digest; version independence itself rests on the function signature.
    #[test]
    fn file_hash_independent_of_alef_version() {
        // Idempotency property: the hash is purely a function of (sources, content).
        // Bumping the alef CLI version must not change it. Encoded by the type
        // signature — there is no version parameter — but make it explicit so
        // a future regression that re-introduces a version dimension is caught.
        let h = compute_file_hash("sources_hash", "fn a() {}\n");
        assert_eq!(h.len(), 64, "blake3 hex output is 64 chars");
    }

    #[test]
    fn file_hash_round_trip_via_inject_extract() {
        // Simulate the full generate/verify cycle:
        // 1. generate: compute hash from stripped content, inject into header
        // 2. verify: read back, extract hash, recompute from content, compare
        let sources_hash = "abc123";
        let raw = "// auto-generated by alef\nfn body() {}\n";
        let file_hash = compute_file_hash(sources_hash, raw);
        let on_disk = inject_hash_line(raw, &file_hash);

        let extracted = extract_hash(&on_disk).expect("hash line should be present");
        let recomputed = compute_file_hash(sources_hash, &on_disk);
        assert_eq!(extracted, file_hash);
        assert_eq!(recomputed, file_hash);
        assert_eq!(extracted, recomputed, "verify must reproduce the embedded hash");
    }
}