Skip to main content

harn_vm/
bytecode_cache.rs

1//! Content-addressed on-disk cache for compiled `.harn` pipelines.
2//!
3//! Cold-start `harn run` re-parses, type-checks, and compiles the entry
4//! pipeline before the VM gets a single instruction to execute. For short
5//! Harn subcommands that wrap a few `llm_call`s in a small pipeline, that
6//! compile cost dominates wall-clock time.
7//!
//! This module persists [`Chunk`] bytecode under
//! `$HARN_CACHE_DIR/<source-hash>.harnbc` (XDG-aware). The cache key is
//! derived from the entry source plus the content hash of every
//! transitively-imported user file. Stdlib imports resolve to sources
//! embedded in the binary, so they are covered by a digest of the embedded
//! stdlib folded into the import hash (alongside the compiler fingerprint)
//! and by the `harn_version` field in the header. Any change to any input
//! flips the key and the next run recompiles.
14//!
15//! File layout — little-endian throughout:
16//!
17//! ```text
18//! magic        : [u8; 8]   = "HARNBC\0\0"
19//! schema_ver   : u32       = SCHEMA_VERSION
20//! version_len  : u32
21//! harn_version : [u8; version_len]
22//! compiler_tag : u8        bitmask of active CompilerOptions
23//! kind         : u8        1 = entry chunk, 2 = module artifact
24//! source_hash  : [u8; 32]
25//! import_hash  : [u8; 32]
26//! payload      : bincode-serialized payload for `kind`
27//! ```
28//!
29//! The header lets a stale binary detect a future-version artifact
30//! without crashing: a magic mismatch, schema mismatch, or version
31//! mismatch is returned as `Ok(None)` so the caller transparently
32//! recompiles. Real I/O errors propagate.
33//!
34//! Concurrency: writes are atomic (write-tmp-then-rename), and parallel
35//! invocations on a cache miss race safely — the last writer wins, but
36//! every reader observes a consistent file because the rename is atomic
37//! on every supported filesystem.
38
39use std::fs;
40use std::io::{self, Read as _, Write as _};
41use std::path::{Path, PathBuf};
42
43use sha2::{Digest, Sha256};
44
45use crate::chunk::{CachedChunk, Chunk};
46use crate::compiler::CompilerOptions;
47use crate::module_artifact::ModuleArtifact;
48
/// Header magic for all bytecode-cache artifact families. Written as the
/// first eight bytes of every artifact and compared byte-for-byte on load.
pub const MAGIC: &[u8; 8] = b"HARNBC\0\0";

/// On-disk format version. Bump when [`CachedChunk`] or the header
/// layout changes in a backwards-incompatible way. Artifacts carrying any
/// other value are treated as a cache miss.
pub const SCHEMA_VERSION: u32 = 4;

/// Compile-time Harn release. Cache files written by a different release
/// are rejected on load.
pub const HARN_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Build-time fingerprint of the compiler front-end — the lexer, parser, IR,
/// and code generator — computed in `build.rs` from those crates' source and
/// baked in via `cargo:rustc-env`. Folded into the cache key so a compiler
/// change that alters emitted bytecode for unchanged source invalidates stale
/// entries automatically, within a single version, with no manual cache wipe.
/// `HARN_VERSION` only busts the cache across release bumps; this closes the
/// same gap for the within-version compiler edits that masked #2610. See #2621.
pub const CODEGEN_FINGERPRINT: &str = env!("HARN_CODEGEN_FINGERPRINT");

/// Conventional extension for entry-chunk cache files.
pub const CACHE_EXTENSION: &str = "harnbc";

/// Conventional extension for module-artifact cache files. Distinct from
/// [`CACHE_EXTENSION`] so the same `.harn` source can have both shipped
/// adjacent if needed (e.g. when a file is both an executable entry and
/// imported by other files).
pub const MODULE_CACHE_EXTENSION: &str = "harnmod";

/// On-disk discriminant for a [`Chunk`] payload.
const KIND_ENTRY_CHUNK: u8 = 1;
/// On-disk discriminant for a [`ModuleArtifact`] payload.
const KIND_MODULE_ARTIFACT: u8 = 2;

/// Environment override for the cache directory. When set, takes
/// precedence over the XDG and home-directory fallbacks.
pub const CACHE_DIR_ENV: &str = "HARN_CACHE_DIR";

/// Environment override that turns the cache off entirely. Setting this
/// to `0`, `false`, `no`, or `off` (matched ASCII-case-insensitively) skips
/// both reads and writes; useful when debugging compiler changes. Any other
/// value, or leaving the variable unset, keeps the cache enabled.
pub const CACHE_ENABLED_ENV: &str = "HARN_BYTECODE_CACHE";
91
/// Result of a cache lookup. Carries the precomputed key so the caller
/// can write it back on a miss without rehashing.
pub struct LookupOutcome {
    /// Key computed for the looked-up source; populated on hits and misses
    /// alike.
    pub key: CacheKey,
    /// The cached chunk on a hit; `None` on a miss or when the cache is
    /// disabled.
    pub chunk: Option<Chunk>,
}
98
/// Cache key components for a single pipeline source. Equality of all
/// fields is necessary and sufficient for cache reuse.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CacheKey {
    /// SHA-256 of the entry source text. Also determines the artifact's
    /// filename in the shared cache directory.
    pub source_hash: [u8; 32],
    /// Digest over the transitively imported user files, with the embedded
    /// stdlib digest and the compiler fingerprint folded in.
    pub import_graph_hash: [u8; 32],
    /// Harn release string recorded in, and compared against, the artifact
    /// header.
    pub harn_version: &'static str,
    /// Compact tag for active [`CompilerOptions`]. Flipping
    /// `HARN_DISABLE_OPTIMIZATIONS` between runs would otherwise reuse a
    /// chunk compiled under the wrong setting.
    pub compiler_tag: u8,
}
111
112impl CacheKey {
113    /// Compute the cache key for a `.harn` source file plus its transitive
114    /// user imports. `read_source` is the entry-file contents; the import
115    /// graph is walked from disk relative to `source_path`.
116    pub fn from_source(source_path: &Path, source: &str) -> Self {
117        let source_hash = sha256(source.as_bytes());
118        let import_graph_hash = hash_transitive_user_imports(source_path, source);
119        Self {
120            source_hash,
121            import_graph_hash,
122            harn_version: HARN_VERSION,
123            compiler_tag: compiler_options_tag(CompilerOptions::from_env()),
124        }
125    }
126
127    /// Entry-chunk filename for this key. We hash by source content
128    /// alone so two invocations of the same source from different paths
129    /// share a cache entry; the header's import-graph hash still gates
130    /// reuse on a per-load basis.
131    pub fn filename(&self) -> String {
132        format!("{}.{}", hex(&self.source_hash), CACHE_EXTENSION)
133    }
134
135    /// Module-artifact filename for this key.
136    pub fn module_filename(&self) -> String {
137        format!("{}.{}", hex(&self.source_hash), MODULE_CACHE_EXTENSION)
138    }
139}
140
141/// Returns the directory the shared cache lives in. Honors
142/// `$HARN_CACHE_DIR`, then `$XDG_CACHE_HOME/harn/bytecode`, then
143/// `$HOME/.cache/harn/bytecode`. The directory is *not* created here —
144/// [`store`] creates it lazily on write so read-only environments don't
145/// pay an mkdir cost.
146pub fn cache_dir() -> PathBuf {
147    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
148        return PathBuf::from(custom);
149    }
150    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
151        let xdg = PathBuf::from(xdg);
152        if !xdg.as_os_str().is_empty() {
153            return xdg.join("harn").join("bytecode");
154        }
155    }
156    if let Some(home) = std::env::var_os("HOME") {
157        return PathBuf::from(home)
158            .join(".cache")
159            .join("harn")
160            .join("bytecode");
161    }
162    // Final fallback: a directory beside the binary's working dir. Mostly
163    // hit in tests that scrub HOME from the environment.
164    PathBuf::from(".harn-cache").join("bytecode")
165}
166
167/// Root for `.harnpack` archives unpacked by `harn run <bundle.harnpack>`.
168/// Each verified bundle is replayed into `<root>/<sanitized-bundle-hash>/`
169/// so re-runs reuse the unpacked tree. Honors `$HARN_CACHE_DIR/packs`
170/// when set, otherwise XDG / `$HOME/.cache/harn/packs`.
171pub fn packs_cache_dir() -> PathBuf {
172    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
173        return PathBuf::from(custom).join("packs");
174    }
175    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
176        let xdg = PathBuf::from(xdg);
177        if !xdg.as_os_str().is_empty() {
178            return xdg.join("harn").join("packs");
179        }
180    }
181    if let Some(home) = std::env::var_os("HOME") {
182        return PathBuf::from(home)
183            .join(".cache")
184            .join("harn")
185            .join("packs");
186    }
187    PathBuf::from(".harn-cache").join("packs")
188}
189
190/// True when the cache is enabled by the current environment.
191pub fn cache_enabled() -> bool {
192    match std::env::var(CACHE_ENABLED_ENV).ok().as_deref() {
193        Some(value) => !matches!(
194            value.to_ascii_lowercase().as_str(),
195            "0" | "false" | "no" | "off"
196        ),
197        None => true,
198    }
199}
200
201/// Try to load a cached chunk for `source_path` whose contents are
202/// `source`. Returns the key alongside the (optional) chunk so callers
203/// avoid recomputing the key on miss.
204pub fn load(source_path: &Path, source: &str) -> LookupOutcome {
205    let key = CacheKey::from_source(source_path, source);
206    if !cache_enabled() {
207        return LookupOutcome { key, chunk: None };
208    }
209    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
210    if let Some(adjacent) = adjacent_cache_path(source_path) {
211        candidates.push(adjacent);
212    }
213    candidates.push(cache_dir().join(key.filename()));
214    for path in candidates {
215        match read_chunk_if_matches(&path, &key) {
216            Ok(Some(chunk)) => {
217                return LookupOutcome {
218                    key,
219                    chunk: Some(chunk),
220                }
221            }
222            Ok(None) => continue,
223            Err(_) => continue,
224        }
225    }
226    LookupOutcome { key, chunk: None }
227}
228
229/// Persist `chunk` to the shared cache directory under `key`. Atomic: a
230/// temp file is written then renamed into place. Concurrent invocations
231/// on the same key race safely.
232pub fn store(key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
233    if !cache_enabled() {
234        return Ok(());
235    }
236    let dir = cache_dir();
237    fs::create_dir_all(&dir)?;
238    write_atomic_chunk(&dir.join(key.filename()), key, chunk)
239}
240
241/// Write a precompiled entry-chunk artifact to an explicit path, for
242/// use by the `harn precompile` subcommand. The header still records
243/// the key, so adjacent artifacts shipped with source are validated
244/// like any other cache hit.
245pub fn store_at(path: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
246    ensure_parent_dir(path)?;
247    write_atomic_chunk(path, key, chunk)
248}
249
250/// Look up the [`ModuleArtifact`] for `source_path` (whose contents are
251/// `source`). Mirrors [`load`] but for the `.harnmod` family.
252pub fn load_module(source_path: &Path, source: &str) -> ModuleLookupOutcome {
253    let key = CacheKey::from_source(source_path, source);
254    if !cache_enabled() {
255        return ModuleLookupOutcome {
256            key,
257            artifact: None,
258        };
259    }
260    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
261    if let Some(adjacent) = adjacent_module_cache_path(source_path) {
262        candidates.push(adjacent);
263    }
264    candidates.push(cache_dir().join(key.module_filename()));
265    for path in candidates {
266        match read_module_if_matches(&path, &key) {
267            Ok(Some(artifact)) => {
268                return ModuleLookupOutcome {
269                    key,
270                    artifact: Some(artifact),
271                }
272            }
273            Ok(None) => continue,
274            Err(_) => continue,
275        }
276    }
277    ModuleLookupOutcome {
278        key,
279        artifact: None,
280    }
281}
282
283/// Persist `artifact` to the shared cache under `key`. Atomic;
284/// concurrent invocations race safely.
285pub fn store_module(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
286    if !cache_enabled() {
287        return Ok(());
288    }
289    let dir = cache_dir();
290    fs::create_dir_all(&dir)?;
291    write_atomic_module(&dir.join(key.module_filename()), key, artifact)
292}
293
294/// Write a module artifact to an explicit path.
295pub fn store_module_at(path: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
296    ensure_parent_dir(path)?;
297    write_atomic_module(path, key, artifact)
298}
299
/// Result of a [`load_module`] lookup. Carries the precomputed key so
/// the caller can write it back on a miss without rehashing.
pub struct ModuleLookupOutcome {
    /// Key computed for the looked-up source; populated on hits and misses
    /// alike.
    pub key: CacheKey,
    /// The cached module artifact on a hit; `None` on a miss or when the
    /// cache is disabled.
    pub artifact: Option<ModuleArtifact>,
}
306
307/// Path to the adjacent precompiled entry-chunk artifact for
308/// `source_path`. `foo.harn` → `foo.harnbc`.
309pub fn adjacent_cache_path(source_path: &Path) -> Option<PathBuf> {
310    adjacent_path_with_extension(source_path, CACHE_EXTENSION)
311}
312
313/// Path to the adjacent precompiled module-artifact for `source_path`.
314/// `foo.harn` → `foo.harnmod`.
315pub fn adjacent_module_cache_path(source_path: &Path) -> Option<PathBuf> {
316    adjacent_path_with_extension(source_path, MODULE_CACHE_EXTENSION)
317}
318
/// Sibling of `source_path` with its final extension replaced by `ext`
/// (or `ext` appended when there is none).
///
/// Only the last extension is swapped, so names containing extra dots keep
/// them: `foo.test.harn` → `foo.test.<ext>`. Rebuilding the path from the
/// file stem and then setting the extension would instead drop `.test` and
/// make `foo.test.harn` collide with `foo.harn`.
///
/// Returns `None` when `source_path` has no file stem (for example `/` or
/// a path ending in `..`).
fn adjacent_path_with_extension(source_path: &Path, ext: &str) -> Option<PathBuf> {
    let stem = source_path.file_stem()?;
    if stem.is_empty() {
        return None;
    }
    Some(source_path.with_extension(ext))
}
329
/// Create the parent directory of `path` (and any missing ancestors).
/// A path with no parent component, or an empty one such as a bare file
/// name, needs nothing created and succeeds immediately.
fn ensure_parent_dir(path: &Path) -> io::Result<()> {
    match path.parent() {
        Some(parent) if !parent.as_os_str().is_empty() => fs::create_dir_all(parent),
        _ => Ok(()),
    }
}
338
339fn write_atomic_chunk(target: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
340    let buf = serialize_chunk_artifact(key, chunk)?;
341    write_atomic(target, &buf)
342}
343
344fn write_atomic_module(target: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
345    let buf = serialize_module_artifact(key, artifact)?;
346    write_atomic(target, &buf)
347}
348
349/// Serialize an entry-chunk artifact (header + payload) to bytes. The
350/// resulting buffer is byte-identical to the file [`store_at`] would
351/// have written for the same `(key, chunk)`. Use this when packaging
352/// artifacts into a container (e.g. `harn pack`) without going through
353/// the filesystem.
354pub fn serialize_chunk_artifact(key: &CacheKey, chunk: &Chunk) -> io::Result<Vec<u8>> {
355    let cached = chunk.freeze_for_cache();
356    let payload = bincode::serde::encode_to_vec(&cached, bincode::config::standard())
357        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
358    Ok(encode_artifact(key, KIND_ENTRY_CHUNK, &payload))
359}
360
361/// Serialize a module artifact (header + payload) to bytes. Companion
362/// to [`serialize_chunk_artifact`] for the `.harnmod` family.
363pub fn serialize_module_artifact(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<Vec<u8>> {
364    let payload = bincode::serde::encode_to_vec(artifact, bincode::config::standard())
365        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
366    Ok(encode_artifact(key, KIND_MODULE_ARTIFACT, &payload))
367}
368
369fn encode_artifact(key: &CacheKey, kind: u8, payload: &[u8]) -> Vec<u8> {
370    let mut buf: Vec<u8> = Vec::with_capacity(payload.len() + 128);
371    buf.extend_from_slice(MAGIC);
372    buf.extend_from_slice(&SCHEMA_VERSION.to_le_bytes());
373    let version_bytes = HARN_VERSION.as_bytes();
374    buf.extend_from_slice(&(version_bytes.len() as u32).to_le_bytes());
375    buf.extend_from_slice(version_bytes);
376    buf.push(key.compiler_tag);
377    buf.push(kind);
378    buf.extend_from_slice(&key.source_hash);
379    buf.extend_from_slice(&key.import_graph_hash);
380    buf.extend_from_slice(payload);
381    buf
382}
383
/// Write `buf` to `target` by creating a temp file in the same directory,
/// syncing it, and renaming it over `target`.
///
/// The temp name combines the process id with a process-wide counter, so
/// concurrent writers — including several threads of one process writing
/// the same target — never share a temp file. With a pid-only name, two
/// threads would truncate each other's temp file and the second rename
/// would fail because the file had already been moved.
///
/// # Errors
///
/// Returns the first I/O error from creating, writing, syncing, or
/// renaming the temp file. On any failure the temp file is removed
/// (best effort) so failed writes do not accumulate in the cache directory.
fn write_atomic(target: &Path, buf: &[u8]) -> io::Result<()> {
    use std::sync::atomic::{AtomicUsize, Ordering};

    // Distinguishes temp files created by different calls within this process.
    static TMP_SEQUENCE: AtomicUsize = AtomicUsize::new(0);

    let pid = std::process::id();
    let seq = TMP_SEQUENCE.fetch_add(1, Ordering::Relaxed);
    let tmp_name = match target.file_name() {
        Some(name) => format!(".{}.{pid}.{seq}.tmp", name.to_string_lossy()),
        None => format!(".harn-cache.{pid}.{seq}.tmp"),
    };
    let tmp_path = target.with_file_name(tmp_name);

    // The file handle is dropped at the end of the closure, i.e. before the
    // rename below.
    let written = (|| -> io::Result<()> {
        let mut tmp_file = fs::File::create(&tmp_path)?;
        tmp_file.write_all(buf)?;
        tmp_file.sync_all()
    })();

    let outcome = written.and_then(|()| fs::rename(&tmp_path, target));
    if outcome.is_err() {
        let _ = fs::remove_file(&tmp_path);
    }
    outcome
}
402
/// Parsed cache header. Read by both the chunk and module loaders so the
/// header-validation logic stays in one place.
struct ParsedHeader {
    /// Artifact-family discriminant byte as read from the header; each
    /// loader compares it against its own `KIND_*` constant.
    kind: u8,
    /// Every byte that follows the header, not yet decoded.
    payload: Vec<u8>,
}
409
410fn read_header_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ParsedHeader>> {
411    let mut file = match fs::File::open(path) {
412        Ok(f) => f,
413        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
414        Err(err) => return Err(err),
415    };
416    let mut header = [0u8; 8 + 4 + 4];
417    if file.read_exact(&mut header).is_err() {
418        return Ok(None);
419    }
420    if &header[..8] != MAGIC {
421        return Ok(None);
422    }
423    let schema = u32::from_le_bytes(header[8..12].try_into().unwrap());
424    if schema != SCHEMA_VERSION {
425        return Ok(None);
426    }
427    let version_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
428    if version_len > 256 {
429        // Bound the alloc so a corrupted file cannot force an unbounded read.
430        return Ok(None);
431    }
432    let mut version_buf = vec![0u8; version_len];
433    if file.read_exact(&mut version_buf).is_err() {
434        return Ok(None);
435    }
436    if version_buf != key.harn_version.as_bytes() {
437        return Ok(None);
438    }
439    let mut compiler_and_kind = [0u8; 2];
440    if file.read_exact(&mut compiler_and_kind).is_err() {
441        return Ok(None);
442    }
443    if compiler_and_kind[0] != key.compiler_tag {
444        return Ok(None);
445    }
446    let kind = compiler_and_kind[1];
447    let mut hashes = [0u8; 64];
448    if file.read_exact(&mut hashes).is_err() {
449        return Ok(None);
450    }
451    if hashes[..32] != key.source_hash || hashes[32..] != key.import_graph_hash {
452        return Ok(None);
453    }
454    let mut payload = Vec::new();
455    if file.read_to_end(&mut payload).is_err() {
456        return Ok(None);
457    }
458    Ok(Some(ParsedHeader { kind, payload }))
459}
460
461fn read_chunk_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<Chunk>> {
462    let Some(header) = read_header_if_matches(path, key)? else {
463        return Ok(None);
464    };
465    if header.kind != KIND_ENTRY_CHUNK {
466        return Ok(None);
467    }
468    let cached: CachedChunk =
469        match bincode::serde::decode_from_slice(&header.payload, bincode::config::standard()) {
470            Ok((c, _)) => c,
471            Err(_) => return Ok(None),
472        };
473    Ok(Some(Chunk::from_cached(&cached)))
474}
475
476fn read_module_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ModuleArtifact>> {
477    let Some(header) = read_header_if_matches(path, key)? else {
478        return Ok(None);
479    };
480    if header.kind != KIND_MODULE_ARTIFACT {
481        return Ok(None);
482    }
483    match bincode::serde::decode_from_slice::<ModuleArtifact, _>(
484        &header.payload,
485        bincode::config::standard(),
486    ) {
487        Ok((artifact, _)) => Ok(Some(artifact)),
488        Err(_) => Ok(None),
489    }
490}
491
492/// Compact representation of [`CompilerOptions`] for the cache header.
493/// Independent flags get distinct bits so adding a new flag never
494/// silently changes existing keys when an old binary reads a new
495/// artifact — the header check will fail-closed before we get there
496/// anyway, but mapping to bits also keeps the tag a stable function
497/// of the option set.
498fn compiler_options_tag(options: CompilerOptions) -> u8 {
499    let mut tag: u8 = 0;
500    if options.optimizations_enabled() {
501        tag |= 0b0000_0001;
502    }
503    tag
504}
505
506fn sha256(bytes: &[u8]) -> [u8; 32] {
507    let mut hasher = Sha256::new();
508    hasher.update(bytes);
509    hasher.finalize().into()
510}
511
/// Lowercase hexadecimal rendering of `bytes`, two digits per byte with no
/// separators. An empty slice yields an empty string.
///
/// Digits come from a lookup table and are pushed straight into the
/// pre-sized output, avoiding the temporary `String` that `format!` would
/// allocate for every byte.
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
519
/// Lightweight regex-free scan that surfaces user imports without paying
/// a full lex+parse. False positives only increase cache churn, never
/// correctness; comments and string literals are skipped so neither a
/// commented-out import nor a `"import …"` value appearing inside an
/// unrelated string gates the hash.
///
/// Returns the import path strings in the order they appear in `source`.
/// Paths beginning with `std/` are omitted. For each `import` keyword the
/// path is taken to be the first string literal that follows it; the
/// search stops at the first newline that is not inside `{ … }`.
fn collect_user_imports(source: &str) -> Vec<String> {
    let scrubbed = strip_comments(source);
    let mut out: Vec<String> = Vec::new();
    let bytes = scrubbed.as_bytes();
    // `i` is the outer scan cursor over the comment-free source.
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'"' {
            // Skip past any string literal so identifiers inside string
            // values cannot trigger the keyword match below.
            match read_string_literal(bytes, i) {
                Some((_, end)) => {
                    i = end;
                    continue;
                }
                None => {
                    // Not a well-formed single-line literal: step over the
                    // quote and keep scanning.
                    i += 1;
                    continue;
                }
            }
        }
        if !matches_keyword(bytes, i, b"import") {
            i += 1;
            continue;
        }
        // Skip past `import` and any selective `{ ... } from` clause; we
        // only need the source-position of the path string literal.
        let mut j = i + b"import".len();
        // Brace nesting level; newlines only end the search at level 0.
        let mut depth = 0i32;
        while j < bytes.len() {
            match bytes[j] {
                b'"' => {
                    if let Some((path, end)) = read_string_literal(bytes, j) {
                        // Stdlib imports are excluded from the result.
                        if !path.starts_with("std/") {
                            out.push(path);
                        }
                        // Resume the outer scan just past the literal.
                        i = end;
                        break;
                    }
                    j += 1;
                }
                b'{' => {
                    depth += 1;
                    j += 1;
                }
                b'}' => {
                    depth -= 1;
                    j += 1;
                }
                b'\n' if depth == 0 => {
                    // No string literal on this logical line; bail and
                    // continue scanning after the keyword to avoid an
                    // infinite loop.
                    i = j;
                    break;
                }
                _ => j += 1,
            }
        }
        // The inner scan ran off the end of the input without finding a
        // path or a terminating newline: nothing more to collect.
        if j >= bytes.len() {
            break;
        }
        if i < j {
            // Defensive: ensure forward progress when the inner loop
            // exited without setting `i`.
            // NOTE(review): both `break` exits above leave `i >= j`, so this
            // branch looks unreachable today — confirm before relying on it.
            i = j;
        }
    }
    out
}
594
595fn matches_keyword(bytes: &[u8], at: usize, keyword: &[u8]) -> bool {
596    let end = at + keyword.len();
597    if end > bytes.len() {
598        return false;
599    }
600    if &bytes[at..end] != keyword {
601        return false;
602    }
603    if at > 0 && is_ident_char(bytes[at - 1]) {
604        return false;
605    }
606    if end < bytes.len() && is_ident_char(bytes[end]) {
607        return false;
608    }
609    true
610}
611
/// True for bytes that can appear inside an identifier: ASCII letters,
/// ASCII digits, and `_`.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
615
/// Decode the double-quoted string literal that starts at `bytes[at]`.
///
/// Returns the literal's unescaped content and the index one past the
/// closing quote. Returns `None` when the literal is unterminated: a raw
/// newline or the end of input is reached before the closing quote, or a
/// backslash is the final byte.
///
/// Escapes `\n`, `\r`, and `\t` map to their control characters; a
/// backslash followed by any other byte (including `"` and `\`) yields
/// that byte unchanged.
///
/// Content is accumulated as raw bytes and decoded as UTF-8 once the
/// closing quote is found, so multi-byte characters such as the `é` in
/// `"café/util"` survive intact. Converting byte-by-byte with `as char`
/// would instead turn every UTF-8 byte into a separate Latin-1 character
/// and corrupt non-ASCII import paths. Invalid UTF-8 is replaced lossily
/// rather than rejected.
///
/// `bytes[at]` must be `"` (checked in debug builds).
fn read_string_literal(bytes: &[u8], at: usize) -> Option<(String, usize)> {
    debug_assert_eq!(bytes[at], b'"');
    let mut raw: Vec<u8> = Vec::new();
    let mut i = at + 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => {
                let text = String::from_utf8_lossy(&raw).into_owned();
                return Some((text, i + 1));
            }
            b'\\' => {
                // A trailing backslash has nothing to escape: unterminated.
                let escaped = *bytes.get(i + 1)?;
                raw.push(match escaped {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    // `\"`, `\\`, and unknown escapes keep the escaped byte.
                    other => other,
                });
                i += 2;
            }
            // String literals do not span lines.
            b'\n' => return None,
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    None
}
646
647fn strip_comments(source: &str) -> String {
648    let bytes = source.as_bytes();
649    let mut out = String::with_capacity(source.len());
650    let mut i = 0;
651    while i < bytes.len() {
652        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
653            while i < bytes.len() && bytes[i] != b'\n' {
654                i += 1;
655            }
656            continue;
657        }
658        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
659            i += 2;
660            while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
661                i += 1;
662            }
663            i = (i + 2).min(bytes.len());
664            continue;
665        }
666        if bytes[i] == b'"' {
667            if let Some((_, end)) = read_string_literal(bytes, i) {
668                out.push_str(&source[i..end]);
669                i = end;
670                continue;
671            }
672        }
673        out.push(bytes[i] as char);
674        i += 1;
675    }
676    out
677}
678
679/// Stable digest over every embedded stdlib source. Folded into the
680/// user-file cache key so that bumping a stdlib module (changing its
681/// embedded `.harn` content) invalidates cached user bytecode that may
682/// reference stale function-pool layouts from a prior stdlib snapshot.
683/// `HARN_VERSION` already busts the cache across release bumps; this
684/// closes the same gap for within-version stdlib edits (a frequent
685/// pattern during local development).
686///
687/// Cached in a `OnceLock` because `STDLIB_SOURCES` is a static `const`
688/// slice — the digest is identical for the lifetime of the process.
689fn embedded_stdlib_digest() -> &'static [u8; 32] {
690    use std::sync::OnceLock;
691    static DIGEST: OnceLock<[u8; 32]> = OnceLock::new();
692    DIGEST.get_or_init(|| {
693        let mut entries: Vec<(&'static str, &'static str)> = harn_stdlib::STDLIB_SOURCES
694            .iter()
695            .map(|src| (src.module, src.source))
696            .collect();
697        entries.sort_by(|a, b| a.0.cmp(b.0));
698        let mut hasher = Sha256::new();
699        for (module, source) in entries {
700            hasher.update(module.as_bytes());
701            hasher.update(b"\0");
702            hasher.update(source.as_bytes());
703            hasher.update(b"\0");
704        }
705        hasher.finalize().into()
706    })
707}
708
709/// Walk the user-import graph rooted at `source_path` and produce a
710/// stable hash of every transitively-reachable file. The hash is
711/// order-independent: each visited file is keyed by canonical path and
712/// emitted in sorted order, so reordering imports inside a file does
713/// not invalidate the cache while changing any file's content does.
714///
715/// Embedded stdlib content is folded into the hash too — `collect_user_imports`
716/// deliberately skips `std/*` paths (they resolve to in-binary sources, not
717/// disk files), so without this fold a stdlib edit between development
718/// builds would leave user-file caches pinned to a stale stdlib snapshot.
719fn hash_transitive_user_imports(source_path: &Path, source: &str) -> [u8; 32] {
720    hash_transitive_user_imports_fingerprinted(source_path, source, CODEGEN_FINGERPRINT)
721}
722
723/// Inner form of [`hash_transitive_user_imports`] parameterized on the compiler
724/// fingerprint so tests can vary it; production always passes
725/// [`CODEGEN_FINGERPRINT`].
726fn hash_transitive_user_imports_fingerprinted(
727    source_path: &Path,
728    source: &str,
729    codegen_fingerprint: &str,
730) -> [u8; 32] {
731    let mut visited: std::collections::BTreeMap<PathBuf, ImportNode> =
732        std::collections::BTreeMap::new();
733    let mut frontier: Vec<(PathBuf, String)> = collect_user_imports(source)
734        .into_iter()
735        .map(|import| (source_path.to_path_buf(), import))
736        .collect();
737
738    while let Some((anchor, import)) = frontier.pop() {
739        let Some(resolved) = harn_modules::resolve_import_path(&anchor, &import) else {
740            // Unresolved imports get a sentinel keyed by their resolution
741            // anchor so that dropping a real file under that anchor later
742            // produces a different key.
743            let sentinel = anchor.join(format!("__unresolved__/{import}"));
744            visited
745                .entry(sentinel)
746                .or_insert(ImportNode::Unresolved { import });
747            continue;
748        };
749        let canonical = resolved.canonicalize().unwrap_or_else(|_| resolved.clone());
750        if visited.contains_key(&canonical) {
751            continue;
752        }
753        match fs::read_to_string(&resolved) {
754            Ok(content) => {
755                let nested = collect_user_imports(&content);
756                visited.insert(
757                    canonical.clone(),
758                    ImportNode::Resolved {
759                        content: content.clone(),
760                    },
761                );
762                for nested_import in nested {
763                    frontier.push((resolved.clone(), nested_import));
764                }
765            }
766            Err(err) => {
767                visited.insert(
768                    canonical,
769                    ImportNode::IoError {
770                        kind: err.kind().to_string(),
771                    },
772                );
773            }
774        }
775    }
776
777    let mut hasher = Sha256::new();
778    hasher.update(b"stdlib-digest\0");
779    hasher.update(embedded_stdlib_digest());
780    hasher.update(b"\0");
781    // Fold in the compiler's code-generation fingerprint so a compiler change
782    // that alters emitted bytecode for unchanged source busts stale cache
783    // entries within a single version — the gap that masked the #2610 fix until
784    // the cache was cleared by hand. See `build.rs` and `CODEGEN_FINGERPRINT`.
785    hasher.update(b"codegen-fingerprint\0");
786    hasher.update(codegen_fingerprint.as_bytes());
787    hasher.update(b"\0");
788    for (path, node) in &visited {
789        hasher.update(path.to_string_lossy().as_bytes());
790        hasher.update(b"\0");
791        match node {
792            ImportNode::Resolved { content } => {
793                hasher.update(b"resolved\0");
794                hasher.update(content.as_bytes());
795            }
796            ImportNode::Unresolved { import } => {
797                hasher.update(b"unresolved\0");
798                hasher.update(import.as_bytes());
799            }
800            ImportNode::IoError { kind } => {
801                hasher.update(b"ioerror\0");
802                hasher.update(kind.as_bytes());
803            }
804        }
805        hasher.update(b"\0");
806    }
807    hasher.finalize().into()
808}
809
/// What the import-graph walk recorded for one visited path. Each variant
/// is hashed under its own tag so the three outcomes produce different
/// keys.
enum ImportNode {
    /// The import resolved to a file that was read successfully; `content`
    /// is the file's full text.
    Resolved { content: String },
    /// The import could not be resolved to a path; `import` is the import
    /// string as written in the source.
    Unresolved { import: String },
    /// The import resolved to a path that could not be read; `kind` is the
    /// textual form of the I/O error kind.
    IoError { kind: String },
}
815
816#[cfg(test)]
817mod tests {
818    use super::*;
819    use crate::compile_source;
820
821    #[test]
822    fn header_round_trips_chunk() {
823        let chunk = compile_source("__io_println(\"hello\")").expect("compile");
824        let key = CacheKey::from_source(Path::new("/tmp/example.harn"), "__io_println(\"hello\")");
825        let tmp = tempfile::tempdir().unwrap();
826        let path = tmp.path().join("entry.harnbc");
827        store_at(&path, &key, &chunk).expect("write");
828        let loaded = read_chunk_if_matches(&path, &key).unwrap();
829        assert!(loaded.is_some(), "expected cached chunk to load");
830    }
831
832    #[test]
833    fn serialize_chunk_artifact_matches_store_at() {
834        // `serialize_chunk_artifact` packages an artifact into a buffer for
835        // in-memory consumers (e.g. `harn pack` writing into a tar.zst
836        // bundle). The contract is: the resulting bytes match what
837        // `store_at` would have written for the same key+chunk, so the
838        // shipped artifact is byte-identical to the on-disk cache form.
839        let chunk = compile_source("__io_println(\"hi\")").expect("compile");
840        let key = CacheKey::from_source(Path::new("/tmp/pack.harn"), "__io_println(\"hi\")");
841        let tmp = tempfile::tempdir().unwrap();
842        let on_disk = tmp.path().join("pack.harnbc");
843        store_at(&on_disk, &key, &chunk).expect("write");
844        let on_disk_bytes = std::fs::read(&on_disk).unwrap();
845        let in_memory_bytes = serialize_chunk_artifact(&key, &chunk).expect("serialize");
846        assert_eq!(in_memory_bytes, on_disk_bytes);
847    }
848
849    #[test]
850    fn header_mismatch_returns_none() {
851        let chunk = compile_source("1 + 1").expect("compile");
852        let key = CacheKey::from_source(Path::new("/tmp/a.harn"), "1 + 1");
853        let tmp = tempfile::tempdir().unwrap();
854        let path = tmp.path().join("a.harnbc");
855        store_at(&path, &key, &chunk).expect("write");
856        let other = CacheKey {
857            source_hash: [0xAB; 32],
858            import_graph_hash: key.import_graph_hash,
859            harn_version: HARN_VERSION,
860            compiler_tag: key.compiler_tag,
861        };
862        assert!(read_chunk_if_matches(&path, &other).unwrap().is_none());
863    }
864
865    #[test]
866    fn compiler_tag_mismatch_returns_none() {
867        let chunk = compile_source("1 + 1").expect("compile");
868        let key = CacheKey::from_source(Path::new("/tmp/b.harn"), "1 + 1");
869        let tmp = tempfile::tempdir().unwrap();
870        let path = tmp.path().join("b.harnbc");
871        store_at(&path, &key, &chunk).expect("write");
872        let other = CacheKey {
873            compiler_tag: key.compiler_tag ^ 0xFF,
874            ..key
875        };
876        assert!(
877            read_chunk_if_matches(&path, &other).unwrap().is_none(),
878            "flipped HARN_DISABLE_OPTIMIZATIONS must not reuse a chunk \
879             compiled under the opposite setting"
880        );
881    }
882
883    #[test]
884    fn codegen_fingerprint_is_populated() {
885        // In-workspace builds always hash real compiler sources, so the
886        // fingerprint must be a non-empty digest; an empty value would silently
887        // disable the within-version compiler-staleness guard.
888        assert!(!CODEGEN_FINGERPRINT.is_empty());
889    }
890
891    #[test]
892    fn codegen_fingerprint_changes_cache_key() {
893        // A compiler whose code-generation source differs must produce a
894        // different cache key for the *same* user source, so a stale artifact
895        // compiled by a prior compiler at the same version misses on load
896        // rather than being replayed (#2621). The fingerprint is a compile-time
897        // constant, so exercise the parameterized inner hash directly.
898        let tmp = tempfile::tempdir().unwrap();
899        let entry = tmp.path().join("entry.harn");
900        std::fs::write(&entry, "__io_println(\"hi\")\n").unwrap();
901        let source = std::fs::read_to_string(&entry).unwrap();
902        let a = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
903        let b = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-B");
904        let a_again = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
905        assert_ne!(
906            a, b,
907            "differing compiler fingerprints must change the cache key"
908        );
909        assert_eq!(
910            a, a_again,
911            "an unchanged compiler fingerprint must be stable"
912        );
913    }
914
915    #[test]
916    fn collect_user_imports_ignores_stdlib_and_comments() {
917        let source = r#"
918            // import "comment/should/be/ignored"
919            import "std/agents"
920            import { foo } from "pkg/bar"
921            import "./relative/path"
922        "#;
923        let imports = collect_user_imports(source);
924        assert_eq!(
925            imports,
926            vec!["pkg/bar".to_string(), "./relative/path".to_string()]
927        );
928    }
929
930    #[test]
931    fn cache_enabled_respects_env() {
932        std::env::set_var(CACHE_ENABLED_ENV, "0");
933        assert!(!cache_enabled());
934        std::env::set_var(CACHE_ENABLED_ENV, "1");
935        assert!(cache_enabled());
936        std::env::remove_var(CACHE_ENABLED_ENV);
937        assert!(cache_enabled());
938    }
939
940    #[test]
941    fn import_path_inside_string_literal_is_ignored() {
942        let source = r#"
943            let payload = "import { foo } from \"./other\""
944            import "./real"
945        "#;
946        let imports = collect_user_imports(source);
947        assert_eq!(imports, vec!["./real".to_string()]);
948    }
949
950    #[test]
951    fn import_hash_is_stable_across_import_order() {
952        let tmp = tempfile::tempdir().unwrap();
953        std::fs::write(
954            tmp.path().join("a.harn"),
955            "pub fn a() -> int { return 1 }\n",
956        )
957        .unwrap();
958        std::fs::write(
959            tmp.path().join("b.harn"),
960            "pub fn b() -> int { return 2 }\n",
961        )
962        .unwrap();
963        let ab = tmp.path().join("entry_ab.harn");
964        std::fs::write(
965            &ab,
966            "import \"./a\"\nimport \"./b\"\n__io_println(\"hi\")\n",
967        )
968        .unwrap();
969        let ba = tmp.path().join("entry_ba.harn");
970        std::fs::write(
971            &ba,
972            "import \"./b\"\nimport \"./a\"\n__io_println(\"hi\")\n",
973        )
974        .unwrap();
975        let hash_ab = hash_transitive_user_imports(&ab, &std::fs::read_to_string(&ab).unwrap());
976        let hash_ba = hash_transitive_user_imports(&ba, &std::fs::read_to_string(&ba).unwrap());
977        assert_eq!(
978            hash_ab, hash_ba,
979            "import-graph hash must be order-independent so reordering imports \
980             does not bust the cache"
981        );
982    }
983
984    #[test]
985    fn import_hash_picks_up_nested_imports() {
986        let tmp = tempfile::tempdir().unwrap();
987        std::fs::write(
988            tmp.path().join("leaf.harn"),
989            "pub fn x() -> int { return 1 }\n",
990        )
991        .unwrap();
992        std::fs::write(
993            tmp.path().join("mid.harn"),
994            "import \"./leaf\"\npub fn y() -> int { return 2 }\n",
995        )
996        .unwrap();
997        let entry = tmp.path().join("entry.harn");
998        std::fs::write(&entry, "import \"./mid\"\n__io_println(\"hi\")\n").unwrap();
999
1000        let before =
1001            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
1002        std::fs::write(
1003            tmp.path().join("leaf.harn"),
1004            "pub fn x() -> int { return 999 }\n",
1005        )
1006        .unwrap();
1007        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
1008        assert_ne!(
1009            before, after,
1010            "editing a transitively-imported file must change the import-graph hash"
1011        );
1012    }
1013}