Skip to main content

harn_vm/
bytecode_cache.rs

1//! Content-addressed on-disk cache for compiled `.harn` pipelines.
2//!
3//! Cold-start `harn run` re-parses, type-checks, and compiles the entry
4//! pipeline before the VM gets a single instruction to execute. For short
5//! Harn subcommands that wrap a few `llm_call`s in a small pipeline, that
6//! compile cost dominates wall-clock time.
7//!
8//! This module persists [`Chunk`] bytecode under
9//! `$HARN_CACHE_DIR/<source-hash>.harnbc` (XDG-aware). The cache key is
10//! derived from the entry source plus the content hash of every
11//! transitively-imported user file; stdlib imports are covered by the
12//! embedded `harn_version` field in the header. Any change to any input
13//! flips the key and the next run recompiles.
14//!
15//! File layout — little-endian throughout:
16//!
17//! ```text
18//! magic        : [u8; 8]   = "HARNBC\0\0"
19//! schema_ver   : u32       = SCHEMA_VERSION
20//! version_len  : u32
21//! harn_version : [u8; version_len]
22//! compiler_tag : u8        bitmask of active CompilerOptions
23//! kind         : u8        1 = entry chunk, 2 = module artifact
24//! source_hash  : [u8; 32]
25//! import_hash  : [u8; 32]
26//! payload      : bincode-serialized payload for `kind`
27//! ```
28//!
29//! The header lets a stale binary detect a future-version artifact
30//! without crashing: a magic mismatch, schema mismatch, or version
31//! mismatch is returned as `Ok(None)` so the caller transparently
32//! recompiles. Real I/O errors propagate.
33//!
34//! Concurrency: writes are atomic (write-tmp-then-rename), and parallel
35//! invocations on a cache miss race safely — the last writer wins, but
36//! every reader observes a consistent file because the rename is atomic
37//! on every supported filesystem.
38
39use std::fs;
40use std::io::{self, Read as _, Write as _};
41use std::path::{Path, PathBuf};
42
43use sha2::{Digest, Sha256};
44
45use crate::chunk::{CachedChunk, Chunk};
46use crate::compiler::CompilerOptions;
47use crate::module_artifact::ModuleArtifact;
48
/// One scanned file: its full contents paired with the import paths that
/// `collect_user_imports` extracted from those contents.
type ImportScan = (String, Vec<String>);
/// Reference-counted [`ImportScan`]; this is the value type stored in the
/// process-wide [`ImportsFileMemo`].
type SharedImportScan = std::sync::Arc<ImportScan>;
/// Memo key: the file path plus its stat identity `(len, mtime_ns)`.
type ImportsFileMemoKey = (PathBuf, u64, i128);
/// Mutex-guarded map from [`ImportsFileMemoKey`] to the memoized scan result.
type ImportsFileMemo =
    std::sync::Mutex<std::collections::HashMap<ImportsFileMemoKey, SharedImportScan>>;
54
/// Header magic for all bytecode-cache artifact families. Written as the
/// first eight bytes of every artifact and compared verbatim on load.
pub const MAGIC: &[u8; 8] = b"HARNBC\0\0";

/// On-disk format version. Bump when [`CachedChunk`] or the header
/// layout changes in a backwards-incompatible way. Artifacts carrying a
/// different value are treated as a cache miss.
pub const SCHEMA_VERSION: u32 = 4;

/// Compile-time Harn release. Cache files written by a different release
/// are rejected on load.
pub const HARN_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Build-time fingerprint of the compiler front-end — the lexer, parser, IR,
/// and code generator — computed in `build.rs` from those crates' source and
/// baked in via `cargo:rustc-env`. Folded into the cache key so a compiler
/// change that alters emitted bytecode for unchanged source invalidates stale
/// entries automatically, within a single version, with no manual cache wipe.
/// `HARN_VERSION` only busts the cache across release bumps; this closes the
/// same gap for the within-version compiler edits that masked #2610. See #2621.
pub const CODEGEN_FINGERPRINT: &str = env!("HARN_CODEGEN_FINGERPRINT");

/// Conventional extension for entry-chunk cache files.
pub const CACHE_EXTENSION: &str = "harnbc";

/// Conventional extension for module-artifact cache files. Distinct from
/// [`CACHE_EXTENSION`] so the same `.harn` source can have both shipped
/// adjacent if needed (e.g. when a file is both an executable entry and
/// imported by other files).
pub const MODULE_CACHE_EXTENSION: &str = "harnmod";

/// On-disk discriminant (the header's `kind` byte) for a [`Chunk`] payload.
const KIND_ENTRY_CHUNK: u8 = 1;
/// On-disk discriminant (the header's `kind` byte) for a [`ModuleArtifact`]
/// payload.
const KIND_MODULE_ARTIFACT: u8 = 2;

/// Environment override for the cache directory. When set, takes
/// precedence over the XDG and home-directory fallbacks.
pub const CACHE_DIR_ENV: &str = "HARN_CACHE_DIR";

/// Environment override that turns the cache off entirely. Setting this
/// to `0`, `false`, `no`, or `off` (compared ASCII-case-insensitively)
/// skips both reads and writes; useful when debugging compiler changes.
pub const CACHE_ENABLED_ENV: &str = "HARN_BYTECODE_CACHE";
97
/// Result of a cache lookup. Carries the precomputed key so the caller
/// can write it back on a miss without rehashing.
pub struct LookupOutcome {
    /// Key computed for the looked-up source; pass it to [`store`] on a miss.
    pub key: CacheKey,
    /// The cached chunk on a hit; `None` on a miss or when the cache is
    /// disabled.
    pub chunk: Option<Chunk>,
}
104
/// Cache key components for a single pipeline source. Equality of all
/// fields is necessary and sufficient for cache reuse.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CacheKey {
    /// SHA-256 of the entry source bytes. Also determines the cache filename.
    pub source_hash: [u8; 32],
    /// Digest produced by the transitive user-import walk for this source.
    pub import_graph_hash: [u8; 32],
    /// Harn release string; compared against the version recorded in the
    /// artifact header on load.
    pub harn_version: &'static str,
    /// Compact tag for active [`CompilerOptions`]. Flipping
    /// `HARN_DISABLE_OPTIMIZATIONS` between runs would otherwise reuse a
    /// chunk compiled under the wrong setting.
    pub compiler_tag: u8,
}
117
118impl CacheKey {
119    /// Compute the cache key for a `.harn` source file plus its transitive
120    /// user imports. `read_source` is the entry-file contents; the import
121    /// graph is walked from disk relative to `source_path`.
122    pub fn from_source(source_path: &Path, source: &str) -> Self {
123        let source_hash = sha256(source.as_bytes());
124        let import_graph_hash = hash_transitive_user_imports(source_path, source);
125        Self {
126            source_hash,
127            import_graph_hash,
128            harn_version: HARN_VERSION,
129            compiler_tag: compiler_options_tag(CompilerOptions::from_env()),
130        }
131    }
132
133    /// Entry-chunk filename for this key. We hash by source content
134    /// alone so two invocations of the same source from different paths
135    /// share a cache entry; the header's import-graph hash still gates
136    /// reuse on a per-load basis.
137    pub fn filename(&self) -> String {
138        format!("{}.{}", hex(&self.source_hash), CACHE_EXTENSION)
139    }
140
141    /// Module-artifact filename for this key.
142    pub fn module_filename(&self) -> String {
143        format!("{}.{}", hex(&self.source_hash), MODULE_CACHE_EXTENSION)
144    }
145}
146
147/// Returns the directory the shared cache lives in. Honors
148/// `$HARN_CACHE_DIR`, then `$XDG_CACHE_HOME/harn/bytecode`, then
149/// `$HOME/.cache/harn/bytecode`. The directory is *not* created here —
150/// [`store`] creates it lazily on write so read-only environments don't
151/// pay an mkdir cost.
152pub fn cache_dir() -> PathBuf {
153    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
154        return PathBuf::from(custom);
155    }
156    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
157        let xdg = PathBuf::from(xdg);
158        if !xdg.as_os_str().is_empty() {
159            return xdg.join("harn").join("bytecode");
160        }
161    }
162    if let Some(home) = crate::user_dirs::home_dir() {
163        return home.join(".cache").join("harn").join("bytecode");
164    }
165    // Final fallback: a directory beside the binary's working dir. Mostly
166    // hit in tests that scrub HOME from the environment.
167    PathBuf::from(".harn-cache").join("bytecode")
168}
169
170/// Root for `.harnpack` archives unpacked by `harn run <bundle.harnpack>`.
171/// Each verified bundle is replayed into `<root>/<sanitized-bundle-hash>/`
172/// so re-runs reuse the unpacked tree. Honors `$HARN_CACHE_DIR/packs`
173/// when set, otherwise XDG / `$HOME/.cache/harn/packs`.
174pub fn packs_cache_dir() -> PathBuf {
175    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
176        return PathBuf::from(custom).join("packs");
177    }
178    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
179        let xdg = PathBuf::from(xdg);
180        if !xdg.as_os_str().is_empty() {
181            return xdg.join("harn").join("packs");
182        }
183    }
184    if let Some(home) = crate::user_dirs::home_dir() {
185        return home.join(".cache").join("harn").join("packs");
186    }
187    PathBuf::from(".harn-cache").join("packs")
188}
189
190/// True when the cache is enabled by the current environment.
191pub fn cache_enabled() -> bool {
192    match std::env::var(CACHE_ENABLED_ENV).ok().as_deref() {
193        Some(value) => !matches!(
194            value.to_ascii_lowercase().as_str(),
195            "0" | "false" | "no" | "off"
196        ),
197        None => true,
198    }
199}
200
201/// Try to load a cached chunk for `source_path` whose contents are
202/// `source`. Returns the key alongside the (optional) chunk so callers
203/// avoid recomputing the key on miss.
204pub fn load(source_path: &Path, source: &str) -> LookupOutcome {
205    let key = CacheKey::from_source(source_path, source);
206    if !cache_enabled() {
207        return LookupOutcome { key, chunk: None };
208    }
209    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
210    if let Some(adjacent) = adjacent_cache_path(source_path) {
211        candidates.push(adjacent);
212    }
213    candidates.push(cache_dir().join(key.filename()));
214    for path in candidates {
215        match read_chunk_if_matches(&path, &key) {
216            Ok(Some(chunk)) => {
217                return LookupOutcome {
218                    key,
219                    chunk: Some(chunk),
220                }
221            }
222            Ok(None) => continue,
223            Err(_) => continue,
224        }
225    }
226    LookupOutcome { key, chunk: None }
227}
228
229/// Persist `chunk` to the shared cache directory under `key`. Atomic: a
230/// temp file is written then renamed into place. Concurrent invocations
231/// on the same key race safely.
232pub fn store(key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
233    if !cache_enabled() {
234        return Ok(());
235    }
236    let dir = cache_dir();
237    fs::create_dir_all(&dir)?;
238    write_atomic_chunk(&dir.join(key.filename()), key, chunk)
239}
240
241/// Write a precompiled entry-chunk artifact to an explicit path, for
242/// use by the `harn precompile` subcommand. The header still records
243/// the key, so adjacent artifacts shipped with source are validated
244/// like any other cache hit.
245pub fn store_at(path: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
246    ensure_parent_dir(path)?;
247    write_atomic_chunk(path, key, chunk)
248}
249
250/// Look up the [`ModuleArtifact`] for `source_path` (whose contents are
251/// `source`). Mirrors [`load`] but for the `.harnmod` family.
252pub fn load_module(source_path: &Path, source: &str) -> ModuleLookupOutcome {
253    let key = CacheKey::from_source(source_path, source);
254    if !cache_enabled() {
255        return ModuleLookupOutcome {
256            key,
257            artifact: None,
258        };
259    }
260    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
261    if let Some(adjacent) = adjacent_module_cache_path(source_path) {
262        candidates.push(adjacent);
263    }
264    candidates.push(cache_dir().join(key.module_filename()));
265    for path in candidates {
266        match read_module_if_matches(&path, &key) {
267            Ok(Some(artifact)) => {
268                return ModuleLookupOutcome {
269                    key,
270                    artifact: Some(artifact),
271                }
272            }
273            Ok(None) => continue,
274            Err(_) => continue,
275        }
276    }
277    ModuleLookupOutcome {
278        key,
279        artifact: None,
280    }
281}
282
283/// Persist `artifact` to the shared cache under `key`. Atomic;
284/// concurrent invocations race safely.
285pub fn store_module(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
286    if !cache_enabled() {
287        return Ok(());
288    }
289    let dir = cache_dir();
290    fs::create_dir_all(&dir)?;
291    write_atomic_module(&dir.join(key.module_filename()), key, artifact)
292}
293
294/// Write a module artifact to an explicit path.
295pub fn store_module_at(path: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
296    ensure_parent_dir(path)?;
297    write_atomic_module(path, key, artifact)
298}
299
/// Result of a [`load_module`] lookup. Carries the precomputed key so
/// the caller can write it back on a miss without rehashing.
pub struct ModuleLookupOutcome {
    /// Key computed for the looked-up source; pass it to [`store_module`]
    /// on a miss.
    pub key: CacheKey,
    /// The cached artifact on a hit; `None` on a miss or when the cache is
    /// disabled.
    pub artifact: Option<ModuleArtifact>,
}
306
/// Path to the adjacent precompiled entry-chunk artifact for
/// `source_path`. `foo.harn` → `foo.harnbc`.
///
/// Returns `None` when `source_path` has no usable file stem. The path is
/// only computed; the filesystem is not consulted.
pub fn adjacent_cache_path(source_path: &Path) -> Option<PathBuf> {
    adjacent_path_with_extension(source_path, CACHE_EXTENSION)
}
312
/// Path to the adjacent precompiled module-artifact for `source_path`.
/// `foo.harn` → `foo.harnmod`.
///
/// Returns `None` when `source_path` has no usable file stem. The path is
/// only computed; the filesystem is not consulted.
pub fn adjacent_module_cache_path(source_path: &Path) -> Option<PathBuf> {
    adjacent_path_with_extension(source_path, MODULE_CACHE_EXTENSION)
}
318
/// Sibling path of `source_path` whose final extension is swapped for `ext`:
/// `dir/foo.harn` → `dir/foo.<ext>`.
///
/// Only the *last* extension of the source file is replaced, so a dotted
/// stem is preserved: `foo.bar.harn` → `foo.bar.<ext>`. (Joining the stem and
/// then calling `set_extension` would strip `.bar` as well and make
/// `foo.bar.harn` collide with `foo.harn`.)
///
/// Returns `None` when `source_path` has no file stem (e.g. `/` or a path
/// ending in `..`). An empty `ext` yields the bare stem.
fn adjacent_path_with_extension(source_path: &Path, ext: &str) -> Option<PathBuf> {
    let stem = source_path.file_stem().filter(|stem| !stem.is_empty())?;
    // Build the file name by hand; `set_extension` would treat any dot already
    // inside the stem as an extension separator and drop what follows it.
    let mut file_name = stem.to_os_string();
    if !ext.is_empty() {
        file_name.push(".");
        file_name.push(ext);
    }
    Some(source_path.with_file_name(file_name))
}
329
/// Create the directory that will contain `path`, including any missing
/// ancestors. A path with no parent component (a bare file name) needs no
/// directory and succeeds without touching the filesystem.
fn ensure_parent_dir(path: &Path) -> io::Result<()> {
    match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => fs::create_dir_all(dir),
        _ => Ok(()),
    }
}
338
339fn write_atomic_chunk(target: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
340    let buf = serialize_chunk_artifact(key, chunk)?;
341    write_atomic(target, &buf)
342}
343
344fn write_atomic_module(target: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
345    let buf = serialize_module_artifact(key, artifact)?;
346    write_atomic(target, &buf)
347}
348
349/// Serialize an entry-chunk artifact (header + payload) to bytes. The
350/// resulting buffer is byte-identical to the file [`store_at`] would
351/// have written for the same `(key, chunk)`. Use this when packaging
352/// artifacts into a container (e.g. `harn pack`) without going through
353/// the filesystem.
354pub fn serialize_chunk_artifact(key: &CacheKey, chunk: &Chunk) -> io::Result<Vec<u8>> {
355    let cached = chunk.freeze_for_cache();
356    let payload = bincode::serde::encode_to_vec(&cached, bincode::config::standard())
357        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
358    Ok(encode_artifact(key, KIND_ENTRY_CHUNK, &payload))
359}
360
361/// Serialize a module artifact (header + payload) to bytes. Companion
362/// to [`serialize_chunk_artifact`] for the `.harnmod` family.
363pub fn serialize_module_artifact(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<Vec<u8>> {
364    let payload = bincode::serde::encode_to_vec(artifact, bincode::config::standard())
365        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
366    Ok(encode_artifact(key, KIND_MODULE_ARTIFACT, &payload))
367}
368
369fn encode_artifact(key: &CacheKey, kind: u8, payload: &[u8]) -> Vec<u8> {
370    let mut buf: Vec<u8> = Vec::with_capacity(payload.len() + 128);
371    buf.extend_from_slice(MAGIC);
372    buf.extend_from_slice(&SCHEMA_VERSION.to_le_bytes());
373    let version_bytes = HARN_VERSION.as_bytes();
374    buf.extend_from_slice(&(version_bytes.len() as u32).to_le_bytes());
375    buf.extend_from_slice(version_bytes);
376    buf.push(key.compiler_tag);
377    buf.push(kind);
378    buf.extend_from_slice(&key.source_hash);
379    buf.extend_from_slice(&key.import_graph_hash);
380    buf.extend_from_slice(payload);
381    buf
382}
383
/// Atomically replace `target` with `buf`.
///
/// The bytes are written to a hidden sibling temp file
/// (`.<name>.<pid>.tmp`), synced to disk, and then renamed over `target`, so
/// readers only ever observe the old file or the complete new one.
///
/// If any step fails — creating, writing, syncing, or renaming — the temp
/// file is removed on a best-effort basis so failed writes (for example on a
/// full disk) do not leave `.tmp` litter in the cache directory.
///
/// # Errors
/// Returns the first I/O error encountered; `target` is left untouched in
/// that case.
fn write_atomic(target: &Path, buf: &[u8]) -> io::Result<()> {
    /// Write `buf` to `tmp_path`, flush it to disk, and rename it onto `target`.
    fn write_then_rename(tmp_path: &Path, target: &Path, buf: &[u8]) -> io::Result<()> {
        let mut tmp_file = fs::File::create(tmp_path)?;
        tmp_file.write_all(buf)?;
        tmp_file.sync_all()?;
        // Close the handle before renaming; some platforms refuse to rename
        // a file that is still open.
        drop(tmp_file);
        fs::rename(tmp_path, target)
    }

    let pid = std::process::id();
    let tmp_name = match target.file_name() {
        Some(name) => format!(".{}.{}.tmp", name.to_string_lossy(), pid),
        None => format!(".harn-cache.{}.tmp", pid),
    };
    let tmp_path = target.with_file_name(tmp_name);

    let outcome = write_then_rename(&tmp_path, target, buf);
    if outcome.is_err() {
        // Best effort: the temp file may not exist if creation itself failed.
        let _ = fs::remove_file(&tmp_path);
    }
    outcome
}
402
/// Parsed cache header. Read by both the chunk and module loaders so the
/// header-validation logic stays in one place.
struct ParsedHeader {
    /// The header's `kind` byte; each loader compares it against its own
    /// `KIND_*` discriminant.
    kind: u8,
    /// Every byte that follows the header, still bincode-encoded.
    payload: Vec<u8>,
}
409
410fn read_header_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ParsedHeader>> {
411    let mut file = match fs::File::open(path) {
412        Ok(f) => f,
413        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
414        Err(err) => return Err(err),
415    };
416    let mut header = [0u8; 8 + 4 + 4];
417    if file.read_exact(&mut header).is_err() {
418        return Ok(None);
419    }
420    if &header[..8] != MAGIC {
421        return Ok(None);
422    }
423    let schema = u32::from_le_bytes(header[8..12].try_into().unwrap());
424    if schema != SCHEMA_VERSION {
425        return Ok(None);
426    }
427    let version_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
428    if version_len > 256 {
429        // Bound the alloc so a corrupted file cannot force an unbounded read.
430        return Ok(None);
431    }
432    let mut version_buf = vec![0u8; version_len];
433    if file.read_exact(&mut version_buf).is_err() {
434        return Ok(None);
435    }
436    if version_buf != key.harn_version.as_bytes() {
437        return Ok(None);
438    }
439    let mut compiler_and_kind = [0u8; 2];
440    if file.read_exact(&mut compiler_and_kind).is_err() {
441        return Ok(None);
442    }
443    if compiler_and_kind[0] != key.compiler_tag {
444        return Ok(None);
445    }
446    let kind = compiler_and_kind[1];
447    let mut hashes = [0u8; 64];
448    if file.read_exact(&mut hashes).is_err() {
449        return Ok(None);
450    }
451    if hashes[..32] != key.source_hash || hashes[32..] != key.import_graph_hash {
452        return Ok(None);
453    }
454    let mut payload = Vec::new();
455    if file.read_to_end(&mut payload).is_err() {
456        return Ok(None);
457    }
458    Ok(Some(ParsedHeader { kind, payload }))
459}
460
461fn read_chunk_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<Chunk>> {
462    let Some(header) = read_header_if_matches(path, key)? else {
463        return Ok(None);
464    };
465    if header.kind != KIND_ENTRY_CHUNK {
466        return Ok(None);
467    }
468    let cached: CachedChunk =
469        match bincode::serde::decode_from_slice(&header.payload, bincode::config::standard()) {
470            Ok((c, _)) => c,
471            Err(_) => return Ok(None),
472        };
473    Ok(Some(Chunk::from_cached(&cached)))
474}
475
476fn read_module_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ModuleArtifact>> {
477    let Some(header) = read_header_if_matches(path, key)? else {
478        return Ok(None);
479    };
480    if header.kind != KIND_MODULE_ARTIFACT {
481        return Ok(None);
482    }
483    match bincode::serde::decode_from_slice::<ModuleArtifact, _>(
484        &header.payload,
485        bincode::config::standard(),
486    ) {
487        Ok((artifact, _)) => Ok(Some(artifact)),
488        Err(_) => Ok(None),
489    }
490}
491
492/// Compact representation of [`CompilerOptions`] for the cache header.
493/// Independent flags get distinct bits so adding a new flag never
494/// silently changes existing keys when an old binary reads a new
495/// artifact — the header check will fail-closed before we get there
496/// anyway, but mapping to bits also keeps the tag a stable function
497/// of the option set.
498fn compiler_options_tag(options: CompilerOptions) -> u8 {
499    let mut tag: u8 = 0;
500    if options.optimizations_enabled() {
501        tag |= 0b0000_0001;
502    }
503    tag
504}
505
506fn sha256(bytes: &[u8]) -> [u8; 32] {
507    let mut hasher = Sha256::new();
508    hasher.update(bytes);
509    hasher.finalize().into()
510}
511
/// Lowercase hexadecimal encoding of `bytes`, two digits per byte with no
/// separators or prefix.
///
/// Digits come from a lookup table, so the only allocation is the output
/// string itself (no temporary `String` per byte).
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
519
/// Lightweight regex-free scan that surfaces user imports without paying
/// a full lex+parse. False positives only increase cache churn, never
/// correctness; comments and string literals are skipped so neither a
/// commented-out import nor a `"import …"` value appearing inside an
/// unrelated string gates the hash.
///
/// Returns the path of every `import` whose string literal does not start
/// with `std/`, in source order. Duplicates are not removed.
fn collect_user_imports(source: &str) -> Vec<String> {
    // Comments are removed up front; string literals survive intact.
    let scrubbed = strip_comments(source);
    let mut out: Vec<String> = Vec::new();
    let bytes = scrubbed.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'"' {
            // Skip past any string literal so identifiers inside string
            // values cannot trigger the keyword match below.
            match read_string_literal(bytes, i) {
                Some((_, end)) => {
                    i = end;
                    continue;
                }
                None => {
                    // Unterminated literal: step over the lone quote.
                    i += 1;
                    continue;
                }
            }
        }
        if !matches_keyword(bytes, i, b"import") {
            i += 1;
            continue;
        }
        // Skip past `import` and any selective `{ ... } from` clause; we
        // only need the source-position of the path string literal.
        let mut j = i + b"import".len();
        // Brace nesting level; a newline only ends the scan at depth 0, so a
        // selective import list may span several lines.
        let mut depth = 0i32;
        while j < bytes.len() {
            match bytes[j] {
                b'"' => {
                    if let Some((path, end)) = read_string_literal(bytes, j) {
                        // `std/` imports are skipped here; only user paths
                        // are collected.
                        if !path.starts_with("std/") {
                            out.push(path);
                        }
                        i = end;
                        break;
                    }
                    j += 1;
                }
                b'{' => {
                    depth += 1;
                    j += 1;
                }
                b'}' => {
                    depth -= 1;
                    j += 1;
                }
                b'\n' if depth == 0 => {
                    // No string literal on this logical line; bail and
                    // continue scanning after the keyword to avoid an
                    // infinite loop.
                    i = j;
                    break;
                }
                _ => j += 1,
            }
        }
        // The inner scan ran off the end of the input without finding a
        // path literal: nothing further can match.
        if j >= bytes.len() {
            break;
        }
        if i < j {
            // Defensive: ensure forward progress when the inner loop
            // exited without setting `i`.
            // NOTE(review): both `break` arms above leave `i >= j`, so this
            // branch looks unreachable — confirm before relying on it.
            i = j;
        }
    }
    out
}
594
595fn matches_keyword(bytes: &[u8], at: usize, keyword: &[u8]) -> bool {
596    let end = at + keyword.len();
597    if end > bytes.len() {
598        return false;
599    }
600    if &bytes[at..end] != keyword {
601        return false;
602    }
603    if at > 0 && is_ident_char(bytes[at - 1]) {
604        return false;
605    }
606    if end < bytes.len() && is_ident_char(bytes[end]) {
607        return false;
608    }
609    true
610}
611
/// True for bytes that may appear inside an identifier: ASCII letters,
/// ASCII digits, and `_`.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
615
/// Parse the double-quoted string literal that starts at `bytes[at]`.
///
/// Returns the unescaped contents together with the index just past the
/// closing quote. Recognized escapes are `\"`, `\\`, `\n`, `\r`, and `\t`;
/// any other escaped byte stands for itself.
///
/// Returns `None` when `bytes[at]` is not a `"` (or `at` is out of range),
/// or when the literal is unterminated: a raw newline, a trailing backslash,
/// or end of input before the closing quote.
///
/// Contents are accumulated as raw bytes and decoded as UTF-8 once at the
/// end, so non-ASCII paths such as `"é.harn"` come back intact rather than
/// being widened byte-by-byte into Latin-1 characters.
fn read_string_literal(bytes: &[u8], at: usize) -> Option<(String, usize)> {
    if bytes.get(at) != Some(&b'"') {
        return None;
    }
    let mut raw: Vec<u8> = Vec::new();
    let mut i = at + 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => {
                // Lossy only as a safety net; input sliced from a `&str`
                // is already valid UTF-8.
                return Some((String::from_utf8_lossy(&raw).into_owned(), i + 1));
            }
            b'\\' => {
                let escaped = *bytes.get(i + 1)?;
                raw.push(match escaped {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    // `\"`, `\\`, and unknown escapes map to the byte itself.
                    other => other,
                });
                i += 2;
            }
            b'\n' => return None,
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    None
}
646
647fn strip_comments(source: &str) -> String {
648    let bytes = source.as_bytes();
649    let mut out = String::with_capacity(source.len());
650    let mut i = 0;
651    while i < bytes.len() {
652        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
653            while i < bytes.len() && bytes[i] != b'\n' {
654                i += 1;
655            }
656            continue;
657        }
658        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
659            i += 2;
660            while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
661                i += 1;
662            }
663            i = (i + 2).min(bytes.len());
664            continue;
665        }
666        if bytes[i] == b'"' {
667            if let Some((_, end)) = read_string_literal(bytes, i) {
668                out.push_str(&source[i..end]);
669                i = end;
670                continue;
671            }
672        }
673        out.push(bytes[i] as char);
674        i += 1;
675    }
676    out
677}
678
679/// Stable digest over every embedded stdlib source. Folded into the
680/// user-file cache key so that bumping a stdlib module (changing its
681/// embedded `.harn` content) invalidates cached user bytecode that may
682/// reference stale function-pool layouts from a prior stdlib snapshot.
683/// `HARN_VERSION` already busts the cache across release bumps; this
684/// closes the same gap for within-version stdlib edits (a frequent
685/// pattern during local development).
686///
687/// Cached in a `OnceLock` because `STDLIB_SOURCES` is a static `const`
688/// slice — the digest is identical for the lifetime of the process.
689fn embedded_stdlib_digest() -> &'static [u8; 32] {
690    use std::sync::OnceLock;
691    static DIGEST: OnceLock<[u8; 32]> = OnceLock::new();
692    DIGEST.get_or_init(|| {
693        let mut entries: Vec<(&'static str, &'static str)> = harn_stdlib::STDLIB_SOURCES
694            .iter()
695            .map(|src| (src.module, src.source))
696            .collect();
697        entries.sort_by(|a, b| a.0.cmp(b.0));
698        let mut hasher = Sha256::new();
699        for (module, source) in entries {
700            hasher.update(module.as_bytes());
701            hasher.update(b"\0");
702            hasher.update(source.as_bytes());
703            hasher.update(b"\0");
704        }
705        hasher.finalize().into()
706    })
707}
708
/// Walk the user-import graph rooted at `source_path` and produce a
/// stable hash of every transitively-reachable file. The hash is
/// order-independent: each visited file is keyed by canonical path and
/// emitted in sorted order, so reordering imports inside a file does
/// not invalidate the cache while changing any file's content does.
///
/// Embedded stdlib content is folded into the hash too — `collect_user_imports`
/// deliberately skips `std/*` paths (they resolve to in-binary sources, not
/// disk files), so without this fold a stdlib edit between development
/// builds would leave user-file caches pinned to a stale stdlib snapshot.
///
/// This is a thin wrapper: it delegates to
/// `hash_transitive_user_imports_fingerprinted`, passing the build-time
/// [`CODEGEN_FINGERPRINT`].
fn hash_transitive_user_imports(source_path: &Path, source: &str) -> [u8; 32] {
    hash_transitive_user_imports_fingerprinted(source_path, source, CODEGEN_FINGERPRINT)
}
722
723/// Process-wide memo of `(file content, collect_user_imports(content))` keyed by
724/// the resolved file path plus its stat identity `(len, mtime_ns)`. Walking a
725/// large pipeline's import graph re-encounters the same shared library files for
726/// nearly every module, so without this memo `from_source` re-reads and
727/// re-scans those files hundreds of times in a single cold run. Because the key
728/// includes `(len, mtime_ns)`, any on-disk edit produces a fresh key and the
729/// stale entry is never reused — a warm long-lived process recompiles edited
730/// pipelines correctly. The returned bytes are identical to the un-memoized
731/// path, so cache keys are byte-for-byte unchanged.
732fn imports_file_memo() -> &'static ImportsFileMemo {
733    use std::sync::OnceLock;
734    static MEMO: OnceLock<ImportsFileMemo> = OnceLock::new();
735    MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()))
736}
737
/// `Path::canonicalize` with a process-wide memo of successful results.
///
/// The import-graph walk canonicalizes the same resolved module paths
/// hundreds of times during a cold `from_source` fan-out, and every call is
/// a `realpath(3)` syscall. A successful result is treated as stable for the
/// lifetime of the process (the pipeline tree is not moved mid-run) and is
/// memoized.
///
/// A *failed* canonicalization (the path does not exist yet) returns `path`
/// unchanged and is NOT memoized: a file or symlink that appears later must
/// canonicalize freshly so the folded path key matches what a cold process
/// would produce. The memo is therefore a pure speed optimization.
fn canonicalize_cached(path: &Path) -> PathBuf {
    type CanonMemo = std::sync::Mutex<std::collections::HashMap<PathBuf, PathBuf>>;
    static MEMO: std::sync::OnceLock<CanonMemo> = std::sync::OnceLock::new();
    let memo = MEMO.get_or_init(CanonMemo::default);

    // The guard is dropped at the end of this statement, so the syscall
    // below runs without holding the lock.
    let remembered = memo.lock().unwrap().get(path).cloned();
    if let Some(known) = remembered {
        return known;
    }

    let Ok(resolved) = path.canonicalize() else {
        // Unresolved path: hand back the input and leave the memo alone.
        return path.to_path_buf();
    };
    memo.lock()
        .unwrap()
        .insert(path.to_path_buf(), resolved.clone());
    resolved
}
767
/// Stat `path` and return `(len, mtime_ns)`, the identity used to key the
/// import-scan memo. Returns `None` when the metadata cannot be read.
///
/// `mtime_ns` is the modification time in nanoseconds since the Unix epoch.
/// It is `0` when the platform reports no modification time or the time
/// lies before the epoch. A change to either component on disk invalidates
/// the memo entry.
fn file_stat_identity(path: &Path) -> Option<(u64, i128)> {
    let Ok(meta) = fs::metadata(path) else {
        return None;
    };
    let mtime_ns = match meta.modified() {
        Ok(stamp) => match stamp.duration_since(std::time::UNIX_EPOCH) {
            Ok(since_epoch) => since_epoch.as_nanos() as i128,
            Err(_) => 0,
        },
        Err(_) => 0,
    };
    Some((meta.len(), mtime_ns))
}
781
782/// Read `path` and scan its user imports, memoized by stat identity. On an I/O
783/// error, returns the `ErrorKind` string the un-memoized path folded in (errors
784/// are not memoized — a transient failure should not be sticky).
785fn read_and_scan_imports_cached(path: &Path) -> Result<(String, Vec<String>), String> {
786    if let Some((len, mtime_ns)) = file_stat_identity(path) {
787        let key = (path.to_path_buf(), len, mtime_ns);
788        if let Some(hit) = imports_file_memo().lock().unwrap().get(&key).cloned() {
789            return Ok((hit.0.clone(), hit.1.clone()));
790        }
791        match fs::read_to_string(path) {
792            Ok(content) => {
793                let nested = collect_user_imports(&content);
794                let entry = std::sync::Arc::new((content.clone(), nested.clone()));
795                imports_file_memo().lock().unwrap().insert(key, entry);
796                Ok((content, nested))
797            }
798            Err(err) => Err(err.kind().to_string()),
799        }
800    } else {
801        // No stat (file vanished between resolve and read): fall back to a direct
802        // read so behavior matches the un-memoized path exactly.
803        match fs::read_to_string(path) {
804            Ok(content) => {
805                let nested = collect_user_imports(&content);
806                Ok((content, nested))
807            }
808            Err(err) => Err(err.kind().to_string()),
809        }
810    }
811}
812
813/// Inner form of [`hash_transitive_user_imports`] parameterized on the compiler
814/// fingerprint so tests can vary it; production always passes
815/// [`CODEGEN_FINGERPRINT`].
816fn hash_transitive_user_imports_fingerprinted(
817    source_path: &Path,
818    source: &str,
819    codegen_fingerprint: &str,
820) -> [u8; 32] {
821    let mut visited: std::collections::BTreeMap<PathBuf, ImportNode> =
822        std::collections::BTreeMap::new();
823    let mut frontier: Vec<(PathBuf, String)> = collect_user_imports(source)
824        .into_iter()
825        .map(|import| (source_path.to_path_buf(), import))
826        .collect();
827
828    while let Some((anchor, import)) = frontier.pop() {
829        let Some(resolved) = harn_modules::resolve_import_path(&anchor, &import) else {
830            // Unresolved imports get a sentinel keyed by their resolution
831            // anchor so that dropping a real file under that anchor later
832            // produces a different key.
833            let sentinel = anchor.join(format!("__unresolved__/{import}"));
834            visited
835                .entry(sentinel)
836                .or_insert(ImportNode::Unresolved { import });
837            continue;
838        };
839        let canonical = canonicalize_cached(&resolved);
840        if visited.contains_key(&canonical) {
841            continue;
842        }
843        // Per-file read + import-scan is memoized process-wide, keyed by the
844        // file's identity stat `(len, mtime)`. The same handful of core library
845        // modules (`lib/host/*`, `lib/runtime/*`, ...) sit on the import graph of
846        // nearly every module, so a cold `from_source` over a large pipeline used
847        // to re-read and re-scan the same files hundreds of times across the
848        // module-load fan-out. The memo is invalidated automatically the moment a
849        // file's stat changes on disk, so a warm long-lived process still recompiles
850        // edited pipelines correctly. The folded hash bytes are byte-identical to
851        // the un-memoized path (same content + same `collect_user_imports` output),
852        // so cache keys are unchanged. See `imports_file_memo`.
853        match read_and_scan_imports_cached(&resolved) {
854            Ok((content, nested)) => {
855                visited.insert(canonical.clone(), ImportNode::Resolved { content });
856                for nested_import in nested {
857                    frontier.push((resolved.clone(), nested_import));
858                }
859            }
860            Err(kind) => {
861                visited.insert(canonical, ImportNode::IoError { kind });
862            }
863        }
864    }
865
866    let mut hasher = Sha256::new();
867    hasher.update(b"stdlib-digest\0");
868    hasher.update(embedded_stdlib_digest());
869    hasher.update(b"\0");
870    // Fold in the compiler's code-generation fingerprint so a compiler change
871    // that alters emitted bytecode for unchanged source busts stale cache
872    // entries within a single version — the gap that masked the #2610 fix until
873    // the cache was cleared by hand. See `build.rs` and `CODEGEN_FINGERPRINT`.
874    hasher.update(b"codegen-fingerprint\0");
875    hasher.update(codegen_fingerprint.as_bytes());
876    hasher.update(b"\0");
877    for (path, node) in &visited {
878        hasher.update(path.to_string_lossy().as_bytes());
879        hasher.update(b"\0");
880        match node {
881            ImportNode::Resolved { content } => {
882                hasher.update(b"resolved\0");
883                hasher.update(content.as_bytes());
884            }
885            ImportNode::Unresolved { import } => {
886                hasher.update(b"unresolved\0");
887                hasher.update(import.as_bytes());
888            }
889            ImportNode::IoError { kind } => {
890                hasher.update(b"ioerror\0");
891                hasher.update(kind.as_bytes());
892            }
893        }
894        hasher.update(b"\0");
895    }
896    hasher.finalize().into()
897}
898
/// Outcome recorded for one node of the transitive import walk; each variant
/// is folded into the import-graph hash under its own tag.
enum ImportNode {
    /// The import resolved to a file that was read successfully.
    Resolved {
        /// Full text of the imported file.
        content: String,
    },
    /// The import specifier could not be resolved to a path.
    Unresolved {
        /// The import specifier exactly as written in the source.
        import: String,
    },
    /// The import resolved to a path, but reading it failed.
    IoError {
        /// The I/O `ErrorKind`, rendered as a string.
        kind: String,
    },
}
904
#[cfg(test)]
mod tests {
    use super::*;
    use crate::compile_source;

    /// A chunk written with `store_at` loads back under the same key.
    #[test]
    fn header_round_trips_chunk() {
        let chunk = compile_source("__io_println(\"hello\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/example.harn"), "__io_println(\"hello\")");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("entry.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let loaded = read_chunk_if_matches(&path, &key).unwrap();
        assert!(loaded.is_some(), "expected cached chunk to load");
    }

    #[test]
    fn serialize_chunk_artifact_matches_store_at() {
        // `serialize_chunk_artifact` packages an artifact into a buffer for
        // in-memory consumers (e.g. `harn pack` writing into a tar.zst
        // bundle). The contract is: the resulting bytes match what
        // `store_at` would have written for the same key+chunk, so the
        // shipped artifact is byte-identical to the on-disk cache form.
        let chunk = compile_source("__io_println(\"hi\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/pack.harn"), "__io_println(\"hi\")");
        let tmp = tempfile::tempdir().unwrap();
        let on_disk = tmp.path().join("pack.harnbc");
        store_at(&on_disk, &key, &chunk).expect("write");
        let on_disk_bytes = std::fs::read(&on_disk).unwrap();
        let in_memory_bytes = serialize_chunk_artifact(&key, &chunk).expect("serialize");
        assert_eq!(in_memory_bytes, on_disk_bytes);
    }

    /// A key with a different `source_hash` must miss (`Ok(None)`), not error.
    #[test]
    fn header_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/a.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("a.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            source_hash: [0xAB; 32],
            import_graph_hash: key.import_graph_hash,
            harn_version: HARN_VERSION,
            compiler_tag: key.compiler_tag,
        };
        assert!(read_chunk_if_matches(&path, &other).unwrap().is_none());
    }

    /// A key that differs only in `compiler_tag` must miss as well.
    #[test]
    fn compiler_tag_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/b.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("b.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            compiler_tag: key.compiler_tag ^ 0xFF,
            ..key
        };
        assert!(
            read_chunk_if_matches(&path, &other).unwrap().is_none(),
            "flipped HARN_DISABLE_OPTIMIZATIONS must not reuse a chunk \
             compiled under the opposite setting"
        );
    }

    #[test]
    fn codegen_fingerprint_is_populated() {
        // In-workspace builds always hash real compiler sources, so the
        // fingerprint must be a non-empty digest; an empty value would silently
        // disable the within-version compiler-staleness guard.
        assert!(!CODEGEN_FINGERPRINT.is_empty());
    }

    #[test]
    fn codegen_fingerprint_changes_cache_key() {
        // A compiler whose code-generation source differs must produce a
        // different cache key for the *same* user source, so a stale artifact
        // compiled by a prior compiler at the same version misses on load
        // rather than being replayed (#2621). The fingerprint is a compile-time
        // constant, so exercise the parameterized inner hash directly.
        let tmp = tempfile::tempdir().unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "__io_println(\"hi\")\n").unwrap();
        let source = std::fs::read_to_string(&entry).unwrap();
        let a = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        let b = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-B");
        let a_again = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        assert_ne!(
            a, b,
            "differing compiler fingerprints must change the cache key"
        );
        assert_eq!(
            a, a_again,
            "an unchanged compiler fingerprint must be stable"
        );
    }

    /// Only non-stdlib, non-commented imports are reported, in source order.
    #[test]
    fn collect_user_imports_ignores_stdlib_and_comments() {
        let source = r#"
            // import "comment/should/be/ignored"
            import "std/agents"
            import { foo } from "pkg/bar"
            import "./relative/path"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(
            imports,
            vec!["pkg/bar".to_string(), "./relative/path".to_string()]
        );
    }

    /// `cache_enabled` follows `CACHE_ENABLED_ENV`: "0" disables, "1" or an
    /// unset variable enables.
    #[test]
    fn cache_enabled_respects_env() {
        // The environment is process-global and shared with every other test
        // in this binary. Put back whatever value was present on entry — even
        // when an assertion below panics — so the override does not leak.
        // NOTE(review): tests running concurrently can still observe the
        // temporary values while this test is in flight.
        struct RestoreEnv(Option<std::ffi::OsString>);
        impl Drop for RestoreEnv {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(previous) => std::env::set_var(CACHE_ENABLED_ENV, previous),
                    None => std::env::remove_var(CACHE_ENABLED_ENV),
                }
            }
        }
        let _restore = RestoreEnv(std::env::var_os(CACHE_ENABLED_ENV));

        std::env::set_var(CACHE_ENABLED_ENV, "0");
        assert!(!cache_enabled());
        std::env::set_var(CACHE_ENABLED_ENV, "1");
        assert!(cache_enabled());
        std::env::remove_var(CACHE_ENABLED_ENV);
        assert!(cache_enabled());
    }

    /// Import-looking text inside a string literal is not an import.
    #[test]
    fn import_path_inside_string_literal_is_ignored() {
        let source = r#"
            let payload = "import { foo } from \"./other\""
            import "./real"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(imports, vec!["./real".to_string()]);
    }

    /// Two entries importing the same files in opposite order hash identically.
    #[test]
    fn import_hash_is_stable_across_import_order() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("a.harn"),
            "pub fn a() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("b.harn"),
            "pub fn b() -> int { return 2 }\n",
        )
        .unwrap();
        let ab = tmp.path().join("entry_ab.harn");
        std::fs::write(
            &ab,
            "import \"./a\"\nimport \"./b\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let ba = tmp.path().join("entry_ba.harn");
        std::fs::write(
            &ba,
            "import \"./b\"\nimport \"./a\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let hash_ab = hash_transitive_user_imports(&ab, &std::fs::read_to_string(&ab).unwrap());
        let hash_ba = hash_transitive_user_imports(&ba, &std::fs::read_to_string(&ba).unwrap());
        assert_eq!(
            hash_ab, hash_ba,
            "import-graph hash must be order-independent so reordering imports \
             does not bust the cache"
        );
    }

    /// Editing a file two hops away from the entry changes the hash.
    #[test]
    fn import_hash_picks_up_nested_imports() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("mid.harn"),
            "import \"./leaf\"\npub fn y() -> int { return 2 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./mid\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 999 }\n",
        )
        .unwrap();
        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "editing a transitively-imported file must change the import-graph hash"
        );
    }

    #[test]
    fn import_hash_busts_on_same_length_edit_in_same_process() {
        // The per-file read/scan memo is keyed by `(path, len, mtime_ns)`. The
        // hardest case for that key is an edit that preserves byte length: only
        // the mtime distinguishes the two versions. Guard that a same-length edit
        // to a transitively-imported file, recomputed in the SAME process so the
        // memo is warm, still busts the import-graph hash. Without a working
        // staleness check a warm long-lived process would replay stale bytecode.
        let tmp = tempfile::tempdir().unwrap();
        let leaf = tmp.path().join("leaf.harn");
        std::fs::write(&leaf, "pub fn x() -> int { return 111 }\n").unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./leaf\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());

        // Same byte length (`111` -> `222`), so the memo must rely on mtime.
        // Sleep past the coarsest plausible mtime granularity so the stat key
        // genuinely changes on every filesystem this runs on.
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&leaf, "pub fn x() -> int { return 222 }\n").unwrap();
        assert_eq!(
            std::fs::metadata(&leaf).unwrap().len(),
            33,
            "the two leaf versions must be the same byte length for this test to \
             exercise the mtime path"
        );

        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "a same-length edit to a transitively-imported file must still change \
             the import-graph hash when recomputed in a warm process"
        );
    }

    #[test]
    fn import_hash_stable_across_repeated_calls_same_process() {
        // The memo must be a pure speed optimization: repeated `from_source`
        // calls over an unchanged tree (the cold-start module-load fan-out
        // pattern) must return byte-identical hashes.
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("dep.harn"),
            "pub fn d() -> int { return 7 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./dep\"\n__io_println(\"hi\")\n").unwrap();
        let src = std::fs::read_to_string(&entry).unwrap();
        let first = hash_transitive_user_imports(&entry, &src);
        for _ in 0..50 {
            assert_eq!(
                hash_transitive_user_imports(&entry, &src),
                first,
                "repeated import-graph hashing over an unchanged tree must be stable"
            );
        }
    }
}