Skip to main content

harn_vm/
bytecode_cache.rs

1//! Content-addressed on-disk cache for compiled `.harn` pipelines.
2//!
3//! Cold-start `harn run` re-parses, type-checks, and compiles the entry
4//! pipeline before the VM gets a single instruction to execute. For short
5//! Harn subcommands that wrap a few `llm_call`s in a small pipeline, that
6//! compile cost dominates wall-clock time.
7//!
8//! This module persists [`Chunk`] bytecode under
9//! `$HARN_CACHE_DIR/<source-hash>.harnbc` (XDG-aware). The cache key is
10//! derived from the entry source plus the content hash of every
//! transitively-imported user file; stdlib imports are covered by the
//! embedded `harn_version` field in the header and by a digest of the
//! embedded stdlib sources that is folded into the import-graph hash. Any
//! change to any input flips the key and the next run recompiles.
14//!
15//! File layout — little-endian throughout:
16//!
17//! ```text
18//! magic        : [u8; 8]   = "HARNBC\0\0"
19//! schema_ver   : u32       = SCHEMA_VERSION
20//! version_len  : u32
21//! harn_version : [u8; version_len]
22//! compiler_tag : u8        bitmask of active CompilerOptions
23//! kind         : u8        1 = entry chunk, 2 = module artifact
24//! source_hash  : [u8; 32]
25//! import_hash  : [u8; 32]
26//! payload      : bincode-serialized payload for `kind`
27//! ```
28//!
29//! The header lets a stale binary detect a future-version artifact
30//! without crashing: a magic mismatch, schema mismatch, or version
31//! mismatch is returned as `Ok(None)` so the caller transparently
32//! recompiles. Real I/O errors propagate.
33//!
34//! Concurrency: writes are atomic (write-tmp-then-rename), and parallel
35//! invocations on a cache miss race safely — the last writer wins, but
36//! every reader observes a consistent file because the rename is atomic
37//! on every supported filesystem.
38
39use std::fs;
40use std::io::{self, Read as _, Write as _};
41use std::path::{Path, PathBuf};
42
43use sha2::{Digest, Sha256};
44
45use crate::chunk::{CachedChunk, Chunk};
46use crate::compiler::CompilerOptions;
47use crate::module_artifact::ModuleArtifact;
48
/// Result of scanning one file: `(file content, user import paths found in it)`.
type ImportScan = (String, Vec<String>);
/// Reference-counted [`ImportScan`], the value type stored in the scan memo.
type SharedImportScan = std::sync::Arc<ImportScan>;
/// Scan-memo key: `(file path, length in bytes, mtime in nanoseconds)`.
type ImportsFileMemoKey = (PathBuf, u64, i128);
/// Mutex-guarded map from [`ImportsFileMemoKey`] to the memoized scan.
type ImportsFileMemo =
    std::sync::Mutex<std::collections::HashMap<ImportsFileMemoKey, SharedImportScan>>;
54
/// Header magic for all bytecode-cache artifact families. Written as the
/// first eight bytes of every artifact and compared on load.
pub const MAGIC: &[u8; 8] = b"HARNBC\0\0";

/// On-disk format version. Bump when [`CachedChunk`] or the header
/// layout changes in a backwards-incompatible way. An artifact whose
/// recorded schema differs from this value is treated as a cache miss.
pub const SCHEMA_VERSION: u32 = 4;

/// Compile-time Harn release. Cache files written by a different release
/// are rejected on load.
pub const HARN_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Build-time fingerprint of the compiler front-end — the lexer, parser, IR,
/// and code generator — computed in `build.rs` from those crates' source and
/// baked in via `cargo:rustc-env`. Folded into the cache key so a compiler
/// change that alters emitted bytecode for unchanged source invalidates stale
/// entries automatically, within a single version, with no manual cache wipe.
/// `HARN_VERSION` only busts the cache across release bumps; this closes the
/// same gap for the within-version compiler edits that masked #2610. See #2621.
pub const CODEGEN_FINGERPRINT: &str = env!("HARN_CODEGEN_FINGERPRINT");

/// Conventional extension for entry-chunk cache files.
pub const CACHE_EXTENSION: &str = "harnbc";

/// Conventional extension for module-artifact cache files. Distinct from
/// [`CACHE_EXTENSION`] so the same `.harn` source can have both shipped
/// adjacent if needed (e.g. when a file is both an executable entry and
/// imported by other files).
pub const MODULE_CACHE_EXTENSION: &str = "harnmod";

/// On-disk discriminant (the header's `kind` byte) for a [`Chunk`] payload.
const KIND_ENTRY_CHUNK: u8 = 1;
/// On-disk discriminant (the header's `kind` byte) for a [`ModuleArtifact`]
/// payload.
const KIND_MODULE_ARTIFACT: u8 = 2;

/// Environment override for the cache directory. When set, takes
/// precedence over the XDG and home-directory fallbacks.
pub const CACHE_DIR_ENV: &str = "HARN_CACHE_DIR";

/// Environment override that turns the cache off entirely. Setting this
/// to `0`, `false`, `no`, or `off` (compared case-insensitively) skips both
/// reads and writes; useful when debugging compiler changes.
pub const CACHE_ENABLED_ENV: &str = "HARN_BYTECODE_CACHE";
97
/// Result of a cache lookup. Carries the precomputed key so the caller
/// can write it back on a miss without rehashing.
pub struct LookupOutcome {
    /// Key computed for the looked-up source; reusable with [`store`].
    pub key: CacheKey,
    /// The cached chunk on a hit; `None` on a miss or when the cache is
    /// disabled.
    pub chunk: Option<Chunk>,
}
104
/// Cache key components for a single pipeline source. Equality of all
/// fields is necessary and sufficient for cache reuse.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CacheKey {
    /// SHA-256 of the entry source text; also determines the artifact
    /// filename.
    pub source_hash: [u8; 32],
    /// Digest over the transitively-imported user files.
    pub import_graph_hash: [u8; 32],
    /// Harn release that computed this key; compared against the version
    /// string recorded in an artifact's header.
    pub harn_version: &'static str,
    /// Compact tag for active [`CompilerOptions`]. Flipping
    /// `HARN_DISABLE_OPTIMIZATIONS` between runs would otherwise reuse a
    /// chunk compiled under the wrong setting.
    pub compiler_tag: u8,
}
117
118impl CacheKey {
119    /// Compute the cache key for a `.harn` source file plus its transitive
120    /// user imports. `read_source` is the entry-file contents; the import
121    /// graph is walked from disk relative to `source_path`.
122    pub fn from_source(source_path: &Path, source: &str) -> Self {
123        let source_hash = sha256(source.as_bytes());
124        let import_graph_hash = hash_transitive_user_imports(source_path, source);
125        Self {
126            source_hash,
127            import_graph_hash,
128            harn_version: HARN_VERSION,
129            compiler_tag: compiler_options_tag(CompilerOptions::from_env()),
130        }
131    }
132
133    /// Entry-chunk filename for this key. We hash by source content
134    /// alone so two invocations of the same source from different paths
135    /// share a cache entry; the header's import-graph hash still gates
136    /// reuse on a per-load basis.
137    pub fn filename(&self) -> String {
138        format!("{}.{}", hex(&self.source_hash), CACHE_EXTENSION)
139    }
140
141    /// Module-artifact filename for this key.
142    pub fn module_filename(&self) -> String {
143        format!("{}.{}", hex(&self.source_hash), MODULE_CACHE_EXTENSION)
144    }
145}
146
147/// Returns the directory the shared cache lives in. Honors
148/// `$HARN_CACHE_DIR`, then `$XDG_CACHE_HOME/harn/bytecode`, then
149/// `$HOME/.cache/harn/bytecode`. The directory is *not* created here —
150/// [`store`] creates it lazily on write so read-only environments don't
151/// pay an mkdir cost.
152pub fn cache_dir() -> PathBuf {
153    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
154        return PathBuf::from(custom);
155    }
156    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
157        let xdg = PathBuf::from(xdg);
158        if !xdg.as_os_str().is_empty() {
159            return xdg.join("harn").join("bytecode");
160        }
161    }
162    if let Some(home) = crate::user_dirs::home_dir() {
163        return home.join(".cache").join("harn").join("bytecode");
164    }
165    // Final fallback: a directory beside the binary's working dir. Mostly
166    // hit in tests that scrub HOME from the environment.
167    PathBuf::from(".harn-cache").join("bytecode")
168}
169
170/// Root for `.harnpack` archives unpacked by `harn run <bundle.harnpack>`.
171/// Each verified bundle is replayed into `<root>/<sanitized-bundle-hash>/`
172/// so re-runs reuse the unpacked tree. Honors `$HARN_CACHE_DIR/packs`
173/// when set, otherwise XDG / `$HOME/.cache/harn/packs`.
174pub fn packs_cache_dir() -> PathBuf {
175    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
176        return PathBuf::from(custom).join("packs");
177    }
178    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
179        let xdg = PathBuf::from(xdg);
180        if !xdg.as_os_str().is_empty() {
181            return xdg.join("harn").join("packs");
182        }
183    }
184    if let Some(home) = crate::user_dirs::home_dir() {
185        return home.join(".cache").join("harn").join("packs");
186    }
187    PathBuf::from(".harn-cache").join("packs")
188}
189
190/// True when the cache is enabled by the current environment.
191pub fn cache_enabled() -> bool {
192    match std::env::var(CACHE_ENABLED_ENV).ok().as_deref() {
193        Some(value) => !matches!(
194            value.to_ascii_lowercase().as_str(),
195            "0" | "false" | "no" | "off"
196        ),
197        None => true,
198    }
199}
200
201/// Try to load a cached chunk for `source_path` whose contents are
202/// `source`. Returns the key alongside the (optional) chunk so callers
203/// avoid recomputing the key on miss.
204pub fn load(source_path: &Path, source: &str) -> LookupOutcome {
205    let key = CacheKey::from_source(source_path, source);
206    if !cache_enabled() {
207        return LookupOutcome { key, chunk: None };
208    }
209    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
210    if let Some(adjacent) = adjacent_cache_path(source_path) {
211        candidates.push(adjacent);
212    }
213    candidates.push(cache_dir().join(key.filename()));
214    for path in candidates {
215        match read_chunk_if_matches(&path, &key) {
216            Ok(Some(chunk)) => {
217                return LookupOutcome {
218                    key,
219                    chunk: Some(chunk),
220                }
221            }
222            Ok(None) => continue,
223            Err(_) => continue,
224        }
225    }
226    LookupOutcome { key, chunk: None }
227}
228
229/// Persist `chunk` to the shared cache directory under `key`. Atomic: a
230/// temp file is written then renamed into place. Concurrent invocations
231/// on the same key race safely.
232pub fn store(key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
233    if !cache_enabled() {
234        return Ok(());
235    }
236    let dir = cache_dir();
237    fs::create_dir_all(&dir)?;
238    write_atomic_chunk(&dir.join(key.filename()), key, chunk)
239}
240
241/// Write a precompiled entry-chunk artifact to an explicit path, for
242/// use by the `harn precompile` subcommand. The header still records
243/// the key, so adjacent artifacts shipped with source are validated
244/// like any other cache hit.
245pub fn store_at(path: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
246    ensure_parent_dir(path)?;
247    write_atomic_chunk(path, key, chunk)
248}
249
250/// Look up the [`ModuleArtifact`] for `source_path` (whose contents are
251/// `source`). Mirrors [`load`] but for the `.harnmod` family.
252pub fn load_module(source_path: &Path, source: &str) -> ModuleLookupOutcome {
253    let key = CacheKey::from_source(source_path, source);
254    if !cache_enabled() {
255        return ModuleLookupOutcome {
256            key,
257            artifact: None,
258        };
259    }
260    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
261    if let Some(adjacent) = adjacent_module_cache_path(source_path) {
262        candidates.push(adjacent);
263    }
264    candidates.push(cache_dir().join(key.module_filename()));
265    for path in candidates {
266        match read_module_if_matches(&path, &key) {
267            Ok(Some(artifact)) => {
268                return ModuleLookupOutcome {
269                    key,
270                    artifact: Some(artifact),
271                }
272            }
273            Ok(None) => continue,
274            Err(_) => continue,
275        }
276    }
277    ModuleLookupOutcome {
278        key,
279        artifact: None,
280    }
281}
282
283/// Persist `artifact` to the shared cache under `key`. Atomic;
284/// concurrent invocations race safely.
285pub fn store_module(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
286    if !cache_enabled() {
287        return Ok(());
288    }
289    let dir = cache_dir();
290    fs::create_dir_all(&dir)?;
291    write_atomic_module(&dir.join(key.module_filename()), key, artifact)
292}
293
294/// Write a module artifact to an explicit path.
295pub fn store_module_at(path: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
296    ensure_parent_dir(path)?;
297    write_atomic_module(path, key, artifact)
298}
299
/// Result of a [`load_module`] lookup. Carries the precomputed key so
/// the caller can write it back on a miss without rehashing.
pub struct ModuleLookupOutcome {
    /// Key computed for the looked-up source; reusable with [`store_module`].
    pub key: CacheKey,
    /// The cached artifact on a hit; `None` on a miss or when the cache is
    /// disabled.
    pub artifact: Option<ModuleArtifact>,
}
306
/// Path to the adjacent precompiled entry-chunk artifact for
/// `source_path`. `foo.harn` → `foo.harnbc`.
///
/// Returns `None` when `source_path` has no file stem to build the name
/// from.
pub fn adjacent_cache_path(source_path: &Path) -> Option<PathBuf> {
    adjacent_path_with_extension(source_path, CACHE_EXTENSION)
}
312
/// Path to the adjacent precompiled module-artifact for `source_path`.
/// `foo.harn` → `foo.harnmod`.
///
/// Returns `None` when `source_path` has no file stem to build the name
/// from.
pub fn adjacent_module_cache_path(source_path: &Path) -> Option<PathBuf> {
    adjacent_path_with_extension(source_path, MODULE_CACHE_EXTENSION)
}
318
/// Build the sibling path of `source_path` whose final extension is
/// replaced by `ext`: `dir/foo.harn` → `dir/foo.<ext>`.
///
/// Only the *last* extension is replaced, so `dir/foo.tar.harn` becomes
/// `dir/foo.tar.<ext>`. (Calling `set_extension` on the stem would treat
/// `.tar` as the extension and collapse the name to `dir/foo.<ext>`, which
/// collides with the artifact of `dir/foo.harn`.)
///
/// Returns `None` when `source_path` has no file stem (for example an
/// empty path or one ending in `..`).
fn adjacent_path_with_extension(source_path: &Path, ext: &str) -> Option<PathBuf> {
    let stem = source_path.file_stem()?;
    if stem.is_empty() {
        return None;
    }
    // Append ".<ext>" to the stem verbatim instead of going through
    // `set_extension`, which would eat any dot already inside the stem.
    let mut file_name = stem.to_os_string();
    file_name.push(".");
    file_name.push(ext);
    let parent = source_path.parent().unwrap_or_else(|| Path::new(""));
    Some(parent.join(file_name))
}
329
/// Create the parent directory of `path` (and any missing ancestors).
///
/// A path with no parent, or whose parent is the empty path (a bare file
/// name), needs nothing created and succeeds immediately.
fn ensure_parent_dir(path: &Path) -> io::Result<()> {
    match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => fs::create_dir_all(dir),
        _ => Ok(()),
    }
}
338
339fn write_atomic_chunk(target: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
340    let buf = serialize_chunk_artifact(key, chunk)?;
341    write_atomic(target, &buf)
342}
343
344fn write_atomic_module(target: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
345    let buf = serialize_module_artifact(key, artifact)?;
346    write_atomic(target, &buf)
347}
348
349/// Serialize an entry-chunk artifact (header + payload) to bytes. The
350/// resulting buffer is byte-identical to the file [`store_at`] would
351/// have written for the same `(key, chunk)`. Use this when packaging
352/// artifacts into a container (e.g. `harn pack`) without going through
353/// the filesystem.
354pub fn serialize_chunk_artifact(key: &CacheKey, chunk: &Chunk) -> io::Result<Vec<u8>> {
355    let cached = chunk.freeze_for_cache();
356    let payload = bincode::serde::encode_to_vec(&cached, bincode::config::standard())
357        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
358    Ok(encode_artifact(key, KIND_ENTRY_CHUNK, &payload))
359}
360
361/// Serialize a module artifact (header + payload) to bytes. Companion
362/// to [`serialize_chunk_artifact`] for the `.harnmod` family.
363pub fn serialize_module_artifact(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<Vec<u8>> {
364    let payload = bincode::serde::encode_to_vec(artifact, bincode::config::standard())
365        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
366    Ok(encode_artifact(key, KIND_MODULE_ARTIFACT, &payload))
367}
368
369fn encode_artifact(key: &CacheKey, kind: u8, payload: &[u8]) -> Vec<u8> {
370    let mut buf: Vec<u8> = Vec::with_capacity(payload.len() + 128);
371    buf.extend_from_slice(MAGIC);
372    buf.extend_from_slice(&SCHEMA_VERSION.to_le_bytes());
373    let version_bytes = HARN_VERSION.as_bytes();
374    buf.extend_from_slice(&(version_bytes.len() as u32).to_le_bytes());
375    buf.extend_from_slice(version_bytes);
376    buf.push(key.compiler_tag);
377    buf.push(kind);
378    buf.extend_from_slice(&key.source_hash);
379    buf.extend_from_slice(&key.import_graph_hash);
380    buf.extend_from_slice(payload);
381    buf
382}
383
384fn write_atomic(target: &Path, buf: &[u8]) -> io::Result<()> {
385    let tmp_path = atomic_tmp_path(target);
386    let mut tmp_file = fs::File::create(&tmp_path)?;
387    tmp_file.write_all(buf)?;
388    tmp_file.sync_all()?;
389    drop(tmp_file);
390    match fs::rename(&tmp_path, target) {
391        Ok(()) => Ok(()),
392        Err(err) => {
393            let _ = fs::remove_file(&tmp_path);
394            Err(err)
395        }
396    }
397}
398
/// Name a temp file next to `target` for an in-progress write.
///
/// The name is `.<target file name>.<pid>.<counter>.tmp`, or
/// `.harn-cache.<pid>.<counter>.tmp` when `target` has no file name. The
/// counter increases on every call within the process, so repeated calls
/// yield distinct paths.
fn atomic_tmp_path(target: &Path) -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};

    static NEXT_TMP_ID: AtomicU64 = AtomicU64::new(0);
    let serial = NEXT_TMP_ID.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let tmp_name = if let Some(name) = target.file_name() {
        format!(".{}.{pid}.{serial}.tmp", name.to_string_lossy())
    } else {
        format!(".harn-cache.{pid}.{serial}.tmp")
    };
    target.with_file_name(tmp_name)
}
413
/// Parsed cache header. Read by both the chunk and module loaders so the
/// header-validation logic stays in one place.
struct ParsedHeader {
    /// The header's `kind` byte; each loader compares it with its own
    /// discriminant constant.
    kind: u8,
    /// Every byte that follows the header, still bincode-encoded.
    payload: Vec<u8>,
}
420
421fn read_header_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ParsedHeader>> {
422    let mut file = match fs::File::open(path) {
423        Ok(f) => f,
424        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
425        Err(err) => return Err(err),
426    };
427    let mut header = [0u8; 8 + 4 + 4];
428    if file.read_exact(&mut header).is_err() {
429        return Ok(None);
430    }
431    if &header[..8] != MAGIC {
432        return Ok(None);
433    }
434    let schema = u32::from_le_bytes(header[8..12].try_into().unwrap());
435    if schema != SCHEMA_VERSION {
436        return Ok(None);
437    }
438    let version_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
439    if version_len > 256 {
440        // Bound the alloc so a corrupted file cannot force an unbounded read.
441        return Ok(None);
442    }
443    let mut version_buf = vec![0u8; version_len];
444    if file.read_exact(&mut version_buf).is_err() {
445        return Ok(None);
446    }
447    if version_buf != key.harn_version.as_bytes() {
448        return Ok(None);
449    }
450    let mut compiler_and_kind = [0u8; 2];
451    if file.read_exact(&mut compiler_and_kind).is_err() {
452        return Ok(None);
453    }
454    if compiler_and_kind[0] != key.compiler_tag {
455        return Ok(None);
456    }
457    let kind = compiler_and_kind[1];
458    let mut hashes = [0u8; 64];
459    if file.read_exact(&mut hashes).is_err() {
460        return Ok(None);
461    }
462    if hashes[..32] != key.source_hash || hashes[32..] != key.import_graph_hash {
463        return Ok(None);
464    }
465    let mut payload = Vec::new();
466    if file.read_to_end(&mut payload).is_err() {
467        return Ok(None);
468    }
469    Ok(Some(ParsedHeader { kind, payload }))
470}
471
472fn read_chunk_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<Chunk>> {
473    let Some(header) = read_header_if_matches(path, key)? else {
474        return Ok(None);
475    };
476    if header.kind != KIND_ENTRY_CHUNK {
477        return Ok(None);
478    }
479    let cached: CachedChunk =
480        match bincode::serde::decode_from_slice(&header.payload, bincode::config::standard()) {
481            Ok((c, _)) => c,
482            Err(_) => return Ok(None),
483        };
484    Ok(Some(Chunk::from_cached(&cached)))
485}
486
487fn read_module_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ModuleArtifact>> {
488    let Some(header) = read_header_if_matches(path, key)? else {
489        return Ok(None);
490    };
491    if header.kind != KIND_MODULE_ARTIFACT {
492        return Ok(None);
493    }
494    match bincode::serde::decode_from_slice::<ModuleArtifact, _>(
495        &header.payload,
496        bincode::config::standard(),
497    ) {
498        Ok((artifact, _)) => Ok(Some(artifact)),
499        Err(_) => Ok(None),
500    }
501}
502
503/// Compact representation of [`CompilerOptions`] for the cache header.
504/// Independent flags get distinct bits so adding a new flag never
505/// silently changes existing keys when an old binary reads a new
506/// artifact — the header check will fail-closed before we get there
507/// anyway, but mapping to bits also keeps the tag a stable function
508/// of the option set.
509fn compiler_options_tag(options: CompilerOptions) -> u8 {
510    let mut tag: u8 = 0;
511    if options.optimizations_enabled() {
512        tag |= 0b0000_0001;
513    }
514    tag
515}
516
517fn sha256(bytes: &[u8]) -> [u8; 32] {
518    let mut hasher = Sha256::new();
519    hasher.update(bytes);
520    hasher.finalize().into()
521}
522
/// Lowercase hexadecimal rendering of `bytes`, two digits per byte.
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
530
/// Lightweight regex-free scan that surfaces user imports without paying
/// a full lex+parse. False positives only increase cache churn, never
/// correctness; comments and string literals are skipped so neither a
/// commented-out import nor a `"import …"` value appearing inside an
/// unrelated string gates the hash.
///
/// Returns the path string of every `import` statement found, in source
/// order, excluding paths that start with `std/`.
fn collect_user_imports(source: &str) -> Vec<String> {
    // Comments are removed up front so the byte scan below only has to
    // distinguish code from string literals.
    let scrubbed = strip_comments(source);
    let mut out: Vec<String> = Vec::new();
    let bytes = scrubbed.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'"' {
            // Skip past any string literal so identifiers inside string
            // values cannot trigger the keyword match below.
            match read_string_literal(bytes, i) {
                Some((_, end)) => {
                    i = end;
                    continue;
                }
                None => {
                    // Unterminated literal: step over the quote alone.
                    i += 1;
                    continue;
                }
            }
        }
        if !matches_keyword(bytes, i, b"import") {
            i += 1;
            continue;
        }
        // Skip past `import` and any selective `{ ... } from` clause; we
        // only need the source-position of the path string literal.
        let mut j = i + b"import".len();
        // Brace nesting depth; newlines inside `{ ... }` do not end the
        // statement.
        let mut depth = 0i32;
        while j < bytes.len() {
            match bytes[j] {
                b'"' => {
                    if let Some((path, end)) = read_string_literal(bytes, j) {
                        // `std/` paths are not user imports and are left
                        // out of the result.
                        if !path.starts_with("std/") {
                            out.push(path);
                        }
                        i = end;
                        break;
                    }
                    j += 1;
                }
                b'{' => {
                    depth += 1;
                    j += 1;
                }
                b'}' => {
                    depth -= 1;
                    j += 1;
                }
                b'\n' if depth == 0 => {
                    // No string literal on this logical line; bail and
                    // continue scanning after the keyword to avoid an
                    // infinite loop.
                    i = j;
                    break;
                }
                _ => j += 1,
            }
        }
        // The inner scan ran off the end of the input without finding a
        // path or a terminating newline: nothing is left to scan.
        if j >= bytes.len() {
            break;
        }
        if i < j {
            // Defensive: ensure forward progress when the inner loop
            // exited without setting `i`.
            i = j;
        }
    }
    out
}
605
/// Whether `keyword` occurs in `bytes` at offset `at` as a whole word.
///
/// The bytes at `at` must equal `keyword`, and neither the byte just
/// before nor the byte just after may be an identifier character. An
/// offset past the end of `bytes` never matches.
fn matches_keyword(bytes: &[u8], at: usize, keyword: &[u8]) -> bool {
    let Some(tail) = bytes.get(at..) else {
        return false;
    };
    if !tail.starts_with(keyword) {
        return false;
    }
    let glued_before = at
        .checked_sub(1)
        .map_or(false, |prev| is_ident_char(bytes[prev]));
    let glued_after = tail
        .get(keyword.len())
        .map_or(false, |&next| is_ident_char(next));
    !(glued_before || glued_after)
}

/// Whether `b` can be part of an identifier: an ASCII letter, an ASCII
/// digit, or `_`.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
626
/// Read the double-quoted string literal whose opening quote is at
/// `bytes[at]`.
///
/// Returns the unescaped contents together with the index just past the
/// closing quote. The escapes `\n`, `\r`, and `\t` map to their control
/// characters; any other escaped byte (including `\"` and `\\`) stands for
/// itself. Returns `None` when the literal is cut off by a newline or by
/// the end of the input.
///
/// The contents are collected as raw bytes and decoded as UTF-8 once at the
/// end, so non-ASCII characters survive intact. (Converting each byte with
/// `as char` would reinterpret every UTF-8 byte as a Latin-1 character and
/// corrupt such paths.) Invalid UTF-8 is replaced lossily.
fn read_string_literal(bytes: &[u8], at: usize) -> Option<(String, usize)> {
    debug_assert_eq!(bytes[at], b'"');
    let mut raw: Vec<u8> = Vec::new();
    let mut i = at + 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => return Some((String::from_utf8_lossy(&raw).into_owned(), i + 1)),
            b'\\' => {
                // A backslash as the very last byte leaves the literal
                // unterminated.
                let escaped = *bytes.get(i + 1)?;
                raw.push(match escaped {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    other => other,
                });
                i += 2;
            }
            b'\n' => return None,
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    None
}

/// Return `source` with `//` line comments and `/* … */` block comments
/// removed.
///
/// String literals are copied through verbatim, so comment markers inside
/// them are preserved. The newline that ends a line comment is kept. An
/// unterminated block comment swallows the rest of the input. Text outside
/// comments is copied character by character, so non-ASCII characters are
/// preserved unchanged.
fn strip_comments(source: &str) -> String {
    let bytes = source.as_bytes();
    let mut out = String::with_capacity(source.len());
    // Invariant: `i` always sits on a UTF-8 character boundary of `source`.
    let mut i = 0;
    while i < bytes.len() {
        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
            while i < bytes.len() && bytes[i] != b'\n' {
                i += 1;
            }
            continue;
        }
        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
            i += 2;
            while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
                i += 1;
            }
            i = (i + 2).min(bytes.len());
            continue;
        }
        if bytes[i] == b'"' {
            if let Some((_, end)) = read_string_literal(bytes, i) {
                out.push_str(&source[i..end]);
                i = end;
                continue;
            }
        }
        // Copy one whole character so multi-byte sequences stay intact.
        let width = source[i..].chars().next().map_or(1, char::len_utf8);
        out.push_str(&source[i..i + width]);
        i += width;
    }
    out
}
689
690/// Stable digest over every embedded stdlib source. Folded into the
691/// user-file cache key so that bumping a stdlib module (changing its
692/// embedded `.harn` content) invalidates cached user bytecode that may
693/// reference stale function-pool layouts from a prior stdlib snapshot.
694/// `HARN_VERSION` already busts the cache across release bumps; this
695/// closes the same gap for within-version stdlib edits (a frequent
696/// pattern during local development).
697///
698/// Cached in a `OnceLock` because `STDLIB_SOURCES` is a static `const`
699/// slice — the digest is identical for the lifetime of the process.
700fn embedded_stdlib_digest() -> &'static [u8; 32] {
701    use std::sync::OnceLock;
702    static DIGEST: OnceLock<[u8; 32]> = OnceLock::new();
703    DIGEST.get_or_init(|| {
704        let mut entries: Vec<(&'static str, &'static str)> = harn_stdlib::STDLIB_SOURCES
705            .iter()
706            .map(|src| (src.module, src.source))
707            .collect();
708        entries.sort_by(|a, b| a.0.cmp(b.0));
709        let mut hasher = Sha256::new();
710        for (module, source) in entries {
711            hasher.update(module.as_bytes());
712            hasher.update(b"\0");
713            hasher.update(source.as_bytes());
714            hasher.update(b"\0");
715        }
716        hasher.finalize().into()
717    })
718}
719
/// Walk the user-import graph rooted at `source_path` and produce a
/// stable hash of every transitively-reachable file. The hash is
/// order-independent: each visited file is keyed by canonical path and
/// emitted in sorted order, so reordering imports inside a file does
/// not invalidate the cache while changing any file's content does.
///
/// Embedded stdlib content is folded into the hash too — `collect_user_imports`
/// deliberately skips `std/*` paths (they resolve to in-binary sources, not
/// disk files), so without this fold a stdlib edit between development
/// builds would leave user-file caches pinned to a stale stdlib snapshot.
///
/// This is the production entry point: it delegates to
/// [`hash_transitive_user_imports_fingerprinted`] with the build-time
/// [`CODEGEN_FINGERPRINT`].
fn hash_transitive_user_imports(source_path: &Path, source: &str) -> [u8; 32] {
    hash_transitive_user_imports_fingerprinted(source_path, source, CODEGEN_FINGERPRINT)
}
733
734/// Process-wide memo of `(file content, collect_user_imports(content))` keyed by
735/// the resolved file path plus its stat identity `(len, mtime_ns)`. Walking a
736/// large pipeline's import graph re-encounters the same shared library files for
737/// nearly every module, so without this memo `from_source` re-reads and
738/// re-scans those files hundreds of times in a single cold run. Because the key
739/// includes `(len, mtime_ns)`, any on-disk edit produces a fresh key and the
740/// stale entry is never reused — a warm long-lived process recompiles edited
741/// pipelines correctly. The returned bytes are identical to the un-memoized
742/// path, so cache keys are byte-for-byte unchanged.
743fn imports_file_memo() -> &'static ImportsFileMemo {
744    use std::sync::OnceLock;
745    static MEMO: OnceLock<ImportsFileMemo> = OnceLock::new();
746    MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()))
747}
748
/// `Path::canonicalize` with a process-wide memo.
///
/// The import-graph walk canonicalizes the same module paths many times,
/// and each call hits the filesystem. A successful result is remembered
/// for the rest of the process, on the assumption that the pipeline tree
/// is not moved mid-run. A failed canonicalization is deliberately *not*
/// remembered: the input path is returned unchanged, and a file or symlink
/// that appears later is resolved freshly on the next call.
fn canonicalize_cached(path: &Path) -> PathBuf {
    use std::collections::HashMap;
    use std::sync::{Mutex, OnceLock};

    static RESOLVED: OnceLock<Mutex<HashMap<PathBuf, PathBuf>>> = OnceLock::new();
    let cache = RESOLVED.get_or_init(Mutex::default);

    let remembered = cache.lock().unwrap().get(path).cloned();
    if let Some(canonical) = remembered {
        return canonical;
    }
    // Unresolved path: hand back the input without memoizing it.
    let Ok(canonical) = path.canonicalize() else {
        return path.to_path_buf();
    };
    cache
        .lock()
        .unwrap()
        .insert(path.to_path_buf(), canonical.clone());
    canonical
}
778
/// Stat `path` and return `(length in bytes, mtime in nanoseconds since
/// the Unix epoch)`, or `None` when the metadata cannot be read.
///
/// The mtime component is `0` when the modification time is unavailable or
/// lies before the epoch.
fn file_stat_identity(path: &Path) -> Option<(u64, i128)> {
    let Ok(meta) = fs::metadata(path) else {
        return None;
    };
    let mtime_ns = match meta.modified() {
        Ok(stamp) => match stamp.duration_since(std::time::UNIX_EPOCH) {
            Ok(since_epoch) => since_epoch.as_nanos() as i128,
            Err(_) => 0,
        },
        Err(_) => 0,
    };
    Some((meta.len(), mtime_ns))
}
792
793/// Read `path` and scan its user imports, memoized by stat identity. On an I/O
794/// error, returns the `ErrorKind` string the un-memoized path folded in (errors
795/// are not memoized — a transient failure should not be sticky).
796fn read_and_scan_imports_cached(path: &Path) -> Result<(String, Vec<String>), String> {
797    if let Some((len, mtime_ns)) = file_stat_identity(path) {
798        let key = (path.to_path_buf(), len, mtime_ns);
799        if let Some(hit) = imports_file_memo().lock().unwrap().get(&key).cloned() {
800            return Ok((hit.0.clone(), hit.1.clone()));
801        }
802        match fs::read_to_string(path) {
803            Ok(content) => {
804                let nested = collect_user_imports(&content);
805                let entry = std::sync::Arc::new((content.clone(), nested.clone()));
806                imports_file_memo().lock().unwrap().insert(key, entry);
807                Ok((content, nested))
808            }
809            Err(err) => Err(err.kind().to_string()),
810        }
811    } else {
812        // No stat (file vanished between resolve and read): fall back to a direct
813        // read so behavior matches the un-memoized path exactly.
814        match fs::read_to_string(path) {
815            Ok(content) => {
816                let nested = collect_user_imports(&content);
817                Ok((content, nested))
818            }
819            Err(err) => Err(err.kind().to_string()),
820        }
821    }
822}
823
824/// Inner form of [`hash_transitive_user_imports`] parameterized on the compiler
825/// fingerprint so tests can vary it; production always passes
826/// [`CODEGEN_FINGERPRINT`].
827fn hash_transitive_user_imports_fingerprinted(
828    source_path: &Path,
829    source: &str,
830    codegen_fingerprint: &str,
831) -> [u8; 32] {
832    let mut visited: std::collections::BTreeMap<PathBuf, ImportNode> =
833        std::collections::BTreeMap::new();
834    let mut frontier: Vec<(PathBuf, String)> = collect_user_imports(source)
835        .into_iter()
836        .map(|import| (source_path.to_path_buf(), import))
837        .collect();
838
839    while let Some((anchor, import)) = frontier.pop() {
840        let Some(resolved) = harn_modules::resolve_import_path(&anchor, &import) else {
841            // Unresolved imports get a sentinel keyed by their resolution
842            // anchor so that dropping a real file under that anchor later
843            // produces a different key.
844            let sentinel = anchor.join(format!("__unresolved__/{import}"));
845            visited
846                .entry(sentinel)
847                .or_insert(ImportNode::Unresolved { import });
848            continue;
849        };
850        let canonical = canonicalize_cached(&resolved);
851        if visited.contains_key(&canonical) {
852            continue;
853        }
854        // Per-file read + import-scan is memoized process-wide, keyed by the
855        // file's identity stat `(len, mtime)`. The same handful of core library
856        // modules (`lib/host/*`, `lib/runtime/*`, ...) sit on the import graph of
857        // nearly every module, so a cold `from_source` over a large pipeline used
858        // to re-read and re-scan the same files hundreds of times across the
859        // module-load fan-out. The memo is invalidated automatically the moment a
860        // file's stat changes on disk, so a warm long-lived process still recompiles
861        // edited pipelines correctly. The folded hash bytes are byte-identical to
862        // the un-memoized path (same content + same `collect_user_imports` output),
863        // so cache keys are unchanged. See `imports_file_memo`.
864        match read_and_scan_imports_cached(&resolved) {
865            Ok((content, nested)) => {
866                visited.insert(canonical.clone(), ImportNode::Resolved { content });
867                for nested_import in nested {
868                    frontier.push((resolved.clone(), nested_import));
869                }
870            }
871            Err(kind) => {
872                visited.insert(canonical, ImportNode::IoError { kind });
873            }
874        }
875    }
876
877    let mut hasher = Sha256::new();
878    hasher.update(b"stdlib-digest\0");
879    hasher.update(embedded_stdlib_digest());
880    hasher.update(b"\0");
881    // Fold in the compiler's code-generation fingerprint so a compiler change
882    // that alters emitted bytecode for unchanged source busts stale cache
883    // entries within a single version — the gap that masked the #2610 fix until
884    // the cache was cleared by hand. See `build.rs` and `CODEGEN_FINGERPRINT`.
885    hasher.update(b"codegen-fingerprint\0");
886    hasher.update(codegen_fingerprint.as_bytes());
887    hasher.update(b"\0");
888    for (path, node) in &visited {
889        hasher.update(path.to_string_lossy().as_bytes());
890        hasher.update(b"\0");
891        match node {
892            ImportNode::Resolved { content } => {
893                hasher.update(b"resolved\0");
894                hasher.update(content.as_bytes());
895            }
896            ImportNode::Unresolved { import } => {
897                hasher.update(b"unresolved\0");
898                hasher.update(import.as_bytes());
899            }
900            ImportNode::IoError { kind } => {
901                hasher.update(b"ioerror\0");
902                hasher.update(kind.as_bytes());
903            }
904        }
905        hasher.update(b"\0");
906    }
907    hasher.finalize().into()
908}
909
/// Outcome recorded for one node of the transitive user-import graph while
/// computing the import-graph hash.
enum ImportNode {
    /// The import resolved to a readable file; `content` is the file's text.
    Resolved { content: String },
    /// The import specifier could not be resolved to a path; `import` is the
    /// specifier as written in the source.
    Unresolved { import: String },
    /// The import resolved to a path but reading it failed; `kind` is the
    /// `io::ErrorKind` rendered as a string.
    IoError { kind: String },
}
915
#[cfg(test)]
mod tests {
    use super::*;
    use crate::compile_source;

    /// A chunk written with `store_at` loads back under the same key.
    #[test]
    fn header_round_trips_chunk() {
        let chunk = compile_source("__io_println(\"hello\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/example.harn"), "__io_println(\"hello\")");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("entry.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let loaded = read_chunk_if_matches(&path, &key).unwrap();
        assert!(loaded.is_some(), "expected cached chunk to load");
    }

    #[test]
    fn serialize_chunk_artifact_matches_store_at() {
        // `serialize_chunk_artifact` packages an artifact into a buffer for
        // in-memory consumers (e.g. `harn pack` writing into a tar.zst
        // bundle). The contract is: the resulting bytes match what
        // `store_at` would have written for the same key+chunk, so the
        // shipped artifact is byte-identical to the on-disk cache form.
        let chunk = compile_source("__io_println(\"hi\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/pack.harn"), "__io_println(\"hi\")");
        let tmp = tempfile::tempdir().unwrap();
        let on_disk = tmp.path().join("pack.harnbc");
        store_at(&on_disk, &key, &chunk).expect("write");
        let on_disk_bytes = std::fs::read(&on_disk).unwrap();
        let in_memory_bytes = serialize_chunk_artifact(&key, &chunk).expect("serialize");
        assert_eq!(in_memory_bytes, on_disk_bytes);
    }

    /// Two temp paths derived from the same target must differ.
    #[test]
    fn atomic_temp_paths_are_unique_within_process() {
        let target = Path::new("entry.harnbc");
        let first = atomic_tmp_path(target);
        let second = atomic_tmp_path(target);
        assert_ne!(
            first, second,
            "same-process concurrent cache writes must not share a temp file"
        );
    }

    /// A key with a different source hash must not load the stored chunk.
    #[test]
    fn header_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/a.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("a.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            source_hash: [0xAB; 32],
            import_graph_hash: key.import_graph_hash,
            harn_version: HARN_VERSION,
            compiler_tag: key.compiler_tag,
        };
        assert!(read_chunk_if_matches(&path, &other).unwrap().is_none());
    }

    /// A key with a different compiler tag must not load the stored chunk.
    #[test]
    fn compiler_tag_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/b.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("b.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            compiler_tag: key.compiler_tag ^ 0xFF,
            ..key
        };
        assert!(
            read_chunk_if_matches(&path, &other).unwrap().is_none(),
            "flipped HARN_DISABLE_OPTIMIZATIONS must not reuse a chunk \
             compiled under the opposite setting"
        );
    }

    #[test]
    fn codegen_fingerprint_is_populated() {
        // In-workspace builds always hash real compiler sources, so the
        // fingerprint must be a non-empty digest; an empty value would silently
        // disable the within-version compiler-staleness guard.
        assert!(!CODEGEN_FINGERPRINT.is_empty());
    }

    #[test]
    fn codegen_fingerprint_changes_cache_key() {
        // A compiler whose code-generation source differs must produce a
        // different cache key for the *same* user source, so a stale artifact
        // compiled by a prior compiler at the same version misses on load
        // rather than being replayed (#2621). The fingerprint is a compile-time
        // constant, so exercise the parameterized inner hash directly.
        let tmp = tempfile::tempdir().unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "__io_println(\"hi\")\n").unwrap();
        let source = std::fs::read_to_string(&entry).unwrap();
        let a = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        let b = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-B");
        let a_again = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        assert_ne!(
            a, b,
            "differing compiler fingerprints must change the cache key"
        );
        assert_eq!(
            a, a_again,
            "an unchanged compiler fingerprint must be stable"
        );
    }

    /// Only user imports are collected: commented-out and `std/` imports are
    /// skipped, and the remaining ones are returned in source order.
    #[test]
    fn collect_user_imports_ignores_stdlib_and_comments() {
        let source = r#"
            // import "comment/should/be/ignored"
            import "std/agents"
            import { foo } from "pkg/bar"
            import "./relative/path"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(
            imports,
            vec!["pkg/bar".to_string(), "./relative/path".to_string()]
        );
    }

    #[test]
    fn cache_enabled_respects_env() {
        // The environment is process-global and shared with every other test in
        // this binary, so put the variable back the way it was found — also on
        // an assertion failure — instead of leaving it unconditionally removed.
        struct RestoreEnv(Option<std::ffi::OsString>);
        impl Drop for RestoreEnv {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(value) => std::env::set_var(CACHE_ENABLED_ENV, value),
                    None => std::env::remove_var(CACHE_ENABLED_ENV),
                }
            }
        }
        let _restore = RestoreEnv(std::env::var_os(CACHE_ENABLED_ENV));

        std::env::set_var(CACHE_ENABLED_ENV, "0");
        assert!(!cache_enabled());
        std::env::set_var(CACHE_ENABLED_ENV, "1");
        assert!(cache_enabled());
        std::env::remove_var(CACHE_ENABLED_ENV);
        assert!(cache_enabled());
    }

    /// Import-looking text inside a string literal is not treated as an import.
    #[test]
    fn import_path_inside_string_literal_is_ignored() {
        let source = r#"
            let payload = "import { foo } from \"./other\""
            import "./real"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(imports, vec!["./real".to_string()]);
    }

    #[test]
    fn import_hash_is_stable_across_import_order() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("a.harn"),
            "pub fn a() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("b.harn"),
            "pub fn b() -> int { return 2 }\n",
        )
        .unwrap();
        // Two entry files that differ only in the order of their imports.
        let ab = tmp.path().join("entry_ab.harn");
        std::fs::write(
            &ab,
            "import \"./a\"\nimport \"./b\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let ba = tmp.path().join("entry_ba.harn");
        std::fs::write(
            &ba,
            "import \"./b\"\nimport \"./a\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let hash_ab = hash_transitive_user_imports(&ab, &std::fs::read_to_string(&ab).unwrap());
        let hash_ba = hash_transitive_user_imports(&ba, &std::fs::read_to_string(&ba).unwrap());
        assert_eq!(
            hash_ab, hash_ba,
            "import-graph hash must be order-independent so reordering imports \
             does not bust the cache"
        );
    }

    #[test]
    fn import_hash_picks_up_nested_imports() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("mid.harn"),
            "import \"./leaf\"\npub fn y() -> int { return 2 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./mid\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        // Edit the file two hops away from the entry (entry -> mid -> leaf).
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 999 }\n",
        )
        .unwrap();
        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "editing a transitively-imported file must change the import-graph hash"
        );
    }

    #[test]
    fn import_hash_busts_on_same_length_edit_in_same_process() {
        // The per-file read/scan memo is keyed by `(path, len, mtime_ns)`. The
        // hardest case for that key is an edit that preserves byte length: only
        // the mtime distinguishes the two versions. Guard that a same-length edit
        // to a transitively-imported file, recomputed in the SAME process so the
        // memo is warm, still busts the import-graph hash. Without a working
        // staleness check a warm long-lived process would replay stale bytecode.
        let tmp = tempfile::tempdir().unwrap();
        let leaf = tmp.path().join("leaf.harn");
        std::fs::write(&leaf, "pub fn x() -> int { return 111 }\n").unwrap();
        let len_before_edit = std::fs::metadata(&leaf).unwrap().len();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./leaf\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());

        // Same byte length (`111` -> `222`), so the memo must rely on mtime.
        // Sleep past the coarsest plausible mtime granularity so the stat key
        // genuinely changes on every filesystem this runs on.
        std::thread::sleep(std::time::Duration::from_millis(1100));
        std::fs::write(&leaf, "pub fn x() -> int { return 222 }\n").unwrap();
        // Compare against the length measured before the edit rather than a
        // hard-coded byte count, so the precondition stays correct if either
        // fixture string is ever changed.
        assert_eq!(
            std::fs::metadata(&leaf).unwrap().len(),
            len_before_edit,
            "the two leaf versions must be the same byte length for this test to \
             exercise the mtime path"
        );

        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "a same-length edit to a transitively-imported file must still change \
             the import-graph hash when recomputed in a warm process"
        );
    }

    #[test]
    fn import_hash_stable_across_repeated_calls_same_process() {
        // The memo must be a pure speed optimization: repeated `from_source`
        // calls over an unchanged tree (the cold-start module-load fan-out
        // pattern) must return byte-identical hashes.
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("dep.harn"),
            "pub fn d() -> int { return 7 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./dep\"\n__io_println(\"hi\")\n").unwrap();
        let src = std::fs::read_to_string(&entry).unwrap();
        let first = hash_transitive_user_imports(&entry, &src);
        for _ in 0..50 {
            assert_eq!(
                hash_transitive_user_imports(&entry, &src),
                first,
                "repeated import-graph hashing over an unchanged tree must be stable"
            );
        }
    }
}