Skip to main content

harn_vm/
bytecode_cache.rs

1//! Content-addressed on-disk cache for compiled `.harn` pipelines.
2//!
3//! Cold-start `harn run` re-parses, type-checks, and compiles the entry
4//! pipeline before the VM gets a single instruction to execute. For short
5//! Harn subcommands that wrap a few `llm_call`s in a small pipeline, that
6//! compile cost dominates wall-clock time.
7//!
8//! This module persists [`Chunk`] bytecode under
9//! `$HARN_CACHE_DIR/<source-hash>.harnbc` (XDG-aware). The cache key is
10//! derived from the entry source plus the content hash of every
11//! transitively-imported user file; stdlib imports are covered by the
12//! embedded `harn_version` field in the header. Any change to any input
13//! flips the key and the next run recompiles.
14//!
15//! File layout — little-endian throughout:
16//!
17//! ```text
18//! magic        : [u8; 8]   = "HARNBC\0\0"
19//! schema_ver   : u32       = SCHEMA_VERSION
20//! version_len  : u32
21//! harn_version : [u8; version_len]
22//! compiler_tag : u8        bitmask of active CompilerOptions
23//! kind         : u8        1 = entry chunk, 2 = module artifact
24//! source_hash  : [u8; 32]
25//! import_hash  : [u8; 32]
26//! payload      : bincode-serialized payload for `kind`
27//! ```
28//!
29//! The header lets a stale binary detect a future-version artifact
30//! without crashing: a magic mismatch, schema mismatch, or version
31//! mismatch is returned as `Ok(None)` so the caller transparently
32//! recompiles. Real I/O errors propagate.
33//!
34//! Concurrency: writes are atomic (write-tmp-then-rename), and parallel
35//! invocations on a cache miss race safely — the last writer wins, but
36//! every reader observes a consistent file because the rename is atomic
37//! on every supported filesystem.
38
39use std::fs;
40use std::io::{self, Read as _, Write as _};
41use std::path::{Path, PathBuf};
42
43use sha2::{Digest, Sha256};
44
45use crate::chunk::{CachedChunk, Chunk};
46use crate::compiler::CompilerOptions;
47use crate::module_artifact::ModuleArtifact;
48
/// A file's full text paired with the user-import paths scanned out of it.
type ImportScan = (String, Vec<String>);

/// Reference-counted [`ImportScan`], so memo hits can hand out the same scan
/// without copying it inside the map.
type SharedImportScan = std::sync::Arc<ImportScan>;

/// Memo key: the file path plus its stat identity `(len, mtime_ns)`.
type ImportsFileMemoKey = (PathBuf, u64, i128);

/// Mutex-guarded map from [`ImportsFileMemoKey`] to the scan recorded for it.
type ImportsFileMemo =
    std::sync::Mutex<std::collections::HashMap<ImportsFileMemoKey, SharedImportScan>>;
54
55/// Header magic for all bytecode-cache artifact families.
56pub const MAGIC: &[u8; 8] = b"HARNBC\0\0";
57
58/// On-disk format version. Bump when [`CachedChunk`] or the header
59/// layout changes in a backwards-incompatible way.
60/// v5: `ModuleArtifact` gained `public_type_names` (`pub type` exports).
61pub const SCHEMA_VERSION: u32 = 5;
62
63/// Compile-time Harn release. Cache files written by a different release
64/// are rejected on load.
65pub const HARN_VERSION: &str = env!("CARGO_PKG_VERSION");
66
67/// Build-time fingerprint of the compiler front-end — the lexer, parser, IR,
68/// and code generator — computed in `build.rs` from those crates' source and
69/// baked in via `cargo:rustc-env`. Folded into the cache key so a compiler
70/// change that alters emitted bytecode for unchanged source invalidates stale
71/// entries automatically, within a single version, with no manual cache wipe.
72/// `HARN_VERSION` only busts the cache across release bumps; this closes the
73/// same gap for the within-version compiler edits that masked #2610. See #2621.
74pub const CODEGEN_FINGERPRINT: &str = env!("HARN_CODEGEN_FINGERPRINT");
75
76/// Conventional extension for entry-chunk cache files.
77pub const CACHE_EXTENSION: &str = "harnbc";
78
79/// Conventional extension for module-artifact cache files. Distinct from
80/// [`CACHE_EXTENSION`] so the same `.harn` source can have both shipped
81/// adjacent if needed (e.g. when a file is both an executable entry and
82/// imported by other files).
83pub const MODULE_CACHE_EXTENSION: &str = "harnmod";
84
85/// On-disk discriminant for a [`Chunk`] payload.
86const KIND_ENTRY_CHUNK: u8 = 1;
87/// On-disk discriminant for a [`ModuleArtifact`] payload.
88const KIND_MODULE_ARTIFACT: u8 = 2;
89
90/// Environment override for the cache directory. When set, takes
91/// precedence over the XDG and home-directory fallbacks.
92pub const CACHE_DIR_ENV: &str = "HARN_CACHE_DIR";
93
94/// Environment override that turns the cache off entirely. Setting this
95/// to `0`, `false`, `no`, or `off` skips both reads and writes; useful
96/// when debugging compiler changes.
97pub const CACHE_ENABLED_ENV: &str = "HARN_BYTECODE_CACHE";
98
99/// Result of a cache lookup. Carries the precomputed key so the caller
100/// can write it back on a miss without rehashing.
101pub struct LookupOutcome {
102    pub key: CacheKey,
103    pub chunk: Option<Chunk>,
104}
105
/// Identity of one compiled pipeline source in the cache.
///
/// A cached artifact is reusable exactly when every field of its key equals
/// the corresponding field of the key computed for the current run.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CacheKey {
    /// SHA-256 of the entry source text.
    pub source_hash: [u8; 32],
    /// Digest covering the transitive user-import graph.
    pub import_graph_hash: [u8; 32],
    /// Harn release that produced (or is looking for) the artifact.
    pub harn_version: &'static str,
    /// Compact tag for active [`CompilerOptions`]. Without it, flipping
    /// `HARN_DISABLE_OPTIMIZATIONS` between runs would reuse a chunk that
    /// was compiled under the other setting.
    pub compiler_tag: u8,
}
118
119impl CacheKey {
120    /// Compute the cache key for a `.harn` source file plus its transitive
121    /// user imports. `read_source` is the entry-file contents; the import
122    /// graph is walked from disk relative to `source_path`.
123    pub fn from_source(source_path: &Path, source: &str) -> Self {
124        let source_hash = sha256(source.as_bytes());
125        let import_graph_hash = hash_transitive_user_imports(source_path, source);
126        Self {
127            source_hash,
128            import_graph_hash,
129            harn_version: HARN_VERSION,
130            compiler_tag: compiler_options_tag(CompilerOptions::from_env()),
131        }
132    }
133
134    /// Entry-chunk filename for this key. We hash by source content
135    /// alone so two invocations of the same source from different paths
136    /// share a cache entry; the header's import-graph hash still gates
137    /// reuse on a per-load basis.
138    pub fn filename(&self) -> String {
139        format!("{}.{}", hex(&self.source_hash), CACHE_EXTENSION)
140    }
141
142    /// Module-artifact filename for this key.
143    pub fn module_filename(&self) -> String {
144        format!("{}.{}", hex(&self.source_hash), MODULE_CACHE_EXTENSION)
145    }
146}
147
148/// Returns the directory the shared cache lives in. Honors
149/// `$HARN_CACHE_DIR`, then `$XDG_CACHE_HOME/harn/bytecode`, then
150/// `$HOME/.cache/harn/bytecode`. The directory is *not* created here —
151/// [`store`] creates it lazily on write so read-only environments don't
152/// pay an mkdir cost.
153pub fn cache_dir() -> PathBuf {
154    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
155        return PathBuf::from(custom);
156    }
157    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
158        let xdg = PathBuf::from(xdg);
159        if !xdg.as_os_str().is_empty() {
160            return xdg.join("harn").join("bytecode");
161        }
162    }
163    if let Some(home) = crate::user_dirs::home_dir() {
164        return home.join(".cache").join("harn").join("bytecode");
165    }
166    // Final fallback: a directory beside the binary's working dir. Mostly
167    // hit in tests that scrub HOME from the environment.
168    PathBuf::from(".harn-cache").join("bytecode")
169}
170
171/// Root for `.harnpack` archives unpacked by `harn run <bundle.harnpack>`.
172/// Each verified bundle is replayed into `<root>/<sanitized-bundle-hash>/`
173/// so re-runs reuse the unpacked tree. Honors `$HARN_CACHE_DIR/packs`
174/// when set, otherwise XDG / `$HOME/.cache/harn/packs`.
175pub fn packs_cache_dir() -> PathBuf {
176    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
177        return PathBuf::from(custom).join("packs");
178    }
179    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
180        let xdg = PathBuf::from(xdg);
181        if !xdg.as_os_str().is_empty() {
182            return xdg.join("harn").join("packs");
183        }
184    }
185    if let Some(home) = crate::user_dirs::home_dir() {
186        return home.join(".cache").join("harn").join("packs");
187    }
188    PathBuf::from(".harn-cache").join("packs")
189}
190
191/// True when the cache is enabled by the current environment.
192pub fn cache_enabled() -> bool {
193    match std::env::var(CACHE_ENABLED_ENV).ok().as_deref() {
194        Some(value) => !matches!(
195            value.to_ascii_lowercase().as_str(),
196            "0" | "false" | "no" | "off"
197        ),
198        None => true,
199    }
200}
201
202/// Try to load a cached chunk for `source_path` whose contents are
203/// `source`. Returns the key alongside the (optional) chunk so callers
204/// avoid recomputing the key on miss.
205pub fn load(source_path: &Path, source: &str) -> LookupOutcome {
206    let key = CacheKey::from_source(source_path, source);
207    if !cache_enabled() {
208        return LookupOutcome { key, chunk: None };
209    }
210    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
211    if let Some(adjacent) = adjacent_cache_path(source_path) {
212        candidates.push(adjacent);
213    }
214    candidates.push(cache_dir().join(key.filename()));
215    for path in candidates {
216        match read_chunk_if_matches(&path, &key) {
217            Ok(Some(chunk)) => {
218                return LookupOutcome {
219                    key,
220                    chunk: Some(chunk),
221                }
222            }
223            Ok(None) => continue,
224            Err(_) => continue,
225        }
226    }
227    LookupOutcome { key, chunk: None }
228}
229
230/// Persist `chunk` to the shared cache directory under `key`. Atomic: a
231/// temp file is written then renamed into place. Concurrent invocations
232/// on the same key race safely.
233pub fn store(key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
234    if !cache_enabled() {
235        return Ok(());
236    }
237    let dir = cache_dir();
238    fs::create_dir_all(&dir)?;
239    write_atomic_chunk(&dir.join(key.filename()), key, chunk)
240}
241
242/// Write a precompiled entry-chunk artifact to an explicit path, for
243/// use by the `harn precompile` subcommand. The header still records
244/// the key, so adjacent artifacts shipped with source are validated
245/// like any other cache hit.
246pub fn store_at(path: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
247    ensure_parent_dir(path)?;
248    write_atomic_chunk(path, key, chunk)
249}
250
251/// Look up the [`ModuleArtifact`] for `source_path` (whose contents are
252/// `source`). Mirrors [`load`] but for the `.harnmod` family.
253pub fn load_module(source_path: &Path, source: &str) -> ModuleLookupOutcome {
254    let key = CacheKey::from_source(source_path, source);
255    if !cache_enabled() {
256        return ModuleLookupOutcome {
257            key,
258            artifact: None,
259        };
260    }
261    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
262    if let Some(adjacent) = adjacent_module_cache_path(source_path) {
263        candidates.push(adjacent);
264    }
265    candidates.push(cache_dir().join(key.module_filename()));
266    for path in candidates {
267        match read_module_if_matches(&path, &key) {
268            Ok(Some(artifact)) => {
269                return ModuleLookupOutcome {
270                    key,
271                    artifact: Some(artifact),
272                }
273            }
274            Ok(None) => continue,
275            Err(_) => continue,
276        }
277    }
278    ModuleLookupOutcome {
279        key,
280        artifact: None,
281    }
282}
283
284/// Persist `artifact` to the shared cache under `key`. Atomic;
285/// concurrent invocations race safely.
286pub fn store_module(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
287    if !cache_enabled() {
288        return Ok(());
289    }
290    let dir = cache_dir();
291    fs::create_dir_all(&dir)?;
292    write_atomic_module(&dir.join(key.module_filename()), key, artifact)
293}
294
295/// Write a module artifact to an explicit path.
296pub fn store_module_at(path: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
297    ensure_parent_dir(path)?;
298    write_atomic_module(path, key, artifact)
299}
300
301/// Result of a [`load_module`] lookup. Carries the precomputed key so
302/// the caller can write it back on a miss without rehashing.
303pub struct ModuleLookupOutcome {
304    pub key: CacheKey,
305    pub artifact: Option<ModuleArtifact>,
306}
307
308/// Path to the adjacent precompiled entry-chunk artifact for
309/// `source_path`. `foo.harn` → `foo.harnbc`.
310pub fn adjacent_cache_path(source_path: &Path) -> Option<PathBuf> {
311    adjacent_path_with_extension(source_path, CACHE_EXTENSION)
312}
313
314/// Path to the adjacent precompiled module-artifact for `source_path`.
315/// `foo.harn` → `foo.harnmod`.
316pub fn adjacent_module_cache_path(source_path: &Path) -> Option<PathBuf> {
317    adjacent_path_with_extension(source_path, MODULE_CACHE_EXTENSION)
318}
319
/// Sibling of `source_path` with its final extension replaced by `ext`
/// (`dir/foo.harn` → `dir/foo.<ext>`).
///
/// Only the *last* extension is replaced, so `foo.bar.harn` maps to
/// `foo.bar.<ext>`. The earlier implementation rebuilt the path from the
/// file stem and then called `set_extension`, which also stripped `.bar`
/// and made `foo.bar.harn` collide with `foo.harn`.
///
/// Returns `None` when the path has no file stem (for example `..` or an
/// empty path).
fn adjacent_path_with_extension(source_path: &Path, ext: &str) -> Option<PathBuf> {
    let stem = source_path.file_stem()?;
    if stem.is_empty() {
        return None;
    }
    // `with_extension` swaps only the trailing extension and keeps the
    // parent directory and any earlier dotted segments intact.
    Some(source_path.with_extension(ext))
}
330
/// Create the directory that will contain `path`, including any missing
/// ancestors.
///
/// A path with no parent component, or with an empty one (a bare file name),
/// needs nothing created and succeeds immediately.
///
/// # Errors
/// Propagates any failure from `fs::create_dir_all`.
fn ensure_parent_dir(path: &Path) -> io::Result<()> {
    match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => fs::create_dir_all(dir),
        _ => Ok(()),
    }
}
339
340fn write_atomic_chunk(target: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
341    let buf = serialize_chunk_artifact(key, chunk)?;
342    write_atomic(target, &buf)
343}
344
345fn write_atomic_module(target: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
346    let buf = serialize_module_artifact(key, artifact)?;
347    write_atomic(target, &buf)
348}
349
350/// Serialize an entry-chunk artifact (header + payload) to bytes. The
351/// resulting buffer is byte-identical to the file [`store_at`] would
352/// have written for the same `(key, chunk)`. Use this when packaging
353/// artifacts into a container (e.g. `harn pack`) without going through
354/// the filesystem.
355pub fn serialize_chunk_artifact(key: &CacheKey, chunk: &Chunk) -> io::Result<Vec<u8>> {
356    let cached = chunk.freeze_for_cache();
357    let payload = bincode::serde::encode_to_vec(&cached, bincode::config::standard())
358        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
359    Ok(encode_artifact(key, KIND_ENTRY_CHUNK, &payload))
360}
361
362/// Serialize a module artifact (header + payload) to bytes. Companion
363/// to [`serialize_chunk_artifact`] for the `.harnmod` family.
364pub fn serialize_module_artifact(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<Vec<u8>> {
365    let payload = bincode::serde::encode_to_vec(artifact, bincode::config::standard())
366        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
367    Ok(encode_artifact(key, KIND_MODULE_ARTIFACT, &payload))
368}
369
370fn encode_artifact(key: &CacheKey, kind: u8, payload: &[u8]) -> Vec<u8> {
371    let mut buf: Vec<u8> = Vec::with_capacity(payload.len() + 128);
372    buf.extend_from_slice(MAGIC);
373    buf.extend_from_slice(&SCHEMA_VERSION.to_le_bytes());
374    let version_bytes = HARN_VERSION.as_bytes();
375    buf.extend_from_slice(&(version_bytes.len() as u32).to_le_bytes());
376    buf.extend_from_slice(version_bytes);
377    buf.push(key.compiler_tag);
378    buf.push(kind);
379    buf.extend_from_slice(&key.source_hash);
380    buf.extend_from_slice(&key.import_graph_hash);
381    buf.extend_from_slice(payload);
382    buf
383}
384
385fn write_atomic(target: &Path, buf: &[u8]) -> io::Result<()> {
386    let tmp_path = atomic_tmp_path(target);
387    let mut tmp_file = fs::File::create(&tmp_path)?;
388    tmp_file.write_all(buf)?;
389    tmp_file.sync_all()?;
390    drop(tmp_file);
391    match fs::rename(&tmp_path, target) {
392        Ok(()) => Ok(()),
393        Err(err) => {
394            let _ = fs::remove_file(&tmp_path);
395            Err(err)
396        }
397    }
398}
399
/// Name a temp file in the same directory as `target`.
///
/// The name is `.<target-file-name>.<pid>.<counter>.tmp`, or
/// `.harn-cache.<pid>.<counter>.tmp` when `target` has no file name. The
/// counter is a process-wide atomic that increases on every call, so
/// repeated calls in one process yield distinct names.
fn atomic_tmp_path(target: &Path) -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};

    static NEXT_TMP_ID: AtomicU64 = AtomicU64::new(0);

    let seq = NEXT_TMP_ID.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let tmp_name = target.file_name().map_or_else(
        || format!(".harn-cache.{pid}.{seq}.tmp"),
        |name| format!(".{}.{pid}.{seq}.tmp", name.to_string_lossy()),
    );
    target.with_file_name(tmp_name)
}
414
/// What remains of an artifact once its header has been validated against a
/// key. Shared by the chunk and module loaders so the header checks live in
/// one place.
struct ParsedHeader {
    /// Payload discriminant byte read from the header.
    kind: u8,
    /// Every byte that follows the header (the bincode payload).
    payload: Vec<u8>,
}
421
422fn read_header_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ParsedHeader>> {
423    let mut file = match fs::File::open(path) {
424        Ok(f) => f,
425        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
426        Err(err) => return Err(err),
427    };
428    let mut header = [0u8; 8 + 4 + 4];
429    if file.read_exact(&mut header).is_err() {
430        return Ok(None);
431    }
432    if &header[..8] != MAGIC {
433        return Ok(None);
434    }
435    let schema = u32::from_le_bytes(header[8..12].try_into().unwrap());
436    if schema != SCHEMA_VERSION {
437        return Ok(None);
438    }
439    let version_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
440    if version_len > 256 {
441        // Bound the alloc so a corrupted file cannot force an unbounded read.
442        return Ok(None);
443    }
444    let mut version_buf = vec![0u8; version_len];
445    if file.read_exact(&mut version_buf).is_err() {
446        return Ok(None);
447    }
448    if version_buf != key.harn_version.as_bytes() {
449        return Ok(None);
450    }
451    let mut compiler_and_kind = [0u8; 2];
452    if file.read_exact(&mut compiler_and_kind).is_err() {
453        return Ok(None);
454    }
455    if compiler_and_kind[0] != key.compiler_tag {
456        return Ok(None);
457    }
458    let kind = compiler_and_kind[1];
459    let mut hashes = [0u8; 64];
460    if file.read_exact(&mut hashes).is_err() {
461        return Ok(None);
462    }
463    if hashes[..32] != key.source_hash || hashes[32..] != key.import_graph_hash {
464        return Ok(None);
465    }
466    let mut payload = Vec::new();
467    if file.read_to_end(&mut payload).is_err() {
468        return Ok(None);
469    }
470    Ok(Some(ParsedHeader { kind, payload }))
471}
472
473fn read_chunk_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<Chunk>> {
474    let Some(header) = read_header_if_matches(path, key)? else {
475        return Ok(None);
476    };
477    if header.kind != KIND_ENTRY_CHUNK {
478        return Ok(None);
479    }
480    let cached: CachedChunk =
481        match bincode::serde::decode_from_slice(&header.payload, bincode::config::standard()) {
482            Ok((c, _)) => c,
483            Err(_) => return Ok(None),
484        };
485    Ok(Some(Chunk::from_cached(&cached)))
486}
487
488fn read_module_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ModuleArtifact>> {
489    let Some(header) = read_header_if_matches(path, key)? else {
490        return Ok(None);
491    };
492    if header.kind != KIND_MODULE_ARTIFACT {
493        return Ok(None);
494    }
495    match bincode::serde::decode_from_slice::<ModuleArtifact, _>(
496        &header.payload,
497        bincode::config::standard(),
498    ) {
499        Ok((artifact, _)) => Ok(Some(artifact)),
500        Err(_) => Ok(None),
501    }
502}
503
504/// Compact representation of [`CompilerOptions`] for the cache header.
505/// Independent flags get distinct bits so adding a new flag never
506/// silently changes existing keys when an old binary reads a new
507/// artifact — the header check will fail-closed before we get there
508/// anyway, but mapping to bits also keeps the tag a stable function
509/// of the option set.
510fn compiler_options_tag(options: CompilerOptions) -> u8 {
511    let mut tag: u8 = 0;
512    if options.optimizations_enabled() {
513        tag |= 0b0000_0001;
514    }
515    tag
516}
517
518fn sha256(bytes: &[u8]) -> [u8; 32] {
519    let mut hasher = Sha256::new();
520    hasher.update(bytes);
521    hasher.finalize().into()
522}
523
/// Render `bytes` as lowercase hexadecimal, two digits per byte.
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut text = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        text.push(char::from(DIGITS[usize::from(byte >> 4)]));
        text.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    text
}
531
532/// Lightweight regex-free scan that surfaces user imports without paying
533/// a full lex+parse. False positives only increase cache churn, never
534/// correctness; comments and string literals are skipped so neither a
535/// commented-out import nor a `"import …"` value appearing inside an
536/// unrelated string gates the hash.
537fn collect_user_imports(source: &str) -> Vec<String> {
538    let scrubbed = strip_comments(source);
539    let mut out: Vec<String> = Vec::new();
540    let bytes = scrubbed.as_bytes();
541    let mut i = 0;
542    while i < bytes.len() {
543        if bytes[i] == b'"' {
544            // Skip past any string literal so identifiers inside string
545            // values cannot trigger the keyword match below.
546            match read_string_literal(bytes, i) {
547                Some((_, end)) => {
548                    i = end;
549                    continue;
550                }
551                None => {
552                    i += 1;
553                    continue;
554                }
555            }
556        }
557        if !matches_keyword(bytes, i, b"import") {
558            i += 1;
559            continue;
560        }
561        // Skip past `import` and any selective `{ ... } from` clause; we
562        // only need the source-position of the path string literal.
563        let mut j = i + b"import".len();
564        let mut depth = 0i32;
565        while j < bytes.len() {
566            match bytes[j] {
567                b'"' => {
568                    if let Some((path, end)) = read_string_literal(bytes, j) {
569                        if !path.starts_with("std/") {
570                            out.push(path);
571                        }
572                        i = end;
573                        break;
574                    }
575                    j += 1;
576                }
577                b'{' => {
578                    depth += 1;
579                    j += 1;
580                }
581                b'}' => {
582                    depth -= 1;
583                    j += 1;
584                }
585                b'\n' if depth == 0 => {
586                    // No string literal on this logical line; bail and
587                    // continue scanning after the keyword to avoid an
588                    // infinite loop.
589                    i = j;
590                    break;
591                }
592                _ => j += 1,
593            }
594        }
595        if j >= bytes.len() {
596            break;
597        }
598        if i < j {
599            // Defensive: ensure forward progress when the inner loop
600            // exited without setting `i`.
601            i = j;
602        }
603    }
604    out
605}
606
607fn matches_keyword(bytes: &[u8], at: usize, keyword: &[u8]) -> bool {
608    let end = at + keyword.len();
609    if end > bytes.len() {
610        return false;
611    }
612    if &bytes[at..end] != keyword {
613        return false;
614    }
615    if at > 0 && is_ident_char(bytes[at - 1]) {
616        return false;
617    }
618    if end < bytes.len() && is_ident_char(bytes[end]) {
619        return false;
620    }
621    true
622}
623
/// True for the bytes treated as identifier characters: ASCII letters,
/// ASCII digits, and `_`.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
627
/// Decode the double-quoted string literal that starts at `bytes[at]`.
///
/// Returns the literal's contents and the index just past the closing
/// quote. Returns `None` when the literal is unterminated: a raw newline or
/// the end of input arrives before the closing quote, or a backslash is the
/// last byte.
///
/// The escapes `\n`, `\r`, and `\t` are translated; any other escaped byte
/// (including `\"` and `\\`) stands for itself.
///
/// Contents are accumulated as raw bytes and decoded as UTF-8 once at the
/// end. Casting each byte to `char` individually would reinterpret it as
/// Latin-1 and mangle non-ASCII text (`"módulo"` would come back as
/// `"mÃ³dulo"`), so an import path containing such characters would no
/// longer name the real file.
///
/// `bytes[at]` must be `"`; this is checked in debug builds only.
fn read_string_literal(bytes: &[u8], at: usize) -> Option<(String, usize)> {
    debug_assert_eq!(bytes[at], b'"');
    let mut raw: Vec<u8> = Vec::new();
    let mut i = at + 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => {
                // Lossy decoding only matters for input that was not valid
                // UTF-8 to begin with; valid input round-trips exactly.
                let text = String::from_utf8_lossy(&raw).into_owned();
                return Some((text, i + 1));
            }
            b'\\' => {
                // A trailing backslash leaves the literal unterminated.
                let escaped = *bytes.get(i + 1)?;
                raw.push(match escaped {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    other => other,
                });
                i += 2;
            }
            b'\n' => return None,
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    None
}
658
659fn strip_comments(source: &str) -> String {
660    let bytes = source.as_bytes();
661    let mut out = String::with_capacity(source.len());
662    let mut i = 0;
663    while i < bytes.len() {
664        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
665            while i < bytes.len() && bytes[i] != b'\n' {
666                i += 1;
667            }
668            continue;
669        }
670        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
671            i += 2;
672            while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
673                i += 1;
674            }
675            i = (i + 2).min(bytes.len());
676            continue;
677        }
678        if bytes[i] == b'"' {
679            if let Some((_, end)) = read_string_literal(bytes, i) {
680                out.push_str(&source[i..end]);
681                i = end;
682                continue;
683            }
684        }
685        out.push(bytes[i] as char);
686        i += 1;
687    }
688    out
689}
690
691/// Stable digest over every embedded stdlib source. Folded into the
692/// user-file cache key so that bumping a stdlib module (changing its
693/// embedded `.harn` content) invalidates cached user bytecode that may
694/// reference stale function-pool layouts from a prior stdlib snapshot.
695/// `HARN_VERSION` already busts the cache across release bumps; this
696/// closes the same gap for within-version stdlib edits (a frequent
697/// pattern during local development).
698///
699/// Cached in a `OnceLock` because `STDLIB_SOURCES` is a static `const`
700/// slice — the digest is identical for the lifetime of the process.
701fn embedded_stdlib_digest() -> &'static [u8; 32] {
702    use std::sync::OnceLock;
703    static DIGEST: OnceLock<[u8; 32]> = OnceLock::new();
704    DIGEST.get_or_init(|| {
705        let mut entries: Vec<(&'static str, &'static str)> = harn_stdlib::STDLIB_SOURCES
706            .iter()
707            .map(|src| (src.module, src.source))
708            .collect();
709        entries.sort_by(|a, b| a.0.cmp(b.0));
710        let mut hasher = Sha256::new();
711        for (module, source) in entries {
712            hasher.update(module.as_bytes());
713            hasher.update(b"\0");
714            hasher.update(source.as_bytes());
715            hasher.update(b"\0");
716        }
717        hasher.finalize().into()
718    })
719}
720
721/// Walk the user-import graph rooted at `source_path` and produce a
722/// stable hash of every transitively-reachable file. The hash is
723/// order-independent: each visited file is keyed by canonical path and
724/// emitted in sorted order, so reordering imports inside a file does
725/// not invalidate the cache while changing any file's content does.
726///
727/// Embedded stdlib content is folded into the hash too — `collect_user_imports`
728/// deliberately skips `std/*` paths (they resolve to in-binary sources, not
729/// disk files), so without this fold a stdlib edit between development
730/// builds would leave user-file caches pinned to a stale stdlib snapshot.
731fn hash_transitive_user_imports(source_path: &Path, source: &str) -> [u8; 32] {
732    hash_transitive_user_imports_fingerprinted(source_path, source, CODEGEN_FINGERPRINT)
733}
734
735/// Process-wide memo of `(file content, collect_user_imports(content))` keyed by
736/// the resolved file path plus its stat identity `(len, mtime_ns)`. Walking a
737/// large pipeline's import graph re-encounters the same shared library files for
738/// nearly every module, so without this memo `from_source` re-reads and
739/// re-scans those files hundreds of times in a single cold run. Because the key
740/// includes `(len, mtime_ns)`, any on-disk edit produces a fresh key and the
741/// stale entry is never reused — a warm long-lived process recompiles edited
742/// pipelines correctly. The returned bytes are identical to the un-memoized
743/// path, so cache keys are byte-for-byte unchanged.
744fn imports_file_memo() -> &'static ImportsFileMemo {
745    use std::sync::OnceLock;
746    static MEMO: OnceLock<ImportsFileMemo> = OnceLock::new();
747    MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()))
748}
749
/// `Path::canonicalize` with a process-wide memo of successful results.
///
/// The import-graph walk canonicalizes the same resolved module paths over
/// and over, and each call costs a syscall. A successful result is treated
/// as stable for the life of the process (the pipeline tree is not moved
/// mid-run) and is remembered.
///
/// A *failed* canonicalization returns `path` unchanged and is deliberately
/// not remembered: a file or symlink that appears later must resolve
/// freshly, so the result matches what a cold process would produce.
///
/// # Panics
/// Panics if the memo's mutex has been poisoned.
fn canonicalize_cached(path: &Path) -> PathBuf {
    use std::collections::HashMap;
    use std::sync::{Mutex, OnceLock};

    static MEMO: OnceLock<Mutex<HashMap<PathBuf, PathBuf>>> = OnceLock::new();
    let memo = MEMO.get_or_init(Default::default);

    // The lock is released at the end of this statement, before any syscall.
    let remembered = memo.lock().unwrap().get(path).cloned();
    if let Some(hit) = remembered {
        return hit;
    }

    let Ok(canonical) = path.canonicalize() else {
        // Unresolved path: hand back the input and leave the memo untouched.
        return path.to_path_buf();
    };
    memo.lock()
        .unwrap()
        .insert(path.to_path_buf(), canonical.clone());
    canonical
}
779
/// Stat `path` and return `(length in bytes, mtime in nanoseconds since the
/// Unix epoch)`.
///
/// Returns `None` when the metadata cannot be read. When the modification
/// time is unavailable, or lies before the epoch, the mtime component is
/// recorded as `0`.
fn file_stat_identity(path: &Path) -> Option<(u64, i128)> {
    let meta = fs::metadata(path).ok()?;
    let mtime_ns = match meta.modified() {
        Ok(stamp) => stamp
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_nanos() as i128),
        Err(_) => 0,
    };
    Some((meta.len(), mtime_ns))
}
793
794/// Read `path` and scan its user imports, memoized by stat identity. On an I/O
795/// error, returns the `ErrorKind` string the un-memoized path folded in (errors
796/// are not memoized — a transient failure should not be sticky).
797fn read_and_scan_imports_cached(path: &Path) -> Result<(String, Vec<String>), String> {
798    if let Some((len, mtime_ns)) = file_stat_identity(path) {
799        let key = (path.to_path_buf(), len, mtime_ns);
800        if let Some(hit) = imports_file_memo().lock().unwrap().get(&key).cloned() {
801            return Ok((hit.0.clone(), hit.1.clone()));
802        }
803        match fs::read_to_string(path) {
804            Ok(content) => {
805                let nested = collect_user_imports(&content);
806                let entry = std::sync::Arc::new((content.clone(), nested.clone()));
807                imports_file_memo().lock().unwrap().insert(key, entry);
808                Ok((content, nested))
809            }
810            Err(err) => Err(err.kind().to_string()),
811        }
812    } else {
813        // No stat (file vanished between resolve and read): fall back to a direct
814        // read so behavior matches the un-memoized path exactly.
815        match fs::read_to_string(path) {
816            Ok(content) => {
817                let nested = collect_user_imports(&content);
818                Ok((content, nested))
819            }
820            Err(err) => Err(err.kind().to_string()),
821        }
822    }
823}
824
825/// Inner form of [`hash_transitive_user_imports`] parameterized on the compiler
826/// fingerprint so tests can vary it; production always passes
827/// [`CODEGEN_FINGERPRINT`].
828fn hash_transitive_user_imports_fingerprinted(
829    source_path: &Path,
830    source: &str,
831    codegen_fingerprint: &str,
832) -> [u8; 32] {
833    let mut visited: std::collections::BTreeMap<PathBuf, ImportNode> =
834        std::collections::BTreeMap::new();
835    let mut frontier: Vec<(PathBuf, String)> = collect_user_imports(source)
836        .into_iter()
837        .map(|import| (source_path.to_path_buf(), import))
838        .collect();
839
840    while let Some((anchor, import)) = frontier.pop() {
841        let Some(resolved) = harn_modules::resolve_import_path(&anchor, &import) else {
842            // Unresolved imports get a sentinel keyed by their resolution
843            // anchor so that dropping a real file under that anchor later
844            // produces a different key.
845            let sentinel = anchor.join(format!("__unresolved__/{import}"));
846            visited
847                .entry(sentinel)
848                .or_insert(ImportNode::Unresolved { import });
849            continue;
850        };
851        let canonical = canonicalize_cached(&resolved);
852        if visited.contains_key(&canonical) {
853            continue;
854        }
855        // Per-file read + import-scan is memoized process-wide, keyed by the
856        // file's identity stat `(len, mtime)`. The same handful of core library
857        // modules (`lib/host/*`, `lib/runtime/*`, ...) sit on the import graph of
858        // nearly every module, so a cold `from_source` over a large pipeline used
859        // to re-read and re-scan the same files hundreds of times across the
860        // module-load fan-out. The memo is invalidated automatically the moment a
861        // file's stat changes on disk, so a warm long-lived process still recompiles
862        // edited pipelines correctly. The folded hash bytes are byte-identical to
863        // the un-memoized path (same content + same `collect_user_imports` output),
864        // so cache keys are unchanged. See `imports_file_memo`.
865        match read_and_scan_imports_cached(&resolved) {
866            Ok((content, nested)) => {
867                visited.insert(canonical.clone(), ImportNode::Resolved { content });
868                for nested_import in nested {
869                    frontier.push((resolved.clone(), nested_import));
870                }
871            }
872            Err(kind) => {
873                visited.insert(canonical, ImportNode::IoError { kind });
874            }
875        }
876    }
877
878    let mut hasher = Sha256::new();
879    hasher.update(b"stdlib-digest\0");
880    hasher.update(embedded_stdlib_digest());
881    hasher.update(b"\0");
882    // Fold in the compiler's code-generation fingerprint so a compiler change
883    // that alters emitted bytecode for unchanged source busts stale cache
884    // entries within a single version — the gap that masked the #2610 fix until
885    // the cache was cleared by hand. See `build.rs` and `CODEGEN_FINGERPRINT`.
886    hasher.update(b"codegen-fingerprint\0");
887    hasher.update(codegen_fingerprint.as_bytes());
888    hasher.update(b"\0");
889    for (path, node) in &visited {
890        hasher.update(path.to_string_lossy().as_bytes());
891        hasher.update(b"\0");
892        match node {
893            ImportNode::Resolved { content } => {
894                hasher.update(b"resolved\0");
895                hasher.update(content.as_bytes());
896            }
897            ImportNode::Unresolved { import } => {
898                hasher.update(b"unresolved\0");
899                hasher.update(import.as_bytes());
900            }
901            ImportNode::IoError { kind } => {
902                hasher.update(b"ioerror\0");
903                hasher.update(kind.as_bytes());
904            }
905        }
906        hasher.update(b"\0");
907    }
908    hasher.finalize().into()
909}
910
/// One entry in the visited import graph; every variant is folded into the
/// import-graph hash under its own tag.
enum ImportNode {
    /// The import resolved to a file that was read successfully.
    Resolved {
        /// Full text of the imported file.
        content: String,
    },
    /// The import specifier could not be resolved to a path.
    Unresolved {
        /// The import specifier exactly as written in the importing source.
        import: String,
    },
    /// The import resolved to a path, but reading it failed.
    IoError {
        /// The I/O `ErrorKind` rendered as a string.
        kind: String,
    },
}
916
#[cfg(test)]
mod tests {
    //! Unit tests for the on-disk artifact header round trip, cache-key
    //! invalidation, import scanning, and the import-graph hash (including
    //! its behavior with the in-process memo warm).

    use super::*;
    use crate::compile_source;

    #[test]
    fn header_round_trips_chunk() {
        let chunk = compile_source("__io_println(\"hello\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/example.harn"), "__io_println(\"hello\")");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("entry.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let loaded = read_chunk_if_matches(&path, &key).unwrap();
        assert!(loaded.is_some(), "expected cached chunk to load");
    }

    #[test]
    fn serialize_chunk_artifact_matches_store_at() {
        // `serialize_chunk_artifact` packages an artifact into a buffer for
        // in-memory consumers (e.g. `harn pack` writing into a tar.zst
        // bundle). The contract is: the resulting bytes match what
        // `store_at` would have written for the same key+chunk, so the
        // shipped artifact is byte-identical to the on-disk cache form.
        let chunk = compile_source("__io_println(\"hi\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/pack.harn"), "__io_println(\"hi\")");
        let tmp = tempfile::tempdir().unwrap();
        let on_disk = tmp.path().join("pack.harnbc");
        store_at(&on_disk, &key, &chunk).expect("write");
        let on_disk_bytes = std::fs::read(&on_disk).unwrap();
        let in_memory_bytes = serialize_chunk_artifact(&key, &chunk).expect("serialize");
        assert_eq!(in_memory_bytes, on_disk_bytes);
    }

    #[test]
    fn atomic_temp_paths_are_unique_within_process() {
        let target = Path::new("entry.harnbc");
        let first = atomic_tmp_path(target);
        let second = atomic_tmp_path(target);
        assert_ne!(
            first, second,
            "same-process concurrent cache writes must not share a temp file"
        );
    }

    #[test]
    fn header_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/a.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("a.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        // Same key except for the source hash.
        let other = CacheKey {
            source_hash: [0xAB; 32],
            import_graph_hash: key.import_graph_hash,
            harn_version: HARN_VERSION,
            compiler_tag: key.compiler_tag,
        };
        assert!(read_chunk_if_matches(&path, &other).unwrap().is_none());
    }

    #[test]
    fn compiler_tag_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/b.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("b.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            compiler_tag: key.compiler_tag ^ 0xFF,
            ..key
        };
        assert!(
            read_chunk_if_matches(&path, &other).unwrap().is_none(),
            "flipped HARN_DISABLE_OPTIMIZATIONS must not reuse a chunk \
             compiled under the opposite setting"
        );
    }

    #[test]
    fn codegen_fingerprint_is_populated() {
        // In-workspace builds always hash real compiler sources, so the
        // fingerprint must be a non-empty digest; an empty value would silently
        // disable the within-version compiler-staleness guard.
        assert!(!CODEGEN_FINGERPRINT.is_empty());
    }

    #[test]
    fn codegen_fingerprint_changes_cache_key() {
        // A compiler whose code-generation source differs must produce a
        // different cache key for the *same* user source, so a stale artifact
        // compiled by a prior compiler at the same version misses on load
        // rather than being replayed (#2621). The fingerprint is a compile-time
        // constant, so exercise the parameterized inner hash directly.
        let tmp = tempfile::tempdir().unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "__io_println(\"hi\")\n").unwrap();
        let source = std::fs::read_to_string(&entry).unwrap();
        let a = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        let b = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-B");
        let a_again = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        assert_ne!(
            a, b,
            "differing compiler fingerprints must change the cache key"
        );
        assert_eq!(
            a, a_again,
            "an unchanged compiler fingerprint must be stable"
        );
    }

    #[test]
    fn collect_user_imports_ignores_stdlib_and_comments() {
        let source = r#"
            // import "comment/should/be/ignored"
            import "std/agents"
            import { foo } from "pkg/bar"
            import "./relative/path"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(
            imports,
            vec!["pkg/bar".to_string(), "./relative/path".to_string()]
        );
    }

    #[test]
    fn cache_enabled_respects_env() {
        /// Puts the variable back to its pre-test state when dropped, so the
        /// process environment is restored even if an assertion below panics
        /// and a value supplied by the invoking environment is not clobbered.
        struct RestoreEnv(Option<std::ffi::OsString>);
        impl Drop for RestoreEnv {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(previous) => std::env::set_var(CACHE_ENABLED_ENV, previous),
                    None => std::env::remove_var(CACHE_ENABLED_ENV),
                }
            }
        }
        let _restore = RestoreEnv(std::env::var_os(CACHE_ENABLED_ENV));

        std::env::set_var(CACHE_ENABLED_ENV, "0");
        assert!(!cache_enabled());
        std::env::set_var(CACHE_ENABLED_ENV, "1");
        assert!(cache_enabled());
        // Unset means enabled.
        std::env::remove_var(CACHE_ENABLED_ENV);
        assert!(cache_enabled());
    }

    #[test]
    fn import_path_inside_string_literal_is_ignored() {
        let source = r#"
            let payload = "import { foo } from \"./other\""
            import "./real"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(imports, vec!["./real".to_string()]);
    }

    #[test]
    fn import_hash_is_stable_across_import_order() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("a.harn"),
            "pub fn a() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("b.harn"),
            "pub fn b() -> int { return 2 }\n",
        )
        .unwrap();
        let ab = tmp.path().join("entry_ab.harn");
        std::fs::write(
            &ab,
            "import \"./a\"\nimport \"./b\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let ba = tmp.path().join("entry_ba.harn");
        std::fs::write(
            &ba,
            "import \"./b\"\nimport \"./a\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let hash_ab = hash_transitive_user_imports(&ab, &std::fs::read_to_string(&ab).unwrap());
        let hash_ba = hash_transitive_user_imports(&ba, &std::fs::read_to_string(&ba).unwrap());
        assert_eq!(
            hash_ab, hash_ba,
            "import-graph hash must be order-independent so reordering imports \
             does not bust the cache"
        );
    }

    #[test]
    fn import_hash_picks_up_nested_imports() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("mid.harn"),
            "import \"./leaf\"\npub fn y() -> int { return 2 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./mid\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        // Edit the file two hops away from the entry point.
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 999 }\n",
        )
        .unwrap();
        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "editing a transitively-imported file must change the import-graph hash"
        );
    }

    #[test]
    fn import_hash_busts_on_same_length_edit_in_same_process() {
        // The per-file read/scan memo is keyed by `(path, len, mtime_ns)`. The
        // hardest case for that key is an edit that preserves byte length: only
        // the mtime distinguishes the two versions. Guard that a same-length edit
        // to a transitively-imported file, recomputed in the SAME process so the
        // memo is warm, still busts the import-graph hash. Without a working
        // staleness check a warm long-lived process would replay stale bytecode.
        let tmp = tempfile::tempdir().unwrap();
        let leaf = tmp.path().join("leaf.harn");
        std::fs::write(&leaf, "pub fn x() -> int { return 111 }\n").unwrap();
        let original_len = std::fs::metadata(&leaf).unwrap().len();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./leaf\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());

        // Same byte length (`111` -> `222`), so the memo must rely on mtime.
        // Rather than sleeping out the coarsest plausible mtime granularity,
        // bump the rewritten file's mtime forward from its own current value by
        // a fixed margin so the `(path, len, mtime_ns)` stat key changes
        // deterministically.
        std::fs::write(&leaf, "pub fn x() -> int { return 222 }\n").unwrap();
        let future = std::fs::metadata(&leaf).unwrap().modified().unwrap()
            + std::time::Duration::from_secs(10);
        std::fs::OpenOptions::new()
            .write(true)
            .open(&leaf)
            .unwrap()
            .set_times(std::fs::FileTimes::new().set_modified(future))
            .unwrap();
        // Compare against the measured length of the first version instead of
        // a hard-coded byte count, so the precondition really checks "same
        // length" and survives edits to the fixture text.
        assert_eq!(
            std::fs::metadata(&leaf).unwrap().len(),
            original_len,
            "the two leaf versions must be the same byte length for this test to \
             exercise the mtime path"
        );

        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "a same-length edit to a transitively-imported file must still change \
             the import-graph hash when recomputed in a warm process"
        );
    }

    #[test]
    fn import_hash_stable_across_repeated_calls_same_process() {
        // The memo must be a pure speed optimization: repeated `from_source`
        // calls over an unchanged tree (the cold-start module-load fan-out
        // pattern) must return byte-identical hashes.
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("dep.harn"),
            "pub fn d() -> int { return 7 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./dep\"\n__io_println(\"hi\")\n").unwrap();
        let src = std::fs::read_to_string(&entry).unwrap();
        let first = hash_transitive_user_imports(&entry, &src);
        for _ in 0..50 {
            assert_eq!(
                hash_transitive_user_imports(&entry, &src),
                first,
                "repeated import-graph hashing over an unchanged tree must be stable"
            );
        }
    }
}