Skip to main content

harn_vm/
bytecode_cache.rs

1//! Content-addressed on-disk cache for compiled `.harn` pipelines.
2//!
3//! Cold-start `harn run` re-parses, type-checks, and compiles the entry
4//! pipeline before the VM gets a single instruction to execute. For short
5//! Harn subcommands that wrap a few `llm_call`s in a small pipeline, that
6//! compile cost dominates wall-clock time.
7//!
//! This module persists [`Chunk`] bytecode under
//! `$HARN_CACHE_DIR/<source-hash>.harnbc` (XDG-aware). The cache key is
//! derived from the entry source plus the content hash of every
//! transitively-imported user file. Stdlib imports are not walked on disk;
//! instead a digest of the embedded stdlib sources and the compiler's
//! codegen fingerprint are folded into the import-graph hash, and the
//! `harn_version` field in the header covers release bumps. Any change to
//! any input flips the key and the next run recompiles.
14//!
15//! File layout — little-endian throughout:
16//!
17//! ```text
18//! magic        : [u8; 8]   = "HARNBC\0\0"
19//! schema_ver   : u32       = SCHEMA_VERSION
20//! version_len  : u32
21//! harn_version : [u8; version_len]
22//! compiler_tag : u8        bitmask of active CompilerOptions
23//! kind         : u8        1 = entry chunk, 2 = module artifact
24//! source_hash  : [u8; 32]
25//! import_hash  : [u8; 32]
26//! payload      : bincode-serialized payload for `kind`
27//! ```
28//!
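//! The header lets a stale binary detect a future-version artifact
//! without crashing: a magic mismatch, schema mismatch, or version
//! mismatch is returned as `Ok(None)` so the caller transparently
//! recompiles. Truncated or undecodable files are treated the same way.
//! Only a failure to open the file (other than "not found") is reported as
//! an error by the low-level readers, and [`load`] / [`load_module`] treat
//! even that as a cache miss.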
33//!
34//! Concurrency: writes are atomic (write-tmp-then-rename), and parallel
35//! invocations on a cache miss race safely — the last writer wins, but
36//! every reader observes a consistent file because the rename is atomic
37//! on every supported filesystem.
38
39use std::fs;
40use std::io::{self, Read as _, Write as _};
41use std::path::{Path, PathBuf};
42
43use sha2::{Digest, Sha256};
44
45use crate::chunk::{CachedChunk, Chunk};
46use crate::compiler::CompilerOptions;
47use crate::module_artifact::ModuleArtifact;
48
/// Header magic for all bytecode-cache artifact families. Written as the
/// first eight bytes of every artifact and checked first on load.
pub const MAGIC: &[u8; 8] = b"HARNBC\0\0";

/// On-disk format version. Bump when [`CachedChunk`] or the header
/// layout changes in a backwards-incompatible way. Artifacts carrying a
/// different value are treated as cache misses.
pub const SCHEMA_VERSION: u32 = 4;

/// Compile-time Harn release. Cache files written by a different release
/// are rejected on load.
pub const HARN_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Build-time fingerprint of the compiler front-end — the lexer, parser, IR,
/// and code generator — computed in `build.rs` from those crates' source and
/// baked in via `cargo:rustc-env`. Folded into the cache key so a compiler
/// change that alters emitted bytecode for unchanged source invalidates stale
/// entries automatically, within a single version, with no manual cache wipe.
/// `HARN_VERSION` only busts the cache across release bumps; this closes the
/// same gap for the within-version compiler edits that masked #2610. See #2621.
pub const CODEGEN_FINGERPRINT: &str = env!("HARN_CODEGEN_FINGERPRINT");

/// Conventional extension for entry-chunk cache files.
pub const CACHE_EXTENSION: &str = "harnbc";

/// Conventional extension for module-artifact cache files. Distinct from
/// [`CACHE_EXTENSION`] so the same `.harn` source can have both shipped
/// adjacent if needed (e.g. when a file is both an executable entry and
/// imported by other files).
pub const MODULE_CACHE_EXTENSION: &str = "harnmod";

/// On-disk discriminant for a [`Chunk`] payload (the header's `kind` byte).
const KIND_ENTRY_CHUNK: u8 = 1;
/// On-disk discriminant for a [`ModuleArtifact`] payload (the header's
/// `kind` byte).
const KIND_MODULE_ARTIFACT: u8 = 2;

/// Environment override for the cache directory. When set, takes
/// precedence over the XDG and home-directory fallbacks.
pub const CACHE_DIR_ENV: &str = "HARN_CACHE_DIR";

/// Environment override that turns the cache off entirely. Setting this
/// to `0`, `false`, `no`, or `off` (compared case-insensitively) skips both
/// reads and writes; useful when debugging compiler changes.
pub const CACHE_ENABLED_ENV: &str = "HARN_BYTECODE_CACHE";
91
/// Result of a cache lookup. Carries the precomputed key so the caller
/// can write it back on a miss without rehashing.
pub struct LookupOutcome {
    /// Key computed for the looked-up source; returned on hits and misses.
    pub key: CacheKey,
    /// The cached chunk on a hit, `None` on a miss or when the cache is
    /// disabled.
    pub chunk: Option<Chunk>,
}
98
/// Cache key components for a single pipeline source. Equality of all
/// fields is necessary and sufficient for cache reuse.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CacheKey {
    /// SHA-256 of the entry source text; also determines the cache filename.
    pub source_hash: [u8; 32],
    /// Digest over the transitive user-import graph (which also folds in the
    /// embedded stdlib digest and the codegen fingerprint).
    pub import_graph_hash: [u8; 32],
    /// Harn release string compared against the version recorded in the
    /// artifact header.
    pub harn_version: &'static str,
    /// Compact tag for active [`CompilerOptions`]. Flipping
    /// `HARN_DISABLE_OPTIMIZATIONS` between runs would otherwise reuse a
    /// chunk compiled under the wrong setting.
    pub compiler_tag: u8,
}
111
112impl CacheKey {
113    /// Compute the cache key for a `.harn` source file plus its transitive
114    /// user imports. `read_source` is the entry-file contents; the import
115    /// graph is walked from disk relative to `source_path`.
116    pub fn from_source(source_path: &Path, source: &str) -> Self {
117        let source_hash = sha256(source.as_bytes());
118        let import_graph_hash = hash_transitive_user_imports(source_path, source);
119        Self {
120            source_hash,
121            import_graph_hash,
122            harn_version: HARN_VERSION,
123            compiler_tag: compiler_options_tag(CompilerOptions::from_env()),
124        }
125    }
126
127    /// Entry-chunk filename for this key. We hash by source content
128    /// alone so two invocations of the same source from different paths
129    /// share a cache entry; the header's import-graph hash still gates
130    /// reuse on a per-load basis.
131    pub fn filename(&self) -> String {
132        format!("{}.{}", hex(&self.source_hash), CACHE_EXTENSION)
133    }
134
135    /// Module-artifact filename for this key.
136    pub fn module_filename(&self) -> String {
137        format!("{}.{}", hex(&self.source_hash), MODULE_CACHE_EXTENSION)
138    }
139}
140
141/// Returns the directory the shared cache lives in. Honors
142/// `$HARN_CACHE_DIR`, then `$XDG_CACHE_HOME/harn/bytecode`, then
143/// `$HOME/.cache/harn/bytecode`. The directory is *not* created here —
144/// [`store`] creates it lazily on write so read-only environments don't
145/// pay an mkdir cost.
146pub fn cache_dir() -> PathBuf {
147    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
148        return PathBuf::from(custom);
149    }
150    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
151        let xdg = PathBuf::from(xdg);
152        if !xdg.as_os_str().is_empty() {
153            return xdg.join("harn").join("bytecode");
154        }
155    }
156    if let Some(home) = crate::user_dirs::home_dir() {
157        return home.join(".cache").join("harn").join("bytecode");
158    }
159    // Final fallback: a directory beside the binary's working dir. Mostly
160    // hit in tests that scrub HOME from the environment.
161    PathBuf::from(".harn-cache").join("bytecode")
162}
163
164/// Root for `.harnpack` archives unpacked by `harn run <bundle.harnpack>`.
165/// Each verified bundle is replayed into `<root>/<sanitized-bundle-hash>/`
166/// so re-runs reuse the unpacked tree. Honors `$HARN_CACHE_DIR/packs`
167/// when set, otherwise XDG / `$HOME/.cache/harn/packs`.
168pub fn packs_cache_dir() -> PathBuf {
169    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
170        return PathBuf::from(custom).join("packs");
171    }
172    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
173        let xdg = PathBuf::from(xdg);
174        if !xdg.as_os_str().is_empty() {
175            return xdg.join("harn").join("packs");
176        }
177    }
178    if let Some(home) = crate::user_dirs::home_dir() {
179        return home.join(".cache").join("harn").join("packs");
180    }
181    PathBuf::from(".harn-cache").join("packs")
182}
183
184/// True when the cache is enabled by the current environment.
185pub fn cache_enabled() -> bool {
186    match std::env::var(CACHE_ENABLED_ENV).ok().as_deref() {
187        Some(value) => !matches!(
188            value.to_ascii_lowercase().as_str(),
189            "0" | "false" | "no" | "off"
190        ),
191        None => true,
192    }
193}
194
195/// Try to load a cached chunk for `source_path` whose contents are
196/// `source`. Returns the key alongside the (optional) chunk so callers
197/// avoid recomputing the key on miss.
198pub fn load(source_path: &Path, source: &str) -> LookupOutcome {
199    let key = CacheKey::from_source(source_path, source);
200    if !cache_enabled() {
201        return LookupOutcome { key, chunk: None };
202    }
203    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
204    if let Some(adjacent) = adjacent_cache_path(source_path) {
205        candidates.push(adjacent);
206    }
207    candidates.push(cache_dir().join(key.filename()));
208    for path in candidates {
209        match read_chunk_if_matches(&path, &key) {
210            Ok(Some(chunk)) => {
211                return LookupOutcome {
212                    key,
213                    chunk: Some(chunk),
214                }
215            }
216            Ok(None) => continue,
217            Err(_) => continue,
218        }
219    }
220    LookupOutcome { key, chunk: None }
221}
222
223/// Persist `chunk` to the shared cache directory under `key`. Atomic: a
224/// temp file is written then renamed into place. Concurrent invocations
225/// on the same key race safely.
226pub fn store(key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
227    if !cache_enabled() {
228        return Ok(());
229    }
230    let dir = cache_dir();
231    fs::create_dir_all(&dir)?;
232    write_atomic_chunk(&dir.join(key.filename()), key, chunk)
233}
234
235/// Write a precompiled entry-chunk artifact to an explicit path, for
236/// use by the `harn precompile` subcommand. The header still records
237/// the key, so adjacent artifacts shipped with source are validated
238/// like any other cache hit.
239pub fn store_at(path: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
240    ensure_parent_dir(path)?;
241    write_atomic_chunk(path, key, chunk)
242}
243
244/// Look up the [`ModuleArtifact`] for `source_path` (whose contents are
245/// `source`). Mirrors [`load`] but for the `.harnmod` family.
246pub fn load_module(source_path: &Path, source: &str) -> ModuleLookupOutcome {
247    let key = CacheKey::from_source(source_path, source);
248    if !cache_enabled() {
249        return ModuleLookupOutcome {
250            key,
251            artifact: None,
252        };
253    }
254    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
255    if let Some(adjacent) = adjacent_module_cache_path(source_path) {
256        candidates.push(adjacent);
257    }
258    candidates.push(cache_dir().join(key.module_filename()));
259    for path in candidates {
260        match read_module_if_matches(&path, &key) {
261            Ok(Some(artifact)) => {
262                return ModuleLookupOutcome {
263                    key,
264                    artifact: Some(artifact),
265                }
266            }
267            Ok(None) => continue,
268            Err(_) => continue,
269        }
270    }
271    ModuleLookupOutcome {
272        key,
273        artifact: None,
274    }
275}
276
277/// Persist `artifact` to the shared cache under `key`. Atomic;
278/// concurrent invocations race safely.
279pub fn store_module(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
280    if !cache_enabled() {
281        return Ok(());
282    }
283    let dir = cache_dir();
284    fs::create_dir_all(&dir)?;
285    write_atomic_module(&dir.join(key.module_filename()), key, artifact)
286}
287
288/// Write a module artifact to an explicit path.
289pub fn store_module_at(path: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
290    ensure_parent_dir(path)?;
291    write_atomic_module(path, key, artifact)
292}
293
/// Result of a [`load_module`] lookup. Carries the precomputed key so
/// the caller can write it back on a miss without rehashing.
pub struct ModuleLookupOutcome {
    /// Key computed for the looked-up source; returned on hits and misses.
    pub key: CacheKey,
    /// The cached module artifact on a hit, `None` on a miss or when the
    /// cache is disabled.
    pub artifact: Option<ModuleArtifact>,
}
300
301/// Path to the adjacent precompiled entry-chunk artifact for
302/// `source_path`. `foo.harn` → `foo.harnbc`.
303pub fn adjacent_cache_path(source_path: &Path) -> Option<PathBuf> {
304    adjacent_path_with_extension(source_path, CACHE_EXTENSION)
305}
306
307/// Path to the adjacent precompiled module-artifact for `source_path`.
308/// `foo.harn` → `foo.harnmod`.
309pub fn adjacent_module_cache_path(source_path: &Path) -> Option<PathBuf> {
310    adjacent_path_with_extension(source_path, MODULE_CACHE_EXTENSION)
311}
312
/// Sibling path of `source_path` with its final extension replaced by `ext`.
///
/// Returns `None` when `source_path` has no file name (for example `/` or a
/// path ending in `..`).
///
/// Only the *last* extension is swapped, so `foo.test.harn` maps to
/// `foo.test.harnbc`. The previous implementation rebuilt the path from the
/// file stem and then called `set_extension`, which also stripped the inner
/// `.test` segment and made `foo.test.harn` collide with `foo.harn`.
fn adjacent_path_with_extension(source_path: &Path, ext: &str) -> Option<PathBuf> {
    let stem = source_path.file_stem()?;
    if stem.is_empty() {
        return None;
    }
    Some(source_path.with_extension(ext))
}
323
/// Create the directory that will contain `path`, including missing
/// ancestors. Paths with no parent component, or an empty one (a bare file
/// name), need nothing created and succeed immediately.
fn ensure_parent_dir(path: &Path) -> io::Result<()> {
    match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => fs::create_dir_all(dir),
        _ => Ok(()),
    }
}
332
333fn write_atomic_chunk(target: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
334    let buf = serialize_chunk_artifact(key, chunk)?;
335    write_atomic(target, &buf)
336}
337
338fn write_atomic_module(target: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
339    let buf = serialize_module_artifact(key, artifact)?;
340    write_atomic(target, &buf)
341}
342
343/// Serialize an entry-chunk artifact (header + payload) to bytes. The
344/// resulting buffer is byte-identical to the file [`store_at`] would
345/// have written for the same `(key, chunk)`. Use this when packaging
346/// artifacts into a container (e.g. `harn pack`) without going through
347/// the filesystem.
348pub fn serialize_chunk_artifact(key: &CacheKey, chunk: &Chunk) -> io::Result<Vec<u8>> {
349    let cached = chunk.freeze_for_cache();
350    let payload = bincode::serde::encode_to_vec(&cached, bincode::config::standard())
351        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
352    Ok(encode_artifact(key, KIND_ENTRY_CHUNK, &payload))
353}
354
355/// Serialize a module artifact (header + payload) to bytes. Companion
356/// to [`serialize_chunk_artifact`] for the `.harnmod` family.
357pub fn serialize_module_artifact(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<Vec<u8>> {
358    let payload = bincode::serde::encode_to_vec(artifact, bincode::config::standard())
359        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
360    Ok(encode_artifact(key, KIND_MODULE_ARTIFACT, &payload))
361}
362
363fn encode_artifact(key: &CacheKey, kind: u8, payload: &[u8]) -> Vec<u8> {
364    let mut buf: Vec<u8> = Vec::with_capacity(payload.len() + 128);
365    buf.extend_from_slice(MAGIC);
366    buf.extend_from_slice(&SCHEMA_VERSION.to_le_bytes());
367    let version_bytes = HARN_VERSION.as_bytes();
368    buf.extend_from_slice(&(version_bytes.len() as u32).to_le_bytes());
369    buf.extend_from_slice(version_bytes);
370    buf.push(key.compiler_tag);
371    buf.push(kind);
372    buf.extend_from_slice(&key.source_hash);
373    buf.extend_from_slice(&key.import_graph_hash);
374    buf.extend_from_slice(payload);
375    buf
376}
377
/// Atomically replace `target` with `buf`.
///
/// The bytes are written to a hidden temp file in the same directory,
/// synced to disk, and then renamed over `target`, so readers only ever see
/// a complete file.
///
/// Two fixes over the previous version:
/// * the temp file is removed on *any* failure (create, write, sync, or
///   rename); before, a failed write or sync left it behind;
/// * the temp name carries a per-process sequence number as well as the
///   PID, so two threads of one process writing the same target no longer
///   share a temp file.
///
/// # Errors
/// Returns the first I/O error encountered; `target` is left untouched in
/// that case.
fn write_atomic(target: &Path, buf: &[u8]) -> io::Result<()> {
    use std::sync::atomic::{AtomicU64, Ordering};
    static SEQUENCE: AtomicU64 = AtomicU64::new(0);

    let pid = std::process::id();
    let seq = SEQUENCE.fetch_add(1, Ordering::Relaxed);
    let tmp_name = match target.file_name() {
        Some(name) => format!(".{}.{}.{}.tmp", name.to_string_lossy(), pid, seq),
        None => format!(".harn-cache.{}.{}.tmp", pid, seq),
    };
    let tmp_path = target.with_file_name(tmp_name);

    let outcome = (|| -> io::Result<()> {
        let mut tmp_file = fs::File::create(&tmp_path)?;
        tmp_file.write_all(buf)?;
        tmp_file.sync_all()?;
        // Close the handle before renaming; some platforms refuse to rename
        // a file that is still open.
        drop(tmp_file);
        fs::rename(&tmp_path, target)
    })();

    if outcome.is_err() {
        // Best-effort cleanup; the original error is what matters.
        let _ = fs::remove_file(&tmp_path);
    }
    outcome
}
396
/// Parsed cache header. Read by both the chunk and module loaders so the
/// header-validation logic stays in one place.
struct ParsedHeader {
    /// Payload discriminant byte read from the header; the loaders compare
    /// it against `KIND_ENTRY_CHUNK` / `KIND_MODULE_ARTIFACT`.
    kind: u8,
    /// Every byte that follows the header, still bincode-encoded.
    payload: Vec<u8>,
}
403
404fn read_header_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ParsedHeader>> {
405    let mut file = match fs::File::open(path) {
406        Ok(f) => f,
407        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
408        Err(err) => return Err(err),
409    };
410    let mut header = [0u8; 8 + 4 + 4];
411    if file.read_exact(&mut header).is_err() {
412        return Ok(None);
413    }
414    if &header[..8] != MAGIC {
415        return Ok(None);
416    }
417    let schema = u32::from_le_bytes(header[8..12].try_into().unwrap());
418    if schema != SCHEMA_VERSION {
419        return Ok(None);
420    }
421    let version_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
422    if version_len > 256 {
423        // Bound the alloc so a corrupted file cannot force an unbounded read.
424        return Ok(None);
425    }
426    let mut version_buf = vec![0u8; version_len];
427    if file.read_exact(&mut version_buf).is_err() {
428        return Ok(None);
429    }
430    if version_buf != key.harn_version.as_bytes() {
431        return Ok(None);
432    }
433    let mut compiler_and_kind = [0u8; 2];
434    if file.read_exact(&mut compiler_and_kind).is_err() {
435        return Ok(None);
436    }
437    if compiler_and_kind[0] != key.compiler_tag {
438        return Ok(None);
439    }
440    let kind = compiler_and_kind[1];
441    let mut hashes = [0u8; 64];
442    if file.read_exact(&mut hashes).is_err() {
443        return Ok(None);
444    }
445    if hashes[..32] != key.source_hash || hashes[32..] != key.import_graph_hash {
446        return Ok(None);
447    }
448    let mut payload = Vec::new();
449    if file.read_to_end(&mut payload).is_err() {
450        return Ok(None);
451    }
452    Ok(Some(ParsedHeader { kind, payload }))
453}
454
455fn read_chunk_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<Chunk>> {
456    let Some(header) = read_header_if_matches(path, key)? else {
457        return Ok(None);
458    };
459    if header.kind != KIND_ENTRY_CHUNK {
460        return Ok(None);
461    }
462    let cached: CachedChunk =
463        match bincode::serde::decode_from_slice(&header.payload, bincode::config::standard()) {
464            Ok((c, _)) => c,
465            Err(_) => return Ok(None),
466        };
467    Ok(Some(Chunk::from_cached(&cached)))
468}
469
470fn read_module_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ModuleArtifact>> {
471    let Some(header) = read_header_if_matches(path, key)? else {
472        return Ok(None);
473    };
474    if header.kind != KIND_MODULE_ARTIFACT {
475        return Ok(None);
476    }
477    match bincode::serde::decode_from_slice::<ModuleArtifact, _>(
478        &header.payload,
479        bincode::config::standard(),
480    ) {
481        Ok((artifact, _)) => Ok(Some(artifact)),
482        Err(_) => Ok(None),
483    }
484}
485
486/// Compact representation of [`CompilerOptions`] for the cache header.
487/// Independent flags get distinct bits so adding a new flag never
488/// silently changes existing keys when an old binary reads a new
489/// artifact — the header check will fail-closed before we get there
490/// anyway, but mapping to bits also keeps the tag a stable function
491/// of the option set.
492fn compiler_options_tag(options: CompilerOptions) -> u8 {
493    let mut tag: u8 = 0;
494    if options.optimizations_enabled() {
495        tag |= 0b0000_0001;
496    }
497    tag
498}
499
500fn sha256(bytes: &[u8]) -> [u8; 32] {
501    let mut hasher = Sha256::new();
502    hasher.update(bytes);
503    hasher.finalize().into()
504}
505
/// Lowercase hexadecimal rendering of `bytes`, two digits per byte.
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut text = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        text.push(DIGITS[usize::from(byte >> 4)] as char);
        text.push(DIGITS[usize::from(byte & 0x0f)] as char);
    }
    text
}
513
514/// Lightweight regex-free scan that surfaces user imports without paying
515/// a full lex+parse. False positives only increase cache churn, never
516/// correctness; comments and string literals are skipped so neither a
517/// commented-out import nor a `"import …"` value appearing inside an
518/// unrelated string gates the hash.
519fn collect_user_imports(source: &str) -> Vec<String> {
520    let scrubbed = strip_comments(source);
521    let mut out: Vec<String> = Vec::new();
522    let bytes = scrubbed.as_bytes();
523    let mut i = 0;
524    while i < bytes.len() {
525        if bytes[i] == b'"' {
526            // Skip past any string literal so identifiers inside string
527            // values cannot trigger the keyword match below.
528            match read_string_literal(bytes, i) {
529                Some((_, end)) => {
530                    i = end;
531                    continue;
532                }
533                None => {
534                    i += 1;
535                    continue;
536                }
537            }
538        }
539        if !matches_keyword(bytes, i, b"import") {
540            i += 1;
541            continue;
542        }
543        // Skip past `import` and any selective `{ ... } from` clause; we
544        // only need the source-position of the path string literal.
545        let mut j = i + b"import".len();
546        let mut depth = 0i32;
547        while j < bytes.len() {
548            match bytes[j] {
549                b'"' => {
550                    if let Some((path, end)) = read_string_literal(bytes, j) {
551                        if !path.starts_with("std/") {
552                            out.push(path);
553                        }
554                        i = end;
555                        break;
556                    }
557                    j += 1;
558                }
559                b'{' => {
560                    depth += 1;
561                    j += 1;
562                }
563                b'}' => {
564                    depth -= 1;
565                    j += 1;
566                }
567                b'\n' if depth == 0 => {
568                    // No string literal on this logical line; bail and
569                    // continue scanning after the keyword to avoid an
570                    // infinite loop.
571                    i = j;
572                    break;
573                }
574                _ => j += 1,
575            }
576        }
577        if j >= bytes.len() {
578            break;
579        }
580        if i < j {
581            // Defensive: ensure forward progress when the inner loop
582            // exited without setting `i`.
583            i = j;
584        }
585    }
586    out
587}
588
589fn matches_keyword(bytes: &[u8], at: usize, keyword: &[u8]) -> bool {
590    let end = at + keyword.len();
591    if end > bytes.len() {
592        return false;
593    }
594    if &bytes[at..end] != keyword {
595        return false;
596    }
597    if at > 0 && is_ident_char(bytes[at - 1]) {
598        return false;
599    }
600    if end < bytes.len() && is_ident_char(bytes[end]) {
601        return false;
602    }
603    true
604}
605
/// True for bytes that can appear inside an identifier: ASCII letters,
/// ASCII digits, and underscore.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
609
/// Read the double-quoted string literal that starts at `bytes[at]`.
///
/// Returns the unescaped contents together with the index just past the
/// closing quote. Returns `None` when the literal is not closed before the
/// end of the line or the end of input.
///
/// Escapes `\n`, `\r`, and `\t` are translated; any other escaped byte
/// (including `\"` and `\\`) stands for itself.
///
/// Contents are accumulated as raw bytes and decoded as UTF-8 at the end.
/// The previous version converted each byte with `as char`, which decodes
/// as Latin-1 and turned a path such as `café/mod` into mojibake.
fn read_string_literal(bytes: &[u8], at: usize) -> Option<(String, usize)> {
    debug_assert_eq!(bytes[at], b'"');
    let mut raw: Vec<u8> = Vec::new();
    let mut i = at + 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => return Some((String::from_utf8_lossy(&raw).into_owned(), i + 1)),
            b'\\' => {
                // A trailing backslash with nothing after it is unterminated.
                let escaped = *bytes.get(i + 1)?;
                raw.push(match escaped {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    other => other,
                });
                i += 2;
            }
            b'\n' => return None,
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    None
}
640
641fn strip_comments(source: &str) -> String {
642    let bytes = source.as_bytes();
643    let mut out = String::with_capacity(source.len());
644    let mut i = 0;
645    while i < bytes.len() {
646        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
647            while i < bytes.len() && bytes[i] != b'\n' {
648                i += 1;
649            }
650            continue;
651        }
652        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
653            i += 2;
654            while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
655                i += 1;
656            }
657            i = (i + 2).min(bytes.len());
658            continue;
659        }
660        if bytes[i] == b'"' {
661            if let Some((_, end)) = read_string_literal(bytes, i) {
662                out.push_str(&source[i..end]);
663                i = end;
664                continue;
665            }
666        }
667        out.push(bytes[i] as char);
668        i += 1;
669    }
670    out
671}
672
673/// Stable digest over every embedded stdlib source. Folded into the
674/// user-file cache key so that bumping a stdlib module (changing its
675/// embedded `.harn` content) invalidates cached user bytecode that may
676/// reference stale function-pool layouts from a prior stdlib snapshot.
677/// `HARN_VERSION` already busts the cache across release bumps; this
678/// closes the same gap for within-version stdlib edits (a frequent
679/// pattern during local development).
680///
681/// Cached in a `OnceLock` because `STDLIB_SOURCES` is a static `const`
682/// slice — the digest is identical for the lifetime of the process.
683fn embedded_stdlib_digest() -> &'static [u8; 32] {
684    use std::sync::OnceLock;
685    static DIGEST: OnceLock<[u8; 32]> = OnceLock::new();
686    DIGEST.get_or_init(|| {
687        let mut entries: Vec<(&'static str, &'static str)> = harn_stdlib::STDLIB_SOURCES
688            .iter()
689            .map(|src| (src.module, src.source))
690            .collect();
691        entries.sort_by(|a, b| a.0.cmp(b.0));
692        let mut hasher = Sha256::new();
693        for (module, source) in entries {
694            hasher.update(module.as_bytes());
695            hasher.update(b"\0");
696            hasher.update(source.as_bytes());
697            hasher.update(b"\0");
698        }
699        hasher.finalize().into()
700    })
701}
702
703/// Walk the user-import graph rooted at `source_path` and produce a
704/// stable hash of every transitively-reachable file. The hash is
705/// order-independent: each visited file is keyed by canonical path and
706/// emitted in sorted order, so reordering imports inside a file does
707/// not invalidate the cache while changing any file's content does.
708///
709/// Embedded stdlib content is folded into the hash too — `collect_user_imports`
710/// deliberately skips `std/*` paths (they resolve to in-binary sources, not
711/// disk files), so without this fold a stdlib edit between development
712/// builds would leave user-file caches pinned to a stale stdlib snapshot.
713fn hash_transitive_user_imports(source_path: &Path, source: &str) -> [u8; 32] {
714    hash_transitive_user_imports_fingerprinted(source_path, source, CODEGEN_FINGERPRINT)
715}
716
717/// Inner form of [`hash_transitive_user_imports`] parameterized on the compiler
718/// fingerprint so tests can vary it; production always passes
719/// [`CODEGEN_FINGERPRINT`].
720fn hash_transitive_user_imports_fingerprinted(
721    source_path: &Path,
722    source: &str,
723    codegen_fingerprint: &str,
724) -> [u8; 32] {
725    let mut visited: std::collections::BTreeMap<PathBuf, ImportNode> =
726        std::collections::BTreeMap::new();
727    let mut frontier: Vec<(PathBuf, String)> = collect_user_imports(source)
728        .into_iter()
729        .map(|import| (source_path.to_path_buf(), import))
730        .collect();
731
732    while let Some((anchor, import)) = frontier.pop() {
733        let Some(resolved) = harn_modules::resolve_import_path(&anchor, &import) else {
734            // Unresolved imports get a sentinel keyed by their resolution
735            // anchor so that dropping a real file under that anchor later
736            // produces a different key.
737            let sentinel = anchor.join(format!("__unresolved__/{import}"));
738            visited
739                .entry(sentinel)
740                .or_insert(ImportNode::Unresolved { import });
741            continue;
742        };
743        let canonical = resolved.canonicalize().unwrap_or_else(|_| resolved.clone());
744        if visited.contains_key(&canonical) {
745            continue;
746        }
747        match fs::read_to_string(&resolved) {
748            Ok(content) => {
749                let nested = collect_user_imports(&content);
750                visited.insert(
751                    canonical.clone(),
752                    ImportNode::Resolved {
753                        content: content.clone(),
754                    },
755                );
756                for nested_import in nested {
757                    frontier.push((resolved.clone(), nested_import));
758                }
759            }
760            Err(err) => {
761                visited.insert(
762                    canonical,
763                    ImportNode::IoError {
764                        kind: err.kind().to_string(),
765                    },
766                );
767            }
768        }
769    }
770
771    let mut hasher = Sha256::new();
772    hasher.update(b"stdlib-digest\0");
773    hasher.update(embedded_stdlib_digest());
774    hasher.update(b"\0");
775    // Fold in the compiler's code-generation fingerprint so a compiler change
776    // that alters emitted bytecode for unchanged source busts stale cache
777    // entries within a single version — the gap that masked the #2610 fix until
778    // the cache was cleared by hand. See `build.rs` and `CODEGEN_FINGERPRINT`.
779    hasher.update(b"codegen-fingerprint\0");
780    hasher.update(codegen_fingerprint.as_bytes());
781    hasher.update(b"\0");
782    for (path, node) in &visited {
783        hasher.update(path.to_string_lossy().as_bytes());
784        hasher.update(b"\0");
785        match node {
786            ImportNode::Resolved { content } => {
787                hasher.update(b"resolved\0");
788                hasher.update(content.as_bytes());
789            }
790            ImportNode::Unresolved { import } => {
791                hasher.update(b"unresolved\0");
792                hasher.update(import.as_bytes());
793            }
794            ImportNode::IoError { kind } => {
795                hasher.update(b"ioerror\0");
796                hasher.update(kind.as_bytes());
797            }
798        }
799        hasher.update(b"\0");
800    }
801    hasher.finalize().into()
802}
803
/// What the import-graph walk recorded for one visited path; each variant
/// is hashed under its own tag.
enum ImportNode {
    /// The import resolved and the file was read; holds its full text.
    Resolved { content: String },
    /// Import resolution returned nothing; holds the import string as
    /// written in the source.
    Unresolved { import: String },
    /// The import resolved but reading the file failed; holds the display
    /// form of the `io::ErrorKind`.
    IoError { kind: String },
}
809
#[cfg(test)]
mod tests {
    use super::*;
    use crate::compile_source;

    /// Puts `CACHE_ENABLED_ENV` back to the value it held when the guard was
    /// created, on every exit path (including a failed assertion).
    ///
    /// Environment variables are process-global, so a test that leaves one
    /// modified changes what every later test in the same binary observes.
    struct CacheEnvRestore {
        prior: Option<std::ffi::OsString>,
    }

    impl CacheEnvRestore {
        /// Snapshots the current value (or absence) of `CACHE_ENABLED_ENV`.
        fn capture() -> Self {
            Self {
                prior: std::env::var_os(CACHE_ENABLED_ENV),
            }
        }
    }

    impl Drop for CacheEnvRestore {
        fn drop(&mut self) {
            match self.prior.take() {
                Some(value) => std::env::set_var(CACHE_ENABLED_ENV, value),
                None => std::env::remove_var(CACHE_ENABLED_ENV),
            }
        }
    }

    /// A chunk written with `store_at` loads back under the same key.
    #[test]
    fn header_round_trips_chunk() {
        let source = "__io_println(\"hello\")";
        let chunk = compile_source(source).expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/example.harn"), source);
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("entry.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let loaded = read_chunk_if_matches(&path, &key).unwrap();
        assert!(loaded.is_some(), "expected cached chunk to load");
    }

    #[test]
    fn serialize_chunk_artifact_matches_store_at() {
        // `serialize_chunk_artifact` packages an artifact into a buffer for
        // in-memory consumers (e.g. `harn pack` writing into a tar.zst
        // bundle). The contract is: the resulting bytes match what
        // `store_at` would have written for the same key+chunk, so the
        // shipped artifact is byte-identical to the on-disk cache form.
        let chunk = compile_source("__io_println(\"hi\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/pack.harn"), "__io_println(\"hi\")");
        let tmp = tempfile::tempdir().unwrap();
        let on_disk = tmp.path().join("pack.harnbc");
        store_at(&on_disk, &key, &chunk).expect("write");
        let on_disk_bytes = std::fs::read(&on_disk).unwrap();
        let in_memory_bytes = serialize_chunk_artifact(&key, &chunk).expect("serialize");
        assert_eq!(in_memory_bytes, on_disk_bytes);
    }

    /// A key whose `source_hash` differs from the stored one is a miss
    /// (`Ok(None)`), not an error.
    #[test]
    fn header_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/a.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("a.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            source_hash: [0xAB; 32],
            import_graph_hash: key.import_graph_hash,
            harn_version: HARN_VERSION,
            compiler_tag: key.compiler_tag,
        };
        assert!(read_chunk_if_matches(&path, &other).unwrap().is_none());
    }

    /// A key that differs only in `compiler_tag` is a miss.
    #[test]
    fn compiler_tag_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/b.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("b.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            compiler_tag: key.compiler_tag ^ 0xFF,
            ..key
        };
        assert!(
            read_chunk_if_matches(&path, &other).unwrap().is_none(),
            "flipped HARN_DISABLE_OPTIMIZATIONS must not reuse a chunk \
             compiled under the opposite setting"
        );
    }

    #[test]
    fn codegen_fingerprint_is_populated() {
        // In-workspace builds always hash real compiler sources, so the
        // fingerprint must be a non-empty digest; an empty value would silently
        // disable the within-version compiler-staleness guard.
        assert!(!CODEGEN_FINGERPRINT.is_empty());
    }

    #[test]
    fn codegen_fingerprint_changes_cache_key() {
        // A compiler whose code-generation source differs must produce a
        // different cache key for the *same* user source, so a stale artifact
        // compiled by a prior compiler at the same version misses on load
        // rather than being replayed (#2621). The fingerprint is a compile-time
        // constant, so exercise the parameterized inner hash directly.
        let tmp = tempfile::tempdir().unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "__io_println(\"hi\")\n").unwrap();
        let source = std::fs::read_to_string(&entry).unwrap();
        let a = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        let b = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-B");
        let a_again = hash_transitive_user_imports_fingerprinted(&entry, &source, "compiler-A");
        assert_ne!(
            a, b,
            "differing compiler fingerprints must change the cache key"
        );
        assert_eq!(
            a, a_again,
            "an unchanged compiler fingerprint must be stable"
        );
    }

    /// Only non-`std/` imports outside comments are collected, in source order.
    #[test]
    fn collect_user_imports_ignores_stdlib_and_comments() {
        let source = r#"
            // import "comment/should/be/ignored"
            import "std/agents"
            import { foo } from "pkg/bar"
            import "./relative/path"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(
            imports,
            vec!["pkg/bar".to_string(), "./relative/path".to_string()]
        );
    }

    /// `cache_enabled()` is false for `"0"`, true for `"1"`, and true when the
    /// variable is unset.
    #[test]
    fn cache_enabled_respects_env() {
        // Restore whatever the developer/CI had set once this test finishes,
        // even if one of the assertions below panics.
        let _restore = CacheEnvRestore::capture();

        std::env::set_var(CACHE_ENABLED_ENV, "0");
        assert!(!cache_enabled());
        std::env::set_var(CACHE_ENABLED_ENV, "1");
        assert!(cache_enabled());
        std::env::remove_var(CACHE_ENABLED_ENV);
        assert!(cache_enabled());
    }

    /// Import-looking text inside a string literal is not reported as an import.
    #[test]
    fn import_path_inside_string_literal_is_ignored() {
        let source = r#"
            let payload = "import { foo } from \"./other\""
            import "./real"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(imports, vec!["./real".to_string()]);
    }

    /// Two entry files that import the same modules in opposite order must
    /// produce the same import-graph hash.
    #[test]
    fn import_hash_is_stable_across_import_order() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("a.harn"),
            "pub fn a() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("b.harn"),
            "pub fn b() -> int { return 2 }\n",
        )
        .unwrap();
        let ab = tmp.path().join("entry_ab.harn");
        std::fs::write(
            &ab,
            "import \"./a\"\nimport \"./b\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let ba = tmp.path().join("entry_ba.harn");
        std::fs::write(
            &ba,
            "import \"./b\"\nimport \"./a\"\n__io_println(\"hi\")\n",
        )
        .unwrap();
        let hash_ab = hash_transitive_user_imports(&ab, &std::fs::read_to_string(&ab).unwrap());
        let hash_ba = hash_transitive_user_imports(&ba, &std::fs::read_to_string(&ba).unwrap());
        assert_eq!(
            hash_ab, hash_ba,
            "import-graph hash must be order-independent so reordering imports \
             does not bust the cache"
        );
    }

    /// Editing a file that is only reachable through another import
    /// (entry -> mid -> leaf) must change the entry's import-graph hash.
    #[test]
    fn import_hash_picks_up_nested_imports() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("mid.harn"),
            "import \"./leaf\"\npub fn y() -> int { return 2 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./mid\"\n__io_println(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 999 }\n",
        )
        .unwrap();
        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "editing a transitively-imported file must change the import-graph hash"
        );
    }
}