Skip to main content

harn_vm/
bytecode_cache.rs

1//! Content-addressed on-disk cache for compiled `.harn` pipelines.
2//!
3//! Cold-start `harn run` re-parses, type-checks, and compiles the entry
4//! pipeline before the VM gets a single instruction to execute. For short
5//! Harn subcommands that wrap a few `llm_call`s in a small pipeline, that
6//! compile cost dominates wall-clock time.
7//!
8//! This module persists [`Chunk`] bytecode under
9//! `$HARN_CACHE_DIR/<source-hash>.harnbc` (XDG-aware). The cache key is
10//! derived from the entry source plus the content hash of every
11//! transitively-imported user file; stdlib imports are covered by the
12//! embedded `harn_version` field in the header. Any change to any input
13//! flips the key and the next run recompiles.
14//!
15//! File layout — little-endian throughout:
16//!
17//! ```text
18//! magic        : [u8; 8]   = "HARNBC\0\0"
19//! schema_ver   : u32       = SCHEMA_VERSION
20//! version_len  : u32
21//! harn_version : [u8; version_len]
22//! compiler_tag : u8        bitmask of active CompilerOptions
23//! kind         : u8        1 = entry chunk, 2 = module artifact
24//! source_hash  : [u8; 32]
25//! import_hash  : [u8; 32]
26//! payload      : bincode-serialized payload for `kind`
27//! ```
28//!
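//! The header lets a stale binary detect a future-version artifact
//! without crashing: a magic mismatch, schema mismatch, or version
//! mismatch is returned as `Ok(None)` so the caller transparently
//! recompiles. A truncated or unreadable file is likewise reported as a
//! miss; only a failure to open the file (other than "not found")
//! propagates as an error from the readers, and [`load`] /
//! [`load_module`] treat even that as a miss.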
33//!
34//! Concurrency: writes are atomic (write-tmp-then-rename), and parallel
35//! invocations on a cache miss race safely — the last writer wins, but
36//! every reader observes a consistent file because the rename is atomic
37//! on every supported filesystem.
38
39use std::fs;
40use std::io::{self, Read as _, Write as _};
41use std::path::{Path, PathBuf};
42
43use sha2::{Digest, Sha256};
44
45use crate::chunk::{CachedChunk, Chunk};
46use crate::compiler::CompilerOptions;
47use crate::module_artifact::ModuleArtifact;
48
/// Header magic for all bytecode-cache artifact families. Written as
/// the first eight bytes of every artifact and checked first on load.
pub const MAGIC: &[u8; 8] = b"HARNBC\0\0";

/// On-disk format version. Bump when [`CachedChunk`] or the header
/// layout changes in a backwards-incompatible way.
pub const SCHEMA_VERSION: u32 = 2;

/// Compile-time Harn release. Cache files written by a different release
/// are rejected on load.
pub const HARN_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Conventional extension for entry-chunk cache files.
pub const CACHE_EXTENSION: &str = "harnbc";

/// Conventional extension for module-artifact cache files. Distinct from
/// [`CACHE_EXTENSION`] so the same `.harn` source can have both shipped
/// adjacent if needed (e.g. when a file is both an executable entry and
/// imported by other files).
pub const MODULE_CACHE_EXTENSION: &str = "harnmod";

/// On-disk discriminant for a [`Chunk`] payload (the header `kind` byte).
const KIND_ENTRY_CHUNK: u8 = 1;
/// On-disk discriminant for a [`ModuleArtifact`] payload (the header
/// `kind` byte).
const KIND_MODULE_ARTIFACT: u8 = 2;

/// Environment override for the cache directory. When set, takes
/// precedence over the XDG and home-directory fallbacks.
pub const CACHE_DIR_ENV: &str = "HARN_CACHE_DIR";

/// Environment override that turns the cache off entirely. Setting this
/// to `0`, `false`, `no`, or `off` (compared case-insensitively) skips
/// both reads and writes; useful when debugging compiler changes.
pub const CACHE_ENABLED_ENV: &str = "HARN_BYTECODE_CACHE";
82
/// Result of a cache lookup performed by [`load`]. Carries the
/// precomputed key so the caller can write it back on a miss without
/// rehashing.
pub struct LookupOutcome {
    /// Key computed for the looked-up source; present on hit and miss alike.
    pub key: CacheKey,
    /// The cached chunk on a hit; `None` on a miss or when the cache is
    /// disabled.
    pub chunk: Option<Chunk>,
}
89
/// Cache key components for a single pipeline source. Equality of all
/// fields is necessary and sufficient for cache reuse.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CacheKey {
    /// SHA-256 of the entry source text.
    pub source_hash: [u8; 32],
    /// SHA-256 over the transitively-imported user files.
    pub import_graph_hash: [u8; 32],
    /// Harn release the key was computed under; compared against the
    /// version string stored in the artifact header.
    pub harn_version: &'static str,
    /// Compact tag for active [`CompilerOptions`]. Flipping
    /// `HARN_DISABLE_OPTIMIZATIONS` between runs would otherwise reuse a
    /// chunk compiled under the wrong setting.
    pub compiler_tag: u8,
}
102
103impl CacheKey {
104    /// Compute the cache key for a `.harn` source file plus its transitive
105    /// user imports. `read_source` is the entry-file contents; the import
106    /// graph is walked from disk relative to `source_path`.
107    pub fn from_source(source_path: &Path, source: &str) -> Self {
108        let source_hash = sha256(source.as_bytes());
109        let import_graph_hash = hash_transitive_user_imports(source_path, source);
110        Self {
111            source_hash,
112            import_graph_hash,
113            harn_version: HARN_VERSION,
114            compiler_tag: compiler_options_tag(CompilerOptions::from_env()),
115        }
116    }
117
118    /// Entry-chunk filename for this key. We hash by source content
119    /// alone so two invocations of the same source from different paths
120    /// share a cache entry; the header's import-graph hash still gates
121    /// reuse on a per-load basis.
122    pub fn filename(&self) -> String {
123        format!("{}.{}", hex(&self.source_hash), CACHE_EXTENSION)
124    }
125
126    /// Module-artifact filename for this key.
127    pub fn module_filename(&self) -> String {
128        format!("{}.{}", hex(&self.source_hash), MODULE_CACHE_EXTENSION)
129    }
130}
131
132/// Returns the directory the shared cache lives in. Honors
133/// `$HARN_CACHE_DIR`, then `$XDG_CACHE_HOME/harn/bytecode`, then
134/// `$HOME/.cache/harn/bytecode`. The directory is *not* created here —
135/// [`store`] creates it lazily on write so read-only environments don't
136/// pay an mkdir cost.
137pub fn cache_dir() -> PathBuf {
138    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
139        return PathBuf::from(custom);
140    }
141    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
142        let xdg = PathBuf::from(xdg);
143        if !xdg.as_os_str().is_empty() {
144            return xdg.join("harn").join("bytecode");
145        }
146    }
147    if let Some(home) = std::env::var_os("HOME") {
148        return PathBuf::from(home)
149            .join(".cache")
150            .join("harn")
151            .join("bytecode");
152    }
153    // Final fallback: a directory beside the binary's working dir. Mostly
154    // hit in tests that scrub HOME from the environment.
155    PathBuf::from(".harn-cache").join("bytecode")
156}
157
158/// Root for `.harnpack` archives unpacked by `harn run <bundle.harnpack>`.
159/// Each verified bundle is replayed into `<root>/<sanitized-bundle-hash>/`
160/// so re-runs reuse the unpacked tree. Honors `$HARN_CACHE_DIR/packs`
161/// when set, otherwise XDG / `$HOME/.cache/harn/packs`.
162pub fn packs_cache_dir() -> PathBuf {
163    if let Some(custom) = std::env::var_os(CACHE_DIR_ENV) {
164        return PathBuf::from(custom).join("packs");
165    }
166    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
167        let xdg = PathBuf::from(xdg);
168        if !xdg.as_os_str().is_empty() {
169            return xdg.join("harn").join("packs");
170        }
171    }
172    if let Some(home) = std::env::var_os("HOME") {
173        return PathBuf::from(home)
174            .join(".cache")
175            .join("harn")
176            .join("packs");
177    }
178    PathBuf::from(".harn-cache").join("packs")
179}
180
181/// True when the cache is enabled by the current environment.
182pub fn cache_enabled() -> bool {
183    match std::env::var(CACHE_ENABLED_ENV).ok().as_deref() {
184        Some(value) => !matches!(
185            value.to_ascii_lowercase().as_str(),
186            "0" | "false" | "no" | "off"
187        ),
188        None => true,
189    }
190}
191
192/// Try to load a cached chunk for `source_path` whose contents are
193/// `source`. Returns the key alongside the (optional) chunk so callers
194/// avoid recomputing the key on miss.
195pub fn load(source_path: &Path, source: &str) -> LookupOutcome {
196    let key = CacheKey::from_source(source_path, source);
197    if !cache_enabled() {
198        return LookupOutcome { key, chunk: None };
199    }
200    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
201    if let Some(adjacent) = adjacent_cache_path(source_path) {
202        candidates.push(adjacent);
203    }
204    candidates.push(cache_dir().join(key.filename()));
205    for path in candidates {
206        match read_chunk_if_matches(&path, &key) {
207            Ok(Some(chunk)) => {
208                return LookupOutcome {
209                    key,
210                    chunk: Some(chunk),
211                }
212            }
213            Ok(None) => continue,
214            Err(_) => continue,
215        }
216    }
217    LookupOutcome { key, chunk: None }
218}
219
220/// Persist `chunk` to the shared cache directory under `key`. Atomic: a
221/// temp file is written then renamed into place. Concurrent invocations
222/// on the same key race safely.
223pub fn store(key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
224    if !cache_enabled() {
225        return Ok(());
226    }
227    let dir = cache_dir();
228    fs::create_dir_all(&dir)?;
229    write_atomic_chunk(&dir.join(key.filename()), key, chunk)
230}
231
232/// Write a precompiled entry-chunk artifact to an explicit path, for
233/// use by the `harn precompile` subcommand. The header still records
234/// the key, so adjacent artifacts shipped with source are validated
235/// like any other cache hit.
236pub fn store_at(path: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
237    ensure_parent_dir(path)?;
238    write_atomic_chunk(path, key, chunk)
239}
240
241/// Look up the [`ModuleArtifact`] for `source_path` (whose contents are
242/// `source`). Mirrors [`load`] but for the `.harnmod` family.
243pub fn load_module(source_path: &Path, source: &str) -> ModuleLookupOutcome {
244    let key = CacheKey::from_source(source_path, source);
245    if !cache_enabled() {
246        return ModuleLookupOutcome {
247            key,
248            artifact: None,
249        };
250    }
251    let mut candidates: Vec<PathBuf> = Vec::with_capacity(2);
252    if let Some(adjacent) = adjacent_module_cache_path(source_path) {
253        candidates.push(adjacent);
254    }
255    candidates.push(cache_dir().join(key.module_filename()));
256    for path in candidates {
257        match read_module_if_matches(&path, &key) {
258            Ok(Some(artifact)) => {
259                return ModuleLookupOutcome {
260                    key,
261                    artifact: Some(artifact),
262                }
263            }
264            Ok(None) => continue,
265            Err(_) => continue,
266        }
267    }
268    ModuleLookupOutcome {
269        key,
270        artifact: None,
271    }
272}
273
274/// Persist `artifact` to the shared cache under `key`. Atomic;
275/// concurrent invocations race safely.
276pub fn store_module(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
277    if !cache_enabled() {
278        return Ok(());
279    }
280    let dir = cache_dir();
281    fs::create_dir_all(&dir)?;
282    write_atomic_module(&dir.join(key.module_filename()), key, artifact)
283}
284
285/// Write a module artifact to an explicit path.
286pub fn store_module_at(path: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
287    ensure_parent_dir(path)?;
288    write_atomic_module(path, key, artifact)
289}
290
/// Result of a [`load_module`] lookup. Carries the precomputed key so
/// the caller can write it back on a miss without rehashing.
pub struct ModuleLookupOutcome {
    /// Key computed for the looked-up source; present on hit and miss alike.
    pub key: CacheKey,
    /// The cached artifact on a hit; `None` on a miss or when the cache
    /// is disabled.
    pub artifact: Option<ModuleArtifact>,
}
297
/// Path to the adjacent precompiled entry-chunk artifact for
/// `source_path`. `foo.harn` → `foo.harnbc`. Returns `None` when
/// `source_path` has no file stem.
pub fn adjacent_cache_path(source_path: &Path) -> Option<PathBuf> {
    adjacent_path_with_extension(source_path, CACHE_EXTENSION)
}
303
/// Path to the adjacent precompiled module-artifact for `source_path`.
/// `foo.harn` → `foo.harnmod`. Returns `None` when `source_path` has no
/// file stem.
pub fn adjacent_module_cache_path(source_path: &Path) -> Option<PathBuf> {
    adjacent_path_with_extension(source_path, MODULE_CACHE_EXTENSION)
}
309
/// Sibling path of `source_path` with its final extension replaced by
/// `ext` (`dir/foo.harn` → `dir/foo.<ext>`). Returns `None` when the
/// path has no file stem (e.g. an empty path or one ending in `..`).
///
/// Only the last extension is swapped, so `foo.test.harn` maps to
/// `foo.test.<ext>` rather than colliding with the artifact of
/// `foo.harn`.
fn adjacent_path_with_extension(source_path: &Path, ext: &str) -> Option<PathBuf> {
    let stem = source_path.file_stem()?;
    if stem.is_empty() {
        return None;
    }
    // `with_extension` keeps the parent and every dotted component except
    // the final extension; joining the stem and calling `set_extension`
    // would strip one component too many for multi-dot file names.
    Some(source_path.with_extension(ext))
}
320
/// Create the parent directory of `path` (and any missing ancestors).
/// A path with no parent, or with an empty parent such as a bare file
/// name, needs nothing created and succeeds immediately.
fn ensure_parent_dir(path: &Path) -> io::Result<()> {
    match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => fs::create_dir_all(dir),
        _ => Ok(()),
    }
}
329
330fn write_atomic_chunk(target: &Path, key: &CacheKey, chunk: &Chunk) -> io::Result<()> {
331    let buf = serialize_chunk_artifact(key, chunk)?;
332    write_atomic(target, &buf)
333}
334
335fn write_atomic_module(target: &Path, key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<()> {
336    let buf = serialize_module_artifact(key, artifact)?;
337    write_atomic(target, &buf)
338}
339
340/// Serialize an entry-chunk artifact (header + payload) to bytes. The
341/// resulting buffer is byte-identical to the file [`store_at`] would
342/// have written for the same `(key, chunk)`. Use this when packaging
343/// artifacts into a container (e.g. `harn pack`) without going through
344/// the filesystem.
345pub fn serialize_chunk_artifact(key: &CacheKey, chunk: &Chunk) -> io::Result<Vec<u8>> {
346    let cached = chunk.freeze_for_cache();
347    let payload = bincode::serialize(&cached)
348        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
349    Ok(encode_artifact(key, KIND_ENTRY_CHUNK, &payload))
350}
351
352/// Serialize a module artifact (header + payload) to bytes. Companion
353/// to [`serialize_chunk_artifact`] for the `.harnmod` family.
354pub fn serialize_module_artifact(key: &CacheKey, artifact: &ModuleArtifact) -> io::Result<Vec<u8>> {
355    let payload = bincode::serialize(artifact)
356        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?;
357    Ok(encode_artifact(key, KIND_MODULE_ARTIFACT, &payload))
358}
359
360fn encode_artifact(key: &CacheKey, kind: u8, payload: &[u8]) -> Vec<u8> {
361    let mut buf: Vec<u8> = Vec::with_capacity(payload.len() + 128);
362    buf.extend_from_slice(MAGIC);
363    buf.extend_from_slice(&SCHEMA_VERSION.to_le_bytes());
364    let version_bytes = HARN_VERSION.as_bytes();
365    buf.extend_from_slice(&(version_bytes.len() as u32).to_le_bytes());
366    buf.extend_from_slice(version_bytes);
367    buf.push(key.compiler_tag);
368    buf.push(kind);
369    buf.extend_from_slice(&key.source_hash);
370    buf.extend_from_slice(&key.import_graph_hash);
371    buf.extend_from_slice(payload);
372    buf
373}
374
/// Write `buf` to `target` by first writing a hidden temp file in the
/// same directory, syncing it, and then renaming it over `target`.
///
/// The temp name embeds the process id and a per-process sequence
/// number, so neither concurrent processes nor concurrent threads of one
/// process share a temp file. If any step fails (create, write, sync or
/// rename) the temp file is removed on a best-effort basis and the
/// original error is returned.
fn write_atomic(target: &Path, buf: &[u8]) -> io::Result<()> {
    // Distinguishes temp files created by different threads of this process.
    static NEXT_TMP_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
    let seq = NEXT_TMP_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let pid = std::process::id();
    let tmp_name = match target.file_name() {
        Some(name) => format!(".{}.{pid}.{seq}.tmp", name.to_string_lossy()),
        None => format!(".harn-cache.{pid}.{seq}.tmp"),
    };
    let tmp_path = target.with_file_name(tmp_name);

    let write_tmp = || -> io::Result<()> {
        let mut tmp_file = fs::File::create(&tmp_path)?;
        tmp_file.write_all(buf)?;
        // The handle is closed when this closure returns, before the rename.
        tmp_file.sync_all()
    };
    let outcome = write_tmp().and_then(|()| fs::rename(&tmp_path, target));
    if outcome.is_err() {
        // Best effort: never leave a partial temp file behind.
        let _ = fs::remove_file(&tmp_path);
    }
    outcome
}
393
/// Parsed cache header. Read by both the chunk and module loaders so the
/// header-validation logic stays in one place.
struct ParsedHeader {
    /// The header `kind` byte; each loader compares it against its own
    /// expected discriminant.
    kind: u8,
    /// All bytes following the header, still bincode-encoded.
    payload: Vec<u8>,
}
400
401fn read_header_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ParsedHeader>> {
402    let mut file = match fs::File::open(path) {
403        Ok(f) => f,
404        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
405        Err(err) => return Err(err),
406    };
407    let mut header = [0u8; 8 + 4 + 4];
408    if file.read_exact(&mut header).is_err() {
409        return Ok(None);
410    }
411    if &header[..8] != MAGIC {
412        return Ok(None);
413    }
414    let schema = u32::from_le_bytes(header[8..12].try_into().unwrap());
415    if schema != SCHEMA_VERSION {
416        return Ok(None);
417    }
418    let version_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
419    if version_len > 256 {
420        // Bound the alloc so a corrupted file cannot force an unbounded read.
421        return Ok(None);
422    }
423    let mut version_buf = vec![0u8; version_len];
424    if file.read_exact(&mut version_buf).is_err() {
425        return Ok(None);
426    }
427    if version_buf != key.harn_version.as_bytes() {
428        return Ok(None);
429    }
430    let mut compiler_and_kind = [0u8; 2];
431    if file.read_exact(&mut compiler_and_kind).is_err() {
432        return Ok(None);
433    }
434    if compiler_and_kind[0] != key.compiler_tag {
435        return Ok(None);
436    }
437    let kind = compiler_and_kind[1];
438    let mut hashes = [0u8; 64];
439    if file.read_exact(&mut hashes).is_err() {
440        return Ok(None);
441    }
442    if hashes[..32] != key.source_hash || hashes[32..] != key.import_graph_hash {
443        return Ok(None);
444    }
445    let mut payload = Vec::new();
446    if file.read_to_end(&mut payload).is_err() {
447        return Ok(None);
448    }
449    Ok(Some(ParsedHeader { kind, payload }))
450}
451
452fn read_chunk_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<Chunk>> {
453    let Some(header) = read_header_if_matches(path, key)? else {
454        return Ok(None);
455    };
456    if header.kind != KIND_ENTRY_CHUNK {
457        return Ok(None);
458    }
459    let cached: CachedChunk = match bincode::deserialize(&header.payload) {
460        Ok(c) => c,
461        Err(_) => return Ok(None),
462    };
463    Ok(Some(Chunk::from_cached(&cached)))
464}
465
466fn read_module_if_matches(path: &Path, key: &CacheKey) -> io::Result<Option<ModuleArtifact>> {
467    let Some(header) = read_header_if_matches(path, key)? else {
468        return Ok(None);
469    };
470    if header.kind != KIND_MODULE_ARTIFACT {
471        return Ok(None);
472    }
473    match bincode::deserialize::<ModuleArtifact>(&header.payload) {
474        Ok(artifact) => Ok(Some(artifact)),
475        Err(_) => Ok(None),
476    }
477}
478
479/// Compact representation of [`CompilerOptions`] for the cache header.
480/// Independent flags get distinct bits so adding a new flag never
481/// silently changes existing keys when an old binary reads a new
482/// artifact — the header check will fail-closed before we get there
483/// anyway, but mapping to bits also keeps the tag a stable function
484/// of the option set.
485fn compiler_options_tag(options: CompilerOptions) -> u8 {
486    let mut tag: u8 = 0;
487    if options.optimizations_enabled() {
488        tag |= 0b0000_0001;
489    }
490    tag
491}
492
493fn sha256(bytes: &[u8]) -> [u8; 32] {
494    let mut hasher = Sha256::new();
495    hasher.update(bytes);
496    hasher.finalize().into()
497}
498
/// Lowercase hexadecimal encoding of `bytes`, two digits per byte.
///
/// Digits come from a lookup table instead of `format!`, so the only
/// allocation is the preallocated output string.
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        out.push(DIGITS[usize::from(byte >> 4)] as char);
        out.push(DIGITS[usize::from(byte & 0x0f)] as char);
    }
    out
}
506
507/// Lightweight regex-free scan that surfaces user imports without paying
508/// a full lex+parse. False positives only increase cache churn, never
509/// correctness; comments and string literals are skipped so neither a
510/// commented-out import nor a `"import …"` value appearing inside an
511/// unrelated string gates the hash.
512fn collect_user_imports(source: &str) -> Vec<String> {
513    let scrubbed = strip_comments(source);
514    let mut out: Vec<String> = Vec::new();
515    let bytes = scrubbed.as_bytes();
516    let mut i = 0;
517    while i < bytes.len() {
518        if bytes[i] == b'"' {
519            // Skip past any string literal so identifiers inside string
520            // values cannot trigger the keyword match below.
521            match read_string_literal(bytes, i) {
522                Some((_, end)) => {
523                    i = end;
524                    continue;
525                }
526                None => {
527                    i += 1;
528                    continue;
529                }
530            }
531        }
532        if !matches_keyword(bytes, i, b"import") {
533            i += 1;
534            continue;
535        }
536        // Skip past `import` and any selective `{ ... } from` clause; we
537        // only need the source-position of the path string literal.
538        let mut j = i + b"import".len();
539        let mut depth = 0i32;
540        while j < bytes.len() {
541            match bytes[j] {
542                b'"' => {
543                    if let Some((path, end)) = read_string_literal(bytes, j) {
544                        if !path.starts_with("std/") {
545                            out.push(path);
546                        }
547                        i = end;
548                        break;
549                    }
550                    j += 1;
551                }
552                b'{' => {
553                    depth += 1;
554                    j += 1;
555                }
556                b'}' => {
557                    depth -= 1;
558                    j += 1;
559                }
560                b'\n' if depth == 0 => {
561                    // No string literal on this logical line; bail and
562                    // continue scanning after the keyword to avoid an
563                    // infinite loop.
564                    i = j;
565                    break;
566                }
567                _ => j += 1,
568            }
569        }
570        if j >= bytes.len() {
571            break;
572        }
573        if i < j {
574            // Defensive: ensure forward progress when the inner loop
575            // exited without setting `i`.
576            i = j;
577        }
578    }
579    out
580}
581
582fn matches_keyword(bytes: &[u8], at: usize, keyword: &[u8]) -> bool {
583    let end = at + keyword.len();
584    if end > bytes.len() {
585        return false;
586    }
587    if &bytes[at..end] != keyword {
588        return false;
589    }
590    if at > 0 && is_ident_char(bytes[at - 1]) {
591        return false;
592    }
593    if end < bytes.len() && is_ident_char(bytes[end]) {
594        return false;
595    }
596    true
597}
598
/// True for bytes that can be part of an identifier: ASCII letters,
/// ASCII digits, and underscore.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
602
/// Decode the double-quoted string literal that starts at `bytes[at]`.
///
/// Returns the unescaped contents and the index just past the closing
/// quote, or `None` when the literal is unterminated: it reaches the end
/// of input, hits a raw newline, or ends on a dangling backslash.
/// `\n`, `\r` and `\t` map to their control characters; any other
/// escaped byte stands for itself.
///
/// Contents are collected as bytes and decoded as UTF-8 once, so
/// multi-byte characters (e.g. a non-ASCII import path) survive intact
/// instead of being split into one `char` per byte.
fn read_string_literal(bytes: &[u8], at: usize) -> Option<(String, usize)> {
    debug_assert_eq!(bytes[at], b'"');
    let mut raw: Vec<u8> = Vec::new();
    let mut i = at + 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => return Some((String::from_utf8_lossy(&raw).into_owned(), i + 1)),
            b'\\' => {
                // A trailing backslash with nothing after it is unterminated.
                let escaped = *bytes.get(i + 1)?;
                raw.push(match escaped {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    other => other,
                });
                i += 2;
            }
            b'\n' => return None,
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    None
}
633
634fn strip_comments(source: &str) -> String {
635    let bytes = source.as_bytes();
636    let mut out = String::with_capacity(source.len());
637    let mut i = 0;
638    while i < bytes.len() {
639        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
640            while i < bytes.len() && bytes[i] != b'\n' {
641                i += 1;
642            }
643            continue;
644        }
645        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
646            i += 2;
647            while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
648                i += 1;
649            }
650            i = (i + 2).min(bytes.len());
651            continue;
652        }
653        if bytes[i] == b'"' {
654            if let Some((_, end)) = read_string_literal(bytes, i) {
655                out.push_str(&source[i..end]);
656                i = end;
657                continue;
658            }
659        }
660        out.push(bytes[i] as char);
661        i += 1;
662    }
663    out
664}
665
666/// Walk the user-import graph rooted at `source_path` and produce a
667/// stable hash of every transitively-reachable file. The hash is
668/// order-independent: each visited file is keyed by canonical path and
669/// emitted in sorted order, so reordering imports inside a file does
670/// not invalidate the cache while changing any file's content does.
671fn hash_transitive_user_imports(source_path: &Path, source: &str) -> [u8; 32] {
672    let mut visited: std::collections::BTreeMap<PathBuf, ImportNode> =
673        std::collections::BTreeMap::new();
674    let mut frontier: Vec<(PathBuf, String)> = collect_user_imports(source)
675        .into_iter()
676        .map(|import| (source_path.to_path_buf(), import))
677        .collect();
678
679    while let Some((anchor, import)) = frontier.pop() {
680        let Some(resolved) = harn_modules::resolve_import_path(&anchor, &import) else {
681            // Unresolved imports get a sentinel keyed by their resolution
682            // anchor so that dropping a real file under that anchor later
683            // produces a different key.
684            let sentinel = anchor.join(format!("__unresolved__/{import}"));
685            visited
686                .entry(sentinel)
687                .or_insert(ImportNode::Unresolved { import });
688            continue;
689        };
690        let canonical = resolved.canonicalize().unwrap_or_else(|_| resolved.clone());
691        if visited.contains_key(&canonical) {
692            continue;
693        }
694        match fs::read_to_string(&resolved) {
695            Ok(content) => {
696                let nested = collect_user_imports(&content);
697                visited.insert(
698                    canonical.clone(),
699                    ImportNode::Resolved {
700                        content: content.clone(),
701                    },
702                );
703                for nested_import in nested {
704                    frontier.push((resolved.clone(), nested_import));
705                }
706            }
707            Err(err) => {
708                visited.insert(
709                    canonical,
710                    ImportNode::IoError {
711                        kind: err.kind().to_string(),
712                    },
713                );
714            }
715        }
716    }
717
718    let mut hasher = Sha256::new();
719    for (path, node) in &visited {
720        hasher.update(path.to_string_lossy().as_bytes());
721        hasher.update(b"\0");
722        match node {
723            ImportNode::Resolved { content } => {
724                hasher.update(b"resolved\0");
725                hasher.update(content.as_bytes());
726            }
727            ImportNode::Unresolved { import } => {
728                hasher.update(b"unresolved\0");
729                hasher.update(import.as_bytes());
730            }
731            ImportNode::IoError { kind } => {
732                hasher.update(b"ioerror\0");
733                hasher.update(kind.as_bytes());
734            }
735        }
736        hasher.update(b"\0");
737    }
738    hasher.finalize().into()
739}
740
/// What the import-graph walk recorded for one visited path; each
/// variant is hashed under its own tag.
enum ImportNode {
    /// The import resolved and the file was read; holds the file text.
    Resolved { content: String },
    /// The import could not be resolved; holds the import string as written.
    Unresolved { import: String },
    /// The import resolved but reading the file failed; holds the
    /// stringified `io::ErrorKind`.
    IoError { kind: String },
}
746
#[cfg(test)]
mod tests {
    use super::*;
    use crate::compile_source;

    /// A chunk written with `store_at` loads back under the same key.
    #[test]
    fn header_round_trips_chunk() {
        let chunk = compile_source("println(\"hello\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/example.harn"), "println(\"hello\")");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("entry.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let loaded = read_chunk_if_matches(&path, &key).unwrap();
        assert!(loaded.is_some(), "expected cached chunk to load");
    }

    #[test]
    fn serialize_chunk_artifact_matches_store_at() {
        // `serialize_chunk_artifact` packages an artifact into a buffer for
        // in-memory consumers (e.g. `harn pack` writing into a tar.zst
        // bundle). The contract is: the resulting bytes match what
        // `store_at` would have written for the same key+chunk, so the
        // shipped artifact is byte-identical to the on-disk cache form.
        let chunk = compile_source("println(\"hi\")").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/pack.harn"), "println(\"hi\")");
        let tmp = tempfile::tempdir().unwrap();
        let on_disk = tmp.path().join("pack.harnbc");
        store_at(&on_disk, &key, &chunk).expect("write");
        let on_disk_bytes = std::fs::read(&on_disk).unwrap();
        let in_memory_bytes = serialize_chunk_artifact(&key, &chunk).expect("serialize");
        assert_eq!(in_memory_bytes, on_disk_bytes);
    }

    /// A key with a different source hash must not load the artifact.
    #[test]
    fn header_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/a.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("a.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            source_hash: [0xAB; 32],
            import_graph_hash: key.import_graph_hash,
            harn_version: HARN_VERSION,
            compiler_tag: key.compiler_tag,
        };
        assert!(read_chunk_if_matches(&path, &other).unwrap().is_none());
    }

    #[test]
    fn compiler_tag_mismatch_returns_none() {
        let chunk = compile_source("1 + 1").expect("compile");
        let key = CacheKey::from_source(Path::new("/tmp/b.harn"), "1 + 1");
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("b.harnbc");
        store_at(&path, &key, &chunk).expect("write");
        let other = CacheKey {
            compiler_tag: key.compiler_tag ^ 0xFF,
            ..key.clone()
        };
        assert!(
            read_chunk_if_matches(&path, &other).unwrap().is_none(),
            "flipped HARN_DISABLE_OPTIMIZATIONS must not reuse a chunk \
             compiled under the opposite setting"
        );
    }

    #[test]
    fn collect_user_imports_ignores_stdlib_and_comments() {
        let source = r#"
            // import "comment/should/be/ignored"
            import "std/agents"
            import { foo } from "pkg/bar"
            import "./relative/path"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(
            imports,
            vec!["pkg/bar".to_string(), "./relative/path".to_string()]
        );
    }

    #[test]
    fn cache_enabled_respects_env() {
        // Remember the caller's setting so this test does not leak its
        // overrides into the rest of the test process.
        let previous = std::env::var_os(CACHE_ENABLED_ENV);

        std::env::set_var(CACHE_ENABLED_ENV, "0");
        let off_numeric = cache_enabled();
        std::env::set_var(CACHE_ENABLED_ENV, "OFF");
        let off_uppercase = cache_enabled();
        std::env::set_var(CACHE_ENABLED_ENV, "1");
        let on_explicit = cache_enabled();
        std::env::remove_var(CACHE_ENABLED_ENV);
        let on_default = cache_enabled();

        // Restore before asserting so a failure cannot skip the cleanup.
        match previous {
            Some(value) => std::env::set_var(CACHE_ENABLED_ENV, value),
            None => std::env::remove_var(CACHE_ENABLED_ENV),
        }

        assert!(!off_numeric);
        assert!(!off_uppercase, "off-values are matched case-insensitively");
        assert!(on_explicit);
        assert!(on_default);
    }

    #[test]
    fn import_path_inside_string_literal_is_ignored() {
        let source = r#"
            let payload = "import { foo } from \"./other\""
            import "./real"
        "#;
        let imports = collect_user_imports(source);
        assert_eq!(imports, vec!["./real".to_string()]);
    }

    #[test]
    fn import_hash_is_stable_across_import_order() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("a.harn"),
            "pub fn a() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("b.harn"),
            "pub fn b() -> int { return 2 }\n",
        )
        .unwrap();
        let ab = tmp.path().join("entry_ab.harn");
        std::fs::write(&ab, "import \"./a\"\nimport \"./b\"\nprintln(\"hi\")\n").unwrap();
        let ba = tmp.path().join("entry_ba.harn");
        std::fs::write(&ba, "import \"./b\"\nimport \"./a\"\nprintln(\"hi\")\n").unwrap();
        let hash_ab = hash_transitive_user_imports(&ab, &std::fs::read_to_string(&ab).unwrap());
        let hash_ba = hash_transitive_user_imports(&ba, &std::fs::read_to_string(&ba).unwrap());
        assert_eq!(
            hash_ab, hash_ba,
            "import-graph hash must be order-independent so reordering imports \
             does not bust the cache"
        );
    }

    #[test]
    fn import_hash_picks_up_nested_imports() {
        let tmp = tempfile::tempdir().unwrap();
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 1 }\n",
        )
        .unwrap();
        std::fs::write(
            tmp.path().join("mid.harn"),
            "import \"./leaf\"\npub fn y() -> int { return 2 }\n",
        )
        .unwrap();
        let entry = tmp.path().join("entry.harn");
        std::fs::write(&entry, "import \"./mid\"\nprintln(\"hi\")\n").unwrap();

        let before =
            hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        std::fs::write(
            tmp.path().join("leaf.harn"),
            "pub fn x() -> int { return 999 }\n",
        )
        .unwrap();
        let after = hash_transitive_user_imports(&entry, &std::fs::read_to_string(&entry).unwrap());
        assert_ne!(
            before, after,
            "editing a transitively-imported file must change the import-graph hash"
        );
    }
}