atomr-accel-cuda 0.10.0

GPU acceleration via the actor model. Wraps NVIDIA CUDA libraries (cuBLAS, cuDNN, cuFFT, cuRAND, cuSOLVER, cuSPARSE, cuTENSOR, cuBLASLt, NVRTC, NCCL) as supervised atomr actors with generation-validated buffers and a uniform async surface.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
//! Persistent disk cache for NVRTC-compiled CUDA kernels (Phase 0.6).
//!
//! Modern CUDA kernels — Hopper/Blackwell hand-rolled CUDA-C, CUTLASS
//! template instantiations, FlashAttention 2/3 variants — take 10s to
//! 60s each through NVRTC. A persistent disk cache turns subsequent
//! runs into single-digit-millisecond hot starts.
//!
//! ## Design
//!
//! - **Key**: `(source_hash, arch, options_hash)` where `arch` is the
//!   SM compute capability (e.g. `80`, `90`, `100`) and `options_hash`
//!   is FNV-1a of the NVRTC compile options in their original order
//!   (callers should sort beforehand if they want order-insensitive
//!   keys — see [`hash_options`]).
//! - **Value**: serialised PTX (and optional CUBIN) bytes wrapped in
//!   [`CachedKernel`].
//! - **Storage**: filesystem under
//!   `$XDG_CACHE_HOME/atomr-accel/nvrtc/` (or
//!   `$HOME/.cache/atomr-accel/nvrtc/`, falling back to
//!   `$TMPDIR/atomr-accel/nvrtc/`). One file per cache entry, named
//!   `{source_hash:016x}-{arch}-{options_hash:016x}.bin`.
//! - **Format**: bincode of [`CachedKernel`]. Entries whose
//!   `atomr_accel_version` does not match
//!   [`env!("CARGO_PKG_VERSION")`] are rejected on load.
//! - **Concurrency**: in-process [`RwLock`]ed [`HashMap`] read-through
//!   cache. Cross-process safety via atomic file write
//!   (`<name>.tmp` then `rename`).
//!
//! ## Usage
//!
//! ```no_run
//! use atomr_accel_cuda::nvrtc_cache::{
//!     hash_options, hash_source, CachedKernel, NvrtcCache, NvrtcCacheKey,
//! };
//!
//! let cache = NvrtcCache::new().unwrap();
//! let src = "extern \"C\" __global__ void noop() {}";
//! let key = NvrtcCacheKey {
//!     source_hash: hash_source(src),
//!     arch: 80,
//!     options_hash: hash_options(["-std=c++17", "--use_fast_math"]),
//! };
//! if let Some(entry) = cache.get(key) {
//!     println!("hot: {} bytes of PTX", entry.ptx.len());
//! } else {
//!     // ... NVRTC compile ...
//!     let ptx: Vec<u8> = b"PTX...".to_vec();
//!     cache.insert(key, CachedKernel::new(ptx, None)).unwrap();
//! }
//! ```
//!
//! Phase 5 will wire `NvrtcActor` through this cache; this module ships
//! the storage layer alone.

use crate::error::GpuError;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::hash::Hasher;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};

/// Composite cache key. `source_hash` and `options_hash` are produced
/// by [`hash_source`] / [`hash_options`]; `arch` is the SM compute
/// capability as an integer (e.g. `80`, `90`, `100`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct NvrtcCacheKey {
    pub source_hash: u64,
    pub arch: u32,
    pub options_hash: u64,
}

/// On-disk and in-memory cache value.
///
/// `atomr_accel_version` is checked on load: entries from older
/// crate versions are silently rejected so a cache built against a
/// stale `cudarc` / NVRTC ABI never gets loaded into a newer build.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CachedKernel {
    pub ptx: Vec<u8>,
    pub cubin: Option<Vec<u8>>,
    pub atomr_accel_version: String,
}

impl CachedKernel {
    /// Build a [`CachedKernel`] stamped with the current crate version.
    pub fn new(ptx: Vec<u8>, cubin: Option<Vec<u8>>) -> Self {
        Self {
            ptx,
            cubin,
            atomr_accel_version: env!("CARGO_PKG_VERSION").to_string(),
        }
    }
}

/// Read-through disk cache for compiled NVRTC kernels.
///
/// Cross-process safe: writes go to a temp file and are atomically
/// renamed into place. Reads tolerate concurrent writers (a partial
/// `.tmp` is invisible to readers; a corrupt `.bin` returns `None`).
#[derive(Debug)]
pub struct NvrtcCache {
    dir: PathBuf,
    memory: RwLock<HashMap<NvrtcCacheKey, Arc<CachedKernel>>>,
}

impl NvrtcCache {
    /// Construct a cache rooted at the OS-default location.
    ///
    /// Probe order:
    /// 1. `$XDG_CACHE_HOME/atomr-accel/nvrtc/`
    /// 2. `$HOME/.cache/atomr-accel/nvrtc/` (via [`dirs::cache_dir`])
    /// 3. `<temp_dir>/atomr-accel/nvrtc/`
    pub fn new() -> Result<Self, GpuError> {
        Self::with_dir(default_cache_dir())
    }

    /// Construct a cache rooted at an explicit directory. Creates the
    /// directory (recursively) if it does not exist.
    pub fn with_dir(path: PathBuf) -> Result<Self, GpuError> {
        fs::create_dir_all(&path).map_err(|e| {
            GpuError::Unrecoverable(format!(
                "NvrtcCache: failed to create cache dir {}: {}",
                path.display(),
                e
            ))
        })?;
        Ok(Self {
            dir: path,
            memory: RwLock::new(HashMap::new()),
        })
    }

    /// Cache root directory.
    pub fn dir(&self) -> &Path {
        &self.dir
    }

    /// Look up a cached kernel.
    ///
    /// Returns `None` if the entry is absent, unreadable, mis-encoded,
    /// or stamped with a non-matching `atomr_accel_version`. A read
    /// hit on disk but cold in memory promotes the entry into the
    /// in-memory map for subsequent lookups.
    pub fn get(&self, key: NvrtcCacheKey) -> Option<Arc<CachedKernel>> {
        if let Some(hit) = self
            .memory
            .read()
            .ok()
            .and_then(|guard| guard.get(&key).cloned())
        {
            return Some(hit);
        }

        let path = self.entry_path(&key);
        let bytes = fs::read(&path).ok()?;
        let entry: CachedKernel = bincode::deserialize(&bytes).ok()?;
        if entry.atomr_accel_version != env!("CARGO_PKG_VERSION") {
            return None;
        }
        let arc = Arc::new(entry);
        if let Ok(mut guard) = self.memory.write() {
            guard.insert(key, arc.clone());
        }
        Some(arc)
    }

    /// Store a kernel.
    ///
    /// The on-disk write is atomic (write to `<name>.tmp` then
    /// rename). Concurrent writers race the rename; the loser's bytes
    /// are silently overwritten — bincode payloads of the same key are
    /// expected to be identical so this is benign.
    pub fn insert(&self, key: NvrtcCacheKey, value: CachedKernel) -> Result<(), GpuError> {
        let bytes = bincode::serialize(&value).map_err(|e| {
            GpuError::Unrecoverable(format!("NvrtcCache: bincode serialize: {}", e))
        })?;
        let final_path = self.entry_path(&key);
        let tmp_path = final_path.with_extension("bin.tmp");

        {
            let mut f = fs::File::create(&tmp_path).map_err(|e| {
                GpuError::Unrecoverable(format!("NvrtcCache: create {}: {}", tmp_path.display(), e))
            })?;
            f.write_all(&bytes).map_err(|e| {
                GpuError::Unrecoverable(format!("NvrtcCache: write {}: {}", tmp_path.display(), e))
            })?;
            f.sync_all().map_err(|e| {
                GpuError::Unrecoverable(format!("NvrtcCache: fsync {}: {}", tmp_path.display(), e))
            })?;
        }

        fs::rename(&tmp_path, &final_path).map_err(|e| {
            // Best-effort clean up the temp file on rename failure.
            let _ = fs::remove_file(&tmp_path);
            GpuError::Unrecoverable(format!(
                "NvrtcCache: rename {} -> {}: {}",
                tmp_path.display(),
                final_path.display(),
                e
            ))
        })?;

        if let Ok(mut guard) = self.memory.write() {
            guard.insert(key, Arc::new(value));
        }
        Ok(())
    }

    /// Drop every in-memory entry. Disk contents are left untouched
    /// — subsequent `get` calls will re-populate from disk.
    pub fn clear_in_memory(&self) {
        if let Ok(mut guard) = self.memory.write() {
            guard.clear();
        }
    }

    fn entry_path(&self, key: &NvrtcCacheKey) -> PathBuf {
        self.dir.join(format!(
            "{:016x}-{}-{:016x}.bin",
            key.source_hash, key.arch, key.options_hash
        ))
    }
}

/// FNV-1a 64-bit hash of a kernel source string. Stable across
/// processes and across crate compilations.
pub fn hash_source(src: &str) -> u64 {
    let mut h = FnvHasher::new();
    h.write(src.as_bytes());
    h.finish()
}

/// FNV-1a 64-bit hash of an iterable of NVRTC compile options.
///
/// **Order-sensitive**: callers that want order-insensitive keys must
/// sort the iterable before passing it in. NVRTC's `--define-macro`
/// and `--include-path` flags are order-significant in general, so
/// the cache key preserves the caller's order.
pub fn hash_options<I, S>(opts: I) -> u64
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    let mut h = FnvHasher::new();
    for opt in opts {
        let bytes = opt.as_ref().as_bytes();
        // Length prefix so ["ab", "c"] != ["a", "bc"].
        h.write_u64(bytes.len() as u64);
        h.write(bytes);
        // Separator byte to make the boundary unambiguous.
        h.write_u8(0xff);
    }
    h.finish()
}

fn default_cache_dir() -> PathBuf {
    if let Some(xdg) = std::env::var_os("XDG_CACHE_HOME") {
        let p = PathBuf::from(xdg);
        if !p.as_os_str().is_empty() {
            return p.join("atomr-accel").join("nvrtc");
        }
    }
    if let Some(cache) = dirs::cache_dir() {
        return cache.join("atomr-accel").join("nvrtc");
    }
    std::env::temp_dir().join("atomr-accel").join("nvrtc")
}

// ---------------------------------------------------------------------------
// FNV-1a 64-bit. Tiny, dependency-free, deterministic — good enough for a
// kernel-source content hash. Not cryptographic.
// ---------------------------------------------------------------------------

const FNV_OFFSET_BASIS_64: u64 = 0xcbf2_9ce4_8422_2325;
const FNV_PRIME_64: u64 = 0x0000_0100_0000_01b3;

struct FnvHasher(u64);

impl FnvHasher {
    fn new() -> Self {
        Self(FNV_OFFSET_BASIS_64)
    }
}

impl Hasher for FnvHasher {
    fn finish(&self) -> u64 {
        self.0
    }
    fn write(&mut self, bytes: &[u8]) {
        let mut h = self.0;
        for &b in bytes {
            h ^= b as u64;
            h = h.wrapping_mul(FNV_PRIME_64);
        }
        self.0 = h;
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    fn sample_kernel(seed: u8) -> CachedKernel {
        CachedKernel::new(vec![seed; 64], Some(vec![seed.wrapping_add(1); 32]))
    }

    fn key(source_hash: u64, arch: u32, options_hash: u64) -> NvrtcCacheKey {
        NvrtcCacheKey {
            source_hash,
            arch,
            options_hash,
        }
    }

    #[test]
    fn round_trip_via_with_dir() {
        let tmp = tempdir().unwrap();
        let cache = NvrtcCache::with_dir(tmp.path().to_path_buf()).unwrap();
        let k = key(0xdead_beef, 80, 0x1234);
        let v = sample_kernel(7);

        assert!(cache.get(k).is_none(), "cold cache should miss");

        cache.insert(k, v.clone()).unwrap();

        let got = cache.get(k).expect("hot lookup must hit");
        assert_eq!(got.ptx, v.ptx);
        assert_eq!(got.cubin, v.cubin);
        assert_eq!(got.atomr_accel_version, env!("CARGO_PKG_VERSION"));
    }

    #[test]
    fn cache_persists_across_fresh_handles() {
        let tmp = tempdir().unwrap();
        let dir = tmp.path().to_path_buf();
        let k = key(0x9999, 90, 0xabcd);
        let v = sample_kernel(42);

        {
            let cache = NvrtcCache::with_dir(dir.clone()).unwrap();
            cache.insert(k, v.clone()).unwrap();
        } // drop the cache, only the file remains

        let cache2 = NvrtcCache::with_dir(dir).unwrap();
        let got = cache2.get(k).expect("disk-backed entry must survive");
        assert_eq!(got.ptx, v.ptx);
        assert_eq!(got.cubin, v.cubin);
    }

    #[test]
    fn distinct_keys_distinct_paths() {
        let tmp = tempdir().unwrap();
        let cache = NvrtcCache::with_dir(tmp.path().to_path_buf()).unwrap();

        let k_src = key(1, 80, 0);
        let k_arch = key(0, 90, 0);
        let k_opts = key(0, 80, 1);
        let k_zero = key(0, 80, 0);

        let p_src = cache.entry_path(&k_src);
        let p_arch = cache.entry_path(&k_arch);
        let p_opts = cache.entry_path(&k_opts);
        let p_zero = cache.entry_path(&k_zero);

        assert_ne!(p_src, p_arch);
        assert_ne!(p_src, p_opts);
        assert_ne!(p_src, p_zero);
        assert_ne!(p_arch, p_opts);
        assert_ne!(p_arch, p_zero);
        assert_ne!(p_opts, p_zero);

        // Inserting under each key writes a separate file.
        cache.insert(k_src, sample_kernel(1)).unwrap();
        cache.insert(k_arch, sample_kernel(2)).unwrap();
        cache.insert(k_opts, sample_kernel(3)).unwrap();
        cache.insert(k_zero, sample_kernel(4)).unwrap();

        assert!(p_src.exists());
        assert!(p_arch.exists());
        assert!(p_opts.exists());
        assert!(p_zero.exists());

        // And reads come back distinct.
        assert_eq!(cache.get(k_src).unwrap().ptx, vec![1u8; 64]);
        assert_eq!(cache.get(k_arch).unwrap().ptx, vec![2u8; 64]);
        assert_eq!(cache.get(k_opts).unwrap().ptx, vec![3u8; 64]);
        assert_eq!(cache.get(k_zero).unwrap().ptx, vec![4u8; 64]);
    }

    #[test]
    fn version_mismatch_rejected_on_load() {
        let tmp = tempdir().unwrap();
        let cache = NvrtcCache::with_dir(tmp.path().to_path_buf()).unwrap();
        let k = key(11, 80, 22);

        // Hand-write an entry with a stale version stamp.
        let stale = CachedKernel {
            ptx: vec![0xaa; 16],
            cubin: None,
            atomr_accel_version: "0.0.0-impossible".to_string(),
        };
        let bytes = bincode::serialize(&stale).unwrap();
        let path = cache.entry_path(&k);
        fs::write(&path, &bytes).unwrap();

        // Skip the in-memory shortcut by clearing it explicitly.
        cache.clear_in_memory();
        assert!(
            cache.get(k).is_none(),
            "entry with mismatched atomr_accel_version must not be returned"
        );
    }

    #[test]
    fn hash_source_is_deterministic() {
        let a = hash_source("__global__ void k() {}");
        let b = hash_source("__global__ void k() {}");
        let c = hash_source("__global__ void other() {}");
        assert_eq!(a, b);
        assert_ne!(a, c);
    }

    #[test]
    fn hash_options_is_deterministic_and_order_sensitive() {
        let a = hash_options(["-std=c++17", "--use_fast_math"]);
        let b = hash_options(["-std=c++17", "--use_fast_math"]);
        let c = hash_options(["--use_fast_math", "-std=c++17"]);
        let d = hash_options(["-std=c++17"]);
        let e = hash_options(Vec::<&str>::new());

        assert_eq!(a, b, "same input must produce same hash");
        assert_ne!(a, c, "option order must change the hash");
        assert_ne!(a, d, "option count must change the hash");
        assert_ne!(a, e);
        assert_ne!(d, e);

        // Length-prefix invariant: ["ab","c"] != ["a","bc"].
        let split1 = hash_options(["ab", "c"]);
        let split2 = hash_options(["a", "bc"]);
        assert_ne!(split1, split2);
    }

    #[test]
    fn clear_in_memory_keeps_disk() {
        let tmp = tempdir().unwrap();
        let cache = NvrtcCache::with_dir(tmp.path().to_path_buf()).unwrap();
        let k = key(7, 80, 7);
        cache.insert(k, sample_kernel(9)).unwrap();
        cache.clear_in_memory();
        let got = cache.get(k).expect("disk entry survives clear_in_memory");
        assert_eq!(got.ptx, vec![9u8; 64]);
    }
}