prism-q 0.14.0

PRISM-Q — Performance Rust Interoperable Simulator for Quantum
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
//! Shared GPU execution resource.
//!
//! GPU support in PRISM-Q is not a standalone backend. It is an execution
//! context that CPU backends opt into for hot operations. Today the
//! statevector and stabilizer backends can attach an `Arc<GpuContext>` and
//! route their heavy kernels through this module.
//!
//! # Module layout
//!
//! - [`device`]: cudarc device wrapper, availability checks, VRAM queries
//! - [`memory`]: [`GpuBuffer`] RAII wrapper over device allocations
//! - `kernels::dense`: statevector kernels (CUDA C source plus launch helpers)
//! - `kernels::stabilizer`: stabilizer tableau and measurement kernels
//! - `kernels::bts`: compiled-sampler BTS kernels
//!
//! # Device state layout
//!
//! The statevector lives on device as a `CudaSlice<f64>` of length `2 * 2^n` holding
//! interleaved (re, im) pairs. This matches `num_complex::Complex64` and CUDA's `double2`
//! builtin, allowing zero-cost reinterpretation at kernel boundaries.

pub mod device;
pub(crate) mod kernels;
pub mod memory;

use std::sync::Arc;

use num_complex::Complex64;

use crate::error::Result;

pub use self::device::GpuDevice;
pub use self::memory::GpuBuffer;

/// Default minimum qubit count for routing a sub-circuit to GPU when
/// [`crate::BackendKind::StatevectorGpu`] is selected.
///
/// Below this threshold the dispatch layer builds a plain host-side
/// `StatevectorBackend` instead, keeping PCIe round-trips and kernel launch
/// latency off the critical path for small circuits that fit in L3.
/// Empirically measured at 14 on GTX 1080 Ti; override at runtime via the
/// `PRISM_GPU_MIN_QUBITS` environment variable.
pub const MIN_QUBITS_DEFAULT: usize = 14;

/// Whether a CUDA device is present and usable in this process.
///
/// Safe to call without a [`GpuContext`] and without the `cudarc` dynamic
/// library loaded. Returns `false` if detection fails for any reason.
pub fn is_available() -> bool {
    GpuContext::is_available()
}

/// Effective GPU crossover threshold. Reads `PRISM_GPU_MIN_QUBITS` once per
/// process and caches the result.
pub fn min_qubits() -> usize {
    static CACHED: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHED.get_or_init(|| {
        if let Ok(val) = std::env::var("PRISM_GPU_MIN_QUBITS") {
            if let Ok(n) = val.parse::<usize>() {
                return n;
            }
        }
        MIN_QUBITS_DEFAULT
    })
}

/// Default minimum shot count for routing compiled BTS sampling to the GPU.
///
/// Below this threshold the compiled sampler stays on the CPU BTS path, even
/// when a GPU context is attached, so repeated small shot batches do not pay
/// the device launch and transfer setup cost. Override at runtime via
/// `PRISM_GPU_BTS_MIN_SHOTS`.
pub const BTS_MIN_SHOTS_DEFAULT: usize = 131_072;

/// Default minimum compiled-sampler rank for routing BTS sampling to the GPU.
///
/// Very low-rank circuits such as GHZ or independent H layers are typically
/// faster on the CPU even at large shot counts because the host path can
/// expand each shot from a tiny number of random bits. Override at runtime via
/// `PRISM_GPU_BTS_MIN_RANK`.
pub const BTS_MIN_RANK_DEFAULT: usize = 4;

/// Default minimum average parity-row weight factor for routing compiled BTS
/// sampling to the GPU.
///
/// The effective requirement is `total_weight >= num_measurements * factor`,
/// which filters out low-weight parity maps whose device launch overhead tends
/// to dominate. Override at runtime via `PRISM_GPU_BTS_MIN_WEIGHT_FACTOR`.
pub const BTS_MIN_WEIGHT_FACTOR_DEFAULT: usize = 2;

/// Effective GPU BTS shot threshold. Reads `PRISM_GPU_BTS_MIN_SHOTS` once per
/// process and caches the result.
pub(crate) fn bts_min_shots() -> usize {
    static CACHED: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHED.get_or_init(|| {
        if let Ok(val) = std::env::var("PRISM_GPU_BTS_MIN_SHOTS") {
            if let Ok(n) = val.parse::<usize>() {
                return n;
            }
        }
        BTS_MIN_SHOTS_DEFAULT
    })
}

/// Effective GPU BTS rank threshold. Reads `PRISM_GPU_BTS_MIN_RANK` once per
/// process and caches the result.
pub(crate) fn bts_min_rank() -> usize {
    static CACHED: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHED.get_or_init(|| {
        if let Ok(val) = std::env::var("PRISM_GPU_BTS_MIN_RANK") {
            if let Ok(n) = val.parse::<usize>() {
                return n.max(1);
            }
        }
        BTS_MIN_RANK_DEFAULT
    })
}

/// Effective GPU BTS parity-weight threshold factor. Reads
/// `PRISM_GPU_BTS_MIN_WEIGHT_FACTOR` once per process.
pub(crate) fn bts_min_weight_factor() -> usize {
    static CACHED: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHED.get_or_init(|| {
        if let Ok(val) = std::env::var("PRISM_GPU_BTS_MIN_WEIGHT_FACTOR") {
            if let Ok(n) = val.parse::<usize>() {
                return n.max(1);
            }
        }
        BTS_MIN_WEIGHT_FACTOR_DEFAULT
    })
}

/// Default minimum qubit count for routing a sub-circuit to GPU when
/// [`crate::BackendKind::StabilizerGpu`] is selected.
///
/// Set deliberately high. The stabilizer device path now batches Clifford
/// launches and keeps measurement on device, but the product still defaults to
/// the host path until direct backend benchmarks justify lowering the
/// crossover.
///
/// Until then, opt in explicitly via `PRISM_STABILIZER_GPU_MIN_QUBITS=0` or
/// a similar low value for experimentation. The dispatch is correct; only
/// the performance story is pending.
pub const STABILIZER_MIN_QUBITS_DEFAULT: usize = 100_000;

/// Effective stabilizer GPU crossover threshold. Reads
/// `PRISM_STABILIZER_GPU_MIN_QUBITS` once per process.
pub fn stabilizer_min_qubits() -> usize {
    static CACHED: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CACHED.get_or_init(|| {
        if let Ok(val) = std::env::var("PRISM_STABILIZER_GPU_MIN_QUBITS") {
            if let Ok(n) = val.parse::<usize>() {
                return n;
            }
        }
        STABILIZER_MIN_QUBITS_DEFAULT
    })
}

/// Shared GPU execution context.
///
/// Holds the device handle and compiled kernel module. Cheap to clone via `Arc`. Pass by
/// `Arc<GpuContext>` so multiple backends or multiple simulations can share one device
/// initialisation.
pub struct GpuContext {
    device: Arc<GpuDevice>,
    launcher_scratch: std::sync::Mutex<kernels::LauncherScratch>,
}

impl std::fmt::Debug for GpuContext {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("GpuContext")
            .field("device", &self.device)
            .finish_non_exhaustive()
    }
}

impl GpuContext {
    /// Initialise the context for the given CUDA device ordinal.
    ///
    /// Compiles the kernel module at construction. Subsequent calls reuse the cached PTX.
    pub fn new(device_id: usize) -> Result<Arc<Self>> {
        let device = Arc::new(GpuDevice::new(device_id)?);
        Ok(Arc::new(Self {
            device,
            launcher_scratch: std::sync::Mutex::new(kernels::LauncherScratch::default()),
        }))
    }

    /// Whether a CUDA device is present and usable.
    pub fn is_available() -> bool {
        GpuDevice::is_available()
    }

    /// Total VRAM on the device bound to this context.
    pub fn vram_bytes(&self) -> Result<usize> {
        self.device.vram_bytes()
    }

    /// Free VRAM currently available on the device bound to this context.
    ///
    /// Reflects allocations by all processes sharing the device, not only those
    /// made through this `GpuContext`. Use this before allocating a large
    /// statevector to avoid trial-and-error out-of-memory failures.
    pub fn vram_available(&self) -> Result<usize> {
        self.device.vram_available()
    }

    /// Maximum qubit count for a dense Complex64 statevector on this device.
    pub fn max_qubits_for_statevector(&self) -> Result<usize> {
        self.device.max_qubits_for_statevector()
    }

    /// Whether the currently-available VRAM can hold a dense Complex64
    /// statevector for `num_qubits` qubits.
    ///
    /// Counts only the main amplitude buffer (`2 * 2^num_qubits` f64s,
    /// 16 bytes per amplitude). Auxiliary scratch (probabilities kernel,
    /// measurement partials) typically adds up to 2^num_qubits additional
    /// f64s; callers expecting many concurrent measurements should leave
    /// headroom by checking against a smaller qubit count.
    pub fn fits_statevector(&self, num_qubits: usize) -> Result<bool> {
        if num_qubits >= usize::BITS as usize - 4 {
            return Ok(false);
        }
        let amplitude_bytes = (1usize << num_qubits).checked_mul(16).ok_or_else(|| {
            crate::error::PrismError::InvalidParameter {
                message: format!("num_qubits={num_qubits} overflows usize"),
            }
        })?;
        let available = self.vram_available()?;
        Ok(amplitude_bytes <= available)
    }

    pub(crate) fn device(&self) -> &GpuDevice {
        &self.device
    }

    pub(crate) fn launcher_scratch(&self) -> std::sync::MutexGuard<'_, kernels::LauncherScratch> {
        self.launcher_scratch
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    #[cfg(test)]
    pub(crate) fn stub_for_tests() -> Arc<Self> {
        Arc::new(Self {
            device: Arc::new(GpuDevice::stub_for_tests()),
            launcher_scratch: std::sync::Mutex::new(kernels::LauncherScratch::default()),
        })
    }
}

/// Per-simulation device-resident state.
///
/// Owns a `GpuBuffer<f64>` holding `2 * 2^num_qubits` f64s (interleaved re/im). Tracks
/// `pending_norm` the same way the CPU statevector backend does, measurement collapse
/// accumulates into this scalar and the final scale is applied at `export_statevector` or
/// `probabilities` time.
#[derive(Debug)]
pub struct GpuState {
    context: Arc<GpuContext>,
    buffer: GpuBuffer<f64>,
    num_qubits: usize,
    pending_norm: f64,
    /// Device-side scratch buffer for `probabilities()` output, reused across calls so
    /// shot-sampling workflows don't re-allocate `2^n` f64s on every read.
    probs_scratch: std::cell::RefCell<Option<GpuBuffer<f64>>>,
}

impl GpuState {
    /// Allocate a fresh |0…0⟩ state on the device bound to `context`.
    pub fn new(context: Arc<GpuContext>, num_qubits: usize) -> Result<Self> {
        let len = 2usize.checked_shl(num_qubits as u32).ok_or_else(|| {
            crate::error::PrismError::InvalidParameter {
                message: format!("num_qubits={num_qubits} overflows addressable memory"),
            }
        })?;
        let buffer = GpuBuffer::<f64>::alloc_zeros(context.device(), len)?;
        let mut state = Self {
            context: context.clone(),
            buffer,
            num_qubits,
            pending_norm: 1.0,
            probs_scratch: std::cell::RefCell::new(None),
        };
        kernels::dense::launch_set_initial_state(&context, &mut state)?;
        Ok(state)
    }

    /// Number of qubits the buffer is sized for.
    pub fn num_qubits(&self) -> usize {
        self.num_qubits
    }

    /// Multiplicative norm correction deferred from measurement collapse.
    pub fn pending_norm(&self) -> f64 {
        self.pending_norm
    }

    /// Read the raw (unnormalised) amplitude buffer back to host, as interleaved f64 pairs.
    pub fn copy_to_host_raw(&self) -> Result<Vec<f64>> {
        let mut host = vec![0.0_f64; self.buffer.len()];
        self.buffer.copy_to_host(self.context.device(), &mut host)?;
        Ok(host)
    }

    /// Read the amplitude buffer as `Vec<Complex64>` with the deferred `pending_norm`
    /// already applied.
    pub fn export_statevector(&self) -> Result<Vec<Complex64>> {
        let raw = self.copy_to_host_raw()?;
        let norm = self.pending_norm;
        let out = raw
            .chunks_exact(2)
            .map(|p| Complex64::new(p[0] * norm, p[1] * norm))
            .collect();
        Ok(out)
    }

    /// Compute per-basis-state probabilities with `pending_norm²` scaling, via a GPU
    /// reduction kernel. Only `2^n` f64s cross PCIe (vs `2·2^n` for raw amplitudes).
    pub fn probabilities(&self) -> Result<Vec<f64>> {
        kernels::dense::launch_compute_probabilities(&self.context, self)
    }

    pub(crate) fn context(&self) -> &Arc<GpuContext> {
        &self.context
    }

    pub(crate) fn buffer(&self) -> &GpuBuffer<f64> {
        &self.buffer
    }

    pub(crate) fn buffer_mut(&mut self) -> &mut GpuBuffer<f64> {
        &mut self.buffer
    }

    pub(crate) fn set_pending_norm(&mut self, norm: f64) {
        self.pending_norm = norm;
    }

    /// Access (and lazily allocate) the cached device-side probabilities buffer. Returned as
    /// a `RefMut` so the caller can pass it to a kernel launch for the duration of the call.
    pub(crate) fn probs_scratch(&self) -> std::cell::RefMut<'_, Option<GpuBuffer<f64>>> {
        self.probs_scratch.borrow_mut()
    }
}

/// Per-simulation device-resident stabilizer tableau.
///
/// Owns two buffers that mirror the CPU tableau layout in
/// [`crate::backend::stabilizer::StabilizerBackend`]:
///
/// - `xz`: `(2n+1)` rows × `2 * num_words` u64s per row. Word ordering per row is
///   X-bits in `[0, num_words)` then Z-bits in `[num_words, 2*num_words)`.
/// - `phase`: `(2n+1)` bytes, one per row (0 = +1, 1 = -1).
///
/// `num_words = ceil(n / 64)`. The scratch row at index `2n` is reserved for
/// measurement computations.
#[derive(Debug)]
pub struct GpuTableau {
    #[allow(dead_code)]
    context: Arc<GpuContext>,
    xz: GpuBuffer<u64>,
    #[allow(dead_code)]
    phase: GpuBuffer<u8>,
    measure_pivot: GpuBuffer<i32>,
    measure_outcome: GpuBuffer<u8>,
    num_qubits: usize,
    num_words: usize,
}

impl GpuTableau {
    /// Allocate a fresh identity tableau on the device bound to `context`.
    ///
    /// Both buffers are zero-initialised by `GpuBuffer::alloc_zeros`, then a
    /// `stab_set_initial_tableau` kernel launch sets the destabilizer X-bits
    /// and stabilizer Z-bits in place. Phase stays all zero (identity).
    pub fn new(context: Arc<GpuContext>, num_qubits: usize) -> Result<Self> {
        let num_words = num_qubits.div_ceil(64);
        let total_rows = 2 * num_qubits + 1;
        let xz_len = total_rows * 2 * num_words.max(1);
        let phase_len = total_rows;

        let xz = GpuBuffer::<u64>::alloc_zeros(context.device(), xz_len)?;
        let phase = GpuBuffer::<u8>::alloc_zeros(context.device(), phase_len)?;
        let measure_pivot = GpuBuffer::<i32>::alloc_zeros(context.device(), 1)?;
        let measure_outcome = GpuBuffer::<u8>::alloc_zeros(context.device(), 1)?;

        let mut tableau = Self {
            context: context.clone(),
            xz,
            phase,
            measure_pivot,
            measure_outcome,
            num_qubits,
            num_words,
        };
        kernels::stabilizer::launch_set_initial_tableau(&context, &mut tableau)?;
        Ok(tableau)
    }

    /// Qubit count the tableau is sized for.
    pub fn num_qubits(&self) -> usize {
        self.num_qubits
    }

    /// Number of u64 words per bit-packed row half (ceil(n / 64)).
    pub fn num_words(&self) -> usize {
        self.num_words
    }

    pub(crate) fn xz_mut(&mut self) -> &mut GpuBuffer<u64> {
        &mut self.xz
    }

    /// Split-borrow accessor returning both `xz` and `phase` buffers mutably.
    /// Kernel launchers need to pass both buffers as arguments to the same
    /// CUDA function, which requires holding concurrent mutable borrows of
    /// separate fields on the tableau.
    pub(crate) fn xz_phase_mut(&mut self) -> (&mut GpuBuffer<u64>, &mut GpuBuffer<u8>) {
        (&mut self.xz, &mut self.phase)
    }

    /// Split-borrow accessor returning the tableau XZ buffer and the cached
    /// pivot sentinel scratch used by `stab_measure_find_pivot`.
    pub(crate) fn xz_pivot_mut(&mut self) -> (&mut GpuBuffer<u64>, &mut GpuBuffer<i32>) {
        (&mut self.xz, &mut self.measure_pivot)
    }

    /// Split-borrow accessor returning the tableau XZ and phase buffers plus
    /// the cached one-byte deterministic outcome scratch.
    pub(crate) fn xz_phase_outcome_mut(
        &mut self,
    ) -> (&mut GpuBuffer<u64>, &mut GpuBuffer<u8>, &mut GpuBuffer<u8>) {
        (&mut self.xz, &mut self.phase, &mut self.measure_outcome)
    }

    pub(crate) fn total_rows(&self) -> usize {
        2 * self.num_qubits + 1
    }

    /// Copy the full tableau back to host: `xz` as `Vec<u64>`, `phase` as
    /// `Vec<bool>` (0 → false, non-zero → true). Host shape mirrors the CPU
    /// `StabilizerBackend` layout exactly; lets golden tests compare tableau
    /// state byte for byte.
    pub fn copy_to_host(&self) -> Result<(Vec<u64>, Vec<bool>)> {
        let device = self.context.device();
        let mut xz = vec![0u64; self.xz.len()];
        self.xz.copy_to_host(device, &mut xz)?;
        let mut phase_bytes = vec![0u8; self.phase.len()];
        self.phase.copy_to_host(device, &mut phase_bytes)?;
        let phase = phase_bytes.iter().map(|&b| b != 0).collect();
        Ok((xz, phase))
    }

    /// Upload `xz` and `phase` host buffers back into the device tableau.
    /// Used by the GPU measurement path's host copy-back: the CPU measurement
    /// routine mutates the tableau in-place, then this call syncs the result
    /// back to device so subsequent gate kernels see the collapsed state.
    pub fn copy_from_host(&mut self, xz: &[u64], phase: &[bool]) -> Result<()> {
        let device = self.context.device();
        self.xz.copy_from_host(device, xz)?;
        let phase_bytes: Vec<u8> = phase.iter().map(|&b| u8::from(b)).collect();
        self.phase.copy_from_host(device, &phase_bytes)?;
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::PrismError;

    #[test]
    fn stub_context_reports_available_false() {
        // Even if a device is present, this test only uses the stub path.
        let ctx = GpuContext::stub_for_tests();
        assert!(ctx.device().is_stub());
    }

    #[test]
    fn state_new_on_stub_returns_unsupported() {
        let ctx = GpuContext::stub_for_tests();
        assert!(matches!(
            GpuState::new(ctx, 4).unwrap_err(),
            PrismError::BackendUnsupported { .. }
        ));
    }

    #[test]
    fn tableau_new_on_stub_returns_unsupported() {
        let ctx = GpuContext::stub_for_tests();
        assert!(matches!(
            GpuTableau::new(ctx, 4).unwrap_err(),
            PrismError::BackendUnsupported { .. }
        ));
    }

    #[test]
    fn min_qubits_default_when_env_unset() {
        // `min_qubits()` caches its result in a OnceLock at first call. Other
        // tests and benchmarks in the same process also read it. The invariant
        // this test enforces is that the cached value is a plausible threshold,
        // not a specific number (the env var may legitimately override it).
        let n = min_qubits();
        assert!(
            (1..=32).contains(&n),
            "implausible gpu crossover threshold: {n}"
        );
    }

    #[test]
    fn stub_vram_available_rejects_cleanly() {
        let ctx = GpuContext::stub_for_tests();
        assert!(matches!(
            ctx.vram_available().unwrap_err(),
            PrismError::BackendUnsupported { .. }
        ));
    }

    #[test]
    fn stub_fits_statevector_rejects_cleanly() {
        let ctx = GpuContext::stub_for_tests();
        // Any query should surface the underlying unsupported error.
        assert!(matches!(
            ctx.fits_statevector(4).unwrap_err(),
            PrismError::BackendUnsupported { .. }
        ));
    }

    #[test]
    fn fits_statevector_rejects_overflowing_qubit_counts() {
        let ctx = GpuContext::stub_for_tests();
        // usize::BITS - 4 boundary: `1 << 60` times 16 bytes is already 16 EiB
        // which no GPU has. The function clamps these to `Ok(false)` before
        // touching the device, so even the stub context returns cleanly.
        assert!(!ctx.fits_statevector(128).unwrap());
    }
}