dsfb_gpu_debug_cuda/lib.rs
1//! CUDA dispatch and FFI bridge for `dsfb-gpu-debug` --- the
2//! **CUDA Evidence Factory** for the DSFB-GPU densorial /
3//! tekmeric inference stack.
4//!
5//! **Front-door identity (panel-locked)**:
6//!
7//! > DSFB-GPU is a CUDA-accelerated deterministic evidence
8//! > court: byte-exact witness-family kernels shade residual
9//! > densors into canonical evidence bytes, then a CPU-side
10//! > jurisprudence layer admits, challenges, contraindicates,
11//! > and records them into replayable case files.
12//!
13//! This crate is the GPU half of that posture. It is not a
14//! neural inference backend. It executes deterministic
15//! witness-family kernels over residual densors and returns
16//! canonical witness bytes, candidate summaries, and stage
17//! digests to the CPU-side court (the `dsfb-gpu-debug-core`
18//! and `dsfb-gpu-atlas-corpus` crates).
19//!
20//! The GPU has no semantic authority. It does not admit
21//! episodes or assign final meaning. It produces byte-exact
22//! evidence under a declared numeric, boundary, reduction,
23//! parameter, and hashing contract.
24//!
25//! ```text
26//! residual densors
27//! → CUDA deterministic witness families
28//! → witness densors / candidate summaries / stage digests
29//! → CPU court admission
30//! → replayable case file
31//! ```
32//!
33//! Panel-locked non-claims: this crate does NOT claim peak
34//! memory-bandwidth saturation, optimal kernel occupancy,
35//! optimal multi-GPU scaling, or production CUDA performance.
36//! Those are explicit future performance-campaign targets.
37//! Current artifacts establish the deterministic-evidence-
38//! factory shape, the byte-exact CPU/GPU equivalence, and
39//! the Semantic Non-Bypass Axiom (the bank stage stays CPU-
40//! side; episodes only enter the case file through the
41//! bank-private admission token).
42//!
43//! Build behavior is split by the `cuda` feature flag:
44//!
45//! * Without `cuda`: the crate compiles to a thin shim. Every public entry
46//! point returns `GpuError::CudaUnavailable`. This lets the rest of the
47//! workspace build and run on hosts without `nvcc` installed, which is
48//! important because the CPU reference path is the v0 reproducibility
49//! target — the GPU path is a parity check.
50//! * With `cuda`: `build.rs` invokes `nvcc` to compile the kernels under
51//! `cuda/kernels.cu` into a static archive, links it, and the dispatch
52//! module wires Rust callers to the C ABI functions exposed in `ffi.rs`.
53//!
54//! `unsafe` is restricted to the CUDA FFI boundary and the
55//! resource-lifetime wrappers that own raw device handles: the
56//! kernel-dispatch calls in `dispatch`, the workspace allocate /
57//! free / readback paths in `workspace`, the `cudaMallocHost` /
58//! `cudaFreeHost` owning handle in `pinned`, and the stream /
59//! graph handle plumbing the throughput path needs. All semantic
60//! code (case-file construction, hash chain, bank admission) remains
61//! safe Rust. The unsafe surface is small, auditable, and never
62//! reaches across module boundaries; each `unsafe` block sits next
63//! to a comment naming the invariant it carries.
64
65#![deny(missing_docs)]
66
/// Errors that the GPU pipeline can surface.
///
/// The most common variant on developer machines without an NVIDIA toolkit
/// is `CudaUnavailable`; the CLI translates this into exit code 2 so it can
/// be distinguished from a hash-chain divergence.
///
/// The type implements [`std::fmt::Display`] and [`std::error::Error`], so
/// it can be printed for humans and propagated with `?` into
/// `Box<dyn std::error::Error>` without an adapter.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum GpuError {
    /// The crate was built without the `cuda` feature, so no kernels are
    /// present. The CPU reference path still runs.
    CudaUnavailable,
    /// A kernel returned a non-zero error code. The contained value is the
    /// raw status from the FFI boundary.
    KernelFailed(i32),
    /// One of the pipeline inputs disagrees with the contract (bad
    /// dimensions, oversize buffer, null pointer at the FFI boundary).
    InvalidInput(&'static str),
}

impl std::fmt::Display for GpuError {
    /// Renders a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::CudaUnavailable => {
                f.write_str("CUDA is unavailable: crate was built without the `cuda` feature")
            }
            // The status is printed verbatim so it can be matched against
            // whatever the FFI layer reported.
            Self::KernelFailed(status) => {
                write!(f, "CUDA kernel failed with status {}", status)
            }
            Self::InvalidInput(reason) => {
                write!(f, "invalid GPU pipeline input: {}", reason)
            }
        }
    }
}

// No variant wraps another error, so the default `source()` (returning
// `None`) is the correct behavior.
impl std::error::Error for GpuError {}
84
85#[cfg(feature = "cuda")]
86mod ffi;
87
88#[cfg(feature = "cuda")]
89mod dispatch;
90
91#[cfg(feature = "cuda")]
92mod pinned;
93
94#[cfg(feature = "cuda")]
95mod workspace;
96
97#[cfg(feature = "cuda")]
98pub use dispatch::{
99 build_gpu, build_gpu_batched_throughput, build_gpu_batched_throughput_device_digests,
100 build_gpu_fused_throughput_digests_on_workspace, build_gpu_layer_a_batched,
101 build_gpu_layer_a_on_workspace, build_gpu_on_workspace,
102 build_gpu_throughput_device_digests_on_workspace, build_gpu_throughput_graph_or_demote,
103 build_gpu_throughput_on_workspace, build_gpu_throughput_pinned_async_on_workspace,
104 build_gpu_throughput_pinned_async_on_workspace_d128_tree_compact,
105 build_gpu_throughput_pinned_async_on_workspace_d205_tree_compact,
106 build_gpu_throughput_pinned_async_on_workspace_d64_compact_densor_compact_timed,
107 build_gpu_throughput_pinned_async_on_workspace_d64_tree_compact,
108 build_gpu_throughput_pinned_async_on_workspace_d64_tree_compact_timed,
109 build_gpu_throughput_pinned_async_on_workspace_timed,
110 build_gpu_throughput_pinned_async_on_workspace_tree,
111 build_gpu_throughput_pinned_async_on_workspace_tree_compact, build_gpu_timed,
112 build_gpu_timed_on_workspace, compact_densor_root_path1a_sweep_time,
113 compact_densor_root_streaming_sweep_time, evaluate_detector_wide_d64_on_workspace,
114 sha256_device, sha256_device_streaming, D64ThroughputHostStageTimings,
115 D64ThroughputStageTimings, PipelineTimings, R8HostStageTimings, R8StageTimings,
116};
117#[cfg(feature = "cuda")]
118pub use pinned::PinnedHostBuf;
119#[cfg(feature = "cuda")]
120pub use workspace::{BatchedGpuWorkspace, GpuWorkspace, GraphCaptureStatus};
121
122#[cfg(not(feature = "cuda"))]
123mod stubs {
124 use super::GpuError;
125 use dsfb_gpu_debug_core::casefile::CaseFile;
126 use dsfb_gpu_debug_core::contract::Contract;
127 use dsfb_gpu_debug_core::event::TraceEvent;
128
129 /// Stub for non-CUDA builds. Always returns `GpuError::CudaUnavailable`.
130 ///
131 /// # Errors
132 ///
133 /// Always returns `GpuError::CudaUnavailable` on builds without the
134 /// `cuda` feature.
135 pub fn build_gpu(_events: &[TraceEvent], _contract: &Contract) -> Result<CaseFile, GpuError> {
136 Err(GpuError::CudaUnavailable)
137 }
138}
139
140#[cfg(not(feature = "cuda"))]
141pub use stubs::build_gpu;
142
143/// Placeholder entry point retained for the workspace smoke build.
144///
145/// # Errors
146///
147/// Returns `GpuError::CudaUnavailable` whenever the crate was built without
148/// the `cuda` feature.
149pub fn pipeline_available() -> Result<(), GpuError> {
150 if cfg!(feature = "cuda") {
151 Ok(())
152 } else {
153 Err(GpuError::CudaUnavailable)
154 }
155}