Skip to main content

dsfb_gpu_debug_cuda/
lib.rs

1//! CUDA dispatch and FFI bridge for `dsfb-gpu-debug` --- the
2//! **CUDA Evidence Factory** for the DSFB-GPU densorial /
3//! tekmeric inference stack.
4//!
5//! **Front-door identity (panel-locked)**:
6//!
7//! > DSFB-GPU is a CUDA-accelerated deterministic evidence
8//! > court: byte-exact witness-family kernels shade residual
9//! > densors into canonical evidence bytes, then a CPU-side
10//! > jurisprudence layer admits, challenges, contraindicates,
11//! > and records them into replayable case files.
12//!
13//! This crate is the GPU half of that posture. It is not a
14//! neural inference backend. It executes deterministic
15//! witness-family kernels over residual densors and returns
16//! canonical witness bytes, candidate summaries, and stage
17//! digests to the CPU-side court (the `dsfb-gpu-debug-core`
18//! and `dsfb-gpu-atlas-corpus` crates).
19//!
20//! The GPU has no semantic authority. It does not admit
21//! episodes or assign final meaning. It produces byte-exact
22//! evidence under a declared numeric, boundary, reduction,
23//! parameter, and hashing contract.
24//!
25//! ```text
26//! residual densors
27//!   → CUDA deterministic witness families
28//!   → witness densors / candidate summaries / stage digests
29//!   → CPU court admission
30//!   → replayable case file
31//! ```
32//!
33//! Panel-locked non-claims: this crate does NOT claim peak
34//! memory-bandwidth saturation, optimal kernel occupancy,
35//! optimal multi-GPU scaling, or production CUDA performance.
36//! Those are explicit future performance-campaign targets.
37//! Current artifacts establish the deterministic-evidence-
38//! factory shape, the byte-exact CPU/GPU equivalence, and
39//! the Semantic Non-Bypass Axiom (the bank stage stays CPU-
40//! side; episodes only enter the case file through the
41//! bank-private admission token).
42//!
43//! Build behavior is split by the `cuda` feature flag:
44//!
45//! * Without `cuda`: the crate compiles to a thin shim. Every public entry
46//!   point returns `GpuError::CudaUnavailable`. This lets the rest of the
47//!   workspace build and run on hosts without `nvcc` installed, which is
48//!   important because the CPU reference path is the v0 reproducibility
49//!   target — the GPU path is a parity check.
50//! * With `cuda`: `build.rs` invokes `nvcc` to compile the kernels under
51//!   `cuda/kernels.cu` into a static archive, links it, and the dispatch
52//!   module wires Rust callers to the C ABI functions exposed in `ffi.rs`.
53//!
54//! `unsafe` is restricted to the CUDA FFI boundary and the
55//! resource-lifetime wrappers that own raw device handles: the
56//! kernel-dispatch calls in `dispatch`, the workspace allocate /
57//! free / readback paths in `workspace`, the `cudaMallocHost` /
58//! `cudaFreeHost` owning handle in `pinned`, and the stream /
59//! graph handle plumbing the throughput path needs. All semantic
60//! code (case-file construction, hash chain, bank admission) remains
61//! safe Rust. The unsafe surface is small, auditable, and never
62//! reaches across module boundaries; each `unsafe` block sits next
63//! to a comment naming the invariant it carries.
64
65#![deny(missing_docs)]
66
/// Errors that the GPU pipeline can surface.
///
/// The most common variant on developer machines without an NVIDIA toolkit
/// is `CudaUnavailable`; the CLI translates this into exit code 2 so it can
/// be distinguished from a hash-chain divergence.
///
/// The type implements [`std::fmt::Display`] and [`std::error::Error`], so
/// it can be propagated with `?` into `Box<dyn std::error::Error>` (or any
/// other error wrapper built on the standard trait) and printed directly
/// as a human-readable message.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum GpuError {
    /// The crate was built without the `cuda` feature, so no kernels are
    /// present. The CPU reference path still runs.
    CudaUnavailable,
    /// A kernel returned a non-zero error code. The contained value is the
    /// raw status from the FFI boundary.
    KernelFailed(i32),
    /// One of the pipeline inputs disagrees with the contract (bad
    /// dimensions, oversize buffer, null pointer at the FFI boundary).
    InvalidInput(&'static str),
}

impl std::fmt::Display for GpuError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::CudaUnavailable => {
                f.write_str("CUDA is unavailable: the crate was built without the `cuda` feature")
            }
            // The raw status is printed verbatim so it can be matched
            // against whatever the FFI boundary reported.
            Self::KernelFailed(status) => write!(f, "GPU kernel failed with status {}", status),
            Self::InvalidInput(reason) => write!(f, "invalid GPU pipeline input: {}", reason),
        }
    }
}

// No variant wraps another error, so the default `source()` (always `None`)
// is the correct behaviour.
impl std::error::Error for GpuError {}
84
85#[cfg(feature = "cuda")]
86mod ffi;
87
88#[cfg(feature = "cuda")]
89mod dispatch;
90
91#[cfg(feature = "cuda")]
92mod pinned;
93
94#[cfg(feature = "cuda")]
95mod workspace;
96
97#[cfg(feature = "cuda")]
98pub use dispatch::{
99    build_gpu, build_gpu_batched_throughput, build_gpu_batched_throughput_device_digests,
100    build_gpu_fused_throughput_digests_on_workspace, build_gpu_layer_a_batched,
101    build_gpu_layer_a_on_workspace, build_gpu_on_workspace,
102    build_gpu_throughput_device_digests_on_workspace, build_gpu_throughput_graph_or_demote,
103    build_gpu_throughput_on_workspace, build_gpu_throughput_pinned_async_on_workspace,
104    build_gpu_throughput_pinned_async_on_workspace_d128_tree_compact,
105    build_gpu_throughput_pinned_async_on_workspace_d205_tree_compact,
106    build_gpu_throughput_pinned_async_on_workspace_d64_compact_densor_compact_timed,
107    build_gpu_throughput_pinned_async_on_workspace_d64_tree_compact,
108    build_gpu_throughput_pinned_async_on_workspace_d64_tree_compact_timed,
109    build_gpu_throughput_pinned_async_on_workspace_timed,
110    build_gpu_throughput_pinned_async_on_workspace_tree,
111    build_gpu_throughput_pinned_async_on_workspace_tree_compact, build_gpu_timed,
112    build_gpu_timed_on_workspace, compact_densor_root_path1a_sweep_time,
113    compact_densor_root_streaming_sweep_time, evaluate_detector_wide_d64_on_workspace,
114    sha256_device, sha256_device_streaming, D64ThroughputHostStageTimings,
115    D64ThroughputStageTimings, PipelineTimings, R8HostStageTimings, R8StageTimings,
116};
117#[cfg(feature = "cuda")]
118pub use pinned::PinnedHostBuf;
119#[cfg(feature = "cuda")]
120pub use workspace::{BatchedGpuWorkspace, GpuWorkspace, GraphCaptureStatus};
121
122#[cfg(not(feature = "cuda"))]
123mod stubs {
124    use super::GpuError;
125    use dsfb_gpu_debug_core::casefile::CaseFile;
126    use dsfb_gpu_debug_core::contract::Contract;
127    use dsfb_gpu_debug_core::event::TraceEvent;
128
129    /// Stub for non-CUDA builds. Always returns `GpuError::CudaUnavailable`.
130    ///
131    /// # Errors
132    ///
133    /// Always returns `GpuError::CudaUnavailable` on builds without the
134    /// `cuda` feature.
135    pub fn build_gpu(_events: &[TraceEvent], _contract: &Contract) -> Result<CaseFile, GpuError> {
136        Err(GpuError::CudaUnavailable)
137    }
138}
139
140#[cfg(not(feature = "cuda"))]
141pub use stubs::build_gpu;
142
143/// Placeholder entry point retained for the workspace smoke build.
144///
145/// # Errors
146///
147/// Returns `GpuError::CudaUnavailable` whenever the crate was built without
148/// the `cuda` feature.
149pub fn pipeline_available() -> Result<(), GpuError> {
150    if cfg!(feature = "cuda") {
151        Ok(())
152    } else {
153        Err(GpuError::CudaUnavailable)
154    }
155}