//! Warp Types: Type-safe GPU warp programming via linear typestate.
//!
//! Prevents warp divergence bugs at compile time using linear typestate.
//! A diverged warp literally cannot call shuffle — the method doesn't exist.
//!
//! # Core Idea
//!
//! ```
//! use warp_types::*;
//!
//! let warp: Warp<All> = Warp::kernel_entry();
//! let data = data::PerLane::new(42i32);
//!
//! // OK: shuffle on full warp
//! let _shuffled = warp.shuffle_xor(data, 1);
//!
//! // After diverge, shuffle is gone from the type:
//! let (evens, odds) = warp.diverge_even_odd();
//! // evens.shuffle_xor(data, 1); // COMPILE ERROR — method not found
//! let merged: Warp<All> = merge(evens, odds);
//! ```
//!
//! # Module Overview
//!
//! - [`active_set`] — Lane subset types (`All`, `Even`, `Odd`, ...) and complement proofs
//! - [`warp`] — `Warp<S>` type parameterized by active set
//! - [`data`] — Value categories: `PerLane<T>`, `Uniform<T>`, `SingleLane<T, N>`
//! - [`diverge`] — Split warps by predicate (produces complementary sub-warps)
//! - [mod@merge] — Rejoin complementary sub-warps (compile-time verified)
//! - [`shuffle`] — Shuffle/ballot/reduce (restricted to `Warp<All>`) + permutation algebra
//! - [`fence`] — Fence-divergence interactions (§5.6) — type-state write tracking
//! - [`block`] — Block-level: shared memory ownership, inter-block sessions, reductions
//! - `proof` — Soundness proof sketch (progress + preservation)
//! - [`platform`] — CPU/GPU platform trait for dual-mode algorithms
//! - [`gradual`] — `DynWarp` ↔ `Warp<S>` bridge for gradual typing (§9.4)
//! - [`gpu`] — GPU intrinsics for nvptx64 and amdgpu targets
//! - [`cub`] — Typed CUB-equivalent warp primitives (reduce, scan, broadcast)
//! - [`sort`] — Typed warp-level bitonic sort
//! - [`tile`] — Cooperative Groups: thread block tiles with typed shuffle safety
//! - [`dynamic`] — Data-dependent divergence with structural complement guarantees
//! - [`simwarp`] — Multi-lane warp simulator with real shuffle semantics (testing)

43#![cfg_attr(target_arch = "nvptx64", no_std)]
44#![cfg_attr(target_arch = "nvptx64", no_main)]
45#![cfg_attr(target_arch = "nvptx64", feature(abi_ptx, asm_experimental_arch))]
46// dead_code is allowed only in the research module (experimental prototypes).
47// Core modules should not have dead code — if it's unused, remove it or
48// mark it #[allow(dead_code)] individually with a justification comment.
49
50// ============================================================================
51// Warp size configuration
52// ============================================================================
53
/// Number of lanes per warp/wavefront.
///
/// - NVIDIA: 32 lanes (default)
/// - AMD: 64 lanes (enable `warp64` feature)
///
/// Exactly one of the two `WARP_SIZE` definitions below is compiled in:
/// the `cfg` conditions on the `warp64` feature are mutually exclusive.
#[cfg(not(feature = "warp64"))]
pub const WARP_SIZE: u32 = 32;

/// Number of lanes per warp/wavefront (AMD 64-lane mode).
#[cfg(feature = "warp64")]
pub const WARP_SIZE: u32 = 64;
64
65// ============================================================================
66// Core modules (public API)
67// ============================================================================
68
69pub mod active_set;
70pub mod block;
71pub mod cub;
72pub mod data;
73pub mod diverge;
74pub mod dynamic;
75pub mod fence;
76pub mod gpu;
77pub mod gradual;
78pub mod merge;
79pub mod platform;
80#[cfg(any(test, feature = "formal-proof"))]
81pub mod proof;
82pub mod shuffle;
83pub mod simwarp;
84pub mod sort;
85pub mod tile;
86pub mod warp;
87
88// ============================================================================
89// Research explorations (compiled, not re-exported)
90// ============================================================================
91
92#[cfg(any(test, feature = "research"))]
93#[cfg(not(target_arch = "nvptx64"))]
94#[allow(dead_code)]
95// Research modules: design-space explorations and prototypes.
96// Not part of the production API — gated behind `research` feature.
97// Always compiled during `cargo test` so research tests stay exercised.
98#[allow(
99 clippy::new_without_default,
100 clippy::needless_range_loop,
101 clippy::module_inception,
102 clippy::doc_markdown,
103 clippy::empty_line_after_doc_comments,
104 clippy::items_after_test_module,
105 clippy::approx_constant,
106 rustdoc::invalid_html_tags,
107 rustdoc::broken_intra_doc_links,
108 rustdoc::invalid_rust_codeblocks
109)]
110pub mod research;
111
112// ============================================================================
113// Zero-overhead verification: inspectable functions for LLVM IR comparison
114// ============================================================================
115
116/// Zero-overhead benchmark: 5 shuffle permutations + butterfly reduction.
117///
118/// This function exercises shuffle and reduce to verify type erasure.
119/// The 5 `shuffle_xor` calls permute data; `reduce_sum` does the actual
120/// butterfly reduction (5 more shuffle-XOR + add steps). Total: 10 shuffles.
121///
122/// In optimized LLVM IR, this function contains NO traces of `Warp<S>`,
123/// `PhantomData`, or active-set types. The type system is fully erased.
124/// Inspect with: `cargo rustc --release --lib -- --emit=llvm-ir`
125/// then search for `zero_overhead_butterfly` in the .ll file.
126#[export_name = "warp_types_zero_overhead_butterfly"]
127#[inline(never)]
128pub fn zero_overhead_butterfly(data: data::PerLane<i32>) -> i32 {
129 let warp: Warp<All> = Warp::kernel_entry();
130 // Shuffle XOR 16: exchange with partner 16 lanes away
131 let step1 = warp.shuffle_xor(data, 16);
132 // Shuffle XOR 8
133 let step2 = warp.shuffle_xor(step1, 8);
134 // Shuffle XOR 4
135 let step3 = warp.shuffle_xor(step2, 4);
136 // Shuffle XOR 2
137 let step4 = warp.shuffle_xor(step3, 2);
138 // Shuffle XOR 1
139 let step5 = warp.shuffle_xor(step4, 1);
140 // Final reduction
141 warp.reduce_sum(step5).get()
142}
143
144/// Diverge-merge round trip: the type system's core mechanism.
145///
146/// In optimized LLVM IR, this compiles to a no-op (returns input unchanged).
147/// The diverge, merge, and all warp handles are completely erased.
148#[export_name = "warp_types_zero_overhead_diverge_merge"]
149#[inline(never)]
150pub fn zero_overhead_diverge_merge(data: data::PerLane<i32>) -> data::PerLane<i32> {
151 let warp: Warp<All> = Warp::kernel_entry();
152 let (evens, odds) = warp.diverge_even_odd();
153 let _merged: Warp<All> = merge(evens, odds);
154 data // diverge/merge is pure type-level — data passes through unchanged
155}
156
157// ============================================================================
158// GpuValue trait (with its own seal, separate from ActiveSet's seal)
159// ============================================================================
160
/// Sealed trait module for GPU value types — separate from ActiveSet sealing.
#[doc(hidden)]
pub mod gpu_sealed {
    /// Token proving an implementation originated inside this crate.
    /// `pub(crate)` makes it unnameable downstream, which is what seals
    /// the trait: external code cannot write the required return type.
    #[doc(hidden)]
    pub(crate) struct GpuSealToken;

    /// Crate-private sealing trait. The `private_interfaces` allow is
    /// deliberate: exposing the `pub(crate)` token in a `pub` trait method
    /// is exactly the mechanism that prevents external impls.
    #[allow(private_interfaces)]
    pub trait GpuSealed {
        #[doc(hidden)]
        fn _gpu_sealed() -> GpuSealToken;
    }
}

/// Marker trait for types that can live in GPU registers.
///
/// Requires `Copy` (registers are value types), `Send + Sync` (cross-lane),
/// `Default` (inactive lanes need a value), and `'static` (no borrows).
///
/// Sealed: only primitive GPU types implement this trait. External crates
/// cannot add implementations, ensuring `PerLane<T>` and `Uniform<T>`
/// only wrap types with known GPU register semantics.
pub trait GpuValue: gpu_sealed::GpuSealed + Copy + Send + Sync + Default + 'static {}

/// Implements `GpuSealed` + `GpuValue` for a comma-separated list of
/// primitive types. Accepts an optional trailing comma.
macro_rules! impl_gpu_value {
    ($($t:ty),* $(,)?) => {
        $(
            #[allow(private_interfaces)]
            impl gpu_sealed::GpuSealed for $t {
                fn _gpu_sealed() -> gpu_sealed::GpuSealToken {
                    gpu_sealed::GpuSealToken
                }
            }
            impl GpuValue for $t {}
        )*
    };
}

impl_gpu_value!(i32, u32, f32, i64, u64, f64, bool);
199
200// ============================================================================
201// Re-exports — flat access to the most-used types
202// ============================================================================
203
204pub use active_set::{
205 ActiveSet, All, CanDiverge, ComplementOf, ComplementWithin, Empty, Even, EvenHigh, EvenLow,
206 HighHalf, Lane0, LowHalf, NotLane0, Odd, OddHigh, OddLow,
207};
208pub use block::{BlockId, SharedRegion, ThreadId};
209pub use data::{LaneId, PerLane, Role, SingleLane, Uniform, WarpId};
210pub use dynamic::DynDiverge;
211pub use fence::{Fenced, FullWrite, GlobalRegion, PartialWrite, Unwritten, WriteState};
212pub use gradual::DynWarp;
213pub use merge::{merge, merge_within};
214pub use platform::{CpuSimd, GpuWarp32, GpuWarp64, Platform, SimdVector};
215pub use shuffle::{
216 BallotResult, Compose, HasDual, Identity, Permutation, RotateDown, RotateUp, ShuffleSafe, Xor,
217};
218pub use tile::Tile;
219pub use warp::Warp;
220pub use warp_types_kernel::warp_kernel;
221
/// Convenience prelude — import everything needed for typical usage.
///
/// ```rust
/// use warp_types::prelude::*;
///
/// let warp: Warp<All> = Warp::kernel_entry();
/// let (evens, odds) = warp.diverge_even_odd();
/// let merged: Warp<All> = merge(evens, odds);
/// ```
pub mod prelude {
    // Re-exported as a module so callers write `data::PerLane`, `data::Uniform`.
    pub use crate::data;
    // Re-exported by name: a trait must be in scope for its methods to resolve.
    pub use crate::gpu::GpuShuffle;
    // Flat re-export of the crate's most-used items (mirrors the crate-root
    // re-exports above).
    pub use crate::{
        merge, merge_within, warp_kernel, ActiveSet, All, BallotResult, CanDiverge, ComplementOf,
        ComplementWithin, DynDiverge, DynWarp, Empty, Even, EvenHigh, EvenLow, Fenced, FullWrite,
        GlobalRegion, GpuValue, HighHalf, Lane0, LaneId, LowHalf, NotLane0, Odd, OddHigh, OddLow,
        PartialWrite, PerLane, SingleLane, Tile, Uniform, Unwritten, Warp, WarpId, WriteState,
    };
}
240}