warp_types/
lib.rs

1//! Warp Types: Type-safe GPU warp programming via linear typestate.
2//!
3//! Prevents warp divergence bugs at compile time using linear typestate.
4//! A diverged warp literally cannot call shuffle — the method doesn't exist.
5//!
6//! # Core Idea
7//!
8//! ```
9//! use warp_types::*;
10//!
11//! let warp: Warp<All> = Warp::kernel_entry();
12//! let data = data::PerLane::new(42i32);
13//!
14//! // OK: shuffle on full warp
15//! let _shuffled = warp.shuffle_xor(data, 1);
16//!
17//! // After diverge, shuffle is gone from the type:
18//! let (evens, odds) = warp.diverge_even_odd();
19//! // evens.shuffle_xor(data, 1);  // COMPILE ERROR — method not found
20//! let merged: Warp<All> = merge(evens, odds);
21//! ```
22//!
23//! # Module Overview
24//!
25//! - [`active_set`] — Lane subset types (`All`, `Even`, `Odd`, ...) and complement proofs
26//! - [`warp`] — `Warp<S>` type parameterized by active set
27//! - [`data`] — Value categories: `PerLane<T>`, `Uniform<T>`, `SingleLane<T, N>`
28//! - [`diverge`] — Split warps by predicate (produces complementary sub-warps)
//! - [`merge`](mod@merge) — Rejoin complementary sub-warps (compile-time verified)
30//! - [`shuffle`] — Shuffle/ballot/reduce (restricted to `Warp<All>`) + permutation algebra
31//! - [`fence`] — Fence-divergence interactions (§5.6) — type-state write tracking
32//! - [`block`] — Block-level: shared memory ownership, inter-block sessions, reductions
//! - `proof` — Soundness proof sketch (progress + preservation); compiled only under `cfg(test)` or the `formal-proof` feature
34//! - [`platform`] — CPU/GPU platform trait for dual-mode algorithms
35//! - [`gradual`] — `DynWarp` ↔ `Warp<S>` bridge for gradual typing (§9.4)
36//! - [`gpu`] — GPU intrinsics for nvptx64 and amdgpu targets
37//! - [`cub`] — Typed CUB-equivalent warp primitives (reduce, scan, broadcast)
38//! - [`sort`] — Typed warp-level bitonic sort
39//! - [`tile`] — Cooperative Groups: thread block tiles with typed shuffle safety
40//! - [`dynamic`] — Data-dependent divergence with structural complement guarantees
41//! - [`simwarp`] — Multi-lane warp simulator with real shuffle semantics (testing)
42
43#![cfg_attr(target_arch = "nvptx64", no_std)]
44#![cfg_attr(target_arch = "nvptx64", no_main)]
45#![cfg_attr(target_arch = "nvptx64", feature(abi_ptx, asm_experimental_arch))]
46// dead_code is allowed only in the research module (experimental prototypes).
47// Core modules should not have dead code — if it's unused, remove it or
48// mark it #[allow(dead_code)] individually with a justification comment.
49
50// ============================================================================
51// Core modules (public API)
52// ============================================================================
53
54pub mod active_set;
55pub mod block;
56pub mod cub;
57pub mod data;
58pub mod diverge;
59pub mod dynamic;
60pub mod fence;
61pub mod gpu;
62pub mod gradual;
63pub mod merge;
64pub mod platform;
65#[cfg(any(test, feature = "formal-proof"))]
66pub mod proof;
67pub mod shuffle;
68pub mod simwarp;
69pub mod sort;
70pub mod tile;
71pub mod warp;
72
73// ============================================================================
74// Research explorations (compiled, not re-exported)
75// ============================================================================
76
77#[cfg(not(target_arch = "nvptx64"))]
78#[allow(dead_code)]
79// Research modules contain experimental prototypes with unused code
80// Research modules: exploratory demos, not production API.
81// Suppress clippy lints inappropriate for proof-of-concept code.
82#[allow(
83    clippy::new_without_default,
84    clippy::needless_range_loop,
85    clippy::module_inception,
86    clippy::doc_markdown,
87    clippy::empty_line_after_doc_comments,
88    clippy::items_after_test_module,
89    clippy::approx_constant,
90    rustdoc::invalid_html_tags,
91    rustdoc::broken_intra_doc_links,
92    rustdoc::invalid_rust_codeblocks
93)]
94pub mod research;
95
96// ============================================================================
97// Zero-overhead verification: inspectable functions for LLVM IR comparison
98// ============================================================================
99
100/// Zero-overhead benchmark: 5 shuffle permutations + butterfly reduction.
101///
102/// This function exercises shuffle and reduce to verify type erasure.
103/// The 5 `shuffle_xor` calls permute data; `reduce_sum` does the actual
104/// butterfly reduction (5 more shuffle-XOR + add steps). Total: 10 shuffles.
105///
106/// In optimized LLVM IR, this function contains NO traces of `Warp<S>`,
107/// `PhantomData`, or active-set types. The type system is fully erased.
108/// Inspect with: `cargo rustc --release --lib -- --emit=llvm-ir`
109/// then search for `zero_overhead_butterfly` in the .ll file.
110#[no_mangle]
111#[inline(never)]
112pub fn zero_overhead_butterfly(data: data::PerLane<i32>) -> i32 {
113    let warp: Warp<All> = Warp::kernel_entry();
114    // Shuffle XOR 16: exchange with partner 16 lanes away
115    let step1 = warp.shuffle_xor(data, 16);
116    // Shuffle XOR 8
117    let step2 = warp.shuffle_xor(step1, 8);
118    // Shuffle XOR 4
119    let step3 = warp.shuffle_xor(step2, 4);
120    // Shuffle XOR 2
121    let step4 = warp.shuffle_xor(step3, 2);
122    // Shuffle XOR 1
123    let step5 = warp.shuffle_xor(step4, 1);
124    // Final reduction
125    warp.reduce_sum(step5).get()
126}
127
128/// Diverge-merge round trip: the type system's core mechanism.
129///
130/// In optimized LLVM IR, this compiles to a no-op (returns input unchanged).
131/// The diverge, merge, and all warp handles are completely erased.
132#[no_mangle]
133#[inline(never)]
134pub fn zero_overhead_diverge_merge(data: data::PerLane<i32>) -> data::PerLane<i32> {
135    let warp: Warp<All> = Warp::kernel_entry();
136    let (evens, odds) = warp.diverge_even_odd();
137    let _merged: Warp<All> = merge(evens, odds);
138    data // diverge/merge is pure type-level — data passes through unchanged
139}
140
141// ============================================================================
142// GpuValue trait
143// ============================================================================
144
145/// Marker trait for types that can live in GPU registers.
146///
147/// Requires `Copy` (registers are value types), `Send + Sync` (cross-lane),
148/// `Default` (inactive lanes need a value), and `'static` (no borrows).
149///
150/// Sealed: only primitive GPU types implement this trait. External crates
151/// cannot add implementations, ensuring `PerLane<T>` and `Uniform<T>`
152/// only wrap types with known GPU register semantics.
153pub trait GpuValue: active_set::sealed::Sealed + Copy + Send + Sync + Default + 'static {}
154
155#[allow(private_interfaces)]
156impl active_set::sealed::Sealed for i32 {
157    fn _sealed() -> active_set::sealed::SealToken {
158        active_set::sealed::SealToken
159    }
160}
161impl GpuValue for i32 {}
162#[allow(private_interfaces)]
163impl active_set::sealed::Sealed for u32 {
164    fn _sealed() -> active_set::sealed::SealToken {
165        active_set::sealed::SealToken
166    }
167}
168impl GpuValue for u32 {}
169#[allow(private_interfaces)]
170impl active_set::sealed::Sealed for f32 {
171    fn _sealed() -> active_set::sealed::SealToken {
172        active_set::sealed::SealToken
173    }
174}
175impl GpuValue for f32 {}
176#[allow(private_interfaces)]
177impl active_set::sealed::Sealed for i64 {
178    fn _sealed() -> active_set::sealed::SealToken {
179        active_set::sealed::SealToken
180    }
181}
182impl GpuValue for i64 {}
183#[allow(private_interfaces)]
184impl active_set::sealed::Sealed for u64 {
185    fn _sealed() -> active_set::sealed::SealToken {
186        active_set::sealed::SealToken
187    }
188}
189impl GpuValue for u64 {}
190#[allow(private_interfaces)]
191impl active_set::sealed::Sealed for f64 {
192    fn _sealed() -> active_set::sealed::SealToken {
193        active_set::sealed::SealToken
194    }
195}
196impl GpuValue for f64 {}
197#[allow(private_interfaces)]
198impl active_set::sealed::Sealed for bool {
199    fn _sealed() -> active_set::sealed::SealToken {
200        active_set::sealed::SealToken
201    }
202}
203impl GpuValue for bool {}
204
205// ============================================================================
206// Re-exports — flat access to the most-used types
207// ============================================================================
208
209pub use active_set::{
210    ActiveSet, All, CanDiverge, ComplementOf, ComplementWithin, Empty, Even, EvenHigh, EvenLow,
211    HighHalf, Lane0, LowHalf, NotLane0, Odd, OddHigh, OddLow,
212};
213pub use block::{BlockId, SharedRegion, ThreadId};
214pub use data::{LaneId, PerLane, Role, SingleLane, Uniform, WarpId};
215pub use dynamic::DynDiverge;
216pub use fence::{Fenced, FullWrite, GlobalRegion, PartialWrite, Unwritten, WriteState};
217pub use gradual::DynWarp;
218pub use merge::{merge, merge_within};
219pub use platform::{CpuSimd, GpuWarp32, GpuWarp64, Platform, SimdVector};
220pub use shuffle::{
221    BallotResult, Compose, HasDual, Identity, Permutation, RotateDown, RotateUp, ShuffleSafe, Xor,
222};
223pub use tile::Tile;
224pub use warp::Warp;
225pub use warp_types_kernel::warp_kernel;
226
227/// Convenience prelude — import everything needed for typical usage.
228///
229/// ```rust
230/// use warp_types::prelude::*;
231///
232/// let warp: Warp<All> = Warp::kernel_entry();
233/// let (evens, odds) = warp.diverge_even_odd();
234/// let merged: Warp<All> = merge(evens, odds);
235/// ```
236pub mod prelude {
237    pub use crate::data;
238    pub use crate::gpu::GpuShuffle;
239    pub use crate::{
240        merge, merge_within, ActiveSet, All, CanDiverge, ComplementOf, ComplementWithin,
241        DynDiverge, DynWarp, Empty, Even, EvenHigh, EvenLow, GpuValue, HighHalf, Lane0, LowHalf,
242        NotLane0, Odd, OddHigh, OddLow, PerLane, SingleLane, Tile, Uniform, Warp,
243    };
244}
warp_types/lib.rs

warp_types/
lib.rs