[−][src]Crate accel
GPGPU framework for Rust based on CUDA Driver API
Basic Examples
Vector Add
use accel::*; use accel_derive::kernel; #[kernel] unsafe fn add(a: *const f64, b: *const f64, c: *mut f64, n: usize) { let i = accel_core::index(); if (i as usize) < n { *c.offset(i) = *a.offset(i) + *b.offset(i); } } fn main() -> error::Result<()> { let device = Device::nth(0)?; let ctx = device.create_context(); // Allocate memories on GPU let n = 32; let mut a = DeviceMemory::<f64>::new(&ctx, n); let mut b = DeviceMemory::<f64>::new(&ctx, n); let mut c = DeviceMemory::<f64>::new(&ctx, n); // Accessible from CPU as usual Rust slice (though this will be slow) for i in 0..n { a[i] = i as f64; b[i] = 2.0 * i as f64; } println!("a = {:?}", a.as_slice()); println!("b = {:?}", b.as_slice()); // Launch kernel synchronously add(&ctx, 1 /* grid */, n /* block */, &(&a.as_ptr(), &b.as_ptr(), &c.as_mut_ptr(), &n) ).expect("Kernel call failed"); println!("c = {:?}", c.as_slice()); Ok(()) }
Assertion on GPU
use accel::*; use accel_derive::kernel; #[kernel] fn assert() { accel_core::assert_eq!(1 + 2, 4); // will fail } fn main() -> error::Result<()> { let device = Device::nth(0)?; let ctx = device.create_context(); let result = assert(&ctx, 1 /* grid */, 4 /* block */, &()); assert!(result.is_err()); // assertion failed Ok(()) }
Print from GPU
use accel::*;
use accel_derive::kernel;

// Kernel that prints a greeting containing the value returned by `accel_core::index()`.
#[kernel]
pub fn print() {
    let idx = accel_core::index();
    accel_core::println!("Hello from {}", idx);
}

fn main() -> error::Result<()> {
    let gpu = Device::nth(0)?;
    let context = gpu.create_context();

    // Launch with grid size 1 and block size 4; a launch error is propagated to the caller.
    print(&context, 1, 4, &())?;
    Ok(())
}
Advanced Examples
Get compiled PTX as String
The proc-macro `#[kernel]` creates a submodule `add::` in addition to a function `add`.
The kernel's Rust code is compiled into a PTX string using rustc's `nvptx64-nvidia-cuda` toolchain.
The generated PTX string is embedded into the proc-macro output as `{kernel_name}::PTX_STR`.
use accel_derive::kernel; #[kernel] unsafe fn add(a: *const f64, b: *const f64, c: *mut f64, n: usize) { let i = accel_core::index(); if (i as usize) < n { *c.offset(i) = *a.offset(i) + *b.offset(i); } } fn main() { // PTX assembler code is embedded as `add::PTX_STR` println!("{}", add::PTX_STR); }
Asynchronous launch
`#[kernel]` creates an `assert::Module` type definition which implements the `Launchable` trait. This struct reads `PTX_STR` using `Module`.
use accel::*; use accel_derive::kernel; #[kernel] fn assert() { accel_core::assert_eq!(1 + 2, 4); } fn main() -> error::Result<()> { let device = Device::nth(0)?; let ctx = device.create_context(); let stream = Stream::new(&ctx); let module = assert::Module::new(&ctx)?; module.stream_launch(&stream, 1, 4, &())?; // lanch will succeed assert!(stream.sync().is_err()); // assertion failed is detected in next sync Ok(()) }
Re-exports
pub use array::*; |
pub use device::*; |
pub use linker::*; |
pub use memory::*; |
pub use module::*; |
pub use stream::*; |
Modules
array | |
device | |
error | |
linker | CUDA JIT compiler and Linkers |
memory | Device and Host memory handlers |
module | CUDA Module (i.e. loaded PTX or cubin) |
stream |
Macros
ffi_call | |
ffi_new |