Struct cudarc::driver::safe::CudaDevice
source · pub struct CudaDevice { /* private fields */ }
Expand description
A wrapper around sys::CUdevice, sys::CUcontext, sys::CUstream, and CudaFunction.
let dev = CudaDevice::new(0).unwrap();
Safety
- impl Drop to call all the corresponding resource cleanup methods
- Doesn’t impl clone, so you can’t have multiple device pointers hanging around.
- Any allocations enforce that self is an Arc, meaning no allocation can outlive the CudaDevice
Implementations§
source§impl CudaDevice
impl CudaDevice
sourcepub unsafe fn upgrade_device_ptr<T>(
self: &Arc<Self>,
cu_device_ptr: CUdeviceptr,
len: usize
) -> CudaSlice<T>
pub unsafe fn upgrade_device_ptr<T>( self: &Arc<Self>, cu_device_ptr: CUdeviceptr, len: usize ) -> CudaSlice<T>
Creates a CudaSlice from a sys::CUdeviceptr. Useful in conjunction with
CudaSlice::leak()
.
Safety
- cu_device_ptr must be a valid allocation
- cu_device_ptr must have space for len * std::mem::size_of::<T>() bytes
- The memory may not be valid for type T, so some sort of memset operation should be called on the memory.
source§impl CudaDevice
impl CudaDevice
sourcepub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
pub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
Allocates an empty CudaSlice with 0 length.
sourcepub unsafe fn alloc<T: DeviceRepr>(
self: &Arc<Self>,
len: usize
) -> Result<CudaSlice<T>, DriverError>
pub unsafe fn alloc<T: DeviceRepr>( self: &Arc<Self>, len: usize ) -> Result<CudaSlice<T>, DriverError>
Allocates device memory and increments the reference counter of CudaDevice.
Safety
This is unsafe because the device memory is unset after this call.
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
sourcepub fn alloc_zeros<T: ValidAsZeroBits + DeviceRepr>(
self: &Arc<Self>,
len: usize
) -> Result<CudaSlice<T>, DriverError>
pub fn alloc_zeros<T: ValidAsZeroBits + DeviceRepr>( self: &Arc<Self>, len: usize ) -> Result<CudaSlice<T>, DriverError>
Allocates device memory with no associated host memory, and memsets the device memory to all 0s.
Safety
T
is marked as ValidAsZeroBits, so the device memory is valid to use- Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn memset_zeros<T: ValidAsZeroBits + DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
dst: &mut Dst
) -> Result<(), DriverError>
pub fn memset_zeros<T: ValidAsZeroBits + DeviceRepr, Dst: DevicePtrMut<T>>( self: &Arc<Self>, dst: &mut Dst ) -> Result<(), DriverError>
Sets all memory to 0 asynchronously.
Safety
T
is marked as ValidAsZeroBits, so the device memory is valid to use- Self is
Arc<Self>
, and this method increments the rc for self
sourcepub fn dtod_copy<T: DeviceRepr, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst
) -> Result<(), DriverError>
pub fn dtod_copy<T: DeviceRepr, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &Src, dst: &mut Dst ) -> Result<(), DriverError>
Device to device copy (safe version of result::memcpy_dtod_async).
Panics
If the length of the two values are different
Safety
- We are guaranteed that
src
anddst
are pointers to the same underlying typeT
- Since they are both references, they can’t have been freed
- Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn htod_copy<T: Unpin + DeviceRepr>(
self: &Arc<Self>,
src: Vec<T>
) -> Result<CudaSlice<T>, DriverError>
pub fn htod_copy<T: Unpin + DeviceRepr>( self: &Arc<Self>, src: Vec<T> ) -> Result<CudaSlice<T>, DriverError>
Takes ownership of the host data and copies it to device data asynchronously.
Safety
- Since
src
is owned by this function, it is safe to copy data. Any actions executed after this will take place after the data has been successfully copied. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn htod_copy_into<T: DeviceRepr + Unpin>(
self: &Arc<Self>,
src: Vec<T>,
dst: &mut CudaSlice<T>
) -> Result<(), DriverError>
pub fn htod_copy_into<T: DeviceRepr + Unpin>( self: &Arc<Self>, src: Vec<T>, dst: &mut CudaSlice<T> ) -> Result<(), DriverError>
Takes ownership of the host data and copies it to device data asynchronously.
Safety
- Since
src
is owned by this function, it is safe to copy data. Any actions executed after this will take place after the data has been successfully copied. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn htod_sync_copy<T: DeviceRepr>(
self: &Arc<Self>,
src: &[T]
) -> Result<CudaSlice<T>, DriverError>
pub fn htod_sync_copy<T: DeviceRepr>( self: &Arc<Self>, src: &[T] ) -> Result<CudaSlice<T>, DriverError>
Allocates new device memory and synchronously copies data from src
into the new allocation.
If you want an asynchronous copy, see CudaDevice::htod_copy().
Safety
- Since this function doesn’t own
src
it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
sourcepub fn htod_sync_copy_into<T: DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &[T],
dst: &mut Dst
) -> Result<(), DriverError>
pub fn htod_sync_copy_into<T: DeviceRepr, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &[T], dst: &mut Dst ) -> Result<(), DriverError>
Synchronously copies data from src
into the already-allocated dst.
If you want an asynchronous copy, see CudaDevice::htod_copy().
Panics
If the lengths of slices are not equal, this method panics.
Safety
- Since this function doesn’t own
src
it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn dtoh_sync_copy<T: DeviceRepr>(
self: &Arc<Self>,
src: &CudaSlice<T>
) -> Result<Vec<T>, DriverError>
pub fn dtoh_sync_copy<T: DeviceRepr>( self: &Arc<Self>, src: &CudaSlice<T> ) -> Result<Vec<T>, DriverError>
Synchronously copies device memory into host memory.
Unlike CudaDevice::dtoh_sync_copy_into
this returns a Vec<T>
.
Safety
- Since this function doesn’t own
dst
(after returning) it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn dtoh_sync_copy_into<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut [T]
) -> Result<(), DriverError>
pub fn dtoh_sync_copy_into<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, dst: &mut [T] ) -> Result<(), DriverError>
Synchronously copies device memory into host memory
Use CudaDevice::dtoh_sync_copy
if you need Vec<T>
and can’t provide
a correctly sized slice.
Panics
If the lengths of slices are not equal, this method panics.
Safety
- Since this function doesn’t own
dst
it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
More examples
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
sourcepub fn sync_reclaim<T: Clone + Default + DeviceRepr + Unpin>(
self: &Arc<Self>,
src: CudaSlice<T>
) -> Result<Vec<T>, DriverError>
pub fn sync_reclaim<T: Clone + Default + DeviceRepr + Unpin>( self: &Arc<Self>, src: CudaSlice<T> ) -> Result<Vec<T>, DriverError>
Synchronously de-allocates src
and converts it into its host value.
You can just drop the slice if you don’t need the host data.
Safety
- Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
More examples
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn synchronize(self: &Arc<Self>) -> Result<(), DriverError>
pub fn synchronize(self: &Arc<Self>) -> Result<(), DriverError>
Synchronizes the stream.
source§impl CudaDevice
impl CudaDevice
sourcepub fn new(ordinal: usize) -> Result<Arc<Self>, DriverError>
pub fn new(ordinal: usize) -> Result<Arc<Self>, DriverError>
Creates a new CudaDevice on device index ordinal
.
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(PTX_SRC).unwrap();
dev.load_ptx(ptx, "module", &["my_custom_kernel"])?;
// try changing some of these values to see a device assert
let thing = MyCoolRustStruct {
a: 1.0,
b: 2.34,
c: 57,
d: 420,
};
let f = dev.get_func("module", "my_custom_kernel").unwrap();
// since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
unsafe { f.launch(LaunchConfig::for_num_elems(1), (thing,)) }?;
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn ordinal(&self) -> usize
pub fn ordinal(&self) -> usize
Get the ordinal
index of this CudaDevice.
sourcepub fn cu_device(&self) -> &CUdevice
pub fn cu_device(&self) -> &CUdevice
Get the underlying sys::CUdevice of this CudaDevice.
Safety
While this function is marked as safe, actually using the returned object is unsafe.
You must not free/release the device pointer, as it is still owned by the CudaDevice.
sourcepub fn cu_primary_ctx(&self) -> &CUcontext
pub fn cu_primary_ctx(&self) -> &CUcontext
Get the underlying sys::CUcontext of this CudaDevice.
Safety
While this function is marked as safe, actually using the returned object is unsafe.
You must not free/release the context pointer, as it is still owned by the CudaDevice.
sourcepub fn cu_stream(&self) -> &CUstream
pub fn cu_stream(&self) -> &CUstream
Get the underlying sys::CUstream that this CudaDevice executes all of its work on.
Safety
While this function is marked as safe, actually using the returned object is unsafe.
You must not free/release the stream pointer, as it is still owned by the CudaDevice.
source§impl CudaDevice
impl CudaDevice
sourcepub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, DriverError>
pub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, DriverError>
Allocates a new stream that can execute kernels concurrently to the default stream.
The synchronization with default stream happens in code order. See CudaStream docstring.
This stream synchronizes in the following way:
- On creation it adds a wait for any existing work on the default work stream to complete
- On drop it adds a wait for any existing work on Self to complete to the default stream.
Examples found in repository?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn wait_for(
self: &Arc<Self>,
stream: &CudaStream
) -> Result<(), DriverError>
pub fn wait_for( self: &Arc<Self>, stream: &CudaStream ) -> Result<(), DriverError>
Forces CudaStream to drop, causing the default work stream to block on streams
completion.
This is asynchronous with respect to the host.
Examples found in repository?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
source§impl CudaDevice
impl CudaDevice
sourcepub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool
pub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool
Whether a module and function are currently loaded into the device.
sourcepub fn get_func(
self: &Arc<Self>,
module_name: &str,
func_name: &str
) -> Option<CudaFunction>
pub fn get_func( self: &Arc<Self>, module_name: &str, func_name: &str ) -> Option<CudaFunction>
Retrieves a CudaFunction that was registered under module_name
and func_name
.
Examples found in repository?
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(PTX_SRC).unwrap();
dev.load_ptx(ptx, "module", &["my_custom_kernel"])?;
// try changing some of these values to see a device assert
let thing = MyCoolRustStruct {
a: 1.0,
b: 2.34,
c: 57,
d: 420,
};
let f = dev.get_func("module", "my_custom_kernel").unwrap();
// since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
unsafe { f.launch(LaunchConfig::for_num_elems(1), (thing,)) }?;
Ok(())
}
More examples
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
fn main() -> Result<(), DriverError> {
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{
// Option 1: use the same device on each thread.
// This requires calling the CudaDevice::bind_to_thread() method.
// Note that all kernels are submitted to the same stream/context,
// so the kernels will still execute sequentially in the order
// they are submitted to the gpu.
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(KERNEL_SRC).unwrap();
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
// explicit borrow so we don't have to re-clone the device for each thread
let dev = &dev;
thread::scope(move |s| {
for i in 0..10i32 {
s.spawn(move || {
// NOTE: this is the important call to have
// without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
dev.bind_to_thread()?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i,)) }
});
}
});
}
{
// Option 2: create a new device in each thread
// This requires loading the PTX for each device, since they won't
// share loaded modules on the Rust side of things.
let ptx = compile_ptx(KERNEL_SRC).unwrap();
thread::scope(|s| {
for i in 0..10i32 {
let ptx = ptx.clone();
s.spawn(move || {
let dev = CudaDevice::new(0)?;
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i + 100,)) }
});
}
});
}
Ok(())
}
source§impl CudaDevice
impl CudaDevice
sourcepub fn load_ptx(
self: &Arc<Self>,
ptx: Ptx,
module_name: &str,
func_names: &[&'static str]
) -> Result<(), DriverError>
pub fn load_ptx( self: &Arc<Self>, ptx: Ptx, module_name: &str, func_names: &[&'static str] ) -> Result<(), DriverError>
Dynamically load a set of crate::driver::CudaFunction from a jit compiled ptx.
- `ptx` contains the compiled ptx
- `module_name` is a unique identifier used to access the module later on with CudaDevice::get_func()
- `func_names` is a slice of function names to load into the module during build.
Examples found in repository?
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(PTX_SRC).unwrap();
dev.load_ptx(ptx, "module", &["my_custom_kernel"])?;
// try changing some of these values to see a device assert
let thing = MyCoolRustStruct {
a: 1.0,
b: 2.34,
c: 57,
d: 420,
};
let f = dev.get_func("module", "my_custom_kernel").unwrap();
// since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
unsafe { f.launch(LaunchConfig::for_num_elems(1), (thing,)) }?;
Ok(())
}
More examples
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
fn main() -> Result<(), DriverError> {
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{
// Option 1: use the same device on each thread.
// This requires calling the CudaDevice::bind_to_thread() method.
// Note that all kernels are submitted to the same stream/context,
// so the kernels will still execute sequentially in the order
// they are submitted to the gpu.
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(KERNEL_SRC).unwrap();
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
// explicit borrow so we don't have to re-clone the device for each thread
let dev = &dev;
thread::scope(move |s| {
for i in 0..10i32 {
s.spawn(move || {
// NOTE: this is the important call to have
// without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
dev.bind_to_thread()?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i,)) }
});
}
});
}
{
// Option 2: create a new device in each thread
// This requires loading the PTX for each device, since they won't
// share loaded modules on the Rust side of things.
let ptx = compile_ptx(KERNEL_SRC).unwrap();
thread::scope(|s| {
for i in 0..10i32 {
let ptx = ptx.clone();
s.spawn(move || {
let dev = CudaDevice::new(0)?;
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i + 100,)) }
});
}
});
}
Ok(())
}
source§impl CudaDevice
impl CudaDevice
sourcepub fn bind_to_thread(self: &Arc<Self>) -> Result<(), DriverError>
pub fn bind_to_thread(self: &Arc<Self>) -> Result<(), DriverError>
Binds the device to the calling thread. You must call this before using the device on a separate thread!
Examples found in repository?
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
fn main() -> Result<(), DriverError> {
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{
// Option 1: use the same device on each thread.
// This requires calling the CudaDevice::bind_to_thread() method.
// Note that all kernels are submitted to the same stream/context,
// so the kernels will still execute sequentially in the order
// they are submitted to the gpu.
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(KERNEL_SRC).unwrap();
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
// explicit borrow so we don't have to re-clone the device for each thread
let dev = &dev;
thread::scope(move |s| {
for i in 0..10i32 {
s.spawn(move || {
// NOTE: this is the important call to have
// without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
dev.bind_to_thread()?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i,)) }
});
}
});
}
{
// Option 2: create a new device in each thread
// This requires loading the PTX for each device, since they won't
// share loaded modules on the Rust side of things.
let ptx = compile_ptx(KERNEL_SRC).unwrap();
thread::scope(|s| {
for i in 0..10i32 {
let ptx = ptx.clone();
s.spawn(move || {
let dev = CudaDevice::new(0)?;
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i + 100,)) }
});
}
});
}
Ok(())
}