Struct cudarc::driver::safe::CudaDevice
source · pub struct CudaDevice { /* private fields */ }
Expand description
A wrapper around sys::CUdevice, sys::CUcontext, sys::CUstream, and CudaFunction.
let dev = CudaDevice::new(0).unwrap();
Safety
- impl Drop to call all the corresponding resource cleanup methods
- Doesn’t impl clone, so you can’t have multiple device pointers hanging around.
- Any allocations enforce that self is an Arc, meaning no allocation can outlive the CudaDevice
Implementations§
source§impl CudaDevice
impl CudaDevice
sourcepub unsafe fn upgrade_device_ptr<T>(
self: &Arc<Self>,
cu_device_ptr: CUdeviceptr,
len: usize
) -> CudaSlice<T>
pub unsafe fn upgrade_device_ptr<T>( self: &Arc<Self>, cu_device_ptr: CUdeviceptr, len: usize ) -> CudaSlice<T>
Creates a CudaSlice from a sys::CUdeviceptr. Useful in conjunction with
CudaSlice::leak()
.
Safety
- cu_device_ptr must be a valid allocation
- cu_device_ptr must have space for len * std::mem::size_of::<T>() bytes
- The memory may not be valid for type T, so some sort of memset operation should be called on the memory.
source§impl CudaDevice
impl CudaDevice
sourcepub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
pub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
Allocates an empty CudaSlice with 0 length.
sourcepub unsafe fn alloc<T: DeviceRepr>(
self: &Arc<Self>,
len: usize
) -> Result<CudaSlice<T>, DriverError>
pub unsafe fn alloc<T: DeviceRepr>( self: &Arc<Self>, len: usize ) -> Result<CudaSlice<T>, DriverError>
Allocates device memory and increments the reference counter of CudaDevice.
Safety
This is unsafe because the device memory is unset after this call.
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
sourcepub fn alloc_zeros<T: ValidAsZeroBits + DeviceRepr>(
self: &Arc<Self>,
len: usize
) -> Result<CudaSlice<T>, DriverError>
pub fn alloc_zeros<T: ValidAsZeroBits + DeviceRepr>( self: &Arc<Self>, len: usize ) -> Result<CudaSlice<T>, DriverError>
Allocates device memory with no associated host memory, and memsets the device memory to all 0s.
Safety
T
is marked as ValidAsZeroBits, so the device memory is valid to use- Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn memset_zeros<T: ValidAsZeroBits + DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
dst: &mut Dst
) -> Result<(), DriverError>
pub fn memset_zeros<T: ValidAsZeroBits + DeviceRepr, Dst: DevicePtrMut<T>>( self: &Arc<Self>, dst: &mut Dst ) -> Result<(), DriverError>
Sets all memory to 0 asynchronously.
Safety
T
is marked as ValidAsZeroBits, so the device memory is valid to use- Self is
Arc<Self>
, and this method increments the rc for self
sourcepub fn dtod_copy<T: DeviceRepr, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst
) -> Result<(), DriverError>
pub fn dtod_copy<T: DeviceRepr, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &Src, dst: &mut Dst ) -> Result<(), DriverError>
Device to device copy (safe version of result::memcpy_dtod_async).
Panics
If the length of the two values are different
Safety
- We are guaranteed that
src
anddst
are pointers to the same underlying typeT
- Since they are both references, they can’t have been freed
- Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn htod_copy<T: Unpin + DeviceRepr>(
self: &Arc<Self>,
src: Vec<T>
) -> Result<CudaSlice<T>, DriverError>
pub fn htod_copy<T: Unpin + DeviceRepr>( self: &Arc<Self>, src: Vec<T> ) -> Result<CudaSlice<T>, DriverError>
Takes ownership of the host data and copies it to device data asynchronously.
Safety
- Since
src
is owned by this function, it is safe to copy data. Any actions executed after this will take place after the data has been successfully copied. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn htod_copy_into<T: DeviceRepr + Unpin>(
self: &Arc<Self>,
src: Vec<T>,
dst: &mut CudaSlice<T>
) -> Result<(), DriverError>
pub fn htod_copy_into<T: DeviceRepr + Unpin>( self: &Arc<Self>, src: Vec<T>, dst: &mut CudaSlice<T> ) -> Result<(), DriverError>
Takes ownership of the host data and copies it to device data asynchronously.
Safety
- Since
src
is owned by this function, it is safe to copy data. Any actions executed after this will take place after the data has been successfully copied. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn htod_sync_copy<T: DeviceRepr>(
self: &Arc<Self>,
src: &[T]
) -> Result<CudaSlice<T>, DriverError>
pub fn htod_sync_copy<T: DeviceRepr>( self: &Arc<Self>, src: &[T] ) -> Result<CudaSlice<T>, DriverError>
Allocates new device memory and synchronously copies data from src
into the new allocation.
If you want an asynchronous copy, see CudaDevice::htod_copy().
Safety
- Since this function doesn’t own
src
it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
sourcepub fn htod_sync_copy_into<T: DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &[T],
dst: &mut Dst
) -> Result<(), DriverError>
pub fn htod_sync_copy_into<T: DeviceRepr, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &[T], dst: &mut Dst ) -> Result<(), DriverError>
Synchronously copies data from src
into the already-allocated dst.
If you want an asynchronous copy, see CudaDevice::htod_copy().
Panics
If the lengths of slices are not equal, this method panics.
Safety
- Since this function doesn’t own
src
it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn dtoh_sync_copy<T: DeviceRepr>(
self: &Arc<Self>,
src: &CudaSlice<T>
) -> Result<Vec<T>, DriverError>
pub fn dtoh_sync_copy<T: DeviceRepr>( self: &Arc<Self>, src: &CudaSlice<T> ) -> Result<Vec<T>, DriverError>
Synchronously copies device memory into host memory.
Unlike CudaDevice::dtoh_sync_copy_into
this returns a Vec<T>
.
Safety
- Since this function doesn’t own
dst
(after returning) it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
sourcepub fn dtoh_sync_copy_into<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut [T]
) -> Result<(), DriverError>
pub fn dtoh_sync_copy_into<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, dst: &mut [T] ) -> Result<(), DriverError>
Synchronously copies device memory into host memory
Use CudaDevice::dtoh_sync_copy
if you need Vec<T>
and can’t provide
a correctly sized slice.
Panics
If the lengths of slices are not equal, this method panics.
Safety
- Since this function doesn’t own
dst
it is executed synchronously. - Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
More examples
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
sourcepub fn sync_reclaim<T: Clone + Default + DeviceRepr + Unpin>(
self: &Arc<Self>,
src: CudaSlice<T>
) -> Result<Vec<T>, DriverError>
pub fn sync_reclaim<T: Clone + Default + DeviceRepr + Unpin>( self: &Arc<Self>, src: CudaSlice<T> ) -> Result<Vec<T>, DriverError>
Synchronously de-allocates src
and converts it into its host value.
You can just drop the slice if you don’t need the host data.
Safety
- Self is
Arc<Self>
, and this method increments the rc for self
Examples found in repository?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
More examples
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn synchronize(self: &Arc<Self>) -> Result<(), DriverError>
pub fn synchronize(self: &Arc<Self>) -> Result<(), DriverError>
Synchronizes the stream.
source§impl CudaDevice
impl CudaDevice
sourcepub fn new(ordinal: usize) -> Result<Arc<Self>, DriverError>
pub fn new(ordinal: usize) -> Result<Arc<Self>, DriverError>
Creates a new CudaDevice on device index ordinal
.
Examples found in repository?
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// unsafe initialization of unset memory
let _: CudaSlice<f32> = unsafe { dev.alloc::<f32>(10) }?;
// this will have memory initialized as 0
let _: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
// initialize with a rust vec
let _: CudaSlice<usize> = dev.htod_copy(vec![0; 10])?;
// or finally, initialize with a slice. this is synchronous though.
let _: CudaSlice<u32> = dev.htod_sync_copy(&[1, 2, 3])?;
Ok(())
}
More examples
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(PTX_SRC).unwrap();
dev.load_ptx(ptx, "module", &["my_custom_kernel"])?;
// try changing some of these values to see a device assert
let thing = MyCoolRustStruct {
a: 1.0,
b: 2.34,
c: 57,
d: 420,
};
let f = dev.get_func("module", "my_custom_kernel").unwrap();
// since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
unsafe { f.launch(LaunchConfig::for_num_elems(1), (thing,)) }?;
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let a: CudaSlice<f64> = dev.alloc_zeros::<f64>(10)?;
let mut b = dev.alloc_zeros::<f64>(10)?;
// you can do device to device copies of course
dev.dtod_copy(&a, &mut b)?;
// but also host to device copys with already allocated buffers
dev.htod_copy_into(vec![2.0; 10], &mut b)?;
// if you want to use slices, you can do synchronous copy
dev.htod_sync_copy_into(&[3.0; 10], &mut b)?;
// you can transfer back using reclaim:
let mut a_host: Vec<f64> = dev.sync_reclaim(a)?;
assert_eq!(a_host, [0.0; 10]);
// or copy back without losing ownership:
let b_host = dev.dtoh_sync_copy(&b)?;
assert_eq!(b_host, [3.0; 10]);
// or use a slice
dev.dtoh_sync_copy_into(&b, &mut a_host)?;
assert_eq!(a_host, b_host);
Ok(())
}
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn ordinal(&self) -> usize
pub fn ordinal(&self) -> usize
Get the ordinal
index of this CudaDevice.
sourcepub fn cu_device(&self) -> &CUdevice
pub fn cu_device(&self) -> &CUdevice
Get the underlying sys::CUdevice of this CudaDevice.
Safety
While this function is marked as safe, actually using the returned object is unsafe.
You must not free/release the device pointer, as it is still owned by the CudaDevice.
sourcepub fn cu_primary_ctx(&self) -> &CUcontext
pub fn cu_primary_ctx(&self) -> &CUcontext
Get the underlying sys::CUcontext of this CudaDevice.
Safety
While this function is marked as safe, actually using the returned object is unsafe.
You must not free/release the context pointer, as it is still owned by the CudaDevice.
sourcepub fn cu_stream(&self) -> &CUstream
pub fn cu_stream(&self) -> &CUstream
Get the underlying sys::CUstream that this CudaDevice executes all of its work on.
Safety
While this function is marked as safe, actually using the returned object is unsafe.
You must not free/release the stream pointer, as it is still owned by the CudaDevice.
source§impl CudaDevice
impl CudaDevice
sourcepub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, DriverError>
pub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, DriverError>
Allocates a new stream that can execute kernels concurrently to the default stream.
The synchronization with default stream happens in code order. See CudaStream docstring.
This stream synchronizes in the following way:
- On creation it adds a wait for any existing work on the default work stream to complete
- On drop it adds a wait for any existing work on Self to complete to the default stream.
Examples found in repository?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
sourcepub fn wait_for(
self: &Arc<Self>,
stream: &CudaStream
) -> Result<(), DriverError>
pub fn wait_for( self: &Arc<Self>, stream: &CudaStream ) -> Result<(), DriverError>
Forces CudaStream to drop, causing the default work stream to block on streams
completion.
This is asynchronous with respect to the host.
Examples found in repository?
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
source§impl CudaDevice
impl CudaDevice
sourcepub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool
pub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool
Whether a module and function are currently loaded into the device.
sourcepub fn get_func(
self: &Arc<Self>,
module_name: &str,
func_name: &str
) -> Option<CudaFunction>
pub fn get_func( self: &Arc<Self>, module_name: &str, func_name: &str ) -> Option<CudaFunction>
Retrieves a CudaFunction that was registered under module_name
and func_name
.
Examples found in repository?
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(PTX_SRC).unwrap();
dev.load_ptx(ptx, "module", &["my_custom_kernel"])?;
// try changing some of these values to see a device assert
let thing = MyCoolRustStruct {
a: 1.0,
b: 2.34,
c: 57,
d: 420,
};
let f = dev.get_func("module", "my_custom_kernel").unwrap();
// since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
unsafe { f.launch(LaunchConfig::for_num_elems(1), (thing,)) }?;
Ok(())
}
More examples
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
fn main() -> Result<(), DriverError> {
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{
// Option 1: use the same device on each thread.
// This requires calling the CudaDevice::bind_to_thread() method.
// Note that all kernels are submitted to the same stream/context,
// so the kernels will still execute sequentially in the order
// they are submitted to the gpu.
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(KERNEL_SRC).unwrap();
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
// explicit borrow so we don't have to re-clone the device for each thread
let dev = &dev;
thread::scope(move |s| {
for i in 0..10i32 {
s.spawn(move || {
// NOTE: this is the important call to have
// without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
dev.bind_to_thread()?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i,)) }
});
}
});
}
{
// Option 2: create a new device in each thread
// This requires loading the PTX for each device, since they won't
// share loaded modules on the Rust side of things.
let ptx = compile_ptx(KERNEL_SRC).unwrap();
thread::scope(|s| {
for i in 0..10i32 {
let ptx = ptx.clone();
s.spawn(move || {
let dev = CudaDevice::new(0)?;
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i + 100,)) }
});
}
});
}
Ok(())
}
source§impl CudaDevice
impl CudaDevice
sourcepub fn load_ptx(
self: &Arc<Self>,
ptx: Ptx,
module_name: &str,
func_names: &[&'static str]
) -> Result<(), DriverError>
pub fn load_ptx( self: &Arc<Self>, ptx: Ptx, module_name: &str, func_names: &[&'static str] ) -> Result<(), DriverError>
Dynamically load a set of crate::driver::CudaFunction from a jit compiled ptx.
- `ptx` contains the compiled ptx
- `module_name` is a unique identifier used to access the module later on with CudaDevice::get_func()
- `func_names` is a slice of function names to load into the module during build.
Examples found in repository?
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(PTX_SRC).unwrap();
dev.load_ptx(ptx, "module", &["my_custom_kernel"])?;
// try changing some of these values to see a device assert
let thing = MyCoolRustStruct {
a: 1.0,
b: 2.34,
c: 57,
d: 420,
};
let f = dev.get_func("module", "my_custom_kernel").unwrap();
// since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
unsafe { f.launch(LaunchConfig::for_num_elems(1), (thing,)) }?;
Ok(())
}
More examples
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
// You can load a function from a pre-compiled PTX like so:
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
// and then retrieve the function with `get_func`
let f = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
unsafe { f.launch(cfg, (&mut b_dev, &a_dev, n as i32)) }?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
fn main() -> Result<(), DriverError> {
let start = std::time::Instant::now();
let ptx = compile_ptx(PTX_SRC).unwrap();
println!("Compilation succeeded in {:?}", start.elapsed());
let dev = CudaDevice::new(0)?;
println!("Built in {:?}", start.elapsed());
dev.load_ptx(ptx, "matmul", &["matmul"])?;
let f = dev.get_func("matmul", "matmul").unwrap();
println!("Loaded in {:?}", start.elapsed());
let a_host = [1.0f32, 2.0, 3.0, 4.0];
let b_host = [1.0f32, 2.0, 3.0, 4.0];
let mut c_host = [0.0f32; 4];
let a_dev = dev.htod_sync_copy(&a_host)?;
let b_dev = dev.htod_sync_copy(&b_host)?;
let mut c_dev = dev.htod_sync_copy(&c_host)?;
println!("Copied in {:?}", start.elapsed());
let cfg = LaunchConfig {
block_dim: (2, 2, 1),
grid_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe { f.launch(cfg, (&a_dev, &b_dev, &mut c_dev, 2i32)) }?;
dev.dtoh_sync_copy_into(&c_dev, &mut c_host)?;
println!("Found {:?} in {:?}", c_host, start.elapsed());
Ok(())
}
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
fn main() -> Result<(), DriverError> {
let dev = CudaDevice::new(0)?;
dev.load_ptx(Ptx::from_file("./examples/sin.ptx"), "sin", &["sin_kernel"])?;
let n = 3;
let cfg = LaunchConfig::for_num_elems(n);
let a_host = [1.0, 2.0, 3.0];
let a_dev = dev.htod_copy(a_host.into())?;
let mut b_dev = a_dev.clone();
// create a stream with `fork_default_stream()`
// This synchronizes with the default stream, so since
// we put this call **after** the `htod_copy` & `clone` above,
// cuda will complete those orders **before** work on this stream
// can start.
let stream = dev.fork_default_stream()?;
let f = dev.get_func("sin", "sin_kernel").unwrap();
// we launch it differently too
unsafe { f.launch_on_stream(&stream, cfg, (&mut b_dev, &a_dev, n as i32)) }?;
// and we must join with the default work stream in order for copies
// to work correctly.
// NOTE: this is actually async with respect to the host!
dev.wait_for(&stream)?;
let a_host_2 = dev.sync_reclaim(a_dev)?;
let b_host = dev.sync_reclaim(b_dev)?;
println!("Found {:?}", b_host);
println!("Expected {:?}", a_host.map(f32::sin));
assert_eq!(&a_host, a_host_2.as_slice());
Ok(())
}
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
fn main() -> Result<(), DriverError> {
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{
// Option 1: use the same device on each thread.
// This requires calling the CudaDevice::bind_to_thread() method.
// Note that all kernels are submitted to the same stream/context,
// so the kernels will still execute sequentially in the order
// they are submitted to the gpu.
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(KERNEL_SRC).unwrap();
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
// explicit borrow so we don't have to re-clone the device for each thread
let dev = &dev;
thread::scope(move |s| {
for i in 0..10i32 {
s.spawn(move || {
// NOTE: this is the important call to have
// without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
dev.bind_to_thread()?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i,)) }
});
}
});
}
{
// Option 2: create a new device in each thread
// This requires loading the PTX for each device, since they won't
// share loaded modules on the Rust side of things.
let ptx = compile_ptx(KERNEL_SRC).unwrap();
thread::scope(|s| {
for i in 0..10i32 {
let ptx = ptx.clone();
s.spawn(move || {
let dev = CudaDevice::new(0)?;
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i + 100,)) }
});
}
});
}
Ok(())
}
source§impl CudaDevice
impl CudaDevice
sourcepub fn bind_to_thread(self: &Arc<Self>) -> Result<(), DriverError>
pub fn bind_to_thread(self: &Arc<Self>) -> Result<(), DriverError>
Binds the device to the calling thread. You must call this before using the device on a separate thread!
Examples found in repository?
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
fn main() -> Result<(), DriverError> {
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{
// Option 1: use the same device on each thread.
// This requires calling the CudaDevice::bind_to_thread() method.
// Note that all kernels are submitted to the same stream/context,
// so the kernels will still execute sequentially in the order
// they are submitted to the gpu.
let dev = CudaDevice::new(0)?;
let ptx = compile_ptx(KERNEL_SRC).unwrap();
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
// explicit borrow so we don't have to re-clone the device for each thread
let dev = &dev;
thread::scope(move |s| {
for i in 0..10i32 {
s.spawn(move || {
// NOTE: this is the important call to have
// without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
dev.bind_to_thread()?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i,)) }
});
}
});
}
{
// Option 2: create a new device in each thread
// This requires loading the PTX for each device, since they won't
// share loaded modules on the Rust side of things.
let ptx = compile_ptx(KERNEL_SRC).unwrap();
thread::scope(|s| {
for i in 0..10i32 {
let ptx = ptx.clone();
s.spawn(move || {
let dev = CudaDevice::new(0)?;
dev.load_ptx(ptx, "kernel", &["hello_world"])?;
let f = dev.get_func("kernel", "hello_world").unwrap();
unsafe { f.launch(cfg, (i + 100,)) }
});
}
});
}
Ok(())
}