pub struct CudaStream { /* private fields */ }Expand description
A wrapper around sys::CUstream that you can schedule work on.
- Create with CudaContext::new_stream(), CudaContext::default_stream(), or CudaStream::fork().
Work done on this is asynchronous with respect to the host.
See CUDA C/C++ Streams and Concurrency See 3. Stream synchronization behavior See 6.6. Event Management See Out-of-order execution See Dependence analysis
Implementations§
Source§impl CudaStream
impl CudaStream
Sourcepub fn memcpy_ftod<'a, T: DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
fh: &'a FileHandle,
file_offset: i64,
dst: &mut Dst,
) -> Result<AsyncFileRead<'a>, CufileError>
pub fn memcpy_ftod<'a, T: DeviceRepr, Dst: DevicePtrMut<T>>( self: &Arc<Self>, fh: &'a FileHandle, file_offset: i64, dst: &mut Dst, ) -> Result<AsyncFileRead<'a>, CufileError>
Copy memory from a file into a destination buffer on the device.
The return value of this is initialized with 0, and after the operation successfully finishes on the stream, it will contain a value other than 0. See the docs for possible values.
Wrapper around cuFileReadAsync
See FileHandle::sync_read() for synchronous version.
Sourcepub fn memcpy_dtof<'a, T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
fh: &'a mut FileHandle,
file_offset: i64,
) -> Result<AsyncFileWrite<'a>, CufileError>
pub fn memcpy_dtof<'a, T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, fh: &'a mut FileHandle, file_offset: i64, ) -> Result<AsyncFileWrite<'a>, CufileError>
Copy memory from a device buffer to a file.
The return value of this is initialized with 0, and after the operation successfully finishes on the stream, it will contain a value other than 0. See the docs for possible values.
Wrapper around cuFileWriteAsync
See FileHandle::sync_write() for synchronous version.
Source§impl CudaStream
impl CudaStream
Sourcepub fn fork(&self) -> Result<Arc<Self>, DriverError>
pub fn fork(&self) -> Result<Arc<Self>, DriverError>
Creates a new stream and then makes the new stream wait on self
Examples found in repository?
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}Sourcepub fn context(&self) -> &Arc<CudaContext>
pub fn context(&self) -> &Arc<CudaContext>
The context the stream belongs to.
Sourcepub fn synchronize(&self) -> Result<(), DriverError>
pub fn synchronize(&self) -> Result<(), DriverError>
Will only block CPU if you call CudaContext::set_flags() with sys::CUctx_flags::CU_CTX_SCHED_BLOCKING_SYNC.
See cuda docs
Sourcepub fn record_event(
&self,
flags: Option<CUevent_flags>,
) -> Result<CudaEvent, DriverError>
pub fn record_event( &self, flags: Option<CUevent_flags>, ) -> Result<CudaEvent, DriverError>
Creates a new CudaEvent and records the current work in the stream to the event.
Sourcepub fn join(&self, other: &CudaStream) -> Result<(), DriverError>
pub fn join(&self, other: &CudaStream) -> Result<(), DriverError>
Ensures this stream waits for the current workload in other to complete.
This is shorthand for self.wait(other.record_event())
Source§impl CudaStream
impl CudaStream
Sourcepub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
pub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
Allocates an empty CudaSlice with 0 length.
Sourcepub unsafe fn alloc<T: DeviceRepr>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>
pub unsafe fn alloc<T: DeviceRepr>( self: &Arc<Self>, len: usize, ) -> Result<CudaSlice<T>, DriverError>
Allocates a CudaSlice with len elements of type T.
§Safety
This is unsafe because the memory is unset.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 // unsafe initialization of unset memory
8 let _: CudaSlice<f32> = unsafe { stream.alloc::<f32>(10) }?;
9
10 // this will have memory initialized as 0
11 let _: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
12
13 // initialize with slices!
14 let _: CudaSlice<usize> = stream.clone_htod(&[0; 10])?;
15 let _: CudaSlice<u32> = stream.clone_htod(&[1, 2, 3])?;
16
17 Ok(())
18}Sourcepub fn alloc_zeros<T: DeviceRepr + ValidAsZeroBits>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>
pub fn alloc_zeros<T: DeviceRepr + ValidAsZeroBits>( self: &Arc<Self>, len: usize, ) -> Result<CudaSlice<T>, DriverError>
Allocates a CudaSlice with len elements of type T. All values are zero’d out.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let size = 10;
5
6 let ctx1 = CudaContext::new(0)?;
7 let stream1 = ctx1.default_stream();
8 let a: CudaSlice<f64> = stream1.alloc_zeros::<f64>(size)?;
9
10 let ctx2 = CudaContext::new(1)?;
11 let stream2 = ctx2.default_stream();
12
13 let b = stream2.clone_dtod(&a)?;
14
15 stream2.clone_dtoh(&b)?;
16 stream1.clone_dtoh(&a)?;
17
18 Ok(())
19}More examples
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 // unsafe initialization of unset memory
8 let _: CudaSlice<f32> = unsafe { stream.alloc::<f32>(10) }?;
9
10 // this will have memory initialized as 0
11 let _: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
12
13 // initialize with slices!
14 let _: CudaSlice<usize> = stream.clone_htod(&[0; 10])?;
15 let _: CudaSlice<u32> = stream.clone_htod(&[1, 2, 3])?;
16
17 Ok(())
18}2fn main() -> Result<(), Box<dyn std::error::Error>> {
3 use std::fs;
4
5 use cudarc::{cufile::safe::Cufile, driver::CudaContext};
6
7 const N: usize = 100000;
8 let data: Vec<u8> = (0..N).flat_map(|x| (x as f32).to_le_bytes()).collect();
9 let data_sz = data.len();
10 let src_file = "/tmp/cufile_test.bin";
11 fs::write(src_file, &data)?;
12
13 let cufile = Cufile::new()?;
14 println!("{:?}", cufile.get_properties()?);
15
16 let file = fs::File::open(src_file)?;
17 let handle = cufile.register(file)?;
18
19 let ctx = CudaContext::new(0)?;
20 let stream = ctx.default_stream();
21 let mut buf = stream.alloc_zeros::<u8>(data_sz)?;
22
23 handle.sync_read(0, &mut buf)?;
24
25 let verify_dst = stream.clone_dtoh(&buf)?;
26 assert_eq!(verify_dst, data);
27
28 Ok(())
29}3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memset_zeros<T: DeviceRepr + ValidAsZeroBits, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memset_zeros<T: DeviceRepr + ValidAsZeroBits, Dst: DevicePtrMut<T>>( self: &Arc<Self>, dst: &mut Dst, ) -> Result<(), DriverError>
Sets all the memory in dst to 0. dst can be a CudaSlice or CudaViewMut
Sourcepub fn memcpy_stod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>(
self: &Arc<Self>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>
👎Deprecated: Use clone_htod
pub fn memcpy_stod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>( self: &Arc<Self>, src: &Src, ) -> Result<CudaSlice<T>, DriverError>
Copy a [T]/Vec<T>/PinnedHostSlice<T> to a new CudaSlice.
Sourcepub fn clone_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>(
self: &Arc<Self>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>
pub fn clone_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>( self: &Arc<Self>, src: &Src, ) -> Result<CudaSlice<T>, DriverError>
Copy a [T]/Vec<T>/PinnedHostSlice<T> to a new CudaSlice.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 // unsafe initialization of unset memory
8 let _: CudaSlice<f32> = unsafe { stream.alloc::<f32>(10) }?;
9
10 // this will have memory initialized as 0
11 let _: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
12
13 // initialize with slices!
14 let _: CudaSlice<usize> = stream.clone_htod(&[0; 10])?;
15 let _: CudaSlice<u32> = stream.clone_htod(&[1, 2, 3])?;
16
17 Ok(())
18}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.clone_htod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21 // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.clone_dtoh(&a_dev)?;
31 let b_host = stream.clone_dtoh(&b_dev)?;
32
33 println!("Found {b_host:?}");
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.clone_htod(&a_host)?;
41 let b_dev = stream.clone_htod(&b_host)?;
42 let mut c_dev = stream.clone_htod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memcpy_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a [T]/Vec<T>/PinnedHostSlice<T> into an existing CudaSlice/CudaViewMut.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memcpy_dtov<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<Vec<T>, DriverError>
👎Deprecated: Use clone_dtoh
pub fn memcpy_dtov<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, ) -> Result<Vec<T>, DriverError>
Sourcepub fn clone_dtoh<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<Vec<T>, DriverError>
pub fn clone_dtoh<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, ) -> Result<Vec<T>, DriverError>
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let size = 10;
5
6 let ctx1 = CudaContext::new(0)?;
7 let stream1 = ctx1.default_stream();
8 let a: CudaSlice<f64> = stream1.alloc_zeros::<f64>(size)?;
9
10 let ctx2 = CudaContext::new(1)?;
11 let stream2 = ctx2.default_stream();
12
13 let b = stream2.clone_dtod(&a)?;
14
15 stream2.clone_dtoh(&b)?;
16 stream1.clone_dtoh(&a)?;
17
18 Ok(())
19}More examples
2fn main() -> Result<(), Box<dyn std::error::Error>> {
3 use std::fs;
4
5 use cudarc::{cufile::safe::Cufile, driver::CudaContext};
6
7 const N: usize = 100000;
8 let data: Vec<u8> = (0..N).flat_map(|x| (x as f32).to_le_bytes()).collect();
9 let data_sz = data.len();
10 let src_file = "/tmp/cufile_test.bin";
11 fs::write(src_file, &data)?;
12
13 let cufile = Cufile::new()?;
14 println!("{:?}", cufile.get_properties()?);
15
16 let file = fs::File::open(src_file)?;
17 let handle = cufile.register(file)?;
18
19 let ctx = CudaContext::new(0)?;
20 let stream = ctx.default_stream();
21 let mut buf = stream.alloc_zeros::<u8>(data_sz)?;
22
23 handle.sync_read(0, &mut buf)?;
24
25 let verify_dst = stream.clone_dtoh(&buf)?;
26 assert_eq!(verify_dst, data);
27
28 Ok(())
29}3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.clone_htod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21 // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.clone_dtoh(&a_dev)?;
31 let b_host = stream.clone_dtoh(&b_dev)?;
32
33 println!("Found {b_host:?}");
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memcpy_dtoh<T: DeviceRepr, Src: DevicePtr<T>, Dst: HostSlice<T> + ?Sized>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_dtoh<T: DeviceRepr, Src: DevicePtr<T>, Dst: HostSlice<T> + ?Sized>( self: &Arc<Self>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a CudaSlice/CudaView to an existing [T]/Vec<T>/PinnedHostSlice<T>.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}More examples
22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.clone_htod(&a_host)?;
41 let b_dev = stream.clone_htod(&b_host)?;
42 let mut c_dev = stream.clone_htod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}Sourcepub fn memcpy_dtod<T, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_dtod<T, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a CudaSlice/CudaView to an existing CudaSlice/CudaViewMut.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copys with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}Sourcepub fn clone_dtod<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>
pub fn clone_dtod<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, ) -> Result<CudaSlice<T>, DriverError>
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let size = 10;
5
6 let ctx1 = CudaContext::new(0)?;
7 let stream1 = ctx1.default_stream();
8 let a: CudaSlice<f64> = stream1.alloc_zeros::<f64>(size)?;
9
10 let ctx2 = CudaContext::new(1)?;
11 let stream2 = ctx2.default_stream();
12
13 let b = stream2.clone_dtod(&a)?;
14
15 stream2.clone_dtoh(&b)?;
16 stream1.clone_dtoh(&a)?;
17
18 Ok(())
19}Source§impl CudaStream
impl CudaStream
Sourcepub unsafe fn upgrade_device_ptr<T>(
self: &Arc<Self>,
cu_device_ptr: CUdeviceptr,
len: usize,
) -> CudaSlice<T>
pub unsafe fn upgrade_device_ptr<T>( self: &Arc<Self>, cu_device_ptr: CUdeviceptr, len: usize, ) -> CudaSlice<T>
Creates a CudaSlice from a sys::CUdeviceptr. Useful in conjunction with
CudaSlice::leak().
§Safety
- `cu_device_ptr` must be a valid allocation
- `cu_device_ptr` must have space for `len * std::mem::size_of::<T>()` bytes
- The memory may not be valid for type `T`, so some sort of memset operation should be called on the memory.
Source§impl CudaStream
impl CudaStream
Sourcepub fn begin_capture(
&self,
mode: CUstreamCaptureMode,
) -> Result<(), DriverError>
pub fn begin_capture( &self, mode: CUstreamCaptureMode, ) -> Result<(), DriverError>
See cuda docs
Sourcepub fn end_capture(
self: &Arc<Self>,
flags: CUgraphInstantiate_flags,
) -> Result<Option<CudaGraph>, DriverError>
pub fn end_capture( self: &Arc<Self>, flags: CUgraphInstantiate_flags, ) -> Result<Option<CudaGraph>, DriverError>
See cuda docs
flags is passed to cuGraphInstantiate
Sourcepub fn capture_status(&self) -> Result<CUstreamCaptureStatus, DriverError>
pub fn capture_status(&self) -> Result<CUstreamCaptureStatus, DriverError>
See cuda docs
Source§impl CudaStream
impl CudaStream
Sourcepub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a>
pub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a>
Creates a new kernel launch builder that will launch func on stream self.
Add arguments to the builder using LaunchArgs::arg(), and submit it to the stream using LaunchArgs::launch().
Examples found in repository?
32fn main() -> Result<(), DriverError> {
33 let ctx = CudaContext::new(0)?;
34 let stream = ctx.default_stream();
35
36 let ptx = compile_ptx(PTX_SRC).unwrap();
37 let module = ctx.load_module(ptx)?;
38 let f = module.load_function("my_custom_kernel")?;
39
40 // try changing some of these values to see a device assert
41 let thing = MyCoolRustStruct {
42 a: 1.0,
43 b: 2.34,
44 c: 57,
45 d: 420,
46 };
47
48 let mut builder = stream.launch_builder(&f);
49 // since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
50 builder.arg(&thing);
51 unsafe { builder.launch(LaunchConfig::for_num_elems(1)) }?;
52
53 Ok(())
54}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.clone_htod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21    // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.clone_dtoh(&a_dev)?;
31 let b_host = stream.clone_dtoh(&b_dev)?;
32
33 println!("Found {b_host:?}");
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.clone_htod(&a_host)?;
41 let b_dev = stream.clone_htod(&b_host)?;
42 let mut c_dev = stream.clone_htod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20    // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}12fn main() -> Result<(), DriverError> {
13 {
14 // Option 1: sharing ctx & module between threads
15 thread::scope(|s| {
16 let ptx = compile_ptx(KERNEL_SRC).unwrap();
17 let ctx = CudaContext::new(0)?;
18 let module = ctx.load_module(ptx)?;
19 for i in 0..10i32 {
20 let thread_ctx = ctx.clone();
21 let thread_module = module.clone();
22 s.spawn(move || {
23 let stream = thread_ctx.default_stream();
24 let f = thread_module.load_function("hello_world")?;
25 unsafe {
26 stream
27 .launch_builder(&f)
28 .arg(&i)
29 .launch(LaunchConfig::for_num_elems(1))
30 }
31 });
32 }
33 Ok(())
34 })?;
35 }
36
37 {
38 // Option 2: initializing different context in each
39 // Note that this will still schedule to the same stream since we are using the
40 // default stream here on the same device.
41 thread::scope(move |s| {
42 for i in 0..10i32 {
43 s.spawn(move || {
44 let ptx = compile_ptx(KERNEL_SRC).unwrap();
45 let ctx = CudaContext::new(0)?;
46 let module = ctx.load_module(ptx)?;
47 let stream = ctx.default_stream();
48 let f = module.load_function("hello_world")?;
49 unsafe {
50 stream
51 .launch_builder(&f)
52 .arg(&i)
53 .launch(LaunchConfig::for_num_elems(1))
54 }
55 });
56 }
57 Ok(())
58 })?;
59 }
60
61 Ok(())
62}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}