pub struct CudaStream { /* private fields */ }Expand description
A wrapper around sys::CUstream that you can schedule work on.
- Create with CudaContext::new_stream(), CudaContext::default_stream(), or CudaStream::fork().
Work done on this is asynchronous with respect to the host.
See CUDA C/C++ Streams and Concurrency See 3. Stream synchronization behavior See 6.6. Event Management See Out-of-order execution See Dependence analysis
Implementations§
Source§impl CudaStream
impl CudaStream
Sourcepub fn memcpy_ftod<'a, T: DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
fh: &'a FileHandle,
file_offset: i64,
dst: &mut Dst,
) -> Result<AsyncFileRead<'a>, CufileError>
pub fn memcpy_ftod<'a, T: DeviceRepr, Dst: DevicePtrMut<T>>( self: &Arc<Self>, fh: &'a FileHandle, file_offset: i64, dst: &mut Dst, ) -> Result<AsyncFileRead<'a>, CufileError>
Copy memory from a file into a destination buffer on the device.
The return value of this is initialized with 0, and after the operation successfully finishes on the stream, it will contain a value other than 0. See the docs for possible values.
Wrapper around cuFileReadAsync
See FileHandle::sync_read() for synchronous version.
Sourcepub fn memcpy_dtof<'a, T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
fh: &'a mut FileHandle,
file_offset: i64,
) -> Result<AsyncFileWrite<'a>, CufileError>
pub fn memcpy_dtof<'a, T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, fh: &'a mut FileHandle, file_offset: i64, ) -> Result<AsyncFileWrite<'a>, CufileError>
Copy memory from a device buffer to a file.
The return value of this is initialized with 0, and after the operation successfully finishes on the stream, it will contain a value other than 0. See the docs for possible values.
Wrapper around cuFileWriteAsync
See FileHandle::sync_write() for synchronous version.
Source§impl CudaStream
impl CudaStream
Sourcepub fn fork(&self) -> Result<Arc<Self>, DriverError>
pub fn fork(&self) -> Result<Arc<Self>, DriverError>
Creates a new stream and then makes the new stream wait on self
Examples found in repository?
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}Sourcepub fn context(&self) -> &Arc<CudaContext>
pub fn context(&self) -> &Arc<CudaContext>
The context the stream belongs to.
Sourcepub fn synchronize(&self) -> Result<(), DriverError>
pub fn synchronize(&self) -> Result<(), DriverError>
Will only block CPU if you call CudaContext::set_flags() with sys::CUctx_flags::CU_CTX_SCHED_BLOCKING_SYNC.
See cuda docs
Sourcepub fn record_event(
&self,
flags: Option<CUevent_flags>,
) -> Result<CudaEvent, DriverError>
pub fn record_event( &self, flags: Option<CUevent_flags>, ) -> Result<CudaEvent, DriverError>
Creates a new CudaEvent and records the current work in the stream to the event.
Sourcepub fn join(&self, other: &CudaStream) -> Result<(), DriverError>
pub fn join(&self, other: &CudaStream) -> Result<(), DriverError>
Ensures this stream waits for the current workload in other to complete.
This is shorthand for self.wait(other.record_event())
Source§impl CudaStream
impl CudaStream
Sourcepub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
pub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, DriverError>
Allocates an empty CudaSlice with 0 length.
Sourcepub unsafe fn alloc<T: DeviceRepr>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>
pub unsafe fn alloc<T: DeviceRepr>( self: &Arc<Self>, len: usize, ) -> Result<CudaSlice<T>, DriverError>
Allocates a CudaSlice with len elements of type T.
§Safety
This is unsafe because the memory is unset.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 // unsafe initialization of unset memory
8 let _: CudaSlice<f32> = unsafe { stream.alloc::<f32>(10) }?;
9
10 // this will have memory initialized as 0
11 let _: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
12
13 // initialize with slices!
14 let _: CudaSlice<usize> = stream.clone_htod(&[0; 10])?;
15 let _: CudaSlice<u32> = stream.clone_htod(&[1, 2, 3])?;
16
17 Ok(())
18}Sourcepub fn alloc_zeros<T: DeviceRepr + ValidAsZeroBits>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>
pub fn alloc_zeros<T: DeviceRepr + ValidAsZeroBits>( self: &Arc<Self>, len: usize, ) -> Result<CudaSlice<T>, DriverError>
Allocates a CudaSlice with len elements of type T. All values are zero’d out.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let size = 10;
5
6 let ctx1 = CudaContext::new(0)?;
7 let stream1 = ctx1.default_stream();
8 let a: CudaSlice<f64> = stream1.alloc_zeros::<f64>(size)?;
9
10 let ctx2 = CudaContext::new(1)?;
11 let stream2 = ctx2.default_stream();
12
13 let b = stream2.clone_dtod(&a)?;
14
15 stream2.clone_dtoh(&b)?;
16 stream1.clone_dtoh(&a)?;
17
18 Ok(())
19}More examples
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 // unsafe initialization of unset memory
8 let _: CudaSlice<f32> = unsafe { stream.alloc::<f32>(10) }?;
9
10 // this will have memory initialized as 0
11 let _: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
12
13 // initialize with slices!
14 let _: CudaSlice<usize> = stream.clone_htod(&[0; 10])?;
15 let _: CudaSlice<u32> = stream.clone_htod(&[1, 2, 3])?;
16
17 Ok(())
18}2fn main() -> Result<(), Box<dyn std::error::Error>> {
3 use std::fs;
4
5 use cudarc::{cufile::safe::Cufile, driver::CudaContext};
6
7 const N: usize = 100000;
8 let data: Vec<u8> = (0..N).flat_map(|x| (x as f32).to_le_bytes()).collect();
9 let data_sz = data.len();
10 let src_file = "/tmp/cufile_test.bin";
11 fs::write(src_file, &data)?;
12
13 let cufile = Cufile::new()?;
14 println!("{:?}", cufile.get_properties()?);
15
16 let file = fs::File::open(src_file)?;
17 let handle = cufile.register(file)?;
18
19 let ctx = CudaContext::new(0)?;
20 let stream = ctx.default_stream();
21 let mut buf = stream.alloc_zeros::<u8>(data_sz)?;
22
23 handle.sync_read(0, &mut buf)?;
24
25 let verify_dst = stream.clone_dtoh(&buf)?;
26 assert_eq!(verify_dst, data);
27
28 Ok(())
29}3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memset_zeros<T: DeviceRepr + ValidAsZeroBits, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memset_zeros<T: DeviceRepr + ValidAsZeroBits, Dst: DevicePtrMut<T>>( self: &Arc<Self>, dst: &mut Dst, ) -> Result<(), DriverError>
Sets all the memory in dst to 0. dst can be a CudaSlice or CudaViewMut
Sourcepub fn memcpy_stod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>(
self: &Arc<Self>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>
👎Deprecated: Use clone_htod
pub fn memcpy_stod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>( self: &Arc<Self>, src: &Src, ) -> Result<CudaSlice<T>, DriverError>
Copy a [T]/Vec<T>/PinnedHostSlice<T> to a new CudaSlice.
Sourcepub fn clone_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>(
self: &Arc<Self>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>
pub fn clone_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized>( self: &Arc<Self>, src: &Src, ) -> Result<CudaSlice<T>, DriverError>
Copy a [T]/Vec<T>/PinnedHostSlice<T> to a new CudaSlice.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 // unsafe initialization of unset memory
8 let _: CudaSlice<f32> = unsafe { stream.alloc::<f32>(10) }?;
9
10 // this will have memory initialized as 0
11 let _: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
12
13 // initialize with slices!
14 let _: CudaSlice<usize> = stream.clone_htod(&[0; 10])?;
15 let _: CudaSlice<u32> = stream.clone_htod(&[1, 2, 3])?;
16
17 Ok(())
18}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.clone_htod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21 // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.clone_dtoh(&a_dev)?;
31 let b_host = stream.clone_dtoh(&b_dev)?;
32
33 println!("Found {b_host:?}");
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.clone_htod(&a_host)?;
41 let b_dev = stream.clone_htod(&b_host)?;
42 let mut c_dev = stream.clone_htod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memcpy_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_htod<T: DeviceRepr, Src: HostSlice<T> + ?Sized, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a [T]/Vec<T>/PinnedHostSlice<T> into an existing CudaSlice/CudaViewMut.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memcpy_dtov<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<Vec<T>, DriverError>
👎Deprecated: Use clone_dtoh
pub fn memcpy_dtov<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, ) -> Result<Vec<T>, DriverError>
Sourcepub fn clone_dtoh<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<Vec<T>, DriverError>
pub fn clone_dtoh<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, ) -> Result<Vec<T>, DriverError>
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let size = 10;
5
6 let ctx1 = CudaContext::new(0)?;
7 let stream1 = ctx1.default_stream();
8 let a: CudaSlice<f64> = stream1.alloc_zeros::<f64>(size)?;
9
10 let ctx2 = CudaContext::new(1)?;
11 let stream2 = ctx2.default_stream();
12
13 let b = stream2.clone_dtod(&a)?;
14
15 stream2.clone_dtoh(&b)?;
16 stream1.clone_dtoh(&a)?;
17
18 Ok(())
19}More examples
2fn main() -> Result<(), Box<dyn std::error::Error>> {
3 use std::fs;
4
5 use cudarc::{cufile::safe::Cufile, driver::CudaContext};
6
7 const N: usize = 100000;
8 let data: Vec<u8> = (0..N).flat_map(|x| (x as f32).to_le_bytes()).collect();
9 let data_sz = data.len();
10 let src_file = "/tmp/cufile_test.bin";
11 fs::write(src_file, &data)?;
12
13 let cufile = Cufile::new()?;
14 println!("{:?}", cufile.get_properties()?);
15
16 let file = fs::File::open(src_file)?;
17 let handle = cufile.register(file)?;
18
19 let ctx = CudaContext::new(0)?;
20 let stream = ctx.default_stream();
21 let mut buf = stream.alloc_zeros::<u8>(data_sz)?;
22
23 handle.sync_read(0, &mut buf)?;
24
25 let verify_dst = stream.clone_dtoh(&buf)?;
26 assert_eq!(verify_dst, data);
27
28 Ok(())
29}3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.clone_htod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21 // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.clone_dtoh(&a_dev)?;
31 let b_host = stream.clone_dtoh(&b_dev)?;
32
33 println!("Found {b_host:?}");
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20 // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}Sourcepub fn memcpy_dtoh<T: DeviceRepr, Src: DevicePtr<T>, Dst: HostSlice<T> + ?Sized>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_dtoh<T: DeviceRepr, Src: DevicePtr<T>, Dst: HostSlice<T> + ?Sized>( self: &Arc<Self>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a CudaSlice/CudaView to an existing [T]/Vec<T>/PinnedHostSlice<T>.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copies with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}More examples
22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.clone_htod(&a_host)?;
41 let b_dev = stream.clone_htod(&b_host)?;
42 let mut c_dev = stream.clone_htod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}Sourcepub fn memcpy_dtod<T, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_dtod<T, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>( self: &Arc<Self>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a CudaSlice/CudaView to an existing CudaSlice/CudaViewMut.
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let ctx = CudaContext::new(0)?;
5 let stream = ctx.default_stream();
6
7 let a: CudaSlice<f64> = stream.alloc_zeros::<f64>(10)?;
8 let mut b = stream.alloc_zeros::<f64>(10)?;
9
10 // you can do device to device copies of course
11 stream.memcpy_dtod(&a, &mut b)?;
12
13 // but also host to device copys with already allocated buffers
14 stream.memcpy_htod(&vec![2.0; b.len()], &mut b)?;
15 // you can use any type of slice
16 stream.memcpy_htod(&[3.0; 10], &mut b)?;
17
18 // you can transfer back using clone_dtoh
19 let mut a_host: Vec<f64> = stream.clone_dtoh(&a)?;
20 assert_eq!(a_host, [0.0; 10]);
21
22 let b_host = stream.clone_dtoh(&b)?;
23 assert_eq!(b_host, [3.0; 10]);
24
25 // or transfer into a pre allocated slice
26 stream.memcpy_dtoh(&b, &mut a_host)?;
27 assert_eq!(a_host, b_host);
28
29 Ok(())
30}Sourcepub fn clone_dtod<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>
pub fn clone_dtod<T: DeviceRepr, Src: DevicePtr<T>>( self: &Arc<Self>, src: &Src, ) -> Result<CudaSlice<T>, DriverError>
Examples found in repository?
3fn main() -> Result<(), DriverError> {
4 let size = 10;
5
6 let ctx1 = CudaContext::new(0)?;
7 let stream1 = ctx1.default_stream();
8 let a: CudaSlice<f64> = stream1.alloc_zeros::<f64>(size)?;
9
10 let ctx2 = CudaContext::new(1)?;
11 let stream2 = ctx2.default_stream();
12
13 let b = stream2.clone_dtod(&a)?;
14
15 stream2.clone_dtoh(&b)?;
16 stream1.clone_dtoh(&a)?;
17
18 Ok(())
19}Source§impl CudaStream
impl CudaStream
Sourcepub unsafe fn upgrade_device_ptr<T>(
self: &Arc<Self>,
cu_device_ptr: CUdeviceptr,
len: usize,
) -> CudaSlice<T>
pub unsafe fn upgrade_device_ptr<T>( self: &Arc<Self>, cu_device_ptr: CUdeviceptr, len: usize, ) -> CudaSlice<T>
Creates a CudaSlice from a sys::CUdeviceptr. Useful in conjunction with
CudaSlice::leak().
§Safety
- `cu_device_ptr` must be a valid allocation
- `cu_device_ptr` must have space for `len * std::mem::size_of::<T>()` bytes
- The memory may not be valid for type `T`, so some sort of memset operation should be called on the memory.
Source§impl CudaStream
impl CudaStream
Sourcepub fn begin_capture(
&self,
mode: CUstreamCaptureMode,
) -> Result<(), DriverError>
pub fn begin_capture( &self, mode: CUstreamCaptureMode, ) -> Result<(), DriverError>
See cuda docs
Sourcepub fn end_capture(
self: &Arc<Self>,
flags: CUgraphInstantiate_flags,
) -> Result<Option<CudaGraph>, DriverError>
pub fn end_capture( self: &Arc<Self>, flags: CUgraphInstantiate_flags, ) -> Result<Option<CudaGraph>, DriverError>
See cuda docs
flags is passed to cuGraphInstantiate
Sourcepub fn capture_status(&self) -> Result<CUstreamCaptureStatus, DriverError>
pub fn capture_status(&self) -> Result<CUstreamCaptureStatus, DriverError>
See cuda docs
Source§impl CudaStream
impl CudaStream
Sourcepub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a>
pub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a>
Creates a new kernel launch builder that will launch func on stream self.
Add arguments to the builder using LaunchArgs::arg(), and submit it to the stream using LaunchArgs::launch().
Examples found in repository?
32fn main() -> Result<(), DriverError> {
33 let ctx = CudaContext::new(0)?;
34 let stream = ctx.default_stream();
35
36 let ptx = compile_ptx(PTX_SRC).unwrap();
37 let module = ctx.load_module(ptx)?;
38 let f = module.load_function("my_custom_kernel")?;
39
40 // try changing some of these values to see a device assert
41 let thing = MyCoolRustStruct {
42 a: 1.0,
43 b: 2.34,
44 c: 57,
45 d: 420,
46 };
47
48 let mut builder = stream.launch_builder(&f);
49 // since MyCoolRustStruct implements DeviceRepr, we can pass it to launch.
50 builder.arg(&thing);
51 unsafe { builder.launch(LaunchConfig::for_num_elems(1)) }?;
52
53 Ok(())
54}More examples
6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // You can load a function from a pre-compiled PTX like so:
11 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
12
13 // and then load a function from it:
14 let f = module.load_function("sin_kernel").unwrap();
15
16 let a_host = [1.0, 2.0, 3.0];
17
18 let a_dev = stream.clone_htod(&a_host)?;
19 let mut b_dev = a_dev.clone();
20
21    // we use a builder pattern to launch kernels.
22 let n = 3i32;
23 let cfg = LaunchConfig::for_num_elems(n as u32);
24 let mut launch_args = stream.launch_builder(&f);
25 launch_args.arg(&mut b_dev);
26 launch_args.arg(&a_dev);
27 launch_args.arg(&n);
28 unsafe { launch_args.launch(cfg) }?;
29
30 let a_host_2 = stream.clone_dtoh(&a_dev)?;
31 let b_host = stream.clone_dtoh(&b_dev)?;
32
33 println!("Found {b_host:?}");
34 println!("Expected {:?}", a_host.map(f32::sin));
35 assert_eq!(&a_host, a_host_2.as_slice());
36
37 Ok(())
38}22fn main() -> Result<(), DriverError> {
23 let start = std::time::Instant::now();
24
25 let ptx = compile_ptx(PTX_SRC).unwrap();
26 println!("Compilation succeeded in {:?}", start.elapsed());
27
28 let ctx = CudaContext::new(0)?;
29 let stream = ctx.default_stream();
30 println!("Built in {:?}", start.elapsed());
31
32 let module = ctx.load_module(ptx)?;
33 let f = module.load_function("matmul")?;
34 println!("Loaded in {:?}", start.elapsed());
35
36 let a_host = [1.0f32, 2.0, 3.0, 4.0];
37 let b_host = [1.0f32, 2.0, 3.0, 4.0];
38 let mut c_host = [0.0f32; 4];
39
40 let a_dev = stream.clone_htod(&a_host)?;
41 let b_dev = stream.clone_htod(&b_host)?;
42 let mut c_dev = stream.clone_htod(&c_host)?;
43
44 println!("Copied in {:?}", start.elapsed());
45
46 let mut builder = stream.launch_builder(&f);
47 builder.arg(&a_dev);
48 builder.arg(&b_dev);
49 builder.arg(&mut c_dev);
50 builder.arg(&2i32);
51 let cfg = LaunchConfig {
52 block_dim: (2, 2, 1),
53 grid_dim: (1, 1, 1),
54 shared_mem_bytes: 0,
55 };
56 unsafe { builder.launch(cfg) }?;
57
58 stream.memcpy_dtoh(&c_dev, &mut c_host)?;
59 println!("Found {:?} in {:?}", c_host, start.elapsed());
60 Ok(())
61}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 let module = ctx.load_module(Ptx::from_file("./examples/sin.ptx"))?;
11 let f = module.load_function("sin_kernel")?;
12
13 let n = 3i32;
14 let a_host = [1.0, 2.0, 3.0];
15 let a_dev = stream.clone_htod(&a_host)?;
16 let mut b_dev = stream.alloc_zeros::<f32>(n as usize)?;
17
18 // we can safely create a second stream using [CudaStream::fork()].
19 // This synchronizes with the source stream, so
20    // the `clone_htod` & `alloc_zeros` above will complete **before**
21 // work on this stream can start.
22 let stream2 = stream.fork()?;
23
24 // now we launch this work on the other stream
25 let mut builder = stream2.launch_builder(&f);
26 builder.arg(&mut b_dev); // NOTE: tells cudarc that we are mutating this.
27 builder.arg(&a_dev); // NOTE: tells cudarc that we are reading from this slice
28 builder.arg(&n);
29 unsafe { builder.launch(LaunchConfig::for_num_elems(n as u32)) }?;
30
31 // cudarc automatically manages multi stream synchronization,
32 // so even though we launched the above on a separate stream,
33 // doing this device to host transfer will still properly synchronize.
34 // a_dev doesn't need to synchronize at all since we specified it is just
35 // being read from.
36 // b_dev DOES need to be synchronized, because it was mutated on a different stream.
37 let a_host_2 = stream.clone_dtoh(&a_dev)?;
38 let b_host = stream.clone_dtoh(&b_dev)?;
39
40 println!("Found {b_host:?}");
41 println!("Expected {:?}", a_host.map(f32::sin));
42 assert_eq!(&a_host, a_host_2.as_slice());
43
44 Ok(())
45}12fn main() -> Result<(), DriverError> {
13 {
14 // Option 1: sharing ctx & module between threads
15 thread::scope(|s| {
16 let ptx = compile_ptx(KERNEL_SRC).unwrap();
17 let ctx = CudaContext::new(0)?;
18 let module = ctx.load_module(ptx)?;
19 for i in 0..10i32 {
20 let thread_ctx = ctx.clone();
21 let thread_module = module.clone();
22 s.spawn(move || {
23 let stream = thread_ctx.default_stream();
24 let f = thread_module.load_function("hello_world")?;
25 unsafe {
26 stream
27 .launch_builder(&f)
28 .arg(&i)
29 .launch(LaunchConfig::for_num_elems(1))
30 }
31 });
32 }
33 Ok(())
34 })?;
35 }
36
37 {
38 // Option 2: initializing different context in each
39 // Note that this will still schedule to the same stream since we are using the
40 // default stream here on the same device.
41 thread::scope(move |s| {
42 for i in 0..10i32 {
43 s.spawn(move || {
44 let ptx = compile_ptx(KERNEL_SRC).unwrap();
45 let ctx = CudaContext::new(0)?;
46 let module = ctx.load_module(ptx)?;
47 let stream = ctx.default_stream();
48 let f = module.load_function("hello_world")?;
49 unsafe {
50 stream
51 .launch_builder(&f)
52 .arg(&i)
53 .launch(LaunchConfig::for_num_elems(1))
54 }
55 });
56 }
57 Ok(())
58 })?;
59 }
60
61 Ok(())
62}6fn main() -> Result<(), DriverError> {
7 let ctx = CudaContext::new(0)?;
8 let stream = ctx.default_stream();
9
10 // Load the module containing the kernel with constant memory
11 let ptx = compile_ptx(include_str!("./constant_memory.cu")).expect("compile failure");
12 let module = ctx.load_module(ptx)?;
13
14 // Get the constant memory symbol as a CudaSlice<u8>
15 let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
16 println!(
17 "Constant memory symbol 'coefficients' has {} bytes",
18 coefficients_symbol.len()
19 );
20
21 // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
22 let coefficients = [1.0f32, 2.0, 3.0, 4.0];
23
24 // Transmute the symbol to f32 and copy coefficients to constant memory
25 let mut symbol_f32 = unsafe { coefficients_symbol.transmute_mut::<f32>(4).unwrap() };
26 stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
27
28 // Load the kernel function
29 let polynomial_kernel = module.load_function("polynomial_kernel")?;
30
31 // Prepare input data
32 let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
33 let n = input.len();
34
35 // Copy input to device
36 let input_dev = stream.clone_htod(&input)?;
37 let mut output_dev = stream.alloc_zeros::<f32>(n)?;
38
39 // Launch kernel
40 let cfg = LaunchConfig::for_num_elems(n as u32);
41 unsafe {
42 stream
43 .launch_builder(&polynomial_kernel)
44 .arg(&mut output_dev)
45 .arg(&input_dev)
46 .arg(&(n as i32))
47 .launch(cfg)
48 }?;
49
50 // Copy results back
51 let output = stream.clone_dtoh(&output_dev)?;
52
53 // Verify results
54 println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
55 for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
56 let expected = coefficients[0]
57 + coefficients[1] * x
58 + coefficients[2] * x * x
59 + coefficients[3] * x * x * x;
60 println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
61 assert!(
62 (y - expected).abs() < 1e-4,
63 "Mismatch at index {}: got {}, expected {}",
64 i,
65 y,
66 expected
67 );
68 }
69
70 println!("\nAll results match expected values!");
71
72 Ok(())
73}