1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
//! Safe Rust wrappers for the CUDA Runtime API.
//!
//! The Runtime API is "higher level" than the Driver API: contexts are
//! implicit (each device has a primary context the runtime uses
//! automatically), kernels are typically linked at build time by `nvcc`,
//! and most operations dispatch to the current thread's current device.
//! `baracuda-runtime` mirrors the Driver-side types where it makes sense
//! ([`Device`], [`Stream`], [`Event`], [`DeviceBuffer`]) and uses the
//! CUDA 12.0+ library API ([`Library`], [`Kernel`]) for loading PTX at
//! runtime — the Runtime-side counterpart of the Driver API's
//! `Module::load_ptx` + `Module::get_function`.
//!
//! # Driver ↔ Runtime interop
//!
//! `CUstream` and `cudaStream_t` are the same C type, as are `CUevent`
//! and `cudaEvent_t`. With the `driver-interop` feature,
//! `Stream::as_raw_driver()` and `Event::as_raw_driver()` return views
//! usable by `baracuda-driver` APIs. See [`interop`].
//!
//! # Examples
//!
//! **Device query** — discover the visible GPUs and inspect compute
//! capability + SM count.
//!
//! ```no_run
//! use baracuda_runtime::Device;
//!
//! # fn demo() -> Result<(), Box<dyn std::error::Error>> {
//! let count = Device::count()?;
//! for d in Device::all()? {
//!     let (major, minor) = d.compute_capability()?;
//!     println!("device {}: cc {major}.{minor}, {} SMs", d.ordinal(),
//!         d.multiprocessor_count()?);
//! }
//! # let _ = count; Ok(()) }
//! ```
//!
//! **Async memory copy** — overlap a host-to-device (H2D) upload with
//! later kernel launches by issuing it on a non-blocking [`Stream`].
//!
//! ```no_run
//! use baracuda_runtime::{Device, DeviceBuffer, Stream};
//!
//! # fn demo() -> Result<(), Box<dyn std::error::Error>> {
//! Device::from_ordinal(0).set_current()?;
//! let stream = Stream::non_blocking()?;
//!
//! let host: Vec<f32> = (0..4096).map(|i| i as f32).collect();
//! let device: DeviceBuffer<f32> = DeviceBuffer::new(host.len())?;
//! device.copy_from_host_async(&host, &stream)?;
//!
//! let mut back = vec![0.0f32; host.len()];
//! device.copy_to_host_async(&mut back, &stream)?;
//! stream.synchronize()?;
//! assert_eq!(host, back);
//! # Ok(()) }
//! ```
//!
//! **Event timing** — measure the elapsed device time between two
//! [`Event::record`] calls.
//!
//! ```no_run
//! use baracuda_runtime::{Device, DeviceBuffer, Event, Stream};
//!
//! # fn demo() -> Result<(), Box<dyn std::error::Error>> {
//! Device::from_ordinal(0).set_current()?;
//! let stream = Stream::new()?;
//! let start = Event::new()?;
//! let end = Event::new()?;
//!
//! // Record START -> issue some work -> record END.
//! start.record(&stream)?;
//! let buf: DeviceBuffer<f32> = DeviceBuffer::zeros(1 << 20)?;
//! end.record(&stream)?;
//! end.synchronize()?;
//!
//! let ms = Event::elapsed_time_ms(&start, &end)?;
//! println!("device-side elapsed: {ms} ms");
//! # let _ = buf; Ok(()) }
//! ```
pub use Device;
pub use ;
pub use Event;
pub use ;
pub use ;
pub use ;
pub use DeviceBuffer;
pub use ;
pub use Stream;