pub struct CudaExecutable { /* private fields */ }
Implementations§
Source§impl CudaExecutable
impl CudaExecutable
Sourcepub fn compile(graph: Graph) -> Self
pub fn compile(graph: Graph) -> Self
JIT compile, stream-mode execution. Default entry point.
Honors RLX_CUDA_COMPILE_MODE=aot and RLX_CUDA_EXEC_MODE=graph|multistream:N.
Examples found in repository?
26fn bench(m: usize, k: usize, n: usize, warmup: usize, iters: usize) {
27 let mut g = Graph::new("mm");
28 let x = g.input("x", Shape::new(&[m, k], DType::F32));
29 let w = g.param("w", Shape::new(&[k, n], DType::F32));
30 let y = g.matmul(x, w, Shape::new(&[m, n], DType::F32));
31 g.set_outputs(vec![y]);
32
33 let mut exe = CudaExecutable::compile(g);
34 let wv: Vec<f32> = (0..k * n).map(|i| (i as f32) * 1e-3).collect();
35 exe.set_param("w", &wv);
36 let xv: Vec<f32> = (0..m * k).map(|i| (i as f32) * 1e-3).collect();
37
38 for _ in 0..warmup {
39 let _ = exe.run(&[("x", &xv)]);
40 }
41
42 let t0 = Instant::now();
43 for _ in 0..iters {
44 let _ = exe.run(&[("x", &xv)]);
45 }
46 let dt = t0.elapsed().as_secs_f64() / iters as f64;
47 let flops = 2.0 * (m * k * n) as f64;
48 let gflops = flops / dt / 1e9;
49 println!(
50 " M={:>5} K={:>5} N={:>5} {:>8.3} ms {:>8.1} GFLOP/s",
51 m,
52 k,
53 n,
54 dt * 1e3,
55 gflops
56 );
57}
Sourcepub fn eager(graph: Graph, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>
pub fn eager(graph: Graph, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>
One-shot eager run. Compiles, executes once with the given inputs, and drops the executable. No persistent state.
Sourcepub fn compile_with(
graph: Graph,
compile_mode: CompileMode,
exec_mode: ExecMode,
) -> Self
pub fn compile_with( graph: Graph, compile_mode: CompileMode, exec_mode: ExecMode, ) -> Self
Full constructor with explicit compile + exec modes.
Sourcepub fn arena_ptr(&self) -> *const u8
pub fn arena_ptr(&self) -> *const u8
Host buffer base for reading outputs after Self::run_slots.
Offsets in the returned slot pairs are byte offsets into this buffer.
pub fn output_slots(&self) -> &[(usize, usize)]
Sourcepub fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)]
pub fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)]
Fast path: positional inputs, D2H into Self::host_arena, no per-output Vec.
Sourcepub fn set_active_extent(&mut self, extent: Option<(usize, usize)>)
pub fn set_active_extent(&mut self, extent: Option<(usize, usize)>)
Hint the next run to process only the first `actual` rows
along the bucket axis (out of `upper`, the compile extent).
Honored only when every step in the schedule passes
`Step::safe_for_active_extent`. While a hint is active, the captured
CUDA Graph (recorded at full extent) is bypassed. See PLAN L1.
Sourcepub fn output_dtypes(&self) -> Vec<DType>
pub fn output_dtypes(&self) -> Vec<DType>
Declared graph-output dtypes, in graph.outputs order. Used by
the runtime wrapper’s run_typed to narrow f32 outputs back to
the declared dtype on the way out.
Sourcepub fn set_param(&mut self, name: &str, data: &[f32])
pub fn set_param(&mut self, name: &str, data: &[f32])
Examples found in repository?
26fn bench(m: usize, k: usize, n: usize, warmup: usize, iters: usize) {
27 let mut g = Graph::new("mm");
28 let x = g.input("x", Shape::new(&[m, k], DType::F32));
29 let w = g.param("w", Shape::new(&[k, n], DType::F32));
30 let y = g.matmul(x, w, Shape::new(&[m, n], DType::F32));
31 g.set_outputs(vec![y]);
32
33 let mut exe = CudaExecutable::compile(g);
34 let wv: Vec<f32> = (0..k * n).map(|i| (i as f32) * 1e-3).collect();
35 exe.set_param("w", &wv);
36 let xv: Vec<f32> = (0..m * k).map(|i| (i as f32) * 1e-3).collect();
37
38 for _ in 0..warmup {
39 let _ = exe.run(&[("x", &xv)]);
40 }
41
42 let t0 = Instant::now();
43 for _ in 0..iters {
44 let _ = exe.run(&[("x", &xv)]);
45 }
46 let dt = t0.elapsed().as_secs_f64() / iters as f64;
47 let flops = 2.0 * (m * k * n) as f64;
48 let gflops = flops / dt / 1e9;
49 println!(
50 " M={:>5} K={:>5} N={:>5} {:>8.3} ms {:>8.1} GFLOP/s",
51 m,
52 k,
53 n,
54 dt * 1e3,
55 gflops
56 );
57}
Sourcepub fn set_param_bytes(&mut self, name: &str, data: &[u8])
pub fn set_param_bytes(&mut self, name: &str, data: &[u8])
Upload packed U8/I8 GGUF weights into the param slot (byte offset).
Sourcepub fn set_param_half(&mut self, name: &str, dtype: HalfDtype, bits: &[u16])
pub fn set_param_half(&mut self, name: &str, dtype: HalfDtype, bits: &[u16])
Upload a param as packed half-precision bits (u16 per element).
Caller passes the raw IEEE-754 binary16 (F16) or BFloat16
(Bf16) bit pattern; the backend stores it in the half-arena
side-buffer and skips the f32 slot entirely. Use cases:
2× weight-memory savings for inference, plus Tensor Core matmul
via cublasGemmEx when both A and B (or just B) are stored
half-precision.
When the same name is also set_param’d as f32, the
half-arena entry takes precedence in the matmul dispatch. Use
only one of the two for any given param.
Sourcepub fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>
pub fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>
Examples found in repository?
26fn bench(m: usize, k: usize, n: usize, warmup: usize, iters: usize) {
27 let mut g = Graph::new("mm");
28 let x = g.input("x", Shape::new(&[m, k], DType::F32));
29 let w = g.param("w", Shape::new(&[k, n], DType::F32));
30 let y = g.matmul(x, w, Shape::new(&[m, n], DType::F32));
31 g.set_outputs(vec![y]);
32
33 let mut exe = CudaExecutable::compile(g);
34 let wv: Vec<f32> = (0..k * n).map(|i| (i as f32) * 1e-3).collect();
35 exe.set_param("w", &wv);
36 let xv: Vec<f32> = (0..m * k).map(|i| (i as f32) * 1e-3).collect();
37
38 for _ in 0..warmup {
39 let _ = exe.run(&[("x", &xv)]);
40 }
41
42 let t0 = Instant::now();
43 for _ in 0..iters {
44 let _ = exe.run(&[("x", &xv)]);
45 }
46 let dt = t0.elapsed().as_secs_f64() / iters as f64;
47 let flops = 2.0 * (m * k * n) as f64;
48 let gflops = flops / dt / 1e9;
49 println!(
50 " M={:>5} K={:>5} N={:>5} {:>8.3} ms {:>8.1} GFLOP/s",
51 m,
52 k,
53 n,
54 dt * 1e3,
55 gflops
56 );
57}
Sourcepub fn run_read_outputs(
&mut self,
inputs: &[(&str, &[f32])],
read_indices: Option<&[usize]>,
) -> Vec<Vec<f32>>
pub fn run_read_outputs( &mut self, inputs: &[(&str, &[f32])], read_indices: Option<&[usize]>, ) -> Vec<Vec<f32>>
Run, then read back only the selected outputs (plus any GPU-handle feed outputs).
pub fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool
pub fn has_gpu_handle(&self, name: &str) -> bool
pub fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize)
pub fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>>
Auto Trait Implementations§
impl !Send for CudaExecutable
impl !Sync for CudaExecutable
impl Freeze for CudaExecutable
impl RefUnwindSafe for CudaExecutable
impl Unpin for CudaExecutable
impl UnsafeUnpin for CudaExecutable
impl UnwindSafe for CudaExecutable
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more