Struct CudaExecutable

Source

pub struct CudaExecutable { /* private fields */ }

Implementations§

Source §

impl CudaExecutable

Source

pub fn compile(graph: Graph) -> Self

JIT compile, stream-mode execution. Default entry point.

Honors RLX_CUDA_COMPILE_MODE=aot and RLX_CUDA_EXEC_MODE=graph|multistream:N.

Examples found in repository ?

examples/bench_matmul.rs (line 33)

26fn bench(m: usize, k: usize, n: usize, warmup: usize, iters: usize) {
27    let mut g = Graph::new("mm");
28    let x = g.input("x", Shape::new(&[m, k], DType::F32));
29    let w = g.param("w", Shape::new(&[k, n], DType::F32));
30    let y = g.matmul(x, w, Shape::new(&[m, n], DType::F32));
31    g.set_outputs(vec![y]);
32
33    let mut exe = CudaExecutable::compile(g);
34    let wv: Vec<f32> = (0..k * n).map(|i| (i as f32) * 1e-3).collect();
35    exe.set_param("w", &wv);
36    let xv: Vec<f32> = (0..m * k).map(|i| (i as f32) * 1e-3).collect();
37
38    for _ in 0..warmup {
39        let _ = exe.run(&[("x", &xv)]);
40    }
41
42    let t0 = Instant::now();
43    for _ in 0..iters {
44        let _ = exe.run(&[("x", &xv)]);
45    }
46    let dt = t0.elapsed().as_secs_f64() / iters as f64;
47    let flops = 2.0 * (m * k * n) as f64;
48    let gflops = flops / dt / 1e9;
49    println!(
50        "  M={:>5} K={:>5} N={:>5}   {:>8.3} ms   {:>8.1} GFLOP/s",
51        m,
52        k,
53        n,
54        dt * 1e3,
55        gflops
56    );
57}

Source

pub fn eager(graph: Graph, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>

One-shot eager run. Compiles, executes once with the given inputs, and drops the executable. No persistent state.

Source

pub fn compile_with( graph: Graph, compile_mode: CompileMode, exec_mode: ExecMode, ) -> Self

Full constructor with explicit compile + exec modes.

Source

pub fn arena_ptr(&self) -> *const u8

Host buffer base for reading outputs after Self::run_slots. Offsets in the returned slot pairs are byte offsets into this buffer.

Source

pub fn output_slots(&self) -> &[(usize, usize)]

Source

pub fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)]

Fast path: positional inputs, D2H into Self::host_arena, no per-output Vec.

Source

pub fn set_active_extent(&mut self, extent: Option<(usize, usize)>)

Hint the next run to process only the first `actual` rows along the bucket axis (out of `upper`, the compile extent). Honored when every step in the schedule passes Step::safe_for_active_extent. Bypasses the captured CUDA Graph (recorded at full extent) when active. See PLAN L1.

Source

pub fn output_dtypes(&self) -> Vec<DType>

Declared graph-output dtypes, in graph.outputs order. Used by the runtime wrapper’s run_typed to narrow f32 outputs back to the declared dtype on the way out.

Source

pub fn set_param(&mut self, name: &str, data: &[f32])

Examples found in repository ?

examples/bench_matmul.rs (line 35)

26fn bench(m: usize, k: usize, n: usize, warmup: usize, iters: usize) {
27    let mut g = Graph::new("mm");
28    let x = g.input("x", Shape::new(&[m, k], DType::F32));
29    let w = g.param("w", Shape::new(&[k, n], DType::F32));
30    let y = g.matmul(x, w, Shape::new(&[m, n], DType::F32));
31    g.set_outputs(vec![y]);
32
33    let mut exe = CudaExecutable::compile(g);
34    let wv: Vec<f32> = (0..k * n).map(|i| (i as f32) * 1e-3).collect();
35    exe.set_param("w", &wv);
36    let xv: Vec<f32> = (0..m * k).map(|i| (i as f32) * 1e-3).collect();
37
38    for _ in 0..warmup {
39        let _ = exe.run(&[("x", &xv)]);
40    }
41
42    let t0 = Instant::now();
43    for _ in 0..iters {
44        let _ = exe.run(&[("x", &xv)]);
45    }
46    let dt = t0.elapsed().as_secs_f64() / iters as f64;
47    let flops = 2.0 * (m * k * n) as f64;
48    let gflops = flops / dt / 1e9;
49    println!(
50        "  M={:>5} K={:>5} N={:>5}   {:>8.3} ms   {:>8.1} GFLOP/s",
51        m,
52        k,
53        n,
54        dt * 1e3,
55        gflops
56    );
57}

Source

pub fn set_param_bytes(&mut self, name: &str, data: &[u8])

Upload packed U8/I8 GGUF weights into the param slot (byte offset).

Source

pub fn set_param_half(&mut self, name: &str, dtype: HalfDtype, bits: &[u16])

Upload a param as packed half-precision bits (u16 per element). Caller passes the raw IEEE-754 binary16 (F16) or BFloat16 (Bf16) bit pattern; the backend stores it in the half-arena side-buffer and skips the f32 slot entirely. Use cases: 2× weight-memory savings for inference, plus Tensor Core matmul via cublasGemmEx when both A and B (or just B) are stored half-precision.

When the same name is also set_param’d as f32, the half-arena entry takes precedence in the matmul dispatch. Use only one of the two for any given param.

Source

pub fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>

Examples found in repository ?

examples/bench_matmul.rs (line 39)

26fn bench(m: usize, k: usize, n: usize, warmup: usize, iters: usize) {
27    let mut g = Graph::new("mm");
28    let x = g.input("x", Shape::new(&[m, k], DType::F32));
29    let w = g.param("w", Shape::new(&[k, n], DType::F32));
30    let y = g.matmul(x, w, Shape::new(&[m, n], DType::F32));
31    g.set_outputs(vec![y]);
32
33    let mut exe = CudaExecutable::compile(g);
34    let wv: Vec<f32> = (0..k * n).map(|i| (i as f32) * 1e-3).collect();
35    exe.set_param("w", &wv);
36    let xv: Vec<f32> = (0..m * k).map(|i| (i as f32) * 1e-3).collect();
37
38    for _ in 0..warmup {
39        let _ = exe.run(&[("x", &xv)]);
40    }
41
42    let t0 = Instant::now();
43    for _ in 0..iters {
44        let _ = exe.run(&[("x", &xv)]);
45    }
46    let dt = t0.elapsed().as_secs_f64() / iters as f64;
47    let flops = 2.0 * (m * k * n) as f64;
48    let gflops = flops / dt / 1e9;
49    println!(
50        "  M={:>5} K={:>5} N={:>5}   {:>8.3} ms   {:>8.1} GFLOP/s",
51        m,
52        k,
53        n,
54        dt * 1e3,
55        gflops
56    );
57}

Source

pub fn run_read_outputs( &mut self, inputs: &[(&str, &[f32])], read_indices: Option<&[usize]>, ) -> Vec<Vec<f32>>

Run and read back only selected outputs (+ GPU handle feed outputs).

Source

pub fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>>

Auto Trait Implementations§

§

impl !Send for CudaExecutable

§

impl !Sync for CudaExecutable

§

impl Freeze for CudaExecutable

§

impl RefUnwindSafe for CudaExecutable

§

impl Unpin for CudaExecutable

§

impl UnsafeUnpin for CudaExecutable

§

impl UnwindSafe for CudaExecutable

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

impl<T> Pointable for T

Source §

const ALIGN: usize

The alignment of pointer.

Source §

type Init = T

The type for initializers.

Source §

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a with the given initializer. Read more

Source §

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

Source §

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

Source §

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

Struct CudaExecutable Copy item path

Implementations§

impl CudaExecutable

pub fn compile(graph: Graph) -> Self

pub fn eager(graph: Graph, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>

pub fn compile_with( graph: Graph, compile_mode: CompileMode, exec_mode: ExecMode, ) -> Self

pub fn arena_ptr(&self) -> *const u8

pub fn output_slots(&self) -> &[(usize, usize)]

pub fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)]

pub fn set_active_extent(&mut self, extent: Option<(usize, usize)>)

pub fn output_dtypes(&self) -> Vec<DType>

pub fn set_param(&mut self, name: &str, data: &[f32])

pub fn set_param_bytes(&mut self, name: &str, data: &[u8])

pub fn set_param_half(&mut self, name: &str, dtype: HalfDtype, bits: &[u16])

pub fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>

pub fn run_read_outputs( &mut self, inputs: &[(&str, &[f32])], read_indices: Option<&[usize]>, ) -> Vec<Vec<f32>>

pub fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool

pub fn has_gpu_handle(&self, name: &str) -> bool

pub fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize)

pub fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>>

Auto Trait Implementations§

impl !Send for CudaExecutable

impl !Sync for CudaExecutable

impl Freeze for CudaExecutable

impl RefUnwindSafe for CudaExecutable

impl Unpin for CudaExecutable

impl UnsafeUnpin for CudaExecutable

impl UnwindSafe for CudaExecutable

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct CudaExecutable

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,