Struct KernelDispatcher

Source

pub struct KernelDispatcher { /* private fields */ }

Expand description

Dispatches kernel calls to the best available implementation.

Uses scirs2_core::simd::detect::CpuFeatures for CPU feature detection, ensuring consistent SIMD dispatch across the COOLJAPAN ecosystem.

Implementations§

Source §

impl KernelDispatcher

Source

pub fn auto_detect() -> Self

Create a dispatcher that auto-detects the best available kernel tier.

Queries SciRS2-Core’s cached CpuFeatures to determine the optimal tier for the current CPU.

Source

pub fn with_tier(tier: KernelTier) -> Self

Create a dispatcher with a specific tier (for testing/benchmarks).

Source

pub fn tier(&self) -> KernelTier

Get the selected kernel tier.

Trait Implementations§

Source §

impl Debug for KernelDispatcher

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Fp8Kernel for KernelDispatcher

Source §

fn dequant_fp8_e4m3( &self, blocks: &[BlockFP8E4M3], output: &mut [f32], ) -> KernelResult<()>

Dequantize FP8 E4M3FN blocks — tier-aware SIMD dispatch.

Source §

fn dequant_fp8_e5m2( &self, blocks: &[BlockFP8E5M2], output: &mut [f32], ) -> KernelResult<()>

Dequantize FP8 E5M2 blocks — tier-aware SIMD dispatch.

Source §

fn gemv_fp8_e4m3( &self, blocks: &[BlockFP8E4M3], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

FP8 E4M3FN GEMV — tier-aware SIMD dispatch with optional GPU acceleration.

Dispatch priority on the KernelTier::Gpu path:

1. Metal (macOS + metal feature) — metal_gemv_fp8_e4m3.
2. CUDA (Linux/Windows + native-cuda feature) — cuda_gemv_fp8_e4m3.
3. CPU SIMD fallback (AVX-512 / AVX2 / NEON / scalar).

The raw-byte cast of blocks to *const u8 is sound because BlockFP8E4M3 is #[repr(C)] with size BLOCK_FP8_BYTES = 34.

Source §

fn gemv_fp8_e5m2( &self, blocks: &[BlockFP8E5M2], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

FP8 E5M2 GEMV — tier-aware SIMD dispatch with optional GPU acceleration.

Mirrors gemv_fp8_e4m3: Metal → CUDA → CPU SIMD. The raw-byte cast is sound because BlockFP8E5M2 is #[repr(C)] with size BLOCK_FP8_BYTES = 34.

Source §

fn gemm_fp8_e4m3( &self, blocks: &[BlockFP8E4M3], inputs: &[f32], outputs: &mut [f32], n_rows: usize, k: usize, batch: usize, ) -> KernelResult<()>

FP8 E4M3FN GEMM — tier-aware SIMD dispatch.

Source §

fn gemm_fp8_e5m2( &self, blocks: &[BlockFP8E5M2], inputs: &[f32], outputs: &mut [f32], n_rows: usize, k: usize, batch: usize, ) -> KernelResult<()>

FP8 E5M2 GEMM — tier-aware SIMD dispatch.

Source §

fn name_fp8(&self) -> &'static str

Display name for this FP8 kernel implementation.

Source §

impl OneBitKernel for KernelDispatcher

Source §

fn dequant( &self, blocks: &[BlockQ1_0G128], output: &mut [f32], ) -> KernelResult<()>

Dequantize blocks to FP32 values. Read more

Source §

fn gemv( &self, blocks: &[BlockQ1_0G128], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

Fused 1-bit matrix × FP32 vector product (GEMV). Read more

Source §

fn gemm( &self, blocks: &[BlockQ1_0G128], input: &[f32], output: &mut [f32], m: usize, n_rows: usize, k: usize, ) -> KernelResult<()>

Fused 1-bit matrix × FP32 matrix product (GEMM). Read more

Source §

fn name(&self) -> &'static str

Display name for this kernel implementation.

Source §

fn is_gpu_accelerated(&self) -> bool

Whether this kernel routes ops through GPU hardware. Read more

Source §

fn upload_weights(&self, blocks: &[BlockQ1_0G128]) -> Option<GpuWeightHandle>

Upload weight blocks to GPU memory for future cached GEMV/GEMM calls. Read more

Source §

fn gemv_cached( &self, handle: GpuWeightHandle, input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

GEMV using a pre-uploaded weight buffer (no host→device copy for weights). Read more

Source §

fn batch_attn_phase( &self, hidden: &[f32], norm_weight: &[f32], norm_eps: f32, qkv_handle: GpuWeightHandle, q_rows: usize, k_rows: usize, h: usize, ) -> KernelResult<Option<(Vec<f32>, Vec<f32>, Vec<f32>)>>

Batch-accelerated attention input phase (RMSNorm + QKV in one command buffer). Read more

Source §

fn batch_ffn_phase( &self, hidden: &mut [f32], attn_out: &[f32], norm_weight: &[f32], norm_eps: f32, attn_proj_handle: GpuWeightHandle, gate_up_handle: GpuWeightHandle, down_handle: GpuWeightHandle, h: usize, intermediate: usize, attn_proj_k: usize, ) -> KernelResult<bool>

Batch-accelerated FFN phase (attn_proj + residual + norm + gate_up + swiglu + down + residual). Read more

Source §

impl TernaryKernel for KernelDispatcher

Source §

fn dequant_ternary_g128( &self, blocks: &[BlockTQ2_0_g128], output: &mut [f32], ) -> KernelResult<()>

Dequantize TQ2_0_g128 blocks to FP32 values. Read more

Source §

fn gemv_ternary_g128( &self, blocks: &[BlockTQ2_0_g128], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

Fused ternary matrix × FP32 vector product (GEMV). Read more

Source §

fn gemm_ternary_g128( &self, blocks: &[BlockTQ2_0_g128], input: &[f32], output: &mut [f32], m: usize, n_rows: usize, k: usize, ) -> KernelResult<()>

Fused ternary matrix × FP32 matrix product (GEMM). Read more

Source §

fn upload_weights_ternary( &self, blocks: &[BlockTQ2_0_g128], ) -> Option<GpuWeightHandle>

Upload TQ2_0_g128 weight blocks to GPU memory for future cached GEMV calls. Read more

Source §

fn gemv_ternary_g128_cached( &self, handle: GpuWeightHandle, input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

GEMV using a pre-uploaded ternary weight buffer (no host→device copy for weights). Read more

Auto Trait Implementations§

§

impl UnwindSafe for KernelDispatcher

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T> Instrument for T

Source §

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more

Source §

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

impl<T> Pointable for T

Source §

const ALIGN: usize

The alignment of the pointer.

Source §

type Init = T

The type for initializers.

Source §

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more

Source §

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

Source §

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

Source §

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

Source §

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source §

fn vzip(self) -> V

Source §

impl<T> WithSubscriber for T

Source §

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more

Source §

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more

Struct KernelDispatcher

Implementations§

impl KernelDispatcher

pub fn auto_detect() -> Self

pub fn with_tier(tier: KernelTier) -> Self

pub fn tier(&self) -> KernelTier

Trait Implementations§

impl Debug for KernelDispatcher

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Fp8Kernel for KernelDispatcher

fn dequant_fp8_e4m3( &self, blocks: &[BlockFP8E4M3], output: &mut [f32], ) -> KernelResult<()>

fn dequant_fp8_e5m2( &self, blocks: &[BlockFP8E5M2], output: &mut [f32], ) -> KernelResult<()>

fn gemv_fp8_e4m3( &self, blocks: &[BlockFP8E4M3], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

fn gemv_fp8_e5m2( &self, blocks: &[BlockFP8E5M2], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

fn gemm_fp8_e4m3( &self, blocks: &[BlockFP8E4M3], inputs: &[f32], outputs: &mut [f32], n_rows: usize, k: usize, batch: usize, ) -> KernelResult<()>

fn gemm_fp8_e5m2( &self, blocks: &[BlockFP8E5M2], inputs: &[f32], outputs: &mut [f32], n_rows: usize, k: usize, batch: usize, ) -> KernelResult<()>

fn name_fp8(&self) -> &'static str

impl OneBitKernel for KernelDispatcher

fn dequant( &self, blocks: &[BlockQ1_0G128], output: &mut [f32], ) -> KernelResult<()>

fn gemv( &self, blocks: &[BlockQ1_0G128], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

fn gemm( &self, blocks: &[BlockQ1_0G128], input: &[f32], output: &mut [f32], m: usize, n_rows: usize, k: usize, ) -> KernelResult<()>

fn name(&self) -> &'static str

fn is_gpu_accelerated(&self) -> bool

fn upload_weights(&self, blocks: &[BlockQ1_0G128]) -> Option<GpuWeightHandle>

fn gemv_cached( &self, handle: GpuWeightHandle, input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

fn batch_attn_phase( &self, hidden: &[f32], norm_weight: &[f32], norm_eps: f32, qkv_handle: GpuWeightHandle, q_rows: usize, k_rows: usize, h: usize, ) -> KernelResult<Option<(Vec<f32>, Vec<f32>, Vec<f32>)>>

fn batch_ffn_phase( &self, hidden: &mut [f32], attn_out: &[f32], norm_weight: &[f32], norm_eps: f32, attn_proj_handle: GpuWeightHandle, gate_up_handle: GpuWeightHandle, down_handle: GpuWeightHandle, h: usize, intermediate: usize, attn_proj_k: usize, ) -> KernelResult<bool>

impl TernaryKernel for KernelDispatcher

fn dequant_ternary_g128( &self, blocks: &[BlockTQ2_0_g128], output: &mut [f32], ) -> KernelResult<()>

fn gemv_ternary_g128( &self, blocks: &[BlockTQ2_0_g128], input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

fn gemm_ternary_g128( &self, blocks: &[BlockTQ2_0_g128], input: &[f32], output: &mut [f32], m: usize, n_rows: usize, k: usize, ) -> KernelResult<()>

fn upload_weights_ternary( &self, blocks: &[BlockTQ2_0_g128], ) -> Option<GpuWeightHandle>

fn gemv_ternary_g128_cached( &self, handle: GpuWeightHandle, input: &[f32], output: &mut [f32], n_rows: usize, k: usize, ) -> KernelResult<()>

Auto Trait Implementations§

impl Freeze for KernelDispatcher

impl RefUnwindSafe for KernelDispatcher

impl Send for KernelDispatcher

impl Sync for KernelDispatcher

impl Unpin for KernelDispatcher

impl UnsafeUnpin for KernelDispatcher

impl UnwindSafe for KernelDispatcher

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

Struct KernelDispatcher

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,