// mlx-native 0.6.3
//
// Pure-Rust Metal GPU compute library for MLX-compatible inference on Apple Silicon.
// Documentation
//! GPU kernel host-side dispatch functions.
//!
//! Each submodule implements dispatch for a specific kernel family.

pub mod argmax;
pub mod argsort;
pub mod top_k;
pub mod copy;
pub mod cumsum;
pub mod dense_gemm;
pub mod dense_gemv_bf16;
pub mod dense_mm_bf16;
pub mod dense_mm_f16;
pub mod dense_mm_f32_f32;
pub mod elementwise;
pub mod embedding;
pub mod gather;
pub mod gather_bench;
pub mod hadamard;
pub mod hadamard_quantize_kv;
pub mod encode_helpers;
pub mod fused_head_norm_rope;
pub mod fused_norm_add;
pub mod fused_residual_norm;
pub mod gelu;
pub mod kv_cache_copy;
pub mod l2_norm;
pub mod moe_dispatch;
pub mod moe_gate;
pub mod moe_softmax_topk;
pub mod moe_weighted_reduce;
pub mod qkv_split;
pub mod repeat_tiled;
pub mod quantized_matmul;
pub mod quantized_matmul_ggml;
pub mod quantized_matmul_id;
pub mod quantized_matmul_id_ggml;
pub mod rms_norm;
pub mod rope;
pub mod rope_multi;
pub mod vision_2d_rope;
pub mod scale_mask_softmax;
pub mod sigmoid_mul;
pub mod silu_mul;
pub mod compute_g_beta;
pub mod ssm_norm_gate;
pub mod flash_attn_prefill;
pub mod flash_attn_prefill_blk;
pub mod flash_attn_prefill_d512;
pub mod flash_attn_prefill_mask;
pub mod flash_attn_vec;
pub mod flash_attn_vec_tq;
pub mod flash_attn_vec_tq_hb;
pub mod fwht_standalone;
pub mod chunk_gated_delta_rule;
pub mod chunk_gated_delta_rule_tri_solve_invert;
pub mod gated_delta_net;
pub mod gated_delta_net_decode;
pub mod gated_delta_net_chunk;
pub mod gated_delta_net_chunk_o;
pub mod gated_delta_net_kkt;
pub mod gated_delta_net_recompute_wu;
pub mod tq_dequantize_kv;
pub mod sdpa;
pub mod sdpa_decode;
pub mod sdpa_sliding;
pub mod softcap;
pub mod softmax;
pub mod softmax_sample;
pub mod ssm_conv;
pub mod transpose;
pub mod tri_solve;