entrenar/autograd/cuda_forward/
mod.rs1mod activations;
// Submodule declarations for `cuda_forward`. Only `bf16_cast` and `matmul_f16`
// are declared `pub`, so their paths are nameable from outside this module;
// the remaining submodules are private and their items are reachable only
// through the `use` re-exports further down in this file.
31pub mod bf16_cast;
32mod cache;
33mod elementwise;
34mod matmul;
35pub mod matmul_f16;
36mod normalization;
// The test submodule is compiled only when building with `cfg(test)`.
37#[cfg(test)]
38mod tests;
39
// Public re-exports from the private `activations` submodule (no feature gate).
40pub use activations::{
41 batched_softmax_forward, gelu_forward, relu_forward, silu_forward, softmax_forward,
42};
// `bf16_cast`: the two slice helpers are re-exported unconditionally, while the
// `*_gpu` cast items are re-exported only when the `cuda` feature is enabled.
43pub use bf16_cast::{bf16_slice_to_f32, f32_slice_to_bf16};
44#[cfg(feature = "cuda")]
45pub use bf16_cast::{cast_bf16_to_f32_gpu, cast_f32_to_bf16_gpu, cast_f32_to_f16_gpu};
// `cache`: `set_forward_cublas_stream` is crate-visible only and requires the
// `cuda` feature; the remaining cache items are public and ungated.
46#[cfg(feature = "cuda")]
47pub(crate) use cache::set_forward_cublas_stream;
48pub use cache::{
49 init_forward_kernel_cache, pre_warm_forward_kernels, pre_warm_lora_backward_kernels,
50 set_cublas_workspace,
51};
// Public re-exports from the private `elementwise` submodule (no feature gate).
52pub use elementwise::{
53 batched_to_interleaved_forward, batched_transpose_forward, elementwise_mul_forward,
54 expand_kv_heads, inplace_add_gpu, interleaved_to_batched_forward, residual_add_forward,
55 scale_forward,
56};
// `matmul` is re-exported in four groups that differ in visibility and gating:
//   - `gemm_forward_bf16`: public, `cuda` feature only.
//   - the braced list that follows it: public, ungated.
//   - the `cublas_gemm_backward_*` items: `pub(crate)`, `cuda` feature only.
//   - `gemm_nf4_backward_a_cublas` / `gemm_nf4_dequant_cublas`: public, `cuda` feature only.
// Each `#[cfg]` attribute applies only to the single `use` item directly after it.
57#[cfg(feature = "cuda")]
58pub use matmul::gemm_forward_bf16;
59pub use matmul::{
60 batched_4d_gemm_forward, fused_swiglu_forward, gemm_forward, gemm_forward_bt,
61 gemm_nf4_backward_a, gemm_nf4_forward, gemm_nf4_gate_up_forward, gemm_nf4_tc_backward_a,
62 gemm_nf4_tc_forward,
63};
64#[cfg(feature = "cuda")]
65pub(crate) use matmul::{
66 cublas_gemm_backward_a, cublas_gemm_backward_a_accumulate, cublas_gemm_backward_b,
67};
68#[cfg(feature = "cuda")]
69pub use matmul::{gemm_nf4_backward_a_cublas, gemm_nf4_dequant_cublas};
// `matmul_f16` is itself a `pub mod`, but these flattened re-exports at the
// `cuda_forward` level exist only when the `cuda` feature is enabled.
70#[cfg(feature = "cuda")]
71pub use matmul_f16::{gemm_f16_to_f32_backward_a, gemm_f16_to_f32_forward, gemm_forward_f16};
// Public re-exports from the private `normalization` submodule (no feature gate).
72pub use normalization::{
73 batched_rope_neox_backward, batched_rope_neox_forward, fused_residual_rmsnorm_forward,
74 layer_norm_forward, per_head_rmsnorm_forward, rms_norm_forward, rope_neox_forward,
75};