1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
//! CUDA kernel implementations for tensor operations
//!
//! This module provides native CUDA kernels for all tensor operations,
//! eliminating the need for CPU fallback in most cases.
//!
//! # Architecture
//!
//! Kernels are written in CUDA C++ (.cu files) and compiled to PTX by build.rs.
//! The PTX is loaded at runtime and cached per-device for efficient reuse.
//!
//! # Module Organization
//!
//! - `loader` - Kernel loading, caching, and generic launch infrastructure
//! - `binary` - Binary element-wise operations (add, sub, mul, div, pow, max, min, logical_and/or/xor)
//! - `unary` - Unary element-wise operations (neg, abs, sqrt, exp, log, sign, isnan, isinf, logical_not, etc.)
//! - `scalar` - Tensor-scalar operations (add_scalar, mul_scalar, etc.)
//! - `reduce` - Reduction operations (sum, max, min)
//! - `compare` - Comparison operations (eq, ne, lt, le, gt, ge)
//! - `activation` - Activation functions (relu, sigmoid, softmax, silu, gelu)
//! - `norm` - Normalization operations (rms_norm, layer_norm)
//! - `cast` - Type casting operations
//! - `utility` - Utility operations (fill)
//! - `ternary` - Ternary operations (where)
//! - `complex` - Complex number operations
//! - `special` - Special mathematical functions
//! - `statistics` - Statistical operations
//! - `cumulative` - Cumulative (prefix) operations
//! - `distance` - Distance operations
//! - `sort` - Sorting operations
//! - `index` - Indexing operations
//! - `shape` - Shape operations
//! - `strided_copy` - Strided copy operations
//! - `conv` - Convolution operations
//! - `fft` - FFT operations
//! - `linalg` / `linalg_launchers` - Linear algebra kernels and launchers
//! - `gemm_epilogue` - GEMM epilogue operations
//! - `fused_activation_mul`, `fused_add_norm`, `fused_elementwise` - Fused kernels
//! - `distributions`, `advanced_random`, `quasirandom` - Random number generation
//! - `fp8_matmul` - FP8 matrix multiplication (requires the `fp8` feature)
//!
//! The following modules are only compiled with the `sparse` feature:
//!
//! - `sparse_spmv` - Sparse matrix operations (SpMV, SpMM)
//! - `sparse_merge` - Sparse matrix merge operations
//! - `sparse_convert` - Sparse format conversions (COO/CSR/CSC)
//! - `sparse_coo` - COO sparse element-wise operations with GPU sorting
//! - `sparse_strategy` - Merge strategy types used by `sparse_merge`
//! - `sparse_linalg` - Sparse linear algebra operations
//! - `sparse_24_launcher` - 2:4 structured sparsity launchers
//! - `sparse_utils` - Shared sparse utilities
//! - `spgemm` - Sparse-sparse matrix multiplication (SpGEMM)
//! - `scan` - Prefix sum operations
//!
//! # Kernel Files
//!
//! - `binary.cu` - Binary element-wise operations
//! - `unary.cu` - Unary element-wise operations
//! - `scalar.cu` - Tensor-scalar operations
//! - `reduce.cu` - Reduction operations
//! - `compare.cu` - Comparison operations
//! - `activation.cu` - Activation functions
//! - `norm.cu` - Normalization operations
//! - `cast.cu` - Type casting operations
//! - `utility.cu` - Utility operations
//! - `ternary.cu` - Ternary operations
//! - `sparse_spmv.cu` - Sparse matrix operations
//! - `sparse_merge.cu` - Sparse matrix merge operations
//! - `sparse_convert.cu` - Sparse format conversions
//! - `sparse_coo.cu` - COO sparse element-wise operations with GPU sorting
//! - `scan.cu` - Prefix sum operations
mod activation;
mod advanced_random;
mod binary;
mod cast;
mod compare;
mod complex;
mod conv;
mod cumulative;
mod distance;
mod distributions;
mod fft;
#[cfg(feature = "fp8")]
mod fp8_matmul;
mod fused_activation_mul;
mod fused_add_norm;
mod fused_elementwise;
mod gemm_epilogue;
mod index;
mod linalg;
pub mod linalg_launchers;
pub(in crate::runtime::cuda) mod loader;
mod norm;
mod quasirandom;
mod reduce;
mod scalar;
#[cfg(feature = "sparse")]
mod scan;
mod shape;
mod sort;
#[cfg(feature = "sparse")]
mod sparse_24_launcher;
#[cfg(feature = "sparse")]
mod sparse_convert;
#[cfg(feature = "sparse")]
mod sparse_coo;
#[cfg(feature = "sparse")]
mod sparse_linalg;
#[cfg(feature = "sparse")]
mod sparse_merge;
#[cfg(feature = "sparse")]
mod sparse_spmv;
#[cfg(feature = "sparse")]
mod sparse_strategy;
#[cfg(feature = "sparse")]
mod sparse_utils;
mod special;
#[cfg(feature = "sparse")]
mod spgemm;
mod statistics;
mod strided_copy;
mod ternary;
mod unary;
mod utility;
pub use activation::*;
pub use advanced_random::*;
pub use binary::*;
pub use cast::*;
pub use compare::*;
pub use complex::*;
pub use conv::*;
pub use cumulative::*;
pub use distance::*;
pub use distributions::*;
pub use fft::*;
#[cfg(feature = "fp8")]
pub use fp8_matmul::*;
pub use fused_activation_mul::*;
pub use fused_add_norm::*;
pub use fused_elementwise::*;
pub use gemm_epilogue::*;
pub use index::*;
pub use linalg::*;
pub use norm::*;
pub use quasirandom::*;
pub use reduce::*;
pub use scalar::*;
#[cfg(feature = "sparse")]
#[allow(unused_imports)]
pub use scan::*;
pub use shape::*;
pub use sort::*;
#[cfg(feature = "sparse")]
pub use sparse_24_launcher::*;
#[cfg(feature = "sparse")]
pub use sparse_convert::*;
#[cfg(feature = "sparse")]
pub use sparse_coo::*;
#[cfg(feature = "sparse")]
pub use sparse_linalg::*;
#[cfg(feature = "sparse")]
pub use sparse_merge::*;
#[cfg(feature = "sparse")]
pub use sparse_spmv::*;
#[cfg(feature = "sparse")]
#[allow(unused_imports)]
// Sparse strategy types (AddMerge, SubMerge, etc.) used internally in sparse_merge
pub use sparse_strategy::*;
#[cfg(feature = "sparse")]
pub use sparse_utils::*;
pub use special::*;
#[cfg(feature = "sparse")]
pub use spgemm::*;
pub use statistics::*;
pub use strided_copy::*;
pub use ternary::*;
pub use unary::*;
#[allow(unused_imports)] // Prepared for future tensor creation optimization
pub use utility::*;
// Re-export commonly used items from loader for advanced users
#[allow(unused_imports)]
pub use loader::{
BLOCK_SIZE, LaunchConfig, kernel_names, launch_gemv_kernel_bt, launch_gemv_kernel_bt_mr,
launch_matmul_batched_kernel, launch_matmul_bias_batched_kernel, launch_matmul_bias_kernel,
launch_matmul_kernel, launch_semiring_matmul_batched_kernel, launch_semiring_matmul_kernel,
preload_modules,
};