trueno/backends/gpu/backend_ops.rs
1//! GPU backend operation implementations
2//!
3//! Contains all compute operations for [`GpuBackend`] including:
4//! - Vector operations (add, dot product)
5//! - Activation functions (ReLU, sigmoid, tanh, swish, GELU, softmax, etc.)
6//! - Matrix operations (matmul, convolve2d, eigendecomposition)
7//! - Tiled reductions (sum, max, min)
8
9use super::GpuBackend;
10
#[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
impl GpuBackend {
    /// Validates that `a` and `b` have matching, non-empty lengths.
    ///
    /// wgpu doesn't allow zero-sized buffers, so empty inputs are rejected
    /// up front with a clear error instead of failing inside the driver.
    fn validate_binary_inputs(a: &[f32], b: &[f32]) -> Result<(), String> {
        if a.len() != b.len() {
            return Err(format!("Vector length mismatch: {} != {}", a.len(), b.len()));
        }
        if a.is_empty() {
            return Err("Cannot perform GPU operation on empty vectors".to_string());
        }
        Ok(())
    }

    /// Validates that a flattened buffer holds exactly `rows * cols` elements.
    ///
    /// `name` labels the buffer in error messages. Uses `checked_mul` so that
    /// absurd dimensions report an error rather than wrapping in release builds.
    fn validate_matrix_len(name: &str, len: usize, rows: usize, cols: usize) -> Result<(), String> {
        let expected = rows
            .checked_mul(cols)
            .ok_or_else(|| format!("{} dimensions {}x{} overflow usize", name, rows, cols))?;
        if len != expected {
            return Err(format!(
                "{} length {} does not match dimensions {}x{} ({} elements)",
                name, len, rows, cols, expected
            ));
        }
        Ok(())
    }

    /// Vector addition on GPU: c = a + b
    ///
    /// # Arguments
    ///
    /// * `a` - Vector a
    /// * `b` - Vector b
    ///
    /// # Returns
    ///
    /// Vector c (element-wise sum)
    ///
    /// # Errors
    ///
    /// Returns an error if the lengths differ, the vectors are empty, or the
    /// GPU device is unavailable.
    pub fn vec_add(&mut self, a: &[f32], b: &[f32]) -> Result<Vec<f32>, String> {
        Self::validate_binary_inputs(a, b)?;

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; a.len()];

        // Execute GPU compute
        device.vec_add(a, b, &mut result)?;

        Ok(result)
    }

    /// Dot product on GPU: result = sum(a[i] * b[i])
    ///
    /// # Arguments
    ///
    /// * `a` - Vector a
    /// * `b` - Vector b
    ///
    /// # Returns
    ///
    /// Scalar dot product result
    ///
    /// # Errors
    ///
    /// Returns an error if the lengths differ, the vectors are empty
    /// (wgpu rejects zero-sized buffers), or the GPU device is unavailable.
    pub fn dot(&mut self, a: &[f32], b: &[f32]) -> Result<f32, String> {
        // Same guard as `vec_add`: empty inputs would otherwise fail deep
        // inside the device with an opaque zero-sized-buffer error.
        Self::validate_binary_inputs(a, b)?;

        let device = self.ensure_device()?;

        // Execute GPU compute
        device.dot(a, b)
    }

    /// ReLU activation on GPU: result[i] = max(0, input[i])
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with ReLU applied element-wise; empty input yields an empty vector.
    pub fn relu(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        // Element-wise op on empty input is trivially empty; skip the GPU
        // round-trip (wgpu doesn't allow zero-sized buffers).
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.relu(input, &mut result)?;

        Ok(result)
    }

    /// Leaky ReLU activation on GPU: result[i] = max(negative_slope * input[i], input[i])
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    /// * `negative_slope` - Slope for negative values (typically 0.01)
    ///
    /// # Returns
    ///
    /// Vector with leaky ReLU applied element-wise; empty input yields an empty vector.
    pub fn leaky_relu(&mut self, input: &[f32], negative_slope: f32) -> Result<Vec<f32>, String> {
        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.leaky_relu(input, &mut result, negative_slope)?;

        Ok(result)
    }

    /// ELU activation on GPU: result[i] = x if x > 0, else alpha * (exp(x) - 1)
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    /// * `alpha` - Scaling factor for negative values (typically 1.0)
    ///
    /// # Returns
    ///
    /// Vector with ELU applied element-wise; empty input yields an empty vector.
    pub fn elu(&mut self, input: &[f32], alpha: f32) -> Result<Vec<f32>, String> {
        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.elu(input, &mut result, alpha)?;

        Ok(result)
    }

    /// Clip (clamp) operation on GPU: result[i] = clamp(input[i], min_val, max_val)
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    /// * `min_val` - Minimum value
    /// * `max_val` - Maximum value
    ///
    /// # Returns
    ///
    /// Vector with clip applied element-wise; empty input yields an empty vector.
    ///
    /// # Errors
    ///
    /// Returns an error if `min_val > max_val` (clamp would be undefined).
    pub fn clip(&mut self, input: &[f32], min_val: f32, max_val: f32) -> Result<Vec<f32>, String> {
        // Reject an inverted range explicitly; clamp semantics are undefined
        // when min > max (std's f32::clamp panics in that case).
        if min_val > max_val {
            return Err(format!("Invalid clip range: min {} > max {}", min_val, max_val));
        }

        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.clip(input, &mut result, min_val, max_val)?;

        Ok(result)
    }

    /// Sigmoid activation on GPU: result[i] = 1 / (1 + exp(-input[i]))
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with sigmoid applied element-wise; empty input yields an empty vector.
    pub fn sigmoid(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.sigmoid(input, &mut result)?;

        Ok(result)
    }

    /// Tanh activation on GPU: result[i] = tanh(input[i])
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with tanh applied element-wise; empty input yields an empty vector.
    pub fn tanh(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.tanh(input, &mut result)?;

        Ok(result)
    }

    /// Swish activation on GPU: result[i] = input[i] / (1 + exp(-input[i]))
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with swish applied element-wise; empty input yields an empty vector.
    pub fn swish(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.swish(input, &mut result)?;

        Ok(result)
    }

    /// GELU activation on GPU: result[i] = 0.5 * input[i] * (1 + tanh(...))
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with GELU applied element-wise; empty input yields an empty vector.
    pub fn gelu(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        // Trivially empty result for empty input; avoids zero-sized GPU buffers.
        if input.is_empty() {
            return Ok(Vec::new());
        }

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.gelu(input, &mut result)?;

        Ok(result)
    }

    /// Softmax activation on GPU: result[i] = exp(input[i] - max) / sum(exp(input - max))
    ///
    /// Uses multi-pass reduction for numerical stability:
    /// - Pass 1: Max reduction (parallel)
    /// - Pass 2: Exp-subtract (element-wise)
    /// - Pass 3: Sum reduction (parallel)
    /// - Pass 4: Normalize (element-wise)
    ///
    /// Pre/post conditions are enforced by the `contract_*_softmax!` macros
    /// (defined elsewhere in the crate).
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with softmax applied element-wise
    pub fn softmax(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        contract_pre_softmax!(input);
        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.softmax(input, &mut result)?;

        contract_post_softmax!(&result);
        Ok(result)
    }

    /// Log-softmax activation on GPU: result[i] = log(softmax(input)[i])
    ///
    /// Uses multi-pass reduction for numerical stability:
    /// - Pass 1: Max reduction (parallel)
    /// - Pass 2: Exp-subtract (element-wise)
    /// - Pass 3: Sum reduction (parallel)
    /// - Pass 4: Log-normalize (element-wise)
    ///
    /// Precondition is enforced by the `contract_pre_log_softmax!` macro
    /// (defined elsewhere in the crate).
    ///
    /// # Arguments
    ///
    /// * `input` - Input vector
    ///
    /// # Returns
    ///
    /// Vector with log-softmax applied element-wise
    pub fn log_softmax(&mut self, input: &[f32]) -> Result<Vec<f32>, String> {
        contract_pre_log_softmax!(input);
        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; input.len()];

        // Execute GPU compute
        device.log_softmax(input, &mut result)?;

        Ok(result)
    }

    /// 2D Convolution on GPU: output = input (convolved with) kernel
    ///
    /// # Arguments
    ///
    /// * `input` - Input matrix (flattened row-major)
    /// * `kernel` - Convolution kernel (flattened row-major)
    /// * `input_rows` - Number of rows in input
    /// * `input_cols` - Number of columns in input
    /// * `kernel_rows` - Number of rows in kernel
    /// * `kernel_cols` - Number of columns in kernel
    ///
    /// # Returns
    ///
    /// Output matrix (flattened row-major, "valid" convolution)
    /// - output_rows = input_rows - kernel_rows + 1
    /// - output_cols = input_cols - kernel_cols + 1
    ///
    /// # Errors
    ///
    /// Returns an error if slice lengths don't match the stated dimensions,
    /// the kernel is empty, or the kernel is larger than the input (a "valid"
    /// convolution is undefined in that case).
    pub fn convolve2d(
        &mut self,
        input: &[f32],
        kernel: &[f32],
        input_rows: usize,
        input_cols: usize,
        kernel_rows: usize,
        kernel_cols: usize,
    ) -> Result<Vec<f32>, String> {
        Self::validate_matrix_len("Input", input.len(), input_rows, input_cols)?;
        Self::validate_matrix_len("Kernel", kernel.len(), kernel_rows, kernel_cols)?;
        if kernel_rows == 0 || kernel_cols == 0 {
            return Err("Kernel dimensions must be non-zero".to_string());
        }
        // A kernel larger than the input has no "valid" output; the previous
        // saturating arithmetic silently produced wrong dimensions here.
        if kernel_rows > input_rows || kernel_cols > input_cols {
            return Err(format!(
                "Kernel ({}x{}) is larger than input ({}x{})",
                kernel_rows, kernel_cols, input_rows, input_cols
            ));
        }

        let device = self.ensure_device()?;

        // Calculate output dimensions (plain arithmetic is safe after the
        // fit check above).
        let output_rows = input_rows - kernel_rows + 1;
        let output_cols = input_cols - kernel_cols + 1;

        // Create output buffer
        let mut result = vec![0.0f32; output_rows * output_cols];

        // Execute GPU compute
        device.convolve2d(
            input,
            kernel,
            &mut result,
            input_rows,
            input_cols,
            kernel_rows,
            kernel_cols,
        )?;

        Ok(result)
    }

    /// Matrix multiplication on GPU: C = A x B
    ///
    /// # Arguments
    ///
    /// * `a` - Matrix A (m x k) in row-major order
    /// * `b` - Matrix B (k x n) in row-major order
    /// * `m` - Rows of A and C
    /// * `k` - Cols of A, rows of B
    /// * `n` - Cols of B and C
    ///
    /// # Returns
    ///
    /// Matrix C (m x n) in row-major order
    ///
    /// # Errors
    ///
    /// Returns an error if any dimension is zero or a slice length doesn't
    /// match its stated dimensions.
    pub fn matmul(
        &mut self,
        a: &[f32],
        b: &[f32],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<Vec<f32>, String> {
        // Zero dimensions would produce zero-sized GPU buffers, which wgpu
        // rejects; fail early with a clear message instead.
        if m == 0 || k == 0 || n == 0 {
            return Err(format!("Matrix dimensions must be non-zero: m={}, k={}, n={}", m, k, n));
        }
        Self::validate_matrix_len("Matrix A", a.len(), m, k)?;
        Self::validate_matrix_len("Matrix B", b.len(), k, n)?;

        let device = self.ensure_device()?;

        // Create output buffer
        let mut result = vec![0.0f32; m * n];

        // Execute GPU compute
        device.matmul(a, b, &mut result, m, k, n)?;

        Ok(result)
    }

    /// Symmetric eigendecomposition on GPU
    ///
    /// Computes eigenvalues and eigenvectors using Jacobi algorithm with
    /// GPU-accelerated Givens rotations.
    ///
    /// # Arguments
    ///
    /// * `matrix` - Symmetric matrix data (row-major, n x n)
    /// * `n` - Matrix dimension
    ///
    /// # Returns
    ///
    /// Tuple of (eigenvalues, eigenvector_data) where eigenvector_data is row-major
    ///
    /// # Errors
    ///
    /// Returns an error if `n` is zero or `matrix.len() != n * n`.
    pub fn symmetric_eigen(
        &mut self,
        matrix: &[f32],
        n: usize,
    ) -> Result<(Vec<f32>, Vec<f32>), String> {
        if n == 0 {
            return Err("Matrix dimension must be non-zero".to_string());
        }
        Self::validate_matrix_len("Matrix", matrix.len(), n, n)?;

        let device = self.ensure_device()?;
        device.symmetric_eigen(matrix, n)
    }

    /// 2D Tiled Sum Reduction on GPU
    ///
    /// Uses 16x16 workgroups for efficient parallel reduction with
    /// optimal memory coalescing.
    ///
    /// # Arguments
    ///
    /// * `data` - Input 2D data in row-major order
    /// * `width` - Number of columns
    /// * `height` - Number of rows
    ///
    /// # Returns
    ///
    /// Sum of all elements
    ///
    /// # Errors
    ///
    /// Returns an error if `data.len() != width * height` or the data is empty.
    pub fn tiled_sum_2d_gpu(
        &mut self,
        data: &[f32],
        width: usize,
        height: usize,
    ) -> Result<f32, String> {
        Self::validate_matrix_len("Data", data.len(), height, width)?;
        if data.is_empty() {
            return Err("Cannot perform GPU reduction on empty data".to_string());
        }

        let device = self.ensure_device()?;
        device.tiled_sum_2d(data, width, height)
    }

    /// 2D Tiled Max Reduction on GPU
    ///
    /// Uses 16x16 workgroups for efficient parallel max reduction.
    ///
    /// # Errors
    ///
    /// Returns an error if `data.len() != width * height` or the data is
    /// empty (the max of an empty set is undefined).
    pub fn tiled_max_2d_gpu(
        &mut self,
        data: &[f32],
        width: usize,
        height: usize,
    ) -> Result<f32, String> {
        Self::validate_matrix_len("Data", data.len(), height, width)?;
        if data.is_empty() {
            return Err("Cannot perform GPU reduction on empty data".to_string());
        }

        let device = self.ensure_device()?;
        device.tiled_max_2d(data, width, height)
    }

    /// 2D Tiled Min Reduction on GPU
    ///
    /// Uses 16x16 workgroups for efficient parallel min reduction.
    ///
    /// # Errors
    ///
    /// Returns an error if `data.len() != width * height` or the data is
    /// empty (the min of an empty set is undefined).
    pub fn tiled_min_2d_gpu(
        &mut self,
        data: &[f32],
        width: usize,
        height: usize,
    ) -> Result<f32, String> {
        Self::validate_matrix_len("Data", data.len(), height, width)?;
        if data.is_empty() {
            return Err("Cannot perform GPU reduction on empty data".to_string());
        }

        let device = self.ensure_device()?;
        device.tiled_min_2d(data, width, height)
    }
}
445}