1#![allow(unused_variables)] use crate::errors::{Result, TrustformersError};
4use crate::gpu::GpuBackend;
5#[cfg(feature = "cuda")]
6use crate::kernels::cuda_kernels::CudaKernel;
7#[cfg(feature = "intel")]
8use crate::kernels::intel_kernels::{IntelKernel, IntelKernelConfig};
9#[cfg(feature = "rocm")]
10use crate::kernels::rocm_kernels::RocmKernel;
11#[cfg(feature = "vulkan")]
12use crate::kernels::vulkan_kernels::VulkanKernel;
13use crate::tensor::Tensor;
14#[cfg(any(
15 feature = "cuda",
16 feature = "rocm",
17 feature = "intel",
18 feature = "vulkan"
19))]
20use std::sync::{Arc, Mutex};
21
/// Routes tensor operations to the detected GPU backend's kernel, with
/// CPU reference implementations as the fallback path.
pub struct GpuAcceleratedOps {
    /// Backend chosen at construction via `GpuBackend::default()`.
    backend: GpuBackend,
    /// CUDA kernel handle; `Some` only when the CUDA backend was detected.
    #[cfg(feature = "cuda")]
    cuda_kernel: Option<Arc<Mutex<CudaKernel>>>,
    /// ROCm kernel handle; `Some` only when the ROCm backend was detected.
    #[cfg(feature = "rocm")]
    rocm_kernel: Option<Arc<Mutex<RocmKernel>>>,
    /// Intel oneAPI kernel handle; `Some` only when the Intel backend was detected.
    #[cfg(feature = "intel")]
    intel_kernel: Option<Arc<Mutex<IntelKernel>>>,
    /// Vulkan kernel handle; `Some` only when the Vulkan backend was detected.
    #[cfg(feature = "vulkan")]
    vulkan_kernel: Option<Arc<Mutex<VulkanKernel>>>,
    /// Device ordinal; used for kernel initialization and memory queries.
    #[allow(dead_code)]
    device_id: usize,
    /// Whether asynchronous execution was requested (not consumed by the
    /// code visible in this file).
    #[allow(dead_code)]
    enable_async: bool,
}
41
/// Construction parameters for [`GpuAcceleratedOps`].
#[derive(Debug, Clone)]
pub struct GpuOpsConfig {
    /// GPU device ordinal to target.
    pub device_id: usize,
    /// Request asynchronous kernel execution.
    pub enable_async: bool,
    /// Size of the device memory pool, in bytes.
    pub memory_pool_size: u64,
    /// Maximum number of compiled kernels to cache.
    pub kernel_cache_size: usize,
    /// Numeric precision kernels should compute in.
    pub precision: GpuPrecision,
}
51
/// Numeric precision selector for GPU computation.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum GpuPrecision {
    /// 32-bit IEEE-754 float.
    FP32,
    /// 16-bit IEEE-754 half float.
    FP16,
    /// bfloat16 (truncated 32-bit float).
    BF16,
    /// 8-bit integer (quantized).
    INT8,
    /// 4-bit integer (quantized).
    INT4,
}
61
62impl Default for GpuOpsConfig {
63 fn default() -> Self {
64 Self {
65 device_id: 0,
66 enable_async: true,
67 memory_pool_size: 2 * 1024 * 1024 * 1024, kernel_cache_size: 1000,
69 precision: GpuPrecision::FP32,
70 }
71 }
72}
73
74impl GpuAcceleratedOps {
    /// Constructs the dispatcher, auto-detecting the GPU backend and
    /// eagerly creating the kernel for that backend. At most one kernel is
    /// ever instantiated; the handles for the other backends stay `None`.
    ///
    /// # Errors
    /// Propagates any failure from kernel construction or initialization.
    pub fn new(config: GpuOpsConfig) -> Result<Self> {
        // Backend choice comes from environment detection, not from
        // `config`; `config` only supplies per-device parameters.
        // NOTE(review): config.memory_pool_size, kernel_cache_size and
        // precision are not consumed here — confirm whether the kernels
        // are expected to pick them up elsewhere.
        let backend = GpuBackend::default();

        #[cfg(feature = "cuda")]
        let cuda_kernel = if backend == GpuBackend::Cuda {
            Some(Arc::new(Mutex::new(CudaKernel::new()?)))
        } else {
            None
        };

        #[cfg(feature = "rocm")]
        let rocm_kernel = if backend == GpuBackend::Rocm {
            Some(Arc::new(Mutex::new(RocmKernel::new()?)))
        } else {
            None
        };

        #[cfg(feature = "intel")]
        let intel_kernel = if backend == GpuBackend::Intel {
            // Intel kernels take an explicit config; unspecified fields
            // fall back to the kernel's defaults.
            let intel_config = IntelKernelConfig {
                device_id: config.device_id,
                workgroup_size: 256,
                ..Default::default()
            };
            Some(Arc::new(Mutex::new(IntelKernel::new(intel_config)?)))
        } else {
            None
        };

        #[cfg(feature = "vulkan")]
        let vulkan_kernel = if backend == GpuBackend::Vulkan {
            // Vulkan requires a separate initialize step after construction.
            let mut kernel = VulkanKernel::new()?;
            kernel.initialize(config.device_id)?;
            Some(Arc::new(Mutex::new(kernel)))
        } else {
            None
        };

        Ok(Self {
            backend,
            #[cfg(feature = "cuda")]
            cuda_kernel,
            #[cfg(feature = "rocm")]
            rocm_kernel,
            #[cfg(feature = "intel")]
            intel_kernel,
            #[cfg(feature = "vulkan")]
            vulkan_kernel,
            device_id: config.device_id,
            enable_async: config.enable_async,
        })
    }
128
    /// 2-D matrix multiply: `a [m, k] · b [k, n] -> [m, n]`.
    ///
    /// Dispatches to the active backend's kernel. Note the asymmetry with
    /// `batch_matmul`: here a missing or compiled-out kernel is an error
    /// rather than a silent CPU fallback; only non-GPU backends use
    /// `cpu_matmul`.
    ///
    /// # Errors
    /// * either input is not 2-D, or the inner dimensions disagree
    /// * the selected backend's kernel is unavailable or its cargo feature
    ///   is not enabled
    pub fn matmul(&self, a: &Tensor, b: &Tensor) -> Result<Tensor> {
        let a_shape = a.shape();
        let b_shape = b.shape();

        if a_shape.len() != 2 || b_shape.len() != 2 {
            return Err(TrustformersError::tensor_op_error(
                "Matrix multiplication requires 2D tensors",
                "matmul",
            ));
        }

        if a_shape[1] != b_shape[0] {
            return Err(TrustformersError::tensor_op_error(
                "Matrix dimensions incompatible for multiplication",
                "matmul",
            ));
        }

        // Output is pre-allocated; kernels write into it in place.
        let mut result = Tensor::zeros(&[a_shape[0], b_shape[1]])?;

        match self.backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                if let Some(ref cuda_kernel) = self.cuda_kernel {
                    let mut kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                    kernel.matmul(a, b, &mut result, None)?;
                } else {
                    return Err(TrustformersError::tensor_op_error(
                        "CUDA kernel not available",
                        "matmul",
                    ));
                }
                #[cfg(not(feature = "cuda"))]
                return Err(TrustformersError::tensor_op_error(
                    "CUDA support not enabled",
                    "matmul",
                ));
            },
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                if let Some(ref rocm_kernel) = self.rocm_kernel {
                    let mut kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                    kernel.matmul(a, b, &mut result, None)?;
                } else {
                    return Err(TrustformersError::tensor_op_error(
                        "ROCm kernel not available",
                        "matmul",
                    ));
                }
                #[cfg(not(feature = "rocm"))]
                return Err(TrustformersError::tensor_op_error(
                    "ROCm support not enabled",
                    "matmul",
                ));
            },
            GpuBackend::Intel => {
                #[cfg(feature = "intel")]
                if let Some(ref intel_kernel) = self.intel_kernel {
                    let mut kernel = intel_kernel.lock().expect("lock should not be poisoned");
                    // GEMM with alpha=1, beta=0 reduces to a plain matmul.
                    kernel.gemm(
                        a,
                        b,
                        &mut result,
                        1.0,
                        0.0,
                        crate::kernels::intel_kernels::IntelPrecision::FP32,
                    )?;
                } else {
                    return Err(TrustformersError::tensor_op_error(
                        "Intel kernel not available",
                        "matmul",
                    ));
                }
                #[cfg(not(feature = "intel"))]
                return Err(TrustformersError::tensor_op_error(
                    "Intel oneAPI support not enabled",
                    "matmul",
                ));
            },
            GpuBackend::Vulkan => {
                #[cfg(feature = "vulkan")]
                if let Some(ref vulkan_kernel) = self.vulkan_kernel {
                    let mut kernel = vulkan_kernel.lock().expect("lock should not be poisoned");
                    kernel.matmul(a, b, &mut result, None)?;
                } else {
                    return Err(TrustformersError::tensor_op_error(
                        "Vulkan kernel not available",
                        "matmul",
                    ));
                }
                #[cfg(not(feature = "vulkan"))]
                return Err(TrustformersError::tensor_op_error(
                    "Vulkan support not enabled",
                    "matmul",
                ));
            },
            _ => {
                // CPU (and any other non-GPU backend) reference path.
                self.cpu_matmul(a, b, &mut result)?;
            },
        }

        Ok(result)
    }
234
    /// Batched matrix multiply for 3-D tensors:
    /// `a [b, m, k] · b [b, k, n] -> [b, m, n]`, one matmul per batch entry.
    ///
    /// Unlike `matmul`, a missing kernel falls back to the CPU path rather
    /// than erroring.
    ///
    /// NOTE(review): each iteration writes into `result.slice(...)`; this
    /// only updates `result` if `slice` returns a mutable view rather than
    /// a copy — confirm against the Tensor API. Also the per-batch slices
    /// are taken from 3-D tensors while `cpu_matmul` indexes shapes as
    /// 2-D; verify that `slice` squeezes the leading batch dimension.
    ///
    /// # Errors
    /// Returns a tensor-op error when inputs are not 3-D or batch/inner
    /// dimensions are incompatible.
    pub fn batch_matmul(&self, a: &Tensor, b: &Tensor) -> Result<Tensor> {
        let a_shape = a.shape();
        let b_shape = b.shape();

        if a_shape.len() != 3 || b_shape.len() != 3 {
            return Err(TrustformersError::tensor_op_error(
                "Batch matrix multiplication requires 3D tensors",
                "batch_matmul",
            ));
        }

        if a_shape[0] != b_shape[0] || a_shape[2] != b_shape[1] {
            return Err(TrustformersError::tensor_op_error(
                "Batch matrix dimensions incompatible",
                "batch_matmul",
            ));
        }

        let result = Tensor::zeros(&[a_shape[0], a_shape[1], b_shape[2]])?;

        // Process one batch entry at a time.
        for batch in 0..a_shape[0] {
            let a_slice = a.slice(0, batch, batch + 1)?;
            let b_slice = b.slice(0, batch, batch + 1)?;
            let mut result_slice = result.slice(0, batch, batch + 1)?;

            match self.backend {
                GpuBackend::Cuda => {
                    #[cfg(feature = "cuda")]
                    if let Some(ref cuda_kernel) = self.cuda_kernel {
                        let mut kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                        kernel.matmul(&a_slice, &b_slice, &mut result_slice, None)?;
                    } else {
                        self.cpu_matmul(&a_slice, &b_slice, &mut result_slice)?;
                    }
                    #[cfg(not(feature = "cuda"))]
                    self.cpu_matmul(&a_slice, &b_slice, &mut result_slice)?;
                },
                GpuBackend::Rocm => {
                    #[cfg(feature = "rocm")]
                    if let Some(ref rocm_kernel) = self.rocm_kernel {
                        let mut kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                        kernel.matmul(&a_slice, &b_slice, &mut result_slice, None)?;
                    } else {
                        self.cpu_matmul(&a_slice, &b_slice, &mut result_slice)?;
                    }
                    #[cfg(not(feature = "rocm"))]
                    self.cpu_matmul(&a_slice, &b_slice, &mut result_slice)?;
                },
                _ => {
                    // Intel/Vulkan currently have no batched arm; they use
                    // the CPU path per batch, as do CPU/Metal.
                    self.cpu_matmul(&a_slice, &b_slice, &mut result_slice)?;
                },
            }
        }

        Ok(result)
    }
293
    /// Scaled dot-product attention over `[batch, seq_len, hidden_dim]`
    /// tensors.
    ///
    /// GPU backends use their fused attention kernels; configurations
    /// without a kernel fall back to the CPU reference implementation.
    ///
    /// NOTE(review): the CUDA/ROCm/Vulkan kernel calls forward neither
    /// `scale` nor `mask` (they pass `None`) — confirm those kernels apply
    /// scaling/masking internally, otherwise GPU and CPU results diverge.
    ///
    /// # Errors
    /// Returns a tensor-op error when inputs are not 3-D or the batch
    /// dimensions differ. (Seq-len/hidden-dim mismatches are not
    /// validated here.)
    pub fn flash_attention(
        &self,
        query: &Tensor,
        key: &Tensor,
        value: &Tensor,
        scale: f32,
        mask: Option<&Tensor>,
    ) -> Result<Tensor> {
        let q_shape = query.shape();
        let k_shape = key.shape();
        let v_shape = value.shape();

        if q_shape.len() != 3 || k_shape.len() != 3 || v_shape.len() != 3 {
            return Err(TrustformersError::tensor_op_error(
                "Attention requires 3D tensors [batch, seq_len, hidden_dim]",
                "flash_attention",
            ));
        }

        if q_shape[0] != k_shape[0] || q_shape[0] != v_shape[0] {
            return Err(TrustformersError::tensor_op_error(
                "Batch dimensions must match for attention",
                "flash_attention",
            ));
        }

        // Output shares the query's shape.
        let mut output = Tensor::zeros(&q_shape)?;

        match self.backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                if let Some(ref cuda_kernel) = self.cuda_kernel {
                    let mut kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                    kernel.flash_attention(query, key, value, &mut output, None)?;
                } else {
                    self.cpu_attention(query, key, value, &mut output, scale, mask)?;
                }
                #[cfg(not(feature = "cuda"))]
                self.cpu_attention(query, key, value, &mut output, scale, mask)?;
            },
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                if let Some(ref rocm_kernel) = self.rocm_kernel {
                    let mut kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                    kernel.flash_attention(query, key, value, &mut output, None)?;
                } else {
                    self.cpu_attention(query, key, value, &mut output, scale, mask)?;
                }
                #[cfg(not(feature = "rocm"))]
                self.cpu_attention(query, key, value, &mut output, scale, mask)?;
            },
            GpuBackend::Intel => {
                #[cfg(feature = "intel")]
                if let Some(ref intel_kernel) = self.intel_kernel {
                    let mut kernel = intel_kernel.lock().expect("lock should not be poisoned");
                    // Intel's kernel does accept the scale factor.
                    kernel.attention(
                        query,
                        key,
                        value,
                        &mut output,
                        scale,
                        crate::kernels::intel_kernels::IntelPrecision::FP32,
                    )?;
                } else {
                    self.cpu_attention(query, key, value, &mut output, scale, mask)?;
                }
                #[cfg(not(feature = "intel"))]
                self.cpu_attention(query, key, value, &mut output, scale, mask)?;
            },
            GpuBackend::Vulkan => {
                #[cfg(feature = "vulkan")]
                if let Some(ref vulkan_kernel) = self.vulkan_kernel {
                    let mut kernel = vulkan_kernel.lock().expect("lock should not be poisoned");
                    kernel.flash_attention(query, key, value, &mut output, None)?;
                } else {
                    self.cpu_attention(query, key, value, &mut output, scale, mask)?;
                }
                #[cfg(not(feature = "vulkan"))]
                self.cpu_attention(query, key, value, &mut output, scale, mask)?;
            },
            _ => {
                self.cpu_attention(query, key, value, &mut output, scale, mask)?;
            },
        }

        Ok(output)
    }
382
    /// Layer normalization over the last dimension of `input`, scaled by
    /// `gamma` and shifted by `beta`; `epsilon` stabilizes the variance.
    ///
    /// Kernel-less configurations fall back to `cpu_layer_norm`. Note the
    /// API difference between backends: CUDA/ROCm take `beta` directly,
    /// while Intel/Vulkan take `Some(beta)`.
    pub fn layer_norm(
        &self,
        input: &Tensor,
        gamma: &Tensor,
        beta: &Tensor,
        epsilon: f32,
    ) -> Result<Tensor> {
        let input_shape = input.shape();
        let mut output = Tensor::zeros(&input_shape)?;

        match self.backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                if let Some(ref cuda_kernel) = self.cuda_kernel {
                    let mut kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                    kernel.layer_norm(input, gamma, beta, &mut output, epsilon, None)?;
                } else {
                    self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
                }
                #[cfg(not(feature = "cuda"))]
                self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
            },
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                if let Some(ref rocm_kernel) = self.rocm_kernel {
                    let mut kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                    kernel.layer_norm(input, gamma, beta, &mut output, epsilon, None)?;
                } else {
                    self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
                }
                #[cfg(not(feature = "rocm"))]
                self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
            },
            GpuBackend::Intel => {
                #[cfg(feature = "intel")]
                if let Some(ref intel_kernel) = self.intel_kernel {
                    let mut kernel = intel_kernel.lock().expect("lock should not be poisoned");
                    kernel.layer_norm(
                        input,
                        gamma,
                        Some(beta),
                        &mut output,
                        epsilon,
                        crate::kernels::intel_kernels::IntelPrecision::FP32,
                    )?;
                } else {
                    self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
                }
                #[cfg(not(feature = "intel"))]
                self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
            },
            GpuBackend::Vulkan => {
                #[cfg(feature = "vulkan")]
                if let Some(ref vulkan_kernel) = self.vulkan_kernel {
                    let mut kernel = vulkan_kernel.lock().expect("lock should not be poisoned");
                    kernel.layer_norm(
                        input,
                        gamma,
                        Some(beta),
                        &mut output,
                        epsilon,
                        crate::kernels::vulkan_kernels::VulkanPrecision::FP32,
                    )?;
                } else {
                    self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
                }
                #[cfg(not(feature = "vulkan"))]
                self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
            },
            _ => {
                self.cpu_layer_norm(input, gamma, beta, &mut output, epsilon)?;
            },
        }

        Ok(output)
    }
460
    /// GELU activation applied elementwise, preserving shape.
    ///
    /// NOTE(review): there is no `GpuBackend::Intel` arm in this match, so
    /// the Intel backend silently takes the CPU path via `_` — confirm
    /// whether that is intended.
    pub fn gelu(&self, input: &Tensor) -> Result<Tensor> {
        let input_shape = input.shape();
        let mut output = Tensor::zeros(&input_shape)?;

        match self.backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                if let Some(ref cuda_kernel) = self.cuda_kernel {
                    let mut kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                    kernel.fused_gelu(input, &mut output, None)?;
                } else {
                    self.cpu_gelu(input, &mut output)?;
                }
                #[cfg(not(feature = "cuda"))]
                self.cpu_gelu(input, &mut output)?;
            },
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                if let Some(ref rocm_kernel) = self.rocm_kernel {
                    let mut kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                    kernel.fused_gelu(input, &mut output, None)?;
                } else {
                    self.cpu_gelu(input, &mut output)?;
                }
                #[cfg(not(feature = "rocm"))]
                self.cpu_gelu(input, &mut output)?;
            },
            GpuBackend::Vulkan => {
                #[cfg(feature = "vulkan")]
                if let Some(ref vulkan_kernel) = self.vulkan_kernel {
                    let mut kernel = vulkan_kernel.lock().expect("lock should not be poisoned");
                    kernel.gelu(input, &mut output, None)?;
                } else {
                    self.cpu_gelu(input, &mut output)?;
                }
                #[cfg(not(feature = "vulkan"))]
                self.cpu_gelu(input, &mut output)?;
            },
            _ => {
                self.cpu_gelu(input, &mut output)?;
            },
        }

        Ok(output)
    }
507
    /// Sums `input` along dimension `dim`, producing a tensor with that
    /// dimension removed.
    ///
    /// Kernel-less configurations fall back to `cpu_reduce_sum`, which
    /// assumes contiguous row-major data.
    ///
    /// # Errors
    /// Returns a tensor-op error when `dim` is out of bounds.
    pub fn reduce_sum(&self, input: &Tensor, dim: usize) -> Result<Tensor> {
        let input_shape = input.shape();

        if dim >= input_shape.len() {
            return Err(TrustformersError::tensor_op_error(
                "Reduction dimension out of bounds",
                "reduce_sum",
            ));
        }

        // Output shape = input shape with `dim` dropped.
        let mut output_shape = input_shape.clone();
        output_shape.remove(dim);
        let mut output = Tensor::zeros(&output_shape)?;

        match self.backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                if let Some(ref cuda_kernel) = self.cuda_kernel {
                    let mut kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                    kernel.reduce_sum(input, &mut output, dim, None)?;
                } else {
                    self.cpu_reduce_sum(input, &mut output, dim)?;
                }
                #[cfg(not(feature = "cuda"))]
                self.cpu_reduce_sum(input, &mut output, dim)?;
            },
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                if let Some(ref rocm_kernel) = self.rocm_kernel {
                    let mut kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                    kernel.reduce_sum(input, &mut output, dim, None)?;
                } else {
                    self.cpu_reduce_sum(input, &mut output, dim)?;
                }
                #[cfg(not(feature = "rocm"))]
                self.cpu_reduce_sum(input, &mut output, dim)?;
            },
            GpuBackend::Vulkan => {
                #[cfg(feature = "vulkan")]
                if let Some(ref vulkan_kernel) = self.vulkan_kernel {
                    let mut kernel = vulkan_kernel.lock().expect("lock should not be poisoned");
                    kernel.reduce_sum(input, &mut output, dim, None)?;
                } else {
                    self.cpu_reduce_sum(input, &mut output, dim)?;
                }
                #[cfg(not(feature = "vulkan"))]
                self.cpu_reduce_sum(input, &mut output, dim)?;
            },
            _ => {
                self.cpu_reduce_sum(input, &mut output, dim)?;
            },
        }

        Ok(output)
    }
564
565 pub fn softmax(&self, input: &Tensor, dim: usize) -> Result<Tensor> {
567 let input_shape = input.shape();
568 let output = Tensor::zeros(&input_shape)?;
569
570 let max_vals = self.reduce_max(input, dim)?;
572 let shifted = self.subtract_broadcast(input, &max_vals, dim)?;
573 let exp_vals = self.exp(&shifted)?;
574 let sum_exp = self.reduce_sum(&exp_vals, dim)?;
575 let result = self.divide_broadcast(&exp_vals, &sum_exp, dim)?;
576
577 Ok(result)
578 }
579
580 pub fn is_gpu_available(&self) -> bool {
582 match self.backend {
583 GpuBackend::Cuda => {
584 #[cfg(feature = "cuda")]
585 return self.cuda_kernel.is_some();
586 #[cfg(not(feature = "cuda"))]
587 return false;
588 },
589 GpuBackend::Rocm => {
590 #[cfg(feature = "rocm")]
591 return self.rocm_kernel.is_some();
592 #[cfg(not(feature = "rocm"))]
593 return false;
594 },
595 GpuBackend::Vulkan => {
596 #[cfg(feature = "vulkan")]
597 return self.vulkan_kernel.is_some();
598 #[cfg(not(feature = "vulkan"))]
599 return false;
600 },
601 GpuBackend::Cpu => false,
602 _ => false, }
604 }
605
    /// Returns the backend that was selected at construction time.
    pub fn get_backend(&self) -> GpuBackend {
        self.backend
    }
610
    /// Queries device memory statistics from the active kernel; returns
    /// all zeros when no kernel is available.
    ///
    /// NOTE(review): the tuple order appears to be (total, peak, free)
    /// based on how the unit test destructures it — confirm against the
    /// kernels' `get_memory_stats`. Also there is no Intel arm, so the
    /// Intel backend always reports zeros.
    pub fn get_memory_usage(&self) -> Result<(u64, u64, u64)> {
        match self.backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                if let Some(ref cuda_kernel) = self.cuda_kernel {
                    let kernel = cuda_kernel.lock().expect("lock should not be poisoned");
                    kernel.get_memory_stats(self.device_id)
                } else {
                    Ok((0, 0, 0))
                }
                #[cfg(not(feature = "cuda"))]
                Ok((0, 0, 0))
            },
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                if let Some(ref rocm_kernel) = self.rocm_kernel {
                    let kernel = rocm_kernel.lock().expect("lock should not be poisoned");
                    kernel.get_memory_stats(self.device_id)
                } else {
                    Ok((0, 0, 0))
                }
                #[cfg(not(feature = "rocm"))]
                Ok((0, 0, 0))
            },
            GpuBackend::Vulkan => {
                #[cfg(feature = "vulkan")]
                if let Some(ref vulkan_kernel) = self.vulkan_kernel {
                    let kernel = vulkan_kernel.lock().expect("lock should not be poisoned");
                    kernel.get_memory_stats(self.device_id)
                } else {
                    Ok((0, 0, 0))
                }
                #[cfg(not(feature = "vulkan"))]
                Ok((0, 0, 0))
            },
            _ => Ok((0, 0, 0)),
        }
    }
650
    /// Waits for queued device work to complete.
    ///
    /// Currently a no-op stub: no backend synchronization is performed,
    /// it always returns `Ok(())`.
    pub fn synchronize(&self) -> Result<()> {
        Ok(())
    }
656
657 fn cpu_matmul(&self, a: &Tensor, b: &Tensor, result: &mut Tensor) -> Result<()> {
659 let a_data = a.data()?;
660 let b_data = b.data()?;
661 let result_data = result.data_mut()?;
662
663 let a_shape = a.shape();
664 let b_shape = b.shape();
665
666 for i in 0..a_shape[0] {
668 for j in 0..b_shape[1] {
669 let mut sum = 0.0;
670 for k in 0..a_shape[1] {
671 sum += a_data[i * a_shape[1] + k] * b_data[k * b_shape[1] + j];
672 }
673 result_data[i * b_shape[1] + j] = sum;
674 }
675 }
676
677 Ok(())
678 }
679
    /// CPU reference attention: for each query position, softmax over the
    /// scaled dot-product scores against every key, then a weighted sum of
    /// the value vectors.
    ///
    /// All three inputs are assumed to share query's
    /// [batch, seq_len, hidden_dim] shape (validated by the caller).
    /// `_mask` is accepted for signature parity but ignored — this path
    /// currently computes unmasked attention.
    fn cpu_attention(
        &self,
        query: &Tensor,
        key: &Tensor,
        value: &Tensor,
        output: &mut Tensor,
        scale: f32,
        _mask: Option<&Tensor>,
    ) -> Result<()> {
        let q_shape = query.shape();
        let q_data = query.data()?;
        let k_data = key.data()?;
        let v_data = value.data()?;
        let o_data = output.data_mut()?;

        let batch_size = q_shape[0];
        let seq_len = q_shape[1];
        let hidden_dim = q_shape[2];

        for batch in 0..batch_size {
            for i in 0..seq_len {
                // scores[j] = (q_i · k_j) * scale
                let mut scores = vec![0.0; seq_len];
                for (j, score_ref) in scores.iter_mut().enumerate() {
                    let mut score = 0.0;
                    for d in 0..hidden_dim {
                        let q_idx = batch * seq_len * hidden_dim + i * hidden_dim + d;
                        let k_idx = batch * seq_len * hidden_dim + j * hidden_dim + d;
                        score += q_data[q_idx] * k_data[k_idx];
                    }
                    *score_ref = score * scale;
                }

                // Numerically stable softmax: shift by the row max before
                // exponentiating, then normalize by the sum.
                let max_score = scores.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
                let mut exp_sum = 0.0;
                for score in &mut scores {
                    *score = (*score - max_score).exp();
                    exp_sum += *score;
                }
                for score in &mut scores {
                    *score /= exp_sum;
                }

                // output_i[d] = Σ_j softmax(scores)[j] * v_j[d]
                for d in 0..hidden_dim {
                    let mut output_val = 0.0;
                    for (j, &score) in scores.iter().enumerate() {
                        let v_idx = batch * seq_len * hidden_dim + j * hidden_dim + d;
                        output_val += score * v_data[v_idx];
                    }
                    let o_idx = batch * seq_len * hidden_dim + i * hidden_dim + d;
                    o_data[o_idx] = output_val;
                }
            }
        }

        Ok(())
    }
739
740 fn cpu_layer_norm(
741 &self,
742 input: &Tensor,
743 gamma: &Tensor,
744 beta: &Tensor,
745 output: &mut Tensor,
746 epsilon: f32,
747 ) -> Result<()> {
748 let input_data = input.data()?;
749 let gamma_data = gamma.data()?;
750 let beta_data = beta.data()?;
751 let output_data = output.data_mut()?;
752
753 let input_shape = input.shape();
754 let last_dim = input_shape[input_shape.len() - 1];
755 let num_elements = input_data.len() / last_dim;
756
757 for i in 0..num_elements {
758 let start = i * last_dim;
759 let end = start + last_dim;
760 let slice = &input_data[start..end];
761
762 let mean = slice.iter().sum::<f32>() / last_dim as f32;
764
765 let variance = slice.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / last_dim as f32;
767 let std_dev = (variance + epsilon).sqrt();
768
769 for j in 0..last_dim {
771 let normalized = (slice[j] - mean) / std_dev;
772 output_data[start + j] = normalized * gamma_data[j] + beta_data[j];
773 }
774 }
775
776 Ok(())
777 }
778
779 fn cpu_gelu(&self, input: &Tensor, output: &mut Tensor) -> Result<()> {
780 let input_data = input.data()?;
781 let output_data = output.data_mut()?;
782
783 for i in 0..input_data.len() {
784 let x = input_data[i];
785 let x_cubed = x * x * x;
787 let tanh_arg = 0.797885 * (x + 0.044715 * x_cubed);
788 let tanh_val = tanh_arg.tanh();
789 output_data[i] = 0.5 * x * (1.0 + tanh_val);
790 }
791
792 Ok(())
793 }
794
795 fn cpu_reduce_sum(&self, input: &Tensor, output: &mut Tensor, dim: usize) -> Result<()> {
796 let input_data = input.data()?;
797 let output_data = output.data_mut()?;
798 let input_shape = input.shape();
799
800 let reduce_size = input_shape[dim];
801 let outer_size = input_shape[..dim].iter().product::<usize>();
802 let inner_size = input_shape[dim + 1..].iter().product::<usize>();
803
804 for outer in 0..outer_size {
805 for inner in 0..inner_size {
806 let mut sum = 0.0;
807 for reduce_idx in 0..reduce_size {
808 let input_idx =
809 outer * reduce_size * inner_size + reduce_idx * inner_size + inner;
810 sum += input_data[input_idx];
811 }
812 let output_idx = outer * inner_size + inner;
813 output_data[output_idx] = sum;
814 }
815 }
816
817 Ok(())
818 }
819
820 fn reduce_max(&self, input: &Tensor, dim: usize) -> Result<Tensor> {
822 let input_shape = input.shape();
824 let mut output_shape = input_shape.clone();
825 output_shape.remove(dim);
826
827 Tensor::zeros(&output_shape)
829 }
830
831 fn subtract_broadcast(&self, a: &Tensor, b: &Tensor, dim: usize) -> Result<Tensor> {
832 let a_shape = a.shape();
834 Tensor::zeros(&a_shape)
835 }
836
837 fn exp(&self, input: &Tensor) -> Result<Tensor> {
838 let input_shape = input.shape();
840 Tensor::zeros(&input_shape)
841 }
842
843 fn divide_broadcast(&self, a: &Tensor, b: &Tensor, dim: usize) -> Result<Tensor> {
844 let a_shape = a.shape();
846 Tensor::zeros(&a_shape)
847 }
848}
849
/// Unit tests. These run against whatever backend `GpuBackend::default()`
/// detects in the test environment (typically the CPU fallback), so they
/// exercise shapes and dispatch rather than GPU-specific behavior.
#[cfg(test)]
mod tests {
    use super::*;

    // Construction must succeed on any host: the CPU fallback means no
    // GPU feature/driver is required.
    #[test]
    fn test_gpu_accelerated_ops_creation() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config);
        assert!(ops.is_ok());
    }

    #[test]
    fn test_gpu_ops_config_default() {
        let config = GpuOpsConfig::default();
        assert_eq!(config.device_id, 0);
        assert!(config.enable_async);
        assert_eq!(config.precision, GpuPrecision::FP32);
    }

    #[test]
    fn test_backend_detection() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        // Detection is environment-dependent; only check it is a known
        // backend variant.
        let backend = ops.get_backend();
        assert!(matches!(
            backend,
            GpuBackend::Cuda | GpuBackend::Rocm | GpuBackend::Cpu | GpuBackend::Metal
        ));
    }

    // [2, 3] · [3, 4] -> [2, 4]
    #[test]
    fn test_matmul_dimensions() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let a = Tensor::ones(&[2, 3]).expect("Failed to create ones tensor");
        let b = Tensor::ones(&[3, 4]).expect("Failed to create ones tensor");

        let result = ops.matmul(&a, &b);
        assert!(result.is_ok());

        let result_tensor = result.expect("tensor operation failed");
        assert_eq!(result_tensor.shape(), &[2, 4]);
    }

    // [2, 3, 4] · [2, 4, 5] -> [2, 3, 5]
    #[test]
    fn test_batch_matmul_dimensions() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let a = Tensor::ones(&[2, 3, 4]).expect("Failed to create ones tensor");
        let b = Tensor::ones(&[2, 4, 5]).expect("Failed to create ones tensor");

        let result = ops.batch_matmul(&a, &b);
        assert!(result.is_ok());

        let result_tensor = result.expect("tensor operation failed");
        assert_eq!(result_tensor.shape(), &[2, 3, 5]);
    }

    // Attention output shape must match the query shape.
    #[test]
    fn test_flash_attention_dimensions() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let batch_size = 2;
        let seq_len = 10;
        let hidden_dim = 64;

        let query =
            Tensor::ones(&[batch_size, seq_len, hidden_dim]).expect("Failed to create ones tensor");
        let key =
            Tensor::ones(&[batch_size, seq_len, hidden_dim]).expect("Failed to create ones tensor");
        let value =
            Tensor::ones(&[batch_size, seq_len, hidden_dim]).expect("Failed to create ones tensor");

        let result = ops.flash_attention(&query, &key, &value, 0.125, None);
        assert!(result.is_ok());

        let result_tensor = result.expect("tensor operation failed");
        assert_eq!(result_tensor.shape(), &[batch_size, seq_len, hidden_dim]);
    }

    // Layer norm preserves input shape; gamma/beta span the last dim.
    #[test]
    fn test_layer_norm_dimensions() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let input = Tensor::ones(&[2, 10, 64]).expect("Failed to create ones tensor");
        let gamma = Tensor::ones(&[64]).expect("Failed to create ones tensor");
        let beta = Tensor::zeros(&[64]).expect("Failed to create zero tensor");

        let result = ops.layer_norm(&input, &gamma, &beta, 1e-5);
        assert!(result.is_ok());

        let result_tensor = result.expect("tensor operation failed");
        assert_eq!(result_tensor.shape(), &[2, 10, 64]);
    }

    // GELU is elementwise, so the shape is preserved.
    #[test]
    fn test_gelu_dimensions() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let input = Tensor::ones(&[2, 10, 64]).expect("Failed to create ones tensor");

        let result = ops.gelu(&input);
        assert!(result.is_ok());

        let result_tensor = result.expect("tensor operation failed");
        assert_eq!(result_tensor.shape(), &[2, 10, 64]);
    }

    // Memory stats must at least be queryable (zeros without a kernel).
    #[test]
    fn test_memory_usage() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let (_total, _peak, _free) = ops.get_memory_usage().expect("operation failed in test");
    }

    #[test]
    fn test_synchronize() {
        let config = GpuOpsConfig::default();
        let ops = GpuAcceleratedOps::new(config).expect("operation failed in test");

        let result = ops.synchronize();
        assert!(result.is_ok());
    }
}