1use crate::autograd::{BackwardOp, Tensor};
8use crate::trace::{TraceStep, TRACER};
9use ndarray::Array1;
10use std::cell::RefCell;
11use std::rc::Rc;
12
13#[cfg(all(feature = "realizar", feature = "cuda"))]
14use std::sync::atomic::{AtomicBool, Ordering};
15#[cfg(all(feature = "realizar", feature = "cuda"))]
16use std::sync::{Mutex, OnceLock};
17
18#[cfg(all(feature = "realizar", feature = "cuda"))]
19use realizar::cuda::CudaExecutor;
20
21#[cfg(all(feature = "realizar", feature = "cuda"))]
26static CUDA_MATMUL_DISABLED: AtomicBool = AtomicBool::new(false);
27
28#[cfg(all(feature = "realizar", feature = "cuda"))]
30static CUDA_EXECUTOR: OnceLock<Option<Mutex<CudaExecutor>>> = OnceLock::new();
31
32#[cfg(all(feature = "realizar", feature = "cuda"))]
34fn get_cuda_executor() -> Option<&'static Mutex<CudaExecutor>> {
35 CUDA_EXECUTOR
36 .get_or_init(|| match CudaExecutor::new(0) {
37 Ok(executor) => {
38 TRACER.end(TraceStep::Transfer, "realizar CUDA executor initialized on GPU 0");
39 Some(Mutex::new(executor))
40 }
41 Err(_e) => {
42 CUDA_MATMUL_DISABLED.store(true, Ordering::Relaxed);
43 None
44 }
45 })
46 .as_ref()
47}
48
49#[inline]
52pub fn transpose(data: &[f32], rows: usize, cols: usize) -> Vec<f32> {
53 contract_pre_transpose!(data);
54 TRACER.start(TraceStep::Transpose);
55 let mut transposed = vec![0.0f32; rows * cols];
56
57 const BLOCK_SIZE: usize = 32;
58 if rows >= BLOCK_SIZE && cols >= BLOCK_SIZE {
59 transpose_blocked(data, &mut transposed, rows, cols, BLOCK_SIZE);
60 } else {
61 transpose_simple(data, &mut transposed, rows, cols);
62 }
63
64 TRACER.end(TraceStep::Transpose, format!("{rows}x{cols}"));
65 transposed
66}
67
68pub fn transpose_tracked(tensor: &Tensor, rows: usize, cols: usize) -> Tensor {
80 contract_pre_transpose_tracked!();
81 let data = tensor.data();
82 let slice = data.as_slice().expect("transpose_tracked: tensor must be contiguous");
83 let transposed_data = transpose(slice, rows, cols);
84 let mut result = Tensor::from_vec(transposed_data, tensor.requires_grad());
85
86 if tensor.requires_grad() {
87 let backward_op = Rc::new(TransposeBackward {
88 original: tensor.clone(),
89 rows,
90 cols,
91 result_grad: result.grad_cell(),
92 });
93 result.set_backward_op(backward_op);
94 }
95
96 result
97}
98
99struct TransposeBackward {
105 original: Tensor,
106 rows: usize,
107 cols: usize,
108 result_grad: Rc<RefCell<Option<Array1<f32>>>>,
109}
110
111impl BackwardOp for TransposeBackward {
112 fn backward(&self) {
113 if let Some(grad) = self.result_grad.borrow().as_ref() {
114 let grad_slice = grad.as_slice().expect("gradient must be contiguous");
115 let grad_original = transpose(grad_slice, self.cols, self.rows);
117 self.original.accumulate_grad(Array1::from(grad_original));
118 if let Some(op) = self.original.backward_op() {
119 op.backward();
120 }
121 }
122 }
123}
124
125#[inline]
127fn transpose_blocked(src: &[f32], dst: &mut [f32], rows: usize, cols: usize, block: usize) {
128 for r_block in (0..rows).step_by(block) {
129 for c_block in (0..cols).step_by(block) {
130 let r_end = (r_block + block).min(rows);
131 let c_end = (c_block + block).min(cols);
132 for r in r_block..r_end {
133 for c in c_block..c_end {
134 dst[c * rows + r] = src[r * cols + c];
135 }
136 }
137 }
138 }
139}
140
141#[inline]
143fn transpose_simple(src: &[f32], dst: &mut [f32], rows: usize, cols: usize) {
144 for r in 0..rows {
145 for c in 0..cols {
146 dst[c * rows + r] = src[r * cols + c];
147 }
148 }
149}
150
151#[cfg(all(feature = "realizar", feature = "cuda"))]
156pub fn matmul_compute(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
157 contract_pre_matmul!(a);
158 if !CUDA_MATMUL_DISABLED.load(Ordering::Relaxed) {
161 if let Some(executor_mutex) = get_cuda_executor() {
162 if let Ok(mut executor) = executor_mutex.lock() {
163 match cuda_matmul(&mut executor, a, b, m, k, n) {
164 Ok(result) => return result,
165 Err(_e) => {
166 CUDA_MATMUL_DISABLED.store(true, Ordering::Relaxed);
168 TRACER.end(
169 TraceStep::Matmul,
170 "realizar CUDA matmul disabled (JIT failure), using trueno SIMD",
171 );
172 }
173 }
174 }
175 }
176 }
177
178 #[cfg(feature = "gpu")]
181 if !WGPU_BATCH_MODE.load(std::sync::atomic::Ordering::Relaxed) && m * k * n > 32_768 {
182 if let Some(result) = wgpu_matmul(a, b, m, k, n) {
183 return result;
184 }
185 }
186
187 cpu_matmul(a, b, m, k, n)
189}
190
191#[cfg(all(feature = "realizar", feature = "cuda"))]
206pub fn pre_warm_realizador_gemm(
207 seq_len: usize,
208 hidden_size: usize,
209 kv_hidden_size: usize,
210 intermediate_size: usize,
211 lora_rank: usize,
212 num_classes: usize,
213) -> usize {
214 let executor_mutex = match get_cuda_executor() {
215 Some(e) => e,
216 None => return 0,
217 };
218 let mut executor = match executor_mutex.lock() {
219 Ok(e) => e,
220 Err(_) => return 0,
221 };
222
223 let s = seq_len;
225 let h = hidden_size;
226 let kv = kv_hidden_size;
227 let i = intermediate_size;
228 let r = lora_rank;
229
230 let mut shapes: Vec<(usize, usize, usize)> = vec![
231 (s, h, h), (s, h, kv), (s, h, i), (s, i, h), (s, h, r), (s, r, h), (s, kv, r), (s, r, kv), (s, kv, h), (s, i, h), (s, h, i), (h, s, h), (h, s, kv), (h, s, i), (i, s, h), (s, r, h), (h, s, r), (s, h, r), (r, s, h), (r, s, kv), (1, h, num_classes),
259 ];
260
261 shapes.sort_unstable();
263 shapes.dedup();
264 shapes.retain(|&(m, k, n)| m > 0 && k > 0 && n > 0);
266
267 let mut warmed = 0usize;
268 for &(m, k, n) in &shapes {
269 let a = vec![0.0f32; m * k];
270 let b = vec![0.0f32; k * n];
271 match cuda_matmul(&mut executor, &a, &b, m, k, n) {
272 Ok(_) => warmed += 1,
273 Err(e) => {
274 eprintln!("[CUDA] realizador GEMM pre-warm failed for ({m},{k},{n}): {e}");
275 }
276 }
277 }
278
279 if warmed == 0 {
280 CUDA_MATMUL_DISABLED.store(true, Ordering::Relaxed);
281 }
282
283 warmed
284}
285
286#[cfg(all(feature = "realizar", feature = "cuda"))]
288fn cuda_matmul(
289 executor: &mut CudaExecutor,
290 a: &[f32],
291 b: &[f32],
292 m: usize,
293 k: usize,
294 n: usize,
295) -> Result<Vec<f32>, String> {
296 TRACER.start(TraceStep::Alloc);
297 let mut c = vec![0.0f32; m * n];
298 TRACER.end(TraceStep::Alloc, format!("{m}x{n}"));
299
300 TRACER.start(TraceStep::Matmul);
301 executor.gemm(a, b, &mut c, m as u32, n as u32, k as u32).map_err(|e| format!("{e:?}"))?;
302 TRACER.end(TraceStep::Matmul, format!("{m}x{k}x{n}"));
303 Ok(c)
304}
305
306fn cpu_matmul(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
308 let mut c = vec![0.0f32; m * n];
309
310 if let Err(_e) = trueno::blis::gemm(m, n, k, a, b, &mut c) {
311 for i in 0..m {
313 for j in 0..n {
314 let mut sum = 0.0;
315 for p in 0..k {
316 sum += a[i * k + p] * b[p * n + j];
317 }
318 c[i * n + j] = sum;
319 }
320 }
321 }
322
323 c
324}
325
326#[cfg(feature = "gpu")]
330static WGPU_BATCH_MODE: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
331
332#[cfg(feature = "gpu")]
339pub fn suppress_per_op_wgpu() {
340 WGPU_BATCH_MODE.store(true, std::sync::atomic::Ordering::Relaxed);
341}
342
343#[cfg(feature = "gpu")]
345pub fn unsuppress_per_op_wgpu() {
346 WGPU_BATCH_MODE.store(false, std::sync::atomic::Ordering::Relaxed);
347}
348
349#[cfg(not(all(feature = "realizar", feature = "cuda")))]
355pub fn matmul_compute(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
356 #[cfg(feature = "gpu")]
357 {
358 if !WGPU_BATCH_MODE.load(std::sync::atomic::Ordering::Relaxed) && m * k * n > 32_768 {
361 if let Some(result) = wgpu_matmul(a, b, m, k, n) {
362 return result;
363 }
364 }
365 }
366 cpu_matmul(a, b, m, k, n)
367}
368
369#[cfg(feature = "gpu")]
374fn wgpu_matmul(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Option<Vec<f32>> {
375 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
376 use std::sync::OnceLock;
377 static WGPU_DISABLED: AtomicBool = AtomicBool::new(false);
378 static WGPU_LOGGED: AtomicBool = AtomicBool::new(false);
379 static WGPU_CALLS: AtomicU64 = AtomicU64::new(0);
380 static WGPU_DEVICE: OnceLock<Option<trueno::backends::gpu::GpuDevice>> = OnceLock::new();
381
382 if WGPU_DISABLED.load(Ordering::Relaxed) {
383 return None;
384 }
385
386 let device_opt = WGPU_DEVICE.get_or_init(|| {
387 if !trueno::backends::gpu::GpuBackend::is_available() {
388 eprintln!("[wgpu] No GPU available, using CPU");
389 return None;
390 }
391 match trueno::backends::gpu::GpuDevice::new() {
392 Ok(d) => {
393 eprintln!("[wgpu] GPU device initialized for matmul");
394 Some(d)
395 }
396 Err(e) => {
397 eprintln!("[wgpu] GPU init failed: {e}, using CPU");
398 None
399 }
400 }
401 });
402
403 let device = match device_opt.as_ref() {
404 Some(d) => d,
405 None => {
406 WGPU_DISABLED.store(true, Ordering::Relaxed);
407 return None;
408 }
409 };
410
411 let mut result = vec![0.0f32; m * n];
412 match device.matmul(a, b, &mut result, m, k, n) {
413 Ok(()) => {
414 let calls = WGPU_CALLS.fetch_add(1, Ordering::Relaxed);
415 if !WGPU_LOGGED.swap(true, Ordering::Relaxed) {
416 eprintln!("[wgpu] GPU matmul active ({m}x{k}x{n})");
417 }
418 if calls > 0 && calls.is_multiple_of(10_000) {
420 eprintln!("[wgpu] {calls} GPU matmuls completed");
421 }
422 Some(result)
423 }
424 Err(_e) => {
425 WGPU_DISABLED.store(true, Ordering::Relaxed);
426 None
427 }
428 }
429}
430
431#[provable_contracts_macros::contract("matmul-v1", equation = "matmul")]
447pub fn matmul(a: &Tensor, b: &Tensor, m: usize, k: usize, n: usize) -> Tensor {
448 assert_eq!(a.len(), m * k, "Matrix A size mismatch");
449 assert_eq!(b.len(), k * n, "Matrix B size mismatch");
450
451 let result_data = matmul_compute(
453 a.data().as_slice().expect("matrix A must be contiguous"),
454 b.data().as_slice().expect("matrix B must be contiguous"),
455 m,
456 k,
457 n,
458 );
459
460 let requires_grad = a.requires_grad() || b.requires_grad();
461 let mut result = Tensor::new(Array1::from(result_data), requires_grad);
462
463 if requires_grad {
464 let a_clone = a.clone();
465 let b_clone = b.clone();
466 let backward_op = Rc::new(MatmulBackward {
467 a: a_clone,
468 b: b_clone,
469 m,
470 k,
471 n,
472 result_grad: result.grad_cell(),
473 });
474 result.set_backward_op(backward_op);
475 }
476
477 contract_post_matmul!(result.data().as_slice().unwrap_or(&[]));
478 result
479}
480
481struct MatmulBackward {
482 a: Tensor,
483 b: Tensor,
484 m: usize,
485 k: usize,
486 n: usize,
487 result_grad: Rc<RefCell<Option<Array1<f32>>>>,
488}
489
490impl BackwardOp for MatmulBackward {
491 fn backward(&self) {
492 if let Some(grad_output) = self.result_grad.borrow().as_ref() {
493 let grad_c = grad_output.as_slice().expect("gradient output must be contiguous");
497 let a_data = self.a.data();
498 let b_data = self.b.data();
499 let a_slice = a_data.as_slice().expect("matrix A must be contiguous");
500 let b_slice = b_data.as_slice().expect("matrix B must be contiguous");
501
502 if self.a.requires_grad() {
503 let b_t = transpose(b_slice, self.k, self.n);
507 let grad_a = matmul_compute(grad_c, &b_t, self.m, self.n, self.k);
508 self.a.accumulate_grad(Array1::from(grad_a));
509 }
510
511 if self.b.requires_grad() {
512 let a_t = transpose(a_slice, self.m, self.k);
516 let grad_b = matmul_compute(&a_t, grad_c, self.k, self.m, self.n);
517 self.b.accumulate_grad(Array1::from(grad_b));
518 }
519
520 if let Some(op) = self.a.backward_op() {
522 op.backward();
523 }
524 if let Some(op) = self.b.backward_op() {
525 op.backward();
526 }
527 }
528 }
529}
530
531#[provable_contracts_macros::contract("matmul-v1", equation = "matmul_nt")]
547pub fn matmul_nt(a: &Tensor, b: &Tensor, m: usize, k: usize, n: usize) -> Tensor {
548 assert_eq!(
549 a.len(),
550 m * k,
551 "Matrix A size mismatch: expected {}×{} = {}, got {}",
552 m,
553 k,
554 m * k,
555 a.len()
556 );
557 assert_eq!(
558 b.len(),
559 n * k,
560 "Matrix B size mismatch: expected {}×{} = {}, got {}",
561 n,
562 k,
563 n * k,
564 b.len()
565 );
566
567 let a_slice = a.data();
568 let b_slice = b.data();
569 let a_data = a_slice.as_slice().expect("matrix A must be contiguous");
570 let b_data = b_slice.as_slice().expect("matrix B must be contiguous");
571
572 let result_data = matmul_nt_compute(a_data, b_data, m, k, n);
574
575 let requires_grad = a.requires_grad() || b.requires_grad();
576 let mut result = Tensor::new(Array1::from(result_data), requires_grad);
577
578 if requires_grad {
579 let a_clone = a.clone();
580 let b_clone = b.clone();
581 let backward_op = Rc::new(MatmulNtBackward {
582 a: a_clone,
583 b: b_clone,
584 m,
585 k,
586 n,
587 result_grad: result.grad_cell(),
588 });
589 result.set_backward_op(backward_op);
590 }
591
592 contract_post_matmul!(result.data().as_slice().unwrap_or(&[]));
593 result
594}
595
596pub fn matmul_nt_compute(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
600 let b_t = transpose(b, n, k); cpu_matmul(a, &b_t, m, k, n)
603}
604
605struct MatmulNtBackward {
606 a: Tensor,
607 b: Tensor,
608 m: usize,
609 k: usize,
610 n: usize,
611 result_grad: Rc<RefCell<Option<Array1<f32>>>>,
612}
613
614impl BackwardOp for MatmulNtBackward {
615 fn backward(&self) {
616 if let Some(grad_output) = self.result_grad.borrow().as_ref() {
617 let grad_c = grad_output.as_slice().expect("gradient output must be contiguous");
623
624 if self.a.requires_grad() {
625 let b_data = self.b.data();
627 let b_slice = b_data.as_slice().expect("matrix B must be contiguous");
628 let grad_a = matmul_compute(grad_c, b_slice, self.m, self.n, self.k);
629 self.a.accumulate_grad(Array1::from(grad_a));
630 }
631
632 if self.b.requires_grad() {
633 let a_data = self.a.data();
635 let a_slice = a_data.as_slice().expect("matrix A must be contiguous");
636 let grad_c_t = transpose(grad_c, self.m, self.n);
637 let grad_b = matmul_compute(&grad_c_t, a_slice, self.n, self.m, self.k);
638 self.b.accumulate_grad(Array1::from(grad_b));
639 }
640
641 if let Some(op) = self.a.backward_op() {
643 op.backward();
644 }
645 if let Some(op) = self.b.backward_op() {
646 op.backward();
647 }
648 }
649 }
650}
651
652#[cfg(test)]
653mod tests {
654 use super::*;
655
656 #[test]
657 fn test_transpose_identity() {
658 let data = vec![5.0];
660 let result = transpose(&data, 1, 1);
661 assert_eq!(result, vec![5.0]);
662 }
663
664 #[test]
665 fn test_transpose_2x3() {
666 let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
670 let result = transpose(&data, 2, 3);
671 assert_eq!(result, vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]);
676 }
677
678 #[test]
679 fn test_transpose_3x2() {
680 let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
682 let result = transpose(&data, 3, 2);
683 assert_eq!(result, vec![1.0, 3.0, 5.0, 2.0, 4.0, 6.0]);
685 }
686
687 #[test]
688 fn test_matmul_compute_2x2() {
689 let a = vec![1.0, 2.0, 3.0, 4.0];
693 let b = vec![5.0, 6.0, 7.0, 8.0];
694 let c = matmul_compute(&a, &b, 2, 2, 2);
695 assert_eq!(c, vec![19.0, 22.0, 43.0, 50.0]);
696 }
697
698 #[test]
699 fn test_matmul_compute_2x3_3x2() {
700 let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
704 let b = vec![7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
705 let c = matmul_compute(&a, &b, 2, 3, 2);
706 assert_eq!(c, vec![58.0, 64.0, 139.0, 154.0]);
709 }
710
711 #[test]
712 fn test_matmul_no_grad() {
713 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), false);
714 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), false);
715 let c = matmul(&a, &b, 2, 2, 2);
716 assert!(!c.requires_grad());
717 assert_eq!(
718 c.data().as_slice().expect("operation should succeed"),
719 &[19.0, 22.0, 43.0, 50.0]
720 );
721 }
722
723 #[test]
724 fn test_matmul_with_grad() {
725 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), true);
726 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), true);
727 let c = matmul(&a, &b, 2, 2, 2);
728 assert!(c.requires_grad());
729 assert!(c.backward_op().is_some());
730 }
731
732 #[test]
733 fn test_matmul_backward() {
734 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), true);
735 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), true);
736 let c = matmul(&a, &b, 2, 2, 2);
737
738 c.set_grad(Array1::from(vec![1.0, 1.0, 1.0, 1.0]));
740
741 if let Some(op) = c.backward_op() {
743 op.backward();
744 }
745
746 assert!(a.grad().is_some());
748 assert!(b.grad().is_some());
749 }
750
751 #[test]
752 fn test_matmul_a_requires_grad_only() {
753 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), true);
754 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), false);
755 let c = matmul(&a, &b, 2, 2, 2);
756 assert!(c.requires_grad());
757
758 c.set_grad(Array1::from(vec![1.0, 1.0, 1.0, 1.0]));
759 if let Some(op) = c.backward_op() {
760 op.backward();
761 }
762
763 assert!(a.grad().is_some());
764 assert!(b.grad().is_none());
765 }
766
767 #[test]
768 fn test_matmul_b_requires_grad_only() {
769 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), false);
770 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), true);
771 let c = matmul(&a, &b, 2, 2, 2);
772 assert!(c.requires_grad());
773
774 c.set_grad(Array1::from(vec![1.0, 1.0, 1.0, 1.0]));
775 if let Some(op) = c.backward_op() {
776 op.backward();
777 }
778
779 assert!(a.grad().is_none());
780 assert!(b.grad().is_some());
781 }
782
783 #[test]
784 #[should_panic(expected = "Pre-condition violated")]
785 fn test_matmul_size_mismatch_a() {
786 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0]), false);
787 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), false);
788 let _ = matmul(&a, &b, 2, 2, 2);
789 }
790
791 #[test]
792 #[should_panic(expected = "Pre-condition violated")]
793 fn test_matmul_size_mismatch_b() {
794 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), false);
795 let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0]), false);
796 let _ = matmul(&a, &b, 2, 2, 2);
797 }
798
799 #[test]
800 fn test_transpose_double_transpose() {
801 let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
803 let t1 = transpose(&data, 2, 3);
804 let t2 = transpose(&t1, 3, 2);
805 assert_eq!(data, t2);
806 }
807
808 #[test]
824 fn falsify_mm_001e_shape_correctness() {
825 for (m, k, n) in [(2, 3, 4), (1, 5, 1), (4, 4, 4), (3, 1, 2)] {
826 let result = matmul_compute(&vec![1.0; m * k], &vec![1.0; k * n], m, k, n);
827 assert_eq!(
828 result.len(),
829 m * n,
830 "FALSIFIED MM-001e: output len = {}, expected {} for ({m}x{k}) @ ({k}x{n})",
831 result.len(),
832 m * n
833 );
834 }
835 }
836
837 #[test]
839 fn falsify_mm_005e_identity_matrix() {
840 let m = 3;
841 let k = 4;
842 let a: Vec<f32> = (0..m * k).map(|i| (i as f32 + 1.0) * 0.5).collect();
843 let mut identity = vec![0.0; k * k];
844 for i in 0..k {
845 identity[i * k + i] = 1.0;
846 }
847 let result = matmul_compute(&a, &identity, m, k, k);
848 for (i, (&got, &exp)) in result.iter().zip(a.iter()).enumerate() {
849 assert!(
850 (got - exp).abs() < 1e-5,
851 "FALSIFIED MM-005e: (A@I)[{i}] = {got}, expected {exp}"
852 );
853 }
854 }
855
856 #[test]
858 fn falsify_mm_002e_numerical_accuracy() {
859 let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
861 let b = vec![7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
862 let result = matmul_compute(&a, &b, 2, 3, 2);
863 let expected = [58.0, 64.0, 139.0, 154.0];
864 for (i, (&got, &exp)) in result.iter().zip(expected.iter()).enumerate() {
865 assert!(
866 (got - exp).abs() < 1e-4,
867 "FALSIFIED MM-002e: result[{i}] = {got}, expected {exp}"
868 );
869 }
870 }
871
872 #[test]
877 fn test_matmul_nt_compute_2x2() {
878 let a = vec![1.0, 2.0, 3.0, 4.0];
885 let b = vec![5.0, 6.0, 7.0, 8.0];
886 let c = matmul_nt_compute(&a, &b, 2, 2, 2);
887 assert_eq!(c, vec![17.0, 23.0, 39.0, 53.0]);
888 }
889
890 #[test]
891 fn test_matmul_nt_compute_2x3_4x3() {
892 let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
904 let b = vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0];
905 let c = matmul_nt_compute(&a, &b, 2, 3, 4);
906 assert_eq!(c, vec![1.0, 2.0, 3.0, 6.0, 4.0, 5.0, 6.0, 15.0]);
907 }
908
909 #[test]
910 fn test_matmul_nt_equivalence_to_transpose_matmul() {
911 let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; let b = vec![7.0, 8.0, 9.0, 10.0, 11.0, 12.0]; let b_t = transpose(&b, 2, 3); let c_nt = matmul_nt_compute(&a, &b, 2, 3, 2);
917 let c_ref = matmul_compute(&a, &b_t, 2, 3, 2);
918
919 for (i, (&got, &exp)) in c_nt.iter().zip(c_ref.iter()).enumerate() {
920 assert!(
921 (got - exp).abs() < 1e-5,
922 "matmul_nt[{i}] = {got}, matmul(A, B^T)[{i}] = {exp}"
923 );
924 }
925 }
926
927 #[test]
928 fn test_matmul_nt_backward_grad_flows_to_b() {
929 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), false); let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), true); let c = matmul_nt(&a, &b, 2, 2, 2);
934 assert!(c.requires_grad());
935
936 c.set_grad(Array1::from(vec![1.0, 1.0, 1.0, 1.0]));
937 if let Some(op) = c.backward_op() {
938 op.backward();
939 }
940
941 let b_grad = b.grad().expect("KAIZEN-011: B must receive gradient from matmul_nt");
943
944 let expected_grad_b = vec![4.0, 6.0, 4.0, 6.0];
947 for (i, (&got, &exp)) in b_grad.iter().zip(expected_grad_b.iter()).enumerate() {
948 assert!((got - exp).abs() < 1e-4, "KAIZEN-011: grad_B[{i}] = {got}, expected {exp}");
949 }
950 }
951
952 #[test]
953 fn test_matmul_nt_backward_grad_flows_to_a() {
954 let a = Tensor::new(Array1::from(vec![1.0, 2.0, 3.0, 4.0]), true); let b = Tensor::new(Array1::from(vec![5.0, 6.0, 7.0, 8.0]), false); let c = matmul_nt(&a, &b, 2, 2, 2);
958 c.set_grad(Array1::from(vec![1.0, 1.0, 1.0, 1.0]));
959 if let Some(op) = c.backward_op() {
960 op.backward();
961 }
962
963 let a_grad = a.grad().expect("A must receive gradient");
964
965 let expected_grad_a = vec![12.0, 14.0, 12.0, 14.0];
967 for (i, (&got, &exp)) in a_grad.iter().zip(expected_grad_a.iter()).enumerate() {
968 assert!((got - exp).abs() < 1e-4, "grad_A[{i}] = {got}, expected {exp}");
969 }
970 }
971
972 mod mm_proptest_falsify {
973 use super::*;
974 use proptest::prelude::*;
975
976 proptest! {
978 #![proptest_config(ProptestConfig::with_cases(100))]
979
980 #[test]
981 fn falsify_mm_001e_prop_shape(
982 m in 1..=8usize,
983 k in 1..=8usize,
984 n in 1..=8usize,
985 ) {
986 let result = matmul_compute(&vec![1.0; m * k], &vec![1.0; k * n], m, k, n);
987 prop_assert_eq!(result.len(), m * n);
988 }
989 }
990
991 proptest! {
993 #![proptest_config(ProptestConfig::with_cases(50))]
994
995 #[test]
996 fn falsify_mm_005e_prop_identity(
997 m in 1..=6usize,
998 k in 1..=6usize,
999 seed in 0..500u32,
1000 ) {
1001 let a: Vec<f32> = (0..m * k)
1002 .map(|i| ((i as f32 + seed as f32) * 0.37).sin())
1003 .collect();
1004 let mut identity = vec![0.0; k * k];
1005 for i in 0..k {
1006 identity[i * k + i] = 1.0;
1007 }
1008 let result = matmul_compute(&a, &identity, m, k, k);
1009 for (i, (&got, &exp)) in result.iter().zip(a.iter()).enumerate() {
1010 prop_assert!(
1011 (got - exp).abs() < 1e-4,
1012 "FALSIFIED MM-005e-prop: (A@I)[{}] = {}, expected {}",
1013 i, got, exp
1014 );
1015 }
1016 }
1017 }
1018
1019 proptest! {
1021 #![proptest_config(ProptestConfig::with_cases(50))]
1022
1023 #[test]
1024 fn falsify_mm_nt_equivalence(
1025 m in 1..=6usize,
1026 k in 1..=6usize,
1027 n in 1..=6usize,
1028 seed in 0..500u32,
1029 ) {
1030 let a: Vec<f32> = (0..m * k)
1031 .map(|i| ((i as f32 + seed as f32) * 0.31).sin())
1032 .collect();
1033 let b: Vec<f32> = (0..n * k)
1034 .map(|i| ((i as f32 + seed as f32 + 100.0) * 0.47).cos())
1035 .collect();
1036
1037 let c_nt = matmul_nt_compute(&a, &b, m, k, n);
1038 let b_t = transpose(&b, n, k);
1039 let c_ref = matmul_compute(&a, &b_t, m, k, n);
1040
1041 for (i, (&got, &exp)) in c_nt.iter().zip(c_ref.iter()).enumerate() {
1042 prop_assert!(
1043 (got - exp).abs() < 1e-3,
1044 "matmul_nt[{}] = {}, expected {}",
1045 i, got, exp
1046 );
1047 }
1048 }
1049 }
1050 }
1051
1052 #[test]
1055 fn test_transpose_tracked_backward_gradient_flow() {
1056 let a = Tensor::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], true);
1058
1059 let a_t = transpose_tracked(&a, 2, 3);
1061 assert_eq!(a_t.len(), 6);
1062
1063 let at_data = a_t.data();
1065 let at_slice = at_data.as_slice().expect("contiguous");
1066 assert_eq!(at_slice, &[1.0, 4.0, 2.0, 5.0, 3.0, 6.0]);
1067
1068 a_t.set_grad(Array1::from(vec![10.0, 40.0, 20.0, 50.0, 30.0, 60.0]));
1071
1072 if let Some(op) = a_t.backward_op() {
1074 op.backward();
1075 }
1076
1077 let grad = a.grad().expect("original tensor should have gradient");
1079 let grad_slice = grad.as_slice().expect("contiguous");
1080 assert_eq!(grad_slice, &[10.0, 20.0, 30.0, 40.0, 50.0, 60.0]);
1082 }
1083
1084 #[test]
1087 fn test_transpose_tracked_lora_gradient_chain() {
1088 let lora_a = Tensor::from_vec(vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6], true);
1090 let x = Tensor::from_vec(vec![1.0, 2.0, 3.0], true); let lora_a_t = transpose_tracked(&lora_a, 2, 3);
1094
1095 let result = matmul(&x, &lora_a_t, 1, 3, 2);
1097 assert_eq!(result.len(), 2);
1098
1099 result.set_grad(Array1::from(vec![1.0, 1.0]));
1101
1102 if let Some(op) = result.backward_op() {
1104 op.backward();
1105 }
1106
1107 let grad = lora_a.grad().expect("LoRA A should receive gradient via transpose_tracked");
1109 assert_eq!(grad.len(), 6);
1110
1111 for (i, &val) in grad.as_slice().expect("contiguous").iter().enumerate() {
1113 assert!(val.is_finite(), "Gradient element {i} is not finite: {val}");
1114 }
1115 let grad_sum: f32 = grad.iter().sum();
1116 assert!(grad_sum.abs() > 1e-6, "Gradient should be non-zero, got sum={grad_sum}");
1117 }
1118}