1#![no_std]
31
32use core::ffi::c_void;
33
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_{f16,bf16}_rcr_sm80_*`
// symbols. Each of the two families exposes three entry points: `run`,
// `workspace_size` and `can_implement`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call.
//
// NOTE(review): the meanings below are inferred from parameter names only;
// confirm them against the C/CUDA implementation.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_f16_rcr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_f16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_f16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_f16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_f16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_f16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;

    /// `gemm_bf16_rcr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_bf16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bf16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bf16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bf16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_bf16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;
}
150
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_bias*_{f16,bf16}_rcr_sm80_*`
// symbols: the `bias`, `bias_relu`, `bias_gelu` and `bias_silu` families, each
// in an f16 and a bf16 flavour. Every family exposes `run`, `workspace_size`
// and `can_implement`.
//
// Compared with the plain GEMM declarations, `run` and `can_implement` carry an
// extra `bias: *const c_void` argument right after `ldd`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call. The
// `clippy::too_many_arguments` allowances stay because the parameter lists
// mirror the foreign signatures and cannot be shortened on the Rust side.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`, `bias`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `relu` / `gelu` / `silu`: presumably the activation fused after the bias.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_bias_f16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_f16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_f16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_f16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_f16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_f16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_bf16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_bf16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_bf16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_bf16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_bf16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_bf16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_f16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_relu_f16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_relu_f16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_f16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_bf16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_relu_bf16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_relu_bf16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_bf16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_f16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_gelu_f16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_gelu_f16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_bf16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_gelu_bf16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_gelu_bf16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_f16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_silu_f16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_silu_f16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_f16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_bf16_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_silu_bf16_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_silu_bf16_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_bf16_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;
}
489
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_{f16,bf16}_rrr_sm80_*`
// symbols. Each of the two families exposes three entry points: `run`,
// `workspace_size` and `can_implement`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `rrr`: presumably the operand layout combination (differs from `rcr`).
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_f16_rrr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_f16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_f16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_f16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_f16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_f16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;

    /// `gemm_bf16_rrr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_bf16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bf16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bf16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bf16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_bf16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;
}
594
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_bias*_{f16,bf16}_rrr_sm80_*`
// symbols: the `bias`, `bias_relu`, `bias_gelu` and `bias_silu` families, each
// in an f16 and a bf16 flavour. Every family exposes `run`, `workspace_size`
// and `can_implement`; `run` and `can_implement` take a `bias: *const c_void`
// argument right after `ldd`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call. The
// `clippy::too_many_arguments` allowances exist because the parameter lists
// mirror the foreign signatures and cannot be shortened on the Rust side.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`, `bias`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `relu` / `gelu` / `silu`: presumably the activation fused after the bias.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_bias_f16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_f16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_f16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_f16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_f16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_f16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_bf16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_bf16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_bf16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_bf16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_bf16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_bf16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_f16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_relu_f16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_relu_f16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_f16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_bf16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_relu_bf16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_relu_bf16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_bf16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_f16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_gelu_f16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_gelu_f16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_bf16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_gelu_bf16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_gelu_bf16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_bf16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_f16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_silu_f16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_silu_f16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_f16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_bf16_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_silu_bf16_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_silu_bf16_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_bf16_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;
}
872
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_tf32_{rcr,rrr}_sm80_*`
// symbols. Each of the two families exposes three entry points: `run`,
// `workspace_size` and `can_implement`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `tf32`: presumably the math mode; the element type behind the
//     `c_void` pointers is not visible from these declarations.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_tf32_rcr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_tf32_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_tf32_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_tf32_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_tf32_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_tf32_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;

    /// `gemm_tf32_rrr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_tf32_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_tf32_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_tf32_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_tf32_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_tf32_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;
}
977
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_bias*_tf32_rcr_sm80_*`
// symbols: the `bias`, `bias_relu`, `bias_gelu` and `bias_silu` families.
// Every family exposes `run`, `workspace_size` and `can_implement`; `run` and
// `can_implement` take a `bias: *const c_void` argument right after `ldd`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call. The
// `clippy::too_many_arguments` allowances exist because the parameter lists
// mirror the foreign signatures and cannot be shortened on the Rust side.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`, `bias`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `relu` / `gelu` / `silu`: presumably the activation fused after the bias.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_bias_tf32_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_tf32_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_tf32_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_tf32_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_tf32_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_tf32_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_tf32_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_relu_tf32_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_relu_tf32_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_tf32_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_tf32_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_gelu_tf32_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_gelu_tf32_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_tf32_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_silu_tf32_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_silu_tf32_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_tf32_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;
}
1126
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_bias*_tf32_rrr_sm80_*`
// symbols: the `bias`, `bias_relu`, `bias_gelu` and `bias_silu` families.
// Every family exposes `run`, `workspace_size` and `can_implement`; `run` and
// `can_implement` take a `bias: *const c_void` argument right after `ldd`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call. The
// `clippy::too_many_arguments` allowances exist because the parameter lists
// mirror the foreign signatures and cannot be shortened on the Rust side.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`, `bias`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `relu` / `gelu` / `silu`: presumably the activation fused after the bias.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_bias_tf32_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_tf32_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_tf32_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_tf32_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_tf32_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_tf32_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_tf32_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_relu_tf32_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_relu_tf32_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_tf32_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_tf32_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_gelu_tf32_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_gelu_tf32_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_tf32_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_tf32_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_bias_silu_tf32_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_workspace_size(
        m: i32, n: i32, k: i32,
    ) -> usize;

    /// `gemm_bias_silu_tf32_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_tf32_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;
}
1274
// Raw C-ABI declarations for the `baracuda_cutlass_gemm_f32_simt_{rcr,rrr}_sm80_*`
// symbols. Each of the two families exposes three entry points: `run`,
// `workspace_size` and `can_implement`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `f32_simt`: presumably an f32 kernel that does not use tensor cores.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_f32_simt_rcr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_f32_simt_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_f32_simt_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;

    /// `gemm_f32_simt_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_f32_simt_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;

    /// `gemm_f32_simt_rrr_sm80`: `run` entry point.
    pub fn baracuda_cutlass_gemm_f32_simt_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `gemm_f32_simt_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;

    /// `gemm_f32_simt_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `ldd`.
    pub fn baracuda_cutlass_gemm_f32_simt_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
    ) -> i32;
}
1349
// Raw C-ABI declarations for the
// `baracuda_cutlass_gemm_bias*_f32_simt_{rcr,rrr}_sm80_*` symbols: the `bias`,
// `bias_relu`, `bias_gelu` and `bias_silu` families, first for `rcr` and then
// for `rrr`. Every family exposes `run`, `workspace_size` and `can_implement`;
// `run` and `can_implement` take a `bias: *const c_void` argument right after
// `ldd`.
//
// The block is compiled only when the `sm80` or `sm90a` feature is enabled,
// and every function declared here is `unsafe` to call. The
// `clippy::too_many_arguments` allowances exist because the parameter lists
// mirror the foreign signatures and cannot be shortened on the Rust side.
//
// NOTE(review): the meanings below are inferred from names only; confirm them
// against the C/CUDA implementation.
//   * `m`, `n`, `k`: problem extents.
//   * `a`, `b`, `c`, `bias`: read-only operands; `d`: writable output.
//   * `lda`, `ldb`, `ldc`, `ldd`: leading dimension of the matching operand.
//   * `alpha`, `beta`: `f32` scalars.
//   * `workspace`, `workspace_bytes`: scratch buffer and its size in bytes.
//   * `stream`: opaque stream handle.
//   * `relu` / `gelu` / `silu`: presumably the activation fused after the bias.
//   * `i32` return values: presumably status codes.
#[cfg(any(feature = "sm80", feature = "sm90a"))]
unsafe extern "C" {
    /// `gemm_bias_f32_simt_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_f32_simt_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_f32_simt_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_f32_simt_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_f32_simt_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_relu_f32_simt_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_relu_f32_simt_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_f32_simt_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_gelu_f32_simt_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_gelu_f32_simt_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_f32_simt_rcr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_silu_f32_simt_rcr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_silu_f32_simt_rcr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rcr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_f32_simt_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_f32_simt_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_f32_simt_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_f32_simt_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_relu_f32_simt_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_relu_f32_simt_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_relu_f32_simt_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_relu_f32_simt_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_gelu_f32_simt_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_gelu_f32_simt_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_gelu_f32_simt_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_gelu_f32_simt_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;

    /// `gemm_bias_silu_f32_simt_rrr_sm80`: `run` entry point.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_run(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
        alpha: f32, beta: f32,
        workspace: *mut c_void, workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
    /// `gemm_bias_silu_f32_simt_rrr_sm80`: `workspace_size` entry point; yields a `usize` for `m`/`n`/`k`.
    pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;
    /// `gemm_bias_silu_f32_simt_rrr_sm80`: `can_implement` entry point; takes the `run` arguments up to `bias`.
    pub fn baracuda_cutlass_gemm_bias_silu_f32_simt_rrr_sm80_can_implement(
        m: i32, n: i32, k: i32,
        a: *const c_void, lda: i64,
        b: *const c_void, ldb: i64,
        c: *const c_void, ldc: i64,
        d: *mut c_void, ldd: i64,
        bias: *const c_void,
    ) -> i32;
}
1564
#[cfg(any(feature = "sm80", feature = "sm90a"))]
// Double-precision GEMM bindings (`rcr` and `rrr` variants, `sm80` per the symbol names).
//
// Unlike the f16/bf16/int8 bindings in this file, the scalars `alpha`/`beta` are `f64`.
// NOTE(review): kernel semantics, layout tags and return codes are defined by the native
// shim; the per-function notes below are assumptions to confirm against it.
unsafe extern "C" {
    /// Launcher binding for the f64 GEMM, `rcr` variant.
    ///
    /// NOTE(review): presumably computes `D = alpha * (A * B) + beta * C` and returns a
    /// status code — confirm against the native shim.
    ///
    /// # Safety
    /// Every pointer (`a`, `b`, `c`, `d`, `workspace`, `stream`) is forwarded to native
    /// code unchecked; the caller must meet the shim's requirements for them.
    pub fn baracuda_cutlass_gemm_f64_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_f64_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_f64_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_f64_rcr_sm80_run`].
    ///
    /// Takes the problem shape plus the four matrix pointers and leading dimensions.
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_f64_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
    ) -> i32;

    /// Launcher binding for the f64 GEMM, `rrr` variant.
    ///
    /// NOTE(review): presumably computes `D = alpha * (A * B) + beta * C` and returns a
    /// status code — confirm against the native shim.
    ///
    /// # Safety
    /// Every pointer (`a`, `b`, `c`, `d`, `workspace`, `stream`) is forwarded to native
    /// code unchecked; the caller must meet the shim's requirements for them.
    pub fn baracuda_cutlass_gemm_f64_rrr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_f64_rrr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_f64_rrr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_f64_rrr_sm80_run`].
    ///
    /// Takes the problem shape plus the four matrix pointers and leading dimensions.
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_f64_rrr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
    ) -> i32;
}
1636
#[cfg(any(feature = "sm80", feature = "sm90a"))]
// Bias-fused double-precision GEMM bindings: {none, relu, gelu, silu} x {`rcr`, `rrr`}.
//
// Every family exposes the same triple: `_run` (launcher), `_workspace_size` and
// `_can_implement`. Scalars are `f64`; the extra `bias` pointer follows `ldd`.
// NOTE(review): the epilogue formulas and return-code meanings noted below are inferred
// from the symbol names — confirm against the native shim.
unsafe extern "C" {
    // ---- rcr, bias only -----------------------------------------------------------

    /// Launcher binding for the bias-fused f64 GEMM, `rcr` variant.
    ///
    /// NOTE(review): presumably `D = alpha * (A * B) + beta * C + bias`, returning a
    /// status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_f64_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_f64_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_f64_rcr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_f64_rcr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_f64_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rcr, bias + ReLU ---------------------------------------------------------

    /// Launcher binding for the ReLU bias-fused f64 GEMM, `rcr` variant.
    ///
    /// NOTE(review): presumably `D = relu(alpha * (A * B) + beta * C + bias)`, returning
    /// a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_relu_f64_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rcr, bias + GELU ---------------------------------------------------------

    /// Launcher binding for the GELU bias-fused f64 GEMM, `rcr` variant.
    ///
    /// NOTE(review): presumably `D = gelu(alpha * (A * B) + beta * C + bias)`, returning
    /// a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_gelu_f64_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rcr, bias + SiLU ---------------------------------------------------------

    /// Launcher binding for the SiLU bias-fused f64 GEMM, `rcr` variant.
    ///
    /// NOTE(review): presumably `D = silu(alpha * (A * B) + beta * C + bias)`, returning
    /// a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_silu_f64_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rrr, bias only -----------------------------------------------------------

    /// Launcher binding for the bias-fused f64 GEMM, `rrr` variant.
    ///
    /// NOTE(review): presumably `D = alpha * (A * B) + beta * C + bias`, returning a
    /// status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_f64_rrr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_f64_rrr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_f64_rrr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_f64_rrr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_f64_rrr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rrr, bias + ReLU ---------------------------------------------------------

    /// Launcher binding for the ReLU bias-fused f64 GEMM, `rrr` variant.
    ///
    /// NOTE(review): presumably `D = relu(alpha * (A * B) + beta * C + bias)`, returning
    /// a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_relu_f64_rrr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rrr, bias + GELU ---------------------------------------------------------

    /// Launcher binding for the GELU bias-fused f64 GEMM, `rrr` variant.
    ///
    /// NOTE(review): presumably `D = gelu(alpha * (A * B) + beta * C + bias)`, returning
    /// a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_gelu_f64_rrr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;

    // ---- rrr, bias + SiLU ---------------------------------------------------------

    /// Launcher binding for the SiLU bias-fused f64 GEMM, `rrr` variant.
    ///
    /// NOTE(review): presumably `D = silu(alpha * (A * B) + beta * C + bias)`, returning
    /// a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
        alpha: f64,
        beta: f64,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_run`].
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_bias_silu_f64_rrr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        bias: *const c_void,
    ) -> i32;
}
1850
#[cfg(any(feature = "sm80", feature = "sm90a"))]
// Batched GEMM bindings for f16 and bf16 (`rcr`, `sm80` per the symbol names).
//
// Compared with the single-problem bindings, each matrix pointer carries an extra
// `stride_*` argument and the call takes a `batch_count`.
// NOTE(review): the strides are presumably the per-batch offsets between consecutive
// matrices (units defined by the native shim) — confirm.
unsafe extern "C" {
    /// Launcher binding for the batched f16 GEMM.
    ///
    /// NOTE(review): presumably evaluates `D_i = alpha * (A_i * B_i) + beta * C_i` for
    /// each of the `batch_count` problems and returns a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them across the whole batch.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_batched_f16_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        stride_a: i64,
        b: *const c_void,
        ldb: i64,
        stride_b: i64,
        c: *const c_void,
        ldc: i64,
        stride_c: i64,
        d: *mut c_void,
        ldd: i64,
        stride_d: i64,
        alpha: f32,
        beta: f32,
        batch_count: i32,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_batched_f16_rcr_sm80_run`];
    /// takes `batch_count` in addition to the problem shape.
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_batched_f16_rcr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
        batch_count: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_batched_f16_rcr_sm80_run`].
    ///
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_batched_f16_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        stride_a: i64,
        b: *const c_void,
        ldb: i64,
        stride_b: i64,
        c: *const c_void,
        ldc: i64,
        stride_c: i64,
        d: *mut c_void,
        ldd: i64,
        stride_d: i64,
        batch_count: i32,
    ) -> i32;

    /// Launcher binding for the batched bf16 GEMM.
    ///
    /// NOTE(review): presumably evaluates `D_i = alpha * (A_i * B_i) + beta * C_i` for
    /// each of the `batch_count` problems and returns a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for them across the whole batch.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_batched_bf16_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        stride_a: i64,
        b: *const c_void,
        ldb: i64,
        stride_b: i64,
        c: *const c_void,
        ldc: i64,
        stride_c: i64,
        d: *mut c_void,
        ldd: i64,
        stride_d: i64,
        alpha: f32,
        beta: f32,
        batch_count: i32,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_batched_bf16_rcr_sm80_run`];
    /// takes `batch_count` in addition to the problem shape.
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_batched_bf16_rcr_sm80_workspace_size(
        m: i32,
        n: i32,
        k: i32,
        batch_count: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_batched_bf16_rcr_sm80_run`].
    ///
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_gemm_batched_bf16_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        stride_a: i64,
        b: *const c_void,
        ldb: i64,
        stride_b: i64,
        c: *const c_void,
        ldc: i64,
        stride_c: i64,
        d: *mut c_void,
        ldd: i64,
        stride_d: i64,
        batch_count: i32,
    ) -> i32;
}
1987
#[cfg(any(feature = "sm80", feature = "sm90a"))]
// Grouped GEMM bindings for f16 and bf16 (`rcr`, `sm80` per the symbol names).
//
// Each dtype exposes four entry points: `_sufficient`, `_scratch_bytes`,
// `_can_implement` and `_run`. The first three take per-group `m`/`n`/`k` arrays as
// `*const i32`; `_run` takes opaque pointers plus a `threadblock_count`.
// NOTE(review): the `h_`/`d_` prefixes presumably mean host/device memory, and the
// arrays presumably hold `group_count` entries each — confirm against the native shim.
unsafe extern "C" {
    /// `_sufficient` query for the f16 grouped GEMM.
    ///
    /// NOTE(review): presumably returns the threadblock count that is sufficient for
    /// the described problem set (the value later passed as `threadblock_count`) —
    /// confirm against the native shim.
    ///
    /// # Safety
    /// The three array pointers are forwarded to native code unchecked.
    pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_sufficient(
        h_m: *const i32,
        h_n: *const i32,
        h_k: *const i32,
        group_count: i32,
    ) -> i32;

    /// Scratch-size query for [`baracuda_cutlass_grouped_gemm_f16_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `scratch_bytes` — confirm.
    ///
    /// # Safety
    /// The three array pointers are forwarded to native code unchecked.
    pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_scratch_bytes(
        h_m: *const i32,
        h_n: *const i32,
        h_k: *const i32,
        group_count: i32,
        threadblock_count: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_grouped_gemm_f16_rcr_sm80_run`].
    ///
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// The three array pointers are forwarded to native code unchecked.
    pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_can_implement(
        h_m: *const i32,
        h_n: *const i32,
        h_k: *const i32,
        group_count: i32,
    ) -> i32;

    /// Launcher binding for the f16 grouped GEMM.
    ///
    /// All problem descriptors (sizes, matrix pointer tables, leading-dimension tables)
    /// are passed as opaque `c_void` pointers; their element types are fixed by the
    /// native shim. NOTE(review): presumably returns a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for each table and for `scratch`/`stream`.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_grouped_gemm_f16_rcr_sm80_run(
        group_count: i32,
        threadblock_count: i32,
        d_problem_sizes: *const c_void,
        d_ptr_a: *const c_void,
        d_ptr_b: *const c_void,
        d_ptr_c: *const c_void,
        d_ptr_d: *mut c_void,
        d_lda: *const c_void,
        d_ldb: *const c_void,
        d_ldc: *const c_void,
        d_ldd: *const c_void,
        h_problem_sizes: *const c_void,
        alpha: f32,
        beta: f32,
        scratch: *mut c_void,
        scratch_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `_sufficient` query for the bf16 grouped GEMM.
    ///
    /// NOTE(review): presumably returns the threadblock count that is sufficient for
    /// the described problem set (the value later passed as `threadblock_count`) —
    /// confirm against the native shim.
    ///
    /// # Safety
    /// The three array pointers are forwarded to native code unchecked.
    pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_sufficient(
        h_m: *const i32,
        h_n: *const i32,
        h_k: *const i32,
        group_count: i32,
    ) -> i32;

    /// Scratch-size query for [`baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `scratch_bytes` — confirm.
    ///
    /// # Safety
    /// The three array pointers are forwarded to native code unchecked.
    pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_scratch_bytes(
        h_m: *const i32,
        h_n: *const i32,
        h_k: *const i32,
        group_count: i32,
        threadblock_count: i32,
    ) -> usize;

    /// Implementability probe for [`baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_run`].
    ///
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// The three array pointers are forwarded to native code unchecked.
    pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_can_implement(
        h_m: *const i32,
        h_n: *const i32,
        h_k: *const i32,
        group_count: i32,
    ) -> i32;

    /// Launcher binding for the bf16 grouped GEMM.
    ///
    /// All problem descriptors (sizes, matrix pointer tables, leading-dimension tables)
    /// are passed as opaque `c_void` pointers; their element types are fixed by the
    /// native shim. NOTE(review): presumably returns a status code — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked; the caller must meet the
    /// shim's requirements for each table and for `scratch`/`stream`.
    #[allow(clippy::too_many_arguments)]
    pub fn baracuda_cutlass_grouped_gemm_bf16_rcr_sm80_run(
        group_count: i32,
        threadblock_count: i32,
        d_problem_sizes: *const c_void,
        d_ptr_a: *const c_void,
        d_ptr_b: *const c_void,
        d_ptr_c: *const c_void,
        d_ptr_d: *mut c_void,
        d_lda: *const c_void,
        d_ldb: *const c_void,
        d_ldc: *const c_void,
        d_ldd: *const c_void,
        h_problem_sizes: *const c_void,
        alpha: f32,
        beta: f32,
        scratch: *mut c_void,
        scratch_bytes: usize,
        stream: *mut c_void,
    ) -> i32;
}
2124
#[cfg(any(feature = "sm80", feature = "sm90a"))]
// 8-bit integer GEMM bindings: signed (`s8`) and unsigned (`u8`), `rcr`, `sm80` per the
// symbol names. Scalars `alpha`/`beta` are `f32`.
//
// The bias-fused int8 declarations generated by `int8_bias_ffi!` document themselves as
// sharing the safety contract of `baracuda_cutlass_gemm_s8_rcr_sm80_run` declared here.
// NOTE(review): accumulator type, output clamping and return codes are defined by the
// native shim — confirm there.
unsafe extern "C" {
    /// Launcher binding for the signed 8-bit GEMM.
    ///
    /// NOTE(review): presumably computes `D = alpha * (A * B) + beta * C` with the
    /// result converted back to the 8-bit output type, and returns a status code —
    /// confirm against the native shim.
    ///
    /// # Safety
    /// Every pointer (`a`, `b`, `c`, `d`, `workspace`, `stream`) is forwarded to native
    /// code unchecked; the caller must meet the shim's requirements for them.
    pub fn baracuda_cutlass_gemm_s8_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        alpha: f32,
        beta: f32,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_s8_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_s8_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_s8_rcr_sm80_run`].
    ///
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_s8_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
    ) -> i32;

    /// Launcher binding for the unsigned 8-bit GEMM.
    ///
    /// NOTE(review): presumably computes `D = alpha * (A * B) + beta * C` with the
    /// result converted back to the 8-bit output type, and returns a status code —
    /// confirm against the native shim.
    ///
    /// # Safety
    /// Every pointer (`a`, `b`, `c`, `d`, `workspace`, `stream`) is forwarded to native
    /// code unchecked; the caller must meet the shim's requirements for them.
    pub fn baracuda_cutlass_gemm_u8_rcr_sm80_run(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
        alpha: f32,
        beta: f32,
        workspace: *mut c_void,
        workspace_bytes: usize,
        stream: *mut c_void,
    ) -> i32;

    /// Workspace-size query for [`baracuda_cutlass_gemm_u8_rcr_sm80_run`].
    ///
    /// NOTE(review): presumably a byte count to pass as `workspace_bytes` — confirm.
    pub fn baracuda_cutlass_gemm_u8_rcr_sm80_workspace_size(m: i32, n: i32, k: i32) -> usize;

    /// Implementability probe for [`baracuda_cutlass_gemm_u8_rcr_sm80_run`].
    ///
    /// NOTE(review): meaning of the returned `i32` is defined by the native shim — confirm.
    ///
    /// # Safety
    /// Every pointer is forwarded to native code unchecked.
    pub fn baracuda_cutlass_gemm_u8_rcr_sm80_can_implement(
        m: i32,
        n: i32,
        k: i32,
        a: *const c_void,
        lda: i64,
        b: *const c_void,
        ldb: i64,
        c: *const c_void,
        ldc: i64,
        d: *mut c_void,
        ldd: i64,
    ) -> i32;
}
2257
// Generates the three FFI declarations shared by every int8 bias-fused GEMM variant:
//
//   * `$run` — the launcher (adds a `bias` pointer after `ldd`),
//   * `$ws`  — the workspace-size query,
//   * `$ck`  — the pre-launch implementability check.
//
// The expansion names `c_void` unqualified, so the invoking module must have it in scope.
//
// The generated `# Safety` text links to the plain `s8` launcher through an explicit
// `crate::` target: the macro is expanded inside a child module where the bare function
// name is not in scope, so a shortcut link would not resolve there.
//
// The visible call sites are feature-gated, so a build without `sm80`/`sm90a` never
// expands the macro; `unused_macros` is silenced for exactly that configuration.
#[cfg_attr(
    not(any(feature = "sm80", feature = "sm90a")),
    allow(unused_macros)
)]
macro_rules! int8_bias_ffi {
    ($run:ident, $ws:ident, $ck:ident) => {
        unsafe extern "C" {
            #[doc = concat!(
                "int8 bias-fused GEMM with optional fused activation.\n\n",
                "Computes `D = saturating_cast(activation(alpha * (A * B) ",
                "+ beta * C + bias_broadcast(N)))`. See the section header for ",
                "the layout / accumulator / clamp contract.\n\n",
                "# Safety\nSame contract as ",
                "[`baracuda_cutlass_gemm_s8_rcr_sm80_run`]",
                "(crate::baracuda_cutlass_gemm_s8_rcr_sm80_run)."
            )]
            pub fn $run(
                m: i32,
                n: i32,
                k: i32,
                a: *const c_void,
                lda: i64,
                b: *const c_void,
                ldb: i64,
                c: *const c_void,
                ldc: i64,
                d: *mut c_void,
                ldd: i64,
                bias: *const c_void,
                alpha: f32,
                beta: f32,
                workspace: *mut c_void,
                workspace_bytes: usize,
                stream: *mut c_void,
            ) -> i32;

            #[doc = "Workspace size in bytes for the corresponding `_run` entry point."]
            pub fn $ws(m: i32, n: i32, k: i32) -> usize;

            #[doc = concat!(
                "Pre-launch implementability check for the corresponding ",
                "`_run` entry point.\n\n# Safety\nSame pointer-validity ",
                "contract as the matching `_run`, but only host-side ",
                "alignment and leading-dimension checks occur."
            )]
            pub fn $ck(
                m: i32,
                n: i32,
                k: i32,
                a: *const c_void,
                lda: i64,
                b: *const c_void,
                ldb: i64,
                c: *const c_void,
                ldc: i64,
                d: *mut c_void,
                ldd: i64,
                bias: *const c_void,
            ) -> i32;
        }
    };
}
2331
#[cfg(any(feature = "sm80", feature = "sm90a"))]
// Private home for the macro-generated int8 bias-fused declarations. Each
// `int8_bias_ffi!` call below emits one `_run` / `_workspace_size` / `_can_implement`
// triple. The symbol names enumerate {`f32bias`, `i32bias`} x {`s8`, `u8`} x
// {no activation, `relu`, `gelu`, `silu`}: sixteen triples in total.
mod int8_bias_decls {
    // The macro expansion refers to `c_void` unqualified.
    use super::c_void;

    // ---- `s8`, `f32bias` ----------------------------------------------------------
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_f32bias_s8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_relu_f32bias_s8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_gelu_f32bias_s8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_silu_f32bias_s8_rcr_sm80_can_implement
    );

    // ---- `s8`, `i32bias` ----------------------------------------------------------
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_i32bias_s8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_relu_i32bias_s8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_gelu_i32bias_s8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_silu_i32bias_s8_rcr_sm80_can_implement
    );

    // ---- `u8`, `f32bias` ----------------------------------------------------------
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_f32bias_u8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_relu_f32bias_u8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_gelu_f32bias_u8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_silu_f32bias_u8_rcr_sm80_can_implement
    );

    // ---- `u8`, `i32bias` ----------------------------------------------------------
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_i32bias_u8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_relu_i32bias_u8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_gelu_i32bias_u8_rcr_sm80_can_implement
    );
    int8_bias_ffi!(
        baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_run,
        baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_workspace_size,
        baracuda_cutlass_gemm_bias_silu_i32bias_u8_rcr_sm80_can_implement
    );
}
2424
2425#[cfg(any(feature = "sm80", feature = "sm90a"))]
2426pub use int8_bias_decls::*;
2427