1use super::super::{MR, NR};
9
10#[cfg(target_arch = "x86_64")]
19#[target_feature(enable = "avx2", enable = "fma")]
20pub unsafe fn microkernel_8x6_avx2(
22 k: usize,
23 a: *const f32, b: *const f32, c: *mut f32, ldc: usize, ) {
28 unsafe {
29 use std::arch::x86_64::*;
30
31 let mut c0 = _mm256_loadu_ps(c);
33 let mut c1 = _mm256_loadu_ps(c.add(ldc));
34 let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
35 let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
36 let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
37 let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));
38
39 for p in 0..k {
41 let a_col = _mm256_loadu_ps(a.add(p * MR));
43
44 let b0 = _mm256_set1_ps(*b.add(p * NR));
46 let b1 = _mm256_set1_ps(*b.add(p * NR + 1));
47 let b2 = _mm256_set1_ps(*b.add(p * NR + 2));
48 let b3 = _mm256_set1_ps(*b.add(p * NR + 3));
49 let b4 = _mm256_set1_ps(*b.add(p * NR + 4));
50 let b5 = _mm256_set1_ps(*b.add(p * NR + 5));
51
52 c0 = _mm256_fmadd_ps(a_col, b0, c0);
54 c1 = _mm256_fmadd_ps(a_col, b1, c1);
55 c2 = _mm256_fmadd_ps(a_col, b2, c2);
56 c3 = _mm256_fmadd_ps(a_col, b3, c3);
57 c4 = _mm256_fmadd_ps(a_col, b4, c4);
58 c5 = _mm256_fmadd_ps(a_col, b5, c5);
59 }
60
61 _mm256_storeu_ps(c, c0);
63 _mm256_storeu_ps(c.add(ldc), c1);
64 _mm256_storeu_ps(c.add(2 * ldc), c2);
65 _mm256_storeu_ps(c.add(3 * ldc), c3);
66 _mm256_storeu_ps(c.add(4 * ldc), c4);
67 _mm256_storeu_ps(c.add(5 * ldc), c5);
68 }
69}
70
/// 8x6 f32 GEMM microkernel, manually unrolled 4x along k, with B
/// broadcasts and next-column A loads interleaved between FMA groups so
/// the CPU has independent work scheduled between dependent operations.
///
/// NOTE(review): despite the `_asm` suffix this variant uses intrinsics
/// only; the inline-asm version is `microkernel_8x6_true_asm`.
///
/// Layout, as used by the loads/stores below:
/// - `a`: packed panel, column `p` at `a + p * MR`, 8 contiguous f32 per
///   column (one `__m256` load — presumably MR == 8; defined upstream).
/// - `b`: packed panel, row `p` at `b + p * NR`, 6 scalars read per row.
/// - `c`: tile column `j` (j = 0..5) at `c + j * ldc`, 8 contiguous f32.
///
/// # Safety
/// Requires AVX2 + FMA at runtime; `a`, `b`, `c` must be valid for the
/// reads/writes implied by `k`, MR/NR and `ldc` described above.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
pub unsafe fn microkernel_8x6_avx2_asm(
    k: usize,
    a: *const f32, b: *const f32, c: *mut f32, ldc: usize, ) {
    unsafe {
        use std::arch::x86_64::*;

        // The unrolled loop needs at least one group of four k-steps;
        // short panels fall back to the simple kernel.
        if k < 4 {
            microkernel_8x6_avx2(k, a, b, c, ldc);
            return;
        }

        // Accumulators preloaded from C (C += A*B update semantics).
        let mut c0 = _mm256_loadu_ps(c);
        let mut c1 = _mm256_loadu_ps(c.add(ldc));
        let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
        let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
        let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
        let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));

        let k_unrolled = k / 4;
        let k_remainder = k % 4;

        // Main loop: four rank-1 updates per iteration (columns base_p..
        // base_p+3). Loads for step i+1 are issued between the two FMA
        // triples of step i.
        for p in 0..k_unrolled {
            let base_p = p * 4;

            // Step 0: load A column and all six B broadcasts up front.
            let a0 = _mm256_loadu_ps(a.add((base_p) * MR));
            let b00 = _mm256_broadcast_ss(&*b.add((base_p) * NR));
            let b01 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 1));
            let b02 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 2));
            let b03 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 3));
            let b04 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 4));
            let b05 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 5));

            // Step 1's A column is loaded early, before step 0's FMAs.
            let a1 = _mm256_loadu_ps(a.add((base_p + 1) * MR));
            c0 = _mm256_fmadd_ps(a0, b00, c0);
            c1 = _mm256_fmadd_ps(a0, b01, c1);
            c2 = _mm256_fmadd_ps(a0, b02, c2);

            // First half of step 1's broadcasts, between FMA triples.
            let b10 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR));
            let b11 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 1));
            let b12 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 2));

            c3 = _mm256_fmadd_ps(a0, b03, c3);
            c4 = _mm256_fmadd_ps(a0, b04, c4);
            c5 = _mm256_fmadd_ps(a0, b05, c5);

            // Second half of step 1's broadcasts.
            let b13 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 3));
            let b14 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 4));
            let b15 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 5));

            // Step 2's A column, then step 1's first FMA triple.
            let a2 = _mm256_loadu_ps(a.add((base_p + 2) * MR));
            c0 = _mm256_fmadd_ps(a1, b10, c0);
            c1 = _mm256_fmadd_ps(a1, b11, c1);
            c2 = _mm256_fmadd_ps(a1, b12, c2);

            let b20 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR));
            let b21 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 1));
            let b22 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 2));

            c3 = _mm256_fmadd_ps(a1, b13, c3);
            c4 = _mm256_fmadd_ps(a1, b14, c4);
            c5 = _mm256_fmadd_ps(a1, b15, c5);

            let b23 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 3));
            let b24 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 4));
            let b25 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 5));

            // Step 3's A column, then step 2's first FMA triple.
            let a3 = _mm256_loadu_ps(a.add((base_p + 3) * MR));
            c0 = _mm256_fmadd_ps(a2, b20, c0);
            c1 = _mm256_fmadd_ps(a2, b21, c1);
            c2 = _mm256_fmadd_ps(a2, b22, c2);

            let b30 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR));
            let b31 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 1));
            let b32 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 2));

            c3 = _mm256_fmadd_ps(a2, b23, c3);
            c4 = _mm256_fmadd_ps(a2, b24, c4);
            c5 = _mm256_fmadd_ps(a2, b25, c5);

            let b33 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 3));
            let b34 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 4));
            let b35 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 5));

            // Step 3: last group of the iteration, no further prefetching.
            c0 = _mm256_fmadd_ps(a3, b30, c0);
            c1 = _mm256_fmadd_ps(a3, b31, c1);
            c2 = _mm256_fmadd_ps(a3, b32, c2);
            c3 = _mm256_fmadd_ps(a3, b33, c3);
            c4 = _mm256_fmadd_ps(a3, b34, c4);
            c5 = _mm256_fmadd_ps(a3, b35, c5);
        }

        // Scalar-broadcast tail loop for the remaining k % 4 columns.
        let base_p = k_unrolled * 4;
        for p in 0..k_remainder {
            let pp = base_p + p;
            let a_col = _mm256_loadu_ps(a.add(pp * MR));
            let b0 = _mm256_broadcast_ss(&*b.add(pp * NR));
            let b1 = _mm256_broadcast_ss(&*b.add(pp * NR + 1));
            let b2 = _mm256_broadcast_ss(&*b.add(pp * NR + 2));
            let b3 = _mm256_broadcast_ss(&*b.add(pp * NR + 3));
            let b4 = _mm256_broadcast_ss(&*b.add(pp * NR + 4));
            let b5 = _mm256_broadcast_ss(&*b.add(pp * NR + 5));

            c0 = _mm256_fmadd_ps(a_col, b0, c0);
            c1 = _mm256_fmadd_ps(a_col, b1, c1);
            c2 = _mm256_fmadd_ps(a_col, b2, c2);
            c3 = _mm256_fmadd_ps(a_col, b3, c3);
            c4 = _mm256_fmadd_ps(a_col, b4, c4);
            c5 = _mm256_fmadd_ps(a_col, b5, c5);
        }

        // Write the updated 8x6 tile back to C.
        _mm256_storeu_ps(c, c0);
        _mm256_storeu_ps(c.add(ldc), c1);
        _mm256_storeu_ps(c.add(2 * ldc), c2);
        _mm256_storeu_ps(c.add(3 * ldc), c3);
        _mm256_storeu_ps(c.add(4 * ldc), c4);
        _mm256_storeu_ps(c.add(5 * ldc), c5);
    }
}
227
/// 8x6 f32 GEMM microkernel written in inline assembly, software-pipelined:
/// the main loop keeps the next four A columns resident in ymm6-ymm9 and
/// reloads them one unroll block ahead while the current FMAs execute.
///
/// Register map inside the asm block:
/// - ymm0-ymm5: the six C-tile columns (accumulators).
/// - ymm6-ymm9: four preloaded A columns (current unroll group).
/// - ymm10-ymm15: B broadcast scratch.
///
/// NOTE(review): because each iteration preloads the NEXT four A columns
/// before the loop-exit test, the final iteration reads A columns
/// `4*(k/4) .. 4*(k/4)+3` — up to 128 bytes past the data actually used
/// (a full 128 bytes when k % 4 == 0). The packed-A buffer must therefore
/// be readable at least 128 bytes past column k; confirm the packing
/// routine pads its allocation accordingly.
///
/// # Safety
/// Requires AVX2 + FMA at runtime; `a`, `b`, `c` must be valid for the
/// accesses described above (including the A tail over-read), with
/// `a + p*MR` the p-th packed A column (8 f32, so A advances 32 bytes per
/// column / 128 per unroll group), `b + p*NR` the p-th packed B row
/// (6 f32, 24 bytes, 96 per group), and C columns at `c + j*ldc`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
pub unsafe fn microkernel_8x6_true_asm(
    k: usize,
    a: *const f32,
    b: *const f32,
    c: *mut f32,
    ldc: usize,
) {
    unsafe {
        use std::arch::asm;

        // The pipelined loop needs at least one full group of four
        // k-steps; short panels use the intrinsics kernel instead.
        if k < 4 {
            microkernel_8x6_avx2(k, a, b, c, ldc);
            return;
        }

        // C is addressed in bytes inside the asm block (f32 = 4 bytes).
        let ldc_bytes = ldc * 4;

        asm!(
            // Load the 8x6 C tile: ymm_j = column j at c + j*ldc.
            // x86 addressing only scales by 1/2/4/8, so columns 3..5 are
            // reached via a `lea` stepping {tmp} forward 2*ldc at a time.
            "vmovups ymm0, [{c_ptr}]",
            "vmovups ymm1, [{c_ptr} + {ldc}]",
            "vmovups ymm2, [{c_ptr} + {ldc}*2]",
            "lea {tmp}, [{c_ptr} + {ldc}*2]",
            "vmovups ymm3, [{tmp} + {ldc}]",
            "vmovups ymm4, [{tmp} + {ldc}*2]",
            "lea {tmp}, [{tmp} + {ldc}*2]",
            "vmovups ymm5, [{tmp} + {ldc}]",

            // Prologue: preload A columns 0..3 into ymm6..ymm9 and advance
            // a_ptr past them (4 columns * 32 bytes = 128).
            "vmovups ymm6, [{a_ptr}]",
            "vmovups ymm7, [{a_ptr} + 32]",
            "vmovups ymm8, [{a_ptr} + 64]",
            "vmovups ymm9, [{a_ptr} + 96]",
            "add {a_ptr}, 128",
            // k_cnt = k / 4 unrolled iterations.
            "mov {k_cnt}, {k}",
            "shr {k_cnt}, 2",
            // Defensive zero-iteration check; with k >= 4 guaranteed by
            // the early-out above, k_cnt >= 1 and this branch never fires.
            "test {k_cnt}, {k_cnt}",
            "jz 2f",
            ".p2align 4",
            "3:",

            // --- k-step 0 of the group: A column in ymm6 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr}]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 4]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 8]",
            "vfmadd231ps ymm0, ymm6, ymm10",
            "vfmadd231ps ymm1, ymm6, ymm11",
            "vfmadd231ps ymm2, ymm6, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 12]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 16]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 20]",
            "vfmadd231ps ymm3, ymm6, ymm13",
            "vfmadd231ps ymm4, ymm6, ymm14",
            "vfmadd231ps ymm5, ymm6, ymm15",
            // ymm6 done for this group; refill it with next group's col 0.
            "vmovups ymm6, [{a_ptr}]",

            // --- k-step 1: A column in ymm7 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr} + 24]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 28]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 32]",
            "vfmadd231ps ymm0, ymm7, ymm10",
            "vfmadd231ps ymm1, ymm7, ymm11",
            "vfmadd231ps ymm2, ymm7, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 36]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 40]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 44]",
            "vfmadd231ps ymm3, ymm7, ymm13",
            "vfmadd231ps ymm4, ymm7, ymm14",
            "vfmadd231ps ymm5, ymm7, ymm15",
            // Refill ymm7 with next group's col 1.
            "vmovups ymm7, [{a_ptr} + 32]",

            // --- k-step 2: A column in ymm8 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr} + 48]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 52]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 56]",
            "vfmadd231ps ymm0, ymm8, ymm10",
            "vfmadd231ps ymm1, ymm8, ymm11",
            "vfmadd231ps ymm2, ymm8, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 60]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 64]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 68]",
            "vfmadd231ps ymm3, ymm8, ymm13",
            "vfmadd231ps ymm4, ymm8, ymm14",
            "vfmadd231ps ymm5, ymm8, ymm15",
            // Refill ymm8 with next group's col 2.
            "vmovups ymm8, [{a_ptr} + 64]",

            // --- k-step 3: A column in ymm9 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr} + 72]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 76]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 80]",
            "vfmadd231ps ymm0, ymm9, ymm10",
            "vfmadd231ps ymm1, ymm9, ymm11",
            "vfmadd231ps ymm2, ymm9, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 84]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 88]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 92]",
            "vfmadd231ps ymm3, ymm9, ymm13",
            "vfmadd231ps ymm4, ymm9, ymm14",
            "vfmadd231ps ymm5, ymm9, ymm15",
            // Refill ymm9 (next group's col 3) — on the final iteration
            // this and the three refills above read past the last column
            // actually consumed (see NOTE(review) in the doc comment).
            "vmovups ymm9, [{a_ptr} + 96]",
            // Advance panel pointers one group: A 4*32 B, B 4*24 B.
            "add {a_ptr}, 128",
            "add {b_ptr}, 96",
            "dec {k_cnt}",
            "jnz 3b",

            "2:",
            // Store the updated C tile, same lea-based addressing as the
            // loads at the top.
            "vmovups [{c_ptr}], ymm0",
            "vmovups [{c_ptr} + {ldc}], ymm1",
            "vmovups [{c_ptr} + {ldc}*2], ymm2",
            "lea {tmp}, [{c_ptr} + {ldc}*2]",
            "vmovups [{tmp} + {ldc}], ymm3",
            "vmovups [{tmp} + {ldc}*2], ymm4",
            "lea {tmp}, [{tmp} + {ldc}*2]",
            "vmovups [{tmp} + {ldc}], ymm5",

            // a/b pointers are advanced in-place and dead afterwards.
            a_ptr = inout(reg) a => _,
            b_ptr = inout(reg) b => _,
            c_ptr = in(reg) c,
            k = in(reg) k,
            ldc = in(reg) ldc_bytes,
            k_cnt = out(reg) _,
            tmp = out(reg) _,

            // All 16 ymm registers are clobbered.
            out("ymm0") _,
            out("ymm1") _,
            out("ymm2") _,
            out("ymm3") _,
            out("ymm4") _,
            out("ymm5") _,
            out("ymm6") _,
            out("ymm7") _,
            out("ymm8") _,
            out("ymm9") _,
            out("ymm10") _,
            out("ymm11") _,
            out("ymm12") _,
            out("ymm13") _,
            out("ymm14") _,
            out("ymm15") _,

            options(nostack),
        );

        // Tail: the asm loop handled k/4 groups; finish the remaining
        // 0..3 k-steps with intrinsics, re-reading the C tile the asm
        // block just stored.
        let k_rem = k % 4;
        if k_rem > 0 {
            let k_done = (k / 4) * 4;
            let a_rem = a.add(k_done * MR);
            let b_rem = b.add(k_done * NR);

            use std::arch::x86_64::*;

            let mut c0 = _mm256_loadu_ps(c);
            let mut c1 = _mm256_loadu_ps(c.add(ldc));
            let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
            let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
            let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
            let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));

            for p in 0..k_rem {
                let a_col = _mm256_loadu_ps(a_rem.add(p * MR));
                let b0 = _mm256_broadcast_ss(&*b_rem.add(p * NR));
                let b1 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 1));
                let b2 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 2));
                let b3 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 3));
                let b4 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 4));
                let b5 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 5));

                c0 = _mm256_fmadd_ps(a_col, b0, c0);
                c1 = _mm256_fmadd_ps(a_col, b1, c1);
                c2 = _mm256_fmadd_ps(a_col, b2, c2);
                c3 = _mm256_fmadd_ps(a_col, b3, c3);
                c4 = _mm256_fmadd_ps(a_col, b4, c4);
                c5 = _mm256_fmadd_ps(a_col, b5, c5);
            }

            _mm256_storeu_ps(c, c0);
            _mm256_storeu_ps(c.add(ldc), c1);
            _mm256_storeu_ps(c.add(2 * ldc), c2);
            _mm256_storeu_ps(c.add(3 * ldc), c3);
            _mm256_storeu_ps(c.add(4 * ldc), c4);
            _mm256_storeu_ps(c.add(5 * ldc), c5);
        }
    }
}
479
/// 8x8 f32 GEMM microkernel: C_tile += A_panel * B_panel, with the k-loop
/// unrolled four steps at a time.
///
/// Layout, as used by the loads/stores below (stride 8 is hardcoded for
/// both panels in this 8x8 variant):
/// - `a`: packed panel, column `p` at `a + p * 8`, 8 contiguous f32.
/// - `b`: packed panel, row `p` at `b + p * 8`, 8 scalars read per row.
/// - `c`: tile column `j` (j = 0..7) at `c + j * ldc`, 8 contiguous f32.
///
/// # Safety
/// Requires AVX2 + FMA at runtime; `a`, `b`, `c` must be valid for the
/// reads/writes implied by `k`, the stride-8 packing and `ldc`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
pub unsafe fn microkernel_8x8_avx2_fma(
    k: usize,
    a: *const f32,
    b: *const f32,
    c: *mut f32,
    ldc: usize,
) {
    unsafe {
        use std::arch::x86_64::*;

        // One 8-wide accumulator per column of the 8x8 C tile, seeded
        // with the current contents of C (update semantics).
        let mut acc: [__m256; 8] = [
            _mm256_loadu_ps(c),
            _mm256_loadu_ps(c.add(ldc)),
            _mm256_loadu_ps(c.add(2 * ldc)),
            _mm256_loadu_ps(c.add(3 * ldc)),
            _mm256_loadu_ps(c.add(4 * ldc)),
            _mm256_loadu_ps(c.add(5 * ldc)),
            _mm256_loadu_ps(c.add(6 * ldc)),
            _mm256_loadu_ps(c.add(7 * ldc)),
        ];

        let k4 = k / 4;
        let k_rem = k % 4;

        // Main loop: four rank-1 updates per iteration. The FMA order per
        // accumulator is identical to a straight k-loop.
        for p4 in 0..k4 {
            for u in 0..4 {
                let pp = p4 * 4 + u;
                let a_col = _mm256_loadu_ps(a.add(pp * 8));
                let bp = b.add(pp * 8);
                for j in 0..8 {
                    let b_j = _mm256_broadcast_ss(&*bp.add(j));
                    acc[j] = _mm256_fmadd_ps(a_col, b_j, acc[j]);
                }
            }
        }

        // Tail: remaining 0..3 k-steps, one rank-1 update each.
        let base_rem = k4 * 4;
        for p in 0..k_rem {
            let pp = base_rem + p;
            let a_col = _mm256_loadu_ps(a.add(pp * 8));
            let bp = b.add(pp * 8);
            for j in 0..8 {
                let b_j = _mm256_broadcast_ss(&*bp.add(j));
                acc[j] = _mm256_fmadd_ps(a_col, b_j, acc[j]);
            }
        }

        // Write the updated tile back to C.
        for (j, acc_j) in acc.iter().enumerate() {
            _mm256_storeu_ps(c.add(j * ldc), *acc_j);
        }
    }
}