#ifdef USE_SUBGROUP_REDUCTION
enable subgroups;
#endif
enable f16;
#define DECLARE_BYTE_LOADERS_SRC0
#include "common_decls.tmpl"
#include "mul_mat_vec_acc.tmpl"
struct MulMatParams {
offset_src0: u32,
offset_src1: u32,
offset_dst: u32,
m: u32,
n: u32,
k: u32,
stride_01: u32,
stride_11: u32,
stride_02: u32,
stride_12: u32,
stride_03: u32,
stride_13: u32,
bs02: u32,
bs03: u32,
broadcast2: u32,
broadcast3: u32
};
@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>;
@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>;
@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
// "mul_mat_vec_acc.tmpl" requires params.k, params.m, params.stride_01
@group(0) @binding(3) var<uniform> params: MulMatParams;
// Flattened as [row][thread] to keep each row's reduction contiguous in memory.
var<workgroup> partial_sums: array<f32, OUTPUTS_PER_WG * WG_SIZE>;
fn partial_index(row: u32, thread: u32) -> u32 {
return row * WG_SIZE + thread;
}
@compute @workgroup_size(WG_SIZE)
fn main(
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(num_workgroups) num_wg: vec3<u32>
#ifdef USE_SUBGROUP_REDUCTION
, @builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_invocation_id) subgroup_invocation_id: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_size) subgroup_size: u32
#endif
) {
let thread_id = local_id.x;
let total_batches = params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
let wg_linear = wg_id.y * num_wg.x + wg_id.x;
let output_groups = (params.m + OUTPUTS_PER_WG - 1u) / OUTPUTS_PER_WG;
let batch_idx = wg_linear / output_groups;
if (batch_idx >= total_batches) {
return;
}
let row_base = (wg_linear % output_groups) * OUTPUTS_PER_WG;
let dst2_stride = params.m * params.n;
let dst2_idx = batch_idx % (params.bs02 * params.broadcast2);
let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
let dst3_idx = batch_idx / (params.bs02 * params.broadcast2);
let src03_idx = dst3_idx / params.broadcast3;
let src13_idx = dst3_idx;
let src02_idx = dst2_idx / params.broadcast2;
let src12_idx = dst2_idx;
let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02;
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
let dst_idx_base = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row_base;
let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base);
#ifdef USE_SUBGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let subgroup_total = subgroupAdd(acc[row]);
if (subgroup_invocation_id == 0u) {
partial_sums[partial_index(row, subgroup_id)] = subgroup_total;
}
}
workgroupBarrier();
for (var row = subgroup_id; (row < OUTPUTS_PER_WG) && (row_base + row < params.m); row += num_subgroups) {
let output_row = row_base + row;
var row_acc = 0.0f;
for (var k = subgroup_invocation_id; k < num_subgroups; k += subgroup_size) {
row_acc += partial_sums[partial_index(row, k)];
}
let row_total = subgroupAdd(row_acc);
if (subgroup_invocation_id == 0) {
dst[dst_idx_base + row] = row_total;
}
}
#endif
#ifdef USE_WORKGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] = acc[row];
}
workgroupBarrier();
var stride = WG_SIZE / 2u;
while (stride > 0) {
if (thread_id < stride) {
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] += partial_sums[partial_index(row, thread_id + stride)];
}
}
workgroupBarrier();
stride = stride / 2;
}
if (thread_id < OUTPUTS_PER_WG) {
let output_row = row_base + thread_id;
if (output_row < params.m) {
dst[dst_idx_base + thread_id] = partial_sums[partial_index(thread_id, 0)];
}
}
#endif
}