#version 450
// Row-wise sum reduction over the last dim of a row-major [rows, cols] f32 buffer.
// One invocation per row: out[row] = sum_{c} in[row*cols + c]. Output length = rows.
// Workgroup size: 64 invocations along x (y and z default to 1).
layout(local_size_x = 64) in;
// Input values (read-only). main() reads element c of a row at inp[row * cols + c].
layout(set = 0, binding = 0) readonly buffer In { float inp[]; };
// Output values (write-only). main() stores one sum per row at o[row].
layout(set = 0, binding = 1) writeonly buffer Out { float o[]; };
// Push constants: number of rows and number of columns of the input matrix.
layout(push_constant) uniform Pc { uint rows; uint cols; };
// Entry point. Each invocation owns the row selected by its global x index
// and writes the sum of that row's `cols` elements to o[row].
void main() {
    uint rowIdx = gl_GlobalInvocationID.x;

    // Invocations whose index is past the last row have nothing to do,
    // so a dispatch that over-covers `rows` is harmless.
    if (rowIdx >= rows) {
        return;
    }

    // Walk the row front to back with a running element index, accumulating
    // in the same left-to-right order on every run.
    // NOTE(review): rowIdx * cols is 32-bit arithmetic and wraps if the
    // matrix holds more than 2^32 elements; confirm callers stay below that.
    uint cursor = rowIdx * cols;
    float total = 0.0;
    for (uint remaining = cols; remaining > 0u; --remaining) {
        total += inp[cursor];
        ++cursor;
    }

    // A row with cols == 0 yields 0.0.
    o[rowIdx] = total;
}