#version 450
// scatter add along `dim`. Same indexing as scatter_set but dst[idx] += src[g], done with an
// atomicCompSwap loop on a uint view of dst (Dozen/D3D12 lacks the float-atomic extension).
// 64 invocations per workgroup along x; main() only uses gl_GlobalInvocationID.x.
layout(local_size_x = 64) in;

// Destination buffer. Declared as uint (not float) so atomicCompSwap can operate
// on it; main() reinterprets each word as a float via uintBitsToFloat.
layout(set = 0, binding = 0) buffer Dst {
    uint dst[];
};

// Values to accumulate, read at the flat invocation index.
layout(set = 0, binding = 1) readonly buffer Src {
    float src[];
};

// Per-element destination index along the scatter dimension, read at the same
// flat invocation index as src.
layout(set = 0, binding = 2) readonly buffer Ids {
    uint ids[];
};

layout(push_constant) uniform Pc {
    uint n;        // number of elements to process; invocations with index >= n exit
    uint right;    // stride of the scatter dimension (count of trailing "inner" positions)
    uint dim_src;  // extent of the scatter dimension on the src/ids side
    uint dim_dst;  // extent of the scatter dimension on the dst side
};
// Scatter-add entry point: one invocation per source element g in [0, n).
//
// The flat index g is split into (outer, inner) using the push constants
// `right` and `dim_src`; the coordinate along the scatter dimension is replaced
// by ids[g], giving the destination word
//     idx = outer * (dim_dst * right) + ids[g] * right + inner.
// src[g] is then added to the float stored at dst[idx]. Several invocations may
// target the same idx, so the add is performed with a compare-and-swap loop on
// the raw uint bits.
//
// Elements whose ids[g] is not below dim_dst are skipped instead of being
// written outside their destination slice.
void main() {
    uint g = gl_GlobalInvocationID.x;
    if (g >= n) { return; }

    uint inner = g % right;
    uint outer = g / (right * dim_src);

    uint id = ids[g];
    // An index >= dim_dst would address a neighbouring outer slice of dst, or
    // memory past the end of the buffer. Drop the element rather than corrupt
    // unrelated data or perform an out-of-bounds atomic.
    if (id >= dim_dst) { return; }

    uint idx = outer * (dim_dst * right) + id * right + inner;
    float add = src[g];

    // Emulated float atomic add: read the current bits, compute the new sum,
    // and publish it only if dst[idx] still holds the bits we read. On failure
    // atomicCompSwap returns the value another invocation wrote, and we retry
    // from that. The comparison is on uint bits, so a NaN result still
    // terminates the loop.
    uint old = dst[idx];
    uint assumed;
    do {
        assumed = old;
        float nv = uintBitsToFloat(assumed) + add;
        old = atomicCompSwap(dst[idx], assumed, floatBitsToUint(nv));
    } while (old != assumed);
}