hanzo-ml 0.10.2

#version 450
// Materialize a CONTIGUOUS f32 output from an arbitrarily-strided source.
// Powers .contiguous(), transpose, and broadcast. For each contiguous output
// index gid (< n), decode gid into a multi-index over the logical `shape`
// (last dim fastest / row-major), map it through `strides`+`offset` (in
// ELEMENTS) into `inp`, and write inp[src] -> outp[gid + dst_offset]. A
// stride of 0 along a dim broadcasts that dim. Requires rank <= 6 (the size
// of the `shape`/`strides` arrays).
// 64 invocations per workgroup along x; main() handles one output element
// per invocation and bounds-checks against `n`.
layout(local_size_x = 64) in;

// Source elements, read at a strided index computed in main().
layout(set = 0, binding = 0) readonly  buffer In  { float inp[]; };
// Destination elements, written at consecutive indices starting at dst_offset.
layout(set = 0, binding = 1) writeonly buffer Out { float outp[]; };
// Per-dispatch parameters. All offsets and strides are in ELEMENTS (floats).
// NOTE(review): the host-side struct must match this field order and
// packing — confirm against the caller.
layout(push_constant) uniform Pc {
    uint n;          // number of output elements; invocations with gid >= n do nothing
    uint rank;       // number of leading entries of shape/strides that are used (<= 6)
    uint offset;     // element index into inp of the multi-index (0, ..., 0)
    uint dst_offset; // element index into outp where output element 0 is written
    uint shape[6];   // logical extent of each dim; only [0, rank) is read
    uint strides[6]; // inp stride of each dim; 0 repeats the same source element along that dim
};

// Copies one element: the invocation with global x-index `i` (i < n) unravels
// `i` over `shape` in row-major order, gathers the matching strided element of
// `inp`, and stores it at outp[i + dst_offset].
void main() {
    uint out_index = gl_GlobalInvocationID.x;

    // Invocations whose index is not below n have no element to copy.
    if (out_index >= n) {
        return;
    }

    uint remaining = out_index;
    uint src_index = offset;

    // Walk the axes from the last (fastest-varying) to the first, peeling one
    // coordinate off `remaining` per axis. `k` counts down from rank to 1 so
    // the unsigned counter never has to go below zero.
    for (uint k = rank; k > 0u; k--) {
        uint axis = k - 1u;
        uint extent = shape[axis];
        uint coord = remaining % extent;
        remaining /= extent;
        // A stride of 0 contributes nothing, so every coordinate along that
        // axis reads the same source element.
        src_index += coord * strides[axis];
    }

    outp[out_index + dst_offset] = inp[src_index];
}