// hive-gpu 0.2.0
//
// High-performance GPU acceleration for vector operations with Device Info API (Metal, CUDA, ROCm).
// Documentation
#version 450
// SGEMV-like compute shader: scores[i] = sum_d matrix[i, d] * query[d].
// matrix is row-major (n_vectors, dimension).
// One thread per output row; each thread reads one full row.
//
// Matches the shape of src/shaders/metal_hnsw.metal::sgemv_dot and the
// implicit behaviour of cuBLAS/rocBLAS SGEMV with trans=T used by the
// CUDA and ROCm backends.

// Workgroup shape: 256 invocations along x, 1 along y and z. main() maps
// gl_GlobalInvocationID.x to one output row and ignores invocations at or
// beyond n_vectors, so the dispatch may be rounded up to whole workgroups.
// NOTE(review): presumably the host dispatches ceil(n_vectors / 256)
// workgroups along x — confirm against the host-side dispatch code.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Storage buffers (std430), all tightly packed float arrays:
//   binding 0 - matrix: read-only; indexed as matrix[row * dimension + d],
//               i.e. row-major with n_vectors rows of `dimension` floats.
//   binding 1 - query:  read-only; elements [0, dimension) are read.
//   binding 2 - scores: write-only; element [row] receives the dot product
//               for each row < n_vectors.
layout(std430, binding = 0) readonly buffer Matrix  { float matrix[]; };
layout(std430, binding = 1) readonly buffer Query   { float query[];  };
layout(std430, binding = 2) writeonly buffer Scores { float scores[]; };

// Per-dispatch parameters supplied through push constants.
// The host must push the two uints in this declaration order.
layout(push_constant) uniform PushConstants {
    uint dimension;   // number of floats per matrix row (and in the query)
    uint n_vectors;   // number of matrix rows, i.e. number of scores written
} pc;

// Entry point: each invocation computes the dot product of one matrix row
// with the query vector and stores it in scores[row].
//
// Invocations whose x index is at or beyond pc.n_vectors write nothing, so
// the dispatch may be rounded up to a whole number of workgroups.
// Index arithmetic is done in uint; row * dimension is not checked for
// wrap-around.
void main() {
    uint row = gl_GlobalInvocationID.x;

    if (row < pc.n_vectors) {
        uint dim = pc.dimension;
        uint rowStart = row * dim;   // offset of this row's first element

        // Accumulate strictly in increasing column order.
        float acc = 0.0;
        uint col = 0u;
        while (col < dim) {
            acc += matrix[rowStart + col] * query[col];
            col++;
        }

        scores[row] = acc;
    }
}