llama-cpp-sys-4 0.3.1

#include <math.h>
#include <stdint.h>
#include <string.h>

#include "hvx-utils.h"
#include "hex-fastdiv.h"

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

#define HTP_GDN_MAX_SV 128


struct htp_gdn_context {
    struct htp_ops_context * octx;
    uint32_t rows_per_thread;
    size_t   state_bytes;
    uint8_t * vtcm_base;
    size_t   vtcm_per_thread;
};

static inline HVX_Vector gdn_mul_dot_f32(float * restrict dst, const float * restrict mul, const float * restrict dot, uint32_t n) {
    HVX_Vector acc = Q6_V_vzero();

    const uint32_t epv  = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vd   = hvx_vmemu(dst + i * epv);
        HVX_Vector vm   = hvx_vmem(mul + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);
        HVX_Vector out  = hvx_vec_mul_f32_f32(vd, vm);
        hvx_vmemu(dst + i * epv) = out;
        acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vd   = hvx_vmemu(dst + off);
        HVX_Vector vm   = hvx_vmem(mul + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_Vector out  = hvx_vec_mul_f32_f32(vd, vm);
        hvx_vec_store_u(dst + off, nloe * sizeof(float), out);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
        acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
    }

    return hvx_vec_reduce_sum_f32(acc);
}

static inline HVX_Vector gdn_mul_scalar_dot_f32(float * restrict dst, float mul, const float * restrict dot, uint32_t n) {
    HVX_Vector acc = Q6_V_vzero();
    const HVX_Vector vmul = hvx_vec_splat_f32(mul);

    const uint32_t epv  = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vd   = hvx_vmemu(dst + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);
        HVX_Vector out  = hvx_vec_mul_f32_f32(vd, vmul);
        hvx_vmemu(dst + i * epv) = out;
        acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vd   = hvx_vmemu(dst + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_Vector out  = hvx_vec_mul_f32_f32(vd, vmul);
        hvx_vec_store_u(dst + off, nloe * sizeof(float), out);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
        acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
    }

    return hvx_vec_reduce_sum_f32(acc);
}

static inline HVX_Vector gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
        HVX_Vector vscale, const float * restrict dot, uint32_t n) {
    HVX_Vector acc = Q6_V_vzero();

    const uint32_t epv  = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vd   = hvx_vmemu(dst + i * epv);
        HVX_Vector vs   = hvx_vmem(src + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);
        HVX_Vector out  = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
        hvx_vmemu(dst + i * epv) = out;
        acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vd   = hvx_vmemu(dst + off);
        HVX_Vector vs   = hvx_vmem(src + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_Vector out  = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
        hvx_vec_store_u(dst + off, nloe * sizeof(float), out);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
        acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
    }

    return hvx_vec_reduce_sum_f32(acc);
}

static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1,
        float * restrict dst2, float * restrict dst3, const float * restrict mul,
        const float * restrict dot, uint32_t n, float * restrict sums) {
    HVX_Vector acc0 = Q6_V_vzero();
    HVX_Vector acc1 = Q6_V_vzero();
    HVX_Vector acc2 = Q6_V_vzero();
    HVX_Vector acc3 = Q6_V_vzero();

    const uint32_t epv = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vm = hvx_vmem(mul + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);

        hvx_vmemu(dst0 + i * epv) = out0;
        hvx_vmemu(dst1 + i * epv) = out1;
        hvx_vmemu(dst2 + i * epv) = out2;
        hvx_vmemu(dst3 + i * epv) = out3;

        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vm   = hvx_vmem(mul + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector zero = Q6_V_vzero();

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);

        hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
        hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
        hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
        hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);

        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
    }

    HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
}

static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restrict dst1,
        float * restrict dst2, float * restrict dst3, float mul,
        const float * restrict dot, uint32_t n, float * restrict sums) {
    HVX_Vector acc0 = Q6_V_vzero();
    HVX_Vector acc1 = Q6_V_vzero();
    HVX_Vector acc2 = Q6_V_vzero();
    HVX_Vector acc3 = Q6_V_vzero();
    const HVX_Vector vmul = hvx_vec_splat_f32(mul);

    const uint32_t epv = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vdot = hvx_vmem(dot + i * epv);

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);

        hvx_vmemu(dst0 + i * epv) = out0;
        hvx_vmemu(dst1 + i * epv) = out1;
        hvx_vmemu(dst2 + i * epv) = out2;
        hvx_vmemu(dst3 + i * epv) = out3;

        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector zero = Q6_V_vzero();

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);

        hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
        hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
        hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
        hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);

        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
    }

    HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
}

static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restrict dst1,
        float * restrict dst2, float * restrict dst3, const float * restrict src,
        const float * restrict scale, const float * restrict dot, uint32_t n,
        float * restrict sums) {
    HVX_Vector acc0 = Q6_V_vzero();
    HVX_Vector acc1 = Q6_V_vzero();
    HVX_Vector acc2 = Q6_V_vzero();
    HVX_Vector acc3 = Q6_V_vzero();
    const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
    const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
    const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
    const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);

    const uint32_t epv = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vs = hvx_vmem(src + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);

        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));

        hvx_vmemu(dst0 + i * epv) = out0;
        hvx_vmemu(dst1 + i * epv) = out1;
        hvx_vmemu(dst2 + i * epv) = out2;
        hvx_vmemu(dst3 + i * epv) = out3;

        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vs = hvx_vmem(src + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector zero = Q6_V_vzero();

        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));

        hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
        hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
        hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
        hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);

        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
    }

    HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
}

static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1,
        float * restrict dst2, float * restrict dst3, float * restrict dst4,
        float * restrict dst5, float * restrict dst6, float * restrict dst7,
        const float * restrict mul, const float * restrict dot, uint32_t n,
        float * restrict sums) {
    HVX_Vector acc0 = Q6_V_vzero();
    HVX_Vector acc1 = Q6_V_vzero();
    HVX_Vector acc2 = Q6_V_vzero();
    HVX_Vector acc3 = Q6_V_vzero();
    HVX_Vector acc4 = Q6_V_vzero();
    HVX_Vector acc5 = Q6_V_vzero();
    HVX_Vector acc6 = Q6_V_vzero();
    HVX_Vector acc7 = Q6_V_vzero();

    const uint32_t epv = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vm = hvx_vmem(mul + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vm);
        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vm);
        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vm);
        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vm);

        hvx_vmemu(dst0 + i * epv) = out0;
        hvx_vmemu(dst1 + i * epv) = out1;
        hvx_vmemu(dst2 + i * epv) = out2;
        hvx_vmemu(dst3 + i * epv) = out3;
        hvx_vmemu(dst4 + i * epv) = out4;
        hvx_vmemu(dst5 + i * epv) = out5;
        hvx_vmemu(dst6 + i * epv) = out6;
        hvx_vmemu(dst7 + i * epv) = out7;

        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
        acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
        acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
        acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
        acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vm = hvx_vmem(mul + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector zero = Q6_V_vzero();

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vm);
        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vm);
        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vm);
        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vm);

        hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
        hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
        hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
        hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
        hvx_vec_store_u(dst4 + off, nloe * sizeof(float), out4);
        hvx_vec_store_u(dst5 + off, nloe * sizeof(float), out5);
        hvx_vec_store_u(dst6 + off, nloe * sizeof(float), out6);
        hvx_vec_store_u(dst7 + off, nloe * sizeof(float), out7);

        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
        acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
        acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
        acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
        acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
    }

    HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
    HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
}

static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restrict dst1,
        float * restrict dst2, float * restrict dst3, float * restrict dst4,
        float * restrict dst5, float * restrict dst6, float * restrict dst7,
        float mul, const float * restrict dot, uint32_t n, float * restrict sums) {
    HVX_Vector acc0 = Q6_V_vzero();
    HVX_Vector acc1 = Q6_V_vzero();
    HVX_Vector acc2 = Q6_V_vzero();
    HVX_Vector acc3 = Q6_V_vzero();
    HVX_Vector acc4 = Q6_V_vzero();
    HVX_Vector acc5 = Q6_V_vzero();
    HVX_Vector acc6 = Q6_V_vzero();
    HVX_Vector acc7 = Q6_V_vzero();
    const HVX_Vector vmul = hvx_vec_splat_f32(mul);

    const uint32_t epv = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vdot = hvx_vmem(dot + i * epv);

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vmul);
        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vmul);
        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vmul);
        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vmul);

        hvx_vmemu(dst0 + i * epv) = out0;
        hvx_vmemu(dst1 + i * epv) = out1;
        hvx_vmemu(dst2 + i * epv) = out2;
        hvx_vmemu(dst3 + i * epv) = out3;
        hvx_vmemu(dst4 + i * epv) = out4;
        hvx_vmemu(dst5 + i * epv) = out5;
        hvx_vmemu(dst6 + i * epv) = out6;
        hvx_vmemu(dst7 + i * epv) = out7;

        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
        acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
        acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
        acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
        acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector zero = Q6_V_vzero();

        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vmul);
        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vmul);
        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vmul);
        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vmul);

        hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
        hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
        hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
        hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
        hvx_vec_store_u(dst4 + off, nloe * sizeof(float), out4);
        hvx_vec_store_u(dst5 + off, nloe * sizeof(float), out5);
        hvx_vec_store_u(dst6 + off, nloe * sizeof(float), out6);
        hvx_vec_store_u(dst7 + off, nloe * sizeof(float), out7);

        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
        acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
        acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
        acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
        acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
    }

    HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
    HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
}

static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restrict dst1,
        float * restrict dst2, float * restrict dst3, float * restrict dst4,
        float * restrict dst5, float * restrict dst6, float * restrict dst7,
        const float * restrict src, const float * restrict scale,
        const float * restrict dot, uint32_t n, float * restrict sums) {
    HVX_Vector acc0 = Q6_V_vzero();
    HVX_Vector acc1 = Q6_V_vzero();
    HVX_Vector acc2 = Q6_V_vzero();
    HVX_Vector acc3 = Q6_V_vzero();
    HVX_Vector acc4 = Q6_V_vzero();
    HVX_Vector acc5 = Q6_V_vzero();
    HVX_Vector acc6 = Q6_V_vzero();
    HVX_Vector acc7 = Q6_V_vzero();
    const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
    const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
    const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
    const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
    const HVX_Vector scale4 = hvx_vec_splat_f32(scale[4]);
    const HVX_Vector scale5 = hvx_vec_splat_f32(scale[5]);
    const HVX_Vector scale6 = hvx_vec_splat_f32(scale[6]);
    const HVX_Vector scale7 = hvx_vec_splat_f32(scale[7]);

    const uint32_t epv = 128 / sizeof(float);
    const uint32_t nvec = n / epv;
    const uint32_t nloe = n % epv;
    for (uint32_t i = 0; i < nvec; ++i) {
        HVX_Vector vs = hvx_vmem(src + i * epv);
        HVX_Vector vdot = hvx_vmem(dot + i * epv);

        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
        HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + i * epv), hvx_vec_mul_f32_f32(vs, scale4));
        HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + i * epv), hvx_vec_mul_f32_f32(vs, scale5));
        HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + i * epv), hvx_vec_mul_f32_f32(vs, scale6));
        HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + i * epv), hvx_vec_mul_f32_f32(vs, scale7));

        hvx_vmemu(dst0 + i * epv) = out0;
        hvx_vmemu(dst1 + i * epv) = out1;
        hvx_vmemu(dst2 + i * epv) = out2;
        hvx_vmemu(dst3 + i * epv) = out3;
        hvx_vmemu(dst4 + i * epv) = out4;
        hvx_vmemu(dst5 + i * epv) = out5;
        hvx_vmemu(dst6 + i * epv) = out6;
        hvx_vmemu(dst7 + i * epv) = out7;

        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
        acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
        acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
        acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
        acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
    }

    if (nloe) {
        const uint32_t off = nvec * epv;
        HVX_Vector vs = hvx_vmem(src + off);
        HVX_Vector vdot = hvx_vmem(dot + off);
        HVX_VectorPred mask = Q6_Q_vsetq2_R(nloe * sizeof(float));
        HVX_Vector zero = Q6_V_vzero();

        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
        HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + off), hvx_vec_mul_f32_f32(vs, scale4));
        HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + off), hvx_vec_mul_f32_f32(vs, scale5));
        HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + off), hvx_vec_mul_f32_f32(vs, scale6));
        HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + off), hvx_vec_mul_f32_f32(vs, scale7));

        hvx_vec_store_u(dst0 + off, nloe * sizeof(float), out0);
        hvx_vec_store_u(dst1 + off, nloe * sizeof(float), out1);
        hvx_vec_store_u(dst2 + off, nloe * sizeof(float), out2);
        hvx_vec_store_u(dst3 + off, nloe * sizeof(float), out3);
        hvx_vec_store_u(dst4 + off, nloe * sizeof(float), out4);
        hvx_vec_store_u(dst5 + off, nloe * sizeof(float), out5);
        hvx_vec_store_u(dst6 + off, nloe * sizeof(float), out6);
        hvx_vec_store_u(dst7 + off, nloe * sizeof(float), out7);

        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
        acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
        acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
        acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
        acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
    }

    HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
    HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
}

static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, void * data) {
    struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
    struct htp_ops_context * octx = gctx->octx;

    const struct htp_tensor * q     = octx->src[0];
    const struct htp_tensor * k     = octx->src[1];
    const struct htp_tensor * v     = octx->src[2];
    const struct htp_tensor * g     = octx->src[3];
    const struct htp_tensor * beta  = octx->src[4];
    const struct htp_tensor * state = octx->src[5];
    const struct htp_tensor * dst   = octx->dst;

    const uint32_t S_v      = v->ne[0];
    const uint32_t H        = v->ne[1];
    const uint32_t n_tokens = v->ne[2];
    const uint32_t n_seqs   = v->ne[3];
    const uint32_t K        = state->ne[1];

    const uint32_t total_rows = H * n_seqs;
    if (ith >= total_rows) {
        return;
    }

    const uint32_t rq3 = n_seqs / q->ne[3];
    const uint32_t rk3 = n_seqs / k->ne[3];
    const float scale = 1.0f / sqrtf((float) S_v);

    float * dst_base       = (float *) (uintptr_t) dst->data;
    float * state_out_base = dst_base + (uint64_t) S_v * H * n_tokens * n_seqs;
    const float * state_in_base = (const float *) (uintptr_t) state->data;

    const bool kda = (g->ne[0] == S_v);
    float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_sums[32] __attribute__((aligned(128)));

    dma_queue * dma = octx->ctx->dma[ith];
    size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
    state_aligned = (state_aligned + 127) & ~(size_t)127;
    float * s_work[2];
    s_work[0] = (float *) (gctx->vtcm_base + gctx->vtcm_per_thread * ith);
    s_work[1] = s_work[0] + state_aligned / sizeof(float);

    struct fastdiv_values fd_H = init_fastdiv_values(H);
    struct fastdiv_values fd_q1 = init_fastdiv_values(q->ne[1]);
    struct fastdiv_values fd_k1 = init_fastdiv_values(k->ne[1]);
    struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
    struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);

    const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
    const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
    const int64_t shift = (int64_t) n_tokens - (int64_t) K;

    uint32_t ir_prefetch = ith;
    int spad_idx = 0;

    // Prefetch preamble (up to 2 steps)
    for (int k = 0; k < 2 && ir_prefetch < total_rows; k++) {
        const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
        const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
        const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
        float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;

        // Push dummy write-back
        dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
                       S_v * sizeof(float), S_v * sizeof(float),
                       S_v * sizeof(float), 0);

        // Push fetch
        dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
                       S_v * sizeof(float), S_v * sizeof(float),
                       S_v * sizeof(float), S_v);

        ir_prefetch += nth;
        spad_idx ^= 1;
    }

    int curr_spad_idx = 0;
    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
        dma_queue_pop(dma);
        dma_queue_pop(dma);

        float * s_work_curr = s_work[curr_spad_idx];

        const uint32_t iv1 = fastmodulo(ir, H, &fd_H);
        const uint32_t iv3 = fastdiv(ir, &fd_H);

        const uint32_t iq1 = fastmodulo(iv1, q->ne[1], &fd_q1);
        const uint32_t ik1 = fastmodulo(iv1, k->ne[1], &fd_k1);
        const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
        const uint32_t ik3 = fastdiv(iv3, &fd_rk3);

        float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;

        float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;

        for (uint32_t t = 0; t < n_tokens; ++t) {
            const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
                    (uint64_t) iq3 * q->nb[3] + (uint64_t) t * q->nb[2] + (uint64_t) iq1 * q->nb[1]);
            const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
                    (uint64_t) ik3 * k->nb[3] + (uint64_t) t * k->nb[2] + (uint64_t) ik1 * k->nb[1]);
            const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
                    (uint64_t) iv3 * v->nb[3] + (uint64_t) t * v->nb[2] + (uint64_t) iv1 * v->nb[1]);
            const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
                    (uint64_t) iv3 * g->nb[3] + (uint64_t) t * g->nb[2] + (uint64_t) iv1 * g->nb[1]);
            const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
                    (uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]);

            hvx_copy_f32_au((uint8_t *) local_q, (const uint8_t *) q_t, S_v);
            hvx_copy_f32_au((uint8_t *) local_k, (const uint8_t *) k_t, S_v);

            if (kda) {
                hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);

                uint32_t j = 0;
                for (; j + 8 <= S_v; j += 8) {
                    float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                    float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                    float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                    float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                    float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
                    float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
                    float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
                    float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
                    gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                     local_gate, local_k, S_v, local_sums);

                    float local_delta_b[32] __attribute__((aligned(128)));
                    HVX_Vector vv_t = hvx_vmemu(v_t + j);
                    HVX_Vector v_local_sums = hvx_vmem(local_sums);
                    HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                    hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                    gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                            local_k, local_delta_b, local_q, S_v, local_sums);

                    HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                    hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
                }
                for (; j + 4 <= S_v; j += 4) {
                    float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                    float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                    float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                    float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                    gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);

                    float local_delta_b[32] __attribute__((aligned(128)));
                    HVX_Vector vv_t = hvx_vmemu(v_t + j);
                    HVX_Vector v_local_sums = hvx_vmem(local_sums);
                    HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                    hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                    gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);

                    HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                    hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
                }
                HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
                for (; j < S_v; ++j) {
                    float * row = s_work_curr + (uint64_t) j * S_v;
                    HVX_Vector vsum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
                    HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
                    HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
                    HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
                    attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
                }
            } else {
                const float gate = expf(g_t[0]);
                uint32_t j = 0;
                for (; j + 8 <= S_v; j += 8) {
                    float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                    float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                    float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                    float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                    float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
                    float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
                    float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
                    float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
                    gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                            gate, local_k, S_v, local_sums);

                    float local_delta_b[32] __attribute__((aligned(128)));
                    HVX_Vector vv_t = hvx_vmemu(v_t + j);
                    HVX_Vector v_local_sums = hvx_vmem(local_sums);
                    HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                    hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                    gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                            local_k, local_delta_b, local_q, S_v, local_sums);

                    HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                    hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
                }
                for (; j + 4 <= S_v; j += 4) {
                    float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                    float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                    float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                    float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                    gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);

                    float local_delta_b[32] __attribute__((aligned(128)));
                    HVX_Vector vv_t = hvx_vmemu(v_t + j);
                    HVX_Vector v_local_sums = hvx_vmem(local_sums);
                    HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                    hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                    gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);

                    HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                    hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
                }
                HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
                for (; j < S_v; ++j) {
                    float * row = s_work_curr + (uint64_t) j * S_v;
                    HVX_Vector vsum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
                    HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
                    HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
                    HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
                    attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
                }
            }

            if (K > 1) {
                const int64_t target_slot = (int64_t) t - shift;
                if (target_slot >= 0 && target_slot < (int64_t) K) {
                    float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
                    if (curr_state_o != s_out) {
                        hvx_copy_f32_uu((uint8_t *) curr_state_o, (const uint8_t *) s_work_curr, S_v * S_v);
                    }
                }
            }

            attn_data += (uint64_t) S_v * H;
        }

        // Push real write-back
        dma_queue_push(dma, dma_make_ptr(s_out, s_work_curr),
                       S_v * sizeof(float), S_v * sizeof(float),
                       S_v * sizeof(float), S_v);

        // Prefetch next block (if any)
        if (ir_prefetch < total_rows) {
            const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
            const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
            const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;

            dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
                           S_v * sizeof(float), S_v * sizeof(float),
                           S_v * sizeof(float), S_v);

            ir_prefetch += nth;
            spad_idx ^= 1;
        }

        curr_spad_idx ^= 1;
    }
    dma_queue_flush(dma);
}


static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) {
    struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
    struct htp_ops_context * octx = gctx->octx;

    const struct htp_tensor * q     = octx->src[0];
    const struct htp_tensor * k     = octx->src[1];
    const struct htp_tensor * v     = octx->src[2];
    const struct htp_tensor * g     = octx->src[3];
    const struct htp_tensor * beta  = octx->src[4];
    const struct htp_tensor * state = octx->src[5];
    const struct htp_tensor * dst   = octx->dst;

    const uint32_t S_v      = v->ne[0];
    const uint32_t H        = v->ne[1];
    const uint32_t n_seqs   = v->ne[3];
    const uint32_t K        = state->ne[1];

    const uint32_t total_rows = H * n_seqs;
    if (ith >= total_rows) {
        return;
    }

    const uint32_t rq3 = n_seqs / q->ne[3];
    const uint32_t rk3 = n_seqs / k->ne[3];
    const float scale = 1.0f / sqrtf((float) S_v);

    float * dst_base       = (float *) (uintptr_t) dst->data;
    float * state_out_base = dst_base + (uint64_t) S_v * H * n_seqs;
    const float * state_in_base = (const float *) (uintptr_t) state->data;

    const bool kda = (g->ne[0] == S_v);
    float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_sums[32] __attribute__((aligned(128)));

    dma_queue * dma = octx->ctx->dma[ith];
    size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
    state_aligned = (state_aligned + 127) & ~(size_t)127;
    float * s_work[2];
    s_work[0] = (float *) (gctx->vtcm_base + gctx->vtcm_per_thread * ith);
    s_work[1] = s_work[0] + state_aligned / sizeof(float);

    struct fastdiv_values fd_H = init_fastdiv_values(H);
    struct fastdiv_values fd_q1 = init_fastdiv_values(q->ne[1]);
    struct fastdiv_values fd_k1 = init_fastdiv_values(k->ne[1]);
    struct fastdiv_values fd_rq3 = init_fastdiv_values(rq3);
    struct fastdiv_values fd_rk3 = init_fastdiv_values(rk3);

    const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
    const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;

    uint32_t ir_prefetch = ith;
    int spad_idx = 0;

    // Prefetch preamble (up to 2 steps)
    for (int k = 0; k < 2 && ir_prefetch < total_rows; k++) {
        const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
        const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
        const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;
        float * ps_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) piv3 * H + piv1) * S_v * S_v;

        // Push dummy write-back
        dma_queue_push(dma, dma_make_ptr(ps_out, s_work[spad_idx]),
                       S_v * sizeof(float), S_v * sizeof(float),
                       S_v * sizeof(float), 0);

        // Push fetch
        dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
                       S_v * sizeof(float), S_v * sizeof(float),
                       S_v * sizeof(float), S_v);

        ir_prefetch += nth;
        spad_idx ^= 1;
    }

    int curr_spad_idx = 0;
    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
        dma_queue_pop(dma);
        dma_queue_pop(dma);

        float * s_work_curr = s_work[curr_spad_idx];

        const uint32_t iv1 = fastmodulo(ir, H, &fd_H);
        const uint32_t iv3 = fastdiv(ir, &fd_H);

        const uint32_t iq1 = fastmodulo(iv1, q->ne[1], &fd_q1);
        const uint32_t ik1 = fastmodulo(iv1, k->ne[1], &fd_k1);
        const uint32_t iq3 = fastdiv(iv3, &fd_rq3);
        const uint32_t ik3 = fastdiv(iv3, &fd_rk3);

        float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;

        float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;

        const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
                (uint64_t) iq3 * q->nb[3] + (uint64_t) iq1 * q->nb[1]);
        const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
                (uint64_t) ik3 * k->nb[3] + (uint64_t) ik1 * k->nb[1]);
        const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
                (uint64_t) iv3 * v->nb[3] + (uint64_t) iv1 * v->nb[1]);
        const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
                (uint64_t) iv3 * g->nb[3] + (uint64_t) iv1 * g->nb[1]);
        const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
                (uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]);

        hvx_copy_f32_au((uint8_t *) local_q, (const uint8_t *) q_t, S_v);
        hvx_copy_f32_au((uint8_t *) local_k, (const uint8_t *) k_t, S_v);

        if (kda) {
            hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);

            uint32_t j = 0;
            for (; j + 8 <= S_v; j += 8) {
                float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
                float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
                float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
                float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
                gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                 local_gate, local_k, S_v, local_sums);

                float local_delta_b[32] __attribute__((aligned(128)));
                HVX_Vector vv_t = hvx_vmemu(v_t + j);
                HVX_Vector v_local_sums = hvx_vmem(local_sums);
                HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                        local_k, local_delta_b, local_q, S_v, local_sums);

                HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
            }
            for (; j + 4 <= S_v; j += 4) {
                float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);

                float local_delta_b[32] __attribute__((aligned(128)));
                HVX_Vector vv_t = hvx_vmemu(v_t + j);
                HVX_Vector v_local_sums = hvx_vmem(local_sums);
                HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);

                HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
            }
            HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
            for (; j < S_v; ++j) {
                float * row = s_work_curr + (uint64_t) j * S_v;
                HVX_Vector vsum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
                HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
                HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
                HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
                attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
            }
        } else {
            const float gate = expf(g_t[0]);
            uint32_t j = 0;
            for (; j + 8 <= S_v; j += 8) {
                float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                float * row4 = s_work_curr + (uint64_t) (j + 4) * S_v;
                float * row5 = s_work_curr + (uint64_t) (j + 5) * S_v;
                float * row6 = s_work_curr + (uint64_t) (j + 6) * S_v;
                float * row7 = s_work_curr + (uint64_t) (j + 7) * S_v;
                gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                        gate, local_k, S_v, local_sums);

                float local_delta_b[32] __attribute__((aligned(128)));
                HVX_Vector vv_t = hvx_vmemu(v_t + j);
                HVX_Vector v_local_sums = hvx_vmem(local_sums);
                HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
                                        local_k, local_delta_b, local_q, S_v, local_sums);

                HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                hvx_vec_store_u(attn_data + j, 8 * sizeof(float), res_attn);
            }
            for (; j + 4 <= S_v; j += 4) {
                float * row0 = s_work_curr + (uint64_t) (j + 0) * S_v;
                float * row1 = s_work_curr + (uint64_t) (j + 1) * S_v;
                float * row2 = s_work_curr + (uint64_t) (j + 2) * S_v;
                float * row3 = s_work_curr + (uint64_t) (j + 3) * S_v;
                gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);

                float local_delta_b[32] __attribute__((aligned(128)));
                HVX_Vector vv_t = hvx_vmemu(v_t + j);
                HVX_Vector v_local_sums = hvx_vmem(local_sums);
                HVX_Vector diff = hvx_vec_sub_f32_f32(vv_t, v_local_sums);
                hvx_vmem(local_delta_b) = hvx_vec_mul_f32_f32(diff, hvx_vec_splat_f32(beta_val));

                gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);

                HVX_Vector res_attn = hvx_vec_mul_f32_f32(hvx_vmem(local_sums), hvx_vec_splat_f32(scale));
                hvx_vec_store_u(attn_data + j, 4 * sizeof(float), res_attn);
            }
            HVX_Vector vscale_splat = hvx_vec_splat_f32(scale);
            for (; j < S_v; ++j) {
                float * row = s_work_curr + (uint64_t) j * S_v;
                HVX_Vector vsum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
                HVX_Vector vv_t = hvx_vec_splat_f32(v_t[j]);
                HVX_Vector vdj = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(vv_t, vsum), hvx_vec_splat_f32(beta_val));
                HVX_Vector vres = gdn_add_scaled_dot_f32(row, local_k, vdj, local_q, S_v);
                attn_data[j] = hvx_vec_get_f32(hvx_vec_mul_f32_f32(vres, vscale_splat));
            }
        }

        // Push real write-back
        dma_queue_push(dma, dma_make_ptr(s_out, s_work_curr),
                       S_v * sizeof(float), S_v * sizeof(float),
                       S_v * sizeof(float), S_v);

        // Prefetch next block (if any)
        if (ir_prefetch < total_rows) {
            const uint32_t piv1 = fastmodulo(ir_prefetch, H, &fd_H);
            const uint32_t piv3 = fastdiv(ir_prefetch, &fd_H);
            const float * ps_in = state_in_base + (uint64_t) piv3 * state_seq_stride + (uint64_t) piv1 * S_v * S_v;

            dma_queue_push(dma, dma_make_ptr(s_work[spad_idx], ps_in),
                           S_v * sizeof(float), S_v * sizeof(float),
                           S_v * sizeof(float), S_v);

            ir_prefetch += nth;
            spad_idx ^= 1;
        }

        curr_spad_idx ^= 1;
    }
    dma_queue_flush(dma);
}


int op_gated_delta_net(struct htp_ops_context * octx) {
    const struct htp_tensor * q     = octx->src[0];
    const struct htp_tensor * k     = octx->src[1];
    const struct htp_tensor * v     = octx->src[2];
    const struct htp_tensor * g     = octx->src[3];
    const struct htp_tensor * beta  = octx->src[4];
    const struct htp_tensor * state = octx->src[5];
    const struct htp_tensor * dst   = octx->dst;

    if (!q || !k || !v || !g || !beta || !state || !dst) {
        return HTP_STATUS_INVAL_PARAMS;
    }

    if (q->type != HTP_TYPE_F32 || k->type != HTP_TYPE_F32 || v->type != HTP_TYPE_F32 ||
        g->type != HTP_TYPE_F32 || beta->type != HTP_TYPE_F32 || state->type != HTP_TYPE_F32 ||
        dst->type != HTP_TYPE_F32) {
        return HTP_STATUS_NO_SUPPORT;
    }

    const uint32_t S_v      = v->ne[0];
    const uint32_t H        = v->ne[1];
    const uint32_t n_tokens = v->ne[2];
    const uint32_t n_seqs   = v->ne[3];
    const uint32_t K        = state->ne[1];

    if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
        return HTP_STATUS_NO_SUPPORT;
    }
    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
        return HTP_STATUS_NO_SUPPORT;
    }
    if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] == 0 || k->ne[1] == 0 ||
        q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] == 0 || k->ne[3] == 0 ||
        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
        return HTP_STATUS_NO_SUPPORT;
    }
    if (state->ne[0] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
        return HTP_STATUS_NO_SUPPORT;
    }
    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
        return HTP_STATUS_NO_SUPPORT;
    }

    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
        return HTP_STATUS_OK;
    }

    struct htp_gdn_context gctx;
    gctx.octx = octx;
    gctx.rows_per_thread = (H * n_seqs + octx->n_threads - 1) / octx->n_threads;
    gctx.state_bytes = (size_t) S_v * S_v * sizeof(float);

    size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
    state_aligned = (state_aligned + 127) & ~(size_t)127;

    assert(octx->ctx->vtcm_base != NULL);
    assert(octx->ctx->vtcm_size >= 2 * state_aligned * octx->n_threads);

    gctx.vtcm_base = octx->ctx->vtcm_base;
    gctx.vtcm_per_thread = 2 * state_aligned;

    if (n_tokens == 1) {
        worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads);
    } else {
        worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_pp_thread, &gctx, octx->n_threads);
    }

    return HTP_STATUS_OK;
}