#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
// max workgroup size is usually 1024, this covers various subgroups sizes
#define MAX_SUBGROUPS 128
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
// Per-workgroup inclusive prefix sum (cumsum) over a block of a row.
//
// Each workgroup handles one block of `nth` consecutive elements of one
// src0 row. Algorithm: subgroup-level inclusive scan, then a scan of the
// per-subgroup totals in local memory, then the two are combined.
//
// src0/offset0     : input tensor base pointer and byte offset
// tmp              : per-block totals (float), written only when a row spans
//                    multiple blocks; consumed later by kernel_cumsum_add
// dst/offsetd      : output tensor base pointer and byte offset
// ne00..ne03       : logical tensor dims; nb00..nb03: src0 byte strides
// net0..net2       : element strides of the tmp tensor
kernel void kernel_cumsum_blk(
        global char * src0,
        ulong         offset0,
        global char * tmp,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        uint          net0,
        uint          net1,
        uint          net2
) {
    src0 = src0 + offset0;
    dst  = dst  + offsetd;

    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    const int nth = get_local_size(0);
    const int tid = get_local_id(0);

    const uint sg_size = get_sub_group_size();
    const uint sg_id   = get_sub_group_id();
    const uint sg_lid  = get_sub_group_local_id();

    // Dim-0 workgroups are flattened as (block index, row index):
    // ib selects the block within the row, i01 the row itself.
    const int ib  = i1 / ne01;
    const int i00 = ib * nth;
    const int i01 = i1 % ne01;
    const int i02 = i2;
    const int i03 = i3;

    global const float * src0_row = (global const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01);
    global       float * tmp_row  = (global float *)tmp + net0 * i01 + net0 * net1 * i02 + net0 * net1 * net2 * i03;
    global       float * dst_row  = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // One partial total per subgroup of this workgroup.
    __local float partial[MAX_SUBGROUPS];

    // Out-of-range lanes contribute 0 so the scan stays correct.
    float v = 0.0f;
    if (i00 + tid < ne00) {
        v = src0_row[i00 + tid];
    }

    float s = sub_group_scan_inclusive_add(v);

    // Last lane of each subgroup holds that subgroup's total.
    if (sg_lid == sg_size - 1) {
        partial[sg_id] = s;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // NB: subgroup size should be larger than number of subgroups
    // assuming max workgroup size of 1024, subgroup size should be >= 32
    if (sg_id == 0) {
        float x = 0.0f;
        if (sg_lid < get_num_sub_groups()) {
            x = partial[sg_lid];
        }
        // Exclusive scan turns per-subgroup totals into per-subgroup offsets.
        float ex = sub_group_scan_exclusive_add(x);
        if (sg_lid < get_num_sub_groups()) {
            partial[sg_lid] = ex;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Add the offset of all preceding subgroups.
    s += partial[sg_id];

    if (i00 + tid < ne00) {
        dst_row[i00 + tid] = s;
    }

    // If the row spans multiple blocks, the last thread stores this
    // block's total for the follow-up kernel_cumsum_add pass.
    if (ne00 > nth && tid == nth - 1) {
        tmp_row[ib] = s;
    }
}
// Second cumsum pass: fold the per-block prefix totals (produced by
// kernel_cumsum_blk) into each block of dst so the prefix sum becomes
// global across the whole row.
//
// tmp          : per-block totals; nbt0..nbt3 are its byte strides
// dst/offsetd  : output tensor base pointer and byte offset
// ne00..ne03   : logical tensor dims of dst
kernel void kernel_cumsum_add(
        global char * tmp,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        uint          nbt0,
        uint          nbt1,
        uint          nbt2,
        uint          nbt3
) {
    dst = dst + offsetd;

    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    const int nth = get_local_size(0);
    const int tid = get_local_id(0);

    // Block 0 of each row already holds its final values — nothing to add.
    const int ib = i1 / ne01;
    if (ib == 0) {
        return;
    }

    const int i00 = ib * nth;
    const int i01 = i1 % ne01;
    const int i02 = i2;
    const int i03 = i3;

    global float * tmp_row = (global float *)(tmp + nbt1 * i01 + nbt2 * i02 + nbt3 * i03);
    global float * dst_row = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    if (i00 + tid < ne00) {
        // tmp_row[ib - 1] is the running total of all preceding blocks.
        dst_row[i00 + tid] += tmp_row[ib - 1];
    }
}