// Sums the values handled by each work-group into one partial sum.
//
// Every work-item copies one element of `original` into `workgroup_state`.
// The work-group then repeatedly folds the upper half of that local array
// onto the lower half until a single value is left, and work-item 0 stores
// it in reduced[get_group_id(0)].
//
// To obtain the sum of a whole array this kernel has to be evaluated
// repeatedly, about ceil(log(N)) times, where N is the total amount of
// numbers and the logarithm is taken with the work-group size as its base.
//
// original        - input values
// reduced         - output, one partial sum per work-group
// workgroup_state - local scratch space, indexed by local id
// buffer_length   - treated as the number of valid elements in `original`
kernel void sum_all_values_in_workgroups(
    global float *original,
    global float *reduced,
    local float *workgroup_state,
    int buffer_length
) {
    int local_id = get_local_id(0);
    int global_id = get_global_id(0);
    int group_size = get_local_size(0);

    // Never reduce over more slots than there are input values.
    if (group_size > buffer_length) {
        group_size = buffer_length;
    }

    // Work-items past the end of the input contribute 0 instead of reading
    // out of bounds, so padded or partially filled work-groups still
    // produce the correct sum.
    workgroup_state[local_id] =
        (global_id < buffer_length) ? original[global_id] : 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    int half_size = group_size / 2;
    while (group_size > 1) {
        // If the id in the work-group is in the first half, add the
        // corresponding value of the other half onto it.
        if (local_id < half_size) {
            workgroup_state[local_id] += workgroup_state[local_id + half_size];

            // An odd group_size leaves the last element without a partner;
            // work-item 0 folds it in so that it is not lost.
            if (local_id == 0 && (half_size * 2) < group_size) {
                workgroup_state[0] += workgroup_state[group_size - 1];
            }
        }

        // group_size is the same for every work-item, so all of them reach
        // this barrier the same number of times.
        barrier(CLK_LOCAL_MEM_FENCE);

        group_size = half_size;
        half_size = group_size / 2;
    }

    // After summing all of the items in the work-group, associate the
    // result with the current work-group in the reduced array.
    if (local_id == 0) {
        reduced[get_group_id(0)] = workgroup_state[0];
    }
}
// Clamps every element of `self` to the range [_min, _max], in place.
//
// self - buffer that is read and overwritten
// _min - lower bound
// _max - upper bound
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void clip_min_max_inplace(
    global float *self,
    float _min,
    float _max,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    // min() applies the upper bound, max() the lower bound. With the bounds
    // the other way round every element would collapse to _max.
    self[index] = max(min(self[index], _max), _min);
}
// Multiplies every element of `self` by `scaler`, in place.
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void scale_inplace(
    global float *self,
    float scaler,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    self[index] = self[index] * scaler;
}
// Writes nums[i] * scaler into result[i] for every i below `size`.
//
// nums   - input values
// result - output buffer
// size   - number of elements to process; work-items with an index at or
//          beyond it do nothing
kernel void scale(
    global float *nums,
    global float *result,
    float scaler,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = nums[index] * scaler;
}
// Replaces every element of `buf` with its square root.
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void sqrt_inplace(
    global float *buf,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    buf[index] = sqrt(buf[index]);
}
// Writes sqrt(first[i]) into result[i] for every i below `size`.
//
// first  - input values
// result - output buffer
// size   - number of elements to process; work-items with an index at or
//          beyond it do nothing
kernel void squareroot(
    global float *first,
    global float *result,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = sqrt(first[index]);
}
// Replaces every element of `buf` with 1 / sqrt(element).
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void inverse_sqrt_inplace(
    global float *buf,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    buf[index] = 1.0f / sqrt(buf[index]);
}
// Writes 1 / sqrt(first[i]) into result[i] for every i below `size`.
//
// first  - input values
// result - output buffer
// size   - number of elements to process; work-items with an index at or
//          beyond it do nothing
kernel void inverse_sqrt(
    global float *first,
    global float *result,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = 1.0f / sqrt(first[index]);
}
// Adds the constant `num` to every element of `buf`, in place.
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void shift_inplace(
    global float *buf,
    float num,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    buf[index] = buf[index] + num;
}
// Writes first[i] + num into result[i] for every i below `size`.
//
// first  - input values
// result - output buffer
// num    - constant added to every element
// size   - number of elements to process; work-items with an index at or
//          beyond it do nothing
kernel void add_num(
    global float *first,
    global float *result,
    float num,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = first[index] + num;
}
// Adds `other` onto `self` element by element: self[i] += other[i].
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void add_inplace(
    global float *self,
    global float *other,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    self[index] = self[index] + other[index];
}
// Writes first[i] + second[i] into result[i] for every i below `size`.
//
// first, second - input values
// result        - output buffer
// size          - number of elements to process; work-items with an index
//                 at or beyond it do nothing
kernel void add(
    global float *first,
    global float *second,
    global float *result,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = first[index] + second[index];
}
// Subtracts `other` from `self` element by element: self[i] -= other[i].
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void subtract_inplace(
    global float *self,
    global float *other,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    self[index] = self[index] - other[index];
}
// Writes first[i] - second[i] into result[i] for every i below `size`.
//
// first, second - input values
// result        - output buffer
// size          - number of elements to process; work-items with an index
//                 at or beyond it do nothing
kernel void subtract(
    global float *first,
    global float *second,
    global float *result,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = first[index] - second[index];
}
// Multiplies `self` by `other` element by element: self[i] *= other[i].
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void multiply_inplace(
    global float *self,
    global float *other,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    self[index] = self[index] * other[index];
}
// Writes first[i] * second[i] into result[i] for every i below `size`.
//
// first, second - input values
// result        - output buffer
// size          - number of elements to process; work-items with an index
//                 at or beyond it do nothing
kernel void multiply(
    global float *first,
    global float *second,
    global float *result,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = first[index] * second[index];
}
// Divides `self` by `other` element by element: self[i] /= other[i].
// No check is made for zero divisors.
//
// size - number of elements to process; work-items with an index at or
//        beyond it do nothing
kernel void divide_inplace(
    global float *self,
    global float *other,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    self[index] = self[index] / other[index];
}
// Writes first[i] / second[i] into result[i] for every i below `size`.
// No check is made for zero divisors.
//
// first  - dividends
// second - divisors
// result - output buffer
// size   - number of elements to process; work-items with an index at or
//          beyond it do nothing
kernel void divide(
    global float *first,
    global float *second,
    global float *result,
    int size
) {
    int index = get_global_id(0);
    if (index >= size) {
        return;
    }

    result[index] = first[index] / second[index];
}