#include <stddef.h>
#include <stdint.h>
#include <math.h>
/*
 * Name-building helpers: FUNC(N) yields the identifier N pasted together with
 * SUFFIX. The extra FUNC_CAT level lets SUFFIX be macro-expanded before the
 * ## operator is applied in FUNC_CAT_INNER.
 * NOTE(review): SUFFIX is not defined in this file - presumably supplied by
 * the build (e.g. a -D flag) or by an including file; confirm.
 */
#define FUNC_CAT_INNER(A, B) A##B
#define FUNC_CAT(A, B) FUNC_CAT_INNER(A, B)
#define FUNC(N) FUNC_CAT(N, SUFFIX)
/*
 * FP16 is the half-precision element type used by the kernels below.
 * clang is tested first because it also defines __GNUC__.
 * No definition is provided for compilers matching neither branch.
 */
#if defined(__clang__)
#define FP16 __fp16
#elif defined(__GNUC__) || defined(__GNUG__)
#define FP16 _Float16
#endif
/*
 * Euclidean (L2) norm of a half-precision vector.
 *
 * data      - elements to read; indices 0 .. dimension-1 are accessed
 * dimension - number of elements
 *
 * Every element is widened to float before being squared, and the running
 * total is kept in a float. Returns the square root of that total; with
 * dimension == 0 the loop does not run and the result is sqrtf(0).
 */
float FUNC(norm_l2_f16)(const FP16 *data, uint32_t dimension) {
    float sum_sq = 0.0f;
    /* Optimization hints addressed to clang for the loop that follows. */
#pragma clang loop unroll(enable) vectorize(enable) interleave(enable)
    for (uint32_t k = 0; k < dimension; ++k) {
        const float v = (float) data[k];
        sum_sq += v * v;
    }
    return sqrtf(sum_sq);
}
float FUNC(dot_f16)(const FP16 *x, const FP16 *y, uint32_t dimension) {
float sum = 0;
#pragma clang loop unroll(enable) interleave(enable) vectorize(enable)
for (uint32_t i = 0; i < dimension; i++) {
sum += (float) x[i] * (float) y[i];
}
return sum;
}
/*
 * Sum of squared element-wise differences between two half-precision vectors.
 *
 * x, y      - operands; indices 0 .. dimension-1 of both are accessed
 * dimension - number of elements in each operand
 *
 * Note that no square root is applied: the value returned is the squared
 * Euclidean distance. The subtraction is done in float after widening each
 * element. Returns 0 when dimension is 0.
 */
float FUNC(l2_f16)(const FP16 *x, const FP16 *y, uint32_t dimension) {
    float acc = 0.0f;
    /* Optimization hints addressed to clang for the loop that follows. */
#pragma clang loop unroll(enable) interleave(enable) vectorize(enable)
    for (uint32_t k = 0; k < dimension; ++k) {
        const float diff = (float) x[k] - (float) y[k];
        acc += diff * diff;
    }
    return acc;
}
float FUNC(cosine_f16)(const FP16 *x, float x_norm, const FP16 *y, uint32_t dimension) {
float dot = 0.0;
float l2_y = 0.0;
#pragma clang loop unroll(enable) interleave(enable) vectorize(enable)
for (uint32_t i = 0; i < dimension; i++) {
float y_i = (float) y[i];
dot += (float) x[i] * y_i;
l2_y += y_i * y_i;
}
return 1.0 - dot / (x_norm * sqrtf(l2_y));
}