#ifndef SIMSIMD_SPATIAL_H
#define SIMSIMD_SPATIAL_H
#include "types.h"
#include "dot.h"
#ifdef __cplusplus
extern "C" {
#endif
// Prototypes of the portable kernels generated by the `SIMSIMD_MAKE_L2`, `SIMSIMD_MAKE_L2SQ`
// and `SIMSIMD_MAKE_COS` macros in this header. Every kernel reads `n` elements from `a` and
// `b` and stores a single distance to `*d`.
//
// `_serial` kernels: Euclidean (l2), squared Euclidean (l2sq) and cosine (cos) distances.
SIMSIMD_PUBLIC void simsimd_l2_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f64_serial(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f32_serial(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f16_serial(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_bf16_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// `_accurate` kernels: same metrics, instantiated with an f64 accumulator.
SIMSIMD_PUBLIC void simsimd_l2_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f32_accurate(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// Prototypes of the `_neon` kernels: Euclidean (l2), squared Euclidean (l2sq) and cosine (cos)
// distances for f64, f32, f16, bf16, i8 and u8 inputs. Each reads from `a` and `b`, takes the
// length `n`, and stores one distance to `*d`.
SIMSIMD_PUBLIC void simsimd_l2_f64_neon(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_neon(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f64_neon(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// Prototypes of the `_sve` kernels (l2, l2sq, cos) for f32, f16, bf16 and f64 inputs.
// Same calling convention as the other backends: inputs `a` and `b`, length `n`, output `*d`.
SIMSIMD_PUBLIC void simsimd_l2_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_bf16_sve(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_sve(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_bf16_sve(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// Prototypes of the `_haswell` kernels (l2, l2sq, cos) for i8, u8, f16, bf16, f32 and f64 inputs.
// Their definitions are not in this part of the file.
SIMSIMD_PUBLIC void simsimd_l2_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f64_haswell(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_haswell(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f64_haswell(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// Prototypes of the `_skylake` kernels (l2, l2sq, cos) for f32 and f64 inputs.
// Their definitions are not in this part of the file.
SIMSIMD_PUBLIC void simsimd_l2_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// Prototypes of the `_ice` kernels (l2, l2sq, cos) for i4x2, i8 and u8 inputs.
// Their definitions are not in this part of the file.
// NOTE(review): for `i4x2` it is not visible here whether `n` counts packed bytes or 4-bit values - confirm.
SIMSIMD_PUBLIC void simsimd_l2_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
// Prototypes of the `_genoa` (bf16), `_sapphire` (f16) and `_sierra` (i8, cosine only) kernels.
// Their definitions are not in this part of the file.
SIMSIMD_PUBLIC void simsimd_l2_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d);
/**
 *  @brief  Generates `simsimd_l2sq_<input_type>_<name>`, which sums the squared element-wise
 *          differences of two `n`-element arrays and stores the sum to `*result`.
 *
 *  @param name             Suffix of the generated function name.
 *  @param input_type       Type tag of the inputs: arguments are `simsimd_<input_type>_t const *`.
 *  @param accumulator_type Type tag of the running sum: `simsimd_<accumulator_type>_t`.
 *  @param load_and_convert Invoked on an element pointer to produce a value of the accumulator type.
 *
 *  With `n == 0` the loop body never runs and 0 is stored.
 */
#define SIMSIMD_MAKE_L2SQ(name, input_type, accumulator_type, load_and_convert) \
    SIMSIMD_PUBLIC void simsimd_l2sq_##input_type##_##name(simsimd_##input_type##_t const *a, \
                                                           simsimd_##input_type##_t const *b, simsimd_size_t n, \
                                                           simsimd_distance_t *result) { \
        simsimd_##accumulator_type##_t sum_of_squares = 0; \
        simsimd_size_t idx = 0; \
        while (idx != n) { \
            simsimd_##accumulator_type##_t lhs = load_and_convert(a + idx); \
            simsimd_##accumulator_type##_t rhs = load_and_convert(b + idx); \
            sum_of_squares += (lhs - rhs) * (lhs - rhs); \
            ++idx; \
        } \
        *result = sum_of_squares; \
    }
/**
 *  @brief  Generates `simsimd_l2_<input_type>_<name>`, which calls the matching
 *          `simsimd_l2sq_<input_type>_<name>` and stores `SIMSIMD_SQRT` of its output to `*result`.
 *
 *  @param name             Suffix of the generated function name.
 *  @param input_type       Type tag of the inputs: arguments are `simsimd_<input_type>_t const *`.
 *  @param accumulator_type Not used by this macro; kept so all `SIMSIMD_MAKE_*` macros share a signature.
 *  @param load_and_convert Not used by this macro; kept for the same reason.
 */
#define SIMSIMD_MAKE_L2(name, input_type, accumulator_type, load_and_convert) \
    SIMSIMD_PUBLIC void simsimd_l2_##input_type##_##name(simsimd_##input_type##_t const *a, \
                                                         simsimd_##input_type##_t const *b, simsimd_size_t n, \
                                                         simsimd_distance_t *result) { \
        simsimd_distance_t squared_distance; \
        simsimd_l2sq_##input_type##_##name(a, b, n, &squared_distance); \
        *result = SIMSIMD_SQRT(squared_distance); \
    }
/**
 *  @brief  Generates `simsimd_cos_<input_type>_<name>`, which accumulates the dot product and the
 *          two squared norms of `a` and `b`, then stores `1 - ab / (sqrt(a2) * sqrt(b2))`,
 *          clipped below at zero, to `*result`.
 *
 *  @param name             Suffix of the generated function name.
 *  @param input_type       Type tag of the inputs: arguments are `simsimd_<input_type>_t const *`.
 *  @param accumulator_type Type tag of the three running sums: `simsimd_<accumulator_type>_t`.
 *  @param load_and_convert Invoked on an element pointer to produce a value of the accumulator type.
 *
 *  Special cases: both squared norms zero stores 0; otherwise a zero dot product stores 1.
 */
#define SIMSIMD_MAKE_COS(name, input_type, accumulator_type, load_and_convert) \
    SIMSIMD_PUBLIC void simsimd_cos_##input_type##_##name(simsimd_##input_type##_t const *a, \
                                                          simsimd_##input_type##_t const *b, simsimd_size_t n, \
                                                          simsimd_distance_t *result) { \
        simsimd_##accumulator_type##_t dot = 0; \
        simsimd_##accumulator_type##_t a_norm_sq = 0; \
        simsimd_##accumulator_type##_t b_norm_sq = 0; \
        simsimd_size_t idx = 0; \
        while (idx != n) { \
            simsimd_##accumulator_type##_t lhs = load_and_convert(a + idx); \
            simsimd_##accumulator_type##_t rhs = load_and_convert(b + idx); \
            dot += lhs * rhs; \
            a_norm_sq += lhs * lhs; \
            b_norm_sq += rhs * rhs; \
            ++idx; \
        } \
        if (a_norm_sq == 0 && b_norm_sq == 0) { \
            *result = 0; \
            return; \
        } \
        if (dot == 0) { \
            *result = 1; \
            return; \
        } \
        simsimd_distance_t unclipped = 1 - dot * SIMSIMD_RSQRT(a_norm_sq) * SIMSIMD_RSQRT(b_norm_sq); \
        *result = unclipped > 0 ? unclipped : 0; \
    }
// Instantiations of the serial kernels. Each line generates the cos, l2sq and l2 functions for one
// input type; l2sq comes before l2 because the generated l2 function calls the l2sq one.
// simsimd_{cos,l2sq,l2}_f64_serial: f64 inputs, f64 accumulator.
SIMSIMD_MAKE_COS(serial, f64, f64, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2SQ(serial, f64, f64, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2(serial, f64, f64, SIMSIMD_DEREFERENCE)
// simsimd_{cos,l2sq,l2}_f32_serial: f32 inputs, f32 accumulator.
SIMSIMD_MAKE_COS(serial, f32, f32, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2SQ(serial, f32, f32, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2(serial, f32, f32, SIMSIMD_DEREFERENCE)
// simsimd_{cos,l2sq,l2}_f16_serial: f16 inputs loaded through SIMSIMD_F16_TO_F32, f32 accumulator.
SIMSIMD_MAKE_COS(serial, f16, f32, SIMSIMD_F16_TO_F32) SIMSIMD_MAKE_L2SQ(serial, f16, f32, SIMSIMD_F16_TO_F32) SIMSIMD_MAKE_L2(serial, f16, f32, SIMSIMD_F16_TO_F32)
// simsimd_{cos,l2sq,l2}_bf16_serial: bf16 inputs loaded through SIMSIMD_BF16_TO_F32, f32 accumulator.
SIMSIMD_MAKE_COS(serial, bf16, f32, SIMSIMD_BF16_TO_F32) SIMSIMD_MAKE_L2SQ(serial, bf16, f32, SIMSIMD_BF16_TO_F32) SIMSIMD_MAKE_L2(serial, bf16, f32, SIMSIMD_BF16_TO_F32)
// simsimd_{cos,l2sq,l2}_i8_serial: i8 inputs, i32 accumulator.
SIMSIMD_MAKE_COS(serial, i8, i32, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2SQ(serial, i8, i32, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2(serial, i8, i32, SIMSIMD_DEREFERENCE)
// simsimd_{cos,l2sq,l2}_u8_serial: u8 inputs, i32 accumulator.
SIMSIMD_MAKE_COS(serial, u8, i32, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2SQ(serial, u8, i32, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2(serial, u8, i32, SIMSIMD_DEREFERENCE)
// simsimd_{cos,l2sq,l2}_f32_accurate: f32 inputs, f64 accumulator.
SIMSIMD_MAKE_COS(accurate, f32, f64, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2SQ(accurate, f32, f64, SIMSIMD_DEREFERENCE) SIMSIMD_MAKE_L2(accurate, f32, f64, SIMSIMD_DEREFERENCE)
// simsimd_{cos,l2sq,l2}_f16_accurate: f16 inputs loaded through SIMSIMD_F16_TO_F32, f64 accumulator.
SIMSIMD_MAKE_COS(accurate, f16, f64, SIMSIMD_F16_TO_F32) SIMSIMD_MAKE_L2SQ(accurate, f16, f64, SIMSIMD_F16_TO_F32) SIMSIMD_MAKE_L2(accurate, f16, f64, SIMSIMD_F16_TO_F32)
// simsimd_{cos,l2sq,l2}_bf16_accurate: bf16 inputs loaded through SIMSIMD_BF16_TO_F32, f64 accumulator.
SIMSIMD_MAKE_COS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) SIMSIMD_MAKE_L2SQ(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) SIMSIMD_MAKE_L2(accurate, bf16, f64, SIMSIMD_BF16_TO_F32)
#if _SIMSIMD_TARGET_ARM
#if SIMSIMD_TARGET_NEON
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function)
/**
 *  @brief  Square root of one f32 value, computed with the NEON `vsqrt_f32` intrinsic.
 *  @param x Input value; it is broadcast to a 2-lane vector and lane 0 of the result is returned.
 */
SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_neon(simsimd_f32_t x) {
    float32x2_t broadcast = vdup_n_f32(x);
    float32x2_t roots = vsqrt_f32(broadcast);
    return vget_lane_f32(roots, 0);
}
/**
 *  @brief  Square root of one f64 value, computed with the NEON `vsqrt_f64` intrinsic.
 *  @param x Input value; it is placed in a 1-lane vector and lane 0 of the result is returned.
 */
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_sqrt_f64_neon(simsimd_f64_t x) {
    float64x1_t wrapped = vdup_n_f64(x);
    float64x1_t root = vsqrt_f64(wrapped);
    return vget_lane_f64(root, 0);
}
/**
 *  @brief  Turns a dot product and two squared norms into a cosine distance, in f32.
 *
 *  @param ab Dot product of the two vectors.
 *  @param a2 Squared norm of the first vector.
 *  @param b2 Squared norm of the second vector.
 *  @return 0 when both squared norms are zero; otherwise 1 when `ab` is zero; otherwise
 *          `1 - ab * rsqrt(a2) * rsqrt(b2)`, clipped below at zero.
 *
 *  Both reciprocal square roots are computed together: a `vrsqrte_f32` estimate followed by
 *  two `vrsqrts_f32` refinement steps.
 */
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_neon(simsimd_f32_t ab, simsimd_f32_t a2,
                                                                    simsimd_f32_t b2) {
    simsimd_distance_t distance;
    if (a2 == 0 && b2 == 0) { distance = 0; }
    else if (ab == 0) { distance = 1; }
    else {
        simsimd_f32_t norms[2] = {a2, b2};
        float32x2_t squares = vld1_f32(norms);
        float32x2_t rsqrts = vrsqrte_f32(squares);
        // Two refinement steps applied to the initial estimate.
        for (int step = 0; step != 2; ++step)
            rsqrts = vmul_f32(rsqrts, vrsqrts_f32(vmul_f32(squares, rsqrts), rsqrts));
        simsimd_f32_t rsqrt_a2 = vget_lane_f32(rsqrts, 0);
        simsimd_f32_t rsqrt_b2 = vget_lane_f32(rsqrts, 1);
        simsimd_distance_t unclipped = 1 - ab * rsqrt_a2 * rsqrt_b2;
        distance = unclipped > 0 ? unclipped : 0;
    }
    return distance;
}
/**
 *  @brief  Turns a dot product and two squared norms into a cosine distance, in f64.
 *
 *  @param ab Dot product of the two vectors.
 *  @param a2 Squared norm of the first vector.
 *  @param b2 Squared norm of the second vector.
 *  @return 0 when both squared norms are zero; otherwise 1 when `ab` is zero; otherwise
 *          `1 - ab * rsqrt(a2) * rsqrt(b2)`, clipped below at zero.
 *
 *  Both reciprocal square roots are computed together: a `vrsqrteq_f64` estimate followed by
 *  two `vrsqrtsq_f64` refinement steps.
 */
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_neon(simsimd_f64_t ab, simsimd_f64_t a2,
                                                                    simsimd_f64_t b2) {
    simsimd_distance_t distance;
    if (a2 == 0 && b2 == 0) { distance = 0; }
    else if (ab == 0) { distance = 1; }
    else {
        simsimd_f64_t norms[2] = {a2, b2};
        float64x2_t squares = vld1q_f64(norms);
        float64x2_t rsqrts = vrsqrteq_f64(squares);
        // Two refinement steps applied to the initial estimate.
        for (int step = 0; step != 2; ++step)
            rsqrts = vmulq_f64(rsqrts, vrsqrtsq_f64(vmulq_f64(squares, rsqrts), rsqrts));
        simsimd_f64_t rsqrt_a2 = vgetq_lane_f64(rsqrts, 0);
        simsimd_f64_t rsqrt_b2 = vgetq_lane_f64(rsqrts, 1);
        simsimd_distance_t unclipped = 1 - ab * rsqrt_a2 * rsqrt_b2;
        distance = unclipped > 0 ? unclipped : 0;
    }
    return distance;
}
/**
 *  @brief  Euclidean distance between two f32 arrays: the square root of `simsimd_l2sq_f32_neon`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_f32_neon(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f64_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two f32 arrays.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences, accumulated in f32.
 *
 *  Full groups of 4 elements go through a vector fused multiply-add; the remaining
 *  0..3 elements are added to the reduced sum one at a time.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    float32x4_t acc_vec = vdupq_n_f32(0);
    simsimd_size_t pos = 0;
    while (pos + 4 <= n) {
        float32x4_t delta_vec = vsubq_f32(vld1q_f32(a + pos), vld1q_f32(b + pos));
        acc_vec = vfmaq_f32(acc_vec, delta_vec, delta_vec);
        pos += 4;
    }
    simsimd_f32_t total = vaddvq_f32(acc_vec);
    // Scalar tail.
    while (pos < n) {
        simsimd_f32_t delta = a[pos] - b[pos];
        total += delta * delta;
        ++pos;
    }
    *result = total;
}
/**
 *  @brief  Cosine distance between two f32 arrays.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f64_neon`.
 *
 *  The dot product and both squared norms are accumulated in f32: 4 elements at a time with
 *  vector fused multiply-adds, then a scalar tail. The three sums are widened when passed
 *  to the f64 normalization helper.
 */
SIMSIMD_PUBLIC void simsimd_cos_f32_neon(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    float32x4_t dot_vec = vdupq_n_f32(0);
    float32x4_t a_norm_vec = vdupq_n_f32(0);
    float32x4_t b_norm_vec = vdupq_n_f32(0);
    simsimd_size_t pos = 0;
    while (pos + 4 <= n) {
        float32x4_t a_vec = vld1q_f32(a + pos);
        float32x4_t b_vec = vld1q_f32(b + pos);
        dot_vec = vfmaq_f32(dot_vec, a_vec, b_vec);
        a_norm_vec = vfmaq_f32(a_norm_vec, a_vec, a_vec);
        b_norm_vec = vfmaq_f32(b_norm_vec, b_vec, b_vec);
        pos += 4;
    }
    simsimd_f32_t ab = vaddvq_f32(dot_vec);
    simsimd_f32_t a2 = vaddvq_f32(a_norm_vec);
    simsimd_f32_t b2 = vaddvq_f32(b_norm_vec);
    // Scalar tail.
    while (pos < n) {
        simsimd_f32_t ai = a[pos];
        simsimd_f32_t bi = b[pos];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++pos;
    }
    *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
/**
 *  @brief  Euclidean distance between two f64 arrays: the square root of `simsimd_l2sq_f64_neon`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_f64_neon(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_f64_neon(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f64_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two f64 arrays.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Full pairs of elements go through a vector fused multiply-add; a trailing odd
 *  element, if any, is added to the reduced sum separately.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f64_neon(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    float64x2_t acc_vec = vdupq_n_f64(0);
    simsimd_size_t pos = 0;
    while (pos + 2 <= n) {
        float64x2_t delta_vec = vsubq_f64(vld1q_f64(a + pos), vld1q_f64(b + pos));
        acc_vec = vfmaq_f64(acc_vec, delta_vec, delta_vec);
        pos += 2;
    }
    simsimd_f64_t total = vaddvq_f64(acc_vec);
    // Scalar tail.
    while (pos < n) {
        simsimd_f64_t delta = a[pos] - b[pos];
        total += delta * delta;
        ++pos;
    }
    *result = total;
}
/**
 *  @brief  Cosine distance between two f64 arrays.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f64_neon`.
 *
 *  The dot product and both squared norms are accumulated 2 elements at a time with
 *  vector fused multiply-adds, followed by a scalar tail.
 */
SIMSIMD_PUBLIC void simsimd_cos_f64_neon(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    float64x2_t dot_vec = vdupq_n_f64(0);
    float64x2_t a_norm_vec = vdupq_n_f64(0);
    float64x2_t b_norm_vec = vdupq_n_f64(0);
    simsimd_size_t pos = 0;
    while (pos + 2 <= n) {
        float64x2_t a_vec = vld1q_f64(a + pos);
        float64x2_t b_vec = vld1q_f64(b + pos);
        dot_vec = vfmaq_f64(dot_vec, a_vec, b_vec);
        a_norm_vec = vfmaq_f64(a_norm_vec, a_vec, a_vec);
        b_norm_vec = vfmaq_f64(b_norm_vec, b_vec, b_vec);
        pos += 2;
    }
    simsimd_f64_t ab = vaddvq_f64(dot_vec);
    simsimd_f64_t a2 = vaddvq_f64(a_norm_vec);
    simsimd_f64_t b2 = vaddvq_f64(b_norm_vec);
    // Scalar tail.
    while (pos < n) {
        simsimd_f64_t ai = a[pos];
        simsimd_f64_t bi = b[pos];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++pos;
    }
    *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_NEON_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+simd+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function)
/**
 *  @brief  Euclidean distance between two f16 arrays: the square root of `simsimd_l2sq_f16_neon`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance; the square root itself is taken in f32.
 */
SIMSIMD_PUBLIC void simsimd_l2_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_f16_neon(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f32_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two f16 arrays, accumulated in f32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Elements are consumed 4 at a time and widened to f32. The final (or only) step, with
 *  fewer than 4 elements left, goes through `_simsimd_partial_load_f16x4_neon`; the step
 *  always runs at least once, also for `n == 0`.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    float32x4_t sum_vec = vdupq_n_f32(0);
    do {
        float32x4_t a_vec, b_vec;
        if (n >= 4) {
            a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a));
            b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b));
            a += 4;
            b += 4;
            n -= 4;
        }
        else {
            // Fewer than 4 elements left: partial load, then finish.
            a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n));
            b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n));
            n = 0;
        }
        float32x4_t diff_vec = vsubq_f32(a_vec, b_vec);
        sum_vec = vfmaq_f32(sum_vec, diff_vec, diff_vec);
    } while (n);
    *result = vaddvq_f32(sum_vec);
}
/**
 *  @brief  Cosine distance between two f16 arrays, accumulated in f32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f32_neon`.
 *
 *  Elements are consumed 4 at a time and widened to f32. The final (or only) step, with
 *  fewer than 4 elements left, goes through `_simsimd_partial_load_f16x4_neon`.
 */
SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    float32x4_t ab_vec = vdupq_n_f32(0);
    float32x4_t a2_vec = vdupq_n_f32(0);
    float32x4_t b2_vec = vdupq_n_f32(0);
    do {
        float32x4_t a_vec, b_vec;
        if (n >= 4) {
            a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)a));
            b_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const *)b));
            a += 4;
            b += 4;
            n -= 4;
        }
        else {
            // Fewer than 4 elements left: partial load, then finish.
            a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n));
            b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n));
            n = 0;
        }
        ab_vec = vfmaq_f32(ab_vec, a_vec, b_vec);
        a2_vec = vfmaq_f32(a2_vec, a_vec, a_vec);
        b2_vec = vfmaq_f32(b2_vec, b_vec, b_vec);
    } while (n);
    simsimd_f32_t ab = vaddvq_f32(ab_vec);
    simsimd_f32_t a2 = vaddvq_f32(a2_vec);
    simsimd_f32_t b2 = vaddvq_f32(b2_vec);
    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_NEON_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.6-a+simd+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function)
/**
 *  @brief  Cosine distance between two bf16 arrays, accumulated in f32 via `vbfdotq_f32`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f32_neon`.
 *
 *  Elements are consumed 8 at a time. The final (or only) step, with fewer than 8 elements
 *  left, goes through `_simsimd_partial_load_bf16x8_neon`.
 */
SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    float32x4_t ab_vec = vdupq_n_f32(0), a2_vec = vdupq_n_f32(0), b2_vec = vdupq_n_f32(0);
    do {
        bfloat16x8_t a_vec, b_vec;
        if (n >= 8) {
            a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)a);
            b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)b);
            a += 8;
            b += 8;
            n -= 8;
        }
        else {
            // Fewer than 8 elements left: partial load, then finish.
            a_vec = _simsimd_partial_load_bf16x8_neon(a, n);
            b_vec = _simsimd_partial_load_bf16x8_neon(b, n);
            n = 0;
        }
        ab_vec = vbfdotq_f32(ab_vec, a_vec, b_vec);
        a2_vec = vbfdotq_f32(a2_vec, a_vec, a_vec);
        b2_vec = vbfdotq_f32(b2_vec, b_vec, b_vec);
    } while (n);
    simsimd_f32_t ab = vaddvq_f32(ab_vec);
    simsimd_f32_t a2 = vaddvq_f32(a2_vec);
    simsimd_f32_t b2 = vaddvq_f32(b2_vec);
    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
/**
 *  @brief  Euclidean distance between two bf16 arrays: the square root of `simsimd_l2sq_bf16_neon`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_bf16_neon(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f64_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two bf16 arrays, accumulated in f32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Elements are consumed 8 at a time. Each 8-element vector is split into low and high
 *  halves, widened to f32, and accumulated into two separate sums that are added at the end.
 *  The final (or only) step, with fewer than 8 elements left, goes through
 *  `_simsimd_partial_load_bf16x8_neon`.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    float32x4_t sum_high_vec = vdupq_n_f32(0);
    float32x4_t sum_low_vec = vdupq_n_f32(0);
    do {
        bfloat16x8_t a_vec, b_vec;
        if (n >= 8) {
            a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)a);
            b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)b);
            a += 8;
            b += 8;
            n -= 8;
        }
        else {
            // Fewer than 8 elements left: partial load, then finish.
            a_vec = _simsimd_partial_load_bf16x8_neon(a, n);
            b_vec = _simsimd_partial_load_bf16x8_neon(b, n);
            n = 0;
        }
        float32x4_t diff_high_vec =
            vsubq_f32(vcvt_f32_bf16(vget_high_bf16(a_vec)), vcvt_f32_bf16(vget_high_bf16(b_vec)));
        float32x4_t diff_low_vec =
            vsubq_f32(vcvt_f32_bf16(vget_low_bf16(a_vec)), vcvt_f32_bf16(vget_low_bf16(b_vec)));
        sum_high_vec = vfmaq_f32(sum_high_vec, diff_high_vec, diff_high_vec);
        sum_low_vec = vfmaq_f32(sum_low_vec, diff_low_vec, diff_low_vec);
    } while (n);
    *result = vaddvq_f32(vaddq_f32(sum_high_vec, sum_low_vec));
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_NEON_I8
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+dotprod+i8mm")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+dotprod+i8mm"))), apply_to = function)
/**
 *  @brief  Euclidean distance between two i8 arrays: the square root of `simsimd_l2sq_i8_neon`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance; the square root itself is taken in f32.
 */
SIMSIMD_PUBLIC void simsimd_l2_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_i8_neon(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f32_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two i8 arrays, accumulated in u32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Full groups of 16 elements take the absolute difference with `vabdq_s8`, reinterpret it
 *  as unsigned bytes, and square-accumulate it with `vdotq_u32`. The remaining 0..15
 *  elements are handled by a scalar tail.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    uint32x4_t d2_vec = vdupq_n_u32(0);
    simsimd_size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        int8x16_t a_vec = vld1q_s8(a + i);
        int8x16_t b_vec = vld1q_s8(b + i);
        uint8x16_t d_vec = vreinterpretq_u8_s8(vabdq_s8(a_vec, b_vec));
        d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
    }
    simsimd_u32_t d2 = vaddvq_u32(d2_vec);
    for (; i < n; ++i) {
        // Named `diff` so that it does not shadow the length parameter `n`.
        simsimd_i32_t diff = (simsimd_i32_t)a[i] - b[i];
        d2 += (simsimd_u32_t)(diff * diff);
    }
    *result = d2;
}
/**
 *  @brief  Cosine distance between two i8 arrays, accumulated in i32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f32_neon`.
 *
 *  Full groups of 16 elements update the dot product and both squared norms with
 *  `vdotq_s32`; the remaining 0..15 elements are handled by a scalar tail. The integer
 *  sums are converted to f32 when passed to the normalization helper.
 */
SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    int32x4_t dot_vec = vdupq_n_s32(0);
    int32x4_t a_norm_vec = vdupq_n_s32(0);
    int32x4_t b_norm_vec = vdupq_n_s32(0);
    simsimd_size_t pos = 0;
    while (pos + 16 <= n) {
        int8x16_t a_vec = vld1q_s8(a + pos);
        int8x16_t b_vec = vld1q_s8(b + pos);
        dot_vec = vdotq_s32(dot_vec, a_vec, b_vec);
        a_norm_vec = vdotq_s32(a_norm_vec, a_vec, a_vec);
        b_norm_vec = vdotq_s32(b_norm_vec, b_vec, b_vec);
        pos += 16;
    }
    simsimd_i32_t ab = vaddvq_s32(dot_vec);
    simsimd_i32_t a2 = vaddvq_s32(a_norm_vec);
    simsimd_i32_t b2 = vaddvq_s32(b_norm_vec);
    // Scalar tail.
    while (pos < n) {
        simsimd_i32_t ai = a[pos];
        simsimd_i32_t bi = b[pos];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++pos;
    }
    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
/**
 *  @brief  Euclidean distance between two u8 arrays: the square root of `simsimd_l2sq_u8_neon`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance; the square root itself is taken in f32.
 */
SIMSIMD_PUBLIC void simsimd_l2_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_u8_neon(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f32_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two u8 arrays, accumulated in u32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Full groups of 16 elements take the absolute difference with `vabdq_u8` and
 *  square-accumulate it with `vdotq_u32`. The remaining 0..15 elements are handled
 *  by a scalar tail.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    uint32x4_t d2_vec = vdupq_n_u32(0);
    simsimd_size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        uint8x16_t a_vec = vld1q_u8(a + i);
        uint8x16_t b_vec = vld1q_u8(b + i);
        uint8x16_t d_vec = vabdq_u8(a_vec, b_vec);
        d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
    }
    simsimd_u32_t d2 = vaddvq_u32(d2_vec);
    for (; i < n; ++i) {
        // Named `diff` so that it does not shadow the length parameter `n`.
        simsimd_i32_t diff = (simsimd_i32_t)a[i] - b[i];
        d2 += (simsimd_u32_t)(diff * diff);
    }
    *result = d2;
}
/**
 *  @brief  Cosine distance between two u8 arrays, accumulated in u32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f32_neon`.
 *
 *  Full groups of 16 elements update the dot product and both squared norms with
 *  `vdotq_u32`; the remaining 0..15 elements are handled by a scalar tail. The integer
 *  sums are converted to f32 when passed to the normalization helper.
 */
SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    uint32x4_t dot_vec = vdupq_n_u32(0);
    uint32x4_t a_norm_vec = vdupq_n_u32(0);
    uint32x4_t b_norm_vec = vdupq_n_u32(0);
    simsimd_size_t pos = 0;
    while (pos + 16 <= n) {
        uint8x16_t a_vec = vld1q_u8(a + pos);
        uint8x16_t b_vec = vld1q_u8(b + pos);
        dot_vec = vdotq_u32(dot_vec, a_vec, b_vec);
        a_norm_vec = vdotq_u32(a_norm_vec, a_vec, a_vec);
        b_norm_vec = vdotq_u32(b_norm_vec, b_vec, b_vec);
        pos += 16;
    }
    simsimd_u32_t ab = vaddvq_u32(dot_vec);
    simsimd_u32_t a2 = vaddvq_u32(a_norm_vec);
    simsimd_u32_t b2 = vaddvq_u32(b_norm_vec);
    // Scalar tail.
    while (pos < n) {
        simsimd_u32_t ai = a[pos];
        simsimd_u32_t bi = b[pos];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++pos;
    }
    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_SVE
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function)
/**
 *  @brief  Euclidean distance between two f32 arrays: the square root of `simsimd_l2sq_f32_sve`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_f32_sve(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_f32_sve(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f64_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two f32 arrays using SVE, accumulated in f32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Each iteration processes `svcntw()` elements under a `whilelt` predicate, so the last
 *  partial group needs no separate tail loop. The body runs at least once; for `n == 0`
 *  the predicate is all-false and the result is 0.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f32_sve(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat32_t d2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    do {
        // Use the 64-bit bounds variant: narrowing to `unsigned int` would truncate
        // `i` and `n` for inputs of 2^32 elements or more.
        svbool_t pg_vec = svwhilelt_b32_u64((uint64_t)i, (uint64_t)n);
        svfloat32_t a_vec = svld1_f32(pg_vec, a + i);
        svfloat32_t b_vec = svld1_f32(pg_vec, b + i);
        svfloat32_t a_minus_b_vec = svsub_f32_x(pg_vec, a_vec, b_vec);
        // Merging predication: inactive lanes keep the accumulator's previous value.
        // The `_x` form leaves them unspecified, and every lane is summed below.
        d2_vec = svmla_f32_m(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
        i += svcntw();
    } while (i < n);
    simsimd_f32_t d2 = svaddv_f32(svptrue_b32(), d2_vec);
    *result = d2;
}
/**
 *  @brief  Cosine distance between two f32 arrays using SVE, accumulated in f32.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f64_neon`.
 *
 *  Each iteration processes `svcntw()` elements under a `whilelt` predicate and updates
 *  the dot product and both squared norms. The body runs at least once.
 */
SIMSIMD_PUBLIC void simsimd_cos_f32_sve(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat32_t ab_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    svfloat32_t a2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    svfloat32_t b2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    do {
        // Use the 64-bit bounds variant: narrowing to `unsigned int` would truncate
        // `i` and `n` for inputs of 2^32 elements or more.
        svbool_t pg_vec = svwhilelt_b32_u64((uint64_t)i, (uint64_t)n);
        svfloat32_t a_vec = svld1_f32(pg_vec, a + i);
        svfloat32_t b_vec = svld1_f32(pg_vec, b + i);
        // Merging predication: inactive lanes keep the accumulators' previous values.
        // The `_x` form leaves them unspecified, and every lane is summed below.
        ab_vec = svmla_f32_m(pg_vec, ab_vec, a_vec, b_vec);
        a2_vec = svmla_f32_m(pg_vec, a2_vec, a_vec, a_vec);
        b2_vec = svmla_f32_m(pg_vec, b2_vec, b_vec, b_vec);
        i += svcntw();
    } while (i < n);
    simsimd_f32_t ab = svaddv_f32(svptrue_b32(), ab_vec);
    simsimd_f32_t a2 = svaddv_f32(svptrue_b32(), a2_vec);
    simsimd_f32_t b2 = svaddv_f32(svptrue_b32(), b2_vec);
    *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
/**
 *  @brief  Euclidean distance between two f64 arrays: the square root of `simsimd_l2sq_f64_sve`.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_f64_sve(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_f64_sve(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f64_neon(squared_distance);
}
/**
 *  @brief  Squared Euclidean distance between two f64 arrays using SVE.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the sum of squared differences.
 *
 *  Each iteration processes `svcntd()` elements under a `whilelt` predicate, so the last
 *  partial group needs no separate tail loop. The body runs at least once; for `n == 0`
 *  the predicate is all-false and the result is 0.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f64_sve(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat64_t d2_vec = svdupq_n_f64(0.0, 0.0);
    do {
        // Use the 64-bit bounds variant: narrowing to `unsigned int` would truncate
        // `i` and `n` for inputs of 2^32 elements or more.
        svbool_t pg_vec = svwhilelt_b64_u64((uint64_t)i, (uint64_t)n);
        svfloat64_t a_vec = svld1_f64(pg_vec, a + i);
        svfloat64_t b_vec = svld1_f64(pg_vec, b + i);
        svfloat64_t a_minus_b_vec = svsub_f64_x(pg_vec, a_vec, b_vec);
        // Merging predication: inactive lanes keep the accumulator's previous value.
        // The `_x` form leaves them unspecified, and every lane is summed below.
        d2_vec = svmla_f64_m(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
        i += svcntd();
    } while (i < n);
    // All-true predicate at 64-bit lane granularity, matching the element type.
    simsimd_f64_t d2 = svaddv_f64(svptrue_b64(), d2_vec);
    *result = d2;
}
/**
 *  @brief  Cosine distance between two f64 arrays using SVE.
 *  @param a,b    Input arrays of `n` elements each.
 *  @param n      Number of elements.
 *  @param result Receives the distance produced by `_simsimd_cos_normalize_f64_neon`.
 *
 *  Each iteration processes `svcntd()` elements under a `whilelt` predicate and updates
 *  the dot product and both squared norms. The body runs at least once.
 */
SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat64_t ab_vec = svdupq_n_f64(0.0, 0.0);
    svfloat64_t a2_vec = svdupq_n_f64(0.0, 0.0);
    svfloat64_t b2_vec = svdupq_n_f64(0.0, 0.0);
    do {
        // Use the 64-bit bounds variant: narrowing to `unsigned int` would truncate
        // `i` and `n` for inputs of 2^32 elements or more.
        svbool_t pg_vec = svwhilelt_b64_u64((uint64_t)i, (uint64_t)n);
        svfloat64_t a_vec = svld1_f64(pg_vec, a + i);
        svfloat64_t b_vec = svld1_f64(pg_vec, b + i);
        // Merging predication: inactive lanes keep the accumulators' previous values.
        // The `_x` form leaves them unspecified, and every lane is summed below.
        ab_vec = svmla_f64_m(pg_vec, ab_vec, a_vec, b_vec);
        a2_vec = svmla_f64_m(pg_vec, a2_vec, a_vec, a_vec);
        b2_vec = svmla_f64_m(pg_vec, b2_vec, b_vec, b_vec);
        i += svcntd();
    } while (i < n);
    // All-true predicate at 64-bit lane granularity, matching the element type.
    simsimd_f64_t ab = svaddv_f64(svptrue_b64(), ab_vec);
    simsimd_f64_t a2 = svaddv_f64(svptrue_b64(), a2_vec);
    simsimd_f64_t b2 = svaddv_f64(svptrue_b64(), b2_vec);
    *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_SVE_F16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve+fp16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+fp16"))), apply_to = function)
/**
 *  @brief Euclidean (L2) distance between two f16 vectors on SVE.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f16_sve` and stores the
 *  result of `_simsimd_sqrt_f32_neon` applied to it.
 */
SIMSIMD_PUBLIC void simsimd_l2_f16_sve(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f16_sve(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_neon(squared);
}
/**
 *  @brief Squared Euclidean distance between two f16 vectors on SVE.
 *
 *  Differences, products and the running sum are all kept in half precision;
 *  only the final scalar is widened when stored to `result`.
 *
 *  @param a_enum,b_enum  Input vectors of `n` half-precision elements each.
 *  @param n              Number of elements.
 *  @param result         Receives the sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sve(simsimd_f16_t const *a_enum, simsimd_f16_t const *b_enum, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat16_t d2_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
    simsimd_f16_for_arm_simd_t const *a = (simsimd_f16_for_arm_simd_t const *)(a_enum);
    simsimd_f16_for_arm_simd_t const *b = (simsimd_f16_for_arm_simd_t const *)(b_enum);
    do {
        // 64-bit bounds avoid wrapping `i`/`n` for vectors of 2^32+ elements.
        svbool_t pg_vec = svwhilelt_b16_u64(i, n);
        svfloat16_t a_vec = svld1_f16(pg_vec, a + i);
        svfloat16_t b_vec = svld1_f16(pg_vec, b + i);
        svfloat16_t a_minus_b_vec = svsub_f16_x(pg_vec, a_vec, b_vec);
        // Merging (`_m`) accumulation preserves inactive lanes of the running
        // sum; `_x` would make them unspecified before the full reduction.
        d2_vec = svmla_f16_m(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
        i += svcnth();
    } while (i < n);
    simsimd_f16_for_arm_simd_t d2_f16 = svaddv_f16(svptrue_b16(), d2_vec);
    *result = d2_f16;
}
/**
 *  @brief Cosine distance between two f16 vectors on SVE.
 *
 *  The dot product and both squared norms are accumulated in half precision
 *  and then passed to `_simsimd_cos_normalize_f32_neon`.
 *
 *  @param a_enum,b_enum  Input vectors of `n` half-precision elements each.
 *  @param n              Number of elements.
 *  @param result         Receives the value returned by the normalization helper.
 */
SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const *a_enum, simsimd_f16_t const *b_enum, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat16_t ab_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
    svfloat16_t a2_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
    svfloat16_t b2_vec = svdupq_n_f16(0, 0, 0, 0, 0, 0, 0, 0);
    simsimd_f16_for_arm_simd_t const *a = (simsimd_f16_for_arm_simd_t const *)(a_enum);
    simsimd_f16_for_arm_simd_t const *b = (simsimd_f16_for_arm_simd_t const *)(b_enum);
    do {
        // 64-bit bounds avoid wrapping `i`/`n` for vectors of 2^32+ elements.
        svbool_t pg_vec = svwhilelt_b16_u64(i, n);
        svfloat16_t a_vec = svld1_f16(pg_vec, a + i);
        svfloat16_t b_vec = svld1_f16(pg_vec, b + i);
        // Merging (`_m`) accumulation preserves inactive lanes of the running
        // sums; `_x` would make them unspecified before the full reductions.
        ab_vec = svmla_f16_m(pg_vec, ab_vec, a_vec, b_vec);
        a2_vec = svmla_f16_m(pg_vec, a2_vec, a_vec, a_vec);
        b2_vec = svmla_f16_m(pg_vec, b2_vec, b_vec, b_vec);
        i += svcnth();
    } while (i < n);
    simsimd_f16_for_arm_simd_t ab = svaddv_f16(svptrue_b16(), ab_vec);
    simsimd_f16_for_arm_simd_t a2 = svaddv_f16(svptrue_b16(), a2_vec);
    simsimd_f16_for_arm_simd_t b2 = svaddv_f16(svptrue_b16(), b2_vec);
    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_SVE_BF16
#pragma GCC push_options
#pragma GCC target("arch=armv8.2-a+sve+bf16")
#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+bf16"))), apply_to = function)
/**
 *  @brief Euclidean (L2) distance between two bf16 vectors on SVE.
 *
 *  Obtains the squared distance from `simsimd_l2sq_bf16_sve` and stores the
 *  result of `_simsimd_sqrt_f32_neon` applied to it.
 */
SIMSIMD_PUBLIC void simsimd_l2_bf16_sve(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_bf16_sve(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_neon(squared);
}
/**
 *  @brief Squared Euclidean distance between two bf16 vectors on SVE.
 *
 *  Each 16-bit lane is widened to 32 bits and shifted left by 16, which places
 *  the bf16 bit pattern in the upper half of an f32. The lower and upper halves
 *  of every loaded vector are processed separately and accumulated in f32.
 *
 *  @param a_enum,b_enum  Input vectors of `n` bf16 elements each.
 *  @param n              Number of elements.
 *  @param result         Receives the sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_sve(simsimd_bf16_t const *a_enum, simsimd_bf16_t const *b_enum, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat32_t d2_low_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    svfloat32_t d2_high_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    simsimd_u16_t const *a = (simsimd_u16_t const *)(a_enum);
    simsimd_u16_t const *b = (simsimd_u16_t const *)(b_enum);
    do {
        // 64-bit bounds avoid wrapping `i`/`n` for vectors of 2^32+ elements.
        svbool_t pg_vec = svwhilelt_b16_u64(i, n);
        svuint16_t a_vec = svld1_u16(pg_vec, a + i);
        svuint16_t b_vec = svld1_u16(pg_vec, b + i);
        // Separate 32-bit predicates for the two unpacked halves of the vector.
        svbool_t pg_low_vec = svwhilelt_b32_u64(i, n);
        svbool_t pg_high_vec = svwhilelt_b32_u64(i + svcnth() / 2, n);
        svfloat32_t a_low_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_low_vec, svunpklo_u32(a_vec), 16));
        svfloat32_t a_high_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_high_vec, svunpkhi_u32(a_vec), 16));
        svfloat32_t b_low_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_low_vec, svunpklo_u32(b_vec), 16));
        svfloat32_t b_high_vec = svreinterpret_f32_u32(svlsl_n_u32_x(pg_high_vec, svunpkhi_u32(b_vec), 16));
        svfloat32_t a_minus_b_low_vec = svsub_f32_x(pg_low_vec, a_low_vec, b_low_vec);
        svfloat32_t a_minus_b_high_vec = svsub_f32_x(pg_high_vec, a_high_vec, b_high_vec);
        // Each accumulator takes its own half under its own 32-bit predicate.
        // Merging (`_m`) keeps inactive lanes of the running sums intact so the
        // full-width reductions below stay well-defined.
        d2_low_vec = svmla_f32_m(pg_low_vec, d2_low_vec, a_minus_b_low_vec, a_minus_b_low_vec);
        d2_high_vec = svmla_f32_m(pg_high_vec, d2_high_vec, a_minus_b_high_vec, a_minus_b_high_vec);
        i += svcnth();
    } while (i < n);
    simsimd_f32_t d2 = svaddv_f32(svptrue_b32(), d2_low_vec) + svaddv_f32(svptrue_b32(), d2_high_vec);
    *result = d2;
}
/**
 *  @brief Cosine distance between two bf16 vectors on SVE.
 *
 *  Uses the BFDOT intrinsic to accumulate the dot product and both squared
 *  norms into f32 lanes, then passes the three horizontal sums to
 *  `_simsimd_cos_normalize_f32_neon`.
 *
 *  @param a_enum,b_enum  Input vectors of `n` bf16 elements each.
 *  @param n              Number of elements.
 *  @param result         Receives the value returned by the normalization helper.
 */
SIMSIMD_PUBLIC void simsimd_cos_bf16_sve(simsimd_bf16_t const *a_enum, simsimd_bf16_t const *b_enum, simsimd_size_t n,
                                         simsimd_distance_t *result) {
    simsimd_size_t i = 0;
    svfloat32_t ab_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    svfloat32_t a2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    svfloat32_t b2_vec = svdupq_n_f32(0.f, 0.f, 0.f, 0.f);
    simsimd_bf16_for_arm_simd_t const *a = (simsimd_bf16_for_arm_simd_t const *)(a_enum);
    simsimd_bf16_for_arm_simd_t const *b = (simsimd_bf16_for_arm_simd_t const *)(b_enum);
    do {
        // 64-bit bounds avoid wrapping `i`/`n` for vectors of 2^32+ elements.
        svbool_t pg_vec = svwhilelt_b16_u64(i, n);
        // The dot products below are unpredicated; they rely on the predicated
        // loads having zeroed the inactive lanes.
        svbfloat16_t a_vec = svld1_bf16(pg_vec, a + i);
        svbfloat16_t b_vec = svld1_bf16(pg_vec, b + i);
        ab_vec = svbfdot_f32(ab_vec, a_vec, b_vec);
        a2_vec = svbfdot_f32(a2_vec, a_vec, a_vec);
        b2_vec = svbfdot_f32(b2_vec, b_vec, b_vec);
        i += svcnth();
    } while (i < n);
    simsimd_f32_t ab = svaddv_f32(svptrue_b32(), ab_vec);
    simsimd_f32_t a2 = svaddv_f32(svptrue_b32(), a2_vec);
    simsimd_f32_t b2 = svaddv_f32(svptrue_b32(), b2_vec);
    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif #endif
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2")
#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
/**
 *  @brief Square root of a single-precision scalar using the SSE square-root
 *  instruction, returning lane 0 of the result.
 */
SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_haswell(simsimd_f32_t x) {
    // `x` sits in lane 0; the other lanes are zero and their roots are ignored.
    __m128 const x_vec = _mm_set_ss(x);
    __m128 const root_vec = _mm_sqrt_ps(x_vec);
    return _mm_cvtss_f32(root_vec);
}
/**
 *  @brief Square root of a double-precision scalar using the SSE2 square-root
 *  instruction, returning lane 0 of the result.
 */
SIMSIMD_INTERNAL simsimd_f64_t _simsimd_sqrt_f64_haswell(simsimd_f64_t x) {
    // `x` sits in lane 0; the upper lane is zero and its root is ignored.
    __m128d const x_vec = _mm_set_sd(x);
    __m128d const root_vec = _mm_sqrt_pd(x_vec);
    return _mm_cvtsd_f64(root_vec);
}
/**
 *  @brief Turns a dot product and two squared norms into a cosine distance.
 *
 *  Computes `1 - ab / (sqrt(a2) * sqrt(b2))` using a single-precision
 *  reciprocal-square-root estimate refined by one Newton-Raphson step in
 *  double precision.
 *
 *  @param ab  Dot product of the two vectors.
 *  @param a2  Squared norm of the first vector.
 *  @param b2  Squared norm of the second vector.
 *  @return 0 when both norms are zero, 1 when the dot product is zero,
 *          otherwise the distance with non-positive values replaced by 0.
 */
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_haswell(simsimd_f64_t ab, simsimd_f64_t a2,
                                                                       simsimd_f64_t b2) {
    if (a2 == 0 && b2 == 0) return 0;
    if (ab == 0) return 1;

    // Upper lane holds `a2`, lower lane holds `b2`.
    __m128d const norms = _mm_set_pd(a2, b2);
    // Initial estimate goes through single precision, then is widened again.
    __m128d const estimate = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(norms)));

    // One refinement step: 1.5 * r + (-0.5 * x * r) * (r * r).
    __m128d const linear_term = _mm_mul_pd(_mm_set1_pd(1.5), estimate);
    __m128d const scaled_norms = _mm_mul_pd(norms, _mm_set1_pd(-0.5));
    __m128d const cubic_term = _mm_mul_pd(_mm_mul_pd(scaled_norms, estimate), _mm_mul_pd(estimate, estimate));
    __m128d const refined = _mm_add_pd(linear_term, cubic_term);

    simsimd_f64_t const a_inverse_norm = _mm_cvtsd_f64(_mm_unpackhi_pd(refined, refined));
    simsimd_f64_t const b_inverse_norm = _mm_cvtsd_f64(refined);
    simsimd_distance_t const distance = 1 - ab * a_inverse_norm * b_inverse_norm;
    if (distance > 0) return distance;
    return 0;
}
/**
 *  @brief Single-precision variant of the cosine-distance normalization.
 *
 *  Computes `1 - ab / (sqrt(a2) * sqrt(b2))` in f32 using the hardware
 *  reciprocal-square-root estimate refined by one Newton-Raphson step.
 *
 *  @param ab  Dot product of the two vectors.
 *  @param a2  Squared norm of the first vector.
 *  @param b2  Squared norm of the second vector.
 *  @return 0 when both norms are zero, 1 when the dot product is zero,
 *          otherwise the distance with non-positive values replaced by 0.
 */
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f32_t ab, simsimd_f32_t a2,
                                                                       simsimd_f32_t b2) {
    if (a2 == 0.0f && b2 == 0.0f) return 0.0f;
    if (ab == 0.0f) return 1.0f;

    // Lane 0 holds `b2`, lane 1 holds `a2`; lanes 2 and 3 repeat them.
    __m128 const norms = _mm_set_ps(a2, b2, a2, b2);
    __m128 const estimate = _mm_rsqrt_ps(norms);

    // One refinement step: r * (1.5 - 0.5 * (x * (r * r))).
    __m128 const estimate_squared = _mm_mul_ps(estimate, estimate);
    __m128 const half_term = _mm_mul_ps(_mm_set1_ps(0.5f), _mm_mul_ps(norms, estimate_squared));
    __m128 const refined = _mm_mul_ps(estimate, _mm_sub_ps(_mm_set1_ps(1.5f), half_term));

    simsimd_f32_t const a_inverse_norm = _mm_cvtss_f32(_mm_shuffle_ps(refined, refined, _MM_SHUFFLE(0, 0, 0, 1)));
    simsimd_f32_t const b_inverse_norm = _mm_cvtss_f32(refined);
    simsimd_distance_t const distance = 1.0f - ab * a_inverse_norm * b_inverse_norm;
    if (distance > 0) return distance;
    return 0;
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif #endif
#if _SIMSIMD_TARGET_X86
#if SIMSIMD_TARGET_HASWELL
#pragma GCC push_options
#pragma GCC target("avx2", "f16c", "fma")
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
/**
 *  @brief Euclidean (L2) distance between two f16 vectors on Haswell.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f16_haswell` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f16_haswell(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two f16 vectors on Haswell.
 *
 *  Processes eight halves per step, converting them to f32 and accumulating
 *  squared differences with FMA. The body runs at least once, so `n == 0`
 *  still goes through the partial-load path.
 *
 *  @param a,b     Input vectors of `n` half-precision elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the reduced sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m256 sum_vec = _mm256_setzero_ps();
    do {
        __m256 a_vec, b_vec;
        if (n >= 8) {
            a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a));
            b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b));
            a += 8, b += 8, n -= 8;
        }
        else {
            // Tail of fewer than eight elements; the helper is defined elsewhere
            // and presumably zero-fills the missing lanes.
            a_vec = _simsimd_partial_load_f16x8_haswell(a, n);
            b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
            n = 0;
        }
        __m256 diff_vec = _mm256_sub_ps(a_vec, b_vec);
        sum_vec = _mm256_fmadd_ps(diff_vec, diff_vec, sum_vec);
    } while (n);
    *result = _simsimd_reduce_f32x8_haswell(sum_vec);
}
/**
 *  @brief Cosine distance between two f16 vectors on Haswell.
 *
 *  Converts eight halves per step to f32, accumulates the dot product and both
 *  squared norms with FMA, and normalizes with
 *  `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param a,b     Input vectors of `n` half-precision elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m256 dot_vec = _mm256_setzero_ps();
    __m256 a_norm_vec = _mm256_setzero_ps();
    __m256 b_norm_vec = _mm256_setzero_ps();
    do {
        __m256 a_vec, b_vec;
        if (n >= 8) {
            a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)a));
            b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const *)b));
            a += 8, b += 8, n -= 8;
        }
        else {
            // Tail of fewer than eight elements, loaded via a helper defined elsewhere.
            a_vec = _simsimd_partial_load_f16x8_haswell(a, n);
            b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
            n = 0;
        }
        dot_vec = _mm256_fmadd_ps(a_vec, b_vec, dot_vec);
        a_norm_vec = _mm256_fmadd_ps(a_vec, a_vec, a_norm_vec);
        b_norm_vec = _mm256_fmadd_ps(b_vec, b_vec, b_norm_vec);
    } while (n);
    simsimd_f32_t ab = _simsimd_reduce_f32x8_haswell(dot_vec);
    simsimd_f32_t a2 = _simsimd_reduce_f32x8_haswell(a_norm_vec);
    simsimd_f32_t b2 = _simsimd_reduce_f32x8_haswell(b_norm_vec);
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
/**
 *  @brief Euclidean (L2) distance between two bf16 vectors on Haswell.
 *
 *  Obtains the squared distance from `simsimd_l2sq_bf16_haswell` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_bf16_haswell(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two bf16 vectors on Haswell.
 *
 *  Loads eight bf16 values per step, widens them to f32 through
 *  `_simsimd_bf16x8_to_f32x8_haswell`, and accumulates squared differences
 *  with FMA. The body runs at least once, even for `n == 0`.
 *
 *  @param a,b     Input vectors of `n` bf16 elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the reduced sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                              simsimd_distance_t *result) {
    __m256 sum_vec = _mm256_setzero_ps();
    do {
        __m256 a_vec, b_vec;
        if (n >= 8) {
            a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)a));
            b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)b));
            a += 8, b += 8, n -= 8;
        }
        else {
            // Tail of fewer than eight elements, loaded via a helper defined elsewhere.
            a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n));
            b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
            n = 0;
        }
        __m256 diff_vec = _mm256_sub_ps(a_vec, b_vec);
        sum_vec = _mm256_fmadd_ps(diff_vec, diff_vec, sum_vec);
    } while (n);
    *result = _simsimd_reduce_f32x8_haswell(sum_vec);
}
/**
 *  @brief Cosine distance between two bf16 vectors on Haswell.
 *
 *  Widens eight bf16 values per step to f32, accumulates the dot product and
 *  both squared norms with FMA, and normalizes with
 *  `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param a,b     Input vectors of `n` bf16 elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m256 dot_vec = _mm256_setzero_ps();
    __m256 a_norm_vec = _mm256_setzero_ps();
    __m256 b_norm_vec = _mm256_setzero_ps();
    do {
        __m256 a_vec, b_vec;
        if (n >= 8) {
            a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)a));
            b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)b));
            a += 8, b += 8, n -= 8;
        }
        else {
            // Tail of fewer than eight elements, loaded via a helper defined elsewhere.
            a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n));
            b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
            n = 0;
        }
        dot_vec = _mm256_fmadd_ps(a_vec, b_vec, dot_vec);
        a_norm_vec = _mm256_fmadd_ps(a_vec, a_vec, a_norm_vec);
        b_norm_vec = _mm256_fmadd_ps(b_vec, b_vec, b_norm_vec);
    } while (n);
    simsimd_f32_t ab = _simsimd_reduce_f32x8_haswell(dot_vec);
    simsimd_f32_t a2 = _simsimd_reduce_f32x8_haswell(a_norm_vec);
    simsimd_f32_t b2 = _simsimd_reduce_f32x8_haswell(b_norm_vec);
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
/**
 *  @brief Euclidean (L2) distance between two i8 vectors on Haswell.
 *
 *  Obtains the squared distance from `simsimd_l2sq_i8_haswell` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_i8_haswell(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two i8 vectors on Haswell.
 *
 *  Full 32-byte chunks are sign-extended to i16, subtracted, and squared and
 *  pair-summed into i32 lanes; remaining elements are handled by a scalar loop.
 *
 *  @param a,b     Input vectors of `n` signed bytes each.
 *  @param n       Number of elements.
 *  @param result  Receives the sum of squared differences, accumulated as `int`.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m256i d2_i32_low_vec = _mm256_setzero_si256();
    __m256i d2_i32_high_vec = _mm256_setzero_si256();
    simsimd_size_t i = 0;
    for (; i + 32 <= n; i += 32) {
        __m256i a_i8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
        __m256i b_i8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
        // Sign-extend each 16-byte half to sixteen i16 lanes.
        __m256i a_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a_i8_vec));
        __m256i a_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 1));
        __m256i b_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b_i8_vec));
        __m256i b_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 1));
        __m256i d_i16_low_vec = _mm256_sub_epi16(a_i16_low_vec, b_i16_low_vec);
        __m256i d_i16_high_vec = _mm256_sub_epi16(a_i16_high_vec, b_i16_high_vec);
        // `madd` squares the i16 differences and sums adjacent pairs into i32.
        d2_i32_low_vec = _mm256_add_epi32(d2_i32_low_vec, _mm256_madd_epi16(d_i16_low_vec, d_i16_low_vec));
        d2_i32_high_vec = _mm256_add_epi32(d2_i32_high_vec, _mm256_madd_epi16(d_i16_high_vec, d_i16_high_vec));
    }
    int d2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(d2_i32_low_vec, d2_i32_high_vec));
    // Scalar tail. The difference gets its own name so it no longer shadows the
    // element count `n` that the loop condition reads.
    for (; i < n; ++i) {
        int d = (int)(a[i]) - b[i];
        d2 += d * d;
    }
    *result = (simsimd_f64_t)d2;
}
/**
 *  @brief Cosine distance between two i8 vectors on Haswell.
 *
 *  Full 32-byte chunks are sign-extended to i16 and multiplied/pair-summed
 *  into i32 lanes for the dot product and both squared norms; the remainder is
 *  handled by a scalar loop. The integer sums are then normalized by
 *  `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param a,b     Input vectors of `n` signed bytes each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    __m256i dot_low_vec = _mm256_setzero_si256(), dot_high_vec = _mm256_setzero_si256();
    __m256i a_norm_low_vec = _mm256_setzero_si256(), a_norm_high_vec = _mm256_setzero_si256();
    __m256i b_norm_low_vec = _mm256_setzero_si256(), b_norm_high_vec = _mm256_setzero_si256();
    simsimd_size_t i = 0;
    while (i + 32 <= n) {
        __m256i a_bytes = _mm256_lddqu_si256((__m256i const *)(a + i));
        __m256i b_bytes = _mm256_lddqu_si256((__m256i const *)(b + i));
        // Sign-extend each 16-byte half to sixteen i16 lanes.
        __m256i a_low = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_bytes, 0));
        __m256i a_high = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_bytes, 1));
        __m256i b_low = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_bytes, 0));
        __m256i b_high = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_bytes, 1));
        dot_low_vec = _mm256_add_epi32(dot_low_vec, _mm256_madd_epi16(a_low, b_low));
        dot_high_vec = _mm256_add_epi32(dot_high_vec, _mm256_madd_epi16(a_high, b_high));
        a_norm_low_vec = _mm256_add_epi32(a_norm_low_vec, _mm256_madd_epi16(a_low, a_low));
        a_norm_high_vec = _mm256_add_epi32(a_norm_high_vec, _mm256_madd_epi16(a_high, a_high));
        b_norm_low_vec = _mm256_add_epi32(b_norm_low_vec, _mm256_madd_epi16(b_low, b_low));
        b_norm_high_vec = _mm256_add_epi32(b_norm_high_vec, _mm256_madd_epi16(b_high, b_high));
        i += 32;
    }
    int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(dot_low_vec, dot_high_vec));
    int a2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(a_norm_low_vec, a_norm_high_vec));
    int b2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(b_norm_low_vec, b_norm_high_vec));
    // Scalar tail for the last `n % 32` elements.
    while (i < n) {
        int ai = a[i], bi = b[i];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++i;
    }
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
/**
 *  @brief Euclidean (L2) distance between two u8 vectors on Haswell.
 *
 *  Obtains the squared distance from `simsimd_l2sq_u8_haswell` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_u8_haswell(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two u8 vectors on Haswell.
 *
 *  Full 32-byte chunks compute the absolute byte difference with two
 *  saturating subtractions, zero-extend it to 16 bits, and square and pair-sum
 *  it into i32 lanes; remaining elements are handled by a scalar loop.
 *
 *  @param a,b     Input vectors of `n` unsigned bytes each.
 *  @param n       Number of elements.
 *  @param result  Receives the sum of squared differences, accumulated as `int`.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m256i d2_i32_low_vec = _mm256_setzero_si256();
    __m256i d2_i32_high_vec = _mm256_setzero_si256();
    __m256i const zeros_vec = _mm256_setzero_si256();
    simsimd_size_t i = 0;
    for (; i + 32 <= n; i += 32) {
        __m256i a_u8_vec = _mm256_lddqu_si256((__m256i const *)(a + i));
        __m256i b_u8_vec = _mm256_lddqu_si256((__m256i const *)(b + i));
        // |a - b|: one of the two saturating differences is zero, the other is
        // the magnitude, so OR-ing them yields the absolute difference.
        __m256i d_u8_vec = _mm256_or_si256(_mm256_subs_epu8(a_u8_vec, b_u8_vec), _mm256_subs_epu8(b_u8_vec, a_u8_vec));
        // Interleaving with zeros widens bytes to 16 bits; the resulting lane
        // order is irrelevant because everything is summed.
        __m256i d_i16_low_vec = _mm256_unpacklo_epi8(d_u8_vec, zeros_vec);
        __m256i d_i16_high_vec = _mm256_unpackhi_epi8(d_u8_vec, zeros_vec);
        d2_i32_low_vec = _mm256_add_epi32(d2_i32_low_vec, _mm256_madd_epi16(d_i16_low_vec, d_i16_low_vec));
        d2_i32_high_vec = _mm256_add_epi32(d2_i32_high_vec, _mm256_madd_epi16(d_i16_high_vec, d_i16_high_vec));
    }
    int d2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(d2_i32_low_vec, d2_i32_high_vec));
    // Scalar tail. The difference gets its own name so it no longer shadows the
    // element count `n` that the loop condition reads.
    for (; i < n; ++i) {
        int d = (int)(a[i]) - b[i];
        d2 += d * d;
    }
    *result = (simsimd_f64_t)d2;
}
/**
 *  @brief Cosine distance between two u8 vectors on Haswell.
 *
 *  Full 32-byte chunks are zero-extended to 16 bits by interleaving with a
 *  zero vector, then multiplied/pair-summed into i32 lanes for the dot product
 *  and both squared norms; the remainder is handled by a scalar loop. The
 *  integer sums are normalized by `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param a,b     Input vectors of `n` unsigned bytes each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    __m256i const zero_vec = _mm256_setzero_si256();
    __m256i dot_low_vec = _mm256_setzero_si256(), dot_high_vec = _mm256_setzero_si256();
    __m256i a_norm_low_vec = _mm256_setzero_si256(), a_norm_high_vec = _mm256_setzero_si256();
    __m256i b_norm_low_vec = _mm256_setzero_si256(), b_norm_high_vec = _mm256_setzero_si256();
    simsimd_size_t i = 0;
    while (i + 32 <= n) {
        __m256i a_bytes = _mm256_lddqu_si256((__m256i const *)(a + i));
        __m256i b_bytes = _mm256_lddqu_si256((__m256i const *)(b + i));
        // Both inputs are interleaved the same way, so lanes stay paired.
        __m256i a_low = _mm256_unpacklo_epi8(a_bytes, zero_vec);
        __m256i a_high = _mm256_unpackhi_epi8(a_bytes, zero_vec);
        __m256i b_low = _mm256_unpacklo_epi8(b_bytes, zero_vec);
        __m256i b_high = _mm256_unpackhi_epi8(b_bytes, zero_vec);
        dot_low_vec = _mm256_add_epi32(dot_low_vec, _mm256_madd_epi16(a_low, b_low));
        dot_high_vec = _mm256_add_epi32(dot_high_vec, _mm256_madd_epi16(a_high, b_high));
        a_norm_low_vec = _mm256_add_epi32(a_norm_low_vec, _mm256_madd_epi16(a_low, a_low));
        a_norm_high_vec = _mm256_add_epi32(a_norm_high_vec, _mm256_madd_epi16(a_high, a_high));
        b_norm_low_vec = _mm256_add_epi32(b_norm_low_vec, _mm256_madd_epi16(b_low, b_low));
        b_norm_high_vec = _mm256_add_epi32(b_norm_high_vec, _mm256_madd_epi16(b_high, b_high));
        i += 32;
    }
    int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(dot_low_vec, dot_high_vec));
    int a2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(a_norm_low_vec, a_norm_high_vec));
    int b2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(b_norm_low_vec, b_norm_high_vec));
    // Scalar tail for the last `n % 32` elements.
    while (i < n) {
        int ai = a[i], bi = b[i];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++i;
    }
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
/**
 *  @brief Euclidean (L2) distance between two f32 vectors on Haswell.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f32_haswell` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f32_haswell(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two f32 vectors on Haswell.
 *
 *  Eight floats per step are subtracted and accumulated with FMA; the reduced
 *  vector sum is widened to f64 and the remaining elements are added by a
 *  scalar loop.
 *
 *  @param a,b     Input vectors of `n` floats each.
 *  @param n       Number of elements.
 *  @param result  Receives the sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m256 sum_vec = _mm256_setzero_ps();
    simsimd_size_t i = 0;
    while (i + 8 <= n) {
        __m256 diff_vec = _mm256_sub_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i));
        sum_vec = _mm256_fmadd_ps(diff_vec, diff_vec, sum_vec);
        i += 8;
    }
    simsimd_f64_t total = _simsimd_reduce_f32x8_haswell(sum_vec);
    // Scalar tail: the square is formed in single precision, then added to the f64 total.
    while (i < n) {
        float diff = a[i] - b[i];
        total += diff * diff;
        ++i;
    }
    *result = total;
}
/**
 *  @brief Cosine distance between two f32 vectors on Haswell.
 *
 *  Eight floats per step feed three FMA accumulators (dot product and both
 *  squared norms). The reduced sums are widened to f64, completed by a scalar
 *  tail loop, and normalized with `_simsimd_cos_normalize_f64_haswell`.
 *
 *  @param a,b     Input vectors of `n` floats each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m256 dot_vec = _mm256_setzero_ps();
    __m256 a_norm_vec = _mm256_setzero_ps();
    __m256 b_norm_vec = _mm256_setzero_ps();
    simsimd_size_t i = 0;
    while (i + 8 <= n) {
        __m256 a_vec = _mm256_loadu_ps(a + i);
        __m256 b_vec = _mm256_loadu_ps(b + i);
        dot_vec = _mm256_fmadd_ps(a_vec, b_vec, dot_vec);
        a_norm_vec = _mm256_fmadd_ps(a_vec, a_vec, a_norm_vec);
        b_norm_vec = _mm256_fmadd_ps(b_vec, b_vec, b_norm_vec);
        i += 8;
    }
    simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(dot_vec);
    simsimd_f64_t a2 = _simsimd_reduce_f32x8_haswell(a_norm_vec);
    simsimd_f64_t b2 = _simsimd_reduce_f32x8_haswell(b_norm_vec);
    // Scalar tail: products are formed in single precision, then added to the f64 sums.
    while (i < n) {
        float ai = a[i], bi = b[i];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++i;
    }
    *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
}
/**
 *  @brief Euclidean (L2) distance between two f64 vectors on Haswell.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f64_haswell` and stores its
 *  double-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f64_haswell(a, b, n, &squared);
    *result = _simsimd_sqrt_f64_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two f64 vectors on Haswell.
 *
 *  Four doubles per step are subtracted and accumulated with FMA; the
 *  remaining elements are added by a scalar loop after the vector reduction.
 *
 *  @param a,b     Input vectors of `n` doubles each.
 *  @param n       Number of elements.
 *  @param result  Receives the sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m256d sum_vec = _mm256_setzero_pd();
    simsimd_size_t i = 0;
    while (i + 4 <= n) {
        __m256d diff_vec = _mm256_sub_pd(_mm256_loadu_pd(a + i), _mm256_loadu_pd(b + i));
        sum_vec = _mm256_fmadd_pd(diff_vec, diff_vec, sum_vec);
        i += 4;
    }
    simsimd_f64_t total = _simsimd_reduce_f64x4_haswell(sum_vec);
    // Scalar tail for the last `n % 4` elements.
    while (i < n) {
        simsimd_f64_t diff = a[i] - b[i];
        total += diff * diff;
        ++i;
    }
    *result = total;
}
/**
 *  @brief Cosine distance between two f64 vectors on Haswell.
 *
 *  Four doubles per step feed three FMA accumulators (dot product and both
 *  squared norms). The reduced sums are completed by a scalar tail loop and
 *  normalized with `_simsimd_cos_normalize_f64_haswell`.
 *
 *  @param a,b     Input vectors of `n` doubles each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m256d dot_vec = _mm256_setzero_pd();
    __m256d a_norm_vec = _mm256_setzero_pd();
    __m256d b_norm_vec = _mm256_setzero_pd();
    simsimd_size_t i = 0;
    while (i + 4 <= n) {
        __m256d a_vec = _mm256_loadu_pd(a + i);
        __m256d b_vec = _mm256_loadu_pd(b + i);
        dot_vec = _mm256_fmadd_pd(a_vec, b_vec, dot_vec);
        a_norm_vec = _mm256_fmadd_pd(a_vec, a_vec, a_norm_vec);
        b_norm_vec = _mm256_fmadd_pd(b_vec, b_vec, b_norm_vec);
        i += 4;
    }
    simsimd_f64_t ab = _simsimd_reduce_f64x4_haswell(dot_vec);
    simsimd_f64_t a2 = _simsimd_reduce_f64x4_haswell(a_norm_vec);
    simsimd_f64_t b2 = _simsimd_reduce_f64x4_haswell(b_norm_vec);
    // Scalar tail for the last `n % 4` elements.
    while (i < n) {
        simsimd_f64_t ai = a[i], bi = b[i];
        ab += ai * bi;
        a2 += ai * ai;
        b2 += bi * bi;
        ++i;
    }
    *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_SKYLAKE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512bw", "avx512vl", "bmi2")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512bw,avx512vl,bmi2"))), apply_to = function)
/**
 *  @brief Euclidean (L2) distance between two f32 vectors on Skylake.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f32_skylake` and stores its
 *  double-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f32_skylake(a, b, n, &squared);
    *result = _simsimd_sqrt_f64_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two f32 vectors on Skylake (AVX-512).
 *
 *  Sixteen floats per step; the final partial step uses a zero-masked load so
 *  no scalar tail is needed. The body runs at least once, even for `n == 0`.
 *
 *  @param a,b     Input vectors of `n` floats each.
 *  @param n       Number of elements.
 *  @param result  Receives the reduced sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m512 sum_vec = _mm512_setzero();
    do {
        __m512 a_vec, b_vec;
        if (n >= 16) {
            a_vec = _mm512_loadu_ps(a);
            b_vec = _mm512_loadu_ps(b);
            a += 16, b += 16, n -= 16;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask16 tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
            a_vec = _mm512_maskz_loadu_ps(tail_mask, a);
            b_vec = _mm512_maskz_loadu_ps(tail_mask, b);
            n = 0;
        }
        __m512 diff_vec = _mm512_sub_ps(a_vec, b_vec);
        sum_vec = _mm512_fmadd_ps(diff_vec, diff_vec, sum_vec);
    } while (n);
    *result = _simsimd_reduce_f32x16_skylake(sum_vec);
}
/**
 *  @brief Turns a dot product and two squared norms into a cosine distance,
 *  using the AVX-512 `rsqrt14` estimate on doubles.
 *
 *  Computes `1 - ab / (sqrt(a2) * sqrt(b2))` with the reciprocal square roots
 *  estimated in hardware and refined by one Newton-Raphson step.
 *
 *  @param ab  Dot product of the two vectors.
 *  @param a2  Squared norm of the first vector.
 *  @param b2  Squared norm of the second vector.
 *  @return 0 when both norms are zero, 1 when the dot product is zero,
 *          otherwise the distance with non-positive values replaced by 0.
 */
SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_skylake(simsimd_f64_t ab, simsimd_f64_t a2,
                                                                       simsimd_f64_t b2) {
    if (a2 == 0 && b2 == 0) return 0;
    if (ab == 0) return 1;

    // Upper lane holds `a2`, lower lane holds `b2`.
    __m128d const norms = _mm_set_pd(a2, b2);
    __m128d const estimate = _mm_maskz_rsqrt14_pd(0xFF, norms);

    // One refinement step: 1.5 * r + (-0.5 * x * r) * (r * r).
    __m128d const linear_term = _mm_mul_pd(_mm_set1_pd(1.5), estimate);
    __m128d const scaled_norms = _mm_mul_pd(norms, _mm_set1_pd(-0.5));
    __m128d const cubic_term = _mm_mul_pd(_mm_mul_pd(scaled_norms, estimate), _mm_mul_pd(estimate, estimate));
    __m128d const refined = _mm_add_pd(linear_term, cubic_term);

    simsimd_f64_t const a_inverse_norm = _mm_cvtsd_f64(_mm_unpackhi_pd(refined, refined));
    simsimd_f64_t const b_inverse_norm = _mm_cvtsd_f64(refined);
    simsimd_distance_t const distance = 1 - ab * a_inverse_norm * b_inverse_norm;
    if (distance > 0) return distance;
    return 0;
}
/**
 *  @brief Cosine distance between two f32 vectors on Skylake (AVX-512).
 *
 *  Sixteen floats per step feed three FMA accumulators; the final partial step
 *  uses a zero-masked load. The reduced sums are normalized with
 *  `_simsimd_cos_normalize_f64_skylake`.
 *
 *  @param a,b     Input vectors of `n` floats each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m512 dot_vec = _mm512_setzero();
    __m512 a_norm_vec = _mm512_setzero();
    __m512 b_norm_vec = _mm512_setzero();
    do {
        __m512 a_vec, b_vec;
        if (n >= 16) {
            a_vec = _mm512_loadu_ps(a);
            b_vec = _mm512_loadu_ps(b);
            a += 16, b += 16, n -= 16;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask16 tail_mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
            a_vec = _mm512_maskz_loadu_ps(tail_mask, a);
            b_vec = _mm512_maskz_loadu_ps(tail_mask, b);
            n = 0;
        }
        dot_vec = _mm512_fmadd_ps(a_vec, b_vec, dot_vec);
        a_norm_vec = _mm512_fmadd_ps(a_vec, a_vec, a_norm_vec);
        b_norm_vec = _mm512_fmadd_ps(b_vec, b_vec, b_norm_vec);
    } while (n);
    simsimd_f64_t ab = _simsimd_reduce_f32x16_skylake(dot_vec);
    simsimd_f64_t a2 = _simsimd_reduce_f32x16_skylake(a_norm_vec);
    simsimd_f64_t b2 = _simsimd_reduce_f32x16_skylake(b_norm_vec);
    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
}
/**
 *  @brief Euclidean (L2) distance between two f64 vectors on Skylake.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f64_skylake` and stores its
 *  double-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f64_skylake(a, b, n, &squared);
    *result = _simsimd_sqrt_f64_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two f64 vectors on Skylake (AVX-512).
 *
 *  Eight doubles per step; the final partial step uses a zero-masked load.
 *  The body runs at least once, even for `n == 0`.
 *
 *  @param a,b     Input vectors of `n` doubles each.
 *  @param n       Number of elements.
 *  @param result  Receives the reduced sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m512d sum_vec = _mm512_setzero_pd();
    do {
        __m512d a_vec, b_vec;
        if (n >= 8) {
            a_vec = _mm512_loadu_pd(a);
            b_vec = _mm512_loadu_pd(b);
            a += 8, b += 8, n -= 8;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask8 tail_mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
            a_vec = _mm512_maskz_loadu_pd(tail_mask, a);
            b_vec = _mm512_maskz_loadu_pd(tail_mask, b);
            n = 0;
        }
        __m512d diff_vec = _mm512_sub_pd(a_vec, b_vec);
        sum_vec = _mm512_fmadd_pd(diff_vec, diff_vec, sum_vec);
    } while (n);
    *result = _mm512_reduce_add_pd(sum_vec);
}
/**
 *  @brief Cosine distance between two f64 vectors on Skylake (AVX-512).
 *
 *  Eight doubles per step feed three FMA accumulators; the final partial step
 *  uses a zero-masked load. The reduced sums are normalized with
 *  `_simsimd_cos_normalize_f64_skylake`.
 *
 *  @param a,b     Input vectors of `n` doubles each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m512d dot_vec = _mm512_setzero_pd();
    __m512d a_norm_vec = _mm512_setzero_pd();
    __m512d b_norm_vec = _mm512_setzero_pd();
    do {
        __m512d a_vec, b_vec;
        if (n >= 8) {
            a_vec = _mm512_loadu_pd(a);
            b_vec = _mm512_loadu_pd(b);
            a += 8, b += 8, n -= 8;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask8 tail_mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
            a_vec = _mm512_maskz_loadu_pd(tail_mask, a);
            b_vec = _mm512_maskz_loadu_pd(tail_mask, b);
            n = 0;
        }
        dot_vec = _mm512_fmadd_pd(a_vec, b_vec, dot_vec);
        a_norm_vec = _mm512_fmadd_pd(a_vec, a_vec, a_norm_vec);
        b_norm_vec = _mm512_fmadd_pd(b_vec, b_vec, b_norm_vec);
    } while (n);
    simsimd_f64_t ab = _mm512_reduce_add_pd(dot_vec);
    simsimd_f64_t a2 = _mm512_reduce_add_pd(a_norm_vec);
    simsimd_f64_t b2 = _mm512_reduce_add_pd(b_norm_vec);
    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_GENOA
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \
apply_to = function)
/**
 *  @brief Element-wise `a - b` for 32 bf16 values packed in 512-bit registers.
 *
 *  Each 32-bit lane holds two bf16 values. The odd one (upper 16 bits) becomes
 *  an f32 by masking off the lower half; the even one (lower 16 bits) becomes
 *  an f32 by shifting left by 16. Both sets are subtracted in f32 and the upper
 *  16 bits of each difference are packed back, so results are truncated rather
 *  than rounded.
 *
 *  @param a_i16  32 bf16 bit patterns of the minuend.
 *  @param b_i16  32 bf16 bit patterns of the subtrahend.
 *  @return 32 bf16 bit patterns of the differences, in the same lane order.
 */
SIMSIMD_INTERNAL __m512i _simsimd_substract_bf16x32_genoa(__m512i a_i16, __m512i b_i16) {
    // Bit-cast intrinsics replace the former union punning and its unused members.
    __m512i const odd_mask = _mm512_set1_epi32(0xFFFF0000);
    __m512 const a_f32_odd = _mm512_castsi512_ps(_mm512_and_si512(a_i16, odd_mask));
    __m512 const a_f32_even = _mm512_castsi512_ps(_mm512_slli_epi32(a_i16, 16));
    __m512 const b_f32_odd = _mm512_castsi512_ps(_mm512_and_si512(b_i16, odd_mask));
    __m512 const b_f32_even = _mm512_castsi512_ps(_mm512_slli_epi32(b_i16, 16));

    __m512i const d_odd = _mm512_castps_si512(_mm512_sub_ps(a_f32_odd, b_f32_odd));
    // Move the upper half of every even difference back into the lower 16 bits.
    __m512i const d_even = _mm512_srli_epi32(_mm512_castps_si512(_mm512_sub_ps(a_f32_even, b_f32_even)), 16);

    // Mask bits select `d_even` for even 16-bit lanes and `d_odd` for odd ones.
    return _mm512_mask_blend_epi16(0x55555555, d_odd, d_even);
}
/**
 *  @brief Euclidean (L2) distance between two bf16 vectors on Genoa.
 *
 *  Obtains the squared distance from `simsimd_l2sq_bf16_genoa` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_bf16_genoa(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two bf16 vectors on Genoa.
 *
 *  Thirty-two bf16 values per step: the difference is formed by
 *  `_simsimd_substract_bf16x32_genoa` and squared/accumulated into f32 lanes
 *  with the bf16 dot-product instruction. The final partial step uses a
 *  zero-masked load.
 *
 *  @param a,b     Input vectors of `n` bf16 elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the reduced sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    __m512 sum_vec = _mm512_setzero_ps();
    do {
        __m512i a_bits, b_bits;
        if (n >= 32) {
            a_bits = _mm512_loadu_epi16(a);
            b_bits = _mm512_loadu_epi16(b);
            a += 32, b += 32, n -= 32;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            a_bits = _mm512_maskz_loadu_epi16(tail_mask, a);
            b_bits = _mm512_maskz_loadu_epi16(tail_mask, b);
            n = 0;
        }
        __m512i diff_bits = _simsimd_substract_bf16x32_genoa(a_bits, b_bits);
        sum_vec = _mm512_dpbf16_ps(sum_vec, (__m512bh)(diff_bits), (__m512bh)(diff_bits));
    } while (n);
    *result = _simsimd_reduce_f32x16_skylake(sum_vec);
}
/**
 *  @brief Cosine distance between two bf16 vectors on Genoa.
 *
 *  Thirty-two bf16 values per step feed three bf16 dot-product accumulators
 *  (dot product and both squared norms) held in f32 lanes. The final partial
 *  step uses a zero-masked load. The reduced sums are normalized with
 *  `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param a,b     Input vectors of `n` bf16 elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                           simsimd_distance_t *result) {
    __m512 dot_vec = _mm512_setzero_ps();
    __m512 a_norm_vec = _mm512_setzero_ps();
    __m512 b_norm_vec = _mm512_setzero_ps();
    do {
        __m512i a_bits, b_bits;
        if (n >= 32) {
            a_bits = _mm512_loadu_epi16(a);
            b_bits = _mm512_loadu_epi16(b);
            a += 32, b += 32, n -= 32;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            a_bits = _mm512_maskz_loadu_epi16(tail_mask, a);
            b_bits = _mm512_maskz_loadu_epi16(tail_mask, b);
            n = 0;
        }
        dot_vec = _mm512_dpbf16_ps(dot_vec, (__m512bh)(a_bits), (__m512bh)(b_bits));
        a_norm_vec = _mm512_dpbf16_ps(a_norm_vec, (__m512bh)(a_bits), (__m512bh)(a_bits));
        b_norm_vec = _mm512_dpbf16_ps(b_norm_vec, (__m512bh)(b_bits), (__m512bh)(b_bits));
    } while (n);
    simsimd_f32_t ab = _simsimd_reduce_f32x16_skylake(dot_vec);
    simsimd_f32_t a2 = _simsimd_reduce_f32x16_skylake(a_norm_vec);
    simsimd_f32_t b2 = _simsimd_reduce_f32x16_skylake(b_norm_vec);
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#if SIMSIMD_TARGET_SAPPHIRE
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512fp16")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512fp16"))), apply_to = function)
/**
 *  @brief Euclidean (L2) distance between two f16 vectors on Sapphire Rapids.
 *
 *  Obtains the squared distance from `simsimd_l2sq_f16_sapphire` and stores its
 *  single-precision square root in `result`.
 */
SIMSIMD_PUBLIC void simsimd_l2_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                            simsimd_distance_t *result) {
    simsimd_distance_t squared;
    simsimd_l2sq_f16_sapphire(a, b, n, &squared);
    *result = _simsimd_sqrt_f32_haswell(squared);
}
/**
 *  @brief Squared Euclidean distance between two f16 vectors on Sapphire Rapids.
 *
 *  Thirty-two halves per step are subtracted and accumulated with native
 *  half-precision FMA, so the running sum is kept in f16. The final partial
 *  step uses a zero-masked load.
 *
 *  @param a,b     Input vectors of `n` half-precision elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the reduced sum of squared differences.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                              simsimd_distance_t *result) {
    __m512h sum_vec = _mm512_setzero_ph();
    do {
        __m512i a_bits, b_bits;
        if (n >= 32) {
            a_bits = _mm512_loadu_epi16(a);
            b_bits = _mm512_loadu_epi16(b);
            a += 32, b += 32, n -= 32;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            a_bits = _mm512_maskz_loadu_epi16(tail_mask, a);
            b_bits = _mm512_maskz_loadu_epi16(tail_mask, b);
            n = 0;
        }
        __m512h diff_vec = _mm512_sub_ph(_mm512_castsi512_ph(a_bits), _mm512_castsi512_ph(b_bits));
        sum_vec = _mm512_fmadd_ph(diff_vec, diff_vec, sum_vec);
    } while (n);
    *result = _mm512_reduce_add_ph(sum_vec);
}
/**
 *  @brief Cosine distance between two f16 vectors on Sapphire Rapids.
 *
 *  Thirty-two halves per step feed three native half-precision FMA
 *  accumulators (dot product and both squared norms). The final partial step
 *  uses a zero-masked load. The reduced sums are normalized with
 *  `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param a,b     Input vectors of `n` half-precision elements each.
 *  @param n       Number of elements.
 *  @param result  Receives the cosine distance.
 */
SIMSIMD_PUBLIC void simsimd_cos_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    __m512h dot_vec = _mm512_setzero_ph();
    __m512h a_norm_vec = _mm512_setzero_ph();
    __m512h b_norm_vec = _mm512_setzero_ph();
    do {
        __m512i a_bits, b_bits;
        if (n >= 32) {
            a_bits = _mm512_loadu_epi16(a);
            b_bits = _mm512_loadu_epi16(b);
            a += 32, b += 32, n -= 32;
        }
        else {
            // Low `n` bits set: only the remaining elements are read, the rest are zero.
            __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            a_bits = _mm512_maskz_loadu_epi16(tail_mask, a);
            b_bits = _mm512_maskz_loadu_epi16(tail_mask, b);
            n = 0;
        }
        dot_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_bits), _mm512_castsi512_ph(b_bits), dot_vec);
        a_norm_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_bits), _mm512_castsi512_ph(a_bits), a_norm_vec);
        b_norm_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(b_bits), _mm512_castsi512_ph(b_bits), b_norm_vec);
    } while (n);
    simsimd_f32_t ab = _mm512_reduce_add_ph(dot_vec);
    simsimd_f32_t a2 = _mm512_reduce_add_ph(a_norm_vec);
    simsimd_f32_t b2 = _mm512_reduce_add_ph(b_norm_vec);
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
// Undo the per-function target attributes (Clang) and target options (GCC) pushed for the kernels above.
#pragma clang attribute pop
#pragma GCC pop_options
#endif // presumably the `sapphire` target section; its opening `#if` is outside this excerpt - confirm
// Kernels below are only compiled when SIMSIMD_TARGET_ICE is non-zero.
#if SIMSIMD_TARGET_ICE
// Enable the listed ISA extensions for every function up to the matching pop:
// GCC through `push_options` + `target`, Clang through an attribute pushed onto each function.
#pragma GCC push_options
#pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vnni")
#pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vnni"))), \
apply_to = function)
/**
 *  @brief  Euclidean distance between two `int8` vectors.
 *
 *  Delegates to `simsimd_l2sq_i8_ice` for the squared distance and passes that
 *  value through `_simsimd_sqrt_f32_haswell`.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_i8_ice(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                      simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_i8_ice(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f32_haswell(squared_distance);
}
/**
 *  @brief  Sum of squared element-wise differences of two `int8` vectors.
 *
 *  Each step loads 32 bytes per input, sign-extends them to 16-bit lanes,
 *  subtracts, and lets `_mm512_dpwssd_epi32` square the differences and add
 *  adjacent pairs into 32-bit accumulators.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the 32-bit integer sum.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    __m512i sum_sq_i32_vec = _mm512_setzero_si512();
    __m512i a_wide_vec, b_wide_vec, diff_wide_vec;

    // The body runs at least once: for `n == 0` the mask is empty and only zeros are loaded.
    do {
        if (n >= 32) {
            a_wide_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)a));
            b_wide_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)b));
            a += 32, b += 32, n -= 32;
        }
        else {
            // Tail: only the lowest `n` bytes are read, the rest become zeros.
            __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            a_wide_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(tail_mask, a));
            b_wide_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(tail_mask, b));
            n = 0;
        }
        diff_wide_vec = _mm512_sub_epi16(a_wide_vec, b_wide_vec);
        sum_sq_i32_vec = _mm512_dpwssd_epi32(sum_sq_i32_vec, diff_wide_vec, diff_wide_vec);
    } while (n);

    *result = _mm512_reduce_add_epi32(sum_sq_i32_vec);
}
/**
 *  @brief  Cosine-distance kernel for two `int8` vectors.
 *
 *  Sign-extends 32 bytes per input to 16-bit lanes on every step and accumulates
 *  `a·b`, `a·a` and `b·b` in 32-bit lanes via `_mm512_madd_epi16`. The reduced
 *  sums are handed to `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the normalized result.
 */
SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    __m512i dot_i32_vec = _mm512_setzero_si512();
    __m512i a_norm_i32_vec = _mm512_setzero_si512();
    __m512i b_norm_i32_vec = _mm512_setzero_si512();
    __m512i a_wide_vec, b_wide_vec;

    // The body runs at least once: for `n == 0` the mask is empty and only zeros are loaded.
    do {
        if (n >= 32) {
            a_wide_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)a));
            b_wide_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const *)b));
            a += 32, b += 32, n -= 32;
        }
        else {
            // Tail: only the lowest `n` bytes are read, the rest become zeros.
            __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
            a_wide_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(tail_mask, a));
            b_wide_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(tail_mask, b));
            n = 0;
        }
        dot_i32_vec = _mm512_add_epi32(dot_i32_vec, _mm512_madd_epi16(a_wide_vec, b_wide_vec));
        a_norm_i32_vec = _mm512_add_epi32(a_norm_i32_vec, _mm512_madd_epi16(a_wide_vec, a_wide_vec));
        b_norm_i32_vec = _mm512_add_epi32(b_norm_i32_vec, _mm512_madd_epi16(b_wide_vec, b_wide_vec));
    } while (n);

    int ab = _mm512_reduce_add_epi32(dot_i32_vec);
    int a2 = _mm512_reduce_add_epi32(a_norm_i32_vec);
    int b2 = _mm512_reduce_add_epi32(b_norm_i32_vec);
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
/**
 *  @brief  Euclidean distance between two `uint8` vectors.
 *
 *  Delegates to `simsimd_l2sq_u8_ice` for the squared distance and passes that
 *  value through `_simsimd_sqrt_f32_haswell`.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                      simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_u8_ice(a, b, n, &squared_distance);
    *result = _simsimd_sqrt_f32_haswell(squared_distance);
}
/**
 *  @brief  Sum of squared element-wise differences of two `uint8` vectors.
 *
 *  Works on 64 bytes per step. The absolute difference is formed without leaving
 *  8 bits: of the two saturating subtractions `a - b` and `b - a` one is always
 *  zero, so OR-ing them yields `|a - b|`. The bytes are then zero-extended to
 *  16 bits by interleaving with zeros and squared/accumulated into two 32-bit
 *  accumulators with `_mm512_dpwssd_epi32`.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the 32-bit integer sum.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                        simsimd_distance_t *result) {
    __m512i const zero_vec = _mm512_setzero_si512();
    __m512i sum_lo_i32_vec = _mm512_setzero_si512();
    __m512i sum_hi_i32_vec = _mm512_setzero_si512();
    __m512i a_bytes_vec, b_bytes_vec;

    // The body runs at least once: for `n == 0` the mask is empty and only zeros are loaded.
    do {
        if (n >= 64) {
            a_bytes_vec = _mm512_loadu_si512(a);
            b_bytes_vec = _mm512_loadu_si512(b);
            a += 64, b += 64, n -= 64;
        }
        else {
            // Tail: only the lowest `n` bytes are read, the rest become zeros.
            __mmask64 tail_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
            a_bytes_vec = _mm512_maskz_loadu_epi8(tail_mask, a);
            b_bytes_vec = _mm512_maskz_loadu_epi8(tail_mask, b);
            n = 0;
        }
        __m512i a_minus_b_vec = _mm512_subs_epu8(a_bytes_vec, b_bytes_vec);
        __m512i b_minus_a_vec = _mm512_subs_epu8(b_bytes_vec, a_bytes_vec);
        __m512i abs_diff_vec = _mm512_or_si512(a_minus_b_vec, b_minus_a_vec);

        // Interleaving with zeros widens every byte to an unsigned 16-bit lane.
        __m512i diff_lo_i16_vec = _mm512_unpacklo_epi8(abs_diff_vec, zero_vec);
        __m512i diff_hi_i16_vec = _mm512_unpackhi_epi8(abs_diff_vec, zero_vec);
        sum_lo_i32_vec = _mm512_dpwssd_epi32(sum_lo_i32_vec, diff_lo_i16_vec, diff_lo_i16_vec);
        sum_hi_i32_vec = _mm512_dpwssd_epi32(sum_hi_i32_vec, diff_hi_i16_vec, diff_hi_i16_vec);
    } while (n);

    *result = _mm512_reduce_add_epi32(_mm512_add_epi32(sum_lo_i32_vec, sum_hi_i32_vec));
}
/**
 *  @brief  Cosine-distance kernel for two `uint8` vectors.
 *
 *  Works on 64 bytes per step. Bytes are zero-extended to 16 bits by interleaving
 *  with zeros, then `a·b`, `a·a` and `b·b` are accumulated with the saturating
 *  `_mm512_dpwssds_epi32` into separate low/high 32-bit accumulators. The reduced
 *  sums are handed to `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the normalized result.
 */
SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
                                       simsimd_distance_t *result) {
    __m512i const zero_vec = _mm512_setzero_si512();
    __m512i dot_lo_vec = _mm512_setzero_si512(), dot_hi_vec = _mm512_setzero_si512();
    __m512i a_norm_lo_vec = _mm512_setzero_si512(), a_norm_hi_vec = _mm512_setzero_si512();
    __m512i b_norm_lo_vec = _mm512_setzero_si512(), b_norm_hi_vec = _mm512_setzero_si512();
    __m512i a_bytes_vec, b_bytes_vec;

    // The body runs at least once: for `n == 0` the mask is empty and only zeros are loaded.
    do {
        if (n >= 64) {
            a_bytes_vec = _mm512_loadu_si512(a);
            b_bytes_vec = _mm512_loadu_si512(b);
            a += 64, b += 64, n -= 64;
        }
        else {
            // Tail: only the lowest `n` bytes are read, the rest become zeros.
            __mmask64 tail_mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
            a_bytes_vec = _mm512_maskz_loadu_epi8(tail_mask, a);
            b_bytes_vec = _mm512_maskz_loadu_epi8(tail_mask, b);
            n = 0;
        }

        // Interleaving with zeros widens every byte to an unsigned 16-bit lane.
        __m512i a_lo_vec = _mm512_unpacklo_epi8(a_bytes_vec, zero_vec);
        __m512i a_hi_vec = _mm512_unpackhi_epi8(a_bytes_vec, zero_vec);
        __m512i b_lo_vec = _mm512_unpacklo_epi8(b_bytes_vec, zero_vec);
        __m512i b_hi_vec = _mm512_unpackhi_epi8(b_bytes_vec, zero_vec);

        dot_lo_vec = _mm512_dpwssds_epi32(dot_lo_vec, a_lo_vec, b_lo_vec);
        dot_hi_vec = _mm512_dpwssds_epi32(dot_hi_vec, a_hi_vec, b_hi_vec);
        a_norm_lo_vec = _mm512_dpwssds_epi32(a_norm_lo_vec, a_lo_vec, a_lo_vec);
        a_norm_hi_vec = _mm512_dpwssds_epi32(a_norm_hi_vec, a_hi_vec, a_hi_vec);
        b_norm_lo_vec = _mm512_dpwssds_epi32(b_norm_lo_vec, b_lo_vec, b_lo_vec);
        b_norm_hi_vec = _mm512_dpwssds_epi32(b_norm_hi_vec, b_hi_vec, b_hi_vec);
    } while (n);

    int ab = _mm512_reduce_add_epi32(_mm512_add_epi32(dot_lo_vec, dot_hi_vec));
    int a2 = _mm512_reduce_add_epi32(_mm512_add_epi32(a_norm_lo_vec, a_norm_hi_vec));
    int b2 = _mm512_reduce_add_epi32(_mm512_add_epi32(b_norm_lo_vec, b_norm_hi_vec));
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
/**
 *  @brief  Euclidean distance between two vectors of packed 4-bit integers.
 *
 *  Delegates to `simsimd_l2sq_i4x2_ice` for the squared distance and passes that
 *  value through `_simsimd_sqrt_f32_haswell`.
 *
 *  @param[in]  a        First vector of packed nibble pairs.
 *  @param[in]  b        Second vector of packed nibble pairs.
 *  @param[in]  n_words  Number of `simsimd_i4x2_t` words in each vector.
 *  @param[out] result   Receives the distance.
 */
SIMSIMD_PUBLIC void simsimd_l2_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t const *b, simsimd_size_t n_words,
                                        simsimd_distance_t *result) {
    simsimd_distance_t squared_distance;
    simsimd_l2sq_i4x2_ice(a, b, n_words, &squared_distance);
    *result = _simsimd_sqrt_f32_haswell(squared_distance);
}
/**
 *  @brief  Sum of squared element-wise differences of two vectors of packed 4-bit signed integers.
 *
 *  Every input byte holds two nibbles. Per step 64 bytes (128 nibbles) of each
 *  input are processed:
 *    1. low and high nibbles are isolated into separate byte vectors;
 *    2. a 16-entry shuffle table sign-extends each nibble to an `int8` in [-8, 7];
 *    3. `|a - b|` is at most 15, so a second 16-entry table maps it to its square (at most 225);
 *    4. the squares are widened 8 -> 16 -> 32 bits and accumulated.
 *
 *  @param[in]  a        First vector of packed nibble pairs.
 *  @param[in]  b        Second vector of packed nibble pairs.
 *  @param[in]  n_words  Number of bytes (nibble pairs) in each vector.
 *  @param[out] result   Receives the 32-bit integer sum.
 */
SIMSIMD_PUBLIC void simsimd_l2sq_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t const *b, simsimd_size_t n_words,
                                          simsimd_distance_t *result) {
    // `_mm512_set_epi8` lists bytes from the highest to the lowest, so read each row right-to-left:
    // nibble 0..7 -> 0..7, nibble 8..15 -> -8..-1 (two's complement). Repeated for every 128-bit lane.
    __m512i const i4_to_i8_lookup_vec = _mm512_set_epi8(        //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0);
    // Squares of 0..15, stored as unsigned bytes (hence the casts for values above 127).
    __m512i const u4_squares_lookup_vec = _mm512_set_epi8(                                        //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        (char)225, (char)196, (char)169, (char)144, 121, 100, 81, 64, 49, 36, 25, 16, 9, 4, 1, 0);
    // Mask that keeps the low nibble of every byte.
    __m512i const i4_nibble_vec = _mm512_set1_epi8(0x0F);
    __m512i const zeros_vec = _mm512_setzero_si512();

    // Temporaries.
    __m512i a_i4x2_vec, b_i4x2_vec;
    __m512i a_i8_low_vec, a_i8_high_vec, b_i8_low_vec, b_i8_high_vec;
    __m512i d_u8_low_vec, d_u8_high_vec;
    __m512i d2_u8_low_vec, d2_u8_high_vec;
    __m512i d2_u16_low_vec, d2_u16_high_vec;

    // Accumulator.
    __m512i d2_u32_vec = _mm512_setzero_si512();

simsimd_l2sq_i4x2_ice_cycle:
    if (n_words < 64) {
        // Tail: only the lowest `n_words` bytes are read; zero bytes contribute a zero difference.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
        a_i4x2_vec = _mm512_maskz_loadu_epi8(mask, a);
        b_i4x2_vec = _mm512_maskz_loadu_epi8(mask, b);
        n_words = 0;
    }
    else {
        a_i4x2_vec = _mm512_loadu_epi8(a);
        b_i4x2_vec = _mm512_loadu_epi8(b);
        a += 64, b += 64, n_words -= 64;
    }

    // Split every byte into its low and high nibble, each in its own byte with an empty top half.
    a_i8_low_vec = _mm512_and_si512(a_i4x2_vec, i4_nibble_vec);
    a_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(a_i4x2_vec, 4), i4_nibble_vec);
    b_i8_low_vec = _mm512_and_si512(b_i4x2_vec, i4_nibble_vec);
    b_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(b_i4x2_vec, 4), i4_nibble_vec);

    // Sign-extend the nibbles to `int8` through the lookup table.
    a_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_low_vec);
    a_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_high_vec);
    b_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_low_vec);
    b_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_high_vec);

    // Absolute differences are in [0, 15], so they are valid indices into the squares table.
    d_u8_low_vec = _mm512_abs_epi8(_mm512_sub_epi8(a_i8_low_vec, b_i8_low_vec));
    d_u8_high_vec = _mm512_abs_epi8(_mm512_sub_epi8(a_i8_high_vec, b_i8_high_vec));
    d2_u8_low_vec = _mm512_shuffle_epi8(u4_squares_lookup_vec, d_u8_low_vec);
    d2_u8_high_vec = _mm512_shuffle_epi8(u4_squares_lookup_vec, d_u8_high_vec);

    // Widen 8 -> 16 bits; each 16-bit lane holds the sum of two squares (at most 450).
    d2_u16_low_vec = _mm512_add_epi16(_mm512_unpacklo_epi8(d2_u8_low_vec, zeros_vec),
                                      _mm512_unpackhi_epi8(d2_u8_low_vec, zeros_vec));
    d2_u16_high_vec = _mm512_add_epi16(_mm512_unpacklo_epi8(d2_u8_high_vec, zeros_vec),
                                       _mm512_unpackhi_epi8(d2_u8_high_vec, zeros_vec));

    // Widen 16 -> 32 bits. BOTH halves of every 128-bit lane must be accumulated:
    // `unpacklo` alone covers only four of the eight 16-bit values per lane and
    // would silently drop half of the squared differences.
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpacklo_epi16(d2_u16_low_vec, zeros_vec));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpackhi_epi16(d2_u16_low_vec, zeros_vec));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpacklo_epi16(d2_u16_high_vec, zeros_vec));
    d2_u32_vec = _mm512_add_epi32(d2_u32_vec, _mm512_unpackhi_epi16(d2_u16_high_vec, zeros_vec));
    if (n_words) goto simsimd_l2sq_i4x2_ice_cycle;

    int d2 = _mm512_reduce_add_epi32(d2_u32_vec);
    *result = d2;
}
/**
 *  @brief  Cosine-distance kernel for two vectors of packed 4-bit signed integers.
 *
 *  Every input byte holds two nibbles. Per step 64 bytes (128 nibbles) of each
 *  input are processed:
 *    - the squared norms `a·a` and `b·b` come from a 16-entry table of nibble
 *      squares and are accumulated in 16-bit lanes;
 *    - the dot product `a·b` is computed on sign-extended nibbles with the
 *      saturating `_mm512_dpwssds_epi32` in 32-bit lanes.
 *  The three sums are handed to `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param[in]  a        First vector of packed nibble pairs.
 *  @param[in]  b        Second vector of packed nibble pairs.
 *  @param[in]  n_words  Number of bytes (nibble pairs) in each vector.
 *  @param[out] result   Receives the normalized result.
 *
 *  NOTE(review): the norm accumulators are 16-bit. Each lane grows by at most 128
 *  per step and saturates at 65535, and the final `_mm512_add_epi16` of the low
 *  and high accumulators wraps, so very long inputs can yield inexact norms.
 */
SIMSIMD_PUBLIC void simsimd_cos_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t const *b, simsimd_size_t n_words,
                                         simsimd_distance_t *result) {
    // `_mm512_set_epi8` lists bytes from the highest to the lowest, so read each row right-to-left:
    // nibble 0..7 -> 0..7, nibble 8..15 -> -8..-1 (two's complement). Repeated for every 128-bit lane.
    __m512i const i4_to_i8_lookup_vec = _mm512_set_epi8(        //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0, //
        -1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0);
    // Square of the signed value encoded by each nibble: 0..7 -> 0..49, 8..15 (-8..-1) -> 64..1.
    __m512i const i4_squares_lookup_vec = _mm512_set_epi8(      //
        1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0, //
        1, 4, 9, 16, 25, 36, 49, 64, 49, 36, 25, 16, 9, 4, 1, 0);
    // Mask that keeps the low nibble of every byte.
    __m512i const i4_nibble_vec = _mm512_set1_epi8(0x0F);

    // Temporaries.
    __m512i a_i4x2_vec, b_i4x2_vec;
    __m512i a_i8_low_vec, a_i8_high_vec, b_i8_low_vec, b_i8_high_vec;
    __m512i a2_u8_vec, b2_u8_vec;

    // Accumulators.
    __m512i a2_u16_low_vec = _mm512_setzero_si512();
    __m512i a2_u16_high_vec = _mm512_setzero_si512();
    __m512i b2_u16_low_vec = _mm512_setzero_si512();
    __m512i b2_u16_high_vec = _mm512_setzero_si512();
    __m512i ab_i32_low_vec = _mm512_setzero_si512();
    __m512i ab_i32_high_vec = _mm512_setzero_si512();

simsimd_cos_i4x2_ice_cycle:
    if (n_words < 64) {
        // Tail: only the lowest `n_words` bytes are read; zero bytes contribute nothing to any sum.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
        a_i4x2_vec = _mm512_maskz_loadu_epi8(mask, a);
        b_i4x2_vec = _mm512_maskz_loadu_epi8(mask, b);
        n_words = 0;
    }
    else {
        a_i4x2_vec = _mm512_loadu_epi8(a);
        b_i4x2_vec = _mm512_loadu_epi8(b);
        a += 64, b += 64, n_words -= 64;
    }

    // Split every byte into its low and high nibble, each in its own byte with an empty top half.
    a_i8_low_vec = _mm512_and_si512(a_i4x2_vec, i4_nibble_vec);
    a_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(a_i4x2_vec, 4), i4_nibble_vec);
    b_i8_low_vec = _mm512_and_si512(b_i4x2_vec, i4_nibble_vec);
    b_i8_high_vec = _mm512_and_si512(_mm512_srli_epi64(b_i4x2_vec, 4), i4_nibble_vec);

    // Squared norms: the two nibble squares of a byte sum to at most 64 + 64 = 128, which fits a byte.
    a2_u8_vec = _mm512_add_epi8(_mm512_shuffle_epi8(i4_squares_lookup_vec, a_i8_low_vec),
                                _mm512_shuffle_epi8(i4_squares_lookup_vec, a_i8_high_vec));
    b2_u8_vec = _mm512_add_epi8(_mm512_shuffle_epi8(i4_squares_lookup_vec, b_i8_low_vec),
                                _mm512_shuffle_epi8(i4_squares_lookup_vec, b_i8_high_vec));
    a2_u16_low_vec = _mm512_adds_epu16(a2_u16_low_vec, _mm512_cvtepu8_epi16(_mm512_castsi512_si256(a2_u8_vec)));
    a2_u16_high_vec =
        _mm512_adds_epu16(a2_u16_high_vec, _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(a2_u8_vec, 1)));
    // The `b` norm must be fed from `b2_u8_vec`, not from the squares of `a`.
    b2_u16_low_vec = _mm512_adds_epu16(b2_u16_low_vec, _mm512_cvtepu8_epi16(_mm512_castsi512_si256(b2_u8_vec)));
    b2_u16_high_vec =
        _mm512_adds_epu16(b2_u16_high_vec, _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(b2_u8_vec, 1)));

    // Dot product: sign-extend the nibbles to `int8`, widen each 256-bit half to `int16`, multiply-accumulate.
    a_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_low_vec);
    a_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, a_i8_high_vec);
    b_i8_low_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_low_vec);
    b_i8_high_vec = _mm512_shuffle_epi8(i4_to_i8_lookup_vec, b_i8_high_vec);
    ab_i32_low_vec = _mm512_dpwssds_epi32(ab_i32_low_vec, //
                                          _mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_low_vec)),
                                          _mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_low_vec)));
    ab_i32_low_vec = _mm512_dpwssds_epi32(ab_i32_low_vec, //
                                          _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_low_vec, 1)),
                                          _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_low_vec, 1)));
    ab_i32_high_vec = _mm512_dpwssds_epi32(ab_i32_high_vec, //
                                           _mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_high_vec)),
                                           _mm512_cvtepi8_epi16(_mm512_castsi512_si256(b_i8_high_vec)));
    ab_i32_high_vec = _mm512_dpwssds_epi32(ab_i32_high_vec, //
                                           _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_high_vec, 1)),
                                           _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(b_i8_high_vec, 1)));
    if (n_words) goto simsimd_cos_i4x2_ice_cycle;

    int ab = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_i32_low_vec, ab_i32_high_vec));

    // Spill the 16-bit norm accumulators and finish the reduction in 32-bit scalars.
    // Both arrays have to be written before the loop reads them.
    unsigned short a2_u16[32], b2_u16[32];
    _mm512_storeu_si512(a2_u16, _mm512_add_epi16(a2_u16_low_vec, a2_u16_high_vec));
    _mm512_storeu_si512(b2_u16, _mm512_add_epi16(b2_u16_low_vec, b2_u16_high_vec));
    unsigned int a2 = 0, b2 = 0;
    for (int i = 0; i < 32; ++i) a2 += a2_u16[i], b2 += b2_u16[i];
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
// Undo the per-function target attributes (Clang) and target options (GCC) pushed for the ICE kernels.
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SIMSIMD_TARGET_ICE
// Kernels below are only compiled when SIMSIMD_TARGET_SIERRA is non-zero.
#if SIMSIMD_TARGET_SIERRA
// Enable the listed ISA extensions for every function up to the matching pop.
// NOTE(review): `avx2vnni` does not look like a feature name documented by GCC or Clang
// (`avxvnni` and `avxvnniint8` are) - confirm that this section builds when enabled.
#pragma GCC push_options
#pragma GCC target("avx2", "bmi2", "avx2vnni")
#pragma clang attribute push(__attribute__((target("avx2,bmi2,avx2vnni"))), apply_to = function)
/**
 *  @brief  Cosine-distance kernel for two `int8` vectors using 256-bit registers.
 *
 *  Full 32-byte chunks go through `_mm256_dpbssds_epi32`, which multiplies signed
 *  bytes and accumulates groups of four products into 32-bit lanes with saturation.
 *  The leftover elements are handled by a scalar loop. The three sums `a·b`, `a·a`
 *  and `b·b` are handed to `_simsimd_cos_normalize_f32_haswell`.
 *
 *  @param[in]  a       First vector.
 *  @param[in]  b       Second vector.
 *  @param[in]  n       Number of elements in each vector.
 *  @param[out] result  Receives the normalized result.
 */
SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
                                          simsimd_distance_t *result) {
    __m256i dot_i32_vec = _mm256_setzero_si256();
    __m256i a_norm_i32_vec = _mm256_setzero_si256();
    __m256i b_norm_i32_vec = _mm256_setzero_si256();

    // Vectorized part: whole 32-element chunks.
    simsimd_size_t done = 0;
    while (done + 32 <= n) {
        __m256i a_chunk_vec = _mm256_lddqu_si256((__m256i const *)(a + done));
        __m256i b_chunk_vec = _mm256_lddqu_si256((__m256i const *)(b + done));
        dot_i32_vec = _mm256_dpbssds_epi32(dot_i32_vec, a_chunk_vec, b_chunk_vec);
        a_norm_i32_vec = _mm256_dpbssds_epi32(a_norm_i32_vec, a_chunk_vec, a_chunk_vec);
        b_norm_i32_vec = _mm256_dpbssds_epi32(b_norm_i32_vec, b_chunk_vec, b_chunk_vec);
        done += 32;
    }

    int ab = _simsimd_reduce_i32x8_haswell(dot_i32_vec);
    int a2 = _simsimd_reduce_i32x8_haswell(a_norm_i32_vec);
    int b2 = _simsimd_reduce_i32x8_haswell(b_norm_i32_vec);

    // Scalar part: the remaining `n % 32` elements.
    while (done < n) {
        int a_val = a[done], b_val = b[done];
        ab += a_val * b_val;
        a2 += a_val * a_val;
        b2 += b_val * b_val;
        ++done;
    }
    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
}
// Undo the per-function target attributes (Clang) and target options (GCC) pushed for the SIERRA kernels.
#pragma clang attribute pop
#pragma GCC pop_options
// A preprocessor directive must stand alone on its line: with two `#endif`s fused on
// one line the second is an ignored extra token and one conditional stays open.
#endif // SIMSIMD_TARGET_SIERRA
#endif // NOTE(review): closes an enclosing conditional opened outside this excerpt - confirm
// Close the `extern "C"` block opened at the top of the header for C++ translation units.
#ifdef __cplusplus
}
#endif
#endif // SIMSIMD_SPATIAL_H