#ifndef UCS_AARCH64_CPU_H_
#define UCS_AARCH64_CPU_H_
#include "config.h"
#include <time.h>
#include <string.h>
#include <sys/times.h>
#include <ucs/sys/compiler_def.h>
#include <ucs/arch/generic/cpu.h>
#include <ucs/sys/math.h>
#include <ucs/type/status.h>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
/*
 * Cache line size assumed for this architecture. This is a fixed compile-time
 * value, not one probed from the running CPU (presumably in bytes - TODO
 * confirm against the users of this macro).
 */
#define UCS_ARCH_CACHE_LINE_SIZE 64
BEGIN_C_DECLS
/*
 * Thin wrappers around the AArch64 barrier instructions. _op is stringified
 * and appended to the mnemonic as its option operand (for example ish or
 * oshst). The "memory" clobber additionally prevents the compiler from moving
 * memory accesses across the statement.
 */
/* Data Memory Barrier */
#define ucs_aarch64_dmb(_op) asm volatile ("dmb " #_op ::: "memory")
/* Instruction Synchronization Barrier; _op may be left empty */
#define ucs_aarch64_isb(_op) asm volatile ("isb " #_op ::: "memory")
/* Data Synchronization Barrier */
#define ucs_aarch64_dsb(_op) asm volatile ("dsb " #_op ::: "memory")
/*
 * Fences built on DMB with outer-shareable options: "osh" orders both loads
 * and stores, "oshst" orders stores only, "oshld" orders loads only.
 * Judging by the names they are meant for ordering accesses against devices
 * on the bus.
 *
 * The full fence uses "osh": the Arm architecture defines no "oshsy" DMB
 * option, so that spelling is rejected by the assembler as soon as the macro
 * is actually expanded.
 */
#define ucs_memory_bus_fence() ucs_aarch64_dmb(osh)
#define ucs_memory_bus_store_fence() ucs_aarch64_dmb(oshst)
#define ucs_memory_bus_load_fence() ucs_aarch64_dmb(oshld)
/*
 * Judging by the name, flushes a write-combining cache line towards the bus.
 * When the build defines HAVE_AARCH64_THUNDERX2 the macro expands to nothing;
 * otherwise it is an outer-shareable store barrier (dmb oshst).
 */
#if defined(HAVE_AARCH64_THUNDERX2)
#define ucs_memory_bus_cacheline_wc_flush()
#else
#define ucs_memory_bus_cacheline_wc_flush() ucs_aarch64_dmb(oshst)
#endif
/*
 * Fences built on DMB with inner-shareable options: "ish" orders loads and
 * stores, "ishst" stores only, "ishld" loads only.
 */
#define ucs_memory_cpu_fence() ucs_aarch64_dmb(ish)
#define ucs_memory_cpu_store_fence() ucs_aarch64_dmb(ishst)
#define ucs_memory_cpu_load_fence() ucs_aarch64_dmb(ishld)
/* Unlike the fences above this one uses the outer-shareable store option */
#define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst)
/*
 * CPU identification values, filled in by ucs_aarch64_cpuid().
 * NOTE(review): the field names match those of the Arm Main ID Register
 * (MIDR_EL1); the code that fills them is not part of this header - confirm.
 */
typedef struct ucs_aarch64_cpuid {
/* Implementer code; 0x46 is treated as Fujitsu by ucs_arch_get_cpu_vendor() */
int implementer;
/* Architecture version; ucs_arch_get_cpu_vendor() compares it against 8 */
int architecture;
/* Variant, part number and revision of the CPU, going by the field names */
int variant;
int part;
int revision;
} ucs_aarch64_cpuid_t;
/**
 * Fill @a cpuid with the identification of the CPU. The function is only
 * declared here; its definition lives outside this header.
 *
 * @param cpuid  Structure that receives the identification values.
 */
void ucs_aarch64_cpuid(ucs_aarch64_cpuid_t *cpuid);
#if defined(HAVE_AARCH64_THUNDERX2)
/*
 * External copy routine used by the ucs_memcpy_* helpers below when the build
 * defines HAVE_AARCH64_THUNDERX2. It has the same parameter list and return
 * type as memcpy() (presumably destination, source, length - the definition
 * is not visible in this header).
 */
extern void *__memcpy_thunderx2(void *, const void *, size_t);
#endif
#if HAVE_HW_TIMER
/**
 * Read the high-resolution hardware clock.
 *
 * @return Current value of the virtual counter register (cntvct_el0).
 */
static inline uint64_t ucs_arch_read_hres_clock(void)
{
    uint64_t counter;

    /* Synchronize the instruction stream before sampling the counter */
    asm volatile("isb" : : : "memory");
    asm volatile("mrs %0, cntvct_el0" : "=r" (counter));

    return counter;
}
/**
 * Get the rate of the clock returned by ucs_arch_read_hres_clock().
 *
 * @return Value of the counter frequency register (cntfrq_el0), converted to
 *         double.
 */
static inline double ucs_arch_get_clocks_per_sec(void)
{
    uint64_t freq;

    asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));

    return (double)freq;
}
#else
/*
 * HAVE_HW_TIMER is not set: map the clock API onto the
 * ucs_arch_generic_* implementations instead of reading the counter registers.
 */
#define ucs_arch_read_hres_clock ucs_arch_generic_read_hres_clock
#define ucs_arch_get_clocks_per_sec ucs_arch_generic_get_clocks_per_sec
#endif
/**
 * Report the CPU model.
 *
 * @return Always UCS_CPU_MODEL_ARM_AARCH64; no per-part detection is done.
 */
static inline ucs_cpu_model_t ucs_arch_get_cpu_model(void)
{
    return UCS_CPU_MODEL_ARM_AARCH64;
}
/**
 * Report the CPU vendor, based on the values returned by ucs_aarch64_cpuid().
 *
 * @return UCS_CPU_VENDOR_FUJITSU_ARM when the implementer code is 0x46 and
 *         the architecture is 8, UCS_CPU_VENDOR_GENERIC_ARM otherwise.
 */
static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor(void)
{
    ucs_aarch64_cpuid_t cpuid;

    ucs_aarch64_cpuid(&cpuid);

    /* Implementer 0x46 on an architecture-8 part is reported as Fujitsu */
    if ((cpuid.implementer == 0x46) && (cpuid.architecture == 8)) {
        return UCS_CPU_VENDOR_FUJITSU_ARM;
    }

    return UCS_CPU_VENDOR_GENERIC_ARM;
}
/**
 * Report CPU feature flags.
 *
 * @return Always UCS_CPU_FLAG_UNKNOWN; no feature detection is done here.
 */
static inline int ucs_arch_get_cpu_flag(void)
{
    return UCS_CPU_FLAG_UNKNOWN;
}
/**
 * Architecture-specific CPU initialization hook. Intentionally empty here.
 */
static inline void ucs_cpu_init(void)
{
}
/**
 * Wait for a possible change of the memory at @a address.
 *
 * Performs an exclusive load-acquire of one byte at @a address (ldaxrb) and
 * then executes wait-for-event (wfe). The loaded value is discarded.
 *
 * NOTE(review): the apparent intent is that a write to the monitored location
 * by another observer clears the exclusive monitor and thereby wakes the wfe;
 * wfe may also return for unrelated events, so callers presumably re-check
 * the memory in a loop - confirm against the callers.
 *
 * @param address  Location to monitor.
 */
static inline void ucs_arch_wait_mem(void *address)
{
/* Only needed as the destination register of the load; never read back */
unsigned long UCS_V_UNUSED tmp;
/* "=&r" is early-clobber, so tmp is never given the same register as address */
asm volatile ("ldaxrb %w0, [%1] \n"
"wfe \n"
: "=&r"(tmp)
: "r"(address)
: "memory");
}
#if !HAVE___CLEAR_CACHE
/**
 * Synchronize the instruction and data caches for the range [start, end), so
 * that code written to that range can be executed.
 *
 * When HAVE___AARCH64_SYNC_CACHE_RANGE is set the work is delegated to
 * __aarch64_sync_cache_range(). Otherwise the range is cleaned and
 * invalidated by cache line, using the line sizes and the IDC/DIC hints
 * reported by the ctr_el0 register.
 *
 * @param start  First byte of the range.
 * @param end    One past the last byte of the range.
 */
static inline void ucs_arch_clear_cache(void *start, void *end)
{
#if HAVE___AARCH64_SYNC_CACHE_RANGE
    /* Function-scope declaration keeps the helper's prototype out of the
     * global namespace of every file that includes this header */
    void __aarch64_sync_cache_range(void* beg, void* end);

    __aarch64_sync_cache_range(start, end);
#else
    uintptr_t ptr;
    /* Line sizes are kept pointer-wide: a narrower type would produce a
     * 32-bit alignment mask and could truncate 64-bit addresses when the
     * start address is aligned down */
    uintptr_t icache;
    uintptr_t dcache;
    unsigned dic;
    unsigned idc;
    uint64_t ctr_el0;

    /* ctr_el0 layout used below:
     *   bits [3:0]   - log2 of the smallest instruction cache line, in words
     *   bits [19:16] - log2 of the smallest data cache line, in words
     *   bit  28      - IDC: data cache clean is not required
     *   bit  29      - DIC: instruction cache invalidation is not required
     */
    asm volatile ("mrs\t%0, ctr_el0":"=r" (ctr_el0));
    icache = (uintptr_t)sizeof(int) << (ctr_el0 & 0xf);
    dcache = (uintptr_t)sizeof(int) << ((ctr_el0 >> 16) & 0xf);
    dic    = (unsigned)((ctr_el0 >> 29) & 0x1);
    idc    = (unsigned)((ctr_el0 >> 28) & 0x1);

    /* Clean every data cache line that overlaps the range */
    if (idc == 0) {
        for (ptr = ucs_align_down((uintptr_t)start, dcache);
             ptr < (uintptr_t)end; ptr += dcache) {
            asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory");
        }
    }

    /* Invalidate every instruction cache line that overlaps the range */
    if (dic == 0) {
        /* The data cache cleaning must complete before invalidating */
        ucs_aarch64_dsb(ish);
        for (ptr = ucs_align_down((uintptr_t)start, icache);
             ptr < (uintptr_t)end; ptr += icache) {
            asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory");
        }
    }

    /* Wait for the maintenance operations, then resynchronize the
     * instruction stream */
    ucs_aarch64_dsb(ish);
    ucs_aarch64_isb();
#endif
}
#endif
#if defined(__ARM_FEATURE_SVE)
/**
 * Copy @a len bytes from @a src to @a dest using SVE byte loads and stores.
 *
 * Every iteration copies up to svcntb() bytes; the predicate produced by
 * svwhilelt_b8_u64() limits the last (partial) iteration to the remaining
 * bytes.
 *
 * @param dest  Destination buffer.
 * @param src   Source buffer.
 * @param len   Number of bytes to copy.
 *
 * @return @a dest.
 */
static inline void *memcpy_aarch64_sve(void *dest, const void *src, size_t len)
{
    uint8_t *dest_u8      = (uint8_t*)dest;
    /* Keep the const qualifier of the source instead of casting it away */
    const uint8_t *src_u8 = (const uint8_t*)src;
    uint64_t i            = 0;
    svbool_t pg           = svwhilelt_b8_u64(i, (uint64_t)len);

    do {
        svst1_u8(pg, &dest_u8[i], svld1_u8(pg, &src_u8[i]));
        i += svcntb();
        pg = svwhilelt_b8_u64(i, (uint64_t)len);
        /* Continue while the first lane of the next predicate is active,
         * i.e. while i < len */
    } while (svptest_first(svptrue_b8(), pg));

    return dest;
}
#endif
/**
 * Copy @a len bytes from @a src to @a dst with the copy routine selected at
 * compile time: __memcpy_thunderx2() when HAVE_AARCH64_THUNDERX2 is defined,
 * otherwise memcpy_aarch64_sve() when the compiler targets SVE, otherwise
 * memcpy().
 *
 * @param dst  Destination buffer.
 * @param src  Source buffer.
 * @param len  Number of bytes to copy.
 *
 * @return The value returned by the selected copy routine.
 */
static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
{
    void *copied;

#if defined(HAVE_AARCH64_THUNDERX2)
    copied = __memcpy_thunderx2(dst, src, len);
#elif defined(__ARM_FEATURE_SVE)
    copied = memcpy_aarch64_sve(dst, src, len);
#else
    copied = memcpy(dst, src, len);
#endif

    return copied;
}
/**
 * Copy @a len bytes from @a src to @a dst. The same compile-time selection as
 * in ucs_memcpy_relaxed() applies: __memcpy_thunderx2(), memcpy_aarch64_sve()
 * or memcpy(). The value returned by the copy routine is discarded.
 *
 * @param dst  Destination buffer.
 * @param src  Source buffer.
 * @param len  Number of bytes to copy.
 */
static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
#if defined(HAVE_AARCH64_THUNDERX2)
    (void)__memcpy_thunderx2(dst, src, len);
#elif defined(__ARM_FEATURE_SVE)
    (void)memcpy_aarch64_sve(dst, src, len);
#else
    (void)memcpy(dst, src, len);
#endif
}
/**
 * Query the CPU cache sizes. Not implemented for this architecture.
 *
 * @param cache_sizes  Output array; left untouched.
 *
 * @return Always UCS_ERR_UNSUPPORTED.
 */
static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
    /* Nothing is reported, so the output array is deliberately not used */
    (void)cache_sizes;

    return UCS_ERR_UNSUPPORTED;
}
END_C_DECLS
#endif