#include "jit_compiler_a64.hpp"
#include "superscalar.hpp"
#include "program.hpp"
#include "reciprocal.h"
#include "virtual_memory.hpp"
// Base encodings of the AArch64 instructions produced by the JIT compiler.
// Register numbers, shift amounts and immediates are OR-ed into these values by the emitters.
namespace ARMV8A {

    // Control flow
    constexpr uint32_t B           = 0x14000000; // b <label>

    // 64-bit integer ALU, register operands (EOR32 is the 32-bit form)
    constexpr uint32_t EOR         = 0xCA000000; // eor xd, xn, xm
    constexpr uint32_t EOR32       = 0x4A000000; // eor wd, wn, wm
    constexpr uint32_t ADD         = 0x8B000000; // add xd, xn, xm {, lsl #s}
    constexpr uint32_t SUB         = 0xCB000000; // sub xd, xn, xm
    constexpr uint32_t MUL         = 0x9B007C00; // mul xd, xn, xm
    constexpr uint32_t UMULH       = 0x9BC07C00; // umulh xd, xn, xm
    constexpr uint32_t SMULH       = 0x9B407C00; // smulh xd, xn, xm

    // Immediate moves and adds
    constexpr uint32_t MOVZ        = 0xD2800000; // movz xd, #imm16
    constexpr uint32_t MOVN        = 0x92800000; // movn xd, #imm16
    constexpr uint32_t MOVK        = 0xF2800000; // movk xd, #imm16
    constexpr uint32_t ADD_IMM_LO  = 0x91000000; // add xd, xn, #imm12
    constexpr uint32_t ADD_IMM_HI  = 0x91400000; // add xd, xn, #imm12, lsl #12

    // PC-relative 64-bit load
    constexpr uint32_t LDR_LITERAL = 0x58000000; // ldr xt, <label>

    // Rotates and register moves
    constexpr uint32_t ROR         = 0x9AC02C00; // ror xd, xn, xm
    constexpr uint32_t ROR_IMM     = 0x93C00000; // extr xd, xn, xm, #imm (ror when xn == xm)
    constexpr uint32_t MOV_REG     = 0xAA0003E0; // mov xd, xm
    constexpr uint32_t MOV_VREG_EL = 0x6E080400; // ins vd.d[i], vn.d[j]

    // Vector operations on two packed doubles (FEOR is a plain 128-bit xor)
    constexpr uint32_t FADD        = 0x4E60D400; // fadd vd.2d, vn.2d, vm.2d
    constexpr uint32_t FSUB        = 0x4EE0D400; // fsub vd.2d, vn.2d, vm.2d
    constexpr uint32_t FEOR        = 0x6E201C00; // eor vd.16b, vn.16b, vm.16b
    constexpr uint32_t FMUL        = 0x6E60DC00; // fmul vd.2d, vn.2d, vm.2d
    constexpr uint32_t FDIV        = 0x6E60FC00; // fdiv vd.2d, vn.2d, vm.2d
    constexpr uint32_t FSQRT       = 0x6EE1F800; // fsqrt vd.2d, vn.2d

}
namespace randomx {
// Sizes and offsets (in bytes) derived from label addresses in the pre-assembled AArch64
// template. All offsets are measured from randomx_program_aarch64, the address the
// constructor copies from.

// Number of template bytes copied into the JIT buffer by the constructor.
static const size_t CodeSize = ((uint8_t*)randomx_init_dataset_aarch64_end) - ((uint8_t*)randomx_program_aarch64);
// Offset of the main loop label; generateProgram() patches instructions starting at MainLoopBegin + 4.
static const size_t MainLoopBegin = ((uint8_t*)randomx_program_aarch64_main_loop) - ((uint8_t*)randomx_program_aarch64);
// Offset at which the translated VM instructions start being emitted.
static const size_t PrologueSize = ((uint8_t*)randomx_program_aarch64_vm_instructions) - ((uint8_t*)randomx_program_aarch64);
// Offset of the literal area boundary: 64-bit IMUL_RCP literals are stored downwards from here
// (h_IMUL_RCP), 32-bit immediates are stored upwards from here (emitMovImmediate).
static const size_t ImulRcpLiteralsEnd = ((uint8_t*)randomx_program_aarch64_imul_rcp_literals_end) - ((uint8_t*)randomx_program_aarch64);
// Extra space reserved after CodeSize for the routine built by generateSuperscalarHash().
static const size_t CalcDatasetItemSize =
// Template part copied once before the per-program loop
((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch - (uint8_t*)randomx_calc_dataset_item_aarch64) +
// Per superscalar program:
RANDOMX_CACHE_ACCESSES * (
// template part between the prefetch and mix labels, plus one extra 4-byte instruction
((uint8_t*)randomx_calc_dataset_item_aarch64_mix - ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch)) + 4 +
// space for the translated program and its literals
// NOTE(review): presumably an upper bound on the generated size - confirm against the superscalar generator
((RANDOMX_SUPERSCALAR_LATENCY * 3) + 2) * 16 +
// template part between the mix and store_result labels, plus one extra 4-byte instruction
((uint8_t*)randomx_calc_dataset_item_aarch64_store_result - (uint8_t*)randomx_calc_dataset_item_aarch64_mix) + 4
) +
// Template part copied once after the per-program loop
((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result);
// Maps a VM integer register index (0..7) to the AArch64 general-purpose register
// number that is encoded into emitted instructions.
constexpr uint32_t IntRegMap[8] = {
    4, 5, 6, 7,      // indices 0..3 -> x4..x7
    12, 13, 14, 15,  // indices 4..7 -> x12..x15
};
// Compile-time floor(log2(value)); yields 0 for every value that is not greater than 1.
// Kept as a single return statement so it stays usable in constant expressions.
template<typename T>
static constexpr size_t Log2(T value)
{
    return (value <= 1) ? 0 : (1 + Log2(value / 2));
}
// Allocates the JIT buffer (template code plus room for the dataset-item routine),
// copies the pre-assembled template into it and clears the per-register bookkeeping.
JitCompilerA64::JitCompilerA64()
    : code((uint8_t*) allocMemoryPages(CodeSize + CalcDatasetItemSize))
    , literalPos(ImulRcpLiteralsEnd)
    , num32bitLiterals(0)
{
    // Only the first CodeSize bytes come from the template; the remainder of the
    // allocation is filled in later by generateSuperscalarHash().
    memcpy(code, (void*) randomx_program_aarch64, CodeSize);

    memset(reg_changed_offset, 0, sizeof(reg_changed_offset));
}
// Releases the JIT buffer; the size must match the one used for allocation in the constructor.
JitCompilerA64::~JitCompilerA64()
{
    const size_t bufferSize = CodeSize + CalcDatasetItemSize;
    freePagedMemory(code, bufferSize);
}
// Switches the whole JIT buffer to read+write protection via setPagesRW().
void JitCompilerA64::enableWriting()
{
    const size_t bufferSize = CodeSize + CalcDatasetItemSize;
    setPagesRW(code, bufferSize);
}
// Switches the whole JIT buffer to read+execute protection via setPagesRX().
void JitCompilerA64::enableExecution()
{
    const size_t bufferSize = CodeSize + CalcDatasetItemSize;
    setPagesRX(code, bufferSize);
}
// Switches the whole JIT buffer to read+write+execute protection via setPagesRWX().
void JitCompilerA64::enableAll()
{
    const size_t bufferSize = CodeSize + CalcDatasetItemSize;
    setPagesRWX(code, bufferSize);
}
// Translates `program` into AArch64 code inside the JIT buffer (variant that jumps back to
// randomx_program_aarch64_vm_instructions_end) and patches the template instructions that
// depend on `config` and on the configured scratchpad / dataset sizes.
// Side effects: every instruction's src/dst is reduced modulo RegistersCount in place, and
// literalPos, num32bitLiterals and reg_changed_offset are reset before translation.
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
{
    uint8_t* const templateStart = (uint8_t*)randomx_program_aarch64;

    // Re-emit the two mask instructions located 4 bytes after the main loop label.
    // The AND immediates select bits [6, log2(RANDOMX_SCRATCHPAD_L3)).
    uint32_t codePos = MainLoopBegin + 4;
    // and w16, w10, #mask
    emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
    // and w17, w18, #mask
    emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);

    // Start a fresh translation: VM code begins at PrologueSize and both literal pools are empty.
    codePos = PrologueSize;
    literalPos = ImulRcpLiteralsEnd;
    num32bitLiterals = 0;

    for (uint32_t reg = 0; reg < RegistersCount; ++reg)
    {
        reg_changed_offset[reg] = codePos;
    }

    // Dispatch every VM instruction to its handler through the opcode table.
    for (uint32_t index = 0; index < program.getSize(); ++index)
    {
        Instruction& instr = program(index);
        instr.src %= RegistersCount;
        instr.dst %= RegistersCount;

        const InstructionGeneratorA64 handler = engine[instr.opcode];
        (this->*handler)(instr, codePos);
    }

    // eor w18, readReg2, readReg3
    emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);

    // b randomx_program_aarch64_vm_instructions_end (offset counted in 4-byte instructions)
    const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - templateStart) - codePos;
    emit32(ARMV8A::B | (offset / 4), code, codePos);

    // and w18, w18, #mask   -- mask selects bits [6, log2(RANDOMX_DATASET_BASE_SIZE))
    codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - templateStart);
    emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);

    // and w10, w10, #mask   -- same mask as above
    codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - templateStart);
    emit32(0x121A0000 | 10 | (10 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);

    // eor x10, readReg0, readReg1
    codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - templateStart;
    emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);

#ifdef __GNUC__
    // Flush the instruction cache for everything from the main loop up to the last patched instruction.
    __builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
#endif
}
// Translates `program` into AArch64 code inside the JIT buffer (variant that jumps back to
// randomx_program_aarch64_vm_instructions_end_light) and patches the template instructions
// that depend on `config`, on the configured sizes and on `datasetOffset`.
// datasetOffset is divided by CacheLineSize and encoded as two 12-bit add immediates.
// Side effects: every instruction's src/dst is reduced modulo RegistersCount in place, and
// literalPos, num32bitLiterals and reg_changed_offset are reset before translation.
void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config, uint32_t datasetOffset)
{
    uint8_t* const templateStart = (uint8_t*)randomx_program_aarch64;

    // Re-emit the two mask instructions located 4 bytes after the main loop label.
    // The AND immediates select bits [6, log2(RANDOMX_SCRATCHPAD_L3)).
    uint32_t codePos = MainLoopBegin + 4;
    // and w16, w10, #mask
    emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
    // and w17, w18, #mask
    emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);

    // Start a fresh translation: VM code begins at PrologueSize and both literal pools are empty.
    codePos = PrologueSize;
    literalPos = ImulRcpLiteralsEnd;
    num32bitLiterals = 0;

    for (uint32_t reg = 0; reg < RegistersCount; ++reg)
    {
        reg_changed_offset[reg] = codePos;
    }

    // Dispatch every VM instruction to its handler through the opcode table.
    for (uint32_t index = 0; index < program.getSize(); ++index)
    {
        Instruction& instr = program(index);
        instr.src %= RegistersCount;
        instr.dst %= RegistersCount;

        const InstructionGeneratorA64 handler = engine[instr.opcode];
        (this->*handler)(instr, codePos);
    }

    // eor w18, readReg2, readReg3
    emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);

    // b randomx_program_aarch64_vm_instructions_end_light (offset counted in 4-byte instructions)
    const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - templateStart) - codePos;
    emit32(ARMV8A::B | (offset / 4), code, codePos);

    // and w2, w9, #mask   -- mask selects bits [6, log2(RANDOMX_DATASET_BASE_SIZE))
    codePos = (((uint8_t*)randomx_program_aarch64_light_cacheline_align_mask) - templateStart);
    emit32(0x121A0000 | 2 | (9 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);

    // eor x10, readReg0, readReg1
    codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - templateStart;
    emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);

    // Encode the dataset offset, expressed in cache lines, as a pair of add-immediates.
    codePos = ((uint8_t*)randomx_program_aarch64_light_dataset_offset) - templateStart;
    datasetOffset /= CacheLineSize;
    const uint32_t lowPart = datasetOffset & ((1 << 12) - 1);
    const uint32_t highPart = datasetOffset >> 12;
    // add x2, x2, #lowPart
    emit32(ARMV8A::ADD_IMM_LO | 2 | (2 << 5) | (lowPart << 10), code, codePos);
    // add x2, x2, #highPart, lsl #12
    emit32(ARMV8A::ADD_IMM_HI | 2 | (2 << 5) | (highPart << 10), code, codePos);

#ifdef __GNUC__
    // Flush the instruction cache from the main loop up to the last patched instruction.
    __builtin___clear_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
#endif
}
// Builds the dataset-item calculation routine from N superscalar programs and writes it
// into the JIT buffer directly behind the CodeSize bytes of the main template.
//   programs        - the superscalar programs, translated in order
//   reciprocalCache - indexed by the imm32 of each IMUL_RCP instruction to obtain the
//                     64-bit literal that is multiplied in
// Template fragments are copied between the randomx_calc_dataset_item_aarch64_* labels
// and the per-program code is emitted in between them.
template<size_t N>
void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache)
{
uint32_t codePos = CodeSize;
// Copy the part of the template that precedes the prefetch label.
uint8_t* p1 = (uint8_t*)randomx_calc_dataset_item_aarch64;
uint8_t* p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_prefetch;
memcpy(code + codePos, p1, p2 - p1);
codePos += p2 - p1;
// With the counter at 64, emitMovImmediate() always takes its movz/movn + movk path
// (its literal path requires num32bitLiterals < 64).
num32bitLiterals = 64;
// Scratch register (x12) for immediates and reciprocal literals.
constexpr uint32_t tmp_reg = 12;
for (size_t i = 0; i < N; ++i)
{
// and x11, x10, #(CacheSize / CacheLineSize - 1)
emit32(0x92400000 | 11 | (10 << 5) | ((Log2(CacheSize / CacheLineSize) - 1) << 10), code, codePos);
// Copy the template from just after the prefetch label's first instruction (replaced
// by the AND emitted above) up to the mix label.
p1 = ((uint8_t*)randomx_calc_dataset_item_aarch64_prefetch) + 4;
p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix;
memcpy(code + codePos, p1, p2 - p1);
codePos += p2 - p1;
SuperscalarProgram& prog = programs[i];
const size_t progSize = prog.getSize();
// Reserve 4 bytes for a branch that will skip the literal pool emitted next.
uint32_t jmp_pos = codePos;
codePos += 4;
// Literal pool: one 64-bit reciprocal per IMUL_RCP instruction, in program order.
for (size_t j = 0; j < progSize; ++j)
{
const Instruction& instr = prog(j);
if (static_cast<SuperscalarInstructionType>(instr.opcode) == randomx::SuperscalarInstructionType::IMUL_RCP)
emit64(reciprocalCache[instr.getImm32()], code, codePos);
}
// Fill in the reserved branch; emit32 advances literal_pos by 4, so afterwards it
// points at the first literal of the pool.
uint32_t literal_pos = jmp_pos;
emit32(ARMV8A::B | ((codePos - jmp_pos) / 4), code, literal_pos);
// Translate the program. src/dst are used directly as AArch64 register numbers here.
for (size_t j = 0; j < progSize; ++j)
{
const Instruction& instr = prog(j);
const uint32_t src = instr.src;
const uint32_t dst = instr.dst;
switch (static_cast<SuperscalarInstructionType>(instr.opcode))
{
case randomx::SuperscalarInstructionType::ISUB_R:
// sub dst, dst, src
emit32(ARMV8A::SUB | dst | (dst << 5) | (src << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IXOR_R:
// eor dst, dst, src
emit32(ARMV8A::EOR | dst | (dst << 5) | (src << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IADD_RS:
// add dst, dst, src, lsl #shift
emit32(ARMV8A::ADD | dst | (dst << 5) | (instr.getModShift() << 10) | (src << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IMUL_R:
// mul dst, dst, src
emit32(ARMV8A::MUL | dst | (dst << 5) | (src << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IROR_C:
// ror dst, dst, #(imm32 & 63)
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IADD_C7:
case randomx::SuperscalarInstructionType::IADD_C8:
case randomx::SuperscalarInstructionType::IADD_C9:
// dst += imm32
emitAddImmediate(dst, dst, instr.getImm32(), code, codePos);
break;
case randomx::SuperscalarInstructionType::IXOR_C7:
case randomx::SuperscalarInstructionType::IXOR_C8:
case randomx::SuperscalarInstructionType::IXOR_C9:
// Load imm32 into tmp_reg, then: eor dst, dst, tmp_reg
emitMovImmediate(tmp_reg, instr.getImm32(), code, codePos);
emit32(ARMV8A::EOR | dst | (dst << 5) | (tmp_reg << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IMULH_R:
// umulh dst, dst, src
emit32(ARMV8A::UMULH | dst | (dst << 5) | (src << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::ISMULH_R:
// smulh dst, dst, src
emit32(ARMV8A::SMULH | dst | (dst << 5) | (src << 16), code, codePos);
break;
case randomx::SuperscalarInstructionType::IMUL_RCP:
{
// The literal lies before the current position, so the PC-relative offset (in
// 4-byte units) is negative; it is truncated to the 19-bit field of LDR (literal).
int32_t offset = (literal_pos - codePos) / 4;
offset &= (1 << 19) - 1;
// Advance to the next 8-byte literal for the following IMUL_RCP.
literal_pos += 8;
// ldr tmp_reg, <literal>
emit32(ARMV8A::LDR_LITERAL | tmp_reg | (offset << 5), code, codePos);
// mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, codePos);
}
break;
default:
break;
}
}
// Copy the template between the mix and store_result labels.
p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_mix;
p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result;
memcpy(code + codePos, p1, p2 - p1);
codePos += p2 - p1;
// mov x10, <address register of this program>
emit32(ARMV8A::MOV_REG | 10 | (prog.getAddressRegister() << 16), code, codePos);
}
// Copy the remaining template from the store_result label to the end label.
p1 = (uint8_t*)randomx_calc_dataset_item_aarch64_store_result;
p2 = (uint8_t*)randomx_calc_dataset_item_aarch64_end;
memcpy(code + codePos, p1, p2 - p1);
codePos += p2 - p1;
#ifdef __GNUC__
// Flush the instruction cache for the freshly generated routine.
__builtin___clear_cache(reinterpret_cast<char*>(code + CodeSize), reinterpret_cast<char*>(code + codePos));
#endif
}
// Explicit instantiation for the configured number of cache accesses.
template void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t> &reciprocalCache);
// Returns the address inside the JIT buffer that corresponds to the
// randomx_init_dataset_aarch64 label of the copied template.
DatasetInitFunc* JitCompilerA64::getDatasetInitFunc()
{
    const auto entryOffset = ((uint8_t*)randomx_init_dataset_aarch64) - ((uint8_t*)randomx_program_aarch64);
    uint8_t* const entry = code + entryOffset;
    return (DatasetInitFunc*)(entry);
}
// Returns CodeSize, the byte length of the template copied by the constructor.
// The additional CalcDatasetItemSize bytes of the allocation are not included.
size_t JitCompilerA64::getCodeSize()
{
    const size_t templateBytes = CodeSize;
    return templateBytes;
}
// Emits code that loads the 32-bit immediate `imm` into general-purpose register `dst`.
// Three strategies, chosen in this order:
//   1. imm fits in 16 bits            -> one movz
//   2. fewer than 64 literals stored  -> store imm in the 32-bit literal area and move it
//                                        out of a vector register lane (smov when bit 31
//                                        of imm is set, umov otherwise)
//   3. otherwise                      -> movn/movz of the upper half followed by movk
// `codePos` is advanced past the emitted instruction(s).
void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos)
{
    uint32_t pos = codePos;
    const bool topBitSet = static_cast<int32_t>(imm) < 0;

    if (imm < (1 << 16))
    {
        // movz dst, #imm
        emit32(ARMV8A::MOVZ | dst | (imm << 5), code, pos);
    }
    else if (num32bitLiterals < 64)
    {
        // smov xd, vN.s[M]  /  umov wd, vN.s[M]
        const uint32_t laneMove = topBitSet ? 0x4E042C00 : 0x0E043C00;
        // Four 32-bit lanes per vector register: N = index / 4, M = index % 4.
        emit32(laneMove | dst | ((num32bitLiterals / 4) << 5) | ((num32bitLiterals % 4) << 19), code, pos);

        // The literal area starts at ImulRcpLiteralsEnd and is indexed by the running counter.
        ((uint32_t*)(code + ImulRcpLiteralsEnd))[num32bitLiterals] = imm;
        ++num32bitLiterals;
    }
    else
    {
        if (topBitSet)
        {
            // movn dst, #(~imm >> 16), lsl #16
            emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, pos);
        }
        else
        {
            // movz dst, #(imm >> 16), lsl #16
            emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, pos);
        }

        // movk dst, #(imm & 0xFFFF)
        emit32(ARMV8A::MOVK | dst | ((imm & 0xFFFF) << 5), code, pos);
    }

    codePos = pos;
}
// Emits code computing dst = src + imm.
// Immediates below 2^24 are split into two 12-bit halves and added with one or two
// add-immediate instructions; larger ones are first loaded into x18 via emitMovImmediate().
// `codePos` is advanced past the emitted instruction(s).
void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos)
{
    uint32_t pos = codePos;

    if (imm < (1 << 24))
    {
        const uint32_t lowHalf = imm & ((1 << 12) - 1);
        const uint32_t highHalf = imm >> 12;

        if (lowHalf)
        {
            // add dst, src, #lowHalf
            emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (lowHalf << 10), code, pos);
            if (highHalf)
            {
                // add dst, dst, #highHalf, lsl #12
                emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (highHalf << 10), code, pos);
            }
        }
        else
        {
            // add dst, src, #highHalf, lsl #12   (also covers imm == 0)
            emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (highHalf << 10), code, pos);
        }
    }
    else
    {
        constexpr uint32_t scratchReg = 18;
        emitMovImmediate(scratchReg, imm, code, pos);

        // add dst, src, scratchReg
        emit32(ARMV8A::ADD | dst | (src << 5) | (scratchReg << 16), code, pos);
    }

    codePos = pos;
}
// Emits code that loads a 64-bit value into register `tmp_reg`, addressed relative to x2.
//   src != dst : address = (src + (imm32 & level mask)) masked to an 8-byte aligned offset
//                inside the L1 or L2 range, selected by instr.getModMem()
//   src == dst : address = imm32 & ScratchpadL3Mask, independent of any register
// `codePos` is advanced past the emitted instructions.
template<uint32_t tmp_reg>
void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos)
{
    uint32_t pos = codePos;
    uint32_t displacement = instr.getImm32();

    if (src == dst)
    {
        // The offset is stored divided by 8 and scaled back by the load instruction.
        displacement = (displacement & ScratchpadL3Mask) >> 3;
        emitMovImmediate(tmp_reg, displacement, code, pos);

        // ldr tmp_reg, [x2, tmp_reg, lsl #3]
        emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, pos);
    }
    else
    {
        displacement &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
        emitAddImmediate(tmp_reg, src, displacement, code, pos);

        // and tmp_reg, tmp_reg, #mask   -- mask keeps bits [3, log2(level size))
        constexpr uint32_t andBase = 0x927d0000 | tmp_reg | (tmp_reg << 5);
        constexpr uint32_t andL1 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10);
        constexpr uint32_t andL2 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10);
        emit32(instr.getModMem() ? andL1 : andL2, code, pos);

        // ldr tmp_reg, [x2, tmp_reg]
        emit32(0xf8606840 | tmp_reg | (tmp_reg << 16), code, pos);
    }

    codePos = pos;
}
// Emits code that loads two signed 32-bit integers from memory (addressed relative to x2)
// and converts them into two doubles in vector register `tmp_reg_fp`.
// The address is (src + (imm32 & level mask)) masked to an 8-byte aligned offset inside the
// L1 or L2 range, selected by instr.getModMem(). Uses x18 and x19 as integer scratch.
// `codePos` is advanced past the emitted instructions.
template<uint32_t tmp_reg_fp>
void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos)
{
    uint32_t pos = codePos;
    uint32_t displacement = instr.getImm32();
    constexpr uint32_t addrReg = 18;

    displacement &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
    emitAddImmediate(addrReg, src, displacement, code, pos);

    // and addrReg, addrReg, #mask   -- mask keeps bits [3, log2(level size))
    constexpr uint32_t andBase = 0x927d0000 | addrReg | (addrReg << 5);
    constexpr uint32_t andL1 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10);
    constexpr uint32_t andL2 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10);
    emit32(instr.getModMem() ? andL1 : andL2, code, pos);

    // add addrReg, x2, addrReg
    emit32(ARMV8A::ADD | addrReg | (2 << 5) | (addrReg << 16), code, pos);

    // ldpsw addrReg, addrReg + 1, [addrReg]
    emit32(0x69400000 | addrReg | (addrReg << 5) | ((addrReg + 1) << 10), code, pos);

    // ins tmp_reg_fp.d[0], addrReg
    emit32(0x4E081C00 | tmp_reg_fp | (addrReg << 5), code, pos);

    // ins tmp_reg_fp.d[1], addrReg + 1
    emit32(0x4E181C00 | tmp_reg_fp | ((addrReg + 1) << 5), code, pos);

    // scvtf tmp_reg_fp.2d, tmp_reg_fp.2d
    emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, pos);

    codePos = pos;
}
// IADD_RS: dst += src << shift; when dst is RegisterNeedsDisplacement, imm32 is added as well.
void JitCompilerA64::h_IADD_RS(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    const uint32_t shiftAmount = instr.getModShift();
    uint32_t pos = codePos;

    // add dst, dst, src, lsl #shift
    emit32(ARMV8A::ADD | dstReg | (dstReg << 5) | (shiftAmount << 10) | (srcReg << 16), code, pos);

    if (instr.dst == RegisterNeedsDisplacement)
    {
        emitAddImmediate(dstReg, dstReg, instr.getImm32(), code, pos);
    }

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IADD_M: dst += 64-bit value loaded by emitMemLoad().
void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    emitMemLoad<loadedReg>(dstReg, srcReg, instr, code, pos);

    // add dst, dst, loadedReg
    emit32(ARMV8A::ADD | dstReg | (dstReg << 5) | (loadedReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// ISUB_R: dst -= src, or dst -= imm32 when source and destination are the same register.
void JitCompilerA64::h_ISUB_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    if (srcReg == dstReg)
    {
        // Subtraction of the immediate is expressed as addition of its negation.
        emitAddImmediate(dstReg, dstReg, -instr.getImm32(), code, pos);
    }
    else
    {
        // sub dst, dst, src
        emit32(ARMV8A::SUB | dstReg | (dstReg << 5) | (srcReg << 16), code, pos);
    }

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// ISUB_M: dst -= 64-bit value loaded by emitMemLoad().
void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    emitMemLoad<loadedReg>(dstReg, srcReg, instr, code, pos);

    // sub dst, dst, loadedReg
    emit32(ARMV8A::SUB | dstReg | (dstReg << 5) | (loadedReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IMUL_R: dst *= src, or dst *= imm32 when source and destination are the same register.
void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t operandReg = IntRegMap[instr.src];
    uint32_t pos = codePos;

    if (operandReg == dstReg)
    {
        // Use x18 as the operand and load the immediate into it.
        operandReg = 18;
        emitMovImmediate(operandReg, instr.getImm32(), code, pos);
    }

    // mul dst, dst, operand
    emit32(ARMV8A::MUL | dstReg | (dstReg << 5) | (operandReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IMUL_M: dst *= 64-bit value loaded by emitMemLoad().
void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    emitMemLoad<loadedReg>(dstReg, srcReg, instr, code, pos);

    // mul dst, dst, loadedReg
    emit32(ARMV8A::MUL | dstReg | (dstReg << 5) | (loadedReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IMULH_R: dst = upper 64 bits of the unsigned 128-bit product dst * src.
void JitCompilerA64::h_IMULH_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    // umulh dst, dst, src
    emit32(ARMV8A::UMULH | dstReg | (dstReg << 5) | (srcReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IMULH_M: dst = upper 64 bits of the unsigned product of dst and the value loaded by emitMemLoad().
void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    emitMemLoad<loadedReg>(dstReg, srcReg, instr, code, pos);

    // umulh dst, dst, loadedReg
    emit32(ARMV8A::UMULH | dstReg | (dstReg << 5) | (loadedReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// ISMULH_R: dst = upper 64 bits of the signed 128-bit product dst * src.
void JitCompilerA64::h_ISMULH_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    // smulh dst, dst, src
    emit32(ARMV8A::SMULH | dstReg | (dstReg << 5) | (srcReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// ISMULH_M: dst = upper 64 bits of the signed product of dst and the value loaded by emitMemLoad().
void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    emitMemLoad<loadedReg>(dstReg, srcReg, instr, code, pos);

    // smulh dst, dst, loadedReg
    emit32(ARMV8A::SMULH | dstReg | (dstReg << 5) | (loadedReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IMUL_RCP: multiplies dst by a 64-bit literal derived from the divisor imm32.
// Nothing is emitted when the divisor is zero or a power of two.
// The literal is stored in the pool that grows downwards from ImulRcpLiteralsEnd. The first
// 13 literals are multiplied straight from fixed registers listed in `literalRegs`
// (NOTE(review): presumably preloaded from the pool by the template code - confirm in the
// assembly); later ones are fetched with a PC-relative load into x18.
void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos)
{
    const uint64_t divisor = instr.getImm32();
    if (isZeroOrPowerOf2(divisor))
    {
        return;
    }

    uint32_t pos = codePos;
    constexpr uint32_t scratchReg = 18;
    const uint32_t dstReg = IntRegMap[instr.dst];

    // Long division of 2^63 by the divisor; the result is rescaled by the divisor's bit length.
    constexpr uint64_t dividend = 1ULL << 63;
    const uint64_t quotient = dividend / divisor;
    const uint64_t remainder = dividend % divisor;

    // bitLength = number of significant bits in the (32-bit) divisor
#ifdef __GNUC__
    const uint64_t bitLength = 64 - __builtin_clzll(divisor);
#else
    uint64_t bitLength = 32;
    for (uint64_t probe = 1U << 31; (probe & divisor) == 0; probe >>= 1)
        --bitLength;
#endif

    // Index of this literal, counted before the pool grows by one entry.
    const uint32_t literalIndex = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t);

    literalPos -= sizeof(uint64_t);
    *(uint64_t*)(code + literalPos) = (quotient << bitLength) + ((remainder << bitLength) / divisor);

    if (literalIndex < 13)
    {
        // Register numbers already shifted into the Rm field (bits 16..20).
        static constexpr uint32_t literalRegs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 };

        // mul dst, dst, literal register
        emit32(ARMV8A::MUL | dstReg | (dstReg << 5) | literalRegs[literalIndex], code, pos);
    }
    else
    {
        // ldr scratchReg, <literal>   (offset in 4-byte units relative to this instruction)
        const uint32_t wordOffset = (literalPos - pos) / 4;
        emit32(ARMV8A::LDR_LITERAL | scratchReg | (wordOffset << 5), code, pos);

        // mul dst, dst, scratchReg
        emit32(ARMV8A::MUL | dstReg | (dstReg << 5) | (scratchReg << 16), code, pos);
    }

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// INEG_R: dst = -dst (two's complement negation).
void JitCompilerA64::h_INEG_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstReg = IntRegMap[instr.dst];
    constexpr uint32_t zeroReg = 31;

    // sub dst, xzr, dst
    emit32(ARMV8A::SUB | dstReg | (zeroReg << 5) | (dstReg << 16), code, codePos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = codePos;
}
// IXOR_R: dst ^= src, or dst ^= imm32 when source and destination are the same register.
void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t operandReg = IntRegMap[instr.src];
    uint32_t pos = codePos;

    if (operandReg == dstReg)
    {
        // Use x18 as the operand and load the immediate into it.
        operandReg = 18;
        emitMovImmediate(operandReg, instr.getImm32(), code, pos);
    }

    // eor dst, dst, operand
    emit32(ARMV8A::EOR | dstReg | (dstReg << 5) | (operandReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IXOR_M: dst ^= 64-bit value loaded by emitMemLoad().
void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    emitMemLoad<loadedReg>(dstReg, srcReg, instr, code, pos);

    // eor dst, dst, loadedReg
    emit32(ARMV8A::EOR | dstReg | (dstReg << 5) | (loadedReg << 16), code, pos);

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// IROR_R: rotates dst right by src, or by (imm32 & 63) when source and destination coincide.
void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];

    if (srcReg == dstReg)
    {
        // ror dst, dst, #(imm32 & 63)
        emit32(ARMV8A::ROR_IMM | dstReg | (dstReg << 5) | ((instr.getImm32() & 63) << 10) | (dstReg << 16), code, codePos);
    }
    else
    {
        // ror dst, dst, src
        emit32(ARMV8A::ROR | dstReg | (dstReg << 5) | (srcReg << 16), code, codePos);
    }

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = codePos;
}
// IROL_R: rotates dst left by src, or by imm32 when source and destination coincide.
// A left rotation is emitted as a right rotation by the negated amount.
void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    if (srcReg == dstReg)
    {
        // ror dst, dst, #(-imm32 & 63)
        emit32(ARMV8A::ROR_IMM | dstReg | (dstReg << 5) | ((-instr.getImm32() & 63) << 10) | (dstReg << 16), code, pos);
    }
    else
    {
        constexpr uint32_t negatedReg = 18;

        // sub negatedReg, xzr, src
        emit32(ARMV8A::SUB | negatedReg | (31 << 5) | (srcReg << 16), code, pos);

        // ror dst, dst, negatedReg
        emit32(ARMV8A::ROR | dstReg | (dstReg << 5) | (negatedReg << 16), code, pos);
    }

    // Remember where dst was last modified (consumed by h_CBRANCH).
    reg_changed_offset[instr.dst] = pos;
    codePos = pos;
}
// ISWAP_R: exchanges src and dst through x18; emits nothing when both are the same register.
void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];

    if (srcReg != dstReg)
    {
        constexpr uint32_t holdReg = 18;
        uint32_t pos = codePos;

        // mov holdReg, dst
        emit32(ARMV8A::MOV_REG | holdReg | (dstReg << 16), code, pos);
        // mov dst, src
        emit32(ARMV8A::MOV_REG | dstReg | (srcReg << 16), code, pos);
        // mov src, holdReg
        emit32(ARMV8A::MOV_REG | srcReg | (holdReg << 16), code, pos);

        // Both registers were modified (consumed by h_CBRANCH).
        reg_changed_offset[instr.src] = pos;
        reg_changed_offset[instr.dst] = pos;
        codePos = pos;
    }
}
// FSWAP_R: swaps the two 64-bit lanes of vector register v(16 + instr.dst), using v28 as scratch.
void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t scratchVec = 28;
    constexpr uint32_t fromLane1 = 1 << 14; // source element index 1
    constexpr uint32_t toLane1 = 1 << 20;   // destination element index 1
    const uint32_t vecReg = instr.dst + 16;
    uint32_t pos = codePos;

    // ins scratchVec.d[0], vecReg.d[1]
    emit32(ARMV8A::MOV_VREG_EL | scratchVec | (vecReg << 5) | fromLane1, code, pos);
    // ins vecReg.d[1], vecReg.d[0]
    emit32(ARMV8A::MOV_VREG_EL | vecReg | (vecReg << 5) | toLane1, code, pos);
    // ins vecReg.d[0], scratchVec.d[0]
    emit32(ARMV8A::MOV_VREG_EL | vecReg | (scratchVec << 5), code, pos);

    codePos = pos;
}
// FADD_R: v(16 + dst % 4) += v(24 + src % 4), lane-wise on two doubles.
void JitCompilerA64::h_FADD_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstVec = (instr.dst % 4) + 16;
    const uint32_t srcVec = (instr.src % 4) + 24;

    // fadd dst.2d, dst.2d, src.2d
    emit32(ARMV8A::FADD | dstVec | (dstVec << 5) | (srcVec << 16), code, codePos);
}
// FADD_M: v(16 + dst % 4) += pair of doubles produced by emitMemLoadFP() in v28.
void JitCompilerA64::h_FADD_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedVec = 28;
    const uint32_t addrReg = IntRegMap[instr.src];
    const uint32_t dstVec = (instr.dst % 4) + 16;
    uint32_t pos = codePos;

    emitMemLoadFP<loadedVec>(addrReg, instr, code, pos);

    // fadd dst.2d, dst.2d, loadedVec.2d
    emit32(ARMV8A::FADD | dstVec | (dstVec << 5) | (loadedVec << 16), code, pos);

    codePos = pos;
}
// FSUB_R: v(16 + dst % 4) -= v(24 + src % 4), lane-wise on two doubles.
void JitCompilerA64::h_FSUB_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstVec = (instr.dst % 4) + 16;
    const uint32_t srcVec = (instr.src % 4) + 24;

    // fsub dst.2d, dst.2d, src.2d
    emit32(ARMV8A::FSUB | dstVec | (dstVec << 5) | (srcVec << 16), code, codePos);
}
// FSUB_M: v(16 + dst % 4) -= pair of doubles produced by emitMemLoadFP() in v28.
void JitCompilerA64::h_FSUB_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedVec = 28;
    const uint32_t addrReg = IntRegMap[instr.src];
    const uint32_t dstVec = (instr.dst % 4) + 16;
    uint32_t pos = codePos;

    emitMemLoadFP<loadedVec>(addrReg, instr, code, pos);

    // fsub dst.2d, dst.2d, loadedVec.2d
    emit32(ARMV8A::FSUB | dstVec | (dstVec << 5) | (loadedVec << 16), code, pos);

    codePos = pos;
}
// FSCAL_R: xors v(16 + dst % 4) with v31.
// NOTE(review): v31 presumably holds the scaling bit mask set up by the template code - confirm.
void JitCompilerA64::h_FSCAL_R(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t maskVec = 31;
    const uint32_t dstVec = (instr.dst % 4) + 16;

    // eor dst.16b, dst.16b, maskVec.16b
    emit32(ARMV8A::FEOR | dstVec | (dstVec << 5) | (maskVec << 16), code, codePos);
}
// FMUL_R: v(20 + dst % 4) *= v(24 + src % 4), lane-wise on two doubles.
void JitCompilerA64::h_FMUL_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstVec = (instr.dst % 4) + 20;
    const uint32_t srcVec = (instr.src % 4) + 24;

    // fmul dst.2d, dst.2d, src.2d
    emit32(ARMV8A::FMUL | dstVec | (dstVec << 5) | (srcVec << 16), code, codePos);
}
// FDIV_M: v(20 + dst % 4) /= masked pair of doubles produced by emitMemLoadFP() in v28.
// The loaded value is AND-ed with v29 and OR-ed with v30 before the division.
// NOTE(review): v29 / v30 presumably hold masks prepared by the template code - confirm.
void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t loadedVec = 28;
    const uint32_t addrReg = IntRegMap[instr.src];
    const uint32_t dstVec = (instr.dst % 4) + 20;
    uint32_t pos = codePos;

    emitMemLoadFP<loadedVec>(addrReg, instr, code, pos);

    // and loadedVec.16b, loadedVec.16b, v29.16b
    emit32(0x4E201C00 | loadedVec | (loadedVec << 5) | (29 << 16), code, pos);

    // orr loadedVec.16b, loadedVec.16b, v30.16b
    emit32(0x4EA01C00 | loadedVec | (loadedVec << 5) | (30 << 16), code, pos);

    // fdiv dst.2d, dst.2d, loadedVec.2d
    emit32(ARMV8A::FDIV | dstVec | (dstVec << 5) | (loadedVec << 16), code, pos);

    codePos = pos;
}
// FSQRT_R: v(20 + dst % 4) = lane-wise square root of itself (two doubles).
void JitCompilerA64::h_FSQRT_R(Instruction& instr, uint32_t& codePos)
{
    const uint32_t dstVec = (instr.dst % 4) + 20;

    // fsqrt dst.2d, dst.2d
    emit32(ARMV8A::FSQRT | dstVec | (dstVec << 5), code, codePos);
}
// CBRANCH: adds a modified imm32 to dst, tests 8 bits of dst starting at bit
// (ConditionOffset + modCond) and branches back, when all of them are zero, to the position
// recorded in reg_changed_offset for dst. Afterwards every register is marked as changed
// at the current position.
void JitCompilerA64::h_CBRANCH(Instruction& instr, uint32_t& codePos)
{
    uint32_t pos = codePos;
    const uint32_t dstReg = IntRegMap[instr.dst];
    const uint32_t condShift = instr.getModCond();
    const uint32_t bitIndex = condShift + ConditionOffset;

    // Set the lowest tested bit and clear the bit just below it in the immediate.
    const uint32_t addend = (instr.getImm32() | (1U << bitIndex)) & ~(1U << (bitIndex - 1));
    emitAddImmediate(dstReg, dstReg, addend, code, pos);

    // tst dst, #(0xFF << bitIndex)
    // The base encoding holds the mask for modCond == 0; lowering the rotate field by
    // modCond moves the mask left by that many bits.
    static_assert((ConditionMask == 0xFF) && (ConditionOffset == 8), "Update tst encoding for different mask and offset");
    emit32((0xF2781C1F - (condShift << 16)) | (dstReg << 5), code, pos);

    // Backward branch distance in instructions, truncated to the 19-bit field.
    int32_t target = reg_changed_offset[instr.dst];
    target = ((target - pos) >> 2) & ((1 << 19) - 1);

    // b.eq <target>
    emit32(0x54000000 | (target << 5), code, pos);

    for (uint32_t reg = 0; reg < RegistersCount; ++reg)
    {
        reg_changed_offset[reg] = pos;
    }

    codePos = pos;
}
// CFROUND: writes FPCR from a value built out of the two lowest bits of
// (src rotated right by imm32 & 63). Uses x18 as scratch and x8 as the FPCR staging register.
void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t scratchReg = 18;
    constexpr uint32_t fpcrStageReg = 8;
    const uint32_t srcReg = IntRegMap[instr.src];
    uint32_t pos = codePos;

    // ror scratchReg, src, #(imm32 & 63)
    emit32(ARMV8A::ROR_IMM | scratchReg | (srcReg << 5) | ((instr.getImm32() & 63) << 10) | (srcReg << 16), code, pos);

    // bfi fpcrStageReg, scratchReg, #40, #2
    emit32(0xB3580400 | fpcrStageReg | (scratchReg << 5), code, pos);

    // rbit scratchReg, fpcrStageReg
    emit32(0xDAC00000 | scratchReg | (fpcrStageReg << 5), code, pos);

    // msr fpcr, scratchReg
    emit32(0xD51B4400 | scratchReg, code, pos);

    codePos = pos;
}
// ISTORE: stores src to memory at [x2 + offset], where offset = (dst + masked imm32)
// masked to an 8-byte aligned value inside the selected range.
// The range is L3 when getModCond() >= StoreL3Condition, otherwise L1 or L2 by getModMem().
void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
{
    constexpr uint32_t addrReg = 18;
    const uint32_t srcReg = IntRegMap[instr.src];
    const uint32_t dstReg = IntRegMap[instr.dst];
    uint32_t pos = codePos;

    uint32_t displacement = instr.getImm32();
    if (instr.getModCond() < StoreL3Condition)
    {
        displacement &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
    }
    else
    {
        displacement &= RANDOMX_SCRATCHPAD_L3 - 1;
    }

    emitAddImmediate(addrReg, dstReg, displacement, code, pos);

    // and addrReg, addrReg, #mask   -- mask keeps bits [3, log2(range size))
    constexpr uint32_t andBase = 0x927d0000 | addrReg | (addrReg << 5);
    constexpr uint32_t andL1 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L1) - 4) << 10);
    constexpr uint32_t andL2 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L2) - 4) << 10);
    constexpr uint32_t andL3 = andBase | ((Log2(RANDOMX_SCRATCHPAD_L3) - 4) << 10);
    emit32((instr.getModCond() < StoreL3Condition) ? (instr.getModMem() ? andL1 : andL2) : andL3, code, pos);

    // str src, [x2, addrReg]
    emit32(0xF8206840 | srcReg | (addrReg << 16), code, pos);

    codePos = pos;
}
// NOP: emits no machine code and leaves codePos untouched.
void JitCompilerA64::h_NOP(Instruction& instr, uint32_t& codePos)
{
    // intentionally empty
}
#include "instruction_weights.hpp"
// Expands to the handler pointer &JitCompilerA64::h_<x> wrapped in REPN(..., WT(x)).
// NOTE(review): REPN and WT are presumably provided by instruction_weights.hpp, with REPN
// repeating its first argument WT(x) times so that each instruction occupies as many
// opcode slots as its weight - confirm against that header.
#define INST_HANDLE(x) REPN(&JitCompilerA64::h_##x, WT(x))
// Dispatch table indexed by Instruction::opcode (see generateProgram / generateProgramLight).
// The order of the entries below determines which opcode values map to which handler.
InstructionGeneratorA64 JitCompilerA64::engine[256] = {
// Integer instructions
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(ISUB_R)
INST_HANDLE(ISUB_M)
INST_HANDLE(IMUL_R)
INST_HANDLE(IMUL_M)
INST_HANDLE(IMULH_R)
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
INST_HANDLE(IMUL_RCP)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
// Floating-point instructions
INST_HANDLE(FSWAP_R)
INST_HANDLE(FADD_R)
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
INST_HANDLE(FSCAL_R)
INST_HANDLE(FMUL_R)
INST_HANDLE(FDIV_M)
INST_HANDLE(FSQRT_R)
// Control, rounding mode, store and no-op
INST_HANDLE(CBRANCH)
INST_HANDLE(CFROUND)
INST_HANDLE(ISTORE)
INST_HANDLE(NOP)
};
}