use super::decode::{resolve_modrm32, Operand};
use super::isa_int::{Cpu, StepOk};
use super::mmu::Mmu;
use super::regs::Reg32;
use super::Trap;
/// Decoded fields of a VEX prefix, built from either the two-byte (`C5`) or
/// the three-byte (`C4`) form by the constructors on this type.
#[derive(Copy, Clone, Debug)]
struct Vex {
/// Opcode-map selector; the two-byte form always stores 1 here, the
/// three-byte form stores the low five bits of its first payload byte.
map: u8,
/// Two-bit `pp` field taken from the low bits of the last payload byte.
pp: u8,
/// Single-bit `L` field; the dispatcher routes 0 to the `_128`/`_xmm`
/// handlers and 1 to the `_256`/`_ymm` handlers.
l: u8,
/// `W` bit (bit 7 of the second payload byte of the three-byte form);
/// always `false` for the two-byte form. Not read by any handler visible
/// in this part of the file, hence the `dead_code` allowance.
#[allow(dead_code)]
w: bool,
/// `vvvv` register specifier, stored already bit-inverted and masked to
/// four bits. Handlers in this file only use its low three bits.
vvvv: u8,
}
impl Vex {
    /// Decodes the single payload byte that follows a `C5` prefix.
    ///
    /// The two-byte form has no map or `W` field, so `map` is fixed to 1 and
    /// `w` to `false`. `vvvv` is stored with its bits inverted back.
    fn from_c5(b1: u8) -> Self {
        let pp = b1 & 0b11;
        let l = (b1 >> 2) & 1;
        let vvvv = !(b1 >> 3) & 0x0F;
        Vex {
            map: 1,
            pp,
            l,
            w: false,
            vvvv,
        }
    }

    /// Decodes the two payload bytes that follow a `C4` prefix.
    ///
    /// `b1` supplies the five-bit map selector; `b2` supplies `W` (bit 7),
    /// the inverted `vvvv` (bits 6..3), `L` (bit 2) and `pp` (bits 1..0).
    fn from_c4(b1: u8, b2: u8) -> Self {
        let map = b1 & 0x1F;
        let w = (b2 >> 7) != 0;
        let vvvv = !(b2 >> 3) & 0x0F;
        let l = (b2 >> 2) & 1;
        let pp = b2 & 0b11;
        Vex { map, pp, l, w, vvvv }
    }
}
/// Packs a decoded VEX prefix and opcode byte into one `u32` used as the
/// `opcode` payload of `Trap::UndefinedOpcode`.
///
/// Bit layout of the result:
///
/// | bits   | field                    |
/// |--------|--------------------------|
/// | 0..=7  | opcode byte              |
/// | 8      | `l`                      |
/// | 9..=10 | `pp`                     |
/// | 11..=13| low three bits of `map`  |
/// | 14..=17| `vvvv`                   |
/// | 18..=19| high two bits of `map`   |
///
/// `map` is a five-bit field, so shifting all of it to bit 11 would spill
/// into the `vvvv` bits whenever `map >= 8` and make different prefixes
/// produce the same id. The two high map bits are therefore placed above
/// `vvvv`; for every `map < 8` the value is the same as a plain
/// `map << 11` packing.
fn vex_opcode_id(vex: &Vex, opcode: u8) -> u32 {
    let map = u32::from(vex.map);
    u32::from(opcode)
        | (u32::from(vex.l) << 8)
        | (u32::from(vex.pp) << 9)
        | ((map & 0x7) << 11)
        | (u32::from(vex.vvvv) << 14)
        | ((map >> 3) << 18)
}
/// Decodes a VEX-prefixed instruction and runs the matching handler.
///
/// `prefix_byte` is the already-consumed first prefix byte (`0xC5` or
/// `0xC4`); the remaining prefix payload, the opcode byte and everything
/// after it are fetched from the instruction stream here or in the handler.
/// `entry_eip` is only used to fill in the trap raised for unrouted opcodes.
///
/// # Errors
/// Propagates any `Trap` from instruction fetch or from the selected
/// handler, and returns `Trap::UndefinedOpcode` (with `eip: entry_eip`) when
/// no arm matches the `(map, pp, l, opcode)` combination.
///
/// # Panics
/// Panics via `unreachable!` if `prefix_byte` is neither `0xC5` nor `0xC4`.
pub fn dispatch(
cpu: &mut Cpu,
mmu: &mut Mmu,
prefix_byte: u8,
entry_eip: u32,
) -> Result<StepOk, Trap> {
// Counted before decoding, so instructions that end in a trap are counted too.
cpu.bump_avx_count();
let vex = match prefix_byte {
0xC5 => {
let b1 = cpu.fetch_imm8_pub(mmu)?;
Vex::from_c5(b1)
}
0xC4 => {
let b1 = cpu.fetch_imm8_pub(mmu)?;
let b2 = cpu.fetch_imm8_pub(mmu)?;
Vex::from_c4(b1, b2)
}
_ => unreachable!("dispatch called with non-VEX prefix {prefix_byte:#x}"),
};
let opcode = cpu.fetch_imm8_pub(mmu)?;
// Routing key is (map, pp, l, opcode); l = 0 selects the 128-bit handler
// and l = 1 the 256-bit one wherever both exist.
match (vex.map, vex.pp, vex.l, opcode) {
// 128-bit XOR has a dedicated handler; the 256-bit form is routed with
// the other bitwise operations further down.
(1, 1, 0, 0xEF) => vpxor_128(cpu, mmu, &vex),
// Two-source packed-integer operations evaluated by `simd_op_apply`.
(1, 1, 0, 0x60) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckLBW),
(1, 1, 1, 0x60) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckLBW),
(1, 1, 0, 0x61) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckLWD),
(1, 1, 1, 0x61) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckLWD),
(1, 1, 0, 0x62) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckLDQ),
(1, 1, 1, 0x62) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckLDQ),
(1, 1, 0, 0x63) => vpbinop_128(cpu, mmu, &vex, SimdOp::PackSSWB),
(1, 1, 1, 0x63) => vpbinop_256(cpu, mmu, &vex, SimdOp::PackSSWB),
(1, 1, 0, 0x64) => vpbinop_128(cpu, mmu, &vex, SimdOp::CmpGtB),
(1, 1, 1, 0x64) => vpbinop_256(cpu, mmu, &vex, SimdOp::CmpGtB),
(1, 1, 0, 0x65) => vpbinop_128(cpu, mmu, &vex, SimdOp::CmpGtW),
(1, 1, 1, 0x65) => vpbinop_256(cpu, mmu, &vex, SimdOp::CmpGtW),
(1, 1, 0, 0x66) => vpbinop_128(cpu, mmu, &vex, SimdOp::CmpGtD),
(1, 1, 1, 0x66) => vpbinop_256(cpu, mmu, &vex, SimdOp::CmpGtD),
(1, 1, 0, 0x67) => vpbinop_128(cpu, mmu, &vex, SimdOp::PackUSWB),
(1, 1, 1, 0x67) => vpbinop_256(cpu, mmu, &vex, SimdOp::PackUSWB),
(1, 1, 0, 0x68) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckHBW),
(1, 1, 1, 0x68) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckHBW),
(1, 1, 0, 0x69) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckHWD),
(1, 1, 1, 0x69) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckHWD),
(1, 1, 0, 0x6A) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckHDQ),
(1, 1, 1, 0x6A) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckHDQ),
(1, 1, 0, 0x6B) => vpbinop_128(cpu, mmu, &vex, SimdOp::PackSSDW),
(1, 1, 1, 0x6B) => vpbinop_256(cpu, mmu, &vex, SimdOp::PackSSDW),
(1, 1, 0, 0x6C) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckLQDQ),
(1, 1, 1, 0x6C) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckLQDQ),
(1, 1, 0, 0x6D) => vpbinop_128(cpu, mmu, &vex, SimdOp::UnpckHQDQ),
(1, 1, 1, 0x6D) => vpbinop_256(cpu, mmu, &vex, SimdOp::UnpckHQDQ),
(1, 1, 0, 0x74) => vpbinop_128(cpu, mmu, &vex, SimdOp::CmpEqB),
(1, 1, 1, 0x74) => vpbinop_256(cpu, mmu, &vex, SimdOp::CmpEqB),
(1, 1, 0, 0x75) => vpbinop_128(cpu, mmu, &vex, SimdOp::CmpEqW),
(1, 1, 1, 0x75) => vpbinop_256(cpu, mmu, &vex, SimdOp::CmpEqW),
(1, 1, 0, 0xD4) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddQ),
(1, 1, 1, 0xD4) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddQ),
(1, 1, 0, 0xD5) => vpbinop_128(cpu, mmu, &vex, SimdOp::MulLowW),
(1, 1, 1, 0xD5) => vpbinop_256(cpu, mmu, &vex, SimdOp::MulLowW),
(1, 1, 0, 0xD8) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubSatUB),
(1, 1, 1, 0xD8) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubSatUB),
(1, 1, 0, 0xD9) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubSatUW),
(1, 1, 1, 0xD9) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubSatUW),
(1, 1, 0, 0xDA) => vpbinop_128(cpu, mmu, &vex, SimdOp::MinUB),
(1, 1, 1, 0xDA) => vpbinop_256(cpu, mmu, &vex, SimdOp::MinUB),
(1, 1, 0, 0xDC) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddSatUB),
(1, 1, 1, 0xDC) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddSatUB),
(1, 1, 0, 0xDD) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddSatUW),
(1, 1, 1, 0xDD) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddSatUW),
(1, 1, 0, 0xDE) => vpbinop_128(cpu, mmu, &vex, SimdOp::MaxUB),
(1, 1, 1, 0xDE) => vpbinop_256(cpu, mmu, &vex, SimdOp::MaxUB),
(1, 1, 0, 0xE0) => vpbinop_128(cpu, mmu, &vex, SimdOp::AvgB),
(1, 1, 1, 0xE0) => vpbinop_256(cpu, mmu, &vex, SimdOp::AvgB),
(1, 1, 0, 0xE3) => vpbinop_128(cpu, mmu, &vex, SimdOp::AvgW),
(1, 1, 1, 0xE3) => vpbinop_256(cpu, mmu, &vex, SimdOp::AvgW),
(1, 1, 0, 0xE4) => vpbinop_128(cpu, mmu, &vex, SimdOp::MulHighUW),
(1, 1, 1, 0xE4) => vpbinop_256(cpu, mmu, &vex, SimdOp::MulHighUW),
(1, 1, 0, 0xE5) => vpbinop_128(cpu, mmu, &vex, SimdOp::MulHighSW),
(1, 1, 1, 0xE5) => vpbinop_256(cpu, mmu, &vex, SimdOp::MulHighSW),
(1, 1, 0, 0xE8) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubSatSB),
(1, 1, 1, 0xE8) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubSatSB),
(1, 1, 0, 0xE9) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubSatSW),
(1, 1, 1, 0xE9) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubSatSW),
(1, 1, 0, 0xEA) => vpbinop_128(cpu, mmu, &vex, SimdOp::MinSW),
(1, 1, 1, 0xEA) => vpbinop_256(cpu, mmu, &vex, SimdOp::MinSW),
(1, 1, 0, 0xEC) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddSatSB),
(1, 1, 1, 0xEC) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddSatSB),
(1, 1, 0, 0xED) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddSatSW),
(1, 1, 1, 0xED) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddSatSW),
(1, 1, 0, 0xEE) => vpbinop_128(cpu, mmu, &vex, SimdOp::MaxSW),
(1, 1, 1, 0xEE) => vpbinop_256(cpu, mmu, &vex, SimdOp::MaxSW),
(1, 1, 0, 0xF4) => vpbinop_128(cpu, mmu, &vex, SimdOp::MulUDQ),
(1, 1, 1, 0xF4) => vpbinop_256(cpu, mmu, &vex, SimdOp::MulUDQ),
(1, 1, 0, 0xF5) => vpbinop_128(cpu, mmu, &vex, SimdOp::MAddWD),
(1, 1, 1, 0xF5) => vpbinop_256(cpu, mmu, &vex, SimdOp::MAddWD),
(1, 1, 0, 0xF6) => vpbinop_128(cpu, mmu, &vex, SimdOp::SadBW),
(1, 1, 1, 0xF6) => vpbinop_256(cpu, mmu, &vex, SimdOp::SadBW),
(1, 1, 0, 0xF8) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubB),
(1, 1, 1, 0xF8) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubB),
(1, 1, 0, 0xF9) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubW),
(1, 1, 1, 0xF9) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubW),
(1, 1, 0, 0xFA) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubD),
(1, 1, 1, 0xFA) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubD),
(1, 1, 0, 0xFB) => vpbinop_128(cpu, mmu, &vex, SimdOp::SubQ),
(1, 1, 1, 0xFB) => vpbinop_256(cpu, mmu, &vex, SimdOp::SubQ),
(1, 1, 0, 0xFC) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddB),
(1, 1, 1, 0xFC) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddB),
(1, 1, 0, 0xFD) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddW),
(1, 1, 1, 0xFD) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddW),
(1, 1, 0, 0xFE) => vpbinop_128(cpu, mmu, &vex, SimdOp::AddD),
(1, 1, 1, 0xFE) => vpbinop_256(cpu, mmu, &vex, SimdOp::AddD),
// Immediate-controlled shuffles: opcode 0x70, with pp choosing the variant.
(1, 1, 0, 0x70) => vpshuf_xmm(cpu, mmu, ShufKind::Dwords),
(1, 1, 1, 0x70) => vpshuf_ymm(cpu, mmu, ShufKind::Dwords),
(1, 2, 0, 0x70) => vpshuf_xmm(cpu, mmu, ShufKind::HighWords),
(1, 2, 1, 0x70) => vpshuf_ymm(cpu, mmu, ShufKind::HighWords),
(1, 3, 0, 0x70) => vpshuf_xmm(cpu, mmu, ShufKind::LowWords),
(1, 3, 1, 0x70) => vpshuf_ymm(cpu, mmu, ShufKind::LowWords),
// Bitwise operations.
(1, 1, 1, 0xEF) => vpbitwise_256(cpu, mmu, &vex, BitwiseOp::Xor),
(1, 1, 0, 0xDB) => vpbitwise_128(cpu, mmu, &vex, BitwiseOp::And),
(1, 1, 1, 0xDB) => vpbitwise_256(cpu, mmu, &vex, BitwiseOp::And),
(1, 1, 0, 0xDF) => vpbitwise_128(cpu, mmu, &vex, BitwiseOp::AndNot),
(1, 1, 1, 0xDF) => vpbitwise_256(cpu, mmu, &vex, BitwiseOp::AndNot),
(1, 1, 0, 0xEB) => vpbitwise_128(cpu, mmu, &vex, BitwiseOp::Or),
(1, 1, 1, 0xEB) => vpbitwise_256(cpu, mmu, &vex, BitwiseOp::Or),
// Shift-by-immediate groups; the ModRM reg field picks the operation
// inside each group handler.
(1, 1, 0, 0x71) => vex_group12_xmm(cpu, mmu, &vex),
(1, 1, 1, 0x71) => vex_group12_ymm(cpu, mmu, &vex),
(1, 1, 0, 0x72) => vex_group13_xmm(cpu, mmu, &vex),
(1, 1, 1, 0x72) => vex_group13_ymm(cpu, mmu, &vex),
(1, 1, 0, 0x73) => vex_group14_xmm(cpu, mmu, &vex),
(1, 1, 1, 0x73) => vex_group14_ymm(cpu, mmu, &vex),
// Vector loads and stores.
(1, 1, 0, 0x6F) => vmovdqa_load_128(cpu, mmu),
(1, 1, 0, 0x7F) => vmovdqa_store_128(cpu, mmu),
// NOTE(review): opcode 0x76 is only routed for l = 1; the l = 0 form
// falls through to UndefinedOpcode. Confirm that is intended.
(1, 1, 1, 0x76) => vpcmpeqd_256(cpu, mmu, &vex),
(1, 1, 1, 0x6F) => vmovdqa_load_256(cpu, mmu),
(1, 1, 1, 0x7F) => vmovdqa_store_256(cpu, mmu),
(1, 0, 0, 0x77) => vzeroupper(cpu),
(1, 0, 0, 0x11) => vmovups_store_128(cpu, mmu),
(1, 0, 0, 0x10) => vmovaps_load_128(cpu, mmu),
(1, 0, 1, 0x10) => vmovaps_load_256(cpu, mmu),
(1, 0, 1, 0x11) => vmovups_store_256(cpu, mmu),
(1, 0, 0, 0x28) => vmovaps_load_128(cpu, mmu),
(1, 0, 1, 0x28) => vmovaps_load_256(cpu, mmu),
(1, 0, 0, 0x29) => vmovaps_store_128(cpu, mmu),
// NOTE(review): the 128-bit 0x29 store uses the `vmovaps` handler but the
// 256-bit one uses `vmovups`; verify the two handlers are interchangeable.
(1, 0, 1, 0x29) => vmovups_store_256(cpu, mmu),
(1, 2, 0, 0x6F) => vmovdqa_load_128(cpu, mmu),
(1, 2, 1, 0x6F) => vmovdqa_load_256(cpu, mmu),
(1, 2, 0, 0x7F) => vmovdqa_store_128(cpu, mmu),
// Element insert / extract and scalar moves.
(3, 1, 0, 0x20) => vpinsrb_128(cpu, mmu, &vex),
(1, 1, 0, 0xC4) => vpinsrw_128(cpu, mmu, &vex),
(1, 1, 0, 0x6E) => vmovd_load(cpu, mmu),
(1, 1, 0, 0x7E) => vmovd_store(cpu, mmu),
(1, 2, 0, 0x7E) => vmovq_load(cpu, mmu),
(1, 1, 0, 0xD6) => vmovq_store(cpu, mmu),
(3, 1, 0, 0x14) => vpextrb_128(cpu, mmu),
(1, 1, 0, 0xC5) => vpextrw_imm_128(cpu, mmu),
(3, 1, 0, 0x15) => vpextrw_mem_128(cpu, mmu),
(3, 1, 0, 0x16) => vpextrd_128(cpu, mmu),
(1, 1, 0, 0xD7) => vpmovmskb_128(cpu, mmu),
(1, 1, 1, 0xD7) => vpmovmskb_256(cpu, mmu),
// Broadcasts; the final bool tells `vpbroadcast` whether to fill the
// upper 128 bits as well.
(2, 1, 0, 0x78) => vpbroadcast(cpu, mmu, BroadcastKind::B, false),
(2, 1, 1, 0x78) => vpbroadcast(cpu, mmu, BroadcastKind::B, true),
(2, 1, 0, 0x79) => vpbroadcast(cpu, mmu, BroadcastKind::W, false),
(2, 1, 1, 0x79) => vpbroadcast(cpu, mmu, BroadcastKind::W, true),
(2, 1, 0, 0x58) => vpbroadcast(cpu, mmu, BroadcastKind::D, false),
(2, 1, 1, 0x58) => vpbroadcast(cpu, mmu, BroadcastKind::D, true),
(2, 1, 0, 0x59) => vpbroadcast(cpu, mmu, BroadcastKind::Q, false),
(2, 1, 1, 0x59) => vpbroadcast(cpu, mmu, BroadcastKind::Q, true),
(2, 1, 1, 0x5A) => vbroadcasti128(cpu, mmu),
// Opcodes 0xE7 and 0x2B reuse the ordinary store handlers.
(1, 1, 0, 0xE7) => vmovaps_store_128(cpu, mmu),
(1, 1, 1, 0xE7) => vmovups_store_256(cpu, mmu),
(1, 0, 0, 0x2B) => vmovaps_store_128(cpu, mmu),
(1, 0, 1, 0x2B) => vmovups_store_256(cpu, mmu),
(1, 2, 1, 0x7F) => vmovdqa_store_256(cpu, mmu),
// General-purpose-register shifts whose count comes from the vvvv register.
(2, 1, 0, 0xF7) => bmi2_shift_x(cpu, mmu, &vex, ShiftKind::Shl),
(2, 2, 0, 0xF7) => bmi2_shift_x(cpu, mmu, &vex, ShiftKind::Sar),
(2, 3, 0, 0xF7) => bmi2_shift_x(cpu, mmu, &vex, ShiftKind::Shr),
// Anything not routed above is reported as an undefined opcode at the
// address where the instruction started.
_ => {
Err(Trap::UndefinedOpcode {
eip: entry_eip,
opcode: vex_opcode_id(&vex, opcode),
})
}
}
}
/// Decodes the ModRM operand of a 128-bit instruction.
///
/// Returns the destination register index (ModRM `reg`, low three bits) and
/// the 128-bit value of the second source, which is either the register
/// named by ModRM `rm` or sixteen little-endian bytes read from memory.
/// EIP is advanced past the bytes `resolve_modrm32` reports as consumed.
///
/// # Errors
/// Propagates traps from instruction fetch, operand resolution and the
/// memory read.
fn read_xmm_dst_and_src2(cpu: &mut Cpu, mmu: &Mmu) -> Result<(usize, u128), Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, length) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(length as u32);

    let second_source = match operand {
        Operand::Mem32(offset) => {
            let linear = cpu.seg_translate(offset);
            let raw = mmu.read(linear, 16)?;
            let mut le = [0u8; 16];
            le.copy_from_slice(&raw);
            u128::from_le_bytes(le)
        }
        Operand::Reg32(_) => cpu.xmm[(modrm.rm & 0x7) as usize],
    };
    let destination = (modrm.reg & 0x7) as usize;
    Ok((destination, second_source))
}
/// 128-bit XOR: `xmm[dst] = xmm[vvvv] ^ src2`, with the destination's upper
/// 128 bits cleared.
fn vpxor_128(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (dest, rhs) = read_xmm_dst_and_src2(cpu, mmu)?;
    let lhs = cpu.xmm[(vex.vvvv & 0x7) as usize];
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = lhs ^ rhs;
    Ok(StepOk::Continued)
}
/// Two-source packed-integer operations evaluated per 128-bit lane by
/// `simd_op_apply`. Suffix letters give the element width: `B` = 8 bits,
/// `W` = 16, `D` = 32, `Q` = 64.
#[derive(Copy, Clone)]
enum SimdOp {
// Wrapping add / subtract.
AddB,
AddW,
AddD,
AddQ,
SubB,
SubW,
SubD,
SubQ,
// Saturating add / subtract; `U` = unsigned, `S` = signed elements.
AddSatUB,
AddSatUW,
AddSatSB,
AddSatSW,
SubSatUB,
SubSatUW,
SubSatSB,
SubSatSW,
// Element-wise minimum / maximum (unsigned bytes, signed words).
MinUB,
MinSW,
MaxUB,
MaxSW,
// Comparisons producing an all-ones or all-zeros element; the
// greater-than forms compare elements as signed values.
CmpEqB,
CmpEqW,
CmpGtB,
CmpGtW,
CmpGtD,
// Multiplies: low 16 bits, high 16 bits (unsigned / signed), and the
// 32x32 -> 64 unsigned product of the low dword of each qword.
MulLowW,
MulHighUW,
MulHighSW,
MulUDQ,
// Signed word multiply with adjacent products summed into dwords.
MAddWD,
// Sum of absolute byte differences, one sum per 8-byte half.
SadBW,
// Rounded unsigned average.
AvgB,
AvgW,
// Narrowing with saturation: `SS` = signed result, `US` = unsigned result.
PackSSWB,
PackSSDW,
PackUSWB,
// Interleave elements from the low (`L`) or high (`H`) half of both sources.
UnpckLBW,
UnpckLWD,
UnpckLDQ,
UnpckLQDQ,
UnpckHBW,
UnpckHWD,
UnpckHDQ,
UnpckHQDQ,
}
/// Splits a 128-bit value into sixteen bytes, least-significant first.
#[inline]
fn lanes_u8(v: u128) -> [u8; 16] {
    core::array::from_fn(|index| (v >> (8 * index)) as u8)
}
/// Reassembles sixteen bytes (least-significant first) into a 128-bit value.
#[inline]
fn from_lanes_u8(a: [u8; 16]) -> u128 {
    a.iter()
        .rev()
        .fold(0u128, |acc, &byte| (acc << 8) | u128::from(byte))
}
/// Splits a 128-bit value into eight 16-bit words, least-significant first.
#[inline]
fn lanes_u16(v: u128) -> [u16; 8] {
    core::array::from_fn(|index| (v >> (16 * index)) as u16)
}
/// Reassembles eight 16-bit words (least-significant first) into a 128-bit
/// value.
#[inline]
fn from_lanes_u16(a: [u16; 8]) -> u128 {
    a.iter()
        .rev()
        .fold(0u128, |acc, &word| (acc << 16) | u128::from(word))
}
/// Splits a 128-bit value into four 32-bit dwords, least-significant first.
#[inline]
fn lanes_u32(v: u128) -> [u32; 4] {
    let raw = v.to_le_bytes();
    let mut dwords = [0u32; 4];
    for (slot, chunk) in dwords.iter_mut().zip(raw.chunks_exact(4)) {
        *slot = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    }
    dwords
}
/// Reassembles four 32-bit dwords (least-significant first) into a 128-bit
/// value.
#[inline]
fn from_lanes_u32(a: [u32; 4]) -> u128 {
    a.iter()
        .enumerate()
        .fold(0u128, |acc, (index, &dword)| {
            acc | (u128::from(dword) << (32 * index))
        })
}
/// Splits a 128-bit value into its low and high 64-bit halves, low first.
#[inline]
fn lanes_u64(v: u128) -> [u64; 2] {
    core::array::from_fn(|index| (v >> (64 * index)) as u64)
}
/// Joins a low and a high 64-bit half (in that order) into a 128-bit value.
#[inline]
fn from_lanes_u64(a: [u64; 2]) -> u128 {
    let [low, high] = a;
    (u128::from(high) << 64) | u128::from(low)
}
/// Evaluates one packed-integer operation on a single 128-bit lane.
///
/// `src1` is the first source (the `vvvv` register in the callers) and
/// `src2` the second (register or memory). Element order is little-endian:
/// element 0 is the least-significant part of each value.
fn simd_op_apply(op: SimdOp, src1: u128, src2: u128) -> u128 {
    /// Applies `f` to each pair of bytes.
    fn zip8(x: u128, y: u128, f: impl Fn(u8, u8) -> u8) -> u128 {
        let (l, r) = (lanes_u8(x), lanes_u8(y));
        from_lanes_u8(core::array::from_fn(|k| f(l[k], r[k])))
    }
    /// Applies `f` to each pair of 16-bit words.
    fn zip16(x: u128, y: u128, f: impl Fn(u16, u16) -> u16) -> u128 {
        let (l, r) = (lanes_u16(x), lanes_u16(y));
        from_lanes_u16(core::array::from_fn(|k| f(l[k], r[k])))
    }
    /// Applies `f` to each pair of 32-bit dwords.
    fn zip32(x: u128, y: u128, f: impl Fn(u32, u32) -> u32) -> u128 {
        let (l, r) = (lanes_u32(x), lanes_u32(y));
        from_lanes_u32(core::array::from_fn(|k| f(l[k], r[k])))
    }
    /// Applies `f` to each pair of 64-bit qwords.
    fn zip64(x: u128, y: u128, f: impl Fn(u64, u64) -> u64) -> u128 {
        let (l, r) = (lanes_u64(x), lanes_u64(y));
        from_lanes_u64([f(l[0], r[0]), f(l[1], r[1])])
    }
    /// Like `zip8`, but `f` sees the bytes reinterpreted as signed.
    fn zip8s(x: u128, y: u128, f: impl Fn(i8, i8) -> i8) -> u128 {
        zip8(x, y, |p, q| f(p as i8, q as i8) as u8)
    }
    /// Like `zip16`, but `f` sees the words reinterpreted as signed.
    fn zip16s(x: u128, y: u128, f: impl Fn(i16, i16) -> i16) -> u128 {
        zip16(x, y, |p, q| f(p as i16, q as i16) as u16)
    }
    /// Alternates elements of `l` and `r`, starting at index `first` of each.
    fn interleave<T: Copy, const N: usize>(l: [T; N], r: [T; N], first: usize) -> [T; N] {
        core::array::from_fn(|k| {
            let from = first + k / 2;
            if k % 2 == 0 {
                l[from]
            } else {
                r[from]
            }
        })
    }

    match op {
        // Wrapping arithmetic.
        SimdOp::AddB => zip8(src1, src2, u8::wrapping_add),
        SimdOp::AddW => zip16(src1, src2, u16::wrapping_add),
        SimdOp::AddD => zip32(src1, src2, u32::wrapping_add),
        SimdOp::AddQ => zip64(src1, src2, u64::wrapping_add),
        SimdOp::SubB => zip8(src1, src2, u8::wrapping_sub),
        SimdOp::SubW => zip16(src1, src2, u16::wrapping_sub),
        SimdOp::SubD => zip32(src1, src2, u32::wrapping_sub),
        SimdOp::SubQ => zip64(src1, src2, u64::wrapping_sub),

        // Saturating arithmetic.
        SimdOp::AddSatUB => zip8(src1, src2, u8::saturating_add),
        SimdOp::AddSatUW => zip16(src1, src2, u16::saturating_add),
        SimdOp::AddSatSB => zip8s(src1, src2, i8::saturating_add),
        SimdOp::AddSatSW => zip16s(src1, src2, i16::saturating_add),
        SimdOp::SubSatUB => zip8(src1, src2, u8::saturating_sub),
        SimdOp::SubSatUW => zip16(src1, src2, u16::saturating_sub),
        SimdOp::SubSatSB => zip8s(src1, src2, i8::saturating_sub),
        SimdOp::SubSatSW => zip16s(src1, src2, i16::saturating_sub),

        // Minimum / maximum.
        SimdOp::MinUB => zip8(src1, src2, |p, q| p.min(q)),
        SimdOp::MaxUB => zip8(src1, src2, |p, q| p.max(q)),
        SimdOp::MinSW => zip16s(src1, src2, |p, q| p.min(q)),
        SimdOp::MaxSW => zip16s(src1, src2, |p, q| p.max(q)),

        // Comparisons: a true result becomes 1, and negating it fills the
        // element with ones; false stays zero.
        SimdOp::CmpEqB => zip8(src1, src2, |p, q| u8::from(p == q).wrapping_neg()),
        SimdOp::CmpEqW => zip16(src1, src2, |p, q| u16::from(p == q).wrapping_neg()),
        SimdOp::CmpGtB => zip8s(src1, src2, |p, q| -i8::from(p > q)),
        SimdOp::CmpGtW => zip16s(src1, src2, |p, q| -i16::from(p > q)),
        SimdOp::CmpGtD => zip32(src1, src2, |p, q| {
            u32::from((p as i32) > (q as i32)).wrapping_neg()
        }),

        // Multiplies.
        SimdOp::MulLowW => zip16(src1, src2, u16::wrapping_mul),
        SimdOp::MulHighUW => zip16(src1, src2, |p, q| {
            ((u32::from(p) * u32::from(q)) >> 16) as u16
        }),
        SimdOp::MulHighSW => zip16(src1, src2, |p, q| {
            ((i32::from(p as i16) * i32::from(q as i16)) >> 16) as u16
        }),
        // Only the low dword of each qword takes part; the product of two
        // 32-bit values always fits in 64 bits.
        SimdOp::MulUDQ => zip64(src1, src2, |p, q| (p & 0xFFFF_FFFF) * (q & 0xFFFF_FFFF)),
        SimdOp::MAddWD => {
            let (l, r) = (lanes_u16(src1), lanes_u16(src2));
            let product = |k: usize| i32::from(l[k] as i16) * i32::from(r[k] as i16);
            from_lanes_u32(core::array::from_fn(|k| {
                product(2 * k).wrapping_add(product(2 * k + 1)) as u32
            }))
        }
        SimdOp::SadBW => {
            let (l, r) = (lanes_u8(src1), lanes_u8(src2));
            // Each sum is at most 8 * 255, so it fits easily in the qword.
            let half_sum = |half: usize| -> u64 {
                (0..8)
                    .map(|k| u64::from(l[half * 8 + k].abs_diff(r[half * 8 + k])))
                    .sum()
            };
            from_lanes_u64([half_sum(0), half_sum(1)])
        }

        // Rounded averages, computed in a wider type so the +1 cannot overflow.
        SimdOp::AvgB => zip8(src1, src2, |p, q| {
            ((u16::from(p) + u16::from(q) + 1) >> 1) as u8
        }),
        SimdOp::AvgW => zip16(src1, src2, |p, q| {
            ((u32::from(p) + u32::from(q) + 1) >> 1) as u16
        }),

        // Packs: the low half of the result comes from src1, the high half
        // from src2.
        SimdOp::PackSSWB => {
            let (l, r) = (lanes_u16(src1), lanes_u16(src2));
            from_lanes_u8(core::array::from_fn(|k| {
                let word = if k < 8 { l[k] } else { r[k - 8] };
                (word as i16).clamp(-128, 127) as u8
            }))
        }
        SimdOp::PackSSDW => {
            let (l, r) = (lanes_u32(src1), lanes_u32(src2));
            from_lanes_u16(core::array::from_fn(|k| {
                let dword = if k < 4 { l[k] } else { r[k - 4] };
                (dword as i32).clamp(-32768, 32767) as u16
            }))
        }
        SimdOp::PackUSWB => {
            let (l, r) = (lanes_u16(src1), lanes_u16(src2));
            from_lanes_u8(core::array::from_fn(|k| {
                let word = if k < 8 { l[k] } else { r[k - 8] };
                (word as i16).clamp(0, 255) as u8
            }))
        }

        // Unpacks: interleave the low or high half of both sources.
        SimdOp::UnpckLBW => from_lanes_u8(interleave(lanes_u8(src1), lanes_u8(src2), 0)),
        SimdOp::UnpckHBW => from_lanes_u8(interleave(lanes_u8(src1), lanes_u8(src2), 8)),
        SimdOp::UnpckLWD => from_lanes_u16(interleave(lanes_u16(src1), lanes_u16(src2), 0)),
        SimdOp::UnpckHWD => from_lanes_u16(interleave(lanes_u16(src1), lanes_u16(src2), 4)),
        SimdOp::UnpckLDQ => from_lanes_u32(interleave(lanes_u32(src1), lanes_u32(src2), 0)),
        SimdOp::UnpckHDQ => from_lanes_u32(interleave(lanes_u32(src1), lanes_u32(src2), 2)),
        SimdOp::UnpckLQDQ => from_lanes_u64(interleave(lanes_u64(src1), lanes_u64(src2), 0)),
        SimdOp::UnpckHQDQ => from_lanes_u64(interleave(lanes_u64(src1), lanes_u64(src2), 1)),
    }
}
/// 128-bit form of a two-source packed-integer instruction:
/// `xmm[dst] = op(xmm[vvvv], src2)`, with the destination's upper 128 bits
/// cleared.
fn vpbinop_128(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex, op: SimdOp) -> Result<StepOk, Trap> {
    let (dest, rhs) = read_xmm_dst_and_src2(cpu, mmu)?;
    let lhs = cpu.xmm[(vex.vvvv & 0x7) as usize];
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = simd_op_apply(op, lhs, rhs);
    Ok(StepOk::Continued)
}
/// 256-bit form of a two-source packed-integer instruction. The operation is
/// applied independently to the low (`xmm`) and high (`ymm_high`) 128-bit
/// halves of the `vvvv` register and the second source.
fn vpbinop_256(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex, op: SimdOp) -> Result<StepOk, Trap> {
    let (dest, rhs_low, rhs_high) = read_ymm_dst_and_src2(cpu, mmu)?;
    let lhs = (vex.vvvv & 0x7) as usize;
    let low = simd_op_apply(op, cpu.xmm[lhs], rhs_low);
    cpu.xmm[dest] = low;
    let high = simd_op_apply(op, cpu.ymm_high[lhs], rhs_high);
    cpu.ymm_high[dest] = high;
    Ok(StepOk::Continued)
}
/// Which elements `pshuf_lane_128` rearranges under control of its 8-bit
/// immediate.
#[derive(Copy, Clone)]
enum ShufKind {
/// Permute all four 32-bit dwords of the lane.
Dwords,
/// Permute the four 16-bit words of the low 64 bits; the high 64 bits
/// pass through unchanged.
LowWords,
/// Permute the four 16-bit words of the high 64 bits; the low 64 bits
/// pass through unchanged.
HighWords,
}
/// Shuffles one 128-bit lane according to `imm`.
///
/// `imm` holds four two-bit selectors; selector `n` (bits `2n+1..2n`) names
/// the source element copied into output position `n`. What counts as an
/// "element" and which half is touched depends on `kind`.
fn pshuf_lane_128(src: u128, kind: ShufKind, imm: u8) -> u128 {
    // Source index chosen for output position `slot`.
    let selector = |slot: u32| u32::from((imm >> (2 * slot)) & 0b11);
    // Permutes the four words of one 64-bit half.
    let permute_words = |quad: u64| -> u64 {
        (0..4u32).fold(0u64, |acc, slot| {
            let word = (quad >> (16 * selector(slot))) & 0xFFFF;
            acc | (word << (16 * slot))
        })
    };

    let low_half = src as u64;
    let high_half = (src >> 64) as u64;

    match kind {
        ShufKind::Dwords => (0..4u32).fold(0u128, |acc, slot| {
            let dword = (src >> (32 * selector(slot))) & 0xFFFF_FFFF;
            acc | (dword << (32 * slot))
        }),
        ShufKind::LowWords => {
            (u128::from(high_half) << 64) | u128::from(permute_words(low_half))
        }
        ShufKind::HighWords => {
            (u128::from(permute_words(high_half)) << 64) | u128::from(low_half)
        }
    }
}
/// 128-bit immediate shuffle: decodes the ModRM source (register or sixteen
/// bytes of memory), then the trailing immediate, and writes the shuffled
/// value to the ModRM `reg` register with its upper 128 bits cleared.
fn vpshuf_xmm(cpu: &mut Cpu, mmu: &Mmu, kind: ShufKind) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, length) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(length as u32);

    let input: u128 = match operand {
        Operand::Mem32(offset) => {
            let linear = cpu.seg_translate(offset);
            let raw = mmu.read(linear, 16)?;
            let mut le = [0u8; 16];
            le.copy_from_slice(&raw);
            u128::from_le_bytes(le)
        }
        Operand::Reg32(_) => cpu.xmm[(modrm.rm & 0x7) as usize],
    };

    // The immediate follows the ModRM bytes, so it is fetched only after the
    // operand has been resolved and EIP advanced.
    let control = cpu.fetch_imm8_pub(mmu)?;
    let dest = (modrm.reg & 0x7) as usize;
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = pshuf_lane_128(input, kind, control);
    Ok(StepOk::Continued)
}
/// 256-bit immediate shuffle: the same shuffle is applied independently to
/// the low and high 128-bit halves of the source, using one shared
/// immediate. A memory source is read as two consecutive 16-byte chunks.
fn vpshuf_ymm(cpu: &mut Cpu, mmu: &Mmu, kind: ShufKind) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, length) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(length as u32);

    let (input_low, input_high): (u128, u128) = match operand {
        Operand::Mem32(offset) => {
            let linear = cpu.seg_translate(offset);
            let first = mmu.read(linear, 16)?;
            let second = mmu.read(linear.wrapping_add(16), 16)?;
            let mut low_bytes = [0u8; 16];
            let mut high_bytes = [0u8; 16];
            low_bytes.copy_from_slice(&first);
            high_bytes.copy_from_slice(&second);
            (
                u128::from_le_bytes(low_bytes),
                u128::from_le_bytes(high_bytes),
            )
        }
        Operand::Reg32(_) => {
            let source = (modrm.rm & 0x7) as usize;
            (cpu.xmm[source], cpu.ymm_high[source])
        }
    };

    // The immediate follows the ModRM bytes in the instruction stream.
    let control = cpu.fetch_imm8_pub(mmu)?;
    let dest = (modrm.reg & 0x7) as usize;
    cpu.xmm[dest] = pshuf_lane_128(input_low, kind, control);
    cpu.ymm_high[dest] = pshuf_lane_128(input_high, kind, control);
    Ok(StepOk::Continued)
}
/// Bitwise operations evaluated by `bitwise_apply` on 128-bit values.
#[derive(Copy, Clone)]
enum BitwiseOp {
/// `a & b`.
And,
/// `!a & b` — the first operand is the one that is inverted.
AndNot,
/// `a | b`.
Or,
/// `a ^ b`.
Xor,
}
/// Combines `a` and `b` with the requested bitwise operation. For
/// `AndNot` it is `a` (the first operand) that is complemented.
fn bitwise_apply(op: BitwiseOp, a: u128, b: u128) -> u128 {
    let lhs = if matches!(op, BitwiseOp::AndNot) { !a } else { a };
    match op {
        BitwiseOp::And | BitwiseOp::AndNot => lhs & b,
        BitwiseOp::Or => lhs | b,
        BitwiseOp::Xor => lhs ^ b,
    }
}
/// 128-bit bitwise instruction: `xmm[dst] = op(xmm[vvvv], src2)`, with the
/// destination's upper 128 bits cleared.
fn vpbitwise_128(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex, op: BitwiseOp) -> Result<StepOk, Trap> {
    let (dest, rhs) = read_xmm_dst_and_src2(cpu, mmu)?;
    let lhs = cpu.xmm[(vex.vvvv & 0x7) as usize];
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = bitwise_apply(op, lhs, rhs);
    Ok(StepOk::Continued)
}
/// 256-bit bitwise instruction; the operation is applied separately to the
/// low (`xmm`) and high (`ymm_high`) halves.
fn vpbitwise_256(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex, op: BitwiseOp) -> Result<StepOk, Trap> {
    let (dest, rhs_low, rhs_high) = read_ymm_dst_and_src2(cpu, mmu)?;
    let lhs = (vex.vvvv & 0x7) as usize;
    let low = bitwise_apply(op, cpu.xmm[lhs], rhs_low);
    cpu.xmm[dest] = low;
    let high = bitwise_apply(op, cpu.ymm_high[lhs], rhs_high);
    cpu.ymm_high[dest] = high;
    Ok(StepOk::Continued)
}
/// Shift direction for the general-purpose-register shifts handled by
/// `bmi2_shift_x`.
#[derive(Copy, Clone)]
enum ShiftKind {
/// Logical left shift.
Shl,
/// Logical (zero-filling) right shift.
Shr,
/// Arithmetic (sign-filling) right shift.
Sar,
}
/// Shifts a 32-bit register or memory operand by a count held in the
/// register named by `vvvv`, and stores the result in the ModRM `reg`
/// register. Only the low five bits of the count are used.
fn bmi2_shift_x(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex, kind: ShiftKind) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, length) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(length as u32);

    let input = match operand {
        Operand::Mem32(offset) => mmu.load32(cpu.seg_translate(offset))?,
        Operand::Reg32(reg) => cpu.regs.get32(reg),
    };
    let count_reg = Reg32::from_bits(vex.vvvv & 0x7);
    // Masked to 0..=31, so the plain shift operators below cannot overflow.
    let amount = cpu.regs.get32(count_reg) & 31;

    let output = match kind {
        ShiftKind::Shl => input << amount,
        ShiftKind::Shr => input >> amount,
        ShiftKind::Sar => ((input as i32) >> amount) as u32,
    };
    cpu.regs.set32(Reg32::from_bits(modrm.reg & 0x7), output);
    Ok(StepOk::Continued)
}
/// Stores the 128-bit register named by ModRM `reg` to the ModRM `rm`
/// destination. A register destination also has its upper 128 bits cleared;
/// a memory destination receives sixteen little-endian bytes.
fn vmovups_store_128(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, length) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(length as u32);

    let value = cpu.xmm[(modrm.reg & 0x7) as usize];
    match operand {
        Operand::Mem32(offset) => {
            let linear = cpu.seg_translate(offset);
            mmu.write(linear, &value.to_le_bytes())?;
        }
        Operand::Reg32(_) => {
            let target = (modrm.rm & 0x7) as usize;
            cpu.ymm_high[target] = 0;
            cpu.xmm[target] = value;
        }
    }
    Ok(StepOk::Continued)
}
/// Decodes the operands of a shift-by-immediate group instruction.
///
/// Returns `(src, sub, dst, imm)`: `src` is the ModRM `rm` register index,
/// `sub` the ModRM `reg` field (the operation selector within the group),
/// `dst` the low three bits of `vvvv`, and `imm` the immediate byte that
/// follows the ModRM byte.
///
/// NOTE(review): the ModRM addressing mode is not examined here, so a
/// memory-form encoding would not have its displacement bytes consumed.
/// Confirm callers only reach this with register-form encodings.
fn read_group_shift_operands(
    cpu: &mut Cpu,
    mmu: &Mmu,
    vex: &Vex,
) -> Result<(usize, u8, usize, u32), Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let count = cpu.fetch_imm8_pub(mmu)?;
    Ok((
        (modrm.rm & 0x7) as usize,
        modrm.reg & 0x7,
        (vex.vvvv & 0x7) as usize,
        u32::from(count),
    ))
}
/// Shift direction for the per-element vector shifts (`shift_lanes_*`).
#[derive(Copy, Clone)]
enum LaneShift {
/// Logical (zero-filling) right shift.
Srl,
/// Arithmetic (sign-filling) right shift; not valid for the whole-lane
/// byte shift, which panics on it.
Sra,
/// Logical left shift.
Sll,
}
/// Shifts each of the eight 16-bit words of `v` by `count` bits.
///
/// A count of 16 or more yields zero for the logical shifts and a word
/// filled with the sign bit for the arithmetic shift.
fn shift_lanes_u16(v: u128, kind: LaneShift, count: u32) -> u128 {
    let out_of_range = count >= 16;
    (0..8u32).fold(0u128, |acc, index| {
        let position = 16 * index;
        let word = (v >> position) as u16;
        let shifted: u16 = match kind {
            // Shifting by 15 already replicates the sign bit everywhere, so
            // larger counts clamp to it.
            LaneShift::Sra => ((word as i16) >> count.min(15)) as u16,
            _ if out_of_range => 0,
            LaneShift::Sll => word << count,
            LaneShift::Srl => word >> count,
        };
        acc | (u128::from(shifted) << position)
    })
}
/// Shifts each of the four 32-bit dwords of `v` by `count` bits.
///
/// A count of 32 or more yields zero for the logical shifts and a dword
/// filled with the sign bit for the arithmetic shift.
fn shift_lanes_u32(v: u128, kind: LaneShift, count: u32) -> u128 {
    let out_of_range = count >= 32;
    (0..4u32).fold(0u128, |acc, index| {
        let position = 32 * index;
        let dword = (v >> position) as u32;
        let shifted: u32 = match kind {
            // Shifting by 31 already replicates the sign bit everywhere.
            LaneShift::Sra => ((dword as i32) >> count.min(31)) as u32,
            _ if out_of_range => 0,
            LaneShift::Sll => dword << count,
            LaneShift::Srl => dword >> count,
        };
        acc | (u128::from(shifted) << position)
    })
}
/// Shifts each of the two 64-bit qwords of `v` by `count` bits.
///
/// A count of 64 or more yields zero for the logical shifts and a qword
/// filled with the sign bit for the arithmetic shift.
fn shift_lanes_u64(v: u128, kind: LaneShift, count: u32) -> u128 {
    let out_of_range = count >= 64;
    (0..2u32).fold(0u128, |acc, index| {
        let position = 64 * index;
        let qword = (v >> position) as u64;
        let shifted: u64 = match kind {
            // Shifting by 63 already replicates the sign bit everywhere.
            LaneShift::Sra => ((qword as i64) >> count.min(63)) as u64,
            _ if out_of_range => 0,
            LaneShift::Sll => qword << count,
            LaneShift::Srl => qword >> count,
        };
        acc | (u128::from(shifted) << position)
    })
}
/// Shifts a whole 128-bit lane left or right by `count` bytes. Counts of 16
/// or more produce zero.
///
/// # Panics
/// Panics if `kind` is `LaneShift::Sra`; there is no arithmetic byte shift.
fn shift_lanes_byte_128(v: u128, kind: LaneShift, count: u32) -> u128 {
    // At most 128 after the clamp; `checked_sh*` returns `None` exactly when
    // the bit count reaches the full width.
    let bit_count = 8 * count.min(16);
    match kind {
        LaneShift::Sll => v.checked_shl(bit_count).unwrap_or(0),
        LaneShift::Srl => v.checked_shr(bit_count).unwrap_or(0),
        LaneShift::Sra => unreachable!("byte shift has no arithmetic form"),
    }
}
/// Maps the ModRM `reg` selector of opcode `0x71` to a shift direction.
///
/// # Errors
/// Any selector other than 2, 4 or 6 yields `Trap::UndefinedOpcode`.
/// NOTE(review): the trap carries `eip: 0`, not the faulting address,
/// because that address is not available here.
fn group12_kind(sub: u8) -> Result<LaneShift, Trap> {
    let kind = match sub {
        2 => LaneShift::Srl,
        4 => LaneShift::Sra,
        6 => LaneShift::Sll,
        other => {
            return Err(Trap::UndefinedOpcode {
                eip: 0,
                opcode: 0x71_0000 | u32::from(other),
            })
        }
    };
    Ok(kind)
}
/// Maps the ModRM `reg` selector of opcode `0x72` to a shift direction.
///
/// # Errors
/// Any selector other than 2, 4 or 6 yields `Trap::UndefinedOpcode`.
/// NOTE(review): the trap carries `eip: 0`, not the faulting address,
/// because that address is not available here.
fn group13_kind(sub: u8) -> Result<LaneShift, Trap> {
    let kind = match sub {
        2 => LaneShift::Srl,
        4 => LaneShift::Sra,
        6 => LaneShift::Sll,
        other => {
            return Err(Trap::UndefinedOpcode {
                eip: 0,
                opcode: 0x72_0000 | u32::from(other),
            })
        }
    };
    Ok(kind)
}
/// 128-bit shift-by-immediate on 16-bit words. The result goes to the `vvvv`
/// register, whose upper 128 bits are cleared.
fn vex_group12_xmm(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (source, selector, dest, count) = read_group_shift_operands(cpu, mmu, vex)?;
    let direction = group12_kind(selector)?;
    let shifted = shift_lanes_u16(cpu.xmm[source], direction, count);
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = shifted;
    Ok(StepOk::Continued)
}
/// 256-bit shift-by-immediate on 16-bit words; both 128-bit halves of the
/// source are shifted with the same direction and count.
fn vex_group12_ymm(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (source, selector, dest, count) = read_group_shift_operands(cpu, mmu, vex)?;
    let direction = group12_kind(selector)?;
    let low = shift_lanes_u16(cpu.xmm[source], direction, count);
    cpu.xmm[dest] = low;
    let high = shift_lanes_u16(cpu.ymm_high[source], direction, count);
    cpu.ymm_high[dest] = high;
    Ok(StepOk::Continued)
}
/// 128-bit shift-by-immediate on 32-bit dwords. The result goes to the
/// `vvvv` register, whose upper 128 bits are cleared.
fn vex_group13_xmm(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (source, selector, dest, count) = read_group_shift_operands(cpu, mmu, vex)?;
    let direction = group13_kind(selector)?;
    let shifted = shift_lanes_u32(cpu.xmm[source], direction, count);
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = shifted;
    Ok(StepOk::Continued)
}
/// 256-bit shift-by-immediate on 32-bit dwords; both 128-bit halves of the
/// source are shifted with the same direction and count.
fn vex_group13_ymm(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (source, selector, dest, count) = read_group_shift_operands(cpu, mmu, vex)?;
    let direction = group13_kind(selector)?;
    let low = shift_lanes_u32(cpu.xmm[source], direction, count);
    cpu.xmm[dest] = low;
    let high = shift_lanes_u32(cpu.ymm_high[source], direction, count);
    cpu.ymm_high[dest] = high;
    Ok(StepOk::Continued)
}
/// 128-bit shift-by-immediate for opcode `0x73`.
///
/// ModRM `reg` selectors 2 and 6 shift each 64-bit qword right / left by
/// bits; selectors 3 and 7 shift the whole lane right / left by bytes. The
/// result goes to the `vvvv` register, whose upper 128 bits are cleared.
///
/// # Errors
/// Other selectors yield `Trap::UndefinedOpcode` (with `eip: 0`).
fn vex_group14_xmm(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (src, sub, dst, imm) = read_group_shift_operands(cpu, mmu, vex)?;
    let input = cpu.xmm[src];
    let shifted = match sub {
        2 => Some(shift_lanes_u64(input, LaneShift::Srl, imm)),
        3 => Some(shift_lanes_byte_128(input, LaneShift::Srl, imm)),
        6 => Some(shift_lanes_u64(input, LaneShift::Sll, imm)),
        7 => Some(shift_lanes_byte_128(input, LaneShift::Sll, imm)),
        _ => None,
    };
    match shifted {
        Some(value) => {
            cpu.xmm[dst] = value;
            cpu.ymm_high[dst] = 0;
            Ok(StepOk::Continued)
        }
        None => Err(Trap::UndefinedOpcode {
            eip: 0,
            opcode: 0x73_0000 | u32::from(sub),
        }),
    }
}
/// 256-bit shift-by-immediate for opcode `0x73`.
///
/// ModRM `reg` selectors 2 and 6 shift each 64-bit qword right / left by
/// bits; selectors 3 and 7 shift each 128-bit half right / left by bytes.
/// The two halves are always processed independently.
///
/// # Errors
/// Other selectors yield `Trap::UndefinedOpcode` (with `eip: 0`).
fn vex_group14_ymm(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (src, sub, dst, imm) = read_group_shift_operands(cpu, mmu, vex)?;
    // Shifts one 128-bit half; `None` marks an unsupported selector.
    let shift_half = |half: u128| -> Option<u128> {
        match sub {
            2 => Some(shift_lanes_u64(half, LaneShift::Srl, imm)),
            3 => Some(shift_lanes_byte_128(half, LaneShift::Srl, imm)),
            6 => Some(shift_lanes_u64(half, LaneShift::Sll, imm)),
            7 => Some(shift_lanes_byte_128(half, LaneShift::Sll, imm)),
            _ => None,
        }
    };
    let halves = (shift_half(cpu.xmm[src]), shift_half(cpu.ymm_high[src]));
    match halves {
        (Some(low), Some(high)) => {
            cpu.xmm[dst] = low;
            cpu.ymm_high[dst] = high;
            Ok(StepOk::Continued)
        }
        _ => Err(Trap::UndefinedOpcode {
            eip: 0,
            opcode: 0x73_0000 | u32::from(sub),
        }),
    }
}
/// 128-bit load / register move: copies the ModRM source into the ModRM
/// `reg` register and clears that register's upper 128 bits.
fn vmovdqa_load_128(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let (dest, value) = read_xmm_dst_and_src2(cpu, mmu)?;
    cpu.ymm_high[dest] = 0;
    cpu.xmm[dest] = value;
    Ok(StepOk::Continued)
}
/// Stores the 128-bit register named by ModRM `reg` to the ModRM `rm`
/// destination. A register destination also has its upper 128 bits cleared;
/// a memory destination receives sixteen little-endian bytes.
fn vmovdqa_store_128(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, length) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(length as u32);

    let value = cpu.xmm[(modrm.reg & 0x7) as usize];
    match operand {
        Operand::Mem32(offset) => {
            let linear = cpu.seg_translate(offset);
            mmu.write(linear, &value.to_le_bytes())?;
        }
        Operand::Reg32(_) => {
            let target = (modrm.rm & 0x7) as usize;
            cpu.ymm_high[target] = 0;
            cpu.xmm[target] = value;
        }
    }
    Ok(StepOk::Continued)
}
/// Clears the upper 128 bits of every vector register; the low halves are
/// left untouched.
fn vzeroupper(cpu: &mut Cpu) -> Result<StepOk, Trap> {
    for upper in cpu.ymm_high.iter_mut() {
        *upper = 0;
    }
    Ok(StepOk::Continued)
}
/// Width of the element that `vpbroadcast` replicates across the
/// destination.
#[derive(Copy, Clone)]
enum BroadcastKind {
/// 8-bit element.
B,
/// 16-bit element.
W,
/// 32-bit element.
D,
/// 64-bit element.
Q,
}
/// Broadcasts one element of width `kind` into every lane of the destination.
///
/// The element is the low byte/word/dword/qword of the source register when
/// the ModRM operand is a register, or is read from the segment-translated
/// address when the operand is memory. The replicated 128-bit pattern is
/// written to `xmm[dst]`; the upper half receives the same pattern when
/// `is_256` is set and is cleared otherwise.
///
/// # Errors
/// Propagates traps from ModRM decoding and from the memory read.
fn vpbroadcast(
    cpu: &mut Cpu,
    mmu: &Mmu,
    kind: BroadcastKind,
    is_256: bool,
) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);
    let dst = (modrm.reg & 0x7) as usize;

    // Step 1: obtain the scalar element, zero-extended to 128 bits.
    let element: u128 = match operand {
        Operand::Reg32(_) => {
            let full = cpu.xmm[(modrm.rm & 0x7) as usize];
            let keep: u128 = match kind {
                BroadcastKind::B => 0xFF,
                BroadcastKind::W => 0xFFFF,
                BroadcastKind::D => 0xFFFF_FFFF,
                BroadcastKind::Q => (1u128 << 64) - 1,
            };
            full & keep
        }
        Operand::Mem32(addr) => {
            let linear = cpu.seg_translate(addr);
            match kind {
                BroadcastKind::B => u128::from(mmu.read(linear, 1)?[0]),
                BroadcastKind::W => {
                    let raw = mmu.read(linear, 2)?;
                    u128::from(u16::from_le_bytes([raw[0], raw[1]]))
                }
                BroadcastKind::D => u128::from(mmu.load32(linear)?),
                BroadcastKind::Q => u128::from(mmu.load64(linear)?),
            }
        }
    };

    // Step 2: replicate the element across one 128-bit half.
    let low = match kind {
        BroadcastKind::B => from_lanes_u8([element as u8; 16]),
        BroadcastKind::W => from_lanes_u16([element as u16; 8]),
        BroadcastKind::D => from_lanes_u32([element as u32; 4]),
        BroadcastKind::Q => from_lanes_u64([element as u64; 2]),
    };
    let high = if is_256 { low } else { 0 };

    cpu.xmm[dst] = low;
    cpu.ymm_high[dst] = high;
    Ok(StepOk::Continued)
}
/// `vbroadcasti128`: loads 16 bytes from memory and writes the same value to
/// both the low and the high 128-bit half of the destination register.
///
/// # Errors
/// A register source operand raises `Trap::UndefinedOpcode`; decode and
/// memory-read traps are propagated.
///
/// NOTE(review): the trap's `eip` is derived as `eip - (consumed + 3)` after
/// EIP has already been advanced -- confirm that this lands on the first
/// byte of the instruction for the prefix form that reaches this handler.
fn vbroadcasti128(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    // Only the memory form is accepted; bail out early for a register source.
    let linear = match operand {
        Operand::Mem32(addr) => cpu.seg_translate(addr),
        Operand::Reg32(_) => {
            return Err(Trap::UndefinedOpcode {
                eip: cpu.regs.eip.wrapping_sub(len as u32 + 3),
                opcode: 0x0F38_5A_00,
            });
        }
    };

    let raw = mmu.read(linear, 16)?;
    let mut buf = [0u8; 16];
    buf.copy_from_slice(&raw);
    let value = u128::from_le_bytes(buf);

    let dst = (modrm.reg & 0x7) as usize;
    cpu.xmm[dst] = value;
    cpu.ymm_high[dst] = value;
    Ok(StepOk::Continued)
}
/// `vmovd` into a vector register: takes a 32-bit value from a general
/// register or from memory, zero-extends it into `xmm[dst]`, and clears the
/// destination's upper 128 bits.
///
/// # Errors
/// Propagates traps from ModRM decoding and from the memory load.
fn vmovd_load(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let dword: u32 = match operand {
        Operand::Mem32(addr) => mmu.load32(cpu.seg_translate(addr))?,
        Operand::Reg32(gpr) => cpu.regs.get32(gpr),
    };

    let dst = (modrm.reg & 0x7) as usize;
    cpu.ymm_high[dst] = 0;
    cpu.xmm[dst] = u128::from(dword);
    Ok(StepOk::Continued)
}
/// `vmovd` out of a vector register: writes the low 32 bits of the register
/// selected by ModRM.reg to a general register or to memory.
///
/// # Errors
/// Propagates traps from ModRM decoding and from the memory store.
fn vmovd_store(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    // Truncation to the low doubleword is the intended behaviour.
    let dword = cpu.xmm[(modrm.reg & 0x7) as usize] as u32;
    match operand {
        Operand::Mem32(addr) => mmu.store32(cpu.seg_translate(addr), dword)?,
        Operand::Reg32(gpr) => cpu.regs.set32(gpr, dword),
    }
    Ok(StepOk::Continued)
}
/// `vmovq` into a vector register: takes the low 64 bits of another vector
/// register, or 64 bits from memory, zero-extends them into `xmm[dst]`, and
/// clears the destination's upper 128 bits.
///
/// # Errors
/// Propagates traps from ModRM decoding and from the memory load.
fn vmovq_load(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let qword: u64 = match operand {
        Operand::Mem32(addr) => mmu.load64(cpu.seg_translate(addr))?,
        Operand::Reg32(_) => cpu.xmm[(modrm.rm & 0x7) as usize] as u64,
    };

    let dst = (modrm.reg & 0x7) as usize;
    cpu.ymm_high[dst] = 0;
    cpu.xmm[dst] = u128::from(qword);
    Ok(StepOk::Continued)
}
/// `vmovq` out of a vector register: writes the low 64 bits of the register
/// selected by ModRM.reg to the r/m operand.
///
/// * Register destination: the quadword is zero-extended into the target and
///   the target's upper 128 bits are cleared.
/// * Memory destination: the 8 little-endian bytes are written at the
///   segment-translated address.
///
/// # Errors
/// Propagates traps from ModRM decoding and from the memory write.
fn vmovq_store(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let qword = cpu.xmm[(modrm.reg & 0x7) as usize] as u64;
    match operand {
        Operand::Mem32(addr) => {
            let linear = cpu.seg_translate(addr);
            mmu.write(linear, &qword.to_le_bytes())?;
        }
        Operand::Reg32(_) => {
            let target = (modrm.rm & 0x7) as usize;
            cpu.ymm_high[target] = 0;
            cpu.xmm[target] = u128::from(qword);
        }
    }
    Ok(StepOk::Continued)
}
/// `vpextrb`: extracts the byte selected by the low four bits of the
/// immediate from the vector register in ModRM.reg.
///
/// The byte is zero-extended into a 32-bit general register, or written as a
/// single byte to the segment-translated memory address.
///
/// # Errors
/// Propagates traps from decoding, the immediate fetch and the memory write.
fn vpextrb_128(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let selector = cpu.fetch_imm8_pub(mmu)?;
    let source = cpu.xmm[(modrm.reg & 0x7) as usize];
    // Little-endian byte `i` occupies bits `8*i .. 8*i + 7`.
    let byte = (source >> (u32::from(selector & 0xF) * 8)) as u8;

    match operand {
        Operand::Mem32(addr) => mmu.write(cpu.seg_translate(addr), &[byte])?,
        Operand::Reg32(gpr) => cpu.regs.set32(gpr, u32::from(byte)),
    }
    Ok(StepOk::Continued)
}
/// `vpextrw` (register-destination form): extracts the word selected by the
/// low three bits of the immediate from the vector register in ModRM.rm and
/// stores it, zero-extended, in the general register named by ModRM.reg.
///
/// NOTE(review): EIP is advanced by a fixed single byte, so this handler
/// assumes the ModRM byte encodes a register operand; a memory-form encoding
/// is not rejected here -- confirm the caller guarantees that.
///
/// # Errors
/// Propagates traps from the ModRM fetch, the look-ahead read and the
/// immediate fetch.
fn vpextrw_imm_128(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    // The peeked bytes are not used, but the call is kept so that any trap it
    // reports still propagates.
    cpu.peek_after_modrm(mmu, 16)?;
    cpu.advance_eip(1);

    let selector = cpu.fetch_imm8_pub(mmu)?;
    let source = cpu.xmm[(modrm.rm & 0x7) as usize];
    let word = lanes_u16(source)[usize::from(selector & 0x7)];

    let gpr = Reg32::from_bits(modrm.reg & 0x7);
    cpu.regs.set32(gpr, u32::from(word));
    Ok(StepOk::Continued)
}
/// `vpextrw` (r/m-destination form): extracts the word selected by the low
/// three bits of the immediate from the vector register in ModRM.reg.
///
/// The word is zero-extended into a 32-bit general register, or written as
/// two little-endian bytes to the segment-translated memory address.
///
/// # Errors
/// Propagates traps from decoding, the immediate fetch and the memory write.
fn vpextrw_mem_128(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let selector = cpu.fetch_imm8_pub(mmu)?;
    let source = cpu.xmm[(modrm.reg & 0x7) as usize];
    let word = lanes_u16(source)[usize::from(selector & 0x7)];

    match operand {
        Operand::Mem32(addr) => {
            let linear = cpu.seg_translate(addr);
            mmu.write(linear, &word.to_le_bytes())?;
        }
        Operand::Reg32(gpr) => cpu.regs.set32(gpr, u32::from(word)),
    }
    Ok(StepOk::Continued)
}
/// `vpextrd`: extracts the doubleword selected by the low two bits of the
/// immediate from the vector register in ModRM.reg and stores it in a general
/// register or at the segment-translated memory address.
///
/// # Errors
/// Propagates traps from decoding, the immediate fetch and the memory store.
fn vpextrd_128(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let selector = cpu.fetch_imm8_pub(mmu)?;
    let source = cpu.xmm[(modrm.reg & 0x7) as usize];
    let dword = lanes_u32(source)[usize::from(selector & 0x3)];

    match operand {
        Operand::Mem32(addr) => mmu.store32(cpu.seg_translate(addr), dword)?,
        Operand::Reg32(gpr) => cpu.regs.set32(gpr, dword),
    }
    Ok(StepOk::Continued)
}
/// `vpmovmskb` (128-bit): gathers the top bit of each of the 16 bytes of the
/// vector register in ModRM.rm into bits 0..=15 of the general register named
/// by ModRM.reg; the remaining bits of that register are zero.
///
/// NOTE(review): EIP is advanced by a fixed single byte, i.e. a register-form
/// ModRM is assumed -- confirm the caller guarantees that.
///
/// # Errors
/// Propagates traps from the ModRM fetch and the look-ahead read.
fn vpmovmskb_128(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    // Result unused; the call is kept so any trap it reports still propagates.
    cpu.peek_after_modrm(mmu, 16)?;
    cpu.advance_eip(1);

    let source = cpu.xmm[(modrm.rm & 0x7) as usize];
    let mask = source
        .to_le_bytes()
        .iter()
        .enumerate()
        .filter(|(_, byte)| (**byte & 0x80) != 0)
        .fold(0u32, |acc, (bit, _)| acc | (1u32 << bit));

    cpu.regs.set32(Reg32::from_bits(modrm.reg & 0x7), mask);
    Ok(StepOk::Continued)
}
/// `vpmovmskb` (256-bit): gathers the top bit of each of the 32 bytes of the
/// vector register in ModRM.rm into the general register named by ModRM.reg.
///
/// Bytes of the low half map to bits 0..=15, bytes of the high half to bits
/// 16..=31.
///
/// NOTE(review): EIP is advanced by a fixed single byte, i.e. a register-form
/// ModRM is assumed -- confirm the caller guarantees that.
///
/// # Errors
/// Propagates traps from the ModRM fetch and the look-ahead read.
fn vpmovmskb_256(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    // Result unused; the call is kept so any trap it reports still propagates.
    cpu.peek_after_modrm(mmu, 16)?;
    cpu.advance_eip(1);

    let src = (modrm.rm & 0x7) as usize;
    let lower = cpu.xmm[src].to_le_bytes();
    let upper = cpu.ymm_high[src].to_le_bytes();

    // Chaining the halves makes the enumerate index equal to the mask bit.
    let mask = lower
        .iter()
        .chain(upper.iter())
        .enumerate()
        .fold(0u32, |acc, (bit, byte)| {
            if (*byte & 0x80) != 0 {
                acc | (1u32 << bit)
            } else {
                acc
            }
        });

    cpu.regs.set32(Reg32::from_bits(modrm.reg & 0x7), mask);
    Ok(StepOk::Continued)
}
/// `vpinsrb`: copies the vector register selected by `vex.vvvv`, replaces the
/// byte selected by the low four bits of the immediate with a byte taken from
/// a general register (its low 8 bits) or from memory, and writes the result
/// to the register in ModRM.reg with the upper 128 bits cleared.
///
/// # Errors
/// Propagates traps from decoding, the memory read and the immediate fetch.
fn vpinsrb_128(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    // The scalar is read before the immediate is fetched.
    let byte: u8 = match operand {
        Operand::Mem32(addr) => {
            let linear = cpu.seg_translate(addr);
            mmu.read(linear, 1)?[0]
        }
        Operand::Reg32(gpr) => cpu.regs.get32(gpr) as u8,
    };
    let selector = cpu.fetch_imm8_pub(mmu)?;

    let mut lanes = cpu.xmm[(vex.vvvv & 0x7) as usize].to_le_bytes();
    lanes[usize::from(selector & 0xF)] = byte;

    let dst = (modrm.reg & 0x7) as usize;
    cpu.xmm[dst] = u128::from_le_bytes(lanes);
    cpu.ymm_high[dst] = 0;
    Ok(StepOk::Continued)
}
/// `vpinsrw`: copies the vector register selected by `vex.vvvv`, replaces the
/// word selected by the low three bits of the immediate with a word taken
/// from a general register (its low 16 bits) or from memory, and writes the
/// result to the register in ModRM.reg with the upper 128 bits cleared.
///
/// # Errors
/// Propagates traps from decoding, the memory read and the immediate fetch.
fn vpinsrw_128(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    // The scalar is read before the immediate is fetched.
    let word: u16 = match operand {
        Operand::Mem32(addr) => {
            let linear = cpu.seg_translate(addr);
            let raw = mmu.read(linear, 2)?;
            u16::from_le_bytes([raw[0], raw[1]])
        }
        Operand::Reg32(gpr) => cpu.regs.get32(gpr) as u16,
    };
    let selector = cpu.fetch_imm8_pub(mmu)?;

    let mut lanes = lanes_u16(cpu.xmm[(vex.vvvv & 0x7) as usize]);
    lanes[usize::from(selector & 0x7)] = word;

    let dst = (modrm.reg & 0x7) as usize;
    cpu.xmm[dst] = from_lanes_u16(lanes);
    cpu.ymm_high[dst] = 0;
    Ok(StepOk::Continued)
}
/// 128-bit `vmovaps` load: copies the decoded source value into the
/// destination register and clears the destination's upper 128 bits.
///
/// Operand decoding (and any trap it raises) is delegated to
/// `read_xmm_dst_and_src2`.
fn vmovaps_load_128(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let (target, value) = read_xmm_dst_and_src2(cpu, mmu)?;
    cpu.ymm_high[target] = 0;
    cpu.xmm[target] = value;
    Ok(StepOk::Continued)
}
/// 128-bit `vmovaps` store: writes the register selected by ModRM.reg to the
/// r/m operand.
///
/// A register destination receives the value with its upper 128 bits
/// cleared; a memory destination receives the 16 little-endian bytes at the
/// segment-translated address.
///
/// # Errors
/// Propagates traps from ModRM decoding and from the memory write.
fn vmovaps_store_128(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let payload = cpu.xmm[(modrm.reg & 0x7) as usize];
    match operand {
        Operand::Mem32(addr) => {
            let linear = cpu.seg_translate(addr);
            mmu.write(linear, &payload.to_le_bytes())?;
        }
        Operand::Reg32(_) => {
            let target = (modrm.rm & 0x7) as usize;
            cpu.ymm_high[target] = 0;
            cpu.xmm[target] = payload;
        }
    }
    Ok(StepOk::Continued)
}
/// 256-bit `vmovaps` load: writes both decoded 128-bit halves of the source
/// into the destination register.
///
/// Operand decoding (and any trap it raises) is delegated to
/// `read_ymm_dst_and_src2`.
fn vmovaps_load_256(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let (target, lower, upper) = read_ymm_dst_and_src2(cpu, mmu)?;
    cpu.ymm_high[target] = upper;
    cpu.xmm[target] = lower;
    Ok(StepOk::Continued)
}
/// 256-bit `vmovups` store: writes both halves of the register selected by
/// ModRM.reg to the r/m operand.
///
/// A register destination receives both halves unchanged. A memory
/// destination is written as two 16-byte little-endian stores: the low half
/// at the translated address, the high half 16 bytes above it.
///
/// # Errors
/// Propagates traps from ModRM decoding and from either memory write; the
/// low half is written first.
fn vmovups_store_256(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let from = (modrm.reg & 0x7) as usize;
    let (lower, upper) = (cpu.xmm[from], cpu.ymm_high[from]);
    match operand {
        Operand::Mem32(addr) => {
            let base = cpu.seg_translate(addr);
            mmu.write(base, &lower.to_le_bytes())?;
            mmu.write(base.wrapping_add(16), &upper.to_le_bytes())?;
        }
        Operand::Reg32(_) => {
            let to = (modrm.rm & 0x7) as usize;
            cpu.xmm[to] = lower;
            cpu.ymm_high[to] = upper;
        }
    }
    Ok(StepOk::Continued)
}
/// Decodes a ModRM operand pair for a 256-bit instruction.
///
/// Returns `(dst, low, high)`, where `dst` is the register index from
/// ModRM.reg and `low`/`high` are the two 128-bit halves of the r/m source.
/// A register source is read from `xmm`/`ymm_high`; a memory source is read
/// as two 16-byte little-endian chunks, the second 16 bytes above the first.
/// EIP is advanced past the bytes consumed by the ModRM decode.
///
/// # Errors
/// Propagates traps from ModRM decoding and from either memory read.
fn read_ymm_dst_and_src2(cpu: &mut Cpu, mmu: &Mmu) -> Result<(usize, u128, u128), Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let dst = (modrm.reg & 0x7) as usize;
    match operand {
        Operand::Reg32(_) => {
            let src = (modrm.rm & 0x7) as usize;
            Ok((dst, cpu.xmm[src], cpu.ymm_high[src]))
        }
        Operand::Mem32(addr) => {
            let base = cpu.seg_translate(addr);
            // Both halves are fetched before either is converted.
            let raw_lower = mmu.read(base, 16)?;
            let raw_upper = mmu.read(base.wrapping_add(16), 16)?;
            let mut lower = [0u8; 16];
            let mut upper = [0u8; 16];
            lower.copy_from_slice(&raw_lower);
            upper.copy_from_slice(&raw_upper);
            Ok((dst, u128::from_le_bytes(lower), u128::from_le_bytes(upper)))
        }
    }
}
/// 256-bit `vpcmpeqd`: compares the register selected by `vex.vvvv` with the
/// r/m source, doubleword by doubleword, and stores the per-lane equality
/// masks in the destination (both 128-bit halves).
///
/// # Errors
/// Propagates traps raised while decoding or reading the second source.
fn vpcmpeqd_256(cpu: &mut Cpu, mmu: &Mmu, vex: &Vex) -> Result<StepOk, Trap> {
    let (dst, rhs_lower, rhs_upper) = read_ymm_dst_and_src2(cpu, mmu)?;

    // Snapshot the first source before writing: it may alias the destination.
    let lhs = (vex.vvvv & 0x7) as usize;
    let (lhs_lower, lhs_upper) = (cpu.xmm[lhs], cpu.ymm_high[lhs]);

    cpu.xmm[dst] = pcmpeqd_lanes_128(lhs_lower, rhs_lower);
    cpu.ymm_high[dst] = pcmpeqd_lanes_128(lhs_upper, rhs_upper);
    Ok(StepOk::Continued)
}
/// Compares the four 32-bit lanes of `a` and `b`.
///
/// Each lane of the result is `0xFFFF_FFFF` where the corresponding lanes are
/// equal and `0` where they differ.
fn pcmpeqd_lanes_128(a: u128, b: u128) -> u128 {
    const LANE_MASK: u128 = 0xFFFF_FFFF;
    // XOR leaves a lane all-zero exactly when the two inputs agree there.
    let diff = a ^ b;
    (0..4u32)
        .map(|lane| lane * 32)
        .filter(|&shift| ((diff >> shift) & LANE_MASK) == 0)
        .fold(0u128, |acc, shift| acc | (LANE_MASK << shift))
}
/// 256-bit `vmovdqa` load: writes both decoded 128-bit halves of the source
/// into the destination register.
///
/// Operand decoding (and any trap it raises) is delegated to
/// `read_ymm_dst_and_src2`.
fn vmovdqa_load_256(cpu: &mut Cpu, mmu: &Mmu) -> Result<StepOk, Trap> {
    let (target, lower, upper) = read_ymm_dst_and_src2(cpu, mmu)?;
    cpu.ymm_high[target] = upper;
    cpu.xmm[target] = lower;
    Ok(StepOk::Continued)
}
/// 256-bit `vmovdqa` store: writes both halves of the register selected by
/// ModRM.reg to the r/m operand.
///
/// A register destination receives both halves unchanged. A memory
/// destination is written as two 16-byte little-endian stores: the low half
/// at the translated address, the high half 16 bytes above it.
///
/// # Errors
/// Propagates traps from ModRM decoding and from either memory write; the
/// low half is written first.
fn vmovdqa_store_256(cpu: &mut Cpu, mmu: &mut Mmu) -> Result<StepOk, Trap> {
    let modrm = cpu.fetch_modrm(mmu)?;
    let tail = cpu.peek_after_modrm(mmu, 16)?;
    let (operand, len) = resolve_modrm32(modrm, &tail, &cpu.regs)?;
    cpu.advance_eip(len as u32);

    let from = (modrm.reg & 0x7) as usize;
    let (lower, upper) = (cpu.xmm[from], cpu.ymm_high[from]);
    match operand {
        Operand::Mem32(addr) => {
            let base = cpu.seg_translate(addr);
            mmu.write(base, &lower.to_le_bytes())?;
            mmu.write(base.wrapping_add(16), &upper.to_le_bytes())?;
        }
        Operand::Reg32(_) => {
            let to = (modrm.rm & 0x7) as usize;
            cpu.xmm[to] = lower;
            cpu.ymm_high[to] = upper;
        }
    }
    Ok(StepOk::Continued)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two-byte (0xC5) prefix: map is fixed to 1, `w` is false, and `vvvv`
    /// is stored inverted in the payload byte.
    #[test]
    fn vex_c5_decode_pp_l_vvvv() {
        let Vex {
            map,
            pp,
            l,
            w,
            vvvv,
        } = Vex::from_c5(0b1110_0001);
        assert_eq!((map, pp, l, vvvv), (1, 1, 0, 3));
        assert!(!w);
    }

    /// Three-byte (0xC4) prefix: map comes from the first payload byte,
    /// the remaining fields from the second.
    #[test]
    fn vex_c4_decode_w_map() {
        let Vex {
            map,
            pp,
            l,
            w,
            vvvv,
        } = Vex::from_c4(0b1110_0010, 0b1101_0111);
        assert_eq!((map, pp, l, vvvv), (2, 3, 1, 5));
        assert!(w);
    }

    /// Every field packed by `vex_opcode_id` can be recovered from its
    /// documented bit position.
    #[test]
    fn vex_opcode_id_round_trip() {
        let fields = Vex {
            map: 3,
            pp: 2,
            l: 1,
            w: true,
            vvvv: 0xA,
        };
        let id = vex_opcode_id(&fields, 0x58);
        let extract = |shift: u32, mask: u32| (id >> shift) & mask;
        assert_eq!(extract(0, 0xFF), 0x58);
        assert_eq!(extract(8, 0x1), 1);
        assert_eq!(extract(9, 0x3), 2);
        assert_eq!(extract(11, 0x7), 3);
        assert_eq!(extract(14, 0xF), 0xA);
    }
}