Skip to main content

ud_arch_aarch64/
lib.rs

1//! AArch64 instruction decoder + minimal lifter.
2//!
3//! v0 scope is intentionally narrow: AArch64 instructions are
4//! fixed-width 4 bytes, so the decoder just splits the input into
5//! 4-byte chunks. Each chunk gets a coarse classification —
6//! enough to extract direct branch targets and identify returns,
7//! which is what the IR layer needs to build basic blocks. Full
8//! operand printing isn't here yet; classified branches render as a
9//! bare mnemonic and everything else as `<arm64 0xXXXXXXXX>` placeholder
10//! text alongside its pinned bytes, so round-trip holds via byte identity.
11//!
12//! Future iterations will wire in a real disassembler (`bad64` or
13//! similar) to produce readable `@asm` text and unlock the same
14//! lifting passes the x86 backend has (prologue/epilogue lift,
15//! call-site analysis, if/else groups).
16
17#![allow(clippy::cast_possible_truncation)]
18
19mod codec;
20
21pub use codec::{register, Aarch64Codec};
22
23use ud_core::VAddr;
24use ud_ir::{ArchInsn, BasicBlock, Function, Terminator};
25
26/// On-disk size, in bytes, of every AArch64 instruction (the encoding is fixed-width).
27pub const INSN_SIZE: usize = 4;
28
29/// Errors specific to the AArch64 backend.
30#[derive(Debug, thiserror::Error)]
31pub enum Error {
    /// The input buffer is not a whole number of `INSN_SIZE`-byte
    /// instruction words; `len` is the offending length in bytes.
32    #[error(
33        "byte buffer length {len} is not a multiple of {INSN_SIZE} (AArch64 insns are fixed-width)"
34    )]
35    Misaligned { len: usize },
36}
37
/// Result alias whose error type defaults to this backend's [`Error`].
38pub type Result<T, E = Error> = std::result::Result<T, E>;
39
40/// Coarse classification of an AArch64 instruction. Enough to pick
41/// out flow-control behaviour for CFG construction; everything else
42/// falls into `Other`.
///
/// Addresses carried by the variants are absolute: direct targets are
/// resolved against the instruction's own address at classification
/// time, and `fallthrough` is that address plus `INSN_SIZE`.
43#[derive(Debug, Clone, Copy, PartialEq, Eq)]
44pub enum InsnKind {
45    /// Unconditional direct branch (`B target`).
46    BranchDirect { target: u64 },
47    /// Conditional direct branch (`B.cond target`).
48    BranchConditional { taken: u64, fallthrough: u64 },
49    /// Compare-and-branch zero / non-zero (`CBZ`, `CBNZ`) — also a
50    /// conditional direct branch in CFG terms.
51    CompareAndBranch { taken: u64, fallthrough: u64 },
52    /// Test-bit-and-branch zero / non-zero (`TBZ`, `TBNZ`).
53    TestBitAndBranch { taken: u64, fallthrough: u64 },
54    /// Direct call (`BL target`).
55    BranchLink { target: u64 },
56    /// Indirect branch (`BR Xn`). The register operand is not recorded.
57    BranchRegister,
58    /// Indirect call (`BLR Xn`). The register operand is not recorded.
59    BranchLinkRegister,
60    /// Return (`RET [Xn]`). Defaults to `RET X30` when no register
61    /// is given. The register operand is not recorded.
62    Return,
63    /// `NOP` — encoded as `D503201F` (HINT #0).
64    Nop,
65    /// Anything else.
66    Other,
67}
68
69/// One decoded AArch64 instruction. Carries the raw 4-byte
70/// encoding, the address it lived at, and a coarse `InsnKind`.
71#[derive(Debug, Clone, PartialEq, Eq)]
72pub struct DecodedInsn {
    /// Virtual address the instruction was decoded at.
73    pub addr: VAddr,
    /// Raw encoding, in the byte order it appeared in the input
    /// (little-endian; `opcode()` reassembles the 32-bit word).
74    pub bytes: [u8; INSN_SIZE],
    /// Flow-control classification derived from the encoding and `addr`.
75    pub kind: InsnKind,
76}
77
78impl DecodedInsn {
79    /// The 32-bit little-endian encoding as a `u32`.
80    #[must_use]
81    pub fn opcode(&self) -> u32 {
82        u32::from_le_bytes(self.bytes)
83    }
84}
85
86impl ArchInsn for DecodedInsn {
87    fn addr(&self) -> VAddr {
88        self.addr
89    }
90
91    fn original_bytes(&self) -> &[u8] {
92        &self.bytes
93    }
94}
95
96/// Decode `bytes` as an AArch64 instruction stream starting at
97/// virtual address `start`. The buffer length must be a multiple of
98/// `INSN_SIZE` — AArch64 has no concept of "the rest is data" the
99/// way x86 does, so a misaligned tail is a hard error.
100pub fn decode(bytes: &[u8], start: u64) -> Result<Vec<DecodedInsn>> {
101    if bytes.len() % INSN_SIZE != 0 {
102        return Err(Error::Misaligned { len: bytes.len() });
103    }
104    let mut out = Vec::with_capacity(bytes.len() / INSN_SIZE);
105    for (i, chunk) in bytes.chunks_exact(INSN_SIZE).enumerate() {
106        let addr = start.saturating_add((i * INSN_SIZE) as u64);
107        let mut raw = [0u8; INSN_SIZE];
108        raw.copy_from_slice(chunk);
109        let opcode = u32::from_le_bytes(raw);
110        let kind = classify(opcode, addr);
111        out.push(DecodedInsn {
112            addr: VAddr(addr),
113            bytes: raw,
114            kind,
115        });
116    }
117    Ok(out)
118}
119
120/// Classify a single 32-bit instruction word. Recognises the
121/// branch / return / nop encodings; everything else is `Other`.
122///
123/// Encodings (per the AArch64 ARM):
124///
125/// * `B target`           — `0001_01ii_iiii_iiii_iiii_iiii_iiii_iiii`
126///   (top 6 bits 000101); imm26 is a signed `<<2` PC-relative offset.
127/// * `BL target`          — `1001_01ii_iiii_iiii_iiii_iiii_iiii_iiii`
128///   (top 6 bits 100101); same imm26.
129/// * `B.cond target`      — `0101_0100_iiii_iiii_iiii_iiii_iii0_cccc`
130///   (top byte 0x54, bit 4 zero); imm19 is a signed `<<2` PC-rel offset.
131/// * `CBZ/CBNZ Rt, target` — `?011_010?_iiii_iiii_iiii_iiii_iiit_tttt`;
132///   imm19 is a signed `<<2` PC-relative offset.
133/// * `TBZ/TBNZ Rt, #b, target` — `?011_011?_bbbb_biii_iiii_iiii_iiit_tttt`;
134///   imm14 is a signed `<<2` PC-relative offset.
135/// * `BR Xn`              — `1101_0110_0001_1111_0000_00nn_nnn0_0000`
136/// * `BLR Xn`             — `1101_0110_0011_1111_0000_00nn_nnn0_0000`
137/// * `RET [Xn]`           — `1101_0110_0101_1111_0000_00nn_nnn0_0000`
138/// * `NOP`                — `1101_0101_0000_0011_0010_0000_0001_1111`
139fn classify(opcode: u32, addr: u64) -> InsnKind {
140    /// Mask used to identify the RET / BR / BLR encoding shape
141    /// `1101_0110_xxxx_1111_0000_00nn_nnn0_0000` — the `xxxx` field
142    /// distinguishes the variant.
143    const INDIRECT_BRANCH_MASK: u32 = 0xffff_fc1f;
144
145    // NOP — exact encoding D503201F.
146    if opcode == 0xd503_201f {
147        return InsnKind::Nop;
148    }
149    if (opcode & INDIRECT_BRANCH_MASK) == 0xd65f_0000 {
150        return InsnKind::Return;
151    }
152    if (opcode & INDIRECT_BRANCH_MASK) == 0xd61f_0000 {
153        return InsnKind::BranchRegister;
154    }
155    if (opcode & INDIRECT_BRANCH_MASK) == 0xd63f_0000 {
156        return InsnKind::BranchLinkRegister;
157    }
158    // B / BL — top 6 bits 000101 / 100101.
159    if (opcode & 0xfc00_0000) == 0x1400_0000 {
160        let target = pc_rel26(addr, opcode);
161        return InsnKind::BranchDirect { target };
162    }
163    if (opcode & 0xfc00_0000) == 0x9400_0000 {
164        let target = pc_rel26(addr, opcode);
165        return InsnKind::BranchLink { target };
166    }
167    // B.cond — top byte 0x54, bit 4 zero.
168    if (opcode & 0xff00_0010) == 0x5400_0000 {
169        let taken = pc_rel19(addr, opcode);
170        let fallthrough = addr.wrapping_add(INSN_SIZE as u64);
171        return InsnKind::BranchConditional { taken, fallthrough };
172    }
173    // CBZ / CBNZ — bits [30:25] == 011010.
174    if (opcode & 0x7e00_0000) == 0x3400_0000 {
175        let taken = pc_rel19(addr, opcode);
176        let fallthrough = addr.wrapping_add(INSN_SIZE as u64);
177        return InsnKind::CompareAndBranch { taken, fallthrough };
178    }
179    // TBZ / TBNZ — bits [30:25] == 011011.
180    if (opcode & 0x7e00_0000) == 0x3600_0000 {
181        let taken = pc_rel14(addr, opcode);
182        let fallthrough = addr.wrapping_add(INSN_SIZE as u64);
183        return InsnKind::TestBitAndBranch { taken, fallthrough };
184    }
185    InsnKind::Other
186}
187
/// Resolve the destination of a `B` / `BL` word located at `addr`.
///
/// The low 26 bits of `opcode` hold a signed offset counted in
/// 4-byte words; it is sign-extended, scaled to bytes and added to
/// `addr` with wrap-around.
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
fn pc_rel26(addr: u64, opcode: u32) -> u64 {
    const FIELD_BITS: u32 = 26;
    const PAD_BITS: u32 = u32::BITS - FIELD_BITS;
    // Park the field at the top of the word (discarding the opcode
    // bits above it) so the arithmetic right shift replicates its
    // sign bit on the way back down.
    let words = ((opcode << PAD_BITS) as i32) >> PAD_BITS;
    let byte_offset = i64::from(words) * 4;
    addr.wrapping_add(byte_offset as u64) // negative offsets wrap around; intentional
}
197
/// Resolve the destination of a `B.cond` / `CBZ` / `CBNZ` word
/// located at `addr`.
///
/// Bits [23:5] of `opcode` hold a signed offset counted in 4-byte
/// words; it is sign-extended, scaled to bytes and added to `addr`
/// with wrap-around.
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
fn pc_rel19(addr: u64, opcode: u32) -> u64 {
    const FIELD_BITS: u32 = 19;
    const FIELD_LSB: u32 = 5;
    // Left shift that puts the field's sign bit (bit 23) at bit 31.
    const LIFT: u32 = u32::BITS - FIELD_BITS - FIELD_LSB;
    // Arithmetic shift that brings the field back down to bit 0,
    // replicating the sign bit and dropping the five bits below it.
    const DROP: u32 = u32::BITS - FIELD_BITS;
    let words = ((opcode << LIFT) as i32) >> DROP;
    let byte_offset = i64::from(words) * 4;
    addr.wrapping_add(byte_offset as u64) // negative offsets wrap around; intentional
}
206
/// Resolve the destination of a `TBZ` / `TBNZ` word located at `addr`.
///
/// Bits [18:5] of `opcode` hold a signed offset counted in 4-byte
/// words; it is sign-extended, scaled to bytes and added to `addr`
/// with wrap-around.
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
fn pc_rel14(addr: u64, opcode: u32) -> u64 {
    /// Weight of the field's sign bit (bit 13 of the 14-bit value).
    const SIGN_BIT: i64 = 1 << 13;
    let raw = i64::from((opcode >> 5) & 0x0000_3fff);
    // Flipping the sign bit and then subtracting its weight maps
    // 0x2000..=0x3fff onto -8192..=-1 and leaves 0..=0x1fff untouched.
    let words = (raw ^ SIGN_BIT) - SIGN_BIT;
    let byte_offset = words * 4;
    addr.wrapping_add(byte_offset as u64) // negative offsets wrap around; intentional
}
215
216/// Render an instruction as placeholder text alongside its bytes.
217/// v0: emits `<arm64 0xXXXXXXXX>` (mnemonic-based when classified)
218/// — full disassembly comes when we wire a real decoder.
219#[must_use]
220pub fn format_text(insn: &DecodedInsn) -> String {
221    match insn.kind {
222        InsnKind::BranchDirect { target } => format!("b 0x{target:x}"),
223        InsnKind::BranchLink { target } => format!("bl 0x{target:x}"),
224        InsnKind::BranchConditional { taken, .. } => format!("b.cond 0x{taken:x}"),
225        InsnKind::CompareAndBranch { taken, .. } => format!("cbz/cbnz 0x{taken:x}"),
226        InsnKind::TestBitAndBranch { taken, .. } => format!("tbz/tbnz 0x{taken:x}"),
227        InsnKind::BranchRegister => "br".into(),
228        InsnKind::BranchLinkRegister => "blr".into(),
229        InsnKind::Return => "ret".into(),
230        InsnKind::Nop => "nop".into(),
231        InsnKind::Other => format!("<arm64 0x{:08x}>", insn.opcode()),
232    }
233}
234
235/// Lift a decoded instruction stream into a CFG.
236///
237/// v0: build a single basic block for the whole function. Once we
238/// have full mnemonic decoding we can wire in proper block splitting
239/// at branch targets (mirroring the x86 lift in
240/// `ud_arch_x86::lift_function`).
241#[must_use]
242pub fn lift_function(name: String, insns: &[DecodedInsn]) -> Function<DecodedInsn> {
243    let addr = insns.first().map_or(VAddr(0), |i| i.addr);
244    let terminator = insns
245        .last()
246        .map_or(Terminator::Fallthrough, |i| match i.kind {
247            InsnKind::Return => Terminator::Return,
248            InsnKind::BranchDirect { target } => Terminator::UnconditionalBranch {
249                target: VAddr(target),
250            },
251            InsnKind::BranchConditional { taken, fallthrough }
252            | InsnKind::CompareAndBranch { taken, fallthrough }
253            | InsnKind::TestBitAndBranch { taken, fallthrough } => Terminator::ConditionalBranch {
254                taken: VAddr(taken),
255                fallthrough: VAddr(fallthrough),
256            },
257            InsnKind::BranchRegister | InsnKind::BranchLinkRegister => Terminator::IndirectBranch,
258            _ => Terminator::Fallthrough,
259        });
260    Function {
261        addr,
262        name,
263        blocks: vec![BasicBlock {
264            addr,
265            insns: insns.to_vec(),
266            terminator,
267        }],
268    }
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode a single instruction word placed at `addr` and return
    /// its classification.
    fn kind_at(addr: u64, opcode: u32) -> InsnKind {
        let insns = decode(&opcode.to_le_bytes(), addr).unwrap();
        assert_eq!(insns.len(), 1);
        insns[0].kind
    }

    #[test]
    fn decode_splits_into_4byte_words() {
        // ret = 0xd65f03c0 (encoded little-endian on disk).
        let bytes = [
            0xc0, 0x03, 0x5f, 0xd6, // ret
            0x1f, 0x20, 0x03, 0xd5, // nop
        ];
        let insns = decode(&bytes, 0x1000).unwrap();
        assert_eq!(insns.len(), 2);
        assert_eq!(insns[0].addr, VAddr(0x1000));
        assert_eq!(insns[0].kind, InsnKind::Return);
        assert_eq!(insns[1].addr, VAddr(0x1004));
        assert_eq!(insns[1].kind, InsnKind::Nop);
    }

    #[test]
    fn decode_of_empty_buffer_yields_no_insns() {
        assert!(decode(&[], 0x1000).unwrap().is_empty());
    }

    #[test]
    fn decode_keeps_bytes_and_opcode_round_trips() {
        let bytes = [0xc0, 0x03, 0x5f, 0xd6];
        let insns = decode(&bytes, 0x1000).unwrap();
        assert_eq!(insns[0].bytes, bytes);
        assert_eq!(insns[0].opcode(), 0xd65f_03c0);
    }

    #[test]
    fn rejects_misaligned_buffer() {
        let bytes = [0x00, 0x01, 0x02];
        assert!(matches!(
            decode(&bytes, 0x1000),
            Err(Error::Misaligned { len: 3 })
        ));
    }

    #[test]
    fn classifies_b_with_signed_target() {
        // B +0x10 — opcode 0x14000004 (imm26 = 4, target = pc + 16).
        assert_eq!(
            kind_at(0x1000, 0x1400_0004),
            InsnKind::BranchDirect { target: 0x1010 }
        );
    }

    #[test]
    fn classifies_bl_with_negative_target() {
        // BL -8 — bits 100101 + imm26 = -2 (= 0x03ff_fffe).
        assert_eq!(
            kind_at(0x2000, 0x97ff_fffe),
            InsnKind::BranchLink { target: 0x1ff8 }
        );
    }

    #[test]
    fn classifies_b_cond() {
        // B.EQ +8 — top byte 0x54, imm19 = 2 (target = pc+8), cond = 0.
        assert_eq!(
            kind_at(0x1000, 0x5400_0040),
            InsnKind::BranchConditional {
                taken: 0x1008,
                fallthrough: 0x1004,
            }
        );
    }

    #[test]
    fn b_cond_shape_with_bit4_set_is_not_a_branch() {
        // Same top byte as B.cond but bit 4 set: not matched by the
        // B.cond pattern, and not by any other recognised shape.
        assert_eq!(kind_at(0x1000, 0x5400_0050), InsnKind::Other);
    }

    #[test]
    fn classifies_cbz_and_cbnz() {
        let expected = InsnKind::CompareAndBranch {
            taken: 0x1008,
            fallthrough: 0x1004,
        };
        // CBZ x0, +8 (64-bit form) and CBNZ w1, +8 (32-bit form).
        assert_eq!(kind_at(0x1000, 0xb400_0040), expected);
        assert_eq!(kind_at(0x1000, 0x3500_0041), expected);
    }

    #[test]
    fn classifies_tbz_and_tbnz() {
        // TBZ w0, #0, +8.
        assert_eq!(
            kind_at(0x1000, 0x3600_0040),
            InsnKind::TestBitAndBranch {
                taken: 0x1008,
                fallthrough: 0x1004,
            }
        );
        // TBNZ w0, #0, -4 — imm14 = 0x3fff (-1).
        assert_eq!(
            kind_at(0x1000, 0x3707_ffe0),
            InsnKind::TestBitAndBranch {
                taken: 0x0ffc,
                fallthrough: 0x1004,
            }
        );
    }

    #[test]
    fn classifies_register_indirect_flow() {
        // br x16, blr x8, ret x1 — the register field must not matter.
        assert_eq!(kind_at(0x1000, 0xd61f_0200), InsnKind::BranchRegister);
        assert_eq!(kind_at(0x1000, 0xd63f_0100), InsnKind::BranchLinkRegister);
        assert_eq!(kind_at(0x1000, 0xd65f_0020), InsnKind::Return);
    }

    #[test]
    fn unrecognised_words_are_other() {
        // mov w0, #0
        assert_eq!(kind_at(0x1000, 0x5280_0000), InsnKind::Other);
    }

    #[test]
    fn format_text_renders_mnemonics_and_placeholders() {
        let bytes = [
            0x04, 0x00, 0x00, 0x14, // b +0x10
            0x00, 0x00, 0x80, 0x52, // mov w0, #0
            0xc0, 0x03, 0x5f, 0xd6, // ret
        ];
        let insns = decode(&bytes, 0x1000).unwrap();
        assert_eq!(format_text(&insns[0]), "b 0x1010");
        assert_eq!(format_text(&insns[1]), "<arm64 0x52800000>");
        assert_eq!(format_text(&insns[2]), "ret");
    }

    #[test]
    fn ret_kind_drives_terminator() {
        // mov; ret (mov w0, #0 = 0x52800000; ret = 0xd65f03c0).
        let bytes = [0x00, 0x00, 0x80, 0x52, 0xc0, 0x03, 0x5f, 0xd6];
        let insns = decode(&bytes, 0x1000).unwrap();
        let f = lift_function("f".into(), &insns);
        assert_eq!(f.blocks.len(), 1);
        assert_eq!(f.addr, VAddr(0x1000));
        assert_eq!(f.blocks[0].insns.len(), 2);
        assert_eq!(f.blocks[0].terminator, Terminator::Return);
    }

    #[test]
    fn br_kind_drives_indirect_terminator() {
        // br x16 = 0xd61f0200.
        let insns = decode(&0xd61f_0200u32.to_le_bytes(), 0x1000).unwrap();
        let f = lift_function("f".into(), &insns);
        assert_eq!(f.blocks[0].terminator, Terminator::IndirectBranch);
    }

    #[test]
    fn lifting_empty_stream_gives_one_empty_fallthrough_block() {
        let f = lift_function("f".into(), &[]);
        assert_eq!(f.addr, VAddr(0));
        assert_eq!(f.blocks.len(), 1);
        assert!(f.blocks[0].insns.is_empty());
        assert_eq!(f.blocks[0].terminator, Terminator::Fallthrough);
    }
}