ud_arch_bpf/lib.rs
1//! Linux eBPF + Solana SBF (sBPFv1 / sBPFv2) decoder + minimal
2//! lifter.
3//!
4//! Every BPF "slot" is 8 bytes: a 1-byte opcode, a 1-byte
5//! pair of dst/src nibbles, a signed `le16` offset, and a
6//! signed `le32` immediate. One special instruction — `lddw`
7//! (load 64-bit immediate, opcode 0x18) — takes two
8//! consecutive slots: the first carries bits [31:0] in `imm`,
9//! the second has opcode 0 and bits [63:32] in its `imm`.
10//!
11//! Solana SBF (classic / sBPFv1) and Agave sBPFv2 reuse the
12//! same encoding with a handful of extra opcodes:
13//! * `CALL_REG` (0x8d) — register-indexed dynamic call (added
14//! in sBPFv1).
15//! * `UDIV` / `SDIV` / `UREM` / `SREM` PQR variants — sBPFv2
16//! dedicated division/remainder ops (the Linux eBPF
17//! opcodes for these slots mean different things or are
18//! absent).
19//! * Explicit sign-extends (`SXH`/`SXW`/`SXD`) — sBPFv2.
20//!
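//! The decoder is variant-gated. Every slot is classified from
//! the 3-bit class (plus op nibble) of its opcode, so opcodes
//! whose mnemonic we don't know in the configured variant still
//! receive a class-derived `InsnKind::*` and render with a
//! placeholder mnemonic (`<alu?>` / `<jcc?>`). `InsnKind::Unknown`
//! is emitted for an `lddw` first slot whose continuation slot is
//! missing or malformed. In every case the raw 8 bytes are
//! preserved verbatim — the round-trip property holds via byte
//! identity regardless of whether we can name the instruction.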
27//!
28//! References:
29//! * Linux Kernel — eBPF Instruction Set, v6.5 docs.
30//! * solana_rbpf — text format and SBF-specific opcode set.
31
32#![allow(clippy::cast_possible_truncation)]
33#![allow(clippy::cast_possible_wrap)]
34#![allow(clippy::cast_sign_loss)]
35
36use std::collections::BTreeSet;
37
38use ud_core::VAddr;
39use ud_ir::{ArchInsn, BasicBlock, Function, Terminator};
40
41mod assemble;
42mod codec;
43pub use assemble::{
44 assemble_bpf, assemble_bpf_ifblock_cond, assemble_bpf_ja, desymbolize_bpf_text, AssembleError,
45};
46pub use codec::{register, BpfCodec, EM_BPF, EM_SBF};
47
/// On-disk size, in bytes, of one BPF instruction slot. Every
/// instruction occupies exactly one slot except `lddw`, which
/// spans two consecutive slots.
pub const INSN_SIZE: usize = 8;
50
/// Variant selector. The bytes for shared opcodes are identical
/// across variants; the variant only changes which opcodes we
/// know the mnemonic for and which ones are *legal* per the
/// runtime that consumes the bytecode.
///
/// Within this file the variant influences two things: how
/// opcode `0x8d` is classified (`Call` on Linux, `CallReg` on
/// both SBF variants) and which mnemonic the formatter picks for
/// the ALU div/mod op nibbles (`div`/`mod` vs. `udiv`/`urem`,
/// plus `sdiv`/`srem` on sBPFv2 only).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BpfVariant {
    /// Linux eBPF — base ISA. ELF `e_machine = EM_BPF` (247).
    Linux,
    /// Solana SBF (classic / sBPFv1). Adds `CALL_REG` (0x8d).
    /// The `CALL_IMM` immediate, after relocation, is the
    /// Murmur3 hash of the syscall name (we render the raw
    /// hash; name resolution is out of scope for v1).
    Sbfv1,
    /// Agave sBPFv2. Adds PQR ops (UDIV/SDIV/UREM/SREM) and
    /// explicit sign-extends. Some classic ALU32 implicit
    /// sign-extends behave differently here.
    Sbfv2,
}
69
/// Errors specific to the BPF backend.
///
/// `decode` in this file only ever returns `Misaligned`: a
/// malformed `lddw` pair is tolerated there and surfaces as an
/// `InsnKind::Unknown` slot instead of an error. The two `Lddw*`
/// variants are not constructed anywhere in this file —
/// presumably they are used by the sibling `assemble` / `codec`
/// modules (NOTE(review): confirm they are still reachable).
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// The input length is not a whole number of 8-byte slots.
    /// `len` is the offending buffer length in bytes.
    #[error(
        "byte buffer length {len} is not a multiple of {INSN_SIZE} (BPF slots are fixed-width)"
    )]
    Misaligned { len: usize },
    /// An `lddw` first slot at byte `offset` has no second slot
    /// after it.
    #[error("lddw at offset {offset:#x} truncated — second slot missing")]
    LddwTruncated { offset: usize },
    /// The slot following an `lddw` at byte `offset` does not
    /// start with opcode 0; `opcode` is the byte that was found.
    #[error("lddw at offset {offset:#x} continuation slot has non-zero opcode {opcode:#x}")]
    LddwBadContinuation { offset: usize, opcode: u8 },
}
82
/// Crate-local `Result` alias whose error type defaults to the
/// BPF backend [`Error`]; callers can still name a different `E`.
pub type Result<T, E = Error> = std::result::Result<T, E>;
84
/// Coarse classification — enough to drive CFG construction and
/// to pick the text rendering. The variant-specific mnemonic
/// (e.g. `udiv64` vs `udiv32` for sBPFv2) is derived from the
/// raw `opcode` byte at format time; we don't carry a separate
/// mnemonic field on `DecodedInsn`.
///
/// The kind is derived from the opcode's 3-bit class and 4-bit op
/// nibble only; it does not validate that the full opcode byte is
/// an assigned instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InsnKind {
    /// 32-bit ALU op (`add32`, `mov32`, etc.).
    Alu32,
    /// 64-bit ALU op (`add64`, `mov64`, etc.).
    Alu64,
    /// Unconditional 64-bit jump (`ja +offset`).
    Jmp,
    /// Conditional 64-bit jump (`jeq`, `jne`, `jgt`, …).
    JmpCond,
    /// Conditional 32-bit jump (`jeq32`, `jne32`, …) — eBPF JMP32 class.
    JmpCond32,
    /// `call imm` (helper / syscall — imm is a numeric id or
    /// Murmur3 hash on SBF). Opcode 0x8d is also classified as
    /// `Call` under the Linux variant.
    Call,
    /// `callx r` (register-indirect call) — SBFv1+.
    CallReg,
    /// `exit`.
    Exit,
    /// Memory load (LD / LDX class).
    Load,
    /// Memory store (ST / STX class).
    Store,
    /// First slot of `lddw r, imm64` — `imm64` carries the
    /// combined 64-bit immediate.
    Lddw,
    /// Second slot of `lddw` (opcode 0, imm is the high 32 of
    /// the 64-bit value).
    LddwSecondHalf,
    /// Endian conversion (`be16`/`be32`/`be64`/`le16`/`le32`/`le64`).
    Endian,
    /// Bytes don't match any opcode we know in the configured
    /// variant. The raw 8 bytes survive on `DecodedInsn::bytes`.
    /// `decode` emits this for an `lddw` first slot (opcode 0x18)
    /// that has no well-formed continuation slot.
    Unknown,
}
125
/// One decoded BPF slot.
///
/// All scalar fields are redundant views of `bytes`; the raw
/// array is what round-trips back to the original byte stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DecodedInsn {
    /// Virtual address of this slot (stream start + byte offset).
    pub addr: VAddr,
    /// The slot's 8 raw bytes, exactly as they appeared in the input.
    pub bytes: [u8; INSN_SIZE],
    /// Coarse classification assigned by the decoder.
    pub kind: InsnKind,
    /// Opcode byte (byte 0 of the slot).
    pub opcode: u8,
    /// Destination register (low nibble of byte 1).
    pub dst: u8,
    /// Source register (high nibble of byte 1).
    pub src: u8,
    /// 16-bit signed offset, little-endian.
    pub offset: i16,
    /// 32-bit signed immediate, little-endian.
    pub imm: i32,
    /// Combined 64-bit immediate — populated only on the first
    /// slot of LDDW (opcode 0x18); `None` everywhere else.
    /// It is also `None` on an orphaned 0x18 slot that the decoder
    /// could not pair with a continuation.
    pub imm64: Option<u64>,
}
145
impl DecodedInsn {
    /// Raw 8-byte encoding interpreted as a `u64` (little-endian).
    ///
    /// Byte 0 (the opcode) ends up in the least-significant byte
    /// of the returned value and the top byte of `imm` in the
    /// most-significant byte.
    #[must_use]
    pub fn raw_u64(&self) -> u64 {
        u64::from_le_bytes(self.bytes)
    }
}
153
/// Architecture-neutral view of a decoded slot for the shared IR.
impl ArchInsn for DecodedInsn {
    /// Virtual address of the slot.
    fn addr(&self) -> VAddr {
        self.addr
    }
    /// The slot's original 8 bytes (always `INSN_SIZE` long); an
    /// `lddw` pair contributes two separate instructions, each
    /// returning its own half.
    fn original_bytes(&self) -> &[u8] {
        &self.bytes
    }
}
162
163/// Decode `bytes` as a BPF instruction stream starting at
164/// virtual address `start`. Buffer length must be a multiple of
165/// `INSN_SIZE`. The decoder recognises `lddw` (opcode 0x18) and
166/// emits two `DecodedInsn`s for it — one `Lddw` carrying the
167/// 64-bit immediate, plus a `LddwSecondHalf` continuation —
168/// so each output `DecodedInsn` still has exactly 8 bytes.
169pub fn decode(bytes: &[u8], start: u64, variant: BpfVariant) -> Result<Vec<DecodedInsn>> {
170 if bytes.len() % INSN_SIZE != 0 {
171 return Err(Error::Misaligned { len: bytes.len() });
172 }
173 let mut out = Vec::with_capacity(bytes.len() / INSN_SIZE);
174 let mut i = 0usize;
175 while i < bytes.len() {
176 let slot = &bytes[i..i + INSN_SIZE];
177 let raw: [u8; INSN_SIZE] = slot.try_into().expect("INSN_SIZE chunk");
178 let addr = start.saturating_add(i as u64);
179 let opcode = raw[0];
180 let dst = raw[1] & 0x0f;
181 let src = (raw[1] >> 4) & 0x0f;
182 let offset = i16::from_le_bytes([raw[2], raw[3]]);
183 let imm = i32::from_le_bytes([raw[4], raw[5], raw[6], raw[7]]);
184
185 if opcode == 0x18 {
186 // LDDW — coalesce with the following slot. When
187 // the continuation slot is missing or starts with
188 // a non-zero opcode (e.g. a function boundary that
189 // happens to land mid-`lddw` after layer-2's
190 // call-target harvest), fall through to the
191 // generic slot emission below. That way the orphan
192 // bytes survive as a `@bpf 0x…` placeholder and
193 // round-trip stays byte-identical.
194 let has_well_formed_pair =
195 i + 2 * INSN_SIZE <= bytes.len() && bytes[i + INSN_SIZE] == 0;
196 if !has_well_formed_pair {
197 // Treat as a regular slot (no LDDW pairing).
198 out.push(DecodedInsn {
199 addr: VAddr(addr),
200 bytes: raw,
201 kind: InsnKind::Unknown,
202 opcode,
203 dst,
204 src,
205 offset,
206 imm,
207 imm64: None,
208 });
209 i += INSN_SIZE;
210 continue;
211 }
212 let cont = &bytes[i + INSN_SIZE..i + 2 * INSN_SIZE];
213 let imm_hi = u32::from_le_bytes([cont[4], cont[5], cont[6], cont[7]]);
214 let imm_lo = imm as u32;
215 let imm64 = (u64::from(imm_hi) << 32) | u64::from(imm_lo);
216 out.push(DecodedInsn {
217 addr: VAddr(addr),
218 bytes: raw,
219 kind: InsnKind::Lddw,
220 opcode,
221 dst,
222 src,
223 offset,
224 imm,
225 imm64: Some(imm64),
226 });
227 let cont_raw: [u8; INSN_SIZE] = cont.try_into().expect("INSN_SIZE chunk");
228 let cont_addr = addr.wrapping_add(INSN_SIZE as u64);
229 out.push(DecodedInsn {
230 addr: VAddr(cont_addr),
231 bytes: cont_raw,
232 kind: InsnKind::LddwSecondHalf,
233 opcode: 0,
234 dst: cont_raw[1] & 0x0f,
235 src: (cont_raw[1] >> 4) & 0x0f,
236 offset: i16::from_le_bytes([cont_raw[2], cont_raw[3]]),
237 imm: i32::from_le_bytes([cont_raw[4], cont_raw[5], cont_raw[6], cont_raw[7]]),
238 imm64: None,
239 });
240 i += 2 * INSN_SIZE;
241 continue;
242 }
243
244 let kind = classify_opcode(opcode, variant);
245 out.push(DecodedInsn {
246 addr: VAddr(addr),
247 bytes: raw,
248 kind,
249 opcode,
250 dst,
251 src,
252 offset,
253 imm,
254 imm64: None,
255 });
256 i += INSN_SIZE;
257 }
258 Ok(out)
259}
260
/// Pure re-classifier — re-derives `kind` from `opcode` + the
/// configured variant. Useful when something wants to re-walk a
/// slice of decoded slots after the fact (matches the
/// `classify` contract from other arch crates).
///
/// Only the opcode byte is consulted, so the result can differ
/// from the `kind` that `decode` stored on the instruction: both
/// slots of an `lddw` pair (opcodes 0x18 and 0x00) and an orphaned
/// 0x18 slot belong to load class 0 and therefore come back as
/// `InsnKind::Load`, never as `Lddw`, `LddwSecondHalf` or
/// `Unknown`.
#[must_use]
pub fn classify(insn: &DecodedInsn, variant: BpfVariant) -> InsnKind {
    classify_opcode(insn.opcode, variant)
}
269
270fn classify_opcode(opcode: u8, variant: BpfVariant) -> InsnKind {
271 let class = opcode & 0x07;
272 // sBPFv2 reuses the ALU32 div/mod opcode bytes for explicit
273 // PQR variants (UDIV / UREM / SDIV / SREM 32-bit) — same
274 // raw bytes, different runtime semantics. Pattern-match the
275 // ones we know so future passes can render the right
276 // mnemonic.
277 let _ = variant;
278 match class {
279 // BPF_LD (0x00) — non-register-indexed loads, and
280 // BPF_LDX (0x01) — register-indexed (`ldxb/h/w/dw`).
281 // Both reach us as `Load`; the formatter picks the
282 // exact mnemonic from the opcode byte.
283 0x00 | 0x01 => InsnKind::Load,
284 // BPF_ST (0x02) — immediate store; BPF_STX (0x03) —
285 // register store. Same `Store` classification for CFG
286 // purposes.
287 0x02 | 0x03 => InsnKind::Store,
288 // BPF_ALU (0x04) — 32-bit ALU; `END` (byte-swap /
289 // endian conversion) lives at op nibble 0xd.
290 0x04 => {
291 if (opcode >> 4) == 0xd {
292 InsnKind::Endian
293 } else {
294 InsnKind::Alu32
295 }
296 }
297 // BPF_JMP (0x05) — 64-bit jumps.
298 0x05 => classify_jmp(opcode, variant),
299 // BPF_JMP32 (0x06) — 32-bit-compare conditional jumps.
300 0x06 => classify_jmp32(opcode),
301 // BPF_ALU64 (0x07) — 64-bit ALU; 0xd is the (rare)
302 // 64-bit endian slot.
303 0x07 => {
304 if (opcode >> 4) == 0xd {
305 InsnKind::Endian
306 } else {
307 InsnKind::Alu64
308 }
309 }
310 _ => InsnKind::Unknown,
311 }
312}
313
314/// Classify an opcode in `BPF_JMP` class (low 3 bits = 5).
315fn classify_jmp(opcode: u8, variant: BpfVariant) -> InsnKind {
316 let op = opcode >> 4;
317 match op {
318 // JA = 0x05 (op nibble 0, class 5).
319 0x0 => InsnKind::Jmp,
320 // CALL (0x85) and CALL-with-src=1 (0x8d) both live in
321 // JMP class with op nibble 0x8. On Linux eBPF 0x8d is
322 // a BPF-to-BPF local call (imm is a relative slot
323 // offset); on SBF it's CALLX (register-source). Either
324 // way it's a call, not a conditional jump.
325 0x8 => {
326 if opcode == 0x8d && matches!(variant, BpfVariant::Sbfv1 | BpfVariant::Sbfv2) {
327 InsnKind::CallReg
328 } else if opcode == 0x8d {
329 // Linux BPF-to-BPF call: target = next + imm*8,
330 // same shape as a CALL_IMM. Classify as Call so
331 // layer-2 picks up the target.
332 InsnKind::Call
333 } else if opcode == 0x85 {
334 InsnKind::Call
335 } else {
336 InsnKind::JmpCond
337 }
338 }
339 // EXIT = 0x95.
340 0x9 if opcode == 0x95 => InsnKind::Exit,
341 // Everything else in JMP class is a conditional jump
342 // (JEQ/JGT/JGE/JSET/JNE/JSGT/JSGE/JLT/JLE/JSLT/JSLE),
343 // either reg- or imm-source. Either way the CFG cares
344 // about the offset to the taken branch.
345 _ => InsnKind::JmpCond,
346 }
347}
348
349fn classify_jmp32(opcode: u8) -> InsnKind {
350 // All JMP32 opcodes are conditional (there's no unconditional
351 // ja32; ja stays in the JMP class).
352 let _ = opcode;
353 InsnKind::JmpCond32
354}
355
356/// Compute the absolute byte-address target of a relative jump.
357/// BPF offsets are in *slots* (8 bytes each) and apply to the
358/// instruction *after* this one.
359#[must_use]
360pub fn jump_target(insn: &DecodedInsn) -> u64 {
361 let next_slot = insn.addr.0.wrapping_add(INSN_SIZE as u64);
362 let off_bytes = i64::from(insn.offset).wrapping_mul(INSN_SIZE as i64);
363 next_slot.wrapping_add(off_bytes as u64)
364}
365
366/// Compute the absolute byte-address target of a `call <imm>`
367/// instruction *for a local call*. The `imm` field on a BPF
368/// `call` is a signed slot offset relative to the next slot.
369///
370/// Callers should first verify the call isn't a syscall — for
371/// the Linux kernel the `imm` is a helper-id and is *not* a
372/// code offset; for SBF the `imm` is a Murmur3 hash (or `-1`
373/// before relocation) and again is not a code offset. The
374/// usual discriminator is "is this call site in the
375/// relocation-resolved syscall map?" — see
376/// `ud_analysis::bpf_relocs::build_call_site_names`.
377#[must_use]
378pub fn call_target(insn: &DecodedInsn) -> u64 {
379 let next_slot = insn.addr.0.wrapping_add(INSN_SIZE as u64);
380 let off_bytes = i64::from(insn.imm).wrapping_mul(INSN_SIZE as i64);
381 next_slot.wrapping_add(off_bytes as u64)
382}
383
384/// Lift a decoded instruction stream into a CFG.
385///
386/// Slices the stream into basic blocks at every intra-function
387/// jump target and immediately after every control-flow exit
388/// (`exit` / unconditional `ja` / conditional `j*`). The
389/// resulting `Function<DecodedInsn>` is suitable for
390/// downstream SSA / dominance / liveness analyses — joins at
391/// reconvergence points generate proper phi placement, which
392/// the previous single-block-per-function shape could never
393/// produce.
394///
395/// Calls (`call imm` and indirect `callx`) are **not** block
396/// terminators: control flow returns through them
397/// normally and the following slot stays in the same block.
398/// Only when a call is the function's last instruction does
399/// `Terminator::IndirectBranch` surface on its block.
400///
401/// The byte-identity contract still holds — every instruction
402/// rides in some block in original address order, and
403/// `Function::emit_bytes` concatenates blocks back into the
404/// original byte stream.
405#[must_use]
406pub fn lift_function(name: String, insns: &[DecodedInsn]) -> Function<DecodedInsn> {
407 let addr = insns.first().map_or(VAddr(0), |i| i.addr);
408 if insns.is_empty() {
409 return Function {
410 addr,
411 name,
412 blocks: Vec::new(),
413 };
414 }
415 let fn_start = addr.0;
416 let fn_end = insns
417 .last()
418 .map_or(fn_start, |i| i.addr.0.wrapping_add(INSN_SIZE as u64));
419
420 // Collect block boundaries: function entry, every intra-
421 // function jump target, and the slot immediately after
422 // every control-flow exit (jmp/jcc/exit). LDDW second
423 // halves are never boundary candidates — the verifier
424 // forbids jumps into mid-`lddw` and we never need to
425 // split between the two slots of an `lddw` pair.
426 let mut boundaries: BTreeSet<u64> = BTreeSet::new();
427 boundaries.insert(fn_start);
428 for i in insns {
429 if matches!(
430 i.kind,
431 InsnKind::Jmp | InsnKind::JmpCond | InsnKind::JmpCond32
432 ) {
433 let t = jump_target(i);
434 if (fn_start..fn_end).contains(&t) {
435 boundaries.insert(t);
436 }
437 }
438 if matches!(
439 i.kind,
440 InsnKind::Jmp | InsnKind::JmpCond | InsnKind::JmpCond32 | InsnKind::Exit
441 ) {
442 let next = i.addr.0.wrapping_add(INSN_SIZE as u64);
443 if next < fn_end {
444 boundaries.insert(next);
445 }
446 }
447 }
448
449 // Walk the stream once, emitting a block whenever the
450 // current insn lands on a boundary (and we have prior
451 // insns accumulated).
452 let mut blocks: Vec<BasicBlock<DecodedInsn>> = Vec::new();
453 let mut current: Vec<DecodedInsn> = Vec::new();
454 let mut current_addr: u64 = fn_start;
455 for i in insns {
456 if boundaries.contains(&i.addr.0) && !current.is_empty() {
457 let term = block_terminator(¤t);
458 blocks.push(BasicBlock {
459 addr: VAddr(current_addr),
460 insns: std::mem::take(&mut current),
461 terminator: term,
462 });
463 current_addr = i.addr.0;
464 }
465 current.push(i.clone());
466 }
467 if !current.is_empty() {
468 let term = block_terminator(¤t);
469 blocks.push(BasicBlock {
470 addr: VAddr(current_addr),
471 insns: current,
472 terminator: term,
473 });
474 }
475
476 Function { addr, name, blocks }
477}
478
479/// Pick the terminator for a block from its last instruction's
480/// kind. Falls through to the next block when the last insn
481/// isn't a control-flow primitive — typically because the
482/// block ended at a jump target rather than at an exit.
483fn block_terminator(insns: &[DecodedInsn]) -> Terminator {
484 let Some(last) = insns.last() else {
485 return Terminator::Fallthrough;
486 };
487 match last.kind {
488 InsnKind::Exit => Terminator::Return,
489 InsnKind::Jmp => Terminator::UnconditionalBranch {
490 target: VAddr(jump_target(last)),
491 },
492 InsnKind::JmpCond | InsnKind::JmpCond32 => Terminator::ConditionalBranch {
493 taken: VAddr(jump_target(last)),
494 fallthrough: VAddr(last.addr.0.wrapping_add(INSN_SIZE as u64)),
495 },
496 InsnKind::CallReg => Terminator::IndirectBranch,
497 _ => Terminator::Fallthrough,
498 }
499}
500
501// ============================================================
502// Text rendering — solana_rbpf / llvm-objdump style.
503// ============================================================
504
505/// Render a decoded instruction as text. Matches the
506/// solana_rbpf / llvm-objdump dialect closely enough that a
507/// reader who knows BPF will recognise everything.
508#[must_use]
509pub fn format_insn(insn: &DecodedInsn, variant: BpfVariant) -> String {
510 if matches!(insn.kind, InsnKind::LddwSecondHalf) {
511 // The continuation half of LDDW has no standalone
512 // mnemonic; render it as bytes-only so the .ud reader
513 // sees the pair clearly.
514 return format!("<lddw-cont 0x{:08x}>", insn.imm as u32);
515 }
516 let class = insn.opcode & 0x07;
517 match class {
518 0x00 | 0x01 => format_ld(insn),
519 0x02 | 0x03 => format_st(insn),
520 0x04 => format_alu(insn, /* alu64 */ false, variant),
521 0x05 => format_jmp(insn, /* is_32 */ false, variant),
522 0x06 => format_jmp(insn, /* is_32 */ true, variant),
523 0x07 => format_alu(insn, /* alu64 */ true, variant),
524 _ => format!("<bpf 0x{:016x}>", insn.raw_u64()),
525 }
526}
527
528fn format_ld(insn: &DecodedInsn) -> String {
529 // LDDW (opcode 0x18) — load 64-bit immediate. For a
530 // well-formed pair the decoder set `imm64` to the
531 // joined value. For an orphaned LDDW (continuation
532 // slot stolen by a downstream pass, e.g. function-
533 // boundary harvest mid-pair), `imm64` is None and we
534 // render just the slot's own `imm32` so the byte-drop
535 // pass can round-trip it via `assemble_bpf` — the
536 // separated continuation slot rides as its own
537 // `<lddw-cont 0x…>` line and round-trips too.
538 if insn.opcode == 0x18 {
539 let imm = match insn.imm64 {
540 Some(v) => v,
541 None => u64::from(insn.imm as u32),
542 };
543 return format!("lddw r{}, 0x{:x}", insn.dst, imm);
544 }
545 // Opcode 0 — typically the continuation slot of an
546 // LDDW pair. The decoder labels these `LddwSecondHalf`
547 // when the pair was well-formed; orphans fall through
548 // here with `kind = Unknown`. Render them as the
549 // continuation form regardless so the byte-drop pass
550 // recognises them uniformly.
551 if insn.opcode == 0 {
552 return format!("<lddw-cont 0x{:08x}>", insn.imm as u32);
553 }
554 // LD_ABS / LD_IND (legacy packet loads, opcodes 0x20, 0x28,
555 // 0x30, 0x38, 0x40, 0x48, 0x50). Render generically; corpus
556 // codecs rarely use them.
557 if matches!(insn.opcode, 0x20 | 0x28 | 0x30 | 0x38 | 0x40 | 0x48 | 0x50) {
558 let sz = size_letter(insn.opcode);
559 return format!("ld_abs_{sz} r0, 0x{:x}", insn.imm as u32);
560 }
561 // LDX class — `ldx{b,h,w,dw} dst, [src + offset]`.
562 let sz = size_letter(insn.opcode);
563 let offset = format_offset(insn.offset);
564 format!("ldx{sz} r{}, [r{}{offset}]", insn.dst, insn.src)
565}
566
567fn format_st(insn: &DecodedInsn) -> String {
568 let sz = size_letter(insn.opcode);
569 let offset = format_offset(insn.offset);
570 if (insn.opcode & 0x07) == 0x02 {
571 // ST_IMM — immediate store.
572 format!("st{sz} [r{}{offset}], 0x{:x}", insn.dst, insn.imm as u32)
573 } else {
574 // STX — register store.
575 format!("stx{sz} [r{}{offset}], r{}", insn.dst, insn.src)
576 }
577}
578
/// Mnemonic suffix for the access width encoded in a load/store
/// opcode.
///
/// The size field is bits 3..=4 of the opcode (mask `0x18`):
/// `0x00` → `"w"` (32-bit), `0x08` → `"h"` (16-bit),
/// `0x10` → `"b"` (8-bit), `0x18` → `"dw"` (64-bit).
fn size_letter(opcode: u8) -> &'static str {
    // Indexed by the 2-bit size field.
    const SUFFIXES: [&str; 4] = ["w", "h", "b", "dw"];
    SUFFIXES[usize::from((opcode >> 3) & 0b11)]
}
591
/// Render a memory-operand displacement for use inside `[rN…]`.
///
/// Zero yields an empty string, a positive offset ` + 0x<hex>`,
/// and a negative offset ` - 0x<hex>` using its magnitude (so
/// `i16::MIN` prints as ` - 0x8000`).
fn format_offset(offset: i16) -> String {
    if offset == 0 {
        return String::new();
    }
    if offset > 0 {
        return format!(" + 0x{offset:x}");
    }
    let abs = u32::from(offset.unsigned_abs());
    format!(" - 0x{abs:x}")
}
603
604fn format_alu(insn: &DecodedInsn, alu64: bool, variant: BpfVariant) -> String {
605 // Source bit (bit 3 of opcode): 0 = imm source, 1 = reg source.
606 let is_reg = (insn.opcode & 0x08) != 0;
607 let op_nibble = insn.opcode >> 4;
608 let suffix = if alu64 { "64" } else { "32" };
609 let mnemonic = match (op_nibble, alu64, variant) {
610 (0x0, _, _) => "add",
611 (0x1, _, _) => "sub",
612 (0x2, _, _) => "mul",
613 (0x3, _, BpfVariant::Linux | BpfVariant::Sbfv1) => "div",
614 // sBPFv2: 0x3 in ALU class is `udiv` per the PQR spec.
615 // Same byte; the mnemonic differs.
616 (0x3, _, BpfVariant::Sbfv2) => "udiv",
617 (0x4, _, _) => "or",
618 (0x5, _, _) => "and",
619 (0x6, _, _) => "lsh",
620 (0x7, _, _) => "rsh",
621 (0x8, _, _) => "neg",
622 (0x9, _, BpfVariant::Linux | BpfVariant::Sbfv1) => "mod",
623 (0x9, _, BpfVariant::Sbfv2) => "urem",
624 (0xa, _, _) => "xor",
625 (0xb, _, _) => "mov",
626 (0xc, _, _) => "arsh",
627 (0xd, _, _) => return format_endian(insn),
628 // sBPFv2 added SDIV / SREM in op-nibbles 0xe / 0xf.
629 (0xe, _, BpfVariant::Sbfv2) => "sdiv",
630 (0xf, _, BpfVariant::Sbfv2) => "srem",
631 _ => "<alu?>",
632 };
633 if matches!(op_nibble, 0x8) {
634 // `neg` is single-operand.
635 return format!("neg{suffix} r{}", insn.dst);
636 }
637 if is_reg {
638 format!("{mnemonic}{suffix} r{}, r{}", insn.dst, insn.src)
639 } else {
640 format!("{mnemonic}{suffix} r{}, 0x{:x}", insn.dst, insn.imm as u32)
641 }
642}
643
644fn format_endian(insn: &DecodedInsn) -> String {
645 // `be`/`le` family: opcode = 0xd4 (le) or 0xdc (be); imm
646 // carries the width (16, 32, or 64).
647 let dir = if (insn.opcode & 0x08) == 0 {
648 "le"
649 } else {
650 "be"
651 };
652 format!("{dir}{} r{}", insn.imm, insn.dst)
653}
654
655fn format_jmp(insn: &DecodedInsn, is_32: bool, _variant: BpfVariant) -> String {
656 let op = insn.opcode >> 4;
657 // JA — unconditional.
658 if op == 0 && !is_32 && insn.opcode == 0x05 {
659 return format!("ja {}", format_branch_offset(insn.offset));
660 }
661 // CALL — imm-source helper / syscall.
662 if insn.opcode == 0x85 {
663 return format!("call 0x{:x}", insn.imm as u32);
664 }
665 // CALLX — register-indirect call (SBF).
666 if insn.opcode == 0x8d {
667 return format!("callx r{}", insn.dst);
668 }
669 // EXIT.
670 if insn.opcode == 0x95 {
671 return "exit".into();
672 }
673 let is_reg = (insn.opcode & 0x08) != 0;
674 let suffix = if is_32 { "32" } else { "" };
675 let mnemonic = match op {
676 0x1 => "jeq",
677 0x2 => "jgt",
678 0x3 => "jge",
679 0x4 => "jset",
680 0x5 => "jne",
681 0x6 => "jsgt",
682 0x7 => "jsge",
683 0xa => "jlt",
684 0xb => "jle",
685 0xc => "jslt",
686 0xd => "jsle",
687 _ => "<jcc?>",
688 };
689 let rhs = if is_reg {
690 format!("r{}", insn.src)
691 } else {
692 format!("0x{:x}", insn.imm as u32)
693 };
694 format!(
695 "{mnemonic}{suffix} r{}, {rhs}, {}",
696 insn.dst,
697 format_branch_offset(insn.offset)
698 )
699}
700
/// Render a branch displacement (in slots) with an explicit sign:
/// `+0x<hex>` for zero and positive offsets, `-0x<hex>` with the
/// magnitude for negative ones (`i16::MIN` prints as `-0x8000`).
fn format_branch_offset(offset: i16) -> String {
    if offset.is_negative() {
        let abs = u32::from(offset.unsigned_abs());
        return format!("-0x{abs:x}");
    }
    format!("+0x{offset:x}")
}
709
/// Unit tests for the decoder, the lifter's terminator selection
/// and the text formatter. All fixtures are hand-assembled byte
/// arrays; nothing here touches the sibling `assemble` / `codec`
/// modules.
#[cfg(test)]
mod tests {
    use super::*;

    /// Decoded shape for the fixture filter() function:
    ///   79 11 00 00 ...          — ldxdw r1, [r1 + 0]
    ///   b4 00 00 00 ...          — mov32 r0, 0
    ///   15 01 02 00 ...          — jeq r1, 0, +2
    ///   04 01 00 00 01 00 00 00  — add32 r1, 1
    ///   bc 10 00 00 ...          — mov32 r0, r1
    ///   95 00 00 00 ...          — exit
    ///
    /// Checks the kinds of a few representative slots and that the
    /// decoded slots concatenate back to the input.
    #[test]
    fn decodes_fixture_filter() {
        let bytes: Vec<u8> = vec![
            0x79, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ldxdw r1, [r1+0]
            0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mov32 r0, 0
            0x15, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, // jeq r1, 0, +2
            0x04, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, // add32 r1, 1
            0xbc, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mov32 r0, r1
            0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // exit
        ];
        let insns = decode(&bytes, 0, BpfVariant::Linux).unwrap();
        assert_eq!(insns.len(), 6);
        assert_eq!(insns[0].kind, InsnKind::Load);
        assert_eq!(insns[1].kind, InsnKind::Alu32);
        assert_eq!(insns[2].kind, InsnKind::JmpCond);
        assert_eq!(insns[5].kind, InsnKind::Exit);

        // Round-trip property — every decoded slot's bytes equal
        // the input bytes at its offset.
        let mut reconstructed: Vec<u8> = Vec::with_capacity(bytes.len());
        for i in &insns {
            reconstructed.extend_from_slice(&i.bytes);
        }
        assert_eq!(reconstructed, bytes);
    }

    /// A buffer whose length is not a multiple of `INSN_SIZE` is
    /// rejected up front, reporting the offending length.
    #[test]
    fn rejects_misaligned_buffer() {
        let bytes = [0u8; 7];
        assert!(matches!(
            decode(&bytes, 0, BpfVariant::Linux),
            Err(Error::Misaligned { len: 7 })
        ));
    }

    /// A well-formed `lddw` pair decodes to two instructions: the
    /// first carries the joined 64-bit immediate, the second keeps
    /// its raw bytes untouched.
    #[test]
    fn lddw_pairs_two_slots() {
        // 18 01 00 00 78 56 34 12  — lddw r1, 0x...12345678 (low)
        // 00 00 00 00 ef cd ab 90  — continuation (high = 0x90abcdef)
        let bytes: Vec<u8> = vec![
            0x18, 0x01, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, 0xef, 0xcd,
            0xab, 0x90,
        ];
        let insns = decode(&bytes, 0, BpfVariant::Linux).unwrap();
        assert_eq!(insns.len(), 2);
        assert_eq!(insns[0].kind, InsnKind::Lddw);
        assert_eq!(insns[0].imm64, Some(0x90ab_cdef_1234_5678));
        assert_eq!(insns[1].kind, InsnKind::LddwSecondHalf);
        assert_eq!(insns[1].bytes, [0, 0, 0, 0, 0xef, 0xcd, 0xab, 0x90]);
    }

    /// A function consisting of a single `exit` lifts to one block
    /// terminated by `Return`.
    #[test]
    fn exit_drives_return_terminator() {
        let bytes = [0x95, 0, 0, 0, 0, 0, 0, 0];
        let insns = decode(&bytes, 0x100, BpfVariant::Linux).unwrap();
        let f = lift_function("f".into(), &insns);
        assert_eq!(f.blocks[0].terminator, Terminator::Return);
    }

    /// Opcode 0x8d is the one opcode whose kind depends on the
    /// configured variant.
    #[test]
    fn opcode_8d_classification_per_variant() {
        // 0x8d in JMP class with the source bit set:
        //   * SBFv1+: register-source callx (`callx r3`).
        //   * Linux eBPF: BPF-to-BPF local call (`imm` is a
        //     relative slot offset).
        // Both classify as a "call" — register-target on SBF,
        // imm-target on Linux.
        let bytes = [0x8d, 0x30, 0, 0, 0, 0, 0, 0];
        assert_eq!(
            decode(&bytes, 0, BpfVariant::Linux).unwrap()[0].kind,
            InsnKind::Call,
        );
        assert_eq!(
            decode(&bytes, 0, BpfVariant::Sbfv1).unwrap()[0].kind,
            InsnKind::CallReg,
        );
    }

    /// Spot-checks the text rendering of a load with zero offset
    /// (no `+ 0x0` suffix), an immediate ALU op and `exit`.
    #[test]
    fn formats_basic_ops() {
        // Three slots: ldxdw r1, [r1] / mov32 r0, 0 / exit.
        let bytes: Vec<u8> = vec![
            0x79, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];
        let insns = decode(&bytes, 0, BpfVariant::Linux).unwrap();
        assert_eq!(format_insn(&insns[0], BpfVariant::Linux), "ldxdw r1, [r1]");
        assert_eq!(format_insn(&insns[1], BpfVariant::Linux), "mov32 r0, 0x0");
        assert_eq!(format_insn(&insns[2], BpfVariant::Linux), "exit");
    }
}