Skip to main content

ud_arch_bpf/
codec.rs

//! `ArchCodec` implementation for BPF (Linux eBPF + Solana SBF
//! v1 / v2).
//!
//! Each codec instance carries a [`BpfVariant`]. The trait
//! methods that need slot-offset arithmetic compute it from
//! `(source_ip, target)`, then delegate to the existing
//! `assemble_bpf_*` family.
//!
//! `register()` submits one factory that picks a variant from the
//! parsed module's `arch` field plus its numeric `e_machine`
//! (`EM_BPF = 247` for Linux eBPF; `EM_SBF = 263` for Solana SBF).
12
13use crate::{
14    assemble_bpf, assemble_bpf_ifblock_cond, assemble_bpf_ja, desymbolize_bpf_text, BpfVariant,
15    INSN_SIZE,
16};
17use ud_arch_codec::{ArchCodec, ArchError, EncodeHints, SwitchSpec};
18
19/// One codec per BPF variant.
20#[derive(Debug, Clone, Copy)]
21pub struct BpfCodec(pub BpfVariant);
22
23impl BpfCodec {
24    /// Linux eBPF (base ISA).
25    pub const LINUX: Self = Self(BpfVariant::Linux);
26    /// Solana sBPFv1.
27    pub const SBF_V1: Self = Self(BpfVariant::Sbfv1);
28    /// Agave sBPFv2.
29    pub const SBF_V2: Self = Self(BpfVariant::Sbfv2);
30}
31
32/// Compute the slot offset for a relative branch: `(target -
33/// next_slot) / 8`. Returns the i16 the BPF encoder expects,
34/// or an `OutOfRange` error if the displacement doesn't fit /
35/// isn't slot-aligned.
36fn slot_offset(source_ip: u64, target: u64) -> Result<i16, ArchError> {
37    let next_slot = source_ip.wrapping_add(INSN_SIZE as u64);
38    #[allow(clippy::cast_possible_wrap)]
39    let delta = (target as i64).wrapping_sub(next_slot as i64);
40    if delta % (INSN_SIZE as i64) != 0 {
41        return Err(ArchError::OutOfRange(format!(
42            "BPF branch displacement {delta} bytes is not slot-aligned"
43        )));
44    }
45    let slots = delta / (INSN_SIZE as i64);
46    i16::try_from(slots).map_err(|_| {
47        ArchError::OutOfRange(format!(
48            "BPF branch displacement {slots} slots overflows i16 (max ±32768)"
49        ))
50    })
51}
52
53impl ArchCodec for BpfCodec {
54    fn name(&self) -> &'static str {
55        match self.0 {
56            BpfVariant::Linux => "bpf-linux",
57            BpfVariant::Sbfv1 => "bpf-sbf-v1",
58            BpfVariant::Sbfv2 => "bpf-sbf-v2",
59        }
60    }
61
62    fn assemble_one(&self, text: &str, _addr: u64) -> Result<Vec<u8>, ArchError> {
63        assemble_bpf(text).map_err(|e| ArchError::Assemble(e.to_string()))
64    }
65
66    fn desymbolize(&self, text: &str, addr: u64) -> String {
67        desymbolize_bpf_text(text, addr, None).unwrap_or_else(|| text.to_string())
68    }
69
70    fn encode_jump(
71        &self,
72        source_ip: u64,
73        target: u64,
74        _hints: EncodeHints,
75    ) -> Result<Vec<u8>, ArchError> {
76        let off = slot_offset(source_ip, target)?;
77        assemble_bpf_ja(off).map_err(|e| ArchError::Assemble(e.to_string()))
78    }
79
80    /// Encode an intra-program call. The choice between
81    /// `call_local` (opcode 0x8d, Linux eBPF convention) and
82    /// `call_internal` (opcode 0x85 src=1, Solana sBPF
83    /// convention) is hinted by `EncodeHints::bpf_call_local`:
84    /// `Some(true)` → call_local, `Some(false)` → call_internal,
85    /// `None` → default to call_internal (Solana sBPF, the
86    /// dominant convention in practice; Solana programs
87    /// often carry `e_machine = EM_BPF (247)` despite using
88    /// the sBPF call form, so the EM marker alone can't
89    /// disambiguate). The imm is the slot delta from the
90    /// next slot to the target, signed.
91    ///
92    /// Syscalls (opcode 0x85 src=0, imm = a name hash or -1)
93    /// don't go through this method — they remain pinned in
94    /// `Stmt::Call.bytes` because the imm depends on
95    /// relocation context the codec doesn't carry.
96    fn encode_call(
97        &self,
98        source_ip: u64,
99        target: u64,
100        hints: EncodeHints,
101    ) -> Result<Vec<u8>, ArchError> {
102        let slots = slot_offset(source_ip, target)?;
103        let imm32 = i32::from(slots);
104        let mnemonic = if hints.bpf_call_local.unwrap_or(false) {
105            "call_local"
106        } else {
107            "call_internal"
108        };
109        assemble_bpf(&format!("{mnemonic} {imm32}")).map_err(|e| ArchError::Assemble(e.to_string()))
110    }
111
112    fn encode_cond_jump(
113        &self,
114        cond_text: &str,
115        source_ip: u64,
116        target: u64,
117        _hints: EncodeHints,
118    ) -> Result<Vec<u8>, ArchError> {
119        let off = slot_offset(source_ip, target)?;
120        assemble_bpf_ifblock_cond(cond_text, off).map_err(|e| ArchError::Assemble(e.to_string()))
121    }
122
123    fn encode_switch_dispatch(&self, _spec: &SwitchSpec) -> Result<Vec<u8>, ArchError> {
124        // BPF doesn't model jump-table dispatch as a single
125        // structural form today.
126        Err(ArchError::Unsupported {
127            arch: self.name(),
128            operation: "switch_dispatch",
129        })
130    }
131
132    fn encoded_jump_size(&self, _source_ip: u64, _target: u64, _hints: EncodeHints) -> usize {
133        INSN_SIZE
134    }
135
136    fn encoded_cond_jump_size(&self, _source_ip: u64, _target: u64, _hints: EncodeHints) -> usize {
137        INSN_SIZE
138    }
139
140    fn encoded_call_size(&self, _source_ip: u64, _target: u64, _hints: EncodeHints) -> usize {
141        INSN_SIZE
142    }
143
144    /// BPF calls are single 8-byte instructions — pinned
145    /// `Stmt::Call.bytes` (when present) is the complete call.
146    fn direct_call_bytes_contain_call(&self) -> bool {
147        true
148    }
149
150    /// Encode `dst = src` as one BPF instruction.
151    ///
152    /// Supported shapes:
153    ///
154    /// * `("rN", "rM")` → `mov64 rN, rM` (8 bytes)
155    /// * `("rN", "0xN")` → `mov64 rN, imm32` (8 bytes)
156    /// * `("rN", "[rM ± off]")` (optional `:uNN` suffix) →
157    ///   `ldxdw / ldxw / ldxh / ldxb rN, [rM ± off]` (8 bytes)
158    /// * `("[rN ± off]", "rM")` (optional `:uNN` suffix on dst) →
159    ///   `stxdw / stxw / stxh / stxb [rN ± off], rM` (8 bytes)
160    /// * `("rN", "0x<imm>:u64")` → `lddw rN, 0x<imm>` (16 bytes,
161    ///   two BPF slots — the second being a zero-opcode
162    ///   continuation slot carrying the high 32 bits)
163    ///
164    /// The `:u<bits>` size suffix on a memory operand picks the
165    /// access width (`:u8 / :u16 / :u32 / :u64`); the bare
166    /// `[rN ± off]` form defaults to `:u64` (BPF `dw`).
167    fn encode_move(&self, dst: &str, src: &str) -> Result<Vec<u8>, ArchError> {
168        let dst = dst.trim();
169        let src = src.trim();
170        // Strip optional `:u<bits>` suffix from each side;
171        // remember the size and which side carried it (the
172        // memory operand always carries the suffix when
173        // present).
174        let (dst_core, dst_size) = split_size_suffix(dst);
175        let (src_core, src_size) = split_size_suffix(src);
176
177        // LDDW: register dst, 64-bit immediate src.
178        if is_bpf_reg(dst_core) && (src_size == Some(64) || is_lddw_imm(src_core)) {
179            let imm_str = src_core.trim();
180            return assemble_bpf(&format!("lddw {dst_core}, {imm_str}"))
181                .map_err(|e| ArchError::Assemble(e.to_string()));
182        }
183
184        // ldx: register dst, [memory] src.
185        if is_bpf_reg(dst_core) && is_bracket_mem(src_core) {
186            let bits = src_size.unwrap_or(64);
187            let suffix = size_suffix_for_bits(bits)?;
188            let mem = desymbolize_mem(src_core);
189            return assemble_bpf(&format!("ldx{suffix} {dst_core}, {mem}"))
190                .map_err(|e| ArchError::Assemble(e.to_string()));
191        }
192
193        // stx: [memory] dst, register src.
194        if is_bracket_mem(dst_core) && is_bpf_reg(src_core) {
195            let bits = dst_size.unwrap_or(64);
196            let suffix = size_suffix_for_bits(bits)?;
197            let mem = desymbolize_mem(dst_core);
198            return assemble_bpf(&format!("stx{suffix} {mem}, {src_core}"))
199                .map_err(|e| ArchError::Assemble(e.to_string()));
200        }
201
202        // mov64: register dst, register or imm32 src.
203        if is_bpf_reg(dst_core) && (is_bpf_reg(src_core) || is_bpf_imm(src_core)) {
204            return assemble_bpf(&format!("mov64 {dst_core}, {src_core}"))
205                .map_err(|e| ArchError::Assemble(e.to_string()));
206        }
207
208        Err(ArchError::Unsupported {
209            arch: self.name(),
210            operation: "move (unrecognised operand shape)",
211        })
212    }
213
214    /// Encode a function return. BPF returns r0 implicitly via
215    /// the `exit` instruction; the `value` field is ignored.
216    fn encode_return(&self, _value: Option<u64>) -> Result<Vec<u8>, ArchError> {
217        assemble_bpf("exit").map_err(|e| ArchError::Assemble(e.to_string()))
218    }
219
220    /// Encode `dst op src` as a single 64-bit BPF ALU
221    /// instruction. The op string maps to the corresponding
222    /// BPF mnemonic; the src may be a register (`r0`..`r10`)
223    /// or an immediate (`0x<hex>` / decimal).
224    ///
225    /// Returns `Unsupported` for operators outside the lifted
226    /// set (`arsh`, `neg`, 32-bit forms keep their `@asm`
227    /// rendering for now).
228    fn encode_arith(&self, dst: &str, op: &str, src: &str) -> Result<Vec<u8>, ArchError> {
229        let dst = dst.trim();
230        let src = src.trim();
231        if !is_bpf_reg(dst) {
232            return Err(ArchError::Unsupported {
233                arch: self.name(),
234                operation: "arith (non-register dst)",
235            });
236        }
237        if !(is_bpf_reg(src) || is_bpf_imm(src)) {
238            return Err(ArchError::Unsupported {
239                arch: self.name(),
240                operation: "arith (unsupported src shape)",
241            });
242        }
243        let mnemonic = match op {
244            "+=" => "add64",
245            "-=" => "sub64",
246            "*=" => "mul64",
247            "/=" => "div64",
248            "%=" => "mod64",
249            "|=" => "or64",
250            "&=" => "and64",
251            "^=" => "xor64",
252            "<<=" => "lsh64",
253            ">>=" => "rsh64",
254            _ => {
255                return Err(ArchError::Unsupported {
256                    arch: self.name(),
257                    operation: "arith (unsupported op)",
258                });
259            }
260        };
261        assemble_bpf(&format!("{mnemonic} {dst}, {src}"))
262            .map_err(|e| ArchError::Assemble(e.to_string()))
263    }
264}
265
/// Return `true` when `s` (ignoring surrounding whitespace) names
/// a BPF general-purpose register, i.e. `r` followed by exactly
/// one of the indices `0`..`10` in canonical decimal form.
fn is_bpf_reg(s: &str) -> bool {
    s.trim().strip_prefix('r').is_some_and(|index| {
        matches!(
            index,
            "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
        )
    })
}
278
/// Return `true` when `s` (ignoring surrounding whitespace) is
/// wrapped in `[` … `]`, the shape of a BPF ldx / stx memory
/// operand. Only the delimiters are checked; validating the
/// contents is left to the underlying assembler's `parse_mem`.
fn is_bracket_mem(s: &str) -> bool {
    // Both delimiters are ASCII, so a byte-level check on the first and
    // last byte is equivalent to the char-level prefix/suffix test.
    matches!(s.trim().as_bytes(), [b'[', .., b']'])
}
286
287/// Rewrite a memory operand's symbolic stack-slot reference
288/// (`[local_<hex>]` or `[arg_<hex>]`) back to the
289/// `[r10 ± 0x<hex>]` form `parse_mem` accepts. The shared
290/// `desymbolize_bpf_text` does the same for `@asm` lines —
291/// this routes a single operand through that helper rather
292/// than reimplementing the rewrite.
293fn desymbolize_mem(operand: &str) -> String {
294    desymbolize_bpf_text(operand, 0, None).unwrap_or_else(|| operand.to_string())
295}
296
/// Strip an optional trailing `:uNN` size suffix from a Move
/// operand.
///
/// Returns the operand without the suffix (trailing whitespace
/// removed) plus the parsed bit width, `Some(8 | 16 | 32 | 64)`.
/// When the text after the last `:u` is not exactly one of
/// `8`, `16`, `32`, `64`, the trimmed operand is returned whole
/// with `None`.
///
/// The width is matched literally rather than parsed as an
/// integer, so non-canonical spellings such as `:u+8` or `:u064`
/// are not mistaken for a size suffix.
fn split_size_suffix(s: &str) -> (&str, Option<u32>) {
    let s = s.trim();
    let Some((core, width)) = s.rsplit_once(":u") else {
        return (s, None);
    };
    let bits = match width {
        "8" => 8,
        "16" => 16,
        "32" => 32,
        "64" => 64,
        _ => return (s, None),
    };
    (core.trim_end(), Some(bits))
}
312
313/// Map a `:uNN` width to the BPF ldx/stx mnemonic suffix.
314fn size_suffix_for_bits(bits: u32) -> Result<&'static str, ArchError> {
315    match bits {
316        8 => Ok("b"),
317        16 => Ok("h"),
318        32 => Ok("w"),
319        64 => Ok("dw"),
320        _ => Err(ArchError::OutOfRange(format!(
321            "unsupported memory access width :u{bits}"
322        ))),
323    }
324}
325
/// Decide whether an immediate can only be loaded with LDDW
/// (as opposed to `mov64 reg, imm32`).
///
/// That is the case for an explicit `0x` hex literal, optionally
/// preceded by `-`, whose magnitude exceeds `u32::MAX`. Everything
/// else returns `false`: decimal text, malformed hex, hex too
/// large for `u64`, and small hex constants that fit `mov64`.
/// The other LDDW trigger, an explicit `:u64` suffix on the
/// source, is checked by the caller.
fn is_lddw_imm(s: &str) -> bool {
    let literal = s.trim();
    let magnitude = literal.strip_prefix('-').unwrap_or(literal);
    let Some(digits) = magnitude.strip_prefix("0x") else {
        return false;
    };
    // `from_str_radix` would tolerate a leading `+`; insist on pure hex digits.
    let well_formed = !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_hexdigit());
    well_formed
        && matches!(
            u64::from_str_radix(digits, 16),
            Ok(value) if value > u64::from(u32::MAX)
        )
}
343
/// Return `true` when `s` is the text of a BPF immediate
/// constant: decimal digits or a `0x`-prefixed hex literal, with
/// an optional leading minus sign and surrounding whitespace
/// ignored. Only the shape is checked, not the magnitude.
///
/// Used to classify the `src` operand in `encode_move` and
/// `encode_arith`.
fn is_bpf_imm(s: &str) -> bool {
    let text = s.trim();
    let unsigned = text.strip_prefix('-').unwrap_or(text);
    match unsigned.strip_prefix("0x") {
        Some(hex) => !hex.is_empty() && hex.bytes().all(|b| b.is_ascii_hexdigit()),
        None => !unsigned.is_empty() && unsigned.bytes().all(|b| b.is_ascii_digit()),
    }
}
355
356/// Register the BPF codec factory with [`ud_arch_codec::registry`].
357///
358/// Variant selection prefers numeric `e_machine` when both signals
359/// are present (more specific); falls back to the friendly `arch`
360/// string. EM_BPF (247) → Linux; EM_SBF (263) → sBPFv1 by default
361/// (sBPFv2 distinction requires e_flags inspection which the
362/// trait doesn't yet surface — out of scope for now).
363pub fn register() {
364    ud_arch_codec::register(factory);
365}
366
/// ELF `e_machine` value `EM_BPF` (Linux eBPF), per the ELF spec.
pub const EM_BPF: u64 = 247;
/// ELF `e_machine` value `EM_SBF`, Solana's ELF extension. Covers
/// both sBPFv1 and sBPFv2; telling them apart needs `e_flags`.
pub const EM_SBF: u64 = 263;
372
373fn factory(arch_name: Option<&str>, e_machine: Option<u64>) -> Option<Box<dyn ArchCodec>> {
374    if let Some(em) = e_machine {
375        match em {
376            EM_BPF => return Some(Box::new(BpfCodec(BpfVariant::Linux))),
377            EM_SBF => return Some(Box::new(BpfCodec(BpfVariant::Sbfv1))),
378            _ => {}
379        }
380    }
381    match arch_name {
382        Some("bpf") => Some(Box::new(BpfCodec(BpfVariant::Linux))),
383        Some("sbf" | "sbfv1") => Some(Box::new(BpfCodec(BpfVariant::Sbfv1))),
384        Some("sbfv2") => Some(Box::new(BpfCodec(BpfVariant::Sbfv2))),
385        _ => None,
386    }
387}