// ud_arch_codec/lib.rs

//! Arch-codec trait + open registry.
//!
//! The `univdreams` decompile/compile pipeline is arch-agnostic at
//! its boundaries — the lower path takes a parsed `.ud` source and
//! emits bytes; the decompile path takes a binary and emits `.ud`
//! source. Between those boundaries, every instruction-shaped
//! decision belongs to a specific architecture.
//!
//! This crate defines the shared shape: [`ArchCodec`] is the trait
//! every arch backend implements; [`registry`] is the open registry
//! consumers (CLI, wasm) populate at process start. The lower path
//! resolves a codec from the parsed `@module` block, then asks it to
//! encode each statement that carries semantic fields the codec can
//! re-emit (jumps, calls, moves, returns). Anything the codec
//! doesn't model returns [`ArchError::Unsupported`] and the pinned
//! `bytes` field on the statement is the fallback.
//!
//! ## Layering
//!
//! This crate intentionally has **no dependency on `ud-ast`** — it
//! takes raw `(arch, e_machine)` pairs at the registry boundary
//! and leaves the marshaling from a parsed `ud_ast::Module` to the
//! caller (`ud-translate`). That break is what keeps the dependency
//! graph acyclic: `ud-ast` depends on `ud-arch-x86` for emitter
//! helpers, and the arch crates depend on `ud-arch-codec`, so
//! `ud-arch-codec` cannot also depend on `ud-ast`.
//!
//! Prologue/epilogue parameters (which today live in `ud-ast`)
//! flow through the lower path as arch-specific types, not through
//! the trait. A follow-up commit will introduce a shared
//! representation here once we settle on a cross-arch shape.

33#![allow(clippy::module_name_repetitions)]
34
35pub mod registry;
36
37pub use registry::{for_arch, register, CodecFactory};
38
39/// Errors raised by [`ArchCodec`] implementations.
40///
41/// `Unsupported` is the soft-fail signal — it means "this arch
42/// doesn't model this operation, please fall back to pinned
43/// bytes." Other variants are hard failures the caller surfaces to
44/// the user.
45#[derive(Debug, thiserror::Error)]
46pub enum ArchError {
47    /// Returned when an arch is asked to encode something its
48    /// codec doesn't model. The caller (decompile-time byte-drop
49    /// pass, compile-time lower path) treats this as
50    /// "leave the pinned bytes alone."
51    #[error("arch {arch} does not support {operation}")]
52    Unsupported {
53        arch: &'static str,
54        operation: &'static str,
55    },
56
57    /// The text the codec was asked to assemble didn't parse.
58    #[error("assembly failed: {0}")]
59    Assemble(String),
60
61    /// An operand (typically a jump/call displacement) didn't fit
62    /// the arch's encoding range.
63    #[error("operand out of range: {0}")]
64    OutOfRange(String),
65
66    /// No registered codec factory claimed this arch.
67    #[error(
68        "no codec registered for arch = {arch:?}, e_machine = {e_machine:?}; \
69         did you call <arch_crate>::register() at startup?"
70    )]
71    UnknownArch {
72        arch: Option<String>,
73        e_machine: Option<u64>,
74    },
75
76    /// Catch-all for arch-specific encoder failures that don't fit
77    /// the structured variants. Use sparingly.
78    #[error("{0}")]
79    Other(String),
80}
81
/// Per-call encoding hints that arches interpret in their own
/// convention. Two hints exist today: `wide` (x86's short-vs-rel32
/// toggle) and `bpf_call_local` (which BPF call opcode to emit).
/// An arch ignores any hint that doesn't apply to it — fixed-width
/// arches (BPF, AArch64) ignore `wide`, and non-BPF arches ignore
/// `bpf_call_local`.
///
/// Kept as a plain struct rather than per-method args so the trait
/// can grow new hints without breaking every impl.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EncodeHints {
    /// Force a wide-form encoding. On x86 this means "use rel32
    /// even when rel8 would fit"; on BPF this is ignored (slot
    /// offsets are always one slot wide). `None` = arch picks.
    pub wide: Option<bool>,
    /// BPF call-convention hint for `encode_call`: `Some(true)`
    /// requests `call_local` (opcode 0x8d, Linux eBPF style),
    /// `Some(false)` requests `call_internal` (opcode 0x85
    /// src=1, Solana sBPF style), `None` defers to the codec's
    /// default. Lifters that have the original opcode (e.g.
    /// the byte-drop pass with pinned bytes) set this so the
    /// regen matches the original encoding exactly. Ignored by
    /// non-BPF arches.
    pub bpf_call_local: Option<bool>,
}

impl EncodeHints {
    /// Convenience: hints with `wide` set and every other hint left
    /// at `None`.
    #[must_use]
    pub const fn wide(wide: bool) -> Self {
        Self {
            wide: Some(wide),
            bpf_call_local: None,
        }
    }

    /// Resolve `wide` with a default for arches that need a bool:
    /// returns the hint when it is set, `default` otherwise.
    ///
    /// Most callers don't care about the default; BPF ignores wide
    /// entirely, x86 falls back to "pick shortest."
    #[must_use]
    pub const fn wide_or(self, default: bool) -> bool {
        // Written as a `match` rather than `Option::unwrap_or` so the
        // method can be `const` (matching `wide`) without depending on
        // that call being const-stable on the toolchain in use.
        match self.wide {
            Some(wide) => wide,
            None => default,
        }
    }
}

/// Structured switch-dispatch spec, passed to
/// [`ArchCodec::encode_switch_dispatch`]. Holds everything the x86
/// MSVC encoder needs; arches that don't model jump-table dispatch
/// return `Unsupported`.
///
/// The spec only borrows its string and slice data, so it is cheap
/// to copy; it is also `PartialEq + Eq` so two specs can be compared
/// directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SwitchSpec<'a> {
    /// Register name (e.g. `"ecx"`) holding the case selector.
    pub selector: &'a str,
    /// The case-target addresses, in case-index order.
    pub cases: &'a [u64],
    /// Target for out-of-range selectors.
    pub default_addr: u64,
    /// Dispatch shape identifier — `"msvc-jmp-table"` today.
    /// Implementations match on this and return Unsupported for
    /// shapes they don't recognise.
    pub dispatch: &'a str,
    /// Absolute virtual address where the jump-table data lives.
    pub table_va: u64,
    /// Absolute address of the dispatch's first instruction.
    pub cmp_ip: u64,
}

146/// The shared interface every arch backend implements.
147///
148/// Methods come in three classes:
149///
150/// * **Always-supported**: every arch must implement (`name`,
151///   `assemble_one`, `encode_jump`, `encode_call`,
152///   `encode_cond_jump`, the three size queries). Trait users can
153///   call these unconditionally.
154/// * **Optional with `Unsupported` default**: methods that not
155///   every arch needs (`encode_switch_dispatch`, `encode_move`,
156///   `encode_arith`, `encode_return`,
157///   `encode_cond_jump_with_code`). Default impl returns
158///   `ArchError::Unsupported`. The decompile-side byte-drop pass
159///   and compile-side lower path both treat `Unsupported` as
160///   "leave the pinned bytes alone."
161/// * **Optional with passthrough default**: `desymbolize`, which
162///   maps `label_<hex>` / `sub_<hex>` operands to numeric form.
163///   BPF overrides; default is identity.
164///
165/// Implementations must be `Sync + Send` so they can be stored
166/// behind a `Box<dyn ArchCodec>` and shared across threads.
167pub trait ArchCodec: Sync + Send + std::fmt::Debug {
168    /// Short stable identifier used in error messages and
169    /// `ArchError::Unsupported.arch`. Recommended forms:
170    /// `"x86-64"`, `"x86-32"`, `"bpf-linux"`, `"bpf-sbf-v1"`,
171    /// `"bpf-sbf-v2"`, `"aarch64"`, `"6502"`.
172    fn name(&self) -> &'static str;
173
174    // ---------------------------------------------------------------
175    // Assembly: text → bytes for a single instruction.
176    // ---------------------------------------------------------------
177
178    /// Assemble one instruction's text into bytes at `addr`.
179    ///
180    /// `addr` matters only for arches whose instructions encode
181    /// the IP or whose symbolic operands need cursor context. Pass
182    /// `0` when you don't have a real address (e.g. unit tests).
183    fn assemble_one(&self, text: &str, addr: u64) -> Result<Vec<u8>, ArchError>;
184
185    /// Resolve symbolic operands in `text` against `addr`. The
186    /// default is identity — arches with named-target operands
187    /// (BPF's `label_<hex>` / `sub_<hex>`) override to substitute
188    /// numeric forms the assembler accepts.
189    fn desymbolize(&self, text: &str, _addr: u64) -> String {
190        text.to_string()
191    }
192
193    // ---------------------------------------------------------------
194    // Control flow: jumps, calls, conditional branches.
195    // ---------------------------------------------------------------
196
197    /// Encode an unconditional jump from `source_ip` to `target`.
198    fn encode_jump(
199        &self,
200        source_ip: u64,
201        target: u64,
202        hints: EncodeHints,
203    ) -> Result<Vec<u8>, ArchError>;
204
205    /// Encode a direct call from `source_ip` to `target`.
206    fn encode_call(
207        &self,
208        source_ip: u64,
209        target: u64,
210        hints: EncodeHints,
211    ) -> Result<Vec<u8>, ArchError>;
212
213    /// Encode a conditional jump driven by a BPF-style text
214    /// condition.
215    ///
216    /// `cond_text` reads as "when this is true, the body runs"
217    /// (e.g. `"r0 != 0x0"`). The implementation typically inverts
218    /// internally to pick the underlying jcc that *skips* the
219    /// body. `target` is the address the jcc jumps to when the
220    /// condition is false (i.e. past the body).
221    ///
222    /// Used by `Stmt::IfBlock` / `Stmt::WhileBlock` regen. Arches
223    /// whose `If*` Stmts carry a numeric cond_code instead use
224    /// [`Self::encode_cond_jump_with_code`].
225    fn encode_cond_jump(
226        &self,
227        cond_text: &str,
228        source_ip: u64,
229        target: u64,
230        hints: EncodeHints,
231    ) -> Result<Vec<u8>, ArchError>;
232
233    /// Encode a conditional jump driven by an x86-style numeric
234    /// cond_code (the low nibble of the jcc opcode).
235    ///
236    /// Used by `Stmt::IfGoto` / `Stmt::IfReturn` regen. Default
237    /// returns `Unsupported`.
238    fn encode_cond_jump_with_code(
239        &self,
240        _cond_code: u8,
241        _source_ip: u64,
242        _target: u64,
243        _hints: EncodeHints,
244    ) -> Result<Vec<u8>, ArchError> {
245        Err(ArchError::Unsupported {
246            arch: self.name(),
247            operation: "cond_jump_with_code",
248        })
249    }
250
251    /// Encode a jump-table dispatch. Default `Unsupported`.
252    fn encode_switch_dispatch(&self, _spec: &SwitchSpec) -> Result<Vec<u8>, ArchError> {
253        Err(ArchError::Unsupported {
254            arch: self.name(),
255            operation: "switch_dispatch",
256        })
257    }
258
259    // ---------------------------------------------------------------
260    // Size queries: predict the encoded byte length without
261    // actually emitting bytes. Used by the lower path to compute
262    // downstream offsets before laying out the surrounding region.
263    // ---------------------------------------------------------------
264
265    /// Predicted size of `encode_jump`'s output.
266    fn encoded_jump_size(&self, source_ip: u64, target: u64, hints: EncodeHints) -> usize;
267    /// Predicted size of `encode_cond_jump` (text-driven).
268    fn encoded_cond_jump_size(&self, source_ip: u64, target: u64, hints: EncodeHints) -> usize;
269    /// Predicted size of `encode_call`'s output.
270    fn encoded_call_size(&self, source_ip: u64, target: u64, hints: EncodeHints) -> usize;
271
272    /// Whether a `Stmt::Call`'s pinned `bytes` already
273    /// contains the call instruction itself (return true), or
274    /// `bytes` is just the arg-setup prefix and `encode_call`
275    /// regenerates the trailing call (return false).
276    ///
277    /// - x86 strips the trailing 5 bytes of `call rel32` and
278    ///   regenerates them at lower time so an edit that moves
279    ///   the function auto-resolves the new rel32. Returns
280    ///   `false` (the default).
281    /// - BPF has no separate "prefix" — the call IS the
282    ///   single 8-byte instruction. Returns `true`.
283    ///
284    /// Used by the lower path's `Stmt::Call` arm to decide
285    /// whether to append `encode_call` output after the
286    /// pinned bytes.
287    fn direct_call_bytes_contain_call(&self) -> bool {
288        false
289    }
290
291    // ---------------------------------------------------------------
292    // Data movement (lifted forms — register/memory operands as
293    // text). The strings follow the arch's textual convention; the
294    // codec parses them and emits the corresponding instruction.
295    // ---------------------------------------------------------------
296
297    /// Encode `dst = src` as a single instruction. Default
298    /// `Unsupported`.
299    ///
300    /// Both `dst` and `src` follow the arch's text convention:
301    /// BPF accepts `"r6"`, `"0x5"`, `"[r5 - 0xff8]"`, etc.; x86
302    /// would accept `"rax"`, `"0x5"`, `"qword ptr [rbp-8]"`, etc.
303    /// Implementations return `Unsupported` for any shape they
304    /// don't model.
305    fn encode_move(&self, _dst: &str, _src: &str) -> Result<Vec<u8>, ArchError> {
306        Err(ArchError::Unsupported {
307            arch: self.name(),
308            operation: "move",
309        })
310    }
311
312    /// Encode `dst op= src` (e.g. `"r6", "+=", "r1"`). Default
313    /// `Unsupported`.
314    fn encode_arith(&self, _dst: &str, _op: &str, _src: &str) -> Result<Vec<u8>, ArchError> {
315        Err(ArchError::Unsupported {
316            arch: self.name(),
317            operation: "arith",
318        })
319    }
320
321    /// Encode a function return. `value` carries a known literal
322    /// (e.g. x86's `xor eax, eax; ret` collapses to "ret returning
323    /// 0"); arches that ignore it (BPF `exit` returns r0
324    /// implicitly) discard the field. Default `Unsupported`.
325    fn encode_return(&self, _value: Option<u64>) -> Result<Vec<u8>, ArchError> {
326        Err(ArchError::Unsupported {
327            arch: self.name(),
328            operation: "return",
329        })
330    }
331}