ud_arch_codec/lib.rs
1//! Arch-codec trait + open registry.
2//!
3//! The `univdreams` decompile/compile pipeline is arch-agnostic at
4//! its boundaries — the lower path takes a parsed `.ud` source and
5//! emits bytes; the decompile path takes a binary and emits `.ud`
6//! source. Between those boundaries, every instruction-shaped
7//! decision belongs to a specific architecture.
8//!
9//! This crate defines the shared shape: [`ArchCodec`] is the trait
10//! every arch backend implements; [`registry`] is the open registry
11//! consumers (CLI, wasm) populate at process start. The lower path
12//! resolves a codec from the parsed `@module` block, then asks it to
13//! encode each statement that carries semantic fields the codec can
14//! re-emit (jumps, calls, moves, returns). Anything the codec
15//! doesn't model returns [`ArchError::Unsupported`] and the pinned
16//! `bytes` field on the statement is the fallback.
17//!
18//! ## Layering
19//!
20//! This crate intentionally has **no dependency on `ud-ast`** — it
21//! takes raw `(arch, e_machine)` pairs at the registry boundary
22//! and leaves the marshaling from a parsed `ud_ast::Module` to the
23//! caller (`ud-translate`). That break is what keeps the dependency
24//! graph acyclic: `ud-ast` depends on `ud-arch-x86` for emitter
25//! helpers, and the arch crates depend on `ud-arch-codec`, so
26//! `ud-arch-codec` cannot also depend on `ud-ast`.
27//!
28//! Prologue/epilogue parameters (which today live in `ud-ast`)
29//! flow through the lower path as arch-specific types, not through
30//! the trait. A follow-up commit will introduce a shared
31//! representation here once we settle on a cross-arch shape.
32
33#![allow(clippy::module_name_repetitions)]
34
35pub mod registry;
36
37pub use registry::{for_arch, register, CodecFactory};
38
39/// Errors raised by [`ArchCodec`] implementations.
40///
41/// `Unsupported` is the soft-fail signal — it means "this arch
42/// doesn't model this operation, please fall back to pinned
43/// bytes." Other variants are hard failures the caller surfaces to
44/// the user.
45#[derive(Debug, thiserror::Error)]
46pub enum ArchError {
47 /// Returned when an arch is asked to encode something its
48 /// codec doesn't model. The caller (decompile-time byte-drop
49 /// pass, compile-time lower path) treats this as
50 /// "leave the pinned bytes alone."
51 #[error("arch {arch} does not support {operation}")]
52 Unsupported {
53 arch: &'static str,
54 operation: &'static str,
55 },
56
57 /// The text the codec was asked to assemble didn't parse.
58 #[error("assembly failed: {0}")]
59 Assemble(String),
60
61 /// An operand (typically a jump/call displacement) didn't fit
62 /// the arch's encoding range.
63 #[error("operand out of range: {0}")]
64 OutOfRange(String),
65
66 /// No registered codec factory claimed this arch.
67 #[error(
68 "no codec registered for arch = {arch:?}, e_machine = {e_machine:?}; \
69 did you call <arch_crate>::register() at startup?"
70 )]
71 UnknownArch {
72 arch: Option<String>,
73 e_machine: Option<u64>,
74 },
75
76 /// Catch-all for arch-specific encoder failures that don't fit
77 /// the structured variants. Use sparingly.
78 #[error("{0}")]
79 Other(String),
80}
81
/// Per-call encoding hints, each interpreted by an arch according to
/// its own convention.
///
/// Two hints exist today: `wide` (x86's short-vs-rel32 toggle, which
/// fixed-width arches such as BPF and AArch64 ignore) and
/// `bpf_call_local` (BPF call-opcode selection, which non-BPF arches
/// ignore).
///
/// The hints travel as one plain struct rather than as separate
/// method arguments so the trait can gain new hints without breaking
/// every implementation.
///
/// The type is a small `Copy` value and derives `PartialEq`, `Eq` and
/// `Hash` so callers can compare hint sets directly (for example with
/// `assert_eq!`) or use them as map keys.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct EncodeHints {
    /// Force a wide-form encoding. On x86 this means "use rel32 even
    /// when rel8 would fit"; BPF ignores it because slot offsets are
    /// always one slot wide. `None` lets the arch choose.
    pub wide: Option<bool>,
    /// BPF call-convention hint for `encode_call`:
    ///
    /// * `Some(true)` requests `call_local` (opcode 0x8d, Linux eBPF
    ///   style);
    /// * `Some(false)` requests `call_internal` (opcode 0x85 src=1,
    ///   Solana sBPF style);
    /// * `None` defers to the codec's default.
    ///
    /// Lifters that know the original opcode (e.g. the byte-drop pass
    /// working from pinned bytes) set this so the regenerated bytes
    /// match the original encoding exactly. Non-BPF arches ignore it.
    pub bpf_call_local: Option<bool>,
}
104
105impl EncodeHints {
106 /// Convenience: hints with `wide` set.
107 #[must_use]
108 pub const fn wide(wide: bool) -> Self {
109 Self {
110 wide: Some(wide),
111 bpf_call_local: None,
112 }
113 }
114
115 /// Resolve `wide` with a default for arches that need a bool.
116 /// Most callers don't care about the default; BPF ignores wide
117 /// entirely, x86 falls back to "pick shortest."
118 #[must_use]
119 pub fn wide_or(self, default: bool) -> bool {
120 self.wide.unwrap_or(default)
121 }
122}
123
/// Structured description of a switch dispatch, handed to
/// [`ArchCodec::encode_switch_dispatch`].
///
/// It carries everything the x86 MSVC encoder needs. Arches that do
/// not model jump-table dispatch return `Unsupported` instead of
/// consuming it.
///
/// The spec only borrows its string and slice data, so it is cheap to
/// copy. It derives `PartialEq`, `Eq` and `Hash` so two specs can be
/// compared directly (for example with `assert_eq!`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SwitchSpec<'a> {
    /// Name of the register holding the case selector (e.g. `"ecx"`).
    pub selector: &'a str,
    /// Case-target addresses, ordered by case index.
    pub cases: &'a [u64],
    /// Address taken when the selector is out of range.
    pub default_addr: u64,
    /// Identifier of the dispatch shape — `"msvc-jmp-table"` today.
    /// Implementations match on it and return `Unsupported` for any
    /// shape they do not recognise.
    pub dispatch: &'a str,
    /// Absolute virtual address of the jump-table data.
    pub table_va: u64,
    /// Absolute address of the first instruction of the dispatch.
    pub cmp_ip: u64,
}
145
146/// The shared interface every arch backend implements.
147///
148/// Methods come in three classes:
149///
150/// * **Always-supported**: every arch must implement (`name`,
151/// `assemble_one`, `encode_jump`, `encode_call`,
152/// `encode_cond_jump`, the three size queries). Trait users can
153/// call these unconditionally.
154/// * **Optional with `Unsupported` default**: methods that not
155/// every arch needs (`encode_switch_dispatch`, `encode_move`,
156/// `encode_arith`, `encode_return`,
157/// `encode_cond_jump_with_code`). Default impl returns
158/// `ArchError::Unsupported`. The decompile-side byte-drop pass
159/// and compile-side lower path both treat `Unsupported` as
160/// "leave the pinned bytes alone."
161/// * **Optional with passthrough default**: `desymbolize`, which
162/// maps `label_<hex>` / `sub_<hex>` operands to numeric form.
163/// BPF overrides; default is identity.
164///
165/// Implementations must be `Sync + Send` so they can be stored
166/// behind a `Box<dyn ArchCodec>` and shared across threads.
167pub trait ArchCodec: Sync + Send + std::fmt::Debug {
168 /// Short stable identifier used in error messages and
169 /// `ArchError::Unsupported.arch`. Recommended forms:
170 /// `"x86-64"`, `"x86-32"`, `"bpf-linux"`, `"bpf-sbf-v1"`,
171 /// `"bpf-sbf-v2"`, `"aarch64"`, `"6502"`.
172 fn name(&self) -> &'static str;
173
174 // ---------------------------------------------------------------
175 // Assembly: text → bytes for a single instruction.
176 // ---------------------------------------------------------------
177
178 /// Assemble one instruction's text into bytes at `addr`.
179 ///
180 /// `addr` matters only for arches whose instructions encode
181 /// the IP or whose symbolic operands need cursor context. Pass
182 /// `0` when you don't have a real address (e.g. unit tests).
183 fn assemble_one(&self, text: &str, addr: u64) -> Result<Vec<u8>, ArchError>;
184
185 /// Resolve symbolic operands in `text` against `addr`. The
186 /// default is identity — arches with named-target operands
187 /// (BPF's `label_<hex>` / `sub_<hex>`) override to substitute
188 /// numeric forms the assembler accepts.
189 fn desymbolize(&self, text: &str, _addr: u64) -> String {
190 text.to_string()
191 }
192
193 // ---------------------------------------------------------------
194 // Control flow: jumps, calls, conditional branches.
195 // ---------------------------------------------------------------
196
197 /// Encode an unconditional jump from `source_ip` to `target`.
198 fn encode_jump(
199 &self,
200 source_ip: u64,
201 target: u64,
202 hints: EncodeHints,
203 ) -> Result<Vec<u8>, ArchError>;
204
205 /// Encode a direct call from `source_ip` to `target`.
206 fn encode_call(
207 &self,
208 source_ip: u64,
209 target: u64,
210 hints: EncodeHints,
211 ) -> Result<Vec<u8>, ArchError>;
212
213 /// Encode a conditional jump driven by a BPF-style text
214 /// condition.
215 ///
216 /// `cond_text` reads as "when this is true, the body runs"
217 /// (e.g. `"r0 != 0x0"`). The implementation typically inverts
218 /// internally to pick the underlying jcc that *skips* the
219 /// body. `target` is the address the jcc jumps to when the
220 /// condition is false (i.e. past the body).
221 ///
222 /// Used by `Stmt::IfBlock` / `Stmt::WhileBlock` regen. Arches
223 /// whose `If*` Stmts carry a numeric cond_code instead use
224 /// [`Self::encode_cond_jump_with_code`].
225 fn encode_cond_jump(
226 &self,
227 cond_text: &str,
228 source_ip: u64,
229 target: u64,
230 hints: EncodeHints,
231 ) -> Result<Vec<u8>, ArchError>;
232
233 /// Encode a conditional jump driven by an x86-style numeric
234 /// cond_code (the low nibble of the jcc opcode).
235 ///
236 /// Used by `Stmt::IfGoto` / `Stmt::IfReturn` regen. Default
237 /// returns `Unsupported`.
238 fn encode_cond_jump_with_code(
239 &self,
240 _cond_code: u8,
241 _source_ip: u64,
242 _target: u64,
243 _hints: EncodeHints,
244 ) -> Result<Vec<u8>, ArchError> {
245 Err(ArchError::Unsupported {
246 arch: self.name(),
247 operation: "cond_jump_with_code",
248 })
249 }
250
251 /// Encode a jump-table dispatch. Default `Unsupported`.
252 fn encode_switch_dispatch(&self, _spec: &SwitchSpec) -> Result<Vec<u8>, ArchError> {
253 Err(ArchError::Unsupported {
254 arch: self.name(),
255 operation: "switch_dispatch",
256 })
257 }
258
259 // ---------------------------------------------------------------
260 // Size queries: predict the encoded byte length without
261 // actually emitting bytes. Used by the lower path to compute
262 // downstream offsets before laying out the surrounding region.
263 // ---------------------------------------------------------------
264
265 /// Predicted size of `encode_jump`'s output.
266 fn encoded_jump_size(&self, source_ip: u64, target: u64, hints: EncodeHints) -> usize;
267 /// Predicted size of `encode_cond_jump` (text-driven).
268 fn encoded_cond_jump_size(&self, source_ip: u64, target: u64, hints: EncodeHints) -> usize;
269 /// Predicted size of `encode_call`'s output.
270 fn encoded_call_size(&self, source_ip: u64, target: u64, hints: EncodeHints) -> usize;
271
272 /// Whether a `Stmt::Call`'s pinned `bytes` already
273 /// contains the call instruction itself (return true), or
274 /// `bytes` is just the arg-setup prefix and `encode_call`
275 /// regenerates the trailing call (return false).
276 ///
277 /// - x86 strips the trailing 5 bytes of `call rel32` and
278 /// regenerates them at lower time so an edit that moves
279 /// the function auto-resolves the new rel32. Returns
280 /// `false` (the default).
281 /// - BPF has no separate "prefix" — the call IS the
282 /// single 8-byte instruction. Returns `true`.
283 ///
284 /// Used by the lower path's `Stmt::Call` arm to decide
285 /// whether to append `encode_call` output after the
286 /// pinned bytes.
287 fn direct_call_bytes_contain_call(&self) -> bool {
288 false
289 }
290
291 // ---------------------------------------------------------------
292 // Data movement (lifted forms — register/memory operands as
293 // text). The strings follow the arch's textual convention; the
294 // codec parses them and emits the corresponding instruction.
295 // ---------------------------------------------------------------
296
297 /// Encode `dst = src` as a single instruction. Default
298 /// `Unsupported`.
299 ///
300 /// Both `dst` and `src` follow the arch's text convention:
301 /// BPF accepts `"r6"`, `"0x5"`, `"[r5 - 0xff8]"`, etc.; x86
302 /// would accept `"rax"`, `"0x5"`, `"qword ptr [rbp-8]"`, etc.
303 /// Implementations return `Unsupported` for any shape they
304 /// don't model.
305 fn encode_move(&self, _dst: &str, _src: &str) -> Result<Vec<u8>, ArchError> {
306 Err(ArchError::Unsupported {
307 arch: self.name(),
308 operation: "move",
309 })
310 }
311
312 /// Encode `dst op= src` (e.g. `"r6", "+=", "r1"`). Default
313 /// `Unsupported`.
314 fn encode_arith(&self, _dst: &str, _op: &str, _src: &str) -> Result<Vec<u8>, ArchError> {
315 Err(ArchError::Unsupported {
316 arch: self.name(),
317 operation: "arith",
318 })
319 }
320
321 /// Encode a function return. `value` carries a known literal
322 /// (e.g. x86's `xor eax, eax; ret` collapses to "ret returning
323 /// 0"); arches that ignore it (BPF `exit` returns r0
324 /// implicitly) discard the field. Default `Unsupported`.
325 fn encode_return(&self, _value: Option<u64>) -> Result<Vec<u8>, ArchError> {
326 Err(ArchError::Unsupported {
327 arch: self.name(),
328 operation: "return",
329 })
330 }
331}