// ud_ast/types.rs
//! AST types for `.ud`.

3/// A complete `.ud` file: a `@module { … }` header followed by zero
4/// or more top-level items.
5#[derive(Debug, Clone, PartialEq, Eq)]
6pub struct UdFile {
7 pub module: Module,
8 pub items: Vec<Item>,
9}
10
11/// The `@module { … }` block at the top of every file.
12///
13/// `fields` is an ordered list — order is significant for round-trip
14/// (the pretty-printer emits in this order).
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct Module {
17 pub fields: Vec<Field>,
18}
19
20/// One `name: value` entry inside a `@module` or nested block.
21#[derive(Debug, Clone, PartialEq, Eq)]
22pub struct Field {
23 pub name: String,
24 pub value: Value,
25}
26
27/// A value that can appear on the right-hand side of a `Field`.
28#[derive(Debug, Clone, PartialEq, Eq)]
29pub enum Value {
30 /// A double-quoted string. Storage is the unescaped form.
31 String(String),
32 /// An integer literal. Always emitted in hex with the `0x` prefix
33 /// for now (decimal also accepted on parse).
34 Int(u64),
35 /// A bracketed list of values: `[v1, v2, …]`.
36 List(Vec<Value>),
37 /// A nested block: `{ name: value, … }`.
38 Block(Vec<Field>),
39}
40
41/// A `#[key=value]` annotation. Attributes live on structural
42/// elements (functions, conditionals, …) and carry metadata that is
43/// either:
44///
45/// * **Informational**: hints for the reader / downstream tooling
46/// (`#[compiler="msvc15"]`, `#[abi="stdcall"]`) — they don't change
47/// the lower-path output.
48/// * **Load-bearing**: bytes / decisions that the lower path
49/// consumes (`#[head_bytes=[…]]` on a separated cmp/jcc `if`, so
50/// the cmp bytes land at the right offset relative to the
51/// intervening statements).
52///
53/// Round-trip rule: attributes round-trip verbatim. The `@module`
54/// header's optional `defaults: { … }` block can shadow any attribute
55/// here; the emitter omits an attribute when it equals the module
56/// default, the parser supplies the default when an attribute is
57/// missing.
58#[derive(Debug, Clone, PartialEq, Eq)]
59pub struct Attribute {
60 pub key: String,
61 pub value: AttrValue,
62}
63
/// Right-hand side of an attribute. Kept small on purpose — every
/// new variant has to round-trip through emit + parse + lower.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AttrValue {
    /// `"…"` — quoted string.
    String(String),
    /// `0x…` or decimal integer.
    Int(u64),
    /// `[0x01, 0x02, …]` — used by `head_bytes` and friends.
    ByteList(Vec<u8>),
    /// Bare flag, e.g. `#[naked]` — no `=value` part. Renders
    /// as just the key name; parses from any attribute that
    /// omits the `=` sign.
    Flag,
}

80/// An item in the file: at the top level, or nested inside an
81/// [`Item::Section`].
82#[derive(Debug, Clone, PartialEq, Eq)]
83pub enum Item {
84 /// Free-floating `// …` line. Preserved on emit so structural
85 /// notes survive parse → re-emit.
86 Comment(String),
87
88 /// A function declaration.
89 Function(FnDecl),
90
91 /// `@raw(0x…, [bytes])` — pin a slice of bytes at a virtual address.
92 /// Used by the decompiler to fill the gaps between functions
93 /// (alignment padding) and to capture the content of non-executable
94 /// sections (`.rodata`, `.data`, etc.).
95 Raw { addr: u64, bytes: Vec<u8> },
96
97 /// `@strings(0x…, ["a", "b", …])` — a packed null-terminated
98 /// string table. Lowers to each entry's UTF-8 bytes followed by
99 /// a single 0x00 terminator, in order. Used for ELF `SHT_STRTAB`
100 /// sections (`.dynstr`, `.strtab`, `.shstrtab`) and for any
101 /// well-known single-string sections like `.interp` (which is
102 /// emitted as a one-entry list).
103 Strings { addr: u64, strings: Vec<String> },
104
105 /// `@notes(0x…, [{ type: …, name: "…", desc: [bytes] }, …])` — an
106 /// ELF note section. Each entry has a 12-byte `Elf64_Nhdr`
107 /// header (name_size, desc_size, type), a name padded to a 4-byte
108 /// boundary, and a desc padded to a 4-byte boundary. Used for
109 /// `SHT_NOTE` sections (`.note.gnu.property`, `.note.ABI-tag`,
110 /// `.note.gnu.build-id`, …).
111 Notes { addr: u64, entries: Vec<NoteEntry> },
112
113 /// `@section("name", 0x…) { items… }` — group items under an ELF
114 /// section. The section's start address must equal the first
115 /// nested item's address; items are required to cover the section
116 /// contiguously (no gaps) for [`lower`](crate) to succeed.
117 Section {
118 name: String,
119 addr: u64,
120 items: Vec<Item>,
121 },
122
123 /// `@jump_table(0x…, dispatch="…") { case_0: label_<addr>; … }` —
124 /// a structured switch jump table. Each entry names a case index
125 /// and the address it dispatches to; the `dispatch` string tags
126 /// the encoding kind (e.g. `"gcc_pie_rel32"`, `"msvc_va32"`) so
127 /// lower knows whether to emit 4-byte signed offsets relative to
128 /// the table base, absolute 32-bit VAs, or some other layout.
129 /// Replaces the `@raw` byte run a jump table would otherwise
130 /// occupy in `.rodata`, recovering the symbolic intent of the
131 /// dispatch.
132 JumpTable {
133 addr: u64,
134 dispatch: String,
135 entries: Vec<JumpTableEntry>,
136 },
137}
138
/// One entry inside an [`Item::JumpTable`] block: a case index and
/// the address it dispatches to. The case ordering is the encoded
/// table order — entries lower in source-text order render at
/// `(addr + i * entry_size)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct JumpTableEntry {
    /// Case index — `0`, `1`, … for dense tables; sparse tables
    /// preserve gaps via case numbers that aren't strictly
    /// contiguous (rare in practice — most compilers normalise to
    /// a dense table with a `default` arm).
    pub case: u64,
    /// Target address the case dispatches to. Renders as
    /// `label_<addr:x>` in source text — the same label name a
    /// `Stmt::Goto` would produce.
    pub target: u64,
}

/// One entry inside an [`Item::Notes`] block. Mirrors the structure
/// of an ELF note (`Elf64_Nhdr` + name + desc, each padded to a
/// 4-byte boundary).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteEntry {
    /// Note type (`NT_GNU_PROPERTY_TYPE_0`, `NT_GNU_BUILD_ID`, …).
    pub note_type: u32,
    /// Owner string (`"GNU"` for GNU notes, etc.). Encoded with a
    /// trailing NUL byte then padded to a 4-byte boundary.
    pub name: String,
    /// Descriptor bytes — opaque payload, padded to a 4-byte
    /// boundary on emit.
    pub desc: Vec<u8>,
}

171/// A function declaration.
172///
173/// `signature` carries typed parameters and return type when known
174/// (e.g. recovered from DWARF). When absent, the function emits as
175/// `fn name() { … }` and behaves as untyped.
176#[derive(Debug, Clone, PartialEq, Eq)]
177pub struct FnDecl {
178 /// Optional `@addr(0x…)` directive preceding `fn`. Required for
179 /// functions whose name doesn't encode the address (i.e. anything
180 /// not matching `sub_<hex>`); the decompiler emits it always for
181 /// clarity.
182 pub addr: Option<u64>,
183 pub name: String,
184 /// `#[…]` attributes attached to the `fn` keyword. Carry per-
185 /// function profile info (`abi`, `cc`, `saves`, …); module-level
186 /// `defaults` in the `@module` header can shadow these.
187 pub attrs: Vec<Attribute>,
188 /// Typed parameters and return type, when known.
189 pub signature: Option<Signature>,
190 /// Variable / register declarations at the top of the function
191 /// body. Stack slots discovered from `[ebp±N]` accesses get a
192 /// `Stack` decl; registers the function touches get a `Register`
193 /// decl. Purely informational today (the prologue's pinned bytes
194 /// already encode the actual stack allocation); future work can
195 /// use the size hints to drive lowering of a re-allocated frame.
196 pub locals: Vec<LocalDecl>,
197 pub body: Vec<Stmt>,
198}
199
200/// One `let name: type;` entry at the head of a function body.
201///
202/// Kinds:
203/// * **Stack** — `let var_4: u32;` — backed by a stack slot at
204/// `[ebp-4]` (the name carries the offset as its hex suffix).
205/// * **Register** — `let eax: u32 @reg;` — backed by a CPU
206/// register; the name is the canonical x86 register mnemonic.
207///
208/// The type captures the largest access size seen at the slot /
209/// register in the function. Multiple-width accesses (`mov al,
210/// [ebp-1]; mov dword ptr [ebp-4], …`) pick the widest.
211#[derive(Debug, Clone, PartialEq, Eq)]
212pub struct LocalDecl {
213 pub name: String,
214 pub ty: Type,
215 pub kind: LocalKind,
216}
217
/// What backs a [`LocalDecl`]: a stack slot or a CPU register.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LocalKind {
    /// Backed by a stack slot (`let var_4: u32;`).
    Stack,
    /// Backed by a CPU register (`let eax: u32 @reg;`).
    Register,
}

224/// A function signature: parameter list + return type.
225#[derive(Debug, Clone, PartialEq, Eq)]
226pub struct Signature {
227 pub params: Vec<Param>,
228 pub return_type: Type,
229}
230
231/// One typed parameter in a function signature.
232///
233/// `location` carries the calling-convention slot the value is
234/// passed in — for the 6502 backend this is a register name like
235/// `"A"` / `"X"` / `"Y"`. When `Some`, the parameter renders as
236/// `name: ty @LOC`. When `None`, just `name: ty`.
237#[derive(Debug, Clone, PartialEq, Eq)]
238pub struct Param {
239 pub name: String,
240 pub ty: Type,
241 pub location: Option<String>,
242}
243
/// A type expressible in `.ud` source.
///
/// v0 covers C-like primitives plus single-level pointer wrapping.
/// Anything we can't recover (composite types, qualifiers, function
/// pointers) lands as [`Type::Unknown`], which the parser still
/// accepts so the round-trip closes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Type {
    Void,
    I8,
    I16,
    I32,
    I64,
    U8,
    U16,
    U32,
    U64,
    F32,
    F64,
    Bool,
    Char,
    /// `ptr<T>` — pointer to `T`.
    Pointer(Box<Type>),
    /// A type the source language can't yet express. Round-trips
    /// verbatim as the literal token `unknown`.
    Unknown,
}

/// Structured breakdown of a function prologue. Lets the source
/// language carry semantic information (which registers got
/// saved, whether a frame was set up, how much stack the function
/// reserves, whether CET protection is on) instead of an opaque
/// byte blob.
///
/// Used by the emitter to render
/// `@prologue(saves: [ebx, esi, edi], frame, sub: 0x40)` style
/// directives that drop the byte list because the parser can
/// regenerate identical bytes via the arch's prologue codec.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PrologueParams {
    /// Callee-saved registers pushed before the frame setup, in
    /// push order. Lowercase canonical names (`"ebx"`, `"esi"`,
    /// `"r12"`, …).
    pub saves: Vec<String>,
    /// Callee-saved registers pushed AFTER the frame setup
    /// (MSVC i386 idiom). Same naming.
    pub saves_after: Vec<String>,
    /// True when the prologue includes `push ebp; mov ebp, esp`
    /// (or the 64-bit variant).
    pub frame: bool,
    /// Stack reservation in bytes (`sub esp, IMM`). Zero when
    /// the function has no stack locals beyond saves.
    pub sub_esp: u32,
    /// True when the prologue starts with `endbr32` / `endbr64`
    /// (Intel CET indirect-branch landing pad).
    pub cf_protect: bool,
    /// Frame-setup encoding selector: `false` for the MSVC RM
    /// form (`mov ebp, esp` as `0x8b 0xec`), `true` for the GCC
    /// MR form (`0x89 0xe5`). Only meaningful when `frame` is
    /// true; the codec uses it to re-emit byte-identical
    /// instructions for either compiler.
    pub frame_alt: bool,
}

/// Structured breakdown of a function epilogue. Mirrors
/// [`PrologueParams`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EpilogueParams {
    /// Callee-saved registers popped, in pop order (typically
    /// the reverse of the prologue's push order).
    pub saves: Vec<String>,
    /// True when the epilogue uses `leave` (atomic
    /// `mov esp, ebp; pop ebp`).
    pub leave: bool,
    /// True when the epilogue pops the frame pointer with an
    /// explicit `pop ebp` (after the named saves).
    pub pop_frame: bool,
    /// Stack adjustment via `add esp, IMM` before `ret`. Zero
    /// when absent.
    pub add_esp: u32,
    /// Immediate operand of `ret` (callee-cleanup amount).
    /// Zero for cdecl.
    pub ret_imm: u16,
}

329/// A statement inside a function body.
330#[derive(Debug, Clone, PartialEq, Eq)]
331pub enum Stmt {
332 /// `@asm("text")` or `@asm("text", [bytes])` — an instruction.
333 ///
334 /// `text` is the human-readable assembly. `bytes` pins the exact
335 /// encoded bytes; when non-empty, it's the ground truth for
336 /// recompilation and the assembler's job is to verify that
337 /// assembling `text` produces matching bytes (with directive-pinned
338 /// encoding choices, when those land).
339 ///
340 /// `bytes` may be empty: a future assembler will then derive them
341 /// from the text alone. v0 always populates `bytes` because we
342 /// don't yet ship a text assembler that produces byte-identical
343 /// output for non-canonical encodings.
344 Asm { text: String, bytes: Vec<u8> },
345
346 /// `// …` line. Used by the decompiler to surface block boundaries
347 /// and direct-branch targets without committing to a structural
348 /// syntax for them yet.
349 Comment(String),
350
351 /// `@return(value, [bytes])` — a recognised return-with-literal
352 /// pattern at the tail of a function. Lifted from sequences like
353 /// `mov eax, N; [pop rbp;] ret` or `xor eax, eax; [pop rbp;] ret`.
354 /// `bytes` carries every encoded byte of those instructions
355 /// concatenated, so the lower path just emits the bytes.
356 Return { value: u64, bytes: Vec<u8> },
357
358 /// `@prologue("kind", [bytes])` — a recognised function prologue,
359 /// typically `endbr64; push rbp; mov rbp, rsp; sub rsp, IMM` or
360 /// a close variant. `kind` is a descriptive label
361 /// (`"std"` / `"std-no-cf"` / `"std-noframe"`); `bytes` carries
362 /// every encoded byte for round-trip.
363 ///
364 /// `params` carries the structured breakdown (saves list,
365 /// frame flag, sub_esp value, cf_protect) when the prologue's
366 /// bytes round-trip through the canonical codec. Lets the
367 /// emitter render `@prologue(saves: [ebx, esi, edi], frame,
368 /// sub: 0x40)` without the byte list. Empty for handwritten
369 /// or non-canonical prologues where bytes are the source of
370 /// truth.
371 Prologue {
372 kind: String,
373 params: Option<PrologueParams>,
374 bytes: Vec<u8>,
375 },
376
377 /// `@epilogue("kind", [bytes])` — a recognised function epilogue,
378 /// typically `leave; ret` or `pop rbp; ret`. Used at the tail of
379 /// the last block when no [`Stmt::Return`] consumed those bytes
380 /// (e.g. the return value was computed in an earlier block).
381 Epilogue {
382 kind: String,
383 params: Option<EpilogueParams>,
384 bytes: Vec<u8>,
385 },
386
387 /// `@save("REG", [bytes])` — a mid-function callee-saved register
388 /// save. Pairs LIFO with a matching [`Stmt::Restore`] elsewhere in
389 /// the body; together they bracket a region where the function
390 /// borrows an extra register the prologue didn't reserve. Bytes
391 /// are exactly the `push REG` encoding.
392 Save { reg: String, bytes: Vec<u8> },
393
394 /// `@restore("REG", [bytes])` — the matching restore for a prior
395 /// [`Stmt::Save`]. Bytes are exactly the `pop REG` encoding.
396 Restore { reg: String, bytes: Vec<u8> },
397
398 /// `@if_return("cond", "value", [bytes])` — an early-return
399 /// pattern: a `test/cmp + jcc` whose taken target is a
400 /// return-shaped block elsewhere in the function. The bytes
401 /// are the original cmp/test + jcc encoding; the actual return
402 /// happens at the target block (whose bytes remain in place).
403 /// Renders as `if (cond) return value;` to convey the intent
404 /// even though the jcc semantically transfers control to a
405 /// shared cleanup tail.
406 ///
407 /// `value` is the literal/expression the target block returns,
408 /// when statically known; empty when the target's return value
409 /// can't be folded.
410 ///
411 /// Same shape as `IfGoto`: the jcc tail re-encodes from
412 /// the target's *implicit* address (the return-block's
413 /// position, captured at decompile time via the cmp-bytes
414 /// length + jcc rel resolution). `cmp_bytes` stays pinned
415 /// until the text assembler.
416 IfReturn {
417 cond_text: String,
418 value_text: String,
419 target_addr: u64,
420 cmp_bytes: Vec<u8>,
421 cond_code: u8,
422 wide: bool,
423 },
424
425 /// `label_XXXX:` — a zero-byte marker for a jump target. The
426 /// `addr` is the run-time virtual address the label represents
427 /// (rendered as `label_<hex>`). Labels carry no bytes; they
428 /// occupy a position in the source so a [`Stmt::Goto`] or
429 /// [`Stmt::IfGoto`] elsewhere in the function can point at
430 /// them by name. Round-trip neutral.
431 Label { addr: u64 },
432
433 /// `goto label_XXXX;` (or `goto label_XXXX #[wide];`) — an
434 /// unconditional `jmp` to a label somewhere in the function
435 /// body. No pinned bytes: the lower path picks the encoding
436 /// from `target_addr`, the cursor position, and the `wide`
437 /// flag:
438 ///
439 /// * `wide=false` and the displacement fits in `i8`:
440 /// `jmp rel8` (2 bytes).
441 /// * otherwise: `jmp rel32` (5 bytes).
442 ///
443 /// The `wide` flag captures encoding choices the compiler
444 /// made that don't follow the "always shortest" rule —
445 /// occasional, but real (some MSVC paths emit `jmp rel32`
446 /// even when `jmp rel8` would fit). Editing the function so
447 /// a label moves auto-promotes `wide=false` → `wide=true`
448 /// when the displacement no longer fits in `i8`.
449 Goto { target_addr: u64, wide: bool },
450
451 /// `if (cond) goto label_XXXX;` — a conditional jump folded
452 /// from `cmp/test …; jcc …`. The jcc tail is no longer
453 /// pinned in source: the lower path re-encodes
454 /// `jcc rel8/rel32` from `target_addr`, `cond_code`, and
455 /// `wide`. `cmp_bytes` carries the cmp/test prefix (empty
456 /// when the source is a bare flag check); it stays pinned
457 /// until the text-assembler can re-encode it from
458 /// `cond_text`.
459 ///
460 /// Editing a label so its position changes flows through to
461 /// the rebuilt binary. Editing `cmp_bytes` and `cond_text`
462 /// without keeping them consistent is the user's job until
463 /// the assembler lands.
464 IfGoto {
465 cond_text: String,
466 target_addr: u64,
467 cmp_bytes: Vec<u8>,
468 cond_code: u8,
469 wide: bool,
470 },
471
472 /// `switch (selector) #[dispatch="…", table_va=…] { case N: goto … }`
473 /// — a structured switch whose dispatch bytes are *not* pinned
474 /// to the source. The lower path regenerates `cmp REG,MAX; ja
475 /// DEFAULT; jmp dword ptr [REG*4+TABLE_VA]` from the structured
476 /// fields, validating that the case/default/selector data
477 /// re-encodes to a correct dispatch sequence.
478 ///
479 /// `dispatch` names the encoding shape (currently only
480 /// `"msvc-jmp-table"` is recognised). `table_va` is the
481 /// absolute address of the jump-table data the indirect jmp
482 /// reads — the table contents themselves still ride in a
483 /// `@raw` block under the appropriate data section.
484 ///
485 /// Editing the source is the whole point: adding a case here,
486 /// changing `default_addr`, or renaming the selector all flow
487 /// through to the rebuilt binary via the lower-side encoder,
488 /// without any pinned bytes to silently invalidate.
489 Switch {
490 selector: String,
491 cases: Vec<u64>,
492 default_addr: u64,
493 dispatch: String,
494 table_va: u64,
495 },
496
497 /// `@seh_install([bytes])` — MSVC's Structured Exception
498 /// Handling frame install: `mov fs:[0], esp` after pushing
499 /// the handler-frame fields. Bytes are exactly the
500 /// `mov fs:[0], esp` encoding (7 bytes on x86-32).
501 SehInstall { bytes: Vec<u8> },
502
503 /// `@seh_restore([bytes])` — pops the SEH chain back to the
504 /// previously installed handler. Bytes encode
505 /// `mov reg, [ebp-N]; mov fs:[0], reg` (or similar pop
506 /// sequence). Pairs LIFO with a prior `Stmt::SehInstall`.
507 SehRestore { bytes: Vec<u8> },
508
509 /// `@return_expr("text", [bytes])` — a recognised
510 /// "compute-a-value-and-fall-through-to-the-epilogue" block whose
511 /// contents have been lifted into a single human-readable
512 /// expression. The expression text is informational; the pinned
513 /// bytes are the lower path's source of truth, so the original
514 /// instruction stream re-emits exactly even if the expression is
515 /// edited.
516 ReturnExpr { text: String, bytes: Vec<u8> },
517
518 /// `@arg_spill(N, [bytes])` — a recognised SysV-x64 argument
519 /// spill: `mov [rbp+disp], REG_N` where `REG_N` is the integer or
520 /// XMM register holding argument `N` at function entry. The slot
521 /// displacement is recoverable from the pinned bytes, so it
522 /// doesn't appear in the directive shape.
523 ArgSpill { arg_index: u32, bytes: Vec<u8> },
524
525 /// `@call("name", [args], [bytes])` — a recognised direct-call
526 /// site whose preceding `mov reg, …` / `lea reg, …` instructions
527 /// have been folded into the args list. Each arg is a
528 /// human-readable rendering (string literal, integer constant,
529 /// global address, `&function` reference, or `result` for a
530 /// previous call's return value); the pinned bytes cover both
531 /// `name(args)` — a function call (direct or indirect).
532 ///
533 /// `bytes` pins the arg-setup prefix (pushes, movs, etc.).
534 /// For **indirect** calls (`call dword ptr [imm]` etc.) the
535 /// call instruction itself rides at the end of `bytes`
536 /// because we don't yet re-encode arbitrary memory operands.
537 ///
538 /// For **direct** calls (`call rel32`) the trailing 5 bytes
539 /// are stripped from `bytes` and `direct_target` carries the
540 /// callee's IP. The lower path encodes `call rel32` against
541 /// the current cursor + `direct_target`, so editing a
542 /// function's position automatically re-resolves every
543 /// caller's relative offset.
544 Call {
545 name: String,
546 args: Vec<String>,
547 bytes: Vec<u8>,
548 direct_target: Option<u64>,
549 },
550
551 /// A structured `cmp/test + jcc` head plus its branches:
552 ///
553 /// ```text
554 /// @if_branch("cond text", [cond bytes]) {
555 /// @then { …fallthrough body… }
556 /// @else { …taken body… } // optional
557 /// }
558 /// ```
559 ///
560 /// `else_body == None` means the source-language `if` has no
561 /// `else` clause — the jcc-taken side jumps directly to whatever
562 /// code follows the `@if_branch` in source order. With `Some`,
563 /// both arms are real branches that converge somewhere later.
564 ///
565 /// Bytes layout, exactly preserved on lower (in source order):
566 ///
567 /// * `attrs["head_bytes"]` if present (the cmp/test bytes that
568 /// live *before* the intervening insns the compiler reordered
569 /// between the comparison and the conditional branch),
570 /// * `pre_body` statement bytes (the "intervening" insns
571 /// between cmp and jcc — empty for the adjacent-cmp case),
572 /// * `cond_bytes` (the jcc when there's `head_bytes`; the full
573 /// cmp+jcc when there isn't),
574 /// * `then_body` statement bytes,
575 /// * `else_body` statement bytes if present.
576 IfBranch {
577 cond_text: String,
578 cond_bytes: Vec<u8>,
579 /// Free-form metadata. Recognised keys today: `head_bytes`
580 /// (load-bearing — see byte layout above).
581 attrs: Vec<Attribute>,
582 /// Statements that fall between the cmp/test and the jcc in
583 /// the original instruction stream. Empty for adjacent cmp +
584 /// jcc (the common case) — the field exists for the
585 /// separated-by-flag-preserving-insns case.
586 pre_body: Vec<Stmt>,
587 then_body: Vec<Stmt>,
588 else_body: Option<Vec<Stmt>>,
589 },
590
591 /// `@local_set(slot, value, [bytes])` — a recognised
592 /// `mov dword/qword ptr [rbp+disp], IMM` (or analogous on i386
593 /// `[ebp+disp]`) where the destination is a stack-frame local.
594 /// Lifts the common "initialise a local with a literal" pattern.
595 /// `slot` is the signed displacement from the frame pointer
596 /// (e.g. `-8` for `[rbp-8]`); `value` is the immediate, signed.
597 LocalSet {
598 slot: i64,
599 value: i64,
600 bytes: Vec<u8>,
601 },
602
603 /// `@local_arith(slot, op, value, [bytes])` — a recognised
604 /// `add/sub dword/qword ptr [rbp+disp], IMM` pattern. Lifts
605 /// the loop-counter / accumulator-update idiom.
606 /// `op` is the arithmetic operation (`"+="` or `"-="`); `value`
607 /// is the immediate, signed.
608 LocalArith {
609 slot: i64,
610 op: String,
611 value: i64,
612 bytes: Vec<u8>,
613 },
614
615 /// `@local_compound(dst, op, src, [bytes])` — a multi-instruction
616 /// pattern of the shape `[rbp+dst] op= [rbp+src]`. Either:
617 ///
618 /// * 2-insn form: `mov reg, [rbp+src]; <op> [rbp+dst], reg`
619 /// for ops with a memory-destination form (add, sub, and, or, xor),
620 /// * 3-insn form: `mov reg, [rbp+dst]; <op> reg, [rbp+src];
621 /// mov [rbp+dst], reg` for ops without one (imul).
622 ///
623 /// The pinned `bytes` cover the whole sequence; the lower path
624 /// re-emits them verbatim.
625 LocalCompound {
626 dst: i64,
627 op: String,
628 src: i64,
629 bytes: Vec<u8>,
630 },
631
632 /// `@move("dst", "src", [bytes])` — an arch-agnostic
633 /// "dst := src" data move whose lowering is pinned by `bytes`.
634 /// The 6502 decompiler emits this for `LDA src; STA dst` pairs;
635 /// the `dst` and `src` strings are operand text from the
636 /// instruction stream (e.g. `"IN,Y"` and `"KBD"`).
637 ///
638 /// Round-trip: the source-language text is purely informational,
639 /// `bytes` is what the lower path emits.
640 Move {
641 dst: String,
642 src: String,
643 bytes: Vec<u8>,
644 },
645
646 /// `@inc16("lo", "hi", [bytes])` — a 16-bit increment composed
647 /// of `INC lo; BNE +2; INC hi` (with the `BNE` skipping the
648 /// high-byte INC unless the low byte just rolled over). The
649 /// canonical 6502 idiom for advancing a 16-bit pointer.
650 Inc16 {
651 lo: String,
652 hi: String,
653 bytes: Vec<u8>,
654 },
655
656 /// A structured loop with the test at the bottom. Canonical
657 /// gcc -O0 shape:
658 ///
659 /// ```text
660 /// @loop(entry_jmp=[bytes], "cond text", [tail bytes]) {
661 /// …body stmts…
662 /// }
663 /// ```
664 ///
665 /// Lifted from a CFG triple where:
666 ///
667 /// * a body block falls through to a tail block,
668 /// * the tail block ends with a conditional branch whose
669 /// `taken` target is the body block (i.e. a back-edge),
670 ///
671 /// `entry_jmp_bytes` is the pre-header `jmp` that enters the
672 /// loop at the tail (gcc's "skip body on first iteration" idiom).
673 /// When detected, those bytes are folded into the directive so
674 /// no `@asm` line is left behind for them.
675 ///
676 /// Lower-path byte order: `entry_jmp_bytes` (if any) → `body`
677 /// bytes → `tail_bytes`. The `@loop` itself contributes nothing
678 /// before `entry_jmp_bytes` — its placement in the function body
679 /// determines where the bytes land.
680 Loop {
681 cond_text: String,
682 entry_jmp_bytes: Option<Vec<u8>>,
683 tail_bytes: Vec<u8>,
684 body: Vec<Stmt>,
685 },
686
687 /// `if (cond_text) #[bytes=[…]] { … } [else { … }]` —
688 /// structured if/else recovered from a forward conditional
689 /// jump.
690 ///
691 /// `cond_bytes` carries the jcc instruction itself (8 bytes
692 /// on BPF, 2–6 on x86). The body is whatever fell through
693 /// the conditional; the `else` body is whatever sat past
694 /// the unconditional jump at the end of the `then` body
695 /// (when present).
696 ///
697 /// Lower order: `cond_bytes` → walk `then_body` Stmts → if
698 /// `else_body` is `Some`: `then_tail_jmp` (the unconditional
699 /// branch skipping the else) → walk `else_body` Stmts.
700 ///
701 /// Round-trip preservation: every encoded byte rides
702 /// somewhere in `cond_bytes` / `then_tail_jmp` / a nested
703 /// `@asm`. Editing `cond_text` is purely cosmetic until an
704 /// arch-side text re-encoder lands.
705 IfBlock {
706 cond_text: String,
707 cond_bytes: Vec<u8>,
708 then_body: Vec<Stmt>,
709 then_tail_jmp: Vec<u8>,
710 else_body: Vec<Stmt>,
711 },
712
713 /// `while (cond_text) #[bytes=[…]] { … }` — a top-checked
714 /// loop. `entry_bytes` pins the loop-header jcc that skips
715 /// the body when the condition is false on first entry;
716 /// `tail_bytes` pins the unconditional jump at the end of
717 /// the body that branches back to the header.
718 ///
719 /// Lower order: `entry_bytes` → walk `body` Stmts →
720 /// `tail_bytes`.
721 WhileBlock {
722 cond_text: String,
723 entry_bytes: Vec<u8>,
724 tail_bytes: Vec<u8>,
725 body: Vec<Stmt>,
726 },
727
728 /// `dst op src;` — a compound-assignment arithmetic stmt
729 /// where `dst` is a register name, `op` is a C-style
730 /// compound operator (`"+="`, `"-="`, `"*="`, `"/="`,
731 /// `"%="`, `"|="`, `"&="`, `"^="`, `"<<="`, `">>="`), and
732 /// `src` is a register or immediate text.
733 ///
734 /// Lifted from arch ALU instructions whose register-only
735 /// shape lets the codec round-trip via `encode_arith`.
736 /// On BPF that's the 64-bit ALU class (add64, lsh64,
737 /// or64, etc.); the framework lets other arches plug in
738 /// the same way.
739 ///
740 /// Round-trip: `bytes` rides pinned by default; the
741 /// decompile-side byte-drop clears `bytes` when
742 /// `arch.encode_arith(dst, op, src)` reproduces them, and
743 /// the lower path regenerates from the textual fields.
744 RegArith {
745 dst: String,
746 op: String,
747 src: String,
748 bytes: Vec<u8>,
749 },
750}
751
752impl Stmt {
753 /// Construct an [`Stmt::Asm`] with both text and pinned bytes.
754 #[must_use]
755 pub fn asm(text: impl Into<String>, bytes: Vec<u8>) -> Self {
756 Self::Asm {
757 text: text.into(),
758 bytes,
759 }
760 }
761
762 /// Construct an [`Stmt::Asm`] with text only (no bytes pinned).
763 /// Useful in tests; not used by the v0 decompiler.
764 #[must_use]
765 pub fn asm_text(text: impl Into<String>) -> Self {
766 Self::Asm {
767 text: text.into(),
768 bytes: Vec::new(),
769 }
770 }
771}