ud_ir/lib.rs
1//! Shared IR types for univdreams.
2//!
3//! Per the architecture sketch, the IR is **arch-tagged**: each
4//! architecture brings its own instruction vocabulary. This crate hosts
5//! only the concepts that genuinely span architectures — function and
6//! basic-block structure, control-flow terminators, and the
7//! [`ArchInsn`] trait that lets per-arch instruction types plug in.
8//!
9//! The byte-identity contract for the IR layer:
10//!
11//! > For any [`Function`] built from real bytes by an arch's lifter,
12//! > [`Function::emit_bytes`] returns exactly the input bytes.
13//!
14//! This is true by construction: [`emit_bytes`] concatenates each
15//! instruction's preserved original bytes in address order. The CFG
16//! structure is a *view* over a flat byte stream, not a transformation
17//! of it.
18//!
19//! [`emit_bytes`]: Function::emit_bytes
20
21#![allow(clippy::cast_possible_truncation)]
22
23pub mod ssa;
24
25use ud_core::VAddr;
26
27/// An architecture's per-instruction type, plugged into the shared IR.
28///
29/// The trait surface is deliberately tiny: an instruction must know its
30/// virtual address and the bytes it occupied in the source binary.
31/// Higher-level analyses query the arch-specific concrete type directly.
32pub trait ArchInsn {
33 /// Virtual address where this instruction lives.
34 fn addr(&self) -> VAddr;
35
36 /// Exact bytes the instruction occupied in the source. Used by
37 /// [`Function::emit_bytes`] to defend round-trip byte identity.
38 fn original_bytes(&self) -> &[u8];
39
40 /// Length of the encoded instruction in bytes. Default impl uses
41 /// `original_bytes().len()`; override only if your representation
42 /// can differ between the two (e.g. a synthetic IR insn that
43 /// doesn't have a byte form yet).
44 fn len_bytes(&self) -> usize {
45 self.original_bytes().len()
46 }
47}
48
49/// How control flow leaves a basic block.
50///
51/// Calls are deliberately *not* terminators: the standard CFG convention
52/// treats a `call` as a regular instruction whose semantics include
53/// transferring control elsewhere and eventually returning.
54#[derive(Debug, Clone, PartialEq, Eq)]
55pub enum Terminator {
56 /// Reached only at the very end of a function whose body simply
57 /// ran out of recorded bytes (e.g. `nop`s for alignment with no
58 /// branch back). Rare in real code but possible.
59 Fallthrough,
60
61 /// Unconditional direct branch (`jmp rel32`, etc.) to a known target.
62 UnconditionalBranch { target: VAddr },
63
64 /// Conditional direct branch (`jcc`); has both a taken target and a
65 /// fall-through edge to the next address.
66 ConditionalBranch { taken: VAddr, fallthrough: VAddr },
67
68 /// Return from function (`ret`, `iret`, etc.).
69 Return,
70
71 /// Indirect branch or call where the target is not a constant
72 /// (`jmp rax`, `jmp [rip+...]`, `call rax`). Static analysis can't
73 /// resolve these; downstream passes may attempt jump-table
74 /// recovery.
75 IndirectBranch,
76
77 /// `ud2`, `int3` reaching here as a terminator, or any other
78 /// instruction that surfaces as flow-control "exception" /
79 /// "interrupt" / "unreachable".
80 InvalidOrUnreachable,
81}
82
83/// A maximal straight-line sequence of instructions ending in a
84/// [`Terminator`].
85///
86/// "Maximal" in the usual sense: the only entry to the block is at
87/// `addr`, and every instruction except the last falls through to its
88/// successor.
89#[derive(Debug, Clone)]
90pub struct BasicBlock<I> {
91 pub addr: VAddr,
92 pub insns: Vec<I>,
93 pub terminator: Terminator,
94}
95
96impl<I: ArchInsn> BasicBlock<I> {
97 /// Sum of `original_bytes` lengths across the block.
98 #[must_use]
99 pub fn size(&self) -> usize {
100 self.insns.iter().map(ArchInsn::len_bytes).sum()
101 }
102
103 /// Address one past the last byte of the block.
104 #[must_use]
105 pub fn end_addr(&self) -> VAddr {
106 VAddr(self.addr.0 + self.size() as u64)
107 }
108}
109
110/// A function: a name, an entry address, and a sequence of basic blocks
111/// in original layout (address) order.
112///
113/// `blocks[0]` is by convention the entry block, since blocks are stored
114/// in address order and a function starts at its lowest address.
115#[derive(Debug, Clone)]
116pub struct Function<I> {
117 pub addr: VAddr,
118 pub name: String,
119 pub blocks: Vec<BasicBlock<I>>,
120}
121
122impl<I: ArchInsn> Function<I> {
123 /// Total size in bytes — sum across all blocks.
124 #[must_use]
125 pub fn size(&self) -> usize {
126 self.blocks.iter().map(BasicBlock::size).sum()
127 }
128
129 /// Concatenate every instruction's preserved bytes in address order.
130 /// For any function lifted from real bytes, this returns exactly the
131 /// input bytes.
132 #[must_use]
133 pub fn emit_bytes(&self) -> Vec<u8> {
134 let mut out = Vec::with_capacity(self.size());
135 for block in &self.blocks {
136 for insn in &block.insns {
137 out.extend_from_slice(insn.original_bytes());
138 }
139 }
140 out
141 }
142
143 /// Find the block whose address equals `addr`, if any.
144 #[must_use]
145 pub fn block_at(&self, addr: VAddr) -> Option<&BasicBlock<I>> {
146 self.blocks.iter().find(|b| b.addr == addr)
147 }
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Synthetic instruction type for testing the shared IR without
    /// pulling in an arch crate.
    #[derive(Debug, Clone)]
    struct DummyInsn {
        addr: u64,
        bytes: Vec<u8>,
    }

    impl ArchInsn for DummyInsn {
        fn addr(&self) -> VAddr {
            VAddr(self.addr)
        }
        fn original_bytes(&self) -> &[u8] {
            &self.bytes
        }
    }

    /// Builds a [`DummyInsn`] at `addr` that owns a copy of `bytes`.
    fn insn(addr: u64, bytes: &[u8]) -> DummyInsn {
        DummyInsn {
            addr,
            bytes: bytes.to_vec(),
        }
    }

    /// One block at `0x1000`: a two-byte instruction followed by `ret`.
    fn function_one_block() -> Function<DummyInsn> {
        Function {
            addr: VAddr(0x1000),
            name: "single".into(),
            blocks: vec![BasicBlock {
                addr: VAddr(0x1000),
                insns: vec![insn(0x1000, &[0x90, 0x90]), insn(0x1002, &[0xc3])],
                terminator: Terminator::Return,
            }],
        }
    }

    /// Two blocks: a short jump at `0x1000` and a `ret` at `0x1004`.
    fn function_two_blocks() -> Function<DummyInsn> {
        Function {
            addr: VAddr(0x1000),
            name: "two_blocks".into(),
            blocks: vec![
                BasicBlock {
                    addr: VAddr(0x1000),
                    insns: vec![insn(0x1000, &[0xeb, 0x02])],
                    terminator: Terminator::UnconditionalBranch {
                        target: VAddr(0x1004),
                    },
                },
                BasicBlock {
                    addr: VAddr(0x1004),
                    insns: vec![insn(0x1004, &[0xc3])],
                    terminator: Terminator::Return,
                },
            ],
        }
    }

    #[test]
    fn emit_bytes_concatenates_in_block_order() {
        let single = function_one_block();
        assert_eq!(single.emit_bytes(), vec![0x90, 0x90, 0xc3]);

        // More than one block is needed to actually observe block order.
        let double = function_two_blocks();
        assert_eq!(double.emit_bytes(), vec![0xeb, 0x02, 0xc3]);
    }

    #[test]
    fn function_size_sums_block_sizes() {
        assert_eq!(function_one_block().size(), 3);
        assert_eq!(function_two_blocks().size(), 3);
    }

    #[test]
    fn block_size_sums_instruction_lengths() {
        let f = function_two_blocks();
        assert_eq!(f.blocks[0].size(), 2);
        assert_eq!(f.blocks[1].size(), 1);
    }

    #[test]
    fn block_end_addr_is_addr_plus_size() {
        let f = function_one_block();
        assert_eq!(f.blocks[0].end_addr(), VAddr(0x1003));
    }

    #[test]
    fn block_at_finds_blocks_by_address() {
        let f = function_two_blocks();
        let hit = f.block_at(VAddr(0x1004)).expect("a block starts at 0x1004");
        assert_eq!(hit.addr, VAddr(0x1004));
        assert_eq!(hit.terminator, Terminator::Return);
        assert!(f.block_at(VAddr(0x1002)).is_none());
    }

    #[test]
    fn dummy_insn_reports_addr_and_default_len() {
        let i = insn(0x1000, &[0x0f, 0x0b]);
        assert_eq!(i.addr(), VAddr(0x1000));
        assert_eq!(i.len_bytes(), 2);
        assert_eq!(i.len_bytes(), i.original_bytes().len());
    }

    #[test]
    fn empty_function_has_no_bytes_or_blocks() {
        let f = Function::<DummyInsn> {
            addr: VAddr(0x1000),
            name: "empty".into(),
            blocks: Vec::new(),
        };
        assert_eq!(f.size(), 0);
        assert!(f.emit_bytes().is_empty());
        assert!(f.block_at(VAddr(0x1000)).is_none());
    }
}