Skip to main content

ud_ir/
lib.rs

1//! Shared IR types for univdreams.
2//!
3//! Per the architecture sketch, the IR is **arch-tagged**: each
4//! architecture brings its own instruction vocabulary. This crate hosts
5//! only the concepts that genuinely span architectures — function and
6//! basic-block structure, control-flow terminators, and the
7//! [`ArchInsn`] trait that lets per-arch instruction types plug in.
8//!
9//! The byte-identity contract for the IR layer:
10//!
11//! > For any [`Function`] built from real bytes by an arch's lifter,
12//! > [`Function::emit_bytes`] returns exactly the input bytes.
13//!
14//! This is true by construction: [`emit_bytes`] concatenates each
15//! instruction's preserved original bytes in address order. The CFG
16//! structure is a *view* over a flat byte stream, not a transformation
17//! of it.
18//!
19//! [`emit_bytes`]: Function::emit_bytes
20
21#![allow(clippy::cast_possible_truncation)]
22
23pub mod ssa;
24
25use ud_core::VAddr;
26
27/// An architecture's per-instruction type, plugged into the shared IR.
28///
29/// The trait surface is deliberately tiny: an instruction must know its
30/// virtual address and the bytes it occupied in the source binary.
31/// Higher-level analyses query the arch-specific concrete type directly.
32pub trait ArchInsn {
33    /// Virtual address where this instruction lives.
34    fn addr(&self) -> VAddr;
35
36    /// Exact bytes the instruction occupied in the source. Used by
37    /// [`Function::emit_bytes`] to defend round-trip byte identity.
38    fn original_bytes(&self) -> &[u8];
39
40    /// Length of the encoded instruction in bytes. Default impl uses
41    /// `original_bytes().len()`; override only if your representation
42    /// can differ between the two (e.g. a synthetic IR insn that
43    /// doesn't have a byte form yet).
44    fn len_bytes(&self) -> usize {
45        self.original_bytes().len()
46    }
47}
48
49/// How control flow leaves a basic block.
50///
51/// Calls are deliberately *not* terminators: the standard CFG convention
52/// treats a `call` as a regular instruction whose semantics include
53/// transferring control elsewhere and eventually returning.
54#[derive(Debug, Clone, PartialEq, Eq)]
55pub enum Terminator {
56    /// Reached only at the very end of a function whose body simply
57    /// ran out of recorded bytes (e.g. `nop`s for alignment with no
58    /// branch back). Rare in real code but possible.
59    Fallthrough,
60
61    /// Unconditional direct branch (`jmp rel32`, etc.) to a known target.
62    UnconditionalBranch { target: VAddr },
63
64    /// Conditional direct branch (`jcc`); has both a taken target and a
65    /// fall-through edge to the next address.
66    ConditionalBranch { taken: VAddr, fallthrough: VAddr },
67
68    /// Return from function (`ret`, `iret`, etc.).
69    Return,
70
71    /// Indirect branch or call where the target is not a constant
72    /// (`jmp rax`, `jmp [rip+...]`, `call rax`). Static analysis can't
73    /// resolve these; downstream passes may attempt jump-table
74    /// recovery.
75    IndirectBranch,
76
77    /// `ud2`, `int3` reaching here as a terminator, or any other
78    /// instruction that surfaces as flow-control "exception" /
79    /// "interrupt" / "unreachable".
80    InvalidOrUnreachable,
81}
82
83/// A maximal straight-line sequence of instructions ending in a
84/// [`Terminator`].
85///
86/// "Maximal" in the usual sense: the only entry to the block is at
87/// `addr`, and every instruction except the last falls through to its
88/// successor.
89#[derive(Debug, Clone)]
90pub struct BasicBlock<I> {
91    pub addr: VAddr,
92    pub insns: Vec<I>,
93    pub terminator: Terminator,
94}
95
96impl<I: ArchInsn> BasicBlock<I> {
97    /// Sum of `original_bytes` lengths across the block.
98    #[must_use]
99    pub fn size(&self) -> usize {
100        self.insns.iter().map(ArchInsn::len_bytes).sum()
101    }
102
103    /// Address one past the last byte of the block.
104    #[must_use]
105    pub fn end_addr(&self) -> VAddr {
106        VAddr(self.addr.0 + self.size() as u64)
107    }
108}
109
110/// A function: a name, an entry address, and a sequence of basic blocks
111/// in original layout (address) order.
112///
113/// `blocks[0]` is by convention the entry block, since blocks are stored
114/// in address order and a function starts at its lowest address.
115#[derive(Debug, Clone)]
116pub struct Function<I> {
117    pub addr: VAddr,
118    pub name: String,
119    pub blocks: Vec<BasicBlock<I>>,
120}
121
122impl<I: ArchInsn> Function<I> {
123    /// Total size in bytes — sum across all blocks.
124    #[must_use]
125    pub fn size(&self) -> usize {
126        self.blocks.iter().map(BasicBlock::size).sum()
127    }
128
129    /// Concatenate every instruction's preserved bytes in address order.
130    /// For any function lifted from real bytes, this returns exactly the
131    /// input bytes.
132    #[must_use]
133    pub fn emit_bytes(&self) -> Vec<u8> {
134        let mut out = Vec::with_capacity(self.size());
135        for block in &self.blocks {
136            for insn in &block.insns {
137                out.extend_from_slice(insn.original_bytes());
138            }
139        }
140        out
141    }
142
143    /// Find the block whose address equals `addr`, if any.
144    #[must_use]
145    pub fn block_at(&self, addr: VAddr) -> Option<&BasicBlock<I>> {
146        self.blocks.iter().find(|b| b.addr == addr)
147    }
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal stand-in instruction so the shared IR can be exercised
    /// without depending on any arch crate.
    #[derive(Debug, Clone)]
    struct DummyInsn {
        addr: u64,
        bytes: Vec<u8>,
    }

    impl ArchInsn for DummyInsn {
        fn addr(&self) -> VAddr {
            VAddr(self.addr)
        }

        fn original_bytes(&self) -> &[u8] {
            self.bytes.as_slice()
        }
    }

    /// Build a [`DummyInsn`] at `addr` holding a copy of `bytes`.
    fn insn(addr: u64, bytes: &[u8]) -> DummyInsn {
        let bytes = bytes.to_owned();
        DummyInsn { addr, bytes }
    }

    /// A function consisting of one block at `0x1000`: a two-byte
    /// instruction followed by a one-byte `0xc3`, terminated by `Return`.
    fn function_one_block() -> Function<DummyInsn> {
        let entry = BasicBlock {
            addr: VAddr(0x1000),
            insns: vec![insn(0x1000, &[0x90, 0x90]), insn(0x1002, &[0xc3])],
            terminator: Terminator::Return,
        };
        Function {
            addr: VAddr(0x1000),
            name: String::from("single"),
            blocks: vec![entry],
        }
    }

    #[test]
    fn emit_bytes_concatenates_in_block_order() {
        let emitted = function_one_block().emit_bytes();
        assert_eq!(emitted, vec![0x90, 0x90, 0xc3]);
    }

    #[test]
    fn function_size_sums_block_sizes() {
        let total = function_one_block().size();
        assert_eq!(total, 3);
    }

    #[test]
    fn block_end_addr_is_addr_plus_size() {
        let func = function_one_block();
        let entry = &func.blocks[0];
        assert_eq!(entry.end_addr(), VAddr(0x1003));
    }

    #[test]
    fn block_at_finds_blocks_by_address() {
        let head = BasicBlock {
            addr: VAddr(0x1000),
            insns: vec![insn(0x1000, &[0xeb, 0x02])],
            terminator: Terminator::UnconditionalBranch {
                target: VAddr(0x1004),
            },
        };
        let tail = BasicBlock {
            addr: VAddr(0x1004),
            insns: vec![insn(0x1004, &[0xc3])],
            terminator: Terminator::Return,
        };
        let func = Function::<DummyInsn> {
            addr: VAddr(0x1000),
            name: String::from("two_blocks"),
            blocks: vec![head, tail],
        };

        // A block start is found; an address between blocks is not.
        assert!(func.block_at(VAddr(0x1004)).is_some());
        assert!(func.block_at(VAddr(0x1002)).is_none());
    }
}
231}