edb_engine/utils/
disasm.rs

1// EDB - Ethereum Debugger
2// Copyright (C) 2024 Zhuo Zhang and Wuqi Zhang
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU Affero General Public License for more details.
13//
14// You should have received a copy of the GNU Affero General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16
//! EVM bytecode disassembly utilities
//!
//! This module disassembles EVM bytecode into a structured representation that
//! records each opcode together with its associated data (in particular, the
//! immediate values that follow PUSHX instructions).
//!
//! The disassembly process handles:
//! - All standard EVM opcodes
//! - PUSH instructions with their immediate values (PUSH1 through PUSH32)
//! - Proper instruction boundary detection
//! - Identification of invalid opcodes
28
29use alloy_primitives::{Bytes, U256};
30use revm::bytecode::opcode::OpCode;
31
32/// A single disassembled instruction with its associated data
33#[derive(Debug, Clone, PartialEq, Eq)]
34pub struct DisassemblyInstruction {
35    /// Program counter offset where this instruction starts
36    pub pc: usize,
37    /// The opcode for this instruction
38    pub opcode: OpCode,
39    /// For PUSHX instructions, this contains the immediate value bytes
40    /// For other instructions, this is empty
41    pub push_data: Vec<u8>,
42}
43
44impl DisassemblyInstruction {
45    /// Create a new instruction without push data
46    pub fn new(pc: usize, opcode: OpCode) -> Self {
47        Self { pc, opcode, push_data: Vec::new() }
48    }
49
50    /// Create a new instruction with push data
51    pub fn with_push_data(pc: usize, opcode: OpCode, push_data: Vec<u8>) -> Self {
52        Self { pc, opcode, push_data }
53    }
54
55    /// Check if this instruction is a PUSH instruction
56    pub fn is_push(&self) -> bool {
57        let opcode_byte = self.opcode.get();
58        (0x60..=0x7F).contains(&opcode_byte)
59    }
60
61    /// Get the size of the immediate data for this instruction
62    /// Returns 0 for non-PUSH instructions
63    pub fn push_size(&self) -> usize {
64        if self.is_push() {
65            (self.opcode.get() - 0x60 + 1) as usize
66        } else {
67            0
68        }
69    }
70
71    /// Get the total instruction size (opcode + immediate data)
72    pub fn instruction_size(&self) -> usize {
73        1 + self.push_size()
74    }
75}
76
77/// Complete disassembly result for a piece of bytecode
78#[derive(Debug, Clone)]
79pub struct DisassemblyResult {
80    /// Original bytecode that was disassembled
81    pub bytecode: Bytes,
82    /// List of disassembled instructions in order
83    pub instructions: Vec<DisassemblyInstruction>,
84}
85
86impl DisassemblyResult {
87    /// Create a new disassembly result
88    pub fn new(bytecode: Bytes, instructions: Vec<DisassemblyInstruction>) -> Self {
89        Self { bytecode, instructions }
90    }
91
92    /// Get the total number of instructions
93    pub fn instruction_count(&self) -> usize {
94        self.instructions.len()
95    }
96
97    /// Get instruction at a specific program counter offset
98    pub fn get_instruction_at_pc(&self, pc: usize) -> Option<&DisassemblyInstruction> {
99        self.instructions.iter().find(|inst| inst.pc == pc)
100    }
101
102    /// Get all PUSH instructions
103    pub fn get_push_instructions(&self) -> Vec<&DisassemblyInstruction> {
104        self.instructions.iter().filter(|inst| inst.is_push()).collect()
105    }
106
107    /// Find the instruction that contains a given PC offset
108    /// This is useful when the PC might point into the middle of a PUSH instruction's data
109    pub fn find_instruction_containing_pc(&self, pc: usize) -> Option<&DisassemblyInstruction> {
110        self.instructions.iter().find(|inst| {
111            let start = inst.pc;
112            let end = start + inst.instruction_size();
113            pc >= start && pc < end
114        })
115    }
116}
117
118/// Disassemble EVM bytecode into a structured representation
119///
120/// This function parses the bytecode and extracts all opcodes along with their
121/// associated immediate data (for PUSH instructions). It properly handles
122/// instruction boundaries and invalid opcodes.
123///
124/// # Arguments
125/// * `bytecode` - The bytecode to disassemble
126///
127/// # Returns
128/// A `DisassemblyResult` containing the list of instructions
129///
130/// # Examples
131/// ```rust
132/// use alloy_primitives::Bytes;
133/// use edb_engine::utils::disasm::disassemble;
134///
135/// let bytecode = Bytes::from(vec![0x60, 0x42, 0x80]); // PUSH1 0x42, DUP1
136/// let result = disassemble(&bytecode);
137/// assert_eq!(result.instructions.len(), 2);
138/// assert!(result.instructions[0].is_push());
139/// assert_eq!(result.instructions[0].push_data, vec![0x42]);
140/// ```
141pub fn disassemble(bytecode: &Bytes) -> DisassemblyResult {
142    let mut instructions = Vec::new();
143    let mut pc = 0;
144
145    while pc < bytecode.len() {
146        let opcode_byte = bytecode[pc];
147
148        // Create the opcode - use new_unchecked for performance since we handle invalid opcodes below
149        let opcode = unsafe { OpCode::new_unchecked(opcode_byte) };
150
151        // Handle PUSH instructions (PUSH1 through PUSH32)
152        if (0x60..=0x7F).contains(&opcode_byte) {
153            // PUSHX instruction
154            let push_size = (opcode_byte - 0x60 + 1) as usize;
155            let data_start = pc + 1;
156            let data_end = data_start + push_size;
157
158            // Extract push data, padding with zeros if bytecode is truncated
159            let mut push_data = Vec::new();
160            for i in data_start..data_end {
161                if i < bytecode.len() {
162                    push_data.push(bytecode[i]);
163                } else {
164                    push_data.push(0); // Pad with zeros for truncated bytecode
165                }
166            }
167
168            instructions.push(DisassemblyInstruction::with_push_data(pc, opcode, push_data));
169            pc = data_end;
170        } else {
171            // Regular instruction without immediate data
172            instructions.push(DisassemblyInstruction::new(pc, opcode));
173            pc += 1;
174        }
175    }
176
177    DisassemblyResult::new(bytecode.clone(), instructions)
178}
179
180/// Extract immediate value from a PUSH instruction as a big-endian integer
181///
182/// This helper function converts the bytes from a PUSH instruction into a big-endian
183/// integer representation. Useful for analyzing PUSH values numerically.
184///
185/// # Arguments
186/// * `instruction` - The PUSH instruction to extract value from
187///
188/// # Returns
189/// The immediate value as a U256, or None if not a PUSH instruction
190///
191/// # Examples
192/// ```rust
193/// use edb_engine::utils::disasm::{DisassemblyInstruction, extract_push_value};
194/// use revm::bytecode::opcode::OpCode;
195///
196/// let push_inst = DisassemblyInstruction::with_push_data(
197///     0,
198///     unsafe { OpCode::new_unchecked(0x60) }, // PUSH1
199///     vec![0x42]
200/// );
201/// assert_eq!(extract_push_value(&push_inst), 0x42);
202/// ```
203pub fn extract_push_value(instruction: &DisassemblyInstruction) -> Option<U256> {
204    if !instruction.is_push() || instruction.push_data.is_empty() {
205        return None;
206    }
207
208    let mut value = U256::ZERO;
209    for &byte in &instruction.push_data {
210        value = value.wrapping_shl(8).wrapping_add(U256::from(byte));
211    }
212    Some(value)
213}
214
215/// Format a disassembly instruction as a human-readable string
216///
217/// This function creates a formatted string representation of an instruction,
218/// similar to what you'd see in a standard disassembler.
219///
220/// # Arguments
221/// * `instruction` - The instruction to format
222/// * `show_pc` - Whether to include the program counter in the output
223///
224/// # Returns
225/// A formatted string representation of the instruction
226///
227/// # Examples
228/// ```rust
229/// use edb_engine::utils::disasm::{DisassemblyInstruction, format_instruction};
230/// use revm::bytecode::opcode::OpCode;
231///
232/// let push_inst = DisassemblyInstruction::with_push_data(
233///     10,
234///     unsafe { OpCode::new_unchecked(0x61) }, // PUSH2
235///     vec![0x12, 0x34]
236/// );
237/// let formatted = format_instruction(&push_inst, true);
238/// assert!(formatted.contains("PUSH2"));
239/// assert!(formatted.contains("0x1234"));
240/// ```
241pub fn format_instruction(instruction: &DisassemblyInstruction, show_pc: bool) -> String {
242    let pc_part = if show_pc { format!("{:04x}: ", instruction.pc) } else { String::new() };
243
244    let opcode_name = if instruction.opcode.is_valid() {
245        instruction.opcode.as_str().to_string()
246    } else {
247        format!("'{:x}'(Unknown Opcode)", instruction.opcode.get())
248    };
249
250    if instruction.is_push() && !instruction.push_data.is_empty() {
251        let hex_data = instruction.push_data.iter().map(|b| format!("{b:02x}")).collect::<String>();
252        format!("{pc_part}{opcode_name} 0x{hex_data}")
253    } else {
254        format!("{pc_part}{opcode_name}")
255    }
256}
257
#[cfg(test)]
mod tests {
    use super::*;
    use alloy_primitives::Bytes;

    /// Wrap a raw byte in an `OpCode` for test fixtures.
    fn op(byte: u8) -> OpCode {
        // The tests only pass 0x60 (PUSH1), 0x61 (PUSH2), 0x63 (PUSH4) and 0x80 (DUP1).
        unsafe { OpCode::new_unchecked(byte) }
    }

    /// PUSH1 0x42 (PC 0-1), PUSH2 0x1234 (PC 2-4), DUP1 (PC 5).
    fn sample_program() -> Bytes {
        Bytes::from(vec![0x60, 0x42, 0x61, 0x12, 0x34, 0x80])
    }

    #[test]
    fn test_disassemble_simple() {
        // DUP1, DUP2, DUP3: three single-byte instructions.
        let result = disassemble(&Bytes::from(vec![0x80, 0x81, 0x82]));

        let starts: Vec<usize> = result.instructions.iter().map(|inst| inst.pc).collect();
        assert_eq!(starts, vec![0, 1, 2]);

        assert!(result.instructions.iter().all(|inst| !inst.is_push()));
        assert!(result.instructions.iter().all(|inst| inst.push_data.is_empty()));
    }

    #[test]
    fn test_disassemble_push_instructions() {
        let result = disassemble(&sample_program());

        // (pc, is_push, push_data, instruction_size) for PUSH1, PUSH2, DUP1.
        let expected: Vec<(usize, bool, Vec<u8>, usize)> = vec![
            (0, true, vec![0x42], 2),
            (2, true, vec![0x12, 0x34], 3),
            (5, false, vec![], 1),
        ];

        assert_eq!(result.instructions.len(), expected.len());
        for (inst, (pc, is_push, data, size)) in result.instructions.iter().zip(&expected) {
            assert_eq!(inst.pc, *pc);
            assert_eq!(inst.is_push(), *is_push);
            assert_eq!(&inst.push_data, data);
            assert_eq!(inst.instruction_size(), *size);
        }
    }

    #[test]
    fn test_extract_push_value() {
        // (opcode byte, immediate bytes, expected big-endian value) for PUSH1, PUSH2, PUSH4.
        let cases: Vec<(u8, Vec<u8>, u64)> = vec![
            (0x60, vec![0x42], 0x42),
            (0x61, vec![0x12, 0x34], 0x1234),
            (0x63, vec![0x12, 0x34, 0x56, 0x78], 0x1234_5678),
        ];

        for (byte, data, want) in cases {
            let inst = DisassemblyInstruction::with_push_data(0, op(byte), data);
            assert_eq!(extract_push_value(&inst), Some(U256::from(want)));
        }
    }

    #[test]
    fn test_find_instruction_containing_pc() {
        let result = disassemble(&sample_program());

        // (queried pc, start pc of the instruction expected to cover it)
        let owners: Vec<(usize, usize)> = vec![(0, 0), (1, 0), (2, 2), (3, 2), (4, 2), (5, 5)];
        for (pc, start) in owners {
            let found = result.find_instruction_containing_pc(pc).unwrap();
            assert_eq!(found.pc, start);
        }

        // One past the last byte: nothing covers it.
        assert!(result.find_instruction_containing_pc(6).is_none());
    }

    #[test]
    fn test_truncated_push_instruction() {
        // PUSH2 followed by only one of its two immediate bytes.
        let result = disassemble(&Bytes::from(vec![0x61, 0x12]));

        assert_eq!(result.instructions.len(), 1);
        let only = &result.instructions[0];
        assert!(only.is_push());
        assert_eq!(only.push_data, vec![0x12, 0x00]); // missing byte is zero-filled
    }

    #[test]
    fn test_format_instruction() {
        let push2 = DisassemblyInstruction::with_push_data(10, op(0x61), vec![0x12, 0x34]);
        assert_eq!(format_instruction(&push2, true), "000a: PUSH2 0x1234");
        assert_eq!(format_instruction(&push2, false), "PUSH2 0x1234");

        let dup1 = DisassemblyInstruction::new(5, op(0x80));
        assert_eq!(format_instruction(&dup1, true), "0005: DUP1");
    }
}
387}