// py_spy/binary_parser.rs

1use std::collections::HashMap;
2use std::fs::File;
3use std::path::Path;
4
5use anyhow::Error;
6use goblin::Object;
7use memmap2::Mmap;
8
9use crate::utils::is_subrange;
10
/// Summary of the parts of an executable or shared library that `parse_binary` extracts.
///
/// Address fields are left at `0` when the corresponding section was not found or when
/// relocating it would overflow a `u64`.
#[derive(Debug, Clone)]
pub struct BinaryInfo {
    /// Symbol name mapped to its address after `parse_binary` applied the load offset.
    /// For Mach-O binaries the leading `_` of each symbol name is stripped.
    pub symbols: HashMap<String, u64>,
    /// Relocated start address of the section used as BSS (`__bss` for Mach-O, the largest
    /// NOBITS `.bss` for ELF, `.data` for PE).
    pub bss_addr: u64,
    /// Size in bytes of the section described by `bss_addr`.
    pub bss_size: u64,
    /// Relocated start address of the `PyRuntime` section (`.PyRuntime` for ELF, `PyRuntim`
    /// for PE); `0` when the binary has no such section.
    pub pyruntime_addr: u64,
    /// Size in bytes of the section described by `pyruntime_addr`.
    pub pyruntime_size: u64,
    /// The `addr` argument that was passed to `parse_binary`.
    // Only read by `contains`, which is compiled solely with the `unwind` feature.
    #[allow(dead_code)]
    pub addr: u64,
    /// The `size` argument that was passed to `parse_binary`.
    // Only read by `contains`, which is compiled solely with the `unwind` feature.
    #[allow(dead_code)]
    pub size: u64,
}
22
23impl BinaryInfo {
24    #[cfg(feature = "unwind")]
25    pub fn contains(&self, addr: u64) -> bool {
26        addr >= self.addr && addr < (self.addr + self.size)
27    }
28}
29
/// Determines at runtime which Mach-O CPU type the current machine uses.
///
/// Queries the `hw.optional.arm64` sysctl and returns `CPU_TYPE_ARM64` when it reports `1`,
/// otherwise `CPU_TYPE_X86_64`. A failed sysctl call is logged and treated as x86_64.
#[cfg(target_os = "macos")]
fn get_mach_cpu_type() -> goblin::mach::cputype::CpuType {
    // Both locals are written to by `sysctlbyname` through raw pointers, so they must be
    // mutable bindings whose pointers come from `&mut`; writing through a pointer derived
    // from a shared reference to an immutable binding is undefined behaviour.
    let mut is_arm: i32 = 0;
    let mut size: usize = std::mem::size_of_val(&is_arm);
    let name = std::ffi::CString::new("hw.optional.arm64").expect("CString::new failed");

    // SAFETY: `name` is a NUL-terminated string that outlives the call, `is_arm` and `size`
    // are live locals borrowed exclusively for the duration of the call, `size` holds the
    // byte length of `is_arm`, and no new value is supplied (null pointer with length 0).
    let ret = unsafe {
        libc::sysctlbyname(
            name.as_ptr(),
            &mut is_arm as *mut i32 as *mut _,
            &mut size as *mut usize,
            std::ptr::null_mut(),
            0,
        )
    };
    if ret != 0 {
        // if the `hw.optional.arm64` key doesn't exist, it's likely that we are running on an
        // older x86_64 based intel mac
        warn!("failed to call 'libc::sysctlbyname(\"hw.optional.arm64\",...' - assume running on x86_64 ");
        return goblin::mach::cputype::CPU_TYPE_X86_64;
    }

    if is_arm == 1 {
        goblin::mach::cputype::CPU_TYPE_ARM64
    } else {
        goblin::mach::cputype::CPU_TYPE_X86_64
    }
}
56
57#[cfg(not(target_os = "macos"))]
58fn get_mach_cpu_type() -> goblin::mach::cputype::CpuType {
59    goblin::mach::cputype::CPU_TYPE_ANY
60}
61
62/// Uses goblin to parse a binary file, returns information on symbols/bss/adjusted offset etc
63pub fn parse_binary(filename: &Path, addr: u64, size: u64) -> Result<BinaryInfo, Error> {
64    let offset = addr;
65
66    let mut symbols = HashMap::new();
67
68    // Read in the filename
69    let file = File::open(filename)?;
70    let buffer = unsafe { Mmap::map(&file)? };
71
72    // for OSX, we could be running under rosetta (is compiled for x86_64, but running under
73    // arm64). check the currently running cpu type rather than relying on compile time flags
74    let mach_cputype = get_mach_cpu_type();
75
76    // Use goblin to parse the binary
77    match Object::parse(&buffer)? {
78        Object::Mach(mach) => {
79            // Get the mach binary from the archive
80            let mach = match mach {
81                goblin::mach::Mach::Binary(mach) => mach,
82                goblin::mach::Mach::Fat(fat) => {
83                    let arch = fat
84                        .iter_arches()
85                        .find(|arch| match arch {
86                            Ok(arch) => arch.is_64() && arch.cputype() == mach_cputype,
87                            Err(_) => false,
88                        })
89                        .ok_or_else(|| {
90                            format_err!(
91                                "Failed to find 64 bit arch in FAT archive in {}",
92                                filename.display()
93                            )
94                        })??;
95                    if !is_subrange(0, buffer.len(), arch.offset as usize, arch.size as usize) {
96                        return Err(format_err!(
97                            "Invalid offset/size in FAT archive in {}",
98                            filename.display()
99                        ));
100                    }
101                    let bytes = &buffer[arch.offset as usize..][..arch.size as usize];
102                    goblin::mach::MachO::parse(bytes, 0)?
103                }
104            };
105
106            let mut pyruntime_addr = 0;
107            let mut pyruntime_size = 0;
108            let mut bss_addr = 0;
109            let mut bss_size = 0;
110            for segment in mach.segments.iter() {
111                for (section, _) in &segment.sections()? {
112                    let name = section.name()?;
113                    if name == "PyRuntime" {
114                        if let Some(addr) = section.addr.checked_add(offset) {
115                            if addr.checked_add(section.size).is_some() {
116                                pyruntime_addr = addr;
117                                pyruntime_size = section.size;
118                            }
119                        }
120                    }
121
122                    if name == "__bss" {
123                        if let Some(addr) = section.addr.checked_add(offset) {
124                            if addr.checked_add(section.size).is_some() {
125                                bss_addr = addr;
126                                bss_size = section.size;
127                            }
128                        }
129                    }
130                }
131            }
132
133            if let Some(syms) = mach.symbols {
134                for symbol in syms.iter() {
135                    let (name, value) = symbol?;
136                    // almost every symbol we care about starts with an extra _, remove to normalize
137                    // with the entries seen on linux/windows
138                    if let Some(stripped_name) = name.strip_prefix('_') {
139                        symbols.insert(stripped_name.to_string(), value.n_value + offset);
140                    }
141                }
142            }
143            Ok(BinaryInfo {
144                symbols,
145                bss_addr,
146                bss_size,
147                pyruntime_addr,
148                pyruntime_size,
149                addr,
150                size,
151            })
152        }
153
154        Object::Elf(elf) => {
155            let strtab = elf.shdr_strtab;
156            let bss_header = elf
157                .section_headers
158                .iter()
159                // filter down to things that are both NOBITS sections and are named .bss
160                .filter(|header| header.sh_type == goblin::elf::section_header::SHT_NOBITS)
161                .filter(|header| {
162                    strtab
163                        .get_at(header.sh_name)
164                        .is_none_or(|name| name == ".bss")
165                })
166                // if we have multiple sections here, take the largest
167                .max_by_key(|header| header.sh_size)
168                .ok_or_else(|| {
169                    format_err!(
170                        "Failed to find BSS section header in {}",
171                        filename.display()
172                    )
173                })?;
174
175            let program_header = elf
176                .program_headers
177                .iter()
178                .find(|header| {
179                    header.p_type == goblin::elf::program_header::PT_LOAD
180                        && header.p_flags & goblin::elf::program_header::PF_X != 0
181                })
182                .ok_or_else(|| {
183                    format_err!(
184                        "Failed to find executable PT_LOAD program header in {}",
185                        filename.display()
186                    )
187                })?;
188
189            // Align the virtual address offset, then subtract it from the offset
190            // to get real offset for symbol addresses in the file.
191            let aligned_vaddr =
192                program_header.p_vaddr - (program_header.p_vaddr % page_size::get() as u64);
193            let offset = offset.saturating_sub(aligned_vaddr);
194
195            let mut bss_addr = 0;
196            let mut bss_size = 0;
197            let mut bss_end = 0;
198            if let Some(addr) = bss_header.sh_addr.checked_add(offset) {
199                if bss_header.sh_size.checked_add(addr).is_none() {
200                    return Err(format_err!(
201                        "Invalid bss address/size in {}",
202                        filename.display()
203                    ));
204                }
205                bss_addr = addr;
206                bss_size = bss_header.sh_size;
207                bss_end = bss_header.sh_addr + bss_header.sh_size;
208            }
209
210            let pyruntime_header = elf
211                .section_headers
212                .iter()
213                .find(|header| strtab.get_at(header.sh_name) == Some(".PyRuntime"));
214
215            let mut pyruntime_addr = 0;
216            let mut pyruntime_size = 0;
217            if let Some(header) = pyruntime_header {
218                if let Some(addr) = header.sh_addr.checked_add(offset) {
219                    pyruntime_addr = addr;
220                    pyruntime_size = header.sh_size;
221                }
222            }
223
224            for sym in elf.syms.iter() {
225                // Skip undefined symbols.
226                if sym.st_shndx == goblin::elf::section_header::SHN_UNDEF as usize {
227                    continue;
228                }
229                // Skip imported symbols
230                if sym.is_import()
231                    || (bss_end != 0
232                        && sym.st_size != 0
233                        && !is_subrange(0u64, bss_end, sym.st_value, sym.st_size))
234                {
235                    continue;
236                }
237                if let Some(pos) = sym.st_value.checked_add(offset) {
238                    if sym.is_function() && !is_subrange(addr, size, pos, sym.st_size) {
239                        continue;
240                    }
241                    if let Some(name) = elf.strtab.get_unsafe(sym.st_name) {
242                        symbols.insert(name.to_string(), pos);
243                    }
244                }
245            }
246            for dynsym in elf.dynsyms.iter() {
247                // Skip undefined symbols.
248                if dynsym.st_shndx == goblin::elf::section_header::SHN_UNDEF as usize {
249                    continue;
250                }
251                // Skip imported symbols
252                if dynsym.is_import()
253                    || (bss_end != 0
254                        && dynsym.st_size != 0
255                        && !is_subrange(0u64, bss_end, dynsym.st_value, dynsym.st_size))
256                {
257                    continue;
258                }
259                if let Some(pos) = dynsym.st_value.checked_add(offset) {
260                    if dynsym.is_function() && !is_subrange(addr, size, pos, dynsym.st_size) {
261                        continue;
262                    }
263                    if let Some(name) = elf.dynstrtab.get_unsafe(dynsym.st_name) {
264                        symbols.insert(name.to_string(), pos);
265                    }
266                }
267            }
268
269            Ok(BinaryInfo {
270                symbols,
271                bss_addr,
272                bss_size,
273                pyruntime_addr,
274                pyruntime_size,
275                addr,
276                size,
277            })
278        }
279        Object::PE(pe) => {
280            for export in pe.exports {
281                if let Some(name) = export.name {
282                    if let Some(addr) = offset.checked_add(export.rva as u64) {
283                        symbols.insert(name.to_string(), addr);
284                    }
285                }
286            }
287
288            let mut bss_addr = 0;
289            let mut bss_size = 0;
290            let mut pyruntime_addr = 0;
291            let mut pyruntime_size = 0;
292            let mut found_data = false;
293            for section in pe.sections.iter() {
294                if section.name.starts_with(b".data") {
295                    found_data = true;
296                    if let Some(addr) = offset.checked_add(section.virtual_address as u64) {
297                        if addr.checked_add(section.virtual_size as u64).is_some() {
298                            bss_addr = addr;
299                            bss_size = u64::from(section.virtual_size);
300                        }
301                    }
302                } else if section.name.starts_with(b"PyRuntim") {
303                    // note that the name is only 8 chars here, so we don't check for
304                    // trailing 'e' in PyRuntime
305                    if let Some(addr) = offset.checked_add(section.virtual_address as u64) {
306                        if addr.checked_add(section.virtual_size as u64).is_some() {
307                            pyruntime_addr = addr;
308                            pyruntime_size = u64::from(section.virtual_size);
309                        }
310                    }
311                }
312            }
313
314            if !found_data {
315                return Err(format_err!(
316                    "Failed to find .data section in PE binary of {}",
317                    filename.display()
318                ));
319            }
320
321            Ok(BinaryInfo {
322                symbols,
323                bss_addr,
324                bss_size,
325                pyruntime_size,
326                pyruntime_addr,
327                addr,
328                size,
329            })
330        }
331        _ => Err(format_err!("Unhandled binary type")),
332    }
333}