py_spy/
binary_parser.rs

1use std::collections::HashMap;
2use std::fs::File;
3use std::path::Path;
4
5use anyhow::Error;
6use goblin::Object;
7use memmap2::Mmap;
8
9use crate::utils::is_subrange;
10
/// Addresses and sizes extracted from a single binary by `parse_binary`.
///
/// Every address stored here has already been rebased by `parse_binary`, i.e. the
/// file-relative value plus the offset derived from the `addr` argument it was given.
pub struct BinaryInfo {
    /// Symbol name -> rebased address. Mach-O names are stored without their leading `_`.
    pub symbols: HashMap<String, u64>,

    /// Rebased start of the bss-like section (`.bss` for ELF, `__bss` for Mach-O,
    /// `.data` for PE); left at 0 when the section's range could not be computed.
    pub bss_addr: u64,
    /// Size in bytes of the section described by `bss_addr`.
    pub bss_size: u64,

    /// Rebased start of the `PyRuntime` section, or 0 when the binary has none.
    pub pyruntime_addr: u64,
    /// Size in bytes of the `PyRuntime` section, or 0 when the binary has none.
    pub pyruntime_size: u64,

    /// The `addr` value that was handed to `parse_binary`, stored unchanged.
    #[allow(dead_code)]
    pub addr: u64,
    /// The `size` value that was handed to `parse_binary`, stored unchanged.
    #[allow(dead_code)]
    pub size: u64,
}
22
23impl BinaryInfo {
24    #[cfg(feature = "unwind")]
25    pub fn contains(&self, addr: u64) -> bool {
26        addr >= self.addr && addr < (self.addr + self.size)
27    }
28}
29
30/// Uses goblin to parse a binary file, returns information on symbols/bss/adjusted offset etc
31pub fn parse_binary(filename: &Path, addr: u64, size: u64) -> Result<BinaryInfo, Error> {
32    let offset = addr;
33
34    let mut symbols = HashMap::new();
35
36    // Read in the filename
37    let file = File::open(filename)?;
38    let buffer = unsafe { Mmap::map(&file)? };
39
40    // Use goblin to parse the binary
41    match Object::parse(&buffer)? {
42        Object::Mach(mach) => {
43            // Get the mach binary from the archive
44            let mach = match mach {
45                goblin::mach::Mach::Binary(mach) => mach,
46                goblin::mach::Mach::Fat(fat) => {
47                    let arch = fat
48                        .iter_arches()
49                        .find(|arch| match arch {
50                            Ok(arch) => arch.is_64(),
51                            Err(_) => false,
52                        })
53                        .ok_or_else(|| {
54                            format_err!(
55                                "Failed to find 64 bit arch in FAT archive in {}",
56                                filename.display()
57                            )
58                        })??;
59                    if !is_subrange(0, buffer.len(), arch.offset as usize, arch.size as usize) {
60                        return Err(format_err!(
61                            "Invalid offset/size in FAT archive in {}",
62                            filename.display()
63                        ));
64                    }
65                    let bytes = &buffer[arch.offset as usize..][..arch.size as usize];
66                    goblin::mach::MachO::parse(bytes, 0)?
67                }
68            };
69
70            let mut pyruntime_addr = 0;
71            let mut pyruntime_size = 0;
72            let mut bss_addr = 0;
73            let mut bss_size = 0;
74            for segment in mach.segments.iter() {
75                for (section, _) in &segment.sections()? {
76                    let name = section.name()?;
77                    if name == "PyRuntime" {
78                        if let Some(addr) = section.addr.checked_add(offset) {
79                            if addr.checked_add(section.size).is_some() {
80                                pyruntime_addr = addr;
81                                pyruntime_size = section.size;
82                            }
83                        }
84                    }
85
86                    if name == "__bss" {
87                        if let Some(addr) = section.addr.checked_add(offset) {
88                            if addr.checked_add(section.size).is_some() {
89                                bss_addr = addr;
90                                bss_size = section.size;
91                            }
92                        }
93                    }
94                }
95            }
96
97            if let Some(syms) = mach.symbols {
98                for symbol in syms.iter() {
99                    let (name, value) = symbol?;
100                    // almost every symbol we care about starts with an extra _, remove to normalize
101                    // with the entries seen on linux/windows
102                    if let Some(stripped_name) = name.strip_prefix('_') {
103                        symbols.insert(stripped_name.to_string(), value.n_value + offset);
104                    }
105                }
106            }
107            Ok(BinaryInfo {
108                symbols,
109                bss_addr,
110                bss_size,
111                pyruntime_addr,
112                pyruntime_size,
113                addr,
114                size,
115            })
116        }
117
118        Object::Elf(elf) => {
119            let strtab = elf.shdr_strtab;
120            let bss_header = elf
121                .section_headers
122                .iter()
123                // filter down to things that are both NOBITS sections and are named .bss
124                .filter(|header| header.sh_type == goblin::elf::section_header::SHT_NOBITS)
125                .filter(|header| {
126                    strtab
127                        .get_at(header.sh_name)
128                        .map_or(true, |name| name == ".bss")
129                })
130                // if we have multiple sections here, take the largest
131                .max_by_key(|header| header.sh_size)
132                .ok_or_else(|| {
133                    format_err!(
134                        "Failed to find BSS section header in {}",
135                        filename.display()
136                    )
137                })?;
138
139            let program_header = elf
140                .program_headers
141                .iter()
142                .find(|header| {
143                    header.p_type == goblin::elf::program_header::PT_LOAD
144                        && header.p_flags & goblin::elf::program_header::PF_X != 0
145                })
146                .ok_or_else(|| {
147                    format_err!(
148                        "Failed to find executable PT_LOAD program header in {}",
149                        filename.display()
150                    )
151                })?;
152
153            // Align the virtual address offset, then subtract it from the offset
154            // to get real offset for symbol addresses in the file.
155            let aligned_vaddr =
156                program_header.p_vaddr - (program_header.p_vaddr % page_size::get() as u64);
157            let offset = offset.saturating_sub(aligned_vaddr);
158
159            let mut bss_addr = 0;
160            let mut bss_size = 0;
161            let mut bss_end = 0;
162            if let Some(addr) = bss_header.sh_addr.checked_add(offset) {
163                if bss_header.sh_size.checked_add(addr).is_none() {
164                    return Err(format_err!(
165                        "Invalid bss address/size in {}",
166                        filename.display()
167                    ));
168                }
169                bss_addr = addr;
170                bss_size = bss_header.sh_size;
171                bss_end = bss_header.sh_addr + bss_header.sh_size;
172            }
173
174            let pyruntime_header = elf.section_headers.iter().find(|header| {
175                strtab
176                    .get_at(header.sh_name)
177                    .map_or(false, |name| name == ".PyRuntime")
178            });
179
180            let mut pyruntime_addr = 0;
181            let mut pyruntime_size = 0;
182            if let Some(header) = pyruntime_header {
183                if let Some(addr) = header.sh_addr.checked_add(offset) {
184                    pyruntime_addr = addr;
185                    pyruntime_size = header.sh_size;
186                }
187            }
188
189            for sym in elf.syms.iter() {
190                // Skip undefined symbols.
191                if sym.st_shndx == goblin::elf::section_header::SHN_UNDEF as usize {
192                    continue;
193                }
194                // Skip imported symbols
195                if sym.is_import()
196                    || (bss_end != 0
197                        && sym.st_size != 0
198                        && !is_subrange(0u64, bss_end, sym.st_value, sym.st_size))
199                {
200                    continue;
201                }
202                if let Some(pos) = sym.st_value.checked_add(offset) {
203                    if sym.is_function() && !is_subrange(addr, size, pos, sym.st_size) {
204                        continue;
205                    }
206                    if let Some(name) = elf.strtab.get_unsafe(sym.st_name) {
207                        symbols.insert(name.to_string(), pos);
208                    }
209                }
210            }
211            for dynsym in elf.dynsyms.iter() {
212                // Skip undefined symbols.
213                if dynsym.st_shndx == goblin::elf::section_header::SHN_UNDEF as usize {
214                    continue;
215                }
216                // Skip imported symbols
217                if dynsym.is_import()
218                    || (bss_end != 0
219                        && dynsym.st_size != 0
220                        && !is_subrange(0u64, bss_end, dynsym.st_value, dynsym.st_size))
221                {
222                    continue;
223                }
224                if let Some(pos) = dynsym.st_value.checked_add(offset) {
225                    if dynsym.is_function() && !is_subrange(addr, size, pos, dynsym.st_size) {
226                        continue;
227                    }
228                    if let Some(name) = elf.dynstrtab.get_unsafe(dynsym.st_name) {
229                        symbols.insert(name.to_string(), pos);
230                    }
231                }
232            }
233
234            Ok(BinaryInfo {
235                symbols,
236                bss_addr,
237                bss_size,
238                pyruntime_addr,
239                pyruntime_size,
240                addr,
241                size,
242            })
243        }
244        Object::PE(pe) => {
245            for export in pe.exports {
246                if let Some(name) = export.name {
247                    if let Some(addr) = offset.checked_add(export.rva as u64) {
248                        symbols.insert(name.to_string(), addr);
249                    }
250                }
251            }
252
253            let mut bss_addr = 0;
254            let mut bss_size = 0;
255            let mut pyruntime_addr = 0;
256            let mut pyruntime_size = 0;
257            let mut found_data = false;
258            for section in pe.sections.iter() {
259                if section.name.starts_with(b".data") {
260                    found_data = true;
261                    if let Some(addr) = offset.checked_add(section.virtual_address as u64) {
262                        if addr.checked_add(section.virtual_size as u64).is_some() {
263                            bss_addr = addr;
264                            bss_size = u64::from(section.virtual_size);
265                        }
266                    }
267                } else if section.name.starts_with(b"PyRuntim") {
268                    // note that the name is only 8 chars here, so we don't check for
269                    // trailing 'e' in PyRuntime
270                    if let Some(addr) = offset.checked_add(section.virtual_address as u64) {
271                        if addr.checked_add(section.virtual_size as u64).is_some() {
272                            pyruntime_addr = addr;
273                            pyruntime_size = u64::from(section.virtual_size);
274                        }
275                    }
276                }
277            }
278
279            if !found_data {
280                return Err(format_err!(
281                    "Failed to find .data section in PE binary of {}",
282                    filename.display()
283                ));
284            }
285
286            Ok(BinaryInfo {
287                symbols,
288                bss_addr,
289                bss_size,
290                pyruntime_size,
291                pyruntime_addr,
292                addr,
293                size,
294            })
295        }
296        _ => Err(format_err!("Unhandled binary type")),
297    }
298}