Skip to main content

addr_symbolizer/
symbolizer.rs

1// Axel '0vercl0k' Souchet - February 20 2024
2//! This module contains the implementation of the [`Symbolizer`] which is the
3//! object that is able to symbolize files using PDB information if available.
4use std::collections::HashMap;
5use std::fs::{self, File};
6use std::hash::{BuildHasher, Hasher};
7use std::io::{self, BufWriter, Read, Seek, Write};
8use std::path::{Path, PathBuf};
9
10use log::{debug, info, trace, warn};
11
12use crate::addr_space::AddrSpace;
13use crate::misc::{fast_hex32, fast_hex64, parse_full_name};
14use crate::modules::{Module, Modules};
15use crate::pdbcache::{
16    PdbCache, PdbCacheBuilder, PdbCacheStore, format_symcache_path, format_symsrv_url,
17};
18use crate::pe::{PdbId, Pe, PeId, SymcacheEntry};
19use crate::stats::Stats;
20use crate::{Error, Guid, Result};
21
/// A file that was saved on disk, along with the number of bytes written.
#[derive(Debug)]
struct DownloadedFile {
    /// Where the file lives on disk.
    path: PathBuf,
    /// Size of the file, as reported by whoever wrote it.
    size: u64,
}

impl DownloadedFile {
    /// Build a [`DownloadedFile`]; `path` is copied into an owned
    /// [`PathBuf`].
    fn new(path: impl AsRef<Path>, size: u64) -> Self {
        let path = PathBuf::from(path.as_ref());

        Self { path, size }
    }
}
36
/// Where did we find this PDB? Somewhere on the file system, in a local symbol
/// cache, or downloaded from a symbol server.
///
/// This is used mainly to account for statistics; how many files were
/// downloaded, etc.
#[derive(Debug)]
enum PdbLocationKind {
    /// The PDB file was found on the file system but not in a symbol cache.
    Local,
    /// The PDB file was found on the file system in a local symbol cache.
    LocalCache,
    /// The PDB file was downloaded from a remote symbol server. The payload
    /// is the number of bytes that were downloaded.
    Download(u64),
}
51
52#[derive(Debug)]
53struct PdbLocation {
54    kind: PdbLocationKind,
55    path: PathBuf,
56}
57
58impl PdbLocation {
59    fn new(kind: PdbLocationKind, path: PathBuf) -> Self {
60        Self { kind, path }
61    }
62}
63
/// Where did we find this PE? In a local symbol cache or downloaded from a
/// symbol server.
///
/// This is used mainly to account for statistics; how many files were
/// downloaded, etc.
#[derive(Debug)]
enum PeLocationKind {
    /// The PE file was found on the file system in a local symbol cache.
    LocalCache,
    /// The PE file was downloaded from a remote symbol server. The payload is
    /// the number of bytes that were downloaded.
    Download(u64),
}
76
77#[derive(Debug)]
78struct PeLocation {
79    kind: PeLocationKind,
80    pdb_id: Option<PdbId>,
81}
82
83impl PeLocation {
84    fn new(kind: PeLocationKind, pdb_id: Option<PdbId>) -> Self {
85        Self { kind, pdb_id }
86    }
87}
88
89/// Attempt to download a PE/PDB file from a list of symbol servers.
90///
91/// The code iterates through every symbol servers, and stops as soon as it was
92/// able to download a matching file.
93fn download_from_symsrv<'s>(
94    symcache: impl AsRef<Path>,
95    symsrvs: impl Iterator<Item = &'s str>,
96    entry: &impl SymcacheEntry,
97) -> Result<Option<DownloadedFile>> {
98    // The way a symbol path is structured is that there is a directory per module..
99    let symcache = symcache.as_ref();
100    let entry_root_dir = symcache.join(entry.name());
101
102    // ..and inside, there is a directory per version of the PE/PDB..
103    let entry_dir = entry_root_dir.join(entry.index());
104
105    // ..and finally the PE/PDB file itself.
106    let entry_path = entry_dir.join(entry.name());
107
108    // Give a try to each of the symbol servers.
109    for symsrv in symsrvs {
110        // The file doesn't exist on the file system, so let's try to download it from a
111        // symbol server.
112        let entry_url = format_symsrv_url(symsrv, entry);
113        debug!("trying to download {entry_url}..");
114
115        let resp = match ureq::get(&entry_url).call() {
116            Ok(o) => o,
117            // If we get a 404, it means that the server doesn't know about this file. So we'll skip
118            // to the next symbol server.
119            Err(ureq::Error::StatusCode(404)) => {
120                warn!("got a 404 for {entry_url}");
121                continue;
122            }
123            // If we received any other errors, well that's not expected so let's bail.
124            Err(e) => {
125                return Err(Error::Download {
126                    entry_url,
127                    e: e.into(),
128                });
129            }
130        };
131
132        // If the server knows about this file, it is time to create the directory
133        // structure in which we'll download the file into.
134        if !entry_dir.try_exists()? {
135            debug!("creating {}..", entry_dir.display());
136            fs::create_dir_all(&entry_dir).map_err(|_| {
137                Error::Other(format!("failed to create pdb dir {}", entry_dir.display()))
138            })?;
139        }
140
141        // Finally, we can download and save the file.
142        let file = File::create(&entry_path)
143            .map_err(|_| Error::Other(format!("failed to create {}", entry_path.display())))?;
144
145        let size = io::copy(
146            &mut resp.into_body().into_reader(),
147            &mut BufWriter::new(file),
148        )?;
149
150        debug!("downloaded to {}", entry_path.display());
151        return Ok(Some(DownloadedFile::new(entry_path, size)));
152    }
153
154    Ok(None)
155}
156
157/// Try to download a PE file off the symbol servers, and if one is found, try
158/// to extract its PDB identifier.
159fn get_pdb_id_from_symsrvs(
160    pdb_lookup: &PdbLookupConfig,
161    pe_id: &PeId,
162) -> Result<Option<PeLocation>> {
163    Ok(match pdb_lookup.symsrvs() {
164        None => {
165            // If we're offline, we're done.
166            None
167        }
168
169        Some(symsrvs) => {
170            struct FileAddrSpace(File);
171
172            impl FileAddrSpace {
173                fn new(path: impl AsRef<Path>) -> Result<Self> {
174                    Ok(Self(File::open(path.as_ref())?))
175                }
176            }
177
178            impl AddrSpace for FileAddrSpace {
179                fn read_at(&mut self, addr: u64, buf: &mut [u8]) -> io::Result<usize> {
180                    self.0.seek(io::SeekFrom::Start(addr))?;
181
182                    self.0.read(buf)
183                }
184            }
185
186            let symcache = &pdb_lookup.symcache;
187            let mut pe_path = format_symcache_path(symcache, pe_id);
188            let kind = if pe_path.exists() {
189                PeLocationKind::LocalCache
190            } else {
191                // We didn't find a PE on disk, so last resort is to try to download it.
192                let Some(downloaded) = download_from_symsrv(symcache, symsrvs, pe_id)? else {
193                    debug!("did not find {pe_id} on any symbol server");
194                    return Ok(None);
195                };
196
197                pe_path = downloaded.path;
198
199                PeLocationKind::Download(downloaded.size)
200            };
201
202            debug!("trying to parse {} from disk..", pe_path.display());
203            let mut addr_space = FileAddrSpace::new(pe_path)?;
204            let pe_file = Pe::new(&mut addr_space, 0)?;
205            let pdb_id = pe_file.read_pdbid(&mut addr_space)?;
206
207            debug!("PDB id parsed from the PE: {pdb_id:?}");
208
209            Some(PeLocation::new(kind, pdb_id))
210        }
211    })
212}
213
214/// Try to find a PDB file online or locally from a [`PdbId`].
215fn get_pdb(pdb_lookup: &PdbLookupConfig, pdb_id: &PdbId) -> Result<Option<PdbLocation>> {
216    // Let's see if the path exists locally..
217    if pdb_id.path.is_file() {
218        // .. if it does, this is a 'Local' PDB.
219        return Ok(Some(PdbLocation::new(
220            PdbLocationKind::Local,
221            pdb_id.path.clone(),
222        )));
223    }
224
225    // Now, let's see if it's in the local cache..
226    let symcache = &pdb_lookup.symcache;
227    let local_path = format_symcache_path(symcache, pdb_id);
228    if local_path.is_file() {
229        // .. if it does, this is a 'LocalCache' PDB.
230        return Ok(Some(PdbLocation::new(
231            PdbLocationKind::LocalCache,
232            local_path,
233        )));
234    }
235
236    Ok(match pdb_lookup.symsrvs() {
237        None => {
238            // If we're offline, let's just skip the downloading part.
239            None
240        }
241        Some(symsrvs) => {
242            // We didn't find a PDB on disk, so last resort is to try to download it.
243            let downloaded_path = download_from_symsrv(symcache, symsrvs, pdb_id)?;
244
245            downloaded_path
246                .map(|file| PdbLocation::new(PdbLocationKind::Download(file.size), file.path))
247        }
248    })
249}
250
/// A simple 'hasher' that uses the input `u64` as the hash.
///
/// This is used for the cache `HashMap` used in the [`Symbolizer`]. We are
/// caching symbol addresses and so we know those addresses are unique and do
/// not need to be hashed.
///
/// Only 8-byte keys are supported; feeding anything else to
/// [`Hasher::write`] panics.
#[derive(Default)]
struct IdentityHasher {
    /// The last value that was written, which is also the hash.
    h: u64,
}

impl Hasher for IdentityHasher {
    fn finish(&self) -> u64 {
        self.h
    }

    /// Interpret `bytes` as a little-endian `u64` and use it as the hash.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` isn't exactly 8 bytes long.
    fn write(&mut self, bytes: &[u8]) {
        let bytes: [u8; 8] = bytes
            .try_into()
            .expect("IdentityHasher only supports 8-byte keys");

        self.h = u64::from_le_bytes(bytes);
    }

    /// Use `i` as the hash.
    ///
    /// `u64` keys are hashed through this method. The default implementation
    /// forwards the *native-endian* bytes to [`Hasher::write`], which would
    /// yield a byte-swapped value on big-endian targets, so take the value
    /// directly instead.
    fn write_u64(&mut self, i: u64) {
        self.h = i;
    }
}

impl BuildHasher for IdentityHasher {
    type Hasher = Self;

    fn build_hasher(&self) -> Self::Hasher {
        Self::default()
    }
}
280
281/// The logic in here has been extracted from the [`Symbolizer`] class to
282/// satisfy the borrow checker and avoid having free functions taking 5+
283/// arguments.
284struct SymbolizerInner<'symbolizer> {
285    stats: &'symbolizer mut Stats,
286    pdb_lookup: &'symbolizer PdbLookupConfig,
287    pdbcache_store: &'symbolizer mut PdbCacheStore,
288}
289
290impl<'symbolizer> SymbolizerInner<'symbolizer> {
291    fn new(
292        stats: &'symbolizer mut Stats,
293        pdb_lookup: &'symbolizer PdbLookupConfig,
294        pdbcache_store: &'symbolizer mut PdbCacheStore,
295    ) -> Self {
296        Self {
297            stats,
298            pdb_lookup,
299            pdbcache_store,
300        }
301    }
302
303    fn get_or_create_module_pdbcache(
304        &'symbolizer mut self,
305        addr_space: &mut impl AddrSpace,
306        module: &Module,
307    ) -> Result<&'symbolizer PdbCache> {
308        let create_pdbcache = || -> Result<PdbCache> {
309            let mut builder = PdbCacheBuilder::new(module);
310
311            // Let's start by parsing the PE to get its exports, and PDB information if
312            // there's any.
313            let pe = Pe::new(addr_space, module.at.start)?;
314
315            // Ingest the EAT.
316            builder.ingest(pe.read_exports(addr_space)?.unwrap_or_default());
317
318            // See if it has PDB information. If it doesn't try to download the
319            // original PE file off symbol servers.
320            let pdb_id = pe.read_pdbid(addr_space).and_then(|pdb_id| {
321                if pdb_id.is_some() {
322                    return Ok(pdb_id);
323                }
324
325                let pe_id = PeId::new(&module.name, pe.timestamp, pe.size);
326                trace!("No PDB information found, trying to download PE file for {pe_id}..");
327
328                let downloaded_pe = get_pdb_id_from_symsrvs(self.pdb_lookup, &pe_id)?;
329
330                Ok(downloaded_pe.and_then(|d| {
331                    if let PeLocationKind::Download(size) = d.kind {
332                        self.stats.downloaded_pe(pe_id, size);
333                    }
334
335                    d.pdb_id
336                }))
337            })?;
338
339            if let Some(pdb_id) = pdb_id {
340                trace!("getting PDB information for {module:?}/{pdb_id}..");
341
342                // Try to get a PDB..
343                if let Some(downloaded_pdb) = get_pdb(self.pdb_lookup, &pdb_id)? {
344                    if let PdbLocationKind::Download(size) = downloaded_pdb.kind {
345                        self.stats.downloaded_pdb(pdb_id, size);
346                    }
347
348                    // .. and ingest it if we have one.
349                    trace!("Ingesting PDB..");
350                    builder.ingest_pdb(downloaded_pdb.path)?;
351                }
352            }
353
354            // Build the cache..
355            let pdbcache = builder.build()?;
356
357            Ok(pdbcache)
358        };
359
360        self.pdbcache_store.get_or_create(module, create_pdbcache)
361    }
362
363    /// Try to symbolize an address.
364    ///
365    /// If there's a [`PdbCache`] already created, then ask it to symbolize.
366    /// Otherwise, this will create a [`PdbCache`], try to find a PDB (locally
367    /// or remotely) and extract every bit of relevant information for us.
368    /// Finally, the result will be kept around to symbolize addresses in that
369    /// module faster in the future.
370    fn try_symbolize_addr_from_pdbs(
371        &'symbolizer mut self,
372        addr_space: &mut impl AddrSpace,
373        module: &Module,
374        addr: u64,
375    ) -> Result<Option<String>> {
376        trace!("symbolizing address {addr:#x} from {}..", module.name);
377
378        // Get a pdbcache..
379        let pdbcache = self.get_or_create_module_pdbcache(addr_space, module)?;
380
381        // .. and symbolize `addr`!
382        let line = pdbcache.symbolize(module.rva(addr));
383
384        Ok(Some(line))
385    }
386}
387
388/// Holds the details of where PDBs can be looked up from; both locally and
389/// online if possible.
390#[derive(Debug)]
391pub struct PdbLookupConfig {
392    /// This is a path to the local PDB symbol cache where PDBs will be
393    /// downloaded into / where some are available.
394    symcache: PathBuf,
395    /// List of symbol servers to try to download PDBs from when needed.
396    symsrvs: Option<Vec<String>>,
397}
398
399impl PdbLookupConfig {
400    fn inner_new(symcache: PathBuf, symsrvs: Option<Vec<String>>) -> Result<Self> {
401        if !symcache.is_dir() {
402            return Err(Error::Other(format!(
403                "{} directory does not exist",
404                symcache.display()
405            )));
406        }
407
408        Ok(Self { symcache, symsrvs })
409    }
410
411    pub fn new(symcache: PathBuf) -> Result<Self> {
412        Self::inner_new(symcache, None)
413    }
414
415    pub fn with_msft_symsrv(symcache: PathBuf) -> Result<Self> {
416        Self::with_symsrvs(symcache, vec![
417            "https://msdl.microsoft.com/download/symbols/".to_string(),
418        ])
419    }
420
421    pub fn with_symsrvs(symcache: PathBuf, symsrvs: Vec<String>) -> Result<Self> {
422        Self::inner_new(symcache, Some(symsrvs))
423    }
424
425    #[must_use]
426    pub fn symcache(&self) -> &Path {
427        &self.symcache
428    }
429
430    #[must_use]
431    pub fn is_offline(&self) -> bool {
432        self.symsrvs.is_none()
433    }
434
435    #[must_use]
436    pub fn is_online(&self) -> bool {
437        self.symsrvs.is_some()
438    }
439
440    fn symsrvs(&self) -> Option<impl Iterator<Item = &str>> {
441        self.symsrvs
442            .as_ref()
443            .map(|symsrvs| symsrvs.iter().map(AsRef::as_ref))
444    }
445}
446
/// The [`Symbolizer`] is the main object that glues all the logic.
///
/// It downloads, parses PDB information, and symbolizes.
pub struct Symbolizer {
    /// Keep track of some statistics such as the number of lines symbolized,
    /// PDB downloaded, etc.
    stats: Stats,
    /// This is the list of kernel / user modules read from the kernel crash
    /// dump.
    modules: Modules,
    /// Where PE / PDB files are looked up: the local symbol cache and, if
    /// online, the symbol servers to download from.
    pdb_lookup: PdbLookupConfig,
    /// Caches addresses to symbols. This allows us to not have to symbolize an
    /// address again.
    addr_cache: HashMap<u64, Box<str>, IdentityHasher>,
    /// Each parsed module is stored in this cache. We parse PDBs, etc. only
    /// once and then the [`PdbCache`] is used to query.
    pdbcache_store: PdbCacheStore,
}
466
467impl Symbolizer {
468    /// Create a [`Symbolizer`].
469    #[must_use]
470    pub fn new(pdb_lookup: PdbLookupConfig, modules: impl IntoIterator<Item = Module>) -> Self {
471        let modules = modules.into_iter().collect();
472
473        Self {
474            stats: Stats::default(),
475            modules: Modules::new(modules),
476            pdb_lookup,
477            addr_cache: HashMap::default(),
478            pdbcache_store: PdbCacheStore::default(),
479        }
480    }
481
482    pub fn with_cache_capacity(
483        pdb_lookup: PdbLookupConfig,
484        modules: impl IntoIterator<Item = Module>,
485        cache_capacity_hint: usize,
486    ) -> Self {
487        let modules = modules.into_iter().collect();
488        let addr_cache =
489            HashMap::with_capacity_and_hasher(cache_capacity_hint, IdentityHasher::default());
490
491        Self {
492            stats: Stats::default(),
493            modules: Modules::new(modules),
494            pdb_lookup,
495            addr_cache,
496            pdbcache_store: PdbCacheStore::default(),
497        }
498    }
499
    /// Borrow the [`Stats`] gathered so far (cache hits, addresses
    /// symbolized, PE / PDB files downloaded).
    #[must_use]
    pub fn stats(&self) -> &Stats {
        &self.stats
    }
505
506    /// Try to symbolize an address.
507    ///
508    /// If the address has been symbolized before, it will be in the
509    /// `addr_cache` already. If not, we need to take the slow path and ask the
510    /// right [`PdbCache`] which might require to create one in the first place.
511    fn try_symbolize_addr(
512        &mut self,
513        addr_space: &mut impl AddrSpace,
514        addr: u64,
515    ) -> Result<Option<&str>> {
516        use std::collections::hash_map::Entry::{Occupied, Vacant};
517        Ok(match self.addr_cache.entry(addr) {
518            Occupied(o) => {
519                self.stats.cache_hit();
520
521                Some(o.into_mut())
522            }
523            Vacant(v) => {
524                let Some(module) = self.modules.by_addr(addr) else {
525                    trace!("address {addr:#x} doesn't belong to any module");
526                    return Ok(None);
527                };
528
529                let mut inner = SymbolizerInner::new(
530                    &mut self.stats,
531                    &self.pdb_lookup,
532                    &mut self.pdbcache_store,
533                );
534
535                let Some(symbol) = inner.try_symbolize_addr_from_pdbs(addr_space, module, addr)?
536                else {
537                    return Ok(None);
538                };
539
540                Some(v.insert(symbol.into_boxed_str()))
541            }
542        })
543    }
544
545    /// Symbolize `addr` in the `module+offset` style.
546    pub fn symbolize_modoff(&mut self, addr: u64) -> Result<String> {
547        let mut modoff = Vec::new();
548        self.symbolize_modoff_into(addr, &mut modoff)?;
549
550        Ok(String::from_utf8(modoff)?)
551    }
552
553    /// Symbolize `addr` in the `module!function+offset` style.
554    pub fn symbolize_full(&mut self, addr_space: &mut impl AddrSpace, addr: u64) -> Result<String> {
555        let mut full = Vec::new();
556        self.symbolize_full_into(addr_space, addr, &mut full)?;
557
558        Ok(String::from_utf8(full)?)
559    }
560
561    /// Symbolize `addr` in the `module+offset` style and write the result into
562    /// `output`.
563    pub fn symbolize_modoff_into(&mut self, addr: u64, output: &mut impl Write) -> Result<()> {
564        let mut buffer = [0; 16];
565        if let Some(module) = self.modules.by_addr(addr) {
566            output.write_all(module.name.as_bytes())?;
567            output.write_all(b"+0x")?;
568
569            output.write_all(fast_hex32(
570                &mut buffer[0..8].try_into().unwrap(),
571                module.rva(addr),
572            ))
573        } else {
574            output.write_all(b"0x")?;
575
576            output.write_all(fast_hex64(&mut buffer, addr))
577        }
578        .map_err(|_| Error::Other("failed to write symbolized value to output".to_string()))?;
579
580        self.stats.addr_symbolized();
581
582        Ok(())
583    }
584
585    /// Symbolize `addr` in the `module!function+offset` style and write the
586    /// result into `output`.
587    pub fn symbolize_full_into(
588        &mut self,
589        addr_space: &mut impl AddrSpace,
590        addr: u64,
591        output: &mut impl Write,
592    ) -> Result<()> {
593        match self.try_symbolize_addr(addr_space, addr)? {
594            Some(sym) => {
595                output.write_all(sym.as_bytes()).map_err(|_| {
596                    Error::Other("failed to write symbolized value to output".to_string())
597                })?;
598
599                self.stats.addr_symbolized();
600
601                Ok(())
602            }
603            None => self.symbolize_modoff_into(addr, output),
604        }
605    }
606
607    /// Resolves a symbol name (eg `mod.dll!foo+0x1337` / `mod.dll+0x1337`) into
608    /// an address.
609    pub fn name_to_addr(
610        &mut self,
611        addr_space: &mut impl AddrSpace,
612        name: &str,
613    ) -> Result<Option<u64>> {
614        let Some(parsed_name) = parse_full_name(name) else {
615            return Err(Error::Other(format!("failed to parse {name}")));
616        };
617
618        let Some(module) = self.modules.by_name(parsed_name.module_name) else {
619            return Ok(None);
620        };
621
622        let mut inner =
623            SymbolizerInner::new(&mut self.stats, &self.pdb_lookup, &mut self.pdbcache_store);
624
625        let pdbcache = inner.get_or_create_module_pdbcache(addr_space, module)?;
626
627        Ok(pdbcache
628            .addr_by_name(parsed_name.function_name)
629            .map(|base_addr| u64::from(base_addr).strict_add(parsed_name.offset)))
630    }
631
632    /// Imports PDBs from other directory into the symcache that is used by this
633    /// [`Symbolizer`].
634    pub fn import_pdbs(&self, dirs: impl IntoIterator<Item = impl AsRef<Path>>) -> Result<()> {
635        for dir in dirs {
636            let dir = dir.as_ref();
637            if !(dir.exists() && dir.is_dir()) {
638                return Err(Error::Other(format!(
639                    "cannot import pdb from {} as it doesn't exist or isn't a directory",
640                    dir.display()
641                )));
642            }
643
644            for file in dir.read_dir()? {
645                let path = file?.path();
646                if !path.is_file() {
647                    debug!("skipping {} because not a file", path.display());
648                    continue;
649                }
650
651                let Some(ext) = path.extension() else {
652                    debug!(
653                        "skipping {} because doesn't have an extension",
654                        path.display()
655                    );
656                    continue;
657                };
658
659                if ext != "pdb" {
660                    debug!("skipping {} because not a pdb file", path.display());
661                    continue;
662                }
663
664                let Some(filename) = path.file_name() else {
665                    debug!("skipping {} because no filename", path.display());
666                    continue;
667                };
668
669                let mut pdb = pdb2::PDB::open(File::open(&path)?)?;
670                let info = pdb.pdb_information()?;
671                let debug_info = pdb.debug_information()?;
672                let Some(age) = debug_info.age() else {
673                    debug!("skipping {} because no age in debug info", path.display());
674                    continue;
675                };
676
677                let pdbid = PdbId::new(filename, Guid::from(info.guid.to_bytes_le()), age)?;
678                let cached_pdb = format_symcache_path(self.pdb_lookup.symcache(), &pdbid);
679                if cached_pdb.exists() {
680                    debug!(
681                        "skipping {} because already in symbol cache",
682                        path.display()
683                    );
684                    continue;
685                }
686
687                let Some(cached_pdb_dir) = cached_pdb.parent() else {
688                    return Err(Error::Other(format!(
689                        "{} has no parent",
690                        cached_pdb.display()
691                    )));
692                };
693
694                info!(
695                    "copying {} into the symbol cache at {}",
696                    path.display(),
697                    cached_pdb.display()
698                );
699                fs::create_dir_all(cached_pdb_dir)?;
700                fs::copy(path, cached_pdb)?;
701            }
702        }
703
704        Ok(())
705    }
706}