samply_symbols/
shared.rs

1#[cfg(feature = "partial_read_stats")]
2use std::cell::RefCell;
3use std::fmt::{Debug, Display};
4use std::future::Future;
5use std::marker::PhantomData;
6use std::ops::{Deref, Range};
7use std::str::FromStr;
8use std::sync::Arc;
9
10#[cfg(feature = "partial_read_stats")]
11use bitvec::{bitvec, prelude::BitVec};
12use debugid::DebugId;
13use object::read::ReadRef;
14use object::FileFlags;
15use uuid::Uuid;
16
17use crate::mapped_path::MappedPath;
18use crate::symbol_map::SymbolMapTrait;
19
20pub type FileAndPathHelperError = Box<dyn std::error::Error + Send + Sync + 'static>;
21pub type FileAndPathHelperResult<T> = std::result::Result<T, FileAndPathHelperError>;
22
// Define an OptionallySendFuture trait. This exists for the following reasons:
24//  - The "+ Send" in the return types of the FileAndPathHelper trait methods
25//    trickles down all the way to the root async functions exposed by this crate.
26//  - We have two consumers: One that requires Send on the futures returned by those
27//    root functions, and one that cannot return Send futures from the trait methods.
28//    The former is hyper/tokio (in profiler-symbol-server), the latter is the wasm/js
29//    implementation: JsFutures are not Send.
30// So we provide a cargo feature to allow the consumer to select whether they want Send or not.
31//
32// Please tell me that there is a better way.
33
/// A [`Future`] that is additionally required to be `Send` when the
/// `send_futures` cargo feature is enabled.
#[cfg(feature = "send_futures")]
pub trait OptionallySendFuture: Future + Send {}

/// With `send_futures` enabled, every `Send` future qualifies automatically.
#[cfg(feature = "send_futures")]
impl<T: Future + Send> OptionallySendFuture for T {}

/// A [`Future`] without any `Send` requirement, because the `send_futures`
/// cargo feature is disabled.
#[cfg(not(feature = "send_futures"))]
pub trait OptionallySendFuture: Future {}

/// With `send_futures` disabled, every future qualifies automatically.
#[cfg(not(feature = "send_futures"))]
impl<T: Future> OptionallySendFuture for T {}
45
/// One candidate location, as returned by
/// [`FileAndPathHelper::get_candidate_paths_for_debug_file`] and
/// [`FileAndPathHelper::get_candidate_paths_for_binary`].
#[derive(Debug)]
pub enum CandidatePathInfo<FL: FileLocation> {
    /// The candidate is a single file at the given location.
    SingleFile(FL),
    /// The candidate is a dylib inside a dyld shared cache.
    // NOTE(review): inferred from the variant and field names - confirm against callers.
    InDyldCache {
        /// The location of the dyld shared cache file.
        dyld_cache_path: FL,
        /// The path of the dylib.
        // NOTE(review): presumably the path under which the dylib is listed
        // inside the cache - confirm.
        dylib_path: String,
    },
}
54
/// An address that can be looked up in a `SymbolMap`.
///
/// You'll usually want to use `LookupAddress::Relative`, i.e. addresses that
/// are relative to some "image base address". This form works with all types
/// of symbol maps across all platforms.
///
/// When testing, be aware that many binaries are laid out in such a way that
/// all three representations of addresses are the same: The image base address
/// is often zero and the sections are often laid out so that each section's
/// address matches its file offset. So if you represent an address in
/// the wrong form, you might not notice it because it still works until you
/// encounter a more complex binary.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum LookupAddress {
    /// A relative address is relative to the image base address.
    ///
    /// Note that the payload is 32 bits wide, unlike the 64-bit payloads of
    /// [`LookupAddress::Svma`] and [`LookupAddress::FileOffset`].
    ///
    /// What "relative" means depends on the format of the binary:
    ///
    /// - On Windows, a "relative address" is the same as a RVA ("relative virtual
    ///   address") in the PE file.
    /// - On macOS, a "relative address" is relative to the start of the `__TEXT`
    ///   segment.
    /// - On Linux / ELF, a "relative address" is relative to the address of the
    ///   first LOAD command in the program header table. In other words, it's
    ///   relative to the start of the first segment.
    /// - For Jitdump files, the "relative address" space is a conceptual space
    ///   in which the code from all `JIT_CODE_LOAD` records is laid out
    ///   sequentially, starting at 0.
    ///   So the relative address of an instruction inside a `JIT_CODE_LOAD` record
    ///   is the sum of the `code_size` fields of all previous `JIT_CODE_LOAD`
    ///   records plus the offset of the instruction within the code of this
    ///   `JIT_CODE_LOAD` record.
    ///
    /// See [`relative_address_base`] for more information.
    Relative(u32),
    /// A "stated virtual memory address", i.e. a virtual memory address as
    /// written down in the binary. In mach-O and ELF, this is the space that
    /// section addresses and symbol addresses are in. It's the type of address
    /// you'd pass to the Linux `addr2line` tool.
    ///
    /// This type of lookup address is not supported by symbol maps for PDB
    /// files or Breakpad files.
    Svma(u64),
    /// A raw file offset to the point in the binary file where the bytes of the
    /// instruction are stored for which symbols should be looked up.
    ///
    /// On Linux, if you have an "AVMA" (absolute virtual memory address) and
    /// the `/proc/<pid>/maps` for the process, this is probably the easiest
    /// form of address to compute, because the process maps give you the file offsets.
    ///
    /// However, if you do this, be aware that the file offset often is not
    /// the same as an SVMA, so expect wrong results if you end up using it in
    /// places where SVMAs are expected - it might work fine with some binaries
    /// and then break with others.
    ///
    /// File offsets are not supported by symbol maps for PDB files or Breakpad files.
    FileOffset(u64),
}
113
/// In case the loaded binary contains multiple architectures, this specifies
/// how to resolve the ambiguity, i.e. which architecture to pick.
/// This is only needed on macOS.
#[derive(Debug, Clone)]
pub enum MultiArchDisambiguator {
    /// Disambiguate by CPU architecture (exact match).
    ///
    /// This string is a name for what mach-O calls the "CPU type" and "CPU subtype".
    /// Examples are `x86_64`, `x86_64h`, `arm64`, `arm64e`.
    ///
    /// These strings are returned by the mach function `macho_arch_name_for_cpu_type`.
    Arch(String),

    /// Disambiguate by CPU architecture (best match).
    ///
    /// The Vec contains the first choice, followed by acceptable fallback choices.
    /// Examples are `["arm64e", "arm64"]` or `["x86_64h", "x86_64"]`.
    /// This is used in cases where you have lost information about the architecture
    /// you're interested in and just want to hope to get the right one.
    ///
    /// The strings are names for what mach-O calls the "CPU type" and "CPU subtype".
    /// Examples are `x86_64`, `x86_64h`, `arm64`, `arm64e`.
    ///
    /// These strings are returned by the mach function `macho_arch_name_for_cpu_type`.
    BestMatch(Vec<String>),

    /// Disambiguate by CPU architecture and find the best match for the architecture
    /// that is currently executing this code. This is a heuristic, and should only
    /// be used in cases where you have lost information about the architecture you're
    /// interested in.
    BestMatchForNative,

    /// Disambiguate by debug ID (a [`debugid::DebugId`]).
    DebugId(DebugId),
}
148
/// An enum carrying an identifier for a binary. This stores the same information
/// as a [`debugid::CodeId`], but without projecting it down to a string.
///
/// All types need to be treated rather differently, see their respective documentation.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum CodeId {
    /// The code ID for a Windows PE file. When combined with the binary name,
    /// the code ID lets you obtain binaries from symbol servers. It is not useful
    /// on its own, it has to be paired with the binary name.
    ///
    /// On Windows, a binary's code ID is distinct from its debug ID (= pdb GUID + age).
    /// If you have a binary file, you can get both the code ID and the debug ID
    /// from it. If you only have a PDB file, you usually *cannot* get the code ID of
    /// the corresponding binary from it.
    PeCodeId(PeCodeId),

    /// The code ID for a macOS / iOS binary (mach-O). This is just the mach-O UUID.
    /// The mach-O UUID is shared between both the binary file and the debug file (dSYM),
    /// and it can be used on its own to find dSYMs using Spotlight.
    ///
    /// The debug ID and the code ID contain the same information; the debug ID
    /// is literally just the UUID plus a zero at the end.
    MachoUuid(Uuid),

    /// The code ID for a Linux ELF file. This is the "ELF build ID" (also called "GNU build ID").
    /// The build ID is usually 20 bytes, commonly written out as 40 hex chars.
    ///
    /// It can be used to find debug files on the local file system or to download
    /// binaries or debug files from a `debuginfod` symbol server. It does not have to be
    /// paired with the binary name.
    ///
    /// An ELF binary's code ID is more useful than its debug ID: The debug ID is truncated
    /// to 16 bytes (32 hex characters), whereas the code ID is the full ELF build ID.
    ElfBuildId(ElfBuildId),
}
184
185impl FromStr for CodeId {
186    type Err = ();
187
188    fn from_str(s: &str) -> Result<Self, Self::Err> {
189        if s.len() <= 17 {
190            // 8 bytes timestamp + 1 to 8 bytes of image size
191            Ok(CodeId::PeCodeId(PeCodeId::from_str(s)?))
192        } else if s.len() == 32 && is_uppercase_hex(s) {
193            // mach-O UUID
194            Ok(CodeId::MachoUuid(Uuid::from_str(s).map_err(|_| ())?))
195        } else {
196            // ELF build ID. These are usually 40 hex characters (= 20 bytes).
197            Ok(CodeId::ElfBuildId(ElfBuildId::from_str(s)?))
198        }
199    }
200}
201
/// Returns `true` if `s` consists only of the characters `0-9` and `A-F`.
///
/// The empty string contains no offending character and therefore yields `true`.
fn is_uppercase_hex(s: &str) -> bool {
    // Every byte of a non-ASCII character is >= 0x80 and thus never matches,
    // so checking bytes gives the same answer as checking chars.
    s.bytes().all(|b| matches!(b, b'0'..=b'9' | b'A'..=b'F'))
}
206
207impl std::fmt::Display for CodeId {
208    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
209        match self {
210            CodeId::PeCodeId(pe) => std::fmt::Display::fmt(pe, f),
211            CodeId::MachoUuid(uuid) => f.write_fmt(format_args!("{:X}", uuid.simple())),
212            CodeId::ElfBuildId(elf) => std::fmt::Display::fmt(elf, f),
213        }
214    }
215}
216
/// The code ID for a Windows PE file.
///
/// When combined with the binary name, the `PeCodeId` lets you obtain binaries from
/// symbol servers. It is not useful on its own, it has to be paired with the binary name.
///
/// A Windows binary's `PeCodeId` is distinct from its debug ID (= pdb GUID + age).
/// If you have a binary file, you can get both the `PeCodeId` and the debug ID
/// from it. If you only have a PDB file, you usually *cannot* get the `PeCodeId` of
/// the corresponding binary from it.
///
/// # String form
///
/// The string form is the timestamp as exactly 8 hex digits, followed by the image
/// size as 1 to 8 hex digits (no padding). `Display` writes the timestamp in
/// uppercase and the image size in lowercase; `FromStr` accepts either case.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct PeCodeId {
    /// The timestamp, i.e. the first 8 hex digits of the string form.
    pub timestamp: u32,
    /// The image size, i.e. the remaining hex digits of the string form.
    pub image_size: u32,
}

impl FromStr for PeCodeId {
    type Err = ();

    /// Parses a string of 9 to 16 hex digits: 8 digits of timestamp followed by
    /// 1 to 8 digits of image size.
    ///
    /// # Errors
    ///
    /// Returns `Err(())` if the length is outside `9..=16` or if the string
    /// contains anything other than ASCII hex digits. This never panics, even
    /// for non-ASCII input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if !(9..=16).contains(&s.len()) {
            return Err(());
        }
        // Validate the characters before slicing: slicing at byte offset 8 would
        // panic if a multi-byte character straddled that offset, and
        // `from_str_radix` would otherwise tolerate a leading `+` sign.
        if !s.bytes().all(|b| b.is_ascii_hexdigit()) {
            return Err(());
        }
        let (timestamp_digits, image_size_digits) = s.split_at(8);
        let timestamp = u32::from_str_radix(timestamp_digits, 16).map_err(|_| ())?;
        let image_size = u32::from_str_radix(image_size_digits, 16).map_err(|_| ())?;
        Ok(Self {
            timestamp,
            image_size,
        })
    }
}

impl std::fmt::Display for PeCodeId {
    /// Writes the timestamp as 8 uppercase hex digits, followed by the image size
    /// as unpadded lowercase hex digits.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("{:08X}{:x}", self.timestamp, self.image_size))
    }
}
253
/// The build ID for an ELF file (also called "GNU build ID").
///
/// The build ID can be used to find debug files on the local file system or to download
/// binaries or debug files from a `debuginfod` symbol server. It does not have to be
/// paired with the binary name.
///
/// The string form is the lowercase hex encoding of the bytes, two characters per byte.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ElfBuildId(pub Vec<u8>);

impl ElfBuildId {
    /// Create a new `ElfBuildId` from a slice of bytes (commonly a sha1 hash
    /// generated by the linker, i.e. 20 bytes).
    pub fn from_bytes(bytes: &[u8]) -> Self {
        Self(bytes.to_owned())
    }
}

impl FromStr for ElfBuildId {
    type Err = ();

    /// Parses a hex string into the build ID bytes. Both uppercase and lowercase
    /// hex digits are accepted.
    ///
    /// If the string has an odd number of bytes, the trailing unpaired byte is
    /// ignored (this matches the previous behavior of this parser).
    ///
    /// # Errors
    ///
    /// Returns `Err(())` if any two-character pair is not made of two ASCII hex
    /// digits. This never panics, even for non-ASCII input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        /// Decodes a single ASCII hex digit into its value (0..=15).
        fn nibble(b: u8) -> Result<u8, ()> {
            match b {
                b'0'..=b'9' => Ok(b - b'0'),
                b'a'..=b'f' => Ok(b - b'a' + 10),
                b'A'..=b'F' => Ok(b - b'A' + 10),
                _ => Err(()),
            }
        }

        // Work on bytes rather than on `str` slices: slicing the string at
        // arbitrary even offsets panics when a multi-byte character straddles
        // the offset, and `u8::from_str_radix` would accept pairs like "+f".
        s.as_bytes()
            .chunks_exact(2)
            .map(|pair| -> Result<u8, ()> { Ok((nibble(pair[0])? << 4) | nibble(pair[1])?) })
            .collect::<Result<Vec<u8>, ()>>()
            .map(Self)
    }
}

impl std::fmt::Display for ElfBuildId {
    /// Writes every byte as two lowercase hex digits.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for byte in &self.0 {
            write!(f, "{byte:02x}")?;
        }
        Ok(())
    }
}
293
/// Information about a library ("binary" / "module" / "DSO") which allows finding
/// symbol files for it. The information can be partial, which is why every field
/// is an `Option`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct LibraryInfo {
    /// The debug name, if known.
    // NOTE(review): per the docs on `FileAndPathHelper::get_candidate_paths_for_debug_file`,
    // a "debug name" is the PDB file name on Windows and the binary's file name
    // elsewhere - confirm that this field follows the same convention.
    pub debug_name: Option<String>,
    /// The [`DebugId`], if known.
    pub debug_id: Option<DebugId>,
    /// The debug path, if known.
    // NOTE(review): presumably the path of the debug file - confirm.
    pub debug_path: Option<String>,
    /// The name, if known.
    // NOTE(review): presumably the file name of the binary - confirm.
    pub name: Option<String>,
    /// The [`CodeId`], if known.
    pub code_id: Option<CodeId>,
    /// The path, if known.
    // NOTE(review): presumably the path of the binary - confirm.
    pub path: Option<String>,
    /// The architecture, if known.
    // NOTE(review): presumably uses the same names as `MultiArchDisambiguator::Arch`
    // (e.g. `arm64`, `x86_64`) - confirm.
    pub arch: Option<String>,
}
306
307impl LibraryInfo {
308    /// Fill all `None` fields on this object with the corresponding fields from `other`.
309    ///
310    /// This should only be called if some minimal matching has been established, for
311    /// example if the `code_id` matches or if the combination pair `debug_name, debug_id`
312    /// matches.
313    pub fn absorb(&mut self, other: &LibraryInfo) {
314        if self.debug_name.is_none() && other.debug_name.is_some() {
315            self.debug_name.clone_from(&other.debug_name);
316        }
317        if self.debug_id.is_none() && other.debug_id.is_some() {
318            self.debug_id = other.debug_id;
319        }
320        if self.debug_path.is_none() && other.debug_path.is_some() {
321            self.debug_path.clone_from(&other.debug_path);
322        }
323        if self.name.is_none() && other.name.is_some() {
324            self.name.clone_from(&other.name);
325        }
326        if self.code_id.is_none() && other.code_id.is_some() {
327            self.code_id.clone_from(&other.code_id);
328        }
329        if self.path.is_none() && other.path.is_some() {
330            self.path.clone_from(&other.path);
331        }
332        if self.arch.is_none() && other.arch.is_some() {
333            self.arch.clone_from(&other.arch);
334        }
335    }
336}
337
/// This is the trait that consumers need to implement so that they can call
/// the main entry points of this crate. This crate contains no direct file
/// access - all access to the file system is via this trait, and its associated
/// trait `FileContents`.
pub trait FileAndPathHelper {
    /// The file contents type produced by [`FileAndPathHelper::load_file`].
    type F: FileContents + 'static;
    /// The location type which identifies a file for [`FileAndPathHelper::load_file`].
    type FL: FileLocation + 'static;

    /// Given a "debug name" and a "breakpad ID", return a list of file paths
    /// which may potentially have artifacts containing symbol data for the
    /// requested binary (executable or library).
    ///
    /// The symbolication methods will try these paths one by one, calling
    /// `load_file` for each until it succeeds and finds a file whose contents
    /// match the breakpad ID. Any remaining paths are discarded.
    ///
    /// # Arguments
    ///
    /// NOTE(review): the method now takes a single `info: &LibraryInfo`; the two
    /// items below describe values which are presumably carried by
    /// `info.debug_name` and `info.debug_id` - confirm and update.
    ///
    ///  - `debug_name`: On Windows, this is the filename of the associated PDB
    ///    file of the executable / DLL, for example "firefox.pdb" or "xul.pdb". On
    ///    non-Windows, this is the filename of the binary, for example "firefox"
    ///    or "XUL" or "libxul.so".
    ///  - `breakpad_id`: A string of 33 hex digits, serving as a hash of the
    ///    contents of the binary / library. On Windows, this is 32 digits "signature"
    ///    plus one digit of "pdbAge". On non-Windows, this is the binary's UUID
    ///    (ELF id or mach-o UUID) plus a "0" digit at the end (replacing the pdbAge).
    fn get_candidate_paths_for_debug_file(
        &self,
        info: &LibraryInfo,
    ) -> FileAndPathHelperResult<Vec<CandidatePathInfo<Self::FL>>>;

    /// Return a list of candidate locations for the library described by `info`.
    ///
    /// NOTE(review): presumably the counterpart of
    /// `get_candidate_paths_for_debug_file` for the binary itself rather than
    /// its debug file - confirm against the callers.
    fn get_candidate_paths_for_binary(
        &self,
        info: &LibraryInfo,
    ) -> FileAndPathHelperResult<Vec<CandidatePathInfo<Self::FL>>>;

    /// Return a list of locations of dyld shared cache files, given an optional
    /// architecture name.
    ///
    /// NOTE(review): how `arch == None` should be interpreted is not visible
    /// here - confirm against the callers.
    fn get_dyld_shared_cache_paths(
        &self,
        arch: Option<&str>,
    ) -> FileAndPathHelperResult<Vec<Self::FL>>;

    /// Return a list of candidate locations for the file named by a GNU debug
    /// link, given the location of the file which contains the link.
    ///
    /// The default implementation returns an empty list.
    fn get_candidate_paths_for_gnu_debug_link_dest(
        &self,
        _original_file_location: &Self::FL,
        _debug_link_name: &str,
    ) -> FileAndPathHelperResult<Vec<Self::FL>> {
        Ok(Vec::new())
    }

    /// Return a list of candidate locations for a supplementary debug file,
    /// given the location of the original file as well as the path and the ELF
    /// build ID of the supplementary file.
    ///
    /// The default implementation returns an empty list.
    fn get_candidate_paths_for_supplementary_debug_file(
        &self,
        _original_file_path: &Self::FL,
        _supplementary_file_path: &str,
        _supplementary_file_build_id: &ElfBuildId,
    ) -> FileAndPathHelperResult<Vec<Self::FL>> {
        Ok(Vec::new())
    }

    /// This method is the entry point for file access during symbolication.
    /// The implementer needs to return an object which implements the `FileContents` trait.
    /// This method is asynchronous, but once it returns, the file data needs to be
    /// available synchronously because the `FileContents` methods are synchronous.
    /// If there is no file at the requested path, an error should be returned (or in any
    /// other error case).
    ///
    /// Whether the returned future has to be `Send` depends on the `send_futures`
    /// cargo feature, see [`OptionallySendFuture`].
    fn load_file(
        &self,
        location: Self::FL,
    ) -> std::pin::Pin<Box<dyn OptionallySendFuture<Output = FileAndPathHelperResult<Self::F>> + '_>>;

    /// Ask the helper to return a SymbolMap if it happens to have one available already.
    ///
    /// The default implementation returns `None`.
    fn get_symbol_map_for_library(
        &self,
        _info: &LibraryInfo,
    ) -> Option<(Self::FL, Arc<dyn SymbolMapTrait + Send + Sync>)> {
        None
    }
}
420
/// Provides synchronous access to the raw bytes of a file.
/// This trait needs to be implemented by the consumer of this crate.
pub trait FileContents: Send + Sync {
    /// Must return the length, in bytes, of this file.
    fn len(&self) -> u64;

    /// Whether the file is empty, i.e. whether `len()` is zero.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Must return a slice of the file contents, or an error.
    /// The slice's lifetime must be valid for the entire lifetime of this
    /// `FileContents` object. This restriction may be a bit cumbersome to satisfy;
    /// it's a restriction that's inherited from the `object` crate's `ReadRef` trait.
    fn read_bytes_at(&self, offset: u64, size: u64) -> FileAndPathHelperResult<&[u8]>;

    /// Must return a slice of the file contents which starts at `range.start` and
    /// is terminated by `delimiter`, or an error.
    ///
    /// NOTE(review): presumably mirrors `object::ReadRef::read_bytes_at_until`,
    /// i.e. the delimiter itself is not part of the returned slice and it is an
    /// error if no delimiter is found within `range` - confirm.
    fn read_bytes_at_until(
        &self,
        range: Range<u64>,
        delimiter: u8,
    ) -> FileAndPathHelperResult<&[u8]>;

    /// Append `size` bytes to `buffer`, starting to read at `offset` in the file.
    /// If successful, `buffer` must have had its len increased exactly by `size`,
    /// otherwise the caller may panic.
    fn read_bytes_into(
        &self,
        buffer: &mut Vec<u8>,
        offset: u64,
        size: usize,
    ) -> FileAndPathHelperResult<()>;
}
455
/// The debug information (function name, file path, line number) for a single frame
/// at the looked-up address.
///
/// A lookup yields a `Vec` of these (see [`AddressInfo::frames`] and
/// [`FramesLookupResult::Available`]), ordered from the innermost inlinee to the
/// outer function. Every field is optional because each piece of information
/// may be unknown.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FrameDebugInfo {
    /// The function name for this frame, if known.
    pub function: Option<String>,
    /// The [`SourceFilePath`] for this frame, if known.
    pub file_path: Option<SourceFilePath>,
    /// The line number for this frame, if known.
    pub line_number: Option<u32>,
}
467
/// A trait which abstracts away the token that's passed to the [`FileAndPathHelper::load_file`]
/// trait method.
///
/// This is usually something like a `PathBuf`, but it can also be more complicated. For example,
/// in `wholesym` this is an enum which can refer to a local file or to a file from a symbol
/// server.
///
/// All methods derive a new location from `self` and return `None` if no such
/// location can be created.
pub trait FileLocation: Clone + Display {
    /// Called on a Dyld shared cache location to create a location for a subcache.
    /// Subcaches are separate files with filenames such as `dyld_shared_cache_arm64e.01`.
    ///
    /// The suffix begins with a period.
    fn location_for_dyld_subcache(&self, suffix: &str) -> Option<Self>;

    /// Called on the location of a debug file in order to create a location for an
    /// external object file, based on an absolute path found in the "object map" of
    /// the original file.
    fn location_for_external_object_file(&self, object_file: &str) -> Option<Self>;

    /// Called on the location of a PE binary in order to create a location for
    /// a corresponding PDB file, based on an absolute PDB path found in the binary.
    fn location_for_pdb_from_binary(&self, pdb_path_in_binary: &str) -> Option<Self>;

    /// Called on the location of a debug file in order to create a location for
    /// a source file. `source_file_path` is the path to the source file as written
    /// down in the debug file. This is usually an absolute path.
    ///
    /// Only one case with a relative path has been observed to date: In this case the
    /// "debug file" was a synthetic .so file which was generated by `perf inject --jit`
    /// based on a JITDUMP file which included relative paths. You could argue
    /// that the application which emitted relative paths into the JITDUMP file was
    /// creating bad data and should have written out absolute paths. However, the `perf`
    /// infrastructure worked fine on this file, because the relative paths happened to
    /// be relative to the working directory, and because perf / objdump were resolving
    /// those relative paths relative to the current working directory.
    fn location_for_source_file(&self, source_file_path: &str) -> Option<Self>;

    /// Called on the location of a Breakpad sym file, to get a location for its
    /// corresponding symindex file.
    fn location_for_breakpad_symindex(&self) -> Option<Self>;

    /// Creates a location for a "dwo" file, given a `comp_dir` and a `path`.
    ///
    /// NOTE(review): presumably called on the location of a file with DWARF debug
    /// info in order to find a split-DWARF `.dwo` file, with `path` being resolved
    /// relative to the compilation directory `comp_dir` - confirm.
    fn location_for_dwo(&self, comp_dir: &str, path: &str) -> Option<Self>;

    /// Creates a location for a "dwp" file which belongs to this location.
    ///
    /// NOTE(review): presumably the split-DWARF package (`.dwp`) file which sits
    /// next to the binary - confirm.
    fn location_for_dwp(&self) -> Option<Self>;
}
512
/// The path of a source file, as found in the debug info.
///
/// This contains both the raw path and an optional "mapped path". The raw path can
/// refer to a file on this machine or on a different machine (i.e. the original
/// build machine). The mapped path is something like a permalink which potentially
/// allows obtaining the source file from a source server or a public hosted repository.
///
/// Both fields are private; use [`SourceFilePath::raw_path`] and
/// [`SourceFilePath::mapped_path`] to read them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SourceFilePath {
    /// The raw path to the source file, as written down in the debug file. This is
    /// usually an absolute path.
    raw_path: String,

    /// A variant of the path which may allow obtaining the source code for this file
    /// from the web. `None` if there is no such variant.
    mapped_path: Option<MappedPath>,
}
529
530impl SourceFilePath {
531    /// Create a new `SourceFilePath`.
532    pub fn new(raw_path: String, mapped_path: Option<MappedPath>) -> Self {
533        Self {
534            raw_path,
535            mapped_path,
536        }
537    }
538
539    /// Create a `SourceFilePath` from a path in a Breakpad .sym file. Such files can
540    /// contain the "special path" serialization of a mapped path, but they can
541    /// also contain absolute paths.
542    pub fn from_breakpad_path(raw_path: String) -> Self {
543        let mapped_path = MappedPath::from_special_path_str(&raw_path);
544        Self {
545            raw_path,
546            mapped_path,
547        }
548    }
549
550    /// A short, display-friendly version of this path.
551    pub fn display_path(&self) -> String {
552        match self.mapped_path() {
553            Some(mapped_path) => mapped_path.display_path(),
554            None => self.raw_path.clone(),
555        }
556    }
557
558    /// The raw path to the source file, as written down in the debug file. This is
559    /// usually an absolute path.
560    ///
561    /// Examples:
562    ///
563    ///  - `"/Users/mstange/code/samply/samply-symbols/src/shared.rs"`
564    ///  - `"/Users/mstange/code/mozilla/widget/cocoa/nsNativeThemeCocoa.mm"`
565    ///  - `"./csu/../csu/libc-start.c"`
566    ///  - `"/rustc/69f9c33d71c871fc16ac445211281c6e7a340943/library/core/src/ptr/const_ptr.rs"`
567    ///  - `r#"D:\agent\_work\2\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl"#`
568    ///
569    /// If the debug file was produced by compiling code on this machine, then the path
570    /// usually refers to a file on this machine. (An exception to this is debug info
571    /// from the Rust stdlib, which has fake `/rustc/<rev>/...` paths even if the when
572    /// compiling Rust code locally.)
573    ///
574    /// If the code was compiled on a different machine, then the raw path does not refer
575    /// to a file on this machine.
576    ///
577    /// Sometimes this path is a relative path. One such case was observed when the
578    /// "debug file" was a synthetic .so file which was generated by `perf inject --jit`
579    /// based on a JITDUMP file which included relative paths. You could argue
580    /// that the application which emitted relative paths into the JITDUMP file was
581    /// creating bad data and should have written out absolute paths. However, the `perf`
582    /// infrastructure worked fine on this file, because the relative paths happened to
583    /// be relative to the working directory, and because perf / objdump were resolving
584    /// those relative paths relative to the current working directory.
585    pub fn raw_path(&self) -> &str {
586        &self.raw_path
587    }
588
589    /// Returns the raw path while consuming this `SourceFilePath`.
590    pub fn into_raw_path(self) -> String {
591        self.raw_path
592    }
593
594    /// A variant of the path which may allow obtaining the source code for this file
595    /// from the web.
596    ///
597    /// Examples:
598    ///
599    ///   - If the source file is from a Rust dependency from crates.io, we detect the
600    ///     cargo cache directory in the raw path and create a mapped path of the form [`MappedPath::Cargo`].
601    ///   - If the source file can be obtained from a github URL, and we know this either
602    ///     from the `srcsrv` stream of a PDB file or because we recognize a path of the
603    ///     form `/rustc/<rust-revision>/`, then we create a mapped path of the form [`MappedPath::Git`].
604    pub fn mapped_path(&self) -> Option<&MappedPath> {
605        self.mapped_path.as_ref()
606    }
607
608    /// Returns the mapped path while consuming this `SourceFilePath`.
609    pub fn into_mapped_path(self) -> Option<MappedPath> {
610        self.mapped_path
611    }
612}
613
614/// The "relative address base" is the base address which [`LookupAddress::Relative`]
615/// addresses are relative to. You start with an SVMA (a stated virtual memory address),
616/// you subtract the relative address base, and out comes a relative address.
617///
618/// This function computes that base address. It is defined as follows:
619///
620///  - For Windows binaries, the base address is the "image base address".
621///  - For mach-O binaries, the base address is the vmaddr of the __TEXT segment.
622///  - For ELF binaries, the base address is the vmaddr of the *first* segment,
623///    i.e. the vmaddr of the first "LOAD" ELF command.
624///
625/// In many cases, this base address is simply zero:
626///
627///  - ELF images of dynamic libraries (i.e. not executables) usually have a
628///    base address of zero.
629///  - Stand-alone mach-O dylibs usually have a base address of zero because their
630///    __TEXT segment is at address zero.
631///  - In PDBs, "RVAs" are relative addresses which are already relative to the
632///    image base.
633///
634/// However, in the following cases, the base address is usually non-zero:
635///
636///  - The "image base address" of Windows binaries is usually non-zero.
637///  - mach-O executable files (not dylibs) usually have their __TEXT segment at
638///    address 0x100000000.
639///  - mach-O libraries in the dyld shared cache have a __TEXT segment at some
640///    non-zero address in the cache.
641///  - ELF executables can have non-zero base addresses, e.g. 0x200000 or 0x400000.
642///  - Kernel ELF binaries ("vmlinux") have a large base address such as
643///    0xffffffff81000000. Moreover, the base address seems to coincide with the
644///    vmaddr of the .text section, which is readily-available in perf.data files
645///    (in a synthetic mapping called "[kernel.kallsyms]_text").
646pub fn relative_address_base<'data>(object_file: &impl object::Object<'data>) -> u64 {
647    use object::read::ObjectSegment;
648    if let Some(text_segment) = object_file
649        .segments()
650        .find(|s| s.name() == Ok(Some("__TEXT")))
651    {
652        // This is a mach-O image. "Relative addresses" are relative to the
653        // vmaddr of the __TEXT segment.
654        return text_segment.address();
655    }
656
657    if let FileFlags::Elf { .. } = object_file.flags() {
658        // This is an ELF image. "Relative addresses" are relative to the
659        // vmaddr of the first segment (the first LOAD command).
660        if let Some(first_segment) = object_file.segments().next() {
661            return first_segment.address();
662        }
663    }
664
665    // For PE binaries, relative_address_base() returns the image base address.
666    object_file.relative_address_base()
667}
668
/// The symbol for a function.
///
/// This is the `symbol` part of the lookup results [`AddressInfo`] and
/// [`SyncAddressInfo`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SymbolInfo {
    /// The function's address. This is a relative address
    /// (see [`LookupAddress::Relative`]).
    pub address: u32,
    /// The function size, in bytes. May have been approximated from neighboring symbols.
    /// `None` if no size is available.
    pub size: Option<u32>,
    /// The function name, demangled.
    pub name: String,
}
679
/// The lookup result for an address.
///
/// In contrast to [`SyncAddressInfo`], the `frames` of this type directly hold
/// the debug info rather than a [`FramesLookupResult`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AddressInfo {
    /// Information about the symbol which contains the looked up address.
    pub symbol: SymbolInfo,
    /// Information about the frames at the looked up address, if found in the debug info.
    /// `None` if no such information was found.
    ///
    /// This Vec contains the file name and line number of the address.
    /// If the compiler inlined a function call at this address, then this Vec
    /// also contains the function name of the inlined function, along with the
    /// file and line information inside that function.
    ///
    /// The Vec begins with the callee-most ("innermost") inlinee, followed by
    /// its caller, and so on. The last element is always the outer function.
    pub frames: Option<Vec<FrameDebugInfo>>,
}
696
/// The lookup result from `lookup_sync`.
///
/// In contrast to [`AddressInfo`], the `frames` of this type are a
/// [`FramesLookupResult`], which either holds the debug info or a reference to
/// an external file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SyncAddressInfo {
    /// Information about the symbol which contains the looked up address.
    pub symbol: SymbolInfo,
    /// Information about the frames at the looked up address, from the debug info.
    /// See [`FramesLookupResult`] for the two possible forms.
    pub frames: Option<FramesLookupResult>,
}
705
/// Contains address debug info (inlined functions, file names, line numbers) if
/// available, or otherwise a reference to the external file in which it can
/// potentially be found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FramesLookupResult {
    /// Debug info for this address was found in the symbol map.
    ///
    /// This Vec contains the file name and line number of the address.
    /// If the compiler inlined a function call at this address, then this Vec
    /// also contains the function name of the inlined function, along with the
    /// file and line information inside that function.
    ///
    /// The Vec begins with the callee-most ("innermost") inlinee, followed by
    /// its caller, and so on. The last element is always the outer function.
    Available(Vec<FrameDebugInfo>),

    /// Debug info for this address was not found in the symbol map, but can
    /// potentially be found in a different file, with the help of
    /// [`SymbolMap::lookup_external`](crate::SymbolMap::lookup_external).
    ///
    /// The payload carries what is needed to find that external file and the
    /// address within it, see [`ExternalFileAddressRef`].
    ///
    /// This case can currently only be hit on macOS: On macOS, linking multiple
    /// `.o` files together into a library or an executable does not copy the
    /// DWARF information into the linked output. Instead, the linker stores the
    /// paths to those original `.o` files, using 'OSO' stabs entries, and debug
    /// info must be obtained from those original files.
    External(ExternalFileAddressRef),
}
732
/// Information to find an external file and an address within that file, to be
/// passed to [`SymbolMap::lookup_external`](crate::SymbolMap::lookup_external) or
/// [`ExternalFileSymbolMap::lookup`](crate::ExternalFileSymbolMap::lookup).
///
/// This is the payload of `FramesLookupResult::External`.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct ExternalFileAddressRef {
    /// Information needed to find the external file.
    pub file_ref: ExternalFileRef,
    /// Information needed to find the address within that external file.
    pub address_in_file: ExternalFileAddressInFileRef,
}
743
/// Information to find an external file with debug information.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum ExternalFileRef {
    /// An external file referenced from a Mach-O binary's object map.
    MachoExternalObject {
        /// The path to the file, as specified in the linked binary's object map.
        file_path: String,
    },
    /// An external file referenced from an ELF binary.
    ///
    /// NOTE(review): judging by the name this is a split-DWARF `.dwo` file — confirm.
    ElfExternalDwo {
        /// NOTE(review): presumably the compilation directory against which `path`
        /// is resolved — confirm against the code which creates this variant.
        comp_dir: String,
        /// NOTE(review): presumably the path of the `.dwo` file — confirm.
        path: String,
    },
}
756
/// Information to find an address within an external file, for debug info lookup.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum ExternalFileAddressInFileRef {
    /// The address is identified by a function symbol plus an offset. No archive
    /// member name is needed to locate the object.
    MachoOsoObject {
        /// The name of the function symbol, as bytes, for the function which contains the
        /// address we want to look up.
        symbol_name: Vec<u8>,
        /// The address to look up, as a relative offset from the function symbol address.
        offset_from_symbol: u32,
    },
    /// The address is inside a member of an archive file, and is identified by
    /// the member name, a function symbol, and an offset from that symbol.
    MachoOsoArchive {
        /// The external file is an archive file (e.g. `libjs_static.a`, created with `ar`),
        /// and this is the name of the archive member (e.g. `Unified_cpp_js_src23.o`)
        /// which contains the function.
        name_in_archive: String,
        /// The name of the function symbol, as bytes, for the function which contains the
        /// address we want to look up.
        symbol_name: Vec<u8>,
        /// The address to look up, as a relative offset from the function symbol address.
        offset_from_symbol: u32,
    },
    /// The address is inside an ELF-related external file.
    ///
    /// NOTE(review): judging by the names this targets a split-DWARF `.dwo` unit — confirm.
    ElfDwo {
        /// NOTE(review): presumably the DWO ID which selects the unit — confirm.
        dwo_id: u64,
        /// NOTE(review): presumably the stated virtual memory address ("SVMA") to
        /// look up — confirm against the code which consumes this variant.
        svma: u64,
    },
}
783
784/// Implementation for slices.
785impl<T: Deref<Target = [u8]> + Send + Sync> FileContents for T {
786    fn len(&self) -> u64 {
787        <[u8]>::len(self) as u64
788    }
789
790    fn read_bytes_at(&self, offset: u64, size: u64) -> FileAndPathHelperResult<&[u8]> {
791        <[u8]>::get(self, offset as usize..)
792            .and_then(|s| s.get(..size as usize))
793            .ok_or_else(|| {
794                std::io::Error::new(
795                    std::io::ErrorKind::UnexpectedEof,
796                    "FileContents::read_bytes_at for &[u8] was called with out-of-range indexes",
797                )
798                .into()
799            })
800    }
801
802    fn read_bytes_at_until(
803        &self,
804        range: Range<u64>,
805        delimiter: u8,
806    ) -> FileAndPathHelperResult<&[u8]> {
807        if range.end < range.start {
808            return Err("Invalid range in read_bytes_at_until".into());
809        }
810        let slice = self.read_bytes_at(range.start, range.end - range.start)?;
811        if let Some(pos) = memchr::memchr(delimiter, slice) {
812            Ok(&slice[..pos])
813        } else {
814            Err(Box::new(std::io::Error::new(
815                std::io::ErrorKind::InvalidInput,
816                "Delimiter not found",
817            )))
818        }
819    }
820
821    #[inline]
822    fn read_bytes_into(
823        &self,
824        buffer: &mut Vec<u8>,
825        offset: u64,
826        size: usize,
827    ) -> FileAndPathHelperResult<()> {
828        buffer.extend_from_slice(self.read_bytes_at(offset, size as u64)?);
829        Ok(())
830    }
831}
832
/// The granularity, in bytes, at which `FileReadStats` tracks which parts of a
/// file have been read (32 KiB).
#[cfg(feature = "partial_read_stats")]
const CHUNK_SIZE: u64 = 32 * 1024;
835
/// Statistics about the reads performed on a file, tracked in chunks of
/// `CHUNK_SIZE` bytes. Only compiled with the `partial_read_stats` feature.
#[cfg(feature = "partial_read_stats")]
struct FileReadStats {
    /// The accumulated (chunk-granular) number of bytes counted by `record_read`.
    /// Reads which only touch chunks that were already read are not counted.
    bytes_read: u64,
    /// One bit per chunk of the file; a set bit means that the chunk has been
    /// touched by at least one recorded read.
    unique_chunks_read: BitVec,
    /// The number of recorded reads which touched at least one chunk that had
    /// not been read before.
    read_call_count: u64,
}
842
#[cfg(feature = "partial_read_stats")]
impl FileReadStats {
    /// Creates empty statistics for a file which is `size_in_bytes` bytes long.
    ///
    /// The file is tracked in chunks of `CHUNK_SIZE` bytes; a trailing partial
    /// chunk is tracked as a whole chunk. An empty file is tracked with zero
    /// chunks.
    pub fn new(size_in_bytes: u64) -> Self {
        // Ceiling division, written so that it cannot overflow. Empty files must
        // not panic here: `FileContentsWrapper::new` passes the file length
        // through unconditionally.
        let chunk_count = match size_in_bytes {
            0 => 0,
            n => (n - 1) / CHUNK_SIZE + 1,
        };
        FileReadStats {
            bytes_read: 0,
            unique_chunks_read: bitvec![0; chunk_count as usize],
            read_call_count: 0,
        }
    }

    /// Records a read of `size` bytes starting at `offset`.
    ///
    /// Reads are recorded before they are validated, so the requested range may
    /// extend past the end of the file (or even overflow). Such a range is
    /// clamped to the tracked chunks; a read which starts beyond the last chunk
    /// records nothing.
    pub fn record_read(&mut self, offset: u64, size: u64) {
        if size == 0 {
            return;
        }

        let chunk_count = self.unique_chunks_read.len() as u64;
        let start = offset;
        // `size > 0`, so `end >= 1` and `end - 1` below cannot underflow.
        let end = offset.saturating_add(size);
        let chunk_index_start = start / CHUNK_SIZE;
        let chunk_index_end = ((end - 1) / CHUNK_SIZE + 1).min(chunk_count);
        if chunk_index_start >= chunk_index_end {
            // The read starts past the end of the file; there is nothing to track.
            return;
        }

        let chunkbits =
            &mut self.unique_chunks_read[chunk_index_start as usize..chunk_index_end as usize];
        // Only reads which touch at least one not-yet-read chunk are counted.
        if chunkbits.count_ones() != (chunk_index_end - chunk_index_start) as usize {
            if chunkbits[0] {
                // The first chunk was already read: count from the actual start
                // offset to the end of the last touched chunk.
                self.bytes_read += chunk_index_end * CHUNK_SIZE - start;
            } else {
                // Otherwise count every touched chunk in full.
                self.bytes_read += (chunk_index_end - chunk_index_start) * CHUNK_SIZE;
            }
            self.read_call_count += 1;
        }
        chunkbits.set_all(true);
    }

    /// Returns the number of bytes covered by all chunks which have been touched
    /// at least once, i.e. the number of touched chunks times `CHUNK_SIZE`.
    pub fn unique_bytes_read(&self) -> u64 {
        self.unique_chunks_read.count_ones() as u64 * CHUNK_SIZE
    }
}
882
/// Formats a one-line, human-readable summary of the read statistics.
#[cfg(feature = "partial_read_stats")]
impl std::fmt::Display for FileReadStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let unique_bytes_read = self.unique_bytes_read();
        let repeated_bytes_read = self.bytes_read.saturating_sub(unique_bytes_read);
        // If nothing was read at all, `unique_bytes_read` is zero. This runs from
        // the wrapper's `Drop` impl, so it must not panic with a division by
        // zero; report 0% redundancy instead.
        let redundancy_percentage = repeated_bytes_read
            .saturating_mul(100)
            .checked_div(unique_bytes_read)
            .unwrap_or(0);
        write!(
            f,
            "{} total, {} unique, {}% redundancy, {} reads total",
            bytesize::ByteSize(self.bytes_read),
            bytesize::ByteSize(unique_bytes_read),
            redundancy_percentage,
            self.read_call_count
        )
    }
}
899
/// A wrapper for a FileContents object. The wrapper provides some convenience methods
/// and, most importantly, implements `ReadRef` for `&FileContentsWrapper`.
///
/// With the `partial_read_stats` feature enabled, the wrapper additionally
/// records statistics about the reads performed through it and prints them to
/// stderr when it is dropped.
pub struct FileContentsWrapper<T: FileContents> {
    /// The wrapped file contents; all reads are forwarded to this object.
    file_contents: T,
    /// The length of the file in bytes, queried once in `new`.
    len: u64,
    /// Read statistics, guarded by a mutex because reads go through `&self`.
    #[cfg(feature = "partial_read_stats")]
    partial_read_stats: std::sync::Mutex<FileReadStats>,
}
908
909impl<T: FileContents> FileContentsWrapper<T> {
910    pub fn new(file_contents: T) -> Self {
911        let len = file_contents.len();
912        Self {
913            file_contents,
914            len,
915            #[cfg(feature = "partial_read_stats")]
916            partial_read_stats: std::sync::Mutex::new(FileReadStats::new(len)),
917        }
918    }
919
920    #[inline]
921    pub fn len(&self) -> u64 {
922        self.len
923    }
924
925    #[inline]
926    pub fn is_empty(&self) -> bool {
927        self.len == 0
928    }
929
930    #[inline]
931    pub fn read_bytes_at(&self, offset: u64, size: u64) -> FileAndPathHelperResult<&[u8]> {
932        #[cfg(feature = "partial_read_stats")]
933        self.partial_read_stats
934            .lock()
935            .unwrap()
936            .record_read(offset, size);
937
938        self.file_contents.read_bytes_at(offset, size)
939    }
940
941    #[inline]
942    pub fn read_bytes_at_until(
943        &self,
944        range: Range<u64>,
945        delimiter: u8,
946    ) -> FileAndPathHelperResult<&[u8]> {
947        #[cfg(feature = "partial_read_stats")]
948        let start = range.start;
949
950        let bytes = self.file_contents.read_bytes_at_until(range, delimiter)?;
951
952        #[cfg(feature = "partial_read_stats")]
953        self.partial_read_stats
954            .lock()
955            .unwrap()
956            .record_read(start, (bytes.len() + 1) as u64);
957
958        Ok(bytes)
959    }
960
961    /// Append `size` bytes to `buffer`, starting to read at `offset` in the file.
962    /// If successful, `buffer` must have had its len increased exactly by `size`,
963    /// otherwise the caller may panic.
964    pub fn read_bytes_into(
965        &self,
966        buffer: &mut Vec<u8>,
967        offset: u64,
968        size: usize,
969    ) -> FileAndPathHelperResult<()> {
970        #[cfg(feature = "partial_read_stats")]
971        self.partial_read_stats
972            .lock()
973            .unwrap()
974            .record_read(offset, size as u64);
975
976        self.file_contents.read_bytes_into(buffer, offset, size)
977    }
978
979    pub fn read_entire_data(&self) -> FileAndPathHelperResult<&[u8]> {
980        self.read_bytes_at(0, self.len())
981    }
982
983    pub fn full_range(&self) -> RangeReadRef<'_, &Self> {
984        RangeReadRef::new(self, 0, self.len)
985    }
986
987    pub fn range(&self, start: u64, size: u64) -> RangeReadRef<'_, &Self> {
988        RangeReadRef::new(self, start, size)
989    }
990}
991
/// Prints the collected read statistics to stderr when the wrapper goes away.
#[cfg(feature = "partial_read_stats")]
impl<T: FileContents> Drop for FileContentsWrapper<T> {
    fn drop(&mut self) {
        // `Mutex::lock()` returns a `LockResult`, which does not implement
        // `Display`. We have exclusive access here, so use `get_mut` (no locking
        // needed), and recover the stats from a poisoned mutex rather than
        // panicking inside `drop`.
        let stats = self
            .partial_read_stats
            .get_mut()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        eprintln!("{}", stats);
    }
}
998
999impl<T: FileContents> Debug for FileContentsWrapper<T> {
1000    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1001        write!(f, "FileContentsWrapper({} bytes)", self.len())
1002    }
1003}
1004
1005impl<'data, T: FileContents> ReadRef<'data> for &'data FileContentsWrapper<T> {
1006    #[inline]
1007    fn len(self) -> Result<u64, ()> {
1008        Ok(self.len())
1009    }
1010
1011    #[inline]
1012    fn read_bytes_at(self, offset: u64, size: u64) -> Result<&'data [u8], ()> {
1013        self.read_bytes_at(offset, size).map_err(|_| {
1014            // Note: We're discarding the error from the FileContents method here.
1015        })
1016    }
1017
1018    #[inline]
1019    fn read_bytes_at_until(self, range: Range<u64>, delimiter: u8) -> Result<&'data [u8], ()> {
1020        self.read_bytes_at_until(range, delimiter).map_err(|_| {
1021            // Note: We're discarding the error from the FileContents method here.
1022        })
1023    }
1024}
1025
/// Compile-time check: a shared reference to a `FileContentsWrapper` must be
/// both `Send` and `Sync`. The test body does nothing at runtime; it only has
/// to type-check.
#[test]
fn test_filecontents_readref_is_send_and_sync() {
    fn require_send<S: Send>() {}
    fn require_sync<S: Sync>() {}
    // Never called: instantiating the helpers for a generic `C` is enough to make
    // the compiler prove both bounds.
    #[allow(unused)]
    fn check_wrapper_ref<C: FileContents + Sync>() {
        require_send::<&FileContentsWrapper<C>>();
        require_sync::<&FileContentsWrapper<C>>();
    }
}
1036
/// A `ReadRef` which exposes a window of another `ReadRef`.
///
/// Offsets passed to the `ReadRef` methods of this type are relative to
/// `range_start`; they are shifted by `range_start` before being forwarded to
/// the original `ReadRef`. `len()` reports `range_size`.
#[derive(Clone, Copy)]
pub struct RangeReadRef<'data, T: ReadRef<'data>> {
    /// The `ReadRef` which all reads are forwarded to.
    original_readref: T,
    /// The offset in `original_readref` at which this range starts.
    range_start: u64,
    /// The size of this range in bytes, as reported by `len()`.
    range_size: u64,
    /// Ties the otherwise unused `'data` lifetime to this type.
    _phantom_data: PhantomData<&'data ()>,
}
1044
1045impl<'data, T: ReadRef<'data>> RangeReadRef<'data, T> {
1046    pub fn new(original_readref: T, range_start: u64, range_size: u64) -> Self {
1047        Self {
1048            original_readref,
1049            range_start,
1050            range_size,
1051            _phantom_data: PhantomData,
1052        }
1053    }
1054
1055    pub fn make_subrange(&self, start: u64, size: u64) -> Self {
1056        Self::new(self.original_readref, self.range_start + start, size)
1057    }
1058
1059    pub fn original_readref(&self) -> T {
1060        self.original_readref
1061    }
1062
1063    pub fn range_start(&self) -> u64 {
1064        self.range_start
1065    }
1066
1067    pub fn range_size(&self) -> u64 {
1068        self.range_size
1069    }
1070}
1071
1072impl<'data, T: ReadRef<'data>> ReadRef<'data> for RangeReadRef<'data, T> {
1073    #[inline]
1074    fn len(self) -> Result<u64, ()> {
1075        Ok(self.range_size)
1076    }
1077
1078    #[inline]
1079    fn read_bytes_at(self, offset: u64, size: u64) -> Result<&'data [u8], ()> {
1080        let shifted_offset = self.range_start.checked_add(offset).ok_or(())?;
1081        self.original_readref.read_bytes_at(shifted_offset, size)
1082    }
1083
1084    #[inline]
1085    fn read_bytes_at_until(self, range: Range<u64>, delimiter: u8) -> Result<&'data [u8], ()> {
1086        if range.end < range.start {
1087            return Err(());
1088        }
1089        let shifted_start = self.range_start.checked_add(range.start).ok_or(())?;
1090        let shifted_end = self.range_start.checked_add(range.end).ok_or(())?;
1091        let range = shifted_start..shifted_end;
1092        self.original_readref.read_bytes_at_until(range, delimiter)
1093    }
1094}
1095
/// A cursor over the contents of a `FileContentsWrapper`, which implements
/// `std::io::Read` and `std::io::Seek`.
pub struct FileContentsCursor<'a, T: FileContents> {
    /// The current offset of the cursor. This can be beyond the end of the file!
    current_offset: u64,
    /// The total length of the file.
    total_len: u64,
    /// The wrapper which the bytes are read from.
    inner: &'a FileContentsWrapper<T>,
}
1103
1104impl<'a, T: FileContents> FileContentsCursor<'a, T> {
1105    pub fn new(inner: &'a FileContentsWrapper<T>) -> Self {
1106        let total_len = inner.len();
1107        Self {
1108            current_offset: 0,
1109            total_len,
1110            inner,
1111        }
1112    }
1113}
1114
1115impl<T: FileContents> std::io::Read for FileContentsCursor<'_, T> {
1116    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
1117        if self.current_offset >= self.total_len {
1118            return Ok(0);
1119        }
1120        let remaining_len = self.total_len - self.current_offset;
1121        let read_len = <[u8]>::len(buf).min(remaining_len as usize);
1122        // Make a silly copy
1123        let mut tmp_buf = Vec::with_capacity(read_len);
1124        self.inner
1125            .read_bytes_into(&mut tmp_buf, self.current_offset, read_len)
1126            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
1127        buf[..read_len].copy_from_slice(&tmp_buf);
1128        self.current_offset += read_len as u64;
1129        Ok(read_len)
1130    }
1131}
1132
1133impl<T: FileContents> std::io::Seek for FileContentsCursor<'_, T> {
1134    fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
1135        /// Returns None on overflow / underflow.
1136        ///
1137        /// Seeks beyond the file length are allowed.
1138        fn inner(cur: u64, total_len: u64, pos: std::io::SeekFrom) -> Option<u64> {
1139            let new_offset: u64 = match pos {
1140                std::io::SeekFrom::Start(pos) => pos,
1141                std::io::SeekFrom::End(pos) => {
1142                    (total_len as i64).checked_add(pos)?.try_into().ok()?
1143                }
1144                std::io::SeekFrom::Current(pos) => {
1145                    (cur as i64).checked_add(pos)?.try_into().ok()?
1146                }
1147            };
1148            Some(new_offset)
1149        }
1150
1151        match inner(self.current_offset, self.total_len, pos) {
1152            Some(cur) => {
1153                self.current_offset = cur;
1154                Ok(cur)
1155            }
1156            None => Err(std::io::Error::new(std::io::ErrorKind::Other, "Bad Seek")),
1157        }
1158    }
1159}
1160
#[cfg(test)]
mod test {
    use super::*;

    /// Seeking past the end of the file must succeed, and a read from such a
    /// position must report EOF (zero bytes) rather than fail.
    #[test]
    fn file_contents_cursor_allows_seeks_beyond_eof() {
        use std::io::{Read, Seek, SeekFrom};

        let contents: &[u8] = b"Test";
        let wrapper = FileContentsWrapper::new(contents);
        let mut cursor = FileContentsCursor::new(&wrapper);
        let mut scratch = [0; 10];

        // Consume three of the four bytes.
        let count = cursor.read(&mut scratch[..3]).unwrap();
        assert_eq!(count, 3);
        assert_eq!(&scratch[..3], b"Tes");

        // Offset 3 + 2 = 5, which is one byte past the end of the file.
        let position = cursor.seek(SeekFrom::Current(2)).unwrap();
        assert_eq!(position, 5);

        // Reading from beyond the end yields no bytes.
        let count = cursor.read(&mut scratch[..2]).unwrap();
        assert_eq!(count, 0);
    }
}