symbolic_symcache/
writer.rs

1//! Defines the [SymCache Converter](`SymCacheConverter`).
2
3use std::collections::btree_map;
4use std::collections::BTreeMap;
5use std::io::Write;
6
7use indexmap::IndexSet;
8use symbolic_common::{Arch, DebugId};
9use symbolic_debuginfo::{DebugSession, FileFormat, Function, ObjectLike, Symbol};
10use watto::{Pod, StringTable, Writer};
11
12use super::{raw, transform};
13use crate::raw::NO_SOURCE_LOCATION;
14use crate::{Error, ErrorKind};
15
16/// The SymCache Converter.
17///
18/// This can convert data in various source formats to an intermediate representation, which can
19/// then be serialized to disk via its [`serialize`](SymCacheConverter::serialize) method.
20#[derive(Debug, Default)]
21pub struct SymCacheConverter<'a> {
22    /// Debug identifier of the object file.
23    debug_id: DebugId,
24    /// CPU architecture of the object file.
25    arch: Arch,
26
27    /// A flag that indicates that we are currently processing a Windows object, which
28    /// will inform us if we should undecorate function names.
29    is_windows_object: bool,
30
31    /// A list of transformers that are used to transform each function / source location.
32    transformers: transform::Transformers<'a>,
33
34    string_table: StringTable,
35    /// The set of all [`raw::File`]s that have been added to this `Converter`.
36    files: IndexSet<raw::File>,
37    /// The set of all [`raw::Function`]s that have been added to this `Converter`.
38    functions: IndexSet<raw::Function>,
39    /// The set of [`raw::SourceLocation`]s used in this `Converter` that are only used as
40    /// "call locations", i.e. which are only referred to from `inlined_into_idx`.
41    call_locations: IndexSet<raw::SourceLocation>,
42    /// A map from code ranges to the [`raw::SourceLocation`]s they correspond to.
43    ///
44    /// Only the starting address of a range is saved, the end address is given implicitly
45    /// by the start address of the next range.
46    ranges: BTreeMap<u32, raw::SourceLocation>,
47
48    /// This is highest addr that we know is outside of a valid function.
49    /// Functions have an explicit end, while Symbols implicitly extend to infinity.
50    /// In case the highest addr belongs to a Symbol, this will be `None` and the SymCache
51    /// also extends to infinite, otherwise this is the end of the highest function.
52    last_addr: Option<u32>,
53}
54
55impl<'a> SymCacheConverter<'a> {
56    /// Creates a new Converter.
57    pub fn new() -> Self {
58        Self::default()
59    }
60
61    /// Adds a new [`transform::Transformer`] to this [`SymCacheConverter`].
62    ///
63    /// Every [`transform::Function`] and [`transform::SourceLocation`] will be passed through
64    /// this transformer before it is written to the SymCache.
65    pub fn add_transformer<T>(&mut self, t: T)
66    where
67        T: transform::Transformer + 'a,
68    {
69        self.transformers.0.push(Box::new(t));
70    }
71
72    /// Sets the CPU architecture of this SymCache.
73    pub fn set_arch(&mut self, arch: Arch) {
74        self.arch = arch;
75    }
76
77    /// Sets the debug identifier of this SymCache.
78    pub fn set_debug_id(&mut self, debug_id: DebugId) {
79        self.debug_id = debug_id;
80    }
81
82    // Methods processing symbolic-debuginfo [`ObjectLike`] below:
83    // Feel free to move these to a separate file.
84
85    /// This processes the given [`ObjectLike`] object, collecting all its functions and line
86    /// information into the converter.
87    #[tracing::instrument(skip_all, fields(object.debug_id = %object.debug_id().breakpad()))]
88    pub fn process_object<'d, 'o, O>(&mut self, object: &'o O) -> Result<(), Error>
89    where
90        O: ObjectLike<'d, 'o>,
91        O::Error: std::error::Error + Send + Sync + 'static,
92    {
93        let session = object
94            .debug_session()
95            .map_err(|e| Error::new(ErrorKind::BadDebugFile, e))?;
96
97        self.set_arch(object.arch());
98        self.set_debug_id(object.debug_id());
99
100        self.is_windows_object = matches!(object.file_format(), FileFormat::Pe | FileFormat::Pdb);
101
102        for function in session.functions() {
103            let function = function.map_err(|e| Error::new(ErrorKind::BadDebugFile, e))?;
104
105            self.process_symbolic_function(&function);
106        }
107
108        for symbol in object.symbols() {
109            self.process_symbolic_symbol(&symbol);
110        }
111
112        self.is_windows_object = false;
113
114        Ok(())
115    }
116
117    /// Processes an individual [`Function`], adding its line information to the converter.
118    pub fn process_symbolic_function(&mut self, function: &Function<'_>) {
119        self.process_symbolic_function_recursive(function, &[(0x0, u32::MAX)]);
120    }
121
122    /// Processes an individual [`Function`], adding its line information to the converter.
123    ///
124    /// `call_locations` is a non-empty sorted list of `(address, call_location index)` pairs.
125    fn process_symbolic_function_recursive(
126        &mut self,
127        function: &Function<'_>,
128        call_locations: &[(u32, u32)],
129    ) {
130        let string_table = &mut self.string_table;
131        // skip over empty functions or functions whose address is too large to fit in a u32
132        if function.size == 0 || function.address > u32::MAX as u64 {
133            return;
134        }
135
136        let comp_dir = std::str::from_utf8(function.compilation_dir).ok();
137
138        let entry_pc = if function.inline {
139            u32::MAX
140        } else {
141            function.address as u32
142        };
143
144        let function_idx = {
145            let language = function.name.language();
146            let mut function = transform::Function {
147                name: function.name.as_str().into(),
148                comp_dir: comp_dir.map(Into::into),
149            };
150            for transformer in &mut self.transformers.0 {
151                function = transformer.transform_function(function);
152            }
153
154            let function_name = if self.is_windows_object {
155                undecorate_win_symbol(&function.name)
156            } else {
157                &function.name
158            };
159
160            let name_offset = string_table.insert(function_name) as u32;
161
162            let lang = language as u32;
163            let (fun_idx, _) = self.functions.insert_full(raw::Function {
164                name_offset,
165                _comp_dir_offset: u32::MAX,
166                entry_pc,
167                lang,
168            });
169            fun_idx as u32
170        };
171
172        // We can divide the instructions in a function into two buckets:
173        //  (1) Instructions which are part of an inlined function call, and
174        //  (2) instructions which are *not* part of an inlined function call.
175        //
176        // Our incoming line records cover both (1) and (2) types of instructions.
177        //
178        // Let's call the address ranges of these instructions (1) inlinee ranges and (2) self ranges.
179        //
180        // We use the following strategy: For each function, only insert that function's "self ranges"
181        // into `self.ranges`. Then recurse into the function's inlinees. Those will insert their
182        // own "self ranges". Once the entire tree has been traversed, `self.ranges` will contain
183        // entries from all levels.
184        //
185        // In order to compute this function's "self ranges", we first gather and sort its
186        // "inlinee ranges". Later, when we iterate over this function's lines, we will compute the
187        // "self ranges" from the gaps between the "inlinee ranges".
188
189        let mut inlinee_ranges = Vec::new();
190        for inlinee in &function.inlinees {
191            for line in &inlinee.lines {
192                let (start, end) = line_boundaries(line.address, line.size);
193                inlinee_ranges.push(start..end);
194            }
195        }
196        inlinee_ranges.sort_unstable_by_key(|range| range.start);
197
198        // Walk three iterators. All of these are already sorted by address.
199        let mut line_iter = function.lines.iter();
200        let mut call_location_iter = call_locations.iter();
201        let mut inline_iter = inlinee_ranges.into_iter();
202
203        // call_locations is non-empty, so the first element always exists.
204        let mut current_call_location = call_location_iter.next().unwrap();
205
206        let mut next_call_location = call_location_iter.next();
207        let mut next_line = line_iter.next();
208        let mut next_inline = inline_iter.next();
209
210        // This will be the list we pass to our inlinees as the call_locations argument.
211        // This list is ordered by address by construction.
212        let mut callee_call_locations = Vec::new();
213
214        // Iterate over the line records.
215        while let Some(line) = next_line.take() {
216            let (line_range_start, line_range_end) = line_boundaries(line.address, line.size);
217
218            // Find the call location for this line.
219            while next_call_location.is_some() && next_call_location.unwrap().0 <= line_range_start
220            {
221                current_call_location = next_call_location.unwrap();
222                next_call_location = call_location_iter.next();
223            }
224            let inlined_into_idx = current_call_location.1;
225
226            let mut location = transform::SourceLocation {
227                file: transform::File {
228                    name: line.file.name_str(),
229                    directory: Some(line.file.dir_str()),
230                    comp_dir: comp_dir.map(Into::into),
231                },
232                line: line.line as u32,
233            };
234            for transformer in &mut self.transformers.0 {
235                location = transformer.transform_source_location(location);
236            }
237
238            let name_offset = string_table.insert(&location.file.name) as u32;
239            let directory_offset = location
240                .file
241                .directory
242                .map_or(u32::MAX, |d| string_table.insert(&d) as u32);
243            let comp_dir_offset = location
244                .file
245                .comp_dir
246                .map_or(u32::MAX, |cd| string_table.insert(&cd) as u32);
247
248            let (file_idx, _) = self.files.insert_full(raw::File {
249                name_offset,
250                directory_offset,
251                comp_dir_offset,
252            });
253
254            let source_location = raw::SourceLocation {
255                file_idx: file_idx as u32,
256                line: location.line,
257                function_idx,
258                inlined_into_idx,
259            };
260
261            // The current line can be a "self line", or a "call line", or even a mixture.
262            //
263            // Examples:
264            //
265            //  a) Just self line:
266            //      Line:            |==============|
267            //      Inlinee ranges:  (none)
268            //
269            //      Effect: insert_range
270            //
271            //  b) Just call line:
272            //      Line:            |==============|
273            //      Inlinee ranges:  |--------------|
274            //
275            //      Effect: make_call_location
276            //
277            //  c) Just call line, for multiple inlined calls:
278            //      Line:            |==========================|
279            //      Inlinee ranges:  |----------||--------------|
280            //
281            //      Effect: make_call_location, make_call_location
282            //
283            //  d) Call line and trailing self line:
284            //      Line:            |==================|
285            //      Inlinee ranges:  |-----------|
286            //
287            //      Effect: make_call_location, insert_range
288            //
289            //  e) Leading self line and also call line:
290            //      Line:            |==================|
291            //      Inlinee ranges:         |-----------|
292            //
293            //      Effect: insert_range, make_call_location
294            //
295            //  f) Interleaving
296            //      Line:            |======================================|
297            //      Inlinee ranges:         |-----------|    |-------|
298            //
299            //      Effect: insert_range, make_call_location, insert_range, make_call_location, insert_range
300            //
301            //  g) Bad debug info
302            //      Line:            |=======|
303            //      Inlinee ranges:  |-------------|
304            //
305            //      Effect: make_call_location
306
307            let mut current_address = line_range_start;
308            while current_address < line_range_end {
309                // Emit our source location at current_address if current_address is not covered by an inlinee.
310                if next_inline
311                    .as_ref()
312                    .is_none_or(|next| next.start > current_address)
313                {
314                    // "insert_range"
315                    self.ranges.insert(current_address, source_location.clone());
316                }
317
318                // If there is an inlinee range covered by this line record, turn this line into that
319                // call's "call line". Make a `call_location_idx` for it and store it in `callee_call_locations`.
320                if let Some(inline_range) =
321                    take_if(&mut next_inline, |next| next.start < line_range_end)
322                {
323                    // "make_call_location"
324                    let (call_location_idx, _) =
325                        self.call_locations.insert_full(source_location.clone());
326                    callee_call_locations.push((inline_range.start, call_location_idx as u32));
327
328                    // Advance current_address to the end of this inlinee range.
329                    current_address = inline_range.end;
330                    next_inline = inline_iter.next();
331                } else {
332                    // No further inlinee ranges are overlapping with this line record. Advance to the
333                    // end of the line record.
334                    current_address = line_range_end;
335                }
336            }
337
338            // Advance the line iterator.
339            next_line = line_iter.next();
340
341            // Skip any lines that start before current_address.
342            // Such lines can exist if the debug information is faulty, or if the compiler created
343            // multiple identical small "call line" records instead of one combined record
344            // covering the entire inlinee range. We can't have different "call lines" for a single
345            // inlinee range anyway, so it's fine to skip these.
346            while next_line
347                .as_ref()
348                .is_some_and(|next| (next.address as u32) < current_address)
349            {
350                next_line = line_iter.next();
351            }
352        }
353
354        if !function.inline {
355            // add the bare minimum of information for the function if there isn't any.
356            insert_source_location(&mut self.ranges, entry_pc, || raw::SourceLocation {
357                file_idx: u32::MAX,
358                line: 0,
359                function_idx,
360                inlined_into_idx: u32::MAX,
361            });
362        }
363
364        // We've processed all address ranges which are *not* covered by inlinees.
365        // Now it's time to recurse.
366        // Process our inlinees.
367        if !callee_call_locations.is_empty() {
368            for inlinee in &function.inlinees {
369                self.process_symbolic_function_recursive(inlinee, &callee_call_locations);
370            }
371        }
372
373        let function_end = function.end_address() as u32;
374        let last_addr = self.last_addr.get_or_insert(0);
375        if function_end > *last_addr {
376            *last_addr = function_end;
377        }
378
379        // Insert an explicit "empty" mapping for the end of the function.
380        // This is to ensure that addresses that fall "between" functions don't get
381        // erroneously mapped to the previous function.
382        //
383        // We only do this if there is no previous mapping for the end address—we don't
384        // want to overwrite valid mappings.
385        //
386        // If the next function starts right at this function's end, that's no trouble,
387        // it will just overwrite this mapping with one of its ranges.
388        if let btree_map::Entry::Vacant(vacant_entry) = self.ranges.entry(function_end) {
389            vacant_entry.insert(NO_SOURCE_LOCATION);
390        }
391    }
392
393    /// Processes an individual [`Symbol`].
394    pub fn process_symbolic_symbol(&mut self, symbol: &Symbol<'_>) {
395        let name_idx = {
396            let mut function = transform::Function {
397                name: match symbol.name {
398                    Some(ref name) => name.clone(),
399                    None => return,
400                },
401                comp_dir: None,
402            };
403            for transformer in &mut self.transformers.0 {
404                function = transformer.transform_function(function);
405            }
406
407            let function_name = if self.is_windows_object {
408                undecorate_win_symbol(&function.name)
409            } else {
410                &function.name
411            };
412
413            self.string_table.insert(function_name) as u32
414        };
415
416        // Insert a source location for the symbol, overwriting `NO_SOURCE_LOCATION` sentinel
417        // values but not actual source locations coming from e.g. functions.
418        insert_source_location(&mut self.ranges, symbol.address as u32, || {
419            let function = raw::Function {
420                name_offset: name_idx,
421                _comp_dir_offset: u32::MAX,
422                entry_pc: symbol.address as u32,
423                lang: u32::MAX,
424            };
425            let function_idx = self.functions.insert_full(function).0 as u32;
426
427            raw::SourceLocation {
428                file_idx: u32::MAX,
429                line: 0,
430                function_idx,
431                inlined_into_idx: u32::MAX,
432            }
433        });
434
435        let last_addr = self.last_addr.get_or_insert(0);
436        if symbol.address as u32 >= *last_addr {
437            self.last_addr = None;
438        }
439
440        // Insert an explicit "empty" mapping for the end of the symbol.
441        // This is to ensure that addresses that fall "between" symbols don't get
442        // erroneously mapped to the previous symbol.
443        //
444        // We only do this if there is no previous mapping for the end address—we don't
445        // want to overwrite valid mappings.
446        //
447        // If the next symbol starts right at this symbols's end, that's no trouble,
448        // it will just overwrite this mapping.
449        if symbol.size > 0 {
450            let end_address = (symbol.address + symbol.size) as u32;
451            if let btree_map::Entry::Vacant(vacant_entry) = self.ranges.entry(end_address) {
452                vacant_entry.insert(NO_SOURCE_LOCATION);
453            }
454        }
455    }
456
457    // Methods for serializing to a [`Write`] below:
458    // Feel free to move these to a separate file.
459
460    /// Serialize the converted data.
461    ///
462    /// This writes the SymCache binary format into the given [`Write`].
463    pub fn serialize<W: Write>(mut self, writer: &mut W) -> std::io::Result<()> {
464        let mut writer = Writer::new(writer);
465
466        // Insert a trailing sentinel source location in case we have a definite end addr
467        if let Some(last_addr) = self.last_addr {
468            // TODO: to be extra safe, we might check that `last_addr` is indeed larger than
469            // the largest range at some point.
470            match self.ranges.entry(last_addr) {
471                btree_map::Entry::Vacant(entry) => {
472                    entry.insert(raw::NO_SOURCE_LOCATION);
473                }
474                btree_map::Entry::Occupied(_entry) => {
475                    // BUG:
476                    // the last addr should not map to an already defined range
477                }
478            }
479        }
480
481        let num_files = self.files.len() as u32;
482        let num_functions = self.functions.len() as u32;
483        let num_source_locations = (self.call_locations.len() + self.ranges.len()) as u32;
484        let num_ranges = self.ranges.len() as u32;
485        let string_bytes = self.string_table.into_bytes();
486
487        let header = raw::Header {
488            magic: raw::SYMCACHE_MAGIC,
489            version: crate::SYMCACHE_VERSION,
490
491            debug_id: self.debug_id,
492            arch: self.arch,
493
494            num_files,
495            num_functions,
496            num_source_locations,
497            num_ranges,
498            string_bytes: string_bytes.len() as u32,
499            _reserved: [0; 16],
500        };
501
502        writer.write_all(header.as_bytes())?;
503        writer.align_to(8)?;
504
505        for f in self.files {
506            writer.write_all(f.as_bytes())?;
507        }
508        writer.align_to(8)?;
509
510        for f in self.functions {
511            writer.write_all(f.as_bytes())?;
512        }
513        writer.align_to(8)?;
514
515        for s in self.call_locations {
516            writer.write_all(s.as_bytes())?;
517        }
518        for s in self.ranges.values() {
519            writer.write_all(s.as_bytes())?;
520        }
521        writer.align_to(8)?;
522
523        for r in self.ranges.keys() {
524            writer.write_all(r.as_bytes())?;
525        }
526        writer.align_to(8)?;
527
528        writer.write_all(&string_bytes)?;
529
530        Ok(())
531    }
532}
533
534/// Inserts a source location into a map, but only if there either isn't already
535/// a value for the provided key or the value is the `NO_SOURCE_LOCATION` sentinel.
536///
537/// This is useful because a `NO_SOURCE_LOCATION` value may be inserted at an address to explicitly
538/// mark the end of a function or symbol. If later there is a function, symbol, or range
539/// starting at that same address, we want to evict that sentinel, but we wouldn't want to
540/// evict source locations carrying actual information.
541fn insert_source_location<K, F>(
542    source_locations: &mut BTreeMap<K, raw::SourceLocation>,
543    key: K,
544    val: F,
545) where
546    K: Ord,
547    F: FnOnce() -> raw::SourceLocation,
548{
549    if source_locations
550        .get(&key)
551        .is_none_or(|sl| *sl == NO_SOURCE_LOCATION)
552    {
553        source_locations.insert(key, val());
554    }
555}
556
557/// Undecorates a Windows C-decorated symbol name.
558///
559/// The decoration rules are explained here:
560/// <https://docs.microsoft.com/en-us/cpp/build/reference/decorated-names?view=vs-2019>
561///
562/// - __cdecl Leading underscore (_)
563/// - __stdcall Leading underscore (_) and a trailing at sign (@) followed by the number of bytes in the parameter list in decimal
564/// - __fastcall Leading and trailing at signs (@) followed by a decimal number representing the number of bytes in the parameter list
565/// - __vectorcall Two trailing at signs (@@) followed by a decimal number of bytes in the parameter list
566/// > In a 64-bit environment, C or extern "C" functions are only decorated when using the __vectorcall calling convention."
567///
568/// This code is adapted from `dump_syms`:
569/// See <https://github.com/mozilla/dump_syms/blob/325cf2c61b2cacc55a7f1af74081b57237c7f9de/src/symbol.rs#L169-L216>
570fn undecorate_win_symbol(name: &str) -> &str {
571    if name.starts_with('?') || name.contains([':', '(', '<']) {
572        return name;
573    }
574
575    // Parse __vectorcall.
576    if let Some((name, param_size)) = name.rsplit_once("@@") {
577        if param_size.parse::<u32>().is_ok() {
578            return name;
579        }
580    }
581
582    // Parse the other three.
583    if !name.is_empty() {
584        if let ("@" | "_", rest) = name.split_at(1) {
585            if let Some((name, param_size)) = rest.rsplit_once('@') {
586                if param_size.parse::<u32>().is_ok() {
587                    // __stdcall or __fastcall
588                    return name;
589                }
590            }
591            if let Some(name) = name.strip_prefix('_') {
592                // __cdecl
593                return name;
594            }
595        }
596    }
597
598    name
599}
600
601/// Returns the start and end address for a line record, clamped to `u32`.
602fn line_boundaries(address: u64, size: Option<u64>) -> (u32, u32) {
603    let start = address.try_into().unwrap_or(u32::MAX);
604    let end = start.saturating_add(size.unwrap_or(1).try_into().unwrap_or(u32::MAX));
605    (start, end)
606}
607
608fn take_if<T>(opt: &mut Option<T>, predicate: impl FnOnce(&mut T) -> bool) -> Option<T> {
609    if opt.as_mut().is_some_and(predicate) {
610        opt.take()
611    } else {
612        None
613    }
614}
615
616#[cfg(test)]
617mod tests {
618    use super::*;
619
620    /// Tests that computing a range with a large size naively
621    /// results in an empty range, but using `line_boundaries`
622    /// doesn't.
623    #[test]
624    fn test_large_range() {
625        // Line record values from an actual example
626        let address = 0x11d255;
627        let size = 0xffee9d55;
628
629        let naive_range = {
630            let start = address as u32;
631            let end = (address + size) as u32;
632            start..end
633        };
634
635        assert!(naive_range.is_empty());
636
637        let range = {
638            let (start, end) = line_boundaries(address, Some(size));
639            start..end
640        };
641
642        assert!(!range.is_empty());
643    }
644}