symbolic_symcache/writer.rs
1//! Defines the [SymCache Converter](`SymCacheConverter`).
2
3use std::collections::btree_map;
4use std::collections::BTreeMap;
5use std::io::Write;
6
7use indexmap::IndexSet;
8use symbolic_common::{Arch, DebugId};
9use symbolic_debuginfo::{DebugSession, FileFormat, Function, ObjectLike, Symbol};
10use watto::{Pod, StringTable, Writer};
11
12use super::{raw, transform};
13use crate::raw::NO_SOURCE_LOCATION;
14use crate::{Error, ErrorKind};
15
16/// The SymCache Converter.
17///
18/// This can convert data in various source formats to an intermediate representation, which can
19/// then be serialized to disk via its [`serialize`](SymCacheConverter::serialize) method.
20#[derive(Debug, Default)]
21pub struct SymCacheConverter<'a> {
22 /// Debug identifier of the object file.
23 debug_id: DebugId,
24 /// CPU architecture of the object file.
25 arch: Arch,
26
27 /// A flag that indicates that we are currently processing a Windows object, which
28 /// will inform us if we should undecorate function names.
29 is_windows_object: bool,
30
31 /// A list of transformers that are used to transform each function / source location.
32 transformers: transform::Transformers<'a>,
33
34 string_table: StringTable,
35 /// The set of all [`raw::File`]s that have been added to this `Converter`.
36 files: IndexSet<raw::File>,
37 /// The set of all [`raw::Function`]s that have been added to this `Converter`.
38 functions: IndexSet<raw::Function>,
39 /// The set of [`raw::SourceLocation`]s used in this `Converter` that are only used as
40 /// "call locations", i.e. which are only referred to from `inlined_into_idx`.
41 call_locations: IndexSet<raw::SourceLocation>,
42 /// A map from code ranges to the [`raw::SourceLocation`]s they correspond to.
43 ///
44 /// Only the starting address of a range is saved, the end address is given implicitly
45 /// by the start address of the next range.
46 ranges: BTreeMap<u32, raw::SourceLocation>,
47
48 /// This is highest addr that we know is outside of a valid function.
49 /// Functions have an explicit end, while Symbols implicitly extend to infinity.
50 /// In case the highest addr belongs to a Symbol, this will be `None` and the SymCache
51 /// also extends to infinite, otherwise this is the end of the highest function.
52 last_addr: Option<u32>,
53}
54
55impl<'a> SymCacheConverter<'a> {
56 /// Creates a new Converter.
57 pub fn new() -> Self {
58 Self::default()
59 }
60
61 /// Adds a new [`transform::Transformer`] to this [`SymCacheConverter`].
62 ///
63 /// Every [`transform::Function`] and [`transform::SourceLocation`] will be passed through
64 /// this transformer before it is written to the SymCache.
65 pub fn add_transformer<T>(&mut self, t: T)
66 where
67 T: transform::Transformer + 'a,
68 {
69 self.transformers.0.push(Box::new(t));
70 }
71
72 /// Sets the CPU architecture of this SymCache.
73 pub fn set_arch(&mut self, arch: Arch) {
74 self.arch = arch;
75 }
76
77 /// Sets the debug identifier of this SymCache.
78 pub fn set_debug_id(&mut self, debug_id: DebugId) {
79 self.debug_id = debug_id;
80 }
81
82 // Methods processing symbolic-debuginfo [`ObjectLike`] below:
83 // Feel free to move these to a separate file.
84
85 /// This processes the given [`ObjectLike`] object, collecting all its functions and line
86 /// information into the converter.
87 #[tracing::instrument(skip_all, fields(object.debug_id = %object.debug_id().breakpad()))]
88 pub fn process_object<'d, 'o, O>(&mut self, object: &'o O) -> Result<(), Error>
89 where
90 O: ObjectLike<'d, 'o>,
91 O::Error: std::error::Error + Send + Sync + 'static,
92 {
93 let session = object
94 .debug_session()
95 .map_err(|e| Error::new(ErrorKind::BadDebugFile, e))?;
96
97 self.set_arch(object.arch());
98 self.set_debug_id(object.debug_id());
99
100 self.is_windows_object = matches!(object.file_format(), FileFormat::Pe | FileFormat::Pdb);
101
102 for function in session.functions() {
103 let function = function.map_err(|e| Error::new(ErrorKind::BadDebugFile, e))?;
104
105 self.process_symbolic_function(&function);
106 }
107
108 for symbol in object.symbols() {
109 self.process_symbolic_symbol(&symbol);
110 }
111
112 self.is_windows_object = false;
113
114 Ok(())
115 }
116
117 /// Processes an individual [`Function`], adding its line information to the converter.
118 pub fn process_symbolic_function(&mut self, function: &Function<'_>) {
119 self.process_symbolic_function_recursive(function, &[(0x0, u32::MAX)]);
120 }
121
122 /// Processes an individual [`Function`], adding its line information to the converter.
123 ///
124 /// `call_locations` is a non-empty sorted list of `(address, call_location index)` pairs.
125 fn process_symbolic_function_recursive(
126 &mut self,
127 function: &Function<'_>,
128 call_locations: &[(u32, u32)],
129 ) {
130 let string_table = &mut self.string_table;
131 // skip over empty functions or functions whose address is too large to fit in a u32
132 if function.size == 0 || function.address > u32::MAX as u64 {
133 return;
134 }
135
136 let comp_dir = std::str::from_utf8(function.compilation_dir).ok();
137
138 let entry_pc = if function.inline {
139 u32::MAX
140 } else {
141 function.address as u32
142 };
143
144 let function_idx = {
145 let language = function.name.language();
146 let mut function = transform::Function {
147 name: function.name.as_str().into(),
148 comp_dir: comp_dir.map(Into::into),
149 };
150 for transformer in &mut self.transformers.0 {
151 function = transformer.transform_function(function);
152 }
153
154 let function_name = if self.is_windows_object {
155 undecorate_win_symbol(&function.name)
156 } else {
157 &function.name
158 };
159
160 let name_offset = string_table.insert(function_name) as u32;
161
162 let lang = language as u32;
163 let (fun_idx, _) = self.functions.insert_full(raw::Function {
164 name_offset,
165 _comp_dir_offset: u32::MAX,
166 entry_pc,
167 lang,
168 });
169 fun_idx as u32
170 };
171
172 // We can divide the instructions in a function into two buckets:
173 // (1) Instructions which are part of an inlined function call, and
174 // (2) instructions which are *not* part of an inlined function call.
175 //
176 // Our incoming line records cover both (1) and (2) types of instructions.
177 //
178 // Let's call the address ranges of these instructions (1) inlinee ranges and (2) self ranges.
179 //
180 // We use the following strategy: For each function, only insert that function's "self ranges"
181 // into `self.ranges`. Then recurse into the function's inlinees. Those will insert their
182 // own "self ranges". Once the entire tree has been traversed, `self.ranges` will contain
183 // entries from all levels.
184 //
185 // In order to compute this function's "self ranges", we first gather and sort its
186 // "inlinee ranges". Later, when we iterate over this function's lines, we will compute the
187 // "self ranges" from the gaps between the "inlinee ranges".
188
189 let mut inlinee_ranges = Vec::new();
190 for inlinee in &function.inlinees {
191 for line in &inlinee.lines {
192 let (start, end) = line_boundaries(line.address, line.size);
193 inlinee_ranges.push(start..end);
194 }
195 }
196 inlinee_ranges.sort_unstable_by_key(|range| range.start);
197
198 // Walk three iterators. All of these are already sorted by address.
199 let mut line_iter = function.lines.iter();
200 let mut call_location_iter = call_locations.iter();
201 let mut inline_iter = inlinee_ranges.into_iter();
202
203 // call_locations is non-empty, so the first element always exists.
204 let mut current_call_location = call_location_iter.next().unwrap();
205
206 let mut next_call_location = call_location_iter.next();
207 let mut next_line = line_iter.next();
208 let mut next_inline = inline_iter.next();
209
210 // This will be the list we pass to our inlinees as the call_locations argument.
211 // This list is ordered by address by construction.
212 let mut callee_call_locations = Vec::new();
213
214 // Iterate over the line records.
215 while let Some(line) = next_line.take() {
216 let (line_range_start, line_range_end) = line_boundaries(line.address, line.size);
217
218 // Find the call location for this line.
219 while next_call_location.is_some() && next_call_location.unwrap().0 <= line_range_start
220 {
221 current_call_location = next_call_location.unwrap();
222 next_call_location = call_location_iter.next();
223 }
224 let inlined_into_idx = current_call_location.1;
225
226 let mut location = transform::SourceLocation {
227 file: transform::File {
228 name: line.file.name_str(),
229 directory: Some(line.file.dir_str()),
230 comp_dir: comp_dir.map(Into::into),
231 },
232 line: line.line as u32,
233 };
234 for transformer in &mut self.transformers.0 {
235 location = transformer.transform_source_location(location);
236 }
237
238 let name_offset = string_table.insert(&location.file.name) as u32;
239 let directory_offset = location
240 .file
241 .directory
242 .map_or(u32::MAX, |d| string_table.insert(&d) as u32);
243 let comp_dir_offset = location
244 .file
245 .comp_dir
246 .map_or(u32::MAX, |cd| string_table.insert(&cd) as u32);
247
248 let (file_idx, _) = self.files.insert_full(raw::File {
249 name_offset,
250 directory_offset,
251 comp_dir_offset,
252 });
253
254 let source_location = raw::SourceLocation {
255 file_idx: file_idx as u32,
256 line: location.line,
257 function_idx,
258 inlined_into_idx,
259 };
260
261 // The current line can be a "self line", or a "call line", or even a mixture.
262 //
263 // Examples:
264 //
265 // a) Just self line:
266 // Line: |==============|
267 // Inlinee ranges: (none)
268 //
269 // Effect: insert_range
270 //
271 // b) Just call line:
272 // Line: |==============|
273 // Inlinee ranges: |--------------|
274 //
275 // Effect: make_call_location
276 //
277 // c) Just call line, for multiple inlined calls:
278 // Line: |==========================|
279 // Inlinee ranges: |----------||--------------|
280 //
281 // Effect: make_call_location, make_call_location
282 //
283 // d) Call line and trailing self line:
284 // Line: |==================|
285 // Inlinee ranges: |-----------|
286 //
287 // Effect: make_call_location, insert_range
288 //
289 // e) Leading self line and also call line:
290 // Line: |==================|
291 // Inlinee ranges: |-----------|
292 //
293 // Effect: insert_range, make_call_location
294 //
295 // f) Interleaving
296 // Line: |======================================|
297 // Inlinee ranges: |-----------| |-------|
298 //
299 // Effect: insert_range, make_call_location, insert_range, make_call_location, insert_range
300 //
301 // g) Bad debug info
302 // Line: |=======|
303 // Inlinee ranges: |-------------|
304 //
305 // Effect: make_call_location
306
307 let mut current_address = line_range_start;
308 while current_address < line_range_end {
309 // Emit our source location at current_address if current_address is not covered by an inlinee.
310 if next_inline
311 .as_ref()
312 .is_none_or(|next| next.start > current_address)
313 {
314 // "insert_range"
315 self.ranges.insert(current_address, source_location.clone());
316 }
317
318 // If there is an inlinee range covered by this line record, turn this line into that
319 // call's "call line". Make a `call_location_idx` for it and store it in `callee_call_locations`.
320 if let Some(inline_range) =
321 take_if(&mut next_inline, |next| next.start < line_range_end)
322 {
323 // "make_call_location"
324 let (call_location_idx, _) =
325 self.call_locations.insert_full(source_location.clone());
326 callee_call_locations.push((inline_range.start, call_location_idx as u32));
327
328 // Advance current_address to the end of this inlinee range.
329 current_address = inline_range.end;
330 next_inline = inline_iter.next();
331 } else {
332 // No further inlinee ranges are overlapping with this line record. Advance to the
333 // end of the line record.
334 current_address = line_range_end;
335 }
336 }
337
338 // Advance the line iterator.
339 next_line = line_iter.next();
340
341 // Skip any lines that start before current_address.
342 // Such lines can exist if the debug information is faulty, or if the compiler created
343 // multiple identical small "call line" records instead of one combined record
344 // covering the entire inlinee range. We can't have different "call lines" for a single
345 // inlinee range anyway, so it's fine to skip these.
346 while next_line
347 .as_ref()
348 .is_some_and(|next| (next.address as u32) < current_address)
349 {
350 next_line = line_iter.next();
351 }
352 }
353
354 if !function.inline {
355 // add the bare minimum of information for the function if there isn't any.
356 insert_source_location(&mut self.ranges, entry_pc, || raw::SourceLocation {
357 file_idx: u32::MAX,
358 line: 0,
359 function_idx,
360 inlined_into_idx: u32::MAX,
361 });
362 }
363
364 // We've processed all address ranges which are *not* covered by inlinees.
365 // Now it's time to recurse.
366 // Process our inlinees.
367 if !callee_call_locations.is_empty() {
368 for inlinee in &function.inlinees {
369 self.process_symbolic_function_recursive(inlinee, &callee_call_locations);
370 }
371 }
372
373 let function_end = function.end_address() as u32;
374 let last_addr = self.last_addr.get_or_insert(0);
375 if function_end > *last_addr {
376 *last_addr = function_end;
377 }
378
379 // Insert an explicit "empty" mapping for the end of the function.
380 // This is to ensure that addresses that fall "between" functions don't get
381 // erroneously mapped to the previous function.
382 //
383 // We only do this if there is no previous mapping for the end address—we don't
384 // want to overwrite valid mappings.
385 //
386 // If the next function starts right at this function's end, that's no trouble,
387 // it will just overwrite this mapping with one of its ranges.
388 if let btree_map::Entry::Vacant(vacant_entry) = self.ranges.entry(function_end) {
389 vacant_entry.insert(NO_SOURCE_LOCATION);
390 }
391 }
392
393 /// Processes an individual [`Symbol`].
394 pub fn process_symbolic_symbol(&mut self, symbol: &Symbol<'_>) {
395 let name_idx = {
396 let mut function = transform::Function {
397 name: match symbol.name {
398 Some(ref name) => name.clone(),
399 None => return,
400 },
401 comp_dir: None,
402 };
403 for transformer in &mut self.transformers.0 {
404 function = transformer.transform_function(function);
405 }
406
407 let function_name = if self.is_windows_object {
408 undecorate_win_symbol(&function.name)
409 } else {
410 &function.name
411 };
412
413 self.string_table.insert(function_name) as u32
414 };
415
416 // Insert a source location for the symbol, overwriting `NO_SOURCE_LOCATION` sentinel
417 // values but not actual source locations coming from e.g. functions.
418 insert_source_location(&mut self.ranges, symbol.address as u32, || {
419 let function = raw::Function {
420 name_offset: name_idx,
421 _comp_dir_offset: u32::MAX,
422 entry_pc: symbol.address as u32,
423 lang: u32::MAX,
424 };
425 let function_idx = self.functions.insert_full(function).0 as u32;
426
427 raw::SourceLocation {
428 file_idx: u32::MAX,
429 line: 0,
430 function_idx,
431 inlined_into_idx: u32::MAX,
432 }
433 });
434
435 let last_addr = self.last_addr.get_or_insert(0);
436 if symbol.address as u32 >= *last_addr {
437 self.last_addr = None;
438 }
439
440 // Insert an explicit "empty" mapping for the end of the symbol.
441 // This is to ensure that addresses that fall "between" symbols don't get
442 // erroneously mapped to the previous symbol.
443 //
444 // We only do this if there is no previous mapping for the end address—we don't
445 // want to overwrite valid mappings.
446 //
447 // If the next symbol starts right at this symbols's end, that's no trouble,
448 // it will just overwrite this mapping.
449 if symbol.size > 0 {
450 let end_address = (symbol.address + symbol.size) as u32;
451 if let btree_map::Entry::Vacant(vacant_entry) = self.ranges.entry(end_address) {
452 vacant_entry.insert(NO_SOURCE_LOCATION);
453 }
454 }
455 }
456
457 // Methods for serializing to a [`Write`] below:
458 // Feel free to move these to a separate file.
459
460 /// Serialize the converted data.
461 ///
462 /// This writes the SymCache binary format into the given [`Write`].
463 pub fn serialize<W: Write>(mut self, writer: &mut W) -> std::io::Result<()> {
464 let mut writer = Writer::new(writer);
465
466 // Insert a trailing sentinel source location in case we have a definite end addr
467 if let Some(last_addr) = self.last_addr {
468 // TODO: to be extra safe, we might check that `last_addr` is indeed larger than
469 // the largest range at some point.
470 match self.ranges.entry(last_addr) {
471 btree_map::Entry::Vacant(entry) => {
472 entry.insert(raw::NO_SOURCE_LOCATION);
473 }
474 btree_map::Entry::Occupied(_entry) => {
475 // BUG:
476 // the last addr should not map to an already defined range
477 }
478 }
479 }
480
481 let num_files = self.files.len() as u32;
482 let num_functions = self.functions.len() as u32;
483 let num_source_locations = (self.call_locations.len() + self.ranges.len()) as u32;
484 let num_ranges = self.ranges.len() as u32;
485 let string_bytes = self.string_table.into_bytes();
486
487 let header = raw::Header {
488 magic: raw::SYMCACHE_MAGIC,
489 version: crate::SYMCACHE_VERSION,
490
491 debug_id: self.debug_id,
492 arch: self.arch,
493
494 num_files,
495 num_functions,
496 num_source_locations,
497 num_ranges,
498 string_bytes: string_bytes.len() as u32,
499 _reserved: [0; 16],
500 };
501
502 writer.write_all(header.as_bytes())?;
503 writer.align_to(8)?;
504
505 for f in self.files {
506 writer.write_all(f.as_bytes())?;
507 }
508 writer.align_to(8)?;
509
510 for f in self.functions {
511 writer.write_all(f.as_bytes())?;
512 }
513 writer.align_to(8)?;
514
515 for s in self.call_locations {
516 writer.write_all(s.as_bytes())?;
517 }
518 for s in self.ranges.values() {
519 writer.write_all(s.as_bytes())?;
520 }
521 writer.align_to(8)?;
522
523 for r in self.ranges.keys() {
524 writer.write_all(r.as_bytes())?;
525 }
526 writer.align_to(8)?;
527
528 writer.write_all(&string_bytes)?;
529
530 Ok(())
531 }
532}
533
534/// Inserts a source location into a map, but only if there either isn't already
535/// a value for the provided key or the value is the `NO_SOURCE_LOCATION` sentinel.
536///
537/// This is useful because a `NO_SOURCE_LOCATION` value may be inserted at an address to explicitly
538/// mark the end of a function or symbol. If later there is a function, symbol, or range
539/// starting at that same address, we want to evict that sentinel, but we wouldn't want to
540/// evict source locations carrying actual information.
541fn insert_source_location<K, F>(
542 source_locations: &mut BTreeMap<K, raw::SourceLocation>,
543 key: K,
544 val: F,
545) where
546 K: Ord,
547 F: FnOnce() -> raw::SourceLocation,
548{
549 if source_locations
550 .get(&key)
551 .is_none_or(|sl| *sl == NO_SOURCE_LOCATION)
552 {
553 source_locations.insert(key, val());
554 }
555}
556
557/// Undecorates a Windows C-decorated symbol name.
558///
559/// The decoration rules are explained here:
560/// <https://docs.microsoft.com/en-us/cpp/build/reference/decorated-names?view=vs-2019>
561///
562/// - __cdecl Leading underscore (_)
563/// - __stdcall Leading underscore (_) and a trailing at sign (@) followed by the number of bytes in the parameter list in decimal
564/// - __fastcall Leading and trailing at signs (@) followed by a decimal number representing the number of bytes in the parameter list
565/// - __vectorcall Two trailing at signs (@@) followed by a decimal number of bytes in the parameter list
566/// > In a 64-bit environment, C or extern "C" functions are only decorated when using the __vectorcall calling convention."
567///
568/// This code is adapted from `dump_syms`:
569/// See <https://github.com/mozilla/dump_syms/blob/325cf2c61b2cacc55a7f1af74081b57237c7f9de/src/symbol.rs#L169-L216>
570fn undecorate_win_symbol(name: &str) -> &str {
571 if name.starts_with('?') || name.contains([':', '(', '<']) {
572 return name;
573 }
574
575 // Parse __vectorcall.
576 if let Some((name, param_size)) = name.rsplit_once("@@") {
577 if param_size.parse::<u32>().is_ok() {
578 return name;
579 }
580 }
581
582 // Parse the other three.
583 if !name.is_empty() {
584 if let ("@" | "_", rest) = name.split_at(1) {
585 if let Some((name, param_size)) = rest.rsplit_once('@') {
586 if param_size.parse::<u32>().is_ok() {
587 // __stdcall or __fastcall
588 return name;
589 }
590 }
591 if let Some(name) = name.strip_prefix('_') {
592 // __cdecl
593 return name;
594 }
595 }
596 }
597
598 name
599}
600
601/// Returns the start and end address for a line record, clamped to `u32`.
602fn line_boundaries(address: u64, size: Option<u64>) -> (u32, u32) {
603 let start = address.try_into().unwrap_or(u32::MAX);
604 let end = start.saturating_add(size.unwrap_or(1).try_into().unwrap_or(u32::MAX));
605 (start, end)
606}
607
608fn take_if<T>(opt: &mut Option<T>, predicate: impl FnOnce(&mut T) -> bool) -> Option<T> {
609 if opt.as_mut().is_some_and(predicate) {
610 opt.take()
611 } else {
612 None
613 }
614}
615
616#[cfg(test)]
617mod tests {
618 use super::*;
619
620 /// Tests that computing a range with a large size naively
621 /// results in an empty range, but using `line_boundaries`
622 /// doesn't.
623 #[test]
624 fn test_large_range() {
625 // Line record values from an actual example
626 let address = 0x11d255;
627 let size = 0xffee9d55;
628
629 let naive_range = {
630 let start = address as u32;
631 let end = (address + size) as u32;
632 start..end
633 };
634
635 assert!(naive_range.is_empty());
636
637 let range = {
638 let (start, end) = line_boundaries(address, Some(size));
639 start..end
640 };
641
642 assert!(!range.is_empty());
643 }
644}