Skip to main content

ms_pdb/dbi/
sources.rs

1//! DBI Sources Substream
2
3use super::*;
4use crate::BStr;
5use std::collections::HashMap;
6
/// The "Sources" substream of the DBI stream. This stream describes the merged set of source
/// files that were the inputs (compilands) of all modules.
///
/// All fields borrow directly from the substream bytes given to `parse()`; nothing is copied.
///
/// See: <https://llvm.org/docs/PDB/DbiStream.html#file-info-substream>
pub struct DbiSourcesSubstream<'a> {
    /// The `module_file_starts` array gives the index within `file_name_offsets` where the file
    /// names for each module begin. That is, `file_name_offsets[module_file_starts[m]]` is the file
    /// name offset for the first file in the set of files for module `m`.
    ///
    /// When combined with the `module_file_counts` array, you can easily find the slice within
    /// `file_name_offsets` of files for a specific module.
    ///
    /// The length of this slice is equal to `num_modules`. This slice _does not_ have an extra
    /// entry at the end, so the end of the last module's range cannot be read from this array;
    /// it is bounded by `file_name_offsets.len()`.
    ///
    /// NOTE(review): entries are only 16 bits wide, while `file_name_offsets` may hold more than
    /// 64K items. Presumably the start values cannot be trusted for such inputs -- confirm
    /// against the format documentation.
    module_file_starts: &'a [U16<LE>],

    /// For each module, gives the number of source files that contribute to that module.
    ///
    /// Has the same length as `module_file_starts` (one entry per module).
    module_file_counts: &'a [U16<LE>],

    /// Contains the concatenated list of file name lists, one list per module. For each module
    /// `m`, the set of items within `file_name_offsets` is given by
    /// `file_name_offsets[module_file_starts[m]..][..module_file_counts[m]]`.
    ///
    /// Each item in this list is an offset into `names_buffer` and points to the start of a
    /// NUL-terminated UTF-8 string.
    ///
    /// This array can (and usually does) contain duplicate values. The values are ordered by the
    /// module which referenced a given set of source files. Since many modules will read a shared
    /// set of header files (e.g. `windows.h`), those shared header files will appear many times
    /// in this list.
    ///
    /// The length of `file_name_offsets` is usually higher than the number of _unique_ source files
    /// because many source files (header files) are referenced by more than one module.
    ///
    /// The length of this slice is equal to the sum of the values in the `module_file_counts`.
    /// The on-disk file format stores a field that counts the number of source files, but the field
    /// is only 16-bit, so it can easily overflow on large executables. That is why this
    /// value is computed when the substream is parsed, instead of using the on-disk version.
    file_name_offsets: &'a [U32<LE>],

    /// Contains the file name strings, encoded in UTF-8 and NUL-terminated.
    ///
    /// This is everything in the substream that follows the `file_name_offsets` table.
    names_buffer: &'a [u8],
}
51
52impl<'a> DbiSourcesSubstream<'a> {
53    /// The number of modules
54    pub fn num_modules(&self) -> usize {
55        self.module_file_starts.len()
56    }
57
58    /// Provides access to the file name offsets slice. Each value is a file name offset, and can
59    /// be used with `get_source_name_at()`.
60    pub fn file_name_offsets(&self) -> &'a [U32<LE>] {
61        self.file_name_offsets
62    }
63
64    /// Parses the file info substream.
65    ///
66    /// This does not parse or validate every part of the substream. It only parses enough to find
67    /// the module indices and file names.
68    pub fn parse(substream_data: &'a [u8]) -> anyhow::Result<Self> {
69        let mut p = Parser::new(substream_data);
70        let num_modules = p.u16()? as usize;
71
72        // In theory this is supposed to contain the number of source files for which this substream
73        // contains information. But that would present a problem in that the width of this field
74        // being 16-bits would prevent one from having more than 64K source files in a program. In
75        // early versions of the file format, this seems to have been the case. In order to support
76        // more than this, this field of the is simply ignored, and computed dynamically by summing
77        // up the values of the ModFileCounts array (discussed below).
78        //
79        // In short, this value should be ignored. However, we still have to read the value in
80        // order to parse the header correctly.
81        let _obsolete_num_source_files = p.u16()? as usize;
82
83        let module_file_starts: &[U16<LE>] = p.slice(num_modules)?;
84
85        // An array of num_modules integers, each one containing the number of source files which
86        // contribute to the module at the specified index. While each individual module is limited
87        // to 64K contributing source files, the union of all modules' source files may be greater
88        // than 64K. The real number of source files is thus computed by summing this array.
89        //
90        // Note that summing this array does not give the number of _unique source files_, only the
91        // total number of source file contributions to modules.
92        let module_file_counts: &[U16<LE>] = p.slice(num_modules)?;
93
94        let num_file_offsets = module_file_counts.iter().map(|c| c.get() as usize).sum();
95
96        // At this point, we could scan module_file_starts + module_file_counts and validate that
97        // no entry exceeds num_file_offsets.
98
99        let file_name_offsets = p.slice(num_file_offsets)?;
100        let names_buffer = p.into_rest();
101
102        Ok(Self {
103            module_file_starts,
104            module_file_counts,
105            file_name_offsets,
106            names_buffer,
107        })
108    }
109
110    /// Given a source file index, returns the source file name.
111    pub fn get_source_file_name(&self, source_file_index: usize) -> Result<&'a BStr, ParserError> {
112        let offset = self.file_name_offsets[source_file_index].get();
113        self.get_source_file_name_at(offset)
114    }
115
116    /// Given a file name offset (within `name_buffer`), returns the source file name.
117    pub fn get_source_file_name_at(&self, file_name_offset: u32) -> Result<&'a BStr, ParserError> {
118        let Some(string_data) = self.names_buffer.get(file_name_offset as usize..) else {
119            return Err(ParserError);
120        };
121        let mut p = Parser::new(string_data);
122        let file_name = p.strz()?;
123        Ok(file_name)
124    }
125
126    /// Caller is expected to validate module_index (against `num_modules()`) before calling
127    pub fn name_offsets_for_module(&self, module_index: usize) -> anyhow::Result<&[U32<LE>]> {
128        let start = self.module_file_starts[module_index].get() as usize;
129        let count = self.module_file_counts[module_index].get() as usize;
130        let Some(s) = self.file_name_offsets.get(start..start + count) else {
131            bail!(
132                "File name offsets for module #{module_index} are invalid.  start: {start}, count: {count}, len available: {}",
133                self.file_name_offsets.len()
134            );
135        };
136        Ok(s)
137    }
138
139    /// Iterates source files in the DBI Sources Substream.
140    pub fn iter_sources(&self) -> IterSources<'_> {
141        IterSources {
142            names_buffer: self.names_buffer,
143            file_name_offsets: self.file_name_offsets.iter(),
144        }
145    }
146
147    /// Builds a HashMap that maps from file name offsets to strings.
148    pub fn sources_map(&self) -> anyhow::Result<HashMap<u32, &BStr>> {
149        let mut unique_offsets: Vec<u32> = self.file_name_offsets.iter().map(|i| i.get()).collect();
150        unique_offsets.sort_unstable();
151        unique_offsets.dedup();
152
153        let mut map = HashMap::new();
154        for &offset in unique_offsets.iter() {
155            let name = self.get_source_file_name_at(offset)?;
156            map.insert(offset, name);
157        }
158
159        Ok(map)
160    }
161}
162
/// Iterates source files in the DBI Sources Substream.
///
/// Created by `DbiSourcesSubstream::iter_sources()`. Yields one `(name_offset, name)` pair for
/// each entry of the file name offsets table, in table order.
pub struct IterSources<'a> {
    /// The string data that the file name offsets point into.
    names_buffer: &'a [u8],
    /// The remaining (not yet visited) file name offsets.
    file_name_offsets: std::slice::Iter<'a, U32<LE>>,
}
168
169impl<'a> Iterator for IterSources<'a> {
170    /// name_offset (in bytes), name
171    type Item = (u32, &'a BStr);
172
173    fn next(&mut self) -> Option<Self::Item> {
174        let offset = self.file_name_offsets.next()?.get();
175        let mut p = Parser::new(self.names_buffer);
176        p.skip(offset as usize).ok()?;
177        let name = p.strz().ok()?;
178        Some((offset, name))
179    }
180}
181
// Hand-built DBI Sources substream used by the tests below. The `/* 0x.. */` markers give the
// byte offset of each row from the start of the substream.
#[cfg(test)]
#[rustfmt::skip]
static TEST_SOURCES_DATA: &[u8] = &[
    // --- header ---
    /* 0x00 */ 4, 0,                       // num_modules = 4
    /* 0x02 */ 0xee, 0xee,                 // obsolete num_sources (bogus)

    // --- per-module arrays (u16 each) ---
    /* 0x04 */ 0, 0, 2, 0, 3, 0, 3, 0,     // module_file_starts = [0, 2, 3, 3]
    /* 0x0c */ 2, 0, 1, 0, 0, 0, 3, 0,     // module_file_counts = [2, 1, 0, 3] sum = 6

    // --- file_offsets: 6 items (u32 each), 24 bytes; module 2 has no entries ---
    /* 0x14 */ 0x00, 0, 0, 0,              // module 0, file_offsets[0] = 0x00, points to "foo.c"
    /* 0x18 */ 0x14, 0, 0, 0,              // module 0, file_offsets[1] = 0x14, points to "windows.h"
    /* 0x1c */ 0x06, 0, 0, 0,              // module 1, file_offsets[2] = 0x06, points to "bar.rs"
    /* 0x20 */ 0x00, 0, 0, 0,              // module 3, file_offsets[3] = 0x00, points to "foo.c"
    /* 0x24 */ 0x14, 0, 0, 0,              // module 3, file_offsets[4] = 0x14, points to "windows.h"
    /* 0x28 */ 0x0d, 0, 0, 0,              // module 3, file_offsets[5] = 0x0d, points to "main.c"

    // --- names_buffer: one NUL-terminated string per row; name offsets are relative to 0x2c ---
    /* 0x2c */ b'f', b'o', b'o', b'.', b'c', 0,                             // name offset 0x00 : "foo.c"
    /* 0x32 */ b'b', b'a', b'r', b'.', b'r', b's', 0,                       // name offset 0x06 : "bar.rs"
    /* 0x39 */ b'm', b'a', b'i', b'n', b'.', b'c', 0,                       // name offset 0x0d : "main.c"
    /* 0x40 */ b'w', b'i', b'n', b'd', b'o', b'w', b's', b'.', b'h', 0,     // name offset 0x14 : "windows.h"
    /* 0x4a */ 0, 0,                                                        // trailing padding

    /* 0x4c : end */
];
215
/// Parses the fixture and checks every table, every name lookup, and the per-module slice for
/// module 0.
#[test]
fn basic_parse() {
    let sources = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();
    assert_eq!(sources.num_modules(), 4);
    assert_eq!(sources.file_name_offsets.len(), 6);

    // Decode the little-endian tables into native integers so they can be compared directly.
    let starts: Vec<u16> = sources.module_file_starts.iter().map(|v| v.get()).collect();
    let counts: Vec<u16> = sources.module_file_counts.iter().map(|v| v.get()).collect();
    let offsets: Vec<u32> = sources.file_name_offsets.iter().map(|v| v.get()).collect();
    assert_eq!(starts, [0, 2, 3, 3]);
    assert_eq!(counts, [2, 1, 0, 3]);
    assert_eq!(offsets, [0x00, 0x14, 0x06, 0x00, 0x14, 0x0d]);

    // Read the file names. Remember that there are duplicates in this list.
    let expected_names = ["foo.c", "windows.h", "bar.rs", "foo.c", "windows.h", "main.c"];
    for (index, expected) in expected_names.iter().enumerate() {
        assert_eq!(sources.get_source_file_name(index).unwrap(), *expected);
    }

    // Module 0 owns the first two entries of the offsets table.
    let module0: Vec<u32> = sources
        .name_offsets_for_module(0)
        .unwrap()
        .iter()
        .map(|v| v.get())
        .collect();
    assert_eq!(module0, [0, 0x14]);

    // Test bounds check on get_source_file_name_at()
    assert!(sources.get_source_file_name_at(0xeeee).is_err());
}
248
/// Checks that `iter_sources()` yields every (offset, name) pair in table order, duplicates
/// included.
#[test]
fn test_iter_sources() {
    use bstr::ByteSlice;

    let expected: [(u32, &str); 6] = [
        (0x00, "foo.c"),
        (0x14, "windows.h"),
        (0x06, "bar.rs"),
        (0x00, "foo.c"),
        (0x14, "windows.h"),
        (0x0d, "main.c"),
    ];

    let substream = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();

    let mut actual: Vec<(u32, &str)> = Vec::new();
    for (offset, name) in substream.iter_sources() {
        actual.push((offset, name.to_str().unwrap()));
    }

    assert_eq!(actual, expected);
}
272
/// Checks that `sources_map()` contains exactly one entry per unique file name offset.
#[test]
fn test_sources_map() {
    let substream = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();
    let names_by_offset = substream.sources_map().unwrap();

    // 4 unique file names
    let expected: [(u32, &str); 4] = [
        (0x00, "foo.c"),
        (0x06, "bar.rs"),
        (0x0d, "main.c"),
        (0x14, "windows.h"),
    ];
    assert_eq!(names_by_offset.len(), 4);

    for (offset, name) in expected.iter() {
        assert_eq!(*names_by_offset.get(offset).unwrap(), *name);
    }
}
282}