ms_pdb/dbi/
sources.rs

1//! DBI Sources Substream
2
3use super::*;
4use crate::BStr;
5use std::collections::HashMap;
6
/// The "Sources" substream of the DBI stream. This stream describes the merged set of source
/// files that were the inputs (compilands) of all modules.
///
/// Every field is a slice borrowed (for lifetime `'a`) from the substream bytes handed to
/// `parse()`; this type owns no data of its own.
///
/// See: <https://llvm.org/docs/PDB/DbiStream.html#file-info-substream>
pub struct DbiSourcesSubstream<'a> {
    /// The `module_file_starts` array gives the index within `file_name_offsets` where the file
    /// names for each module begin. That is, `file_name_offsets[module_file_starts[m]]` is the file
    /// name offset for the first file in the set of files for module `m`.
    ///
    /// When combined with the `module_file_counts` array, you can easily find the slice within
    /// `file_name_offsets` of files for a specific module.
    ///
    /// The length of this slice is equal to `num_modules`. This slice _does not_ have an extra
    /// entry at the end, so you must use `file_name_offsets.len()` as the end of the per-module
    /// slice for the last entry in this slice.
    ///
    /// Entries are 16-bit little-endian values, so a start index cannot exceed `u16::MAX`.
    module_file_starts: &'a [U16<LE>],

    /// For each module, gives the number of source files that contribute to that module.
    ///
    /// Like `module_file_starts`, this slice has exactly `num_modules` entries; entry `m` of
    /// both slices describes the same module.
    module_file_counts: &'a [U16<LE>],

    /// Contains the concatenated list of file name lists, one list per module. For each module
    /// `m`, the set of items within `file_name_offsets` is given by
    /// `file_name_offsets[module_file_starts[m]..][..module_file_counts[m]]`.
    ///
    /// Each item in this list is an offset into `names_buffer` and points to the start of a
    /// NUL-terminated UTF-8 string.
    ///
    /// This array can (and usually does) contain duplicate values. The values are ordered by the
    /// module which referenced a given set of source files. Since many modules will read a shared
    /// set of header files (e.g. `windows.h`), those shared header files will appear many times
    /// in this list.
    ///
    /// The length of `file_name_offsets` is usually higher than the number of _unique_ source files
    /// because many source files (header files) are referenced by more than one module.
    ///
    /// The length of this slice is equal to the sum of the values in the `module_file_counts`.
    /// The on-disk file format stores a field that counts the number of source files, but the field
    /// is only 16-bit, so it can easily overflow on large executables. That is why this
    /// value is computed when the substream is parsed, instead of using the on-disk version.
    file_name_offsets: &'a [U32<LE>],

    /// Contains the file name strings, encoded in UTF-8 and NUL-terminated.
    ///
    /// NOTE(review): the accessors hand names back as `BStr` (byte strings) and nothing in this
    /// file validates the encoding, so treat "UTF-8" as the expected encoding, not a guarantee.
    names_buffer: &'a [u8],
}
51
52impl<'a> DbiSourcesSubstream<'a> {
53    /// The number of modules
54    pub fn num_modules(&self) -> usize {
55        self.module_file_starts.len()
56    }
57
58    /// Provides access to the file name offsets slice. Each value is a file name offset, and can
59    /// be used with `get_source_name_at()`.
60    pub fn file_name_offsets(&self) -> &'a [U32<LE>] {
61        self.file_name_offsets
62    }
63
64    /// Parses the file info substream.
65    ///
66    /// This does not parse or validate every part of the substream. It only parses enough to find
67    /// the module indices and file names.
68    pub fn parse(substream_data: &'a [u8]) -> anyhow::Result<Self> {
69        let mut p = Parser::new(substream_data);
70        let num_modules = p.u16()? as usize;
71
72        // In theory this is supposed to contain the number of source files for which this substream
73        // contains information. But that would present a problem in that the width of this field
74        // being 16-bits would prevent one from having more than 64K source files in a program. In
75        // early versions of the file format, this seems to have been the case. In order to support
76        // more than this, this field of the is simply ignored, and computed dynamically by summing
77        // up the values of the ModFileCounts array (discussed below).
78        //
79        // In short, this value should be ignored. However, we still have to read the value in
80        // order to parse the header correctly.
81        let _obsolete_num_source_files = p.u16()? as usize;
82
83        let module_file_starts: &[U16<LE>] = p.slice(num_modules)?;
84
85        // An array of num_modules integers, each one containing the number of source files which
86        // contribute to the module at the specified index. While each individual module is limited
87        // to 64K contributing source files, the union of all modules' source files may be greater
88        // than 64K. The real number of source files is thus computed by summing this array.
89        //
90        // Note that summing this array does not give the number of _unique source files_, only the
91        // total number of source file contributions to modules.
92        let module_file_counts: &[U16<LE>] = p.slice(num_modules)?;
93
94        let num_file_offsets = module_file_counts.iter().map(|c| c.get() as usize).sum();
95
96        // At this point, we could scan module_file_starts + module_file_counts and validate that
97        // no entry exceeds num_file_offsets.
98
99        let file_name_offsets = p.slice(num_file_offsets)?;
100        let names_buffer = p.into_rest();
101
102        Ok(Self {
103            module_file_starts,
104            module_file_counts,
105            file_name_offsets,
106            names_buffer,
107        })
108    }
109
110    /// Given a source file index, returns the source file name.
111    pub fn get_source_file_name(&self, source_file_index: usize) -> Result<&'a BStr, ParserError> {
112        let offset = self.file_name_offsets[source_file_index].get();
113        self.get_source_file_name_at(offset)
114    }
115
116    /// Given a file name offset (within `name_buffer`), returns the source file name.
117    pub fn get_source_file_name_at(&self, file_name_offset: u32) -> Result<&'a BStr, ParserError> {
118        let Some(string_data) = self.names_buffer.get(file_name_offset as usize..) else {
119            return Err(ParserError);
120        };
121        let mut p = Parser::new(string_data);
122        let file_name = p.strz()?;
123        Ok(file_name)
124    }
125
126    /// Caller is expected to validate module_index (against `num_modules()`) before calling
127    pub fn name_offsets_for_module(&self, module_index: usize) -> anyhow::Result<&[U32<LE>]> {
128        let start = self.module_file_starts[module_index].get() as usize;
129        let count = self.module_file_counts[module_index].get() as usize;
130        let Some(s) = self.file_name_offsets.get(start..start + count) else {
131            bail!("File name offsets for module #{module_index} are invalid.  start: {start}, count: {count}, len available: {}", self.file_name_offsets.len());
132        };
133        Ok(s)
134    }
135
136    /// Iterates source files in the DBI Sources Substream.
137    pub fn iter_sources(&self) -> IterSources<'_> {
138        IterSources {
139            names_buffer: self.names_buffer,
140            file_name_offsets: self.file_name_offsets.iter(),
141        }
142    }
143
144    /// Builds a HashMap that maps from file name offsets to strings.
145    pub fn sources_map(&self) -> anyhow::Result<HashMap<u32, &BStr>> {
146        let mut unique_offsets: Vec<u32> = self.file_name_offsets.iter().map(|i| i.get()).collect();
147        unique_offsets.sort_unstable();
148        unique_offsets.dedup();
149
150        let mut map = HashMap::new();
151        for &offset in unique_offsets.iter() {
152            let name = self.get_source_file_name_at(offset)?;
153            map.insert(offset, name);
154        }
155
156        Ok(map)
157    }
158}
159
/// Iterates source files in the DBI Sources Substream.
///
/// Yields one `(name_offset, name)` pair per entry of the file name offsets table, so a file
/// that is referenced by several modules is yielded several times.
pub struct IterSources<'a> {
    /// The buffer of NUL-terminated file name strings that the offsets index into.
    names_buffer: &'a [u8],
    /// The not-yet-visited entries of the file name offsets table.
    file_name_offsets: std::slice::Iter<'a, U32<LE>>,
}
165
166impl<'a> Iterator for IterSources<'a> {
167    /// name_offset (in bytes), name
168    type Item = (u32, &'a BStr);
169
170    fn next(&mut self) -> Option<Self::Item> {
171        let offset = self.file_name_offsets.next()?.get();
172        let mut p = Parser::new(self.names_buffer);
173        p.skip(offset as usize).ok()?;
174        let name = p.strz().ok()?;
175        Some((offset, name))
176    }
177}
178
// Hand-built Sources substream used by the tests below: 4 modules, 6 file name offset entries,
// 4 distinct file names. The `/* 0x.... */` markers are byte offsets from the start of the
// substream; offsets quoted in the trailing comments are relative to the names buffer.
#[cfg(test)]
#[rustfmt::skip]
static TEST_SOURCES_DATA: &[u8] = &[
    /* 0x0000 */ 4, 0,                     // num_modules = 4
    /* 0x0002 */ 0xee, 0xee,               // obsolete num_sources (bogus)
    /* 0x0004 */ 0, 0, 2, 0, 3, 0, 3, 0,   // module_file_starts = [0, 2, 3, 3]
    /* 0x000c */ 2, 0, 1, 0, 0, 0, 3, 0,   // module_file_counts = [2, 1, 0, 3] sum = 6

    /* 0x0014 */                           // file_offsets, len = 6 items, 24 bytes
    /* 0x0014 */ 0x00, 0, 0, 0,            // module 0, file_offsets[0] = 0x00, points to "foo.c",
    /* 0x0018 */ 0x14, 0, 0, 0,            // module 0, file_offsets[1] = 0x14, points to "windows.h"
    /* 0x001c */ 0x06, 0, 0, 0,            // module 1, file_offsets[2] = 0x06, points to "bar.rs"
    /* 0x0020 */ 0x00, 0, 0, 0,            // module 3, file_offsets[3] = 0x00, points to "foo.c"
    /* 0x0024 */ 0x14, 0, 0, 0,            // module 3, file_offsets[4] = 0x14, points to "windows.h"
    /* 0x0028 */ 0x0d, 0, 0, 0,            // module 3, file_offsets[5] = 0x0d, points to "main.c"

    // names_buffer; contains (at relative offsets):
    //      name offset 0x0000 : "foo.c"
    //      name offset 0x0006 : "bar.rs"
    //      name offset 0x000d : "main.c"
    //      name offset 0x0014 : "windows.h"
    // The final two zero bytes are not referenced by any file offset.
    /* 0x002c */                                // names_buffer
    /* 0x002c */ b'f', b'o', b'o', b'.',
    /* 0x0030 */ b'c', 0,    b'b', b'a',
    /* 0x0034 */ b'r', b'.', b'r', b's',
    /* 0x0038 */ 0,    b'm', b'a', b'i',
    /* 0x003c */ b'n', b'.', b'c', 0,
    /* 0x0040 */ b'w', b'i', b'n', b'd',
    /* 0x0044 */ b'o', b'w', b's', b'.',
    /* 0x0048 */ b'h', 0,    0,    0,

    /* 0x004c : end */
];
212
/// Parses the fixture and checks every table, every name lookup, the per-module slice for
/// module 0, and the out-of-range behavior of `get_source_file_name_at()`.
#[test]
fn basic_parse() {
    let substream = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();
    assert_eq!(substream.num_modules(), 4);

    assert_eq!(substream.file_name_offsets.len(), 6);

    let starts: Vec<u16> = substream
        .module_file_starts
        .iter()
        .map(|v| v.get())
        .collect();
    assert_eq!(starts, [0, 2, 3, 3]);

    let counts: Vec<u16> = substream
        .module_file_counts
        .iter()
        .map(|v| v.get())
        .collect();
    assert_eq!(counts, [2, 1, 0, 3]);

    let offsets: Vec<u32> = substream
        .file_name_offsets
        .iter()
        .map(|v| v.get())
        .collect();
    assert_eq!(offsets, [0x00, 0x14, 0x06, 0x00, 0x14, 0x0d]);

    // Read the file names by index. Remember that there are duplicates in this list.
    let expected_names = [
        "foo.c",
        "windows.h",
        "bar.rs",
        "foo.c",
        "windows.h",
        "main.c",
    ];
    for (index, expected) in expected_names.iter().enumerate() {
        assert_eq!(substream.get_source_file_name(index).unwrap(), *expected);
    }

    // Module 0 owns the first two entries of the offsets table.
    let module0: Vec<u32> = substream
        .name_offsets_for_module(0)
        .unwrap()
        .iter()
        .map(|v| v.get())
        .collect();
    assert_eq!(module0, [0, 0x14]);

    // Test bounds check on get_source_file_name_at()
    assert!(substream.get_source_file_name_at(0xeeee).is_err());
}
245
/// Checks that `iter_sources()` yields every offsets-table entry, in order, with its name.
#[test]
fn test_iter_sources() {
    use bstr::ByteSlice;

    let substream = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();

    let mut actual: Vec<(u32, &str)> = Vec::new();
    for (offset, name) in substream.iter_sources() {
        actual.push((offset, name.to_str().unwrap()));
    }

    let expected: [(u32, &str); 6] = [
        (0x00, "foo.c"),
        (0x14, "windows.h"),
        (0x06, "bar.rs"),
        (0x00, "foo.c"),
        (0x14, "windows.h"),
        (0x0d, "main.c"),
    ];
    assert_eq!(actual, expected);
}
269
/// Checks that `sources_map()` contains exactly one entry per distinct file name offset.
#[test]
fn test_sources_map() {
    let substream = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();
    let map = substream.sources_map().unwrap();

    // 4 unique file names
    let expected: [(u32, &str); 4] = [
        (0x00, "foo.c"),
        (0x06, "bar.rs"),
        (0x0d, "main.c"),
        (0x14, "windows.h"),
    ];
    assert_eq!(map.len(), expected.len());
    for (offset, name) in expected.iter() {
        assert_eq!(*map.get(offset).unwrap(), *name);
    }
}