ms_pdb/dbi/sources.rs
1//! DBI Sources Substream
2
3use super::*;
4use crate::BStr;
5use std::collections::HashMap;
6
7/// The "Sources" substream of the DBI stream. This stream describes the merged set of source
8/// files that were the inputs (compilands) of all modules.
9///
10/// See: <https://llvm.org/docs/PDB/DbiStream.html#file-info-substream>
11pub struct DbiSourcesSubstream<'a> {
12 /// The `module_file_starts` array gives the index within `file_name_offsets` where the file
13 /// names for each module begin. That is, `file_name_offsets[module_file_starts[m]]` is the file
14 /// name offset for the first file in the set of files for module `m`.
15 ///
16 /// When combined with the `module_file_counts` array, you can easily find the slice within
17 /// `file_name_offsets` of files for a specific module.
18 ///
19 /// The length of this slice is equal to `num_modules`. This slice _does not_ have an extra
20 /// entry at the end, so you must use `file_name_offsets.len()` as the end of the per-module
21 /// slice for the last entry in this slice.
22 module_file_starts: &'a [U16<LE>],
23
24 /// For each module, gives the number of source files that contribute to that module.
25 module_file_counts: &'a [U16<LE>],
26
27 /// Contains the concatenated list of file name lists, one list per module. For each module
28 /// `m`, the set of items within `file_name_offsets` is given by
29 /// `file_name_offsets[module_file_starts[m]..][..module_file_counts[m]]`.
30 ///
31 /// Each item in this list is an offset into `names_buffer` and points to the start of a
32 /// NUL-terminated UTF-8 string.
33 ///
34 /// This array can (and usually does) contain duplicate values. The values are ordered by the
35 /// module which referenced a given set of source files. Since many modules will read a shared
36 /// set of header files (e.g. `windows.h`), those shared header files will appear many times
37 /// in this list.
38 ///
39 /// The length of `file_name_offsets` is usually higher than the number of _unique_ source files
40 /// because many source files (header files) are referenced by more than one module.
41 ///
42 /// The length of this slice is equal to the sum of the values in the `module_file_counts`.
43 /// The on-disk file format stores a field that counts the number of source files, but the field
44 /// is only 16-bit, so it can easily overflow on large executables. That is why this
45 /// value is computed when the substream is parsed, instead of using the on-disk version.
46 file_name_offsets: &'a [U32<LE>],
47
48 /// Contains the file name strings, encoded in UTF-8 and NUL-terminated.
49 names_buffer: &'a [u8],
50}
51
52impl<'a> DbiSourcesSubstream<'a> {
53 /// The number of modules
54 pub fn num_modules(&self) -> usize {
55 self.module_file_starts.len()
56 }
57
58 /// Provides access to the file name offsets slice. Each value is a file name offset, and can
59 /// be used with `get_source_name_at()`.
60 pub fn file_name_offsets(&self) -> &'a [U32<LE>] {
61 self.file_name_offsets
62 }
63
64 /// Parses the file info substream.
65 ///
66 /// This does not parse or validate every part of the substream. It only parses enough to find
67 /// the module indices and file names.
68 pub fn parse(substream_data: &'a [u8]) -> anyhow::Result<Self> {
69 let mut p = Parser::new(substream_data);
70 let num_modules = p.u16()? as usize;
71
72 // In theory this is supposed to contain the number of source files for which this substream
73 // contains information. But that would present a problem in that the width of this field
74 // being 16-bits would prevent one from having more than 64K source files in a program. In
75 // early versions of the file format, this seems to have been the case. In order to support
76 // more than this, this field of the is simply ignored, and computed dynamically by summing
77 // up the values of the ModFileCounts array (discussed below).
78 //
79 // In short, this value should be ignored. However, we still have to read the value in
80 // order to parse the header correctly.
81 let _obsolete_num_source_files = p.u16()? as usize;
82
83 let module_file_starts: &[U16<LE>] = p.slice(num_modules)?;
84
85 // An array of num_modules integers, each one containing the number of source files which
86 // contribute to the module at the specified index. While each individual module is limited
87 // to 64K contributing source files, the union of all modules' source files may be greater
88 // than 64K. The real number of source files is thus computed by summing this array.
89 //
90 // Note that summing this array does not give the number of _unique source files_, only the
91 // total number of source file contributions to modules.
92 let module_file_counts: &[U16<LE>] = p.slice(num_modules)?;
93
94 let num_file_offsets = module_file_counts.iter().map(|c| c.get() as usize).sum();
95
96 // At this point, we could scan module_file_starts + module_file_counts and validate that
97 // no entry exceeds num_file_offsets.
98
99 let file_name_offsets = p.slice(num_file_offsets)?;
100 let names_buffer = p.into_rest();
101
102 Ok(Self {
103 module_file_starts,
104 module_file_counts,
105 file_name_offsets,
106 names_buffer,
107 })
108 }
109
110 /// Given a source file index, returns the source file name.
111 pub fn get_source_file_name(&self, source_file_index: usize) -> Result<&'a BStr, ParserError> {
112 let offset = self.file_name_offsets[source_file_index].get();
113 self.get_source_file_name_at(offset)
114 }
115
116 /// Given a file name offset (within `name_buffer`), returns the source file name.
117 pub fn get_source_file_name_at(&self, file_name_offset: u32) -> Result<&'a BStr, ParserError> {
118 let Some(string_data) = self.names_buffer.get(file_name_offset as usize..) else {
119 return Err(ParserError);
120 };
121 let mut p = Parser::new(string_data);
122 let file_name = p.strz()?;
123 Ok(file_name)
124 }
125
126 /// Caller is expected to validate module_index (against `num_modules()`) before calling
127 pub fn name_offsets_for_module(&self, module_index: usize) -> anyhow::Result<&[U32<LE>]> {
128 let start = self.module_file_starts[module_index].get() as usize;
129 let count = self.module_file_counts[module_index].get() as usize;
130 let Some(s) = self.file_name_offsets.get(start..start + count) else {
131 bail!(
132 "File name offsets for module #{module_index} are invalid. start: {start}, count: {count}, len available: {}",
133 self.file_name_offsets.len()
134 );
135 };
136 Ok(s)
137 }
138
139 /// Iterates source files in the DBI Sources Substream.
140 pub fn iter_sources(&self) -> IterSources<'_> {
141 IterSources {
142 names_buffer: self.names_buffer,
143 file_name_offsets: self.file_name_offsets.iter(),
144 }
145 }
146
147 /// Builds a HashMap that maps from file name offsets to strings.
148 pub fn sources_map(&self) -> anyhow::Result<HashMap<u32, &BStr>> {
149 let mut unique_offsets: Vec<u32> = self.file_name_offsets.iter().map(|i| i.get()).collect();
150 unique_offsets.sort_unstable();
151 unique_offsets.dedup();
152
153 let mut map = HashMap::new();
154 for &offset in unique_offsets.iter() {
155 let name = self.get_source_file_name_at(offset)?;
156 map.insert(offset, name);
157 }
158
159 Ok(map)
160 }
161}
162
163/// Iterates source files in the DBI Sources Substream.
164pub struct IterSources<'a> {
165 names_buffer: &'a [u8],
166 file_name_offsets: std::slice::Iter<'a, U32<LE>>,
167}
168
169impl<'a> Iterator for IterSources<'a> {
170 /// name_offset (in bytes), name
171 type Item = (u32, &'a BStr);
172
173 fn next(&mut self) -> Option<Self::Item> {
174 let offset = self.file_name_offsets.next()?.get();
175 let mut p = Parser::new(self.names_buffer);
176 p.skip(offset as usize).ok()?;
177 let name = p.strz().ok()?;
178 Some((offset, name))
179 }
180}
181
#[cfg(test)]
#[rustfmt::skip]
static TEST_SOURCES_DATA: &[u8] = &[
    // Header. The leading comment on each row is the byte offset of that row in this array.
    /* 0x0000 */ 4, 0,                          // num_modules = 4
    /* 0x0002 */ 0xee, 0xee,                    // obsolete num_sources (bogus)
    /* 0x0004 */ 0, 0, 2, 0, 3, 0, 3, 0,        // module_file_starts = [0, 2, 3, 3]
    /* 0x000c */ 2, 0, 1, 0, 0, 0, 3, 0,        // module_file_counts = [2, 1, 0, 3], sum = 6

    // file_name_offsets: 6 items, 24 bytes. Module 2 contributes no entries.
    /* 0x0014 */ 0x00, 0, 0, 0,                 // module 0, file_offsets[0] = 0x00, points to "foo.c"
    /* 0x0018 */ 0x14, 0, 0, 0,                 // module 0, file_offsets[1] = 0x14, points to "windows.h"
    /* 0x001c */ 0x06, 0, 0, 0,                 // module 1, file_offsets[2] = 0x06, points to "bar.rs"
    /* 0x0020 */ 0x00, 0, 0, 0,                 // module 3, file_offsets[3] = 0x00, points to "foo.c"
    /* 0x0024 */ 0x14, 0, 0, 0,                 // module 3, file_offsets[4] = 0x14, points to "windows.h"
    /* 0x0028 */ 0x0d, 0, 0, 0,                 // module 3, file_offsets[5] = 0x0d, points to "main.c"

    // names_buffer: one NUL-terminated string per row. Name offsets are relative to 0x002c.
    /* 0x002c */ b'f', b'o', b'o', b'.', b'c', 0,                               // name offset 0x00 : "foo.c"
    /* 0x0032 */ b'b', b'a', b'r', b'.', b'r', b's', 0,                         // name offset 0x06 : "bar.rs"
    /* 0x0039 */ b'm', b'a', b'i', b'n', b'.', b'c', 0,                         // name offset 0x0d : "main.c"
    /* 0x0040 */ b'w', b'i', b'n', b'd', b'o', b'w', b's', b'.', b'h', 0,       // name offset 0x14 : "windows.h"
    /* 0x004a */ 0, 0,                                                          // trailing padding

    /* 0x004c : end */
];
215
/// Parses the sample substream and checks the header arrays, the name lookups by index, the
/// per-module slice for module 0, and the bounds check on lookups by offset.
#[test]
fn basic_parse() {
    let sources = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();
    assert_eq!(sources.num_modules(), 4);
    assert_eq!(sources.file_name_offsets.len(), 6);

    let starts: Vec<u16> = sources.module_file_starts.iter().map(|v| v.get()).collect();
    assert_eq!(starts, [0u16, 2, 3, 3]);

    let counts: Vec<u16> = sources.module_file_counts.iter().map(|v| v.get()).collect();
    assert_eq!(counts, [2u16, 1, 0, 3]);

    let offsets: Vec<u32> = sources.file_name_offsets.iter().map(|v| v.get()).collect();
    assert_eq!(offsets, [0x00u32, 0x14, 0x06, 0x00, 0x14, 0x0d]);

    // Read the file names. Remember that there are duplicates in this list.
    let expected_names = ["foo.c", "windows.h", "bar.rs", "foo.c", "windows.h", "main.c"];
    for (index, expected) in expected_names.iter().enumerate() {
        assert_eq!(
            sources.get_source_file_name(index).unwrap(),
            *expected,
            "source file index {index}"
        );
    }

    // Module 0 owns the first two entries of the offsets table.
    let module0_offsets = sources.name_offsets_for_module(0).unwrap();
    assert_eq!(module0_offsets.len(), 2);
    assert_eq!(module0_offsets[0].get(), 0);
    assert_eq!(module0_offsets[1].get(), 0x14);

    // Test bounds check on get_source_file_name_at()
    assert!(sources.get_source_file_name_at(0xeeee).is_err());
}
248
/// Checks that `iter_sources()` yields every (offset, name) pair in table order, duplicates
/// included.
#[test]
fn test_iter_sources() {
    use bstr::ByteSlice;

    let parsed = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();

    let mut actual: Vec<(u32, &str)> = Vec::new();
    for (offset, name) in parsed.iter_sources() {
        actual.push((offset, name.to_str().unwrap()));
    }

    let expected: [(u32, &str); 6] = [
        (0x00, "foo.c"),
        (0x14, "windows.h"),
        (0x06, "bar.rs"),
        (0x00, "foo.c"),
        (0x14, "windows.h"),
        (0x0d, "main.c"),
    ];
    assert_eq!(actual, expected);
}
272
/// Checks that `sources_map()` contains exactly one entry per unique file name offset.
#[test]
fn test_sources_map() {
    let parsed = DbiSourcesSubstream::parse(TEST_SOURCES_DATA).unwrap();
    let map = parsed.sources_map().unwrap();

    let expected = [
        (0x00u32, "foo.c"),
        (0x06, "bar.rs"),
        (0x0d, "main.c"),
        (0x14, "windows.h"),
    ];

    assert_eq!(map.len(), expected.len()); // 4 unique file names
    for (offset, name) in expected {
        assert_eq!(*map.get(&offset).unwrap(), name);
    }
}