ms_pdb_msf/lib.rs
1//! Reads and writes Multi-Stream Files (MSF). MSF is the underlying container format used by
2//! Program Database (PDB) files.
3//!
4//! MSF files contain a set of numbered _streams_. Each stream is like a file; a stream is a
5//! sequence of bytes.
6//!
7//! The bytes stored within a single stream are usually not stored sequentially on disk. The
8//! organization of the disk file and the mapping from stream locations to MSF file locations is
9//! similar to a traditional file system; managing that mapping is the main purpose of the MSF
10//! file format.
11//!
12//! MSF files are used as the container format for Program Database (PDB) files. PDB files are used
13//! by compilers, debuggers, and other tools when targeting Windows.
14//!
15//! Most developers should not use this crate directly. This crate is a building block for tools
16//! that read and write PDBs. This crate does not provide any means for building or parsing the
17//! data structures of PDB files; it only handles storing files in the MSF container format.
18//!
19//! The `mspdb` crate uses this crate for reading and writing PDB files. It provides an interface
20//! for reading PDB data structures, and in some cases for creating or modifying them. Most
21//! developers should use `mspdb` instead of using `msf` directly.
22//!
23//! # References
24//!
25//! * [The MSF File Format](https://llvm.org/docs/PDB/MsfFile.html)
26//! * [The PDB File Format](https://llvm.org/docs/PDB/index.html)
27//! * [`microsoft-pdb` repository](https://github.com/microsoft/microsoft-pdb): Many of the comments
28//! in this Rust crate reference C++ source files and header files from this `microsoft-pdb`
29//! repository. If a C++ file is referenced in a comment without more context, such as `dbi.h`,
30//! then check for it in the `microsoft-pdb` repository.
31
32#![forbid(unused_must_use)]
33#![forbid(unsafe_code)]
34#![warn(missing_docs)]
35#![allow(clippy::collapsible_if)]
36#![allow(clippy::needless_late_init)]
37#![allow(clippy::needless_lifetimes)]
38
39mod check;
40mod commit;
41mod open;
42mod pages;
43mod read;
44mod stream_reader;
45mod stream_writer;
46mod write;
47
48#[cfg(test)]
49mod tests;
50
51pub use open::CreateOptions;
52pub use stream_reader::StreamReader;
53pub use stream_writer::StreamWriter;
54
55use anyhow::bail;
56use bitvec::prelude::{BitVec, Lsb0};
57use pow2::{IntOnlyPow2, Pow2};
58use std::collections::HashMap;
59use std::fs::{File, OpenOptions};
60use std::io::{Read, Seek, SeekFrom};
61use std::mem::size_of;
62use std::path::Path;
63use sync_file::{RandomAccessFile, ReadAt, WriteAt};
64use zerocopy::{FromBytes, FromZeros, Immutable, IntoBytes, KnownLayout, LE, U16, U32, Unaligned};
65
66use self::pages::{PageAllocator, num_pages_for_stream_size};
67
68/// Identifies a page number in the MSF file. Not to be confused with `StreamPage`.
69type Page = u32;
70
71/// Identifies a page within a stream. `StreamPage` can be translated to `Page` by using the
72/// stream page mapper.
73type StreamPage = u32;
74
/// The first of the two FPM numbers. The `active_fpm` field of [`MsfHeader`] is documented as
/// only ever holding 1 or 2; this constant names the value 1.
const FPM_NUMBER_1: u32 = 1;
/// The second of the two FPM numbers; names the value 2. See [`FPM_NUMBER_1`].
const FPM_NUMBER_2: u32 = 2;
77
78/// The value of `magic` for "big" MSF files.
79const MSF_BIG_MAGIC: [u8; 32] = *b"Microsoft C/C++ MSF 7.00\r\n\x1a\x44\x53\x00\x00\x00";
80
81/// This identifies MSF files before the transition to "big" MSF files.
82const MSF_SMALL_MAGIC: [u8; 0x2c] = *b"Microsoft C/C++ program database 2.00\r\n\x1a\x4a\x47\0\0";
83
// Debugging aid: prints a hex dump of both magic byte strings so that they can be inspected by
// eye (run with `--nocapture` to see the output). This test makes no assertions.
#[test]
fn show_magics() {
    use pretty_hex::PrettyHex;

    println!("MSF_SMALL_MAGIC:");
    println!("{:?}", MSF_SMALL_MAGIC.hex_dump());

    println!("MSF_BIG_MAGIC:");
    println!("{:?}", MSF_BIG_MAGIC.hex_dump());
}
94
/// The header of the PDB/MSF file, before the transition to "big" MSF files.
/// This is at file offset 0.
///
/// The struct is `repr(C)` and built only from unaligned little-endian fields, so its in-memory
/// layout is the on-disk layout. Note that `active_fpm` and `num_pages` are 16-bit here, unlike
/// the 32-bit fields of the same name in [`MsfHeader`].
#[allow(missing_docs)]
#[derive(IntoBytes, FromBytes, Unaligned, Immutable, KnownLayout)]
#[repr(C)]
struct SmallMsfHeader {
    /// Identifies this file as a PDB. Value must be [`MSF_SMALL_MAGIC`].
    magic: [u8; 0x2c],
    // NOTE(review): the meanings of the next four fields are presumably the same as the
    // same-named fields of `MsfHeader` (page size in bytes, active FPM number, total page count,
    // stream directory size in bytes) — confirm against the Small MSF reader.
    page_size: U32<LE>,
    active_fpm: U16<LE>,
    num_pages: U16<LE>,
    stream_dir_size: U32<LE>,
    /// This field contains a pointer to an in-memory data structure, and hence is meaningless.
    /// Decoders should ignore this field. Encoders should set this field to 0.
    stream_dir_ptr: U32<LE>,
}
111
/// The header of the PDB/MSF file. This is at file offset 0.
///
/// The struct is `repr(C)` and built only from unaligned little-endian fields, so its in-memory
/// layout is the on-disk layout. Its size is 52 bytes (see `MSF_HEADER_LEN`).
#[derive(IntoBytes, FromBytes, Unaligned, Immutable, KnownLayout)]
#[repr(C)]
struct MsfHeader {
    /// Identifies this file as a PDB. For "big" MSF files the value is [`MSF_BIG_MAGIC`].
    magic: [u8; 32],

    /// The size of each page, in bytes.
    page_size: U32<LE>,

    /// Page number of the active FPM. This can only be 1 or 2. In the C++ implementation (in the
    /// `microsoft-pdb` repository), this is `pnFpm`.
    active_fpm: U32<LE>,

    /// The number of pages in this MSF file. In the C++ implementation, this is `pnMac`.
    num_pages: U32<LE>,

    /// Size of the Stream Directory, in bytes. In the C++ implementation, this is `siSt.cb`.
    stream_dir_size: U32<LE>,

    /// The page which contains the Stream Directory Map. This page contains a list of pages
    /// which contain the Stream Directory.
    ///
    /// This field is only used for "Small MSF" (pre-"Big MSF") encoding. When using Big MSF,
    /// this field is expected to be zero.
    ///
    /// In the C++ implementation, this is `mpspnpn` (map of stream page number to page number).
    stream_dir_small_page_map: U32<LE>,
    // When using "Big MSF", there is an array of u32 values that immediately follows
    // the MsfHeader. The size of the array is a function of stream_dir_size and num_pages:
    //
    //     divide_round_up(divide_round_up(stream_dir_size, num_pages) * 4), num_pages)
    //
    // NOTE(review): the formula above has unbalanced parentheses, and dividing by `num_pages`
    // looks suspicious; `page_size` may have been intended. Confirm against the code that reads
    // the stream directory page map before relying on it.
    //
    // pub stream_dir_big_page_map: [U32<LE>],
}
147
148/// The length of the MSF File Header.
149const MSF_HEADER_LEN: usize = size_of::<MsfHeader>();
150
151/// The byte offset of the stream directory page map. This is a small array of page indices that
152/// point to pages that contain the stream directory. This is used only with the Big MSF encoding.
153const STREAM_DIR_PAGE_MAP_FILE_OFFSET: u64 = MSF_HEADER_LEN as u64;
154static_assertions::const_assert_eq!(MSF_HEADER_LEN, 52);
155
156/// The minimum page size.
157pub const MIN_PAGE_SIZE: PageSize = PageSize::from_exponent(9);
158
159/// The default page size.
160pub const DEFAULT_PAGE_SIZE: PageSize = PageSize::from_exponent(12);
161
162/// A large page size. This is less than the largest supported page size.
163pub const LARGE_PAGE_SIZE: PageSize = PageSize::from_exponent(13);
164
165/// The largest supported page size.
166pub const MAX_PAGE_SIZE: PageSize = PageSize::from_exponent(16);
167
168/// This size is used to mark a stream as "invalid". An invalid stream is different from a
169/// stream with a length of zero bytes.
170pub const NIL_STREAM_SIZE: u32 = 0xffff_ffff;
171
172/// Specifies a page size used in an MSF file. This value is always a power of 2.
173pub type PageSize = Pow2;
174
175/// The stream index of the Stream Directory stream. This is reserved and cannot be used by
176/// applications.
177pub const STREAM_DIR_STREAM: u32 = 0;
178
/// The maximum valid stream index.
///
/// Although this library uses `u32` for stream indices, many MSF and PDB data structures assume
/// that stream numbers can be stored in `u16`. Also, `0xffff` is used as a sentinel value in
/// some data structures, so that value cannot be a valid stream index.
pub const MAX_STREAM: u32 = 0xfffe;
185
186/// Converts a page number to a file offset.
187fn page_to_offset(page: u32, page_size: PageSize) -> u64 {
188 (page as u64) << page_size.exponent()
189}
190
191/// Given an interval number, returns the page number of the first page of the interval.
192fn interval_to_page(interval: u32, page_size: PageSize) -> u32 {
193 interval << page_size.exponent()
194}
195
196/// Gets the byte offset within a page, for a given offset within a stream.
197pub fn offset_within_page(offset: u32, page_size: PageSize) -> u32 {
198 let page_low_mask = (1u32 << page_size.exponent()) - 1u32;
199 offset & page_low_mask
200}
201
/// Allows reading and writing the contents of a PDB/MSF file.
///
/// The [`Msf::open`] function opens an MSF file for read access, given a file. This is the most
/// commonly-used way to open a file.
///
/// The type parameter `F` is the underlying data source; it defaults to [`RandomAccessFile`].
pub struct Msf<F = RandomAccessFile> {
    /// The data source.
    file: F,

    /// Which MSF encoding (Small or Big) this file uses.
    kind: MsfKind,

    /// The FPM number for the committed (active) FPM.
    ///
    /// The `commit()` function can change this number.
    active_fpm: u32,

    /// Contains the sizes of all streams. The length of `stream_sizes` implicitly defines
    /// the number of streams.
    ///
    /// Values in this vector may be [`NIL_STREAM_SIZE`], indicating that the stream is present
    /// but is a nil stream.
    ///
    /// As streams are modified, this vector changes. It contains a combination of both committed
    /// and uncommitted state.
    stream_sizes: Vec<u32>,

    /// The maximum number of streams that we will allow to be created using `new_stream` or
    /// `nil_stream`. The default value is 0xfffe, which prevents overflowing the 16-bit stream
    /// indexes that are used in PDB (or confusing them with the "nil" stream index).
    max_streams: u32,

    /// Contains the page numbers for all streams in the committed state.
    committed_stream_pages: Vec<Page>,

    /// Vector contains offsets into `committed_stream_pages` where the pages for a given stream start.
    /// It is indexed by stream number.
    committed_stream_page_starts: Vec<u32>,

    /// Handles allocating pages. Also holds the page size, the total page count, and the FPM
    /// for this file.
    pages: PageAllocator,

    /// If a stream has been modified then there is an entry in this table for it. The key for
    /// each entry is the stream number. The value is the sequence of pages for that stream.
    ///
    /// One of the side-effects of the [`Msf::commit`] function is that the `modified_streams`
    /// table is cleared.
    ///
    /// This table is always empty if `access_mode == AccessMode::Read`.
    modified_streams: HashMap<u32, Vec<Page>>,

    /// Whether this file was opened read-only or read-write. Reported to callers by
    /// `is_writable()`.
    access_mode: AccessMode,
}
252
/// Identifies which encoding (version) of the MSF container format a file uses.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum MsfKind {
    /// The obsolete, pre-Big MSF encoding. This library does not support creating or modifying
    /// MSF files that use this encoding, but it does support reading them.
    Small,
    /// The Big MSF encoding, which is the encoding currently used by most tools that target
    /// Windows.
    Big,
}
263
/// Specifies the access mode for opening a PDB/MSF file.
///
/// This type is private; callers observe it through `Msf::is_writable`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum AccessMode {
    /// Read-only access
    Read,
    /// Read-write access
    ReadWrite,
}
272
273impl<F> Msf<F> {
274 /// Returns the page size used for this file.
275 pub fn page_size(&self) -> PageSize {
276 self.pages.page_size
277 }
278
279 /// Gets access to the stream page pointers for a given stream. The stream page pointers
280 /// provide the mapping from offsets within a stream to offsets within the entire PDB (MSF) file.
281 ///
282 /// If the stream is a NIL stream, then this returns `(NIL_STREAM_SIZE, &[])`.
283 pub fn stream_size_and_pages(&self, stream: u32) -> Result<(u32, &[Page]), anyhow::Error> {
284 let Some(&stream_size) = self.stream_sizes.get(stream as usize) else {
285 bail!("Stream index is out of range. Index: {stream}");
286 };
287
288 if stream_size == NIL_STREAM_SIZE {
289 // This is a NIL stream.
290 return Ok((NIL_STREAM_SIZE, &[]));
291 }
292
293 // The stream index is valid and the stream is not a NIL stream.
294 let num_stream_pages =
295 num_pages_for_stream_size(stream_size, self.pages.page_size) as usize;
296
297 if num_stream_pages == 0 {
298 // The stream is valid (is not nil) and is a zero-length stream.
299 // It has no pages assigned to it.
300 return Ok((0, &[]));
301 }
302
303 // If this stream has been modified, then return the modified page list.
304 if let Some(pages) = self.modified_streams.get(&stream) {
305 assert_eq!(num_stream_pages, pages.len());
306 return Ok((stream_size, pages.as_slice()));
307 }
308
309 let start = self.committed_stream_page_starts[stream as usize] as usize;
310 let pages = &self.committed_stream_pages[start..start + num_stream_pages];
311 Ok((stream_size, pages))
312 }
313
314 /// The total number of streams in this PDB, including nil streams.
315 pub fn num_streams(&self) -> u32 {
316 self.stream_sizes.len() as u32
317 }
318
319 /// Gets the size of a given stream, in bytes.
320 ///
321 /// The `stream` value must be in a valid range of `0..num_streams()`.
322 ///
323 /// If `stream` is a NIL stream, this function returns 0.
324 pub fn stream_size(&self, stream: u32) -> u32 {
325 assert!((stream as usize) < self.stream_sizes.len());
326 let stream_size = self.stream_sizes[stream as usize];
327 if stream_size == NIL_STREAM_SIZE {
328 0
329 } else {
330 stream_size
331 }
332 }
333
334 /// Indicates whether a given stream index is valid.
335 pub fn is_valid_stream_index(&self, stream: u32) -> bool {
336 (stream as usize) < self.stream_sizes.len()
337 }
338
339 /// Indicates that a stream index is valid, and that its length is valid.
340 pub fn is_stream_valid(&self, stream: u32) -> bool {
341 if stream != 0 && (stream as usize) < self.stream_sizes.len() {
342 self.stream_sizes[stream as usize] != NIL_STREAM_SIZE
343 } else {
344 false
345 }
346 }
347
348 /// Return the nominal length of this file, in bytes.
349 ///
350 /// This is the number of pages multiplied by the page size. It is not guaranteed to be equal to
351 /// the on-disk size of the file, but in practice it usually is.
352 pub fn nominal_size(&self) -> u64 {
353 page_to_offset(self.pages.num_pages, self.pages.page_size)
354 }
355
356 /// Return the total number of pages allocated to the file, including all pages (allocated,
357 /// unallocated, etc.).
358 ///
359 /// This count includes pages allocated to streams, Page 0, FPM pages, pages that are free
360 /// (not allocated), and pages allocated to the Stream Directory.
361 pub fn num_total_pages(&self) -> u32 {
362 self.pages.num_pages
363 }
364
365 /// Returns the number of free pages.
366 ///
367 /// This number counts the pages that are _less than_ `num_pages`. There may be pages assigned
368 /// to the MSF file beyond `num_pages`, but if there are then this does not count that space.
369 ///
370 /// This value does not count Page 0, pages assigned to the FPM, streams, or the current
371 /// Stream Directory. It does count pages assigned to the old stream directory.
372 pub fn num_free_pages(&self) -> u32 {
373 self.pages.fpm.count_ones() as u32
374 }
375
376 /// Extracts the underlying file for this MSF. **All pending modifications are dropped**.
377 pub fn into_file(self) -> F {
378 self.file
379 }
380
381 /// Gets access to the contained file
382 pub fn file(&self) -> &F {
383 &self.file
384 }
385
386 /// Gets mutable access to the contained file
387 pub fn file_mut(&mut self) -> &mut F {
388 &mut self.file
389 }
390
391 /// Indicates whether this [`Msf`] was opened for read/write access.
392 pub fn is_writable(&self) -> bool {
393 self.access_mode == AccessMode::ReadWrite
394 }
395}
396
397impl<F: ReadAt> Msf<F> {
398 /// Reads a portion of a stream to a vector.
399 pub fn read_stream_section_to_box(
400 &self,
401 stream: u32,
402 start: u32,
403 size: u32,
404 ) -> anyhow::Result<Box<[u8]>>
405 where
406 F: ReadAt,
407 {
408 let reader = self.get_stream_reader(stream)?;
409 let mut stream_data = FromZeros::new_box_zeroed_with_elems(size as usize)
410 .map_err(|_| std::io::Error::from(std::io::ErrorKind::OutOfMemory))?;
411 reader.read_exact_at(&mut stream_data, u64::from(start))?;
412 Ok(stream_data)
413 }
414
415 /// Reads the entire stream into a `Box<[u8]>`.
416 pub fn read_stream_to_box(&self, stream: u32) -> anyhow::Result<Box<[u8]>> {
417 let reader = self.get_stream_reader(stream)?;
418 let mut stream_data = FromZeros::new_box_zeroed_with_elems(reader.len() as usize)
419 .map_err(|_| std::io::Error::from(std::io::ErrorKind::OutOfMemory))?;
420 reader.read_exact_at(&mut stream_data, 0)?;
421 Ok(stream_data)
422 }
423
424 /// Reads an entire stream to a vector.
425 pub fn read_stream_to_vec(&self, stream: u32) -> anyhow::Result<Vec<u8>> {
426 let stream_data = self.read_stream_to_box(stream)?;
427
428 // This conversion does not reallocate data. It just adds a 'capacity' field.
429 Ok(stream_data.into_vec())
430 }
431
432 /// Reads an entire stream into an existing vector.
433 pub fn read_stream_to_vec_mut(
434 &self,
435 stream: u32,
436 stream_data: &mut Vec<u8>,
437 ) -> anyhow::Result<()> {
438 let reader = self.get_stream_reader(stream)?;
439
440 // Do not clear and resize. Doing so requires zeroing all the data in the vector.
441 // Since we are going to read into the vector, that means we would modify every byte twice.
442 // That's expensive when you're working with a lot of data.
443 stream_data.resize(reader.len() as usize, 0);
444
445 reader.read_exact_at(stream_data, 0)?;
446 Ok(())
447 }
448
449 /// Returns an object which can read from a given stream. The returned object implements
450 /// the [`Read`], [`Seek`], and [`ReadAt`] traits.
451 pub fn get_stream_reader(&self, stream: u32) -> anyhow::Result<StreamReader<'_, F>>
452 where
453 F: ReadAt,
454 {
455 let (stream_size, stream_pages) = self.stream_size_and_pages(stream)?;
456 Ok(StreamReader::new(
457 self,
458 stream,
459 stream_size,
460 stream_pages,
461 0,
462 ))
463 }
464}
465
466/// Checks whether the header of a file appears to be a valid MSF file.
467///
468/// This only looks at the signature; it does not read anything else in the file. This is useful
469/// for quickly determining whether a file could be an MSF file, but without any validation.
470pub fn is_file_header_msf(header: &[u8]) -> bool {
471 header.starts_with(&MSF_BIG_MAGIC) || header.starts_with(&MSF_SMALL_MAGIC)
472}
473
474/// The absolute minimum size of a slice that could contain a valid MSF file header, as tested by
475/// [`is_file_header_msf`].
476///
477/// This does not specify the minimum valid size of an MSF file. It is only a recommended minimum
478/// for callers of [`is_file_header_msf`].
479pub const MIN_FILE_HEADER_SIZE: usize = 0x100;
480
/// Configures `options` for opening a file that other processes may still read.
///
/// On Windows this sets the share mode to `FILE_SHARE_READ`. On other platforms `options` is
/// returned unchanged.
#[doc(hidden)]
pub fn open_options_shared(options: &mut OpenOptions) -> &mut OpenOptions {
    #[cfg(windows)]
    let options = {
        use std::os::windows::fs::OpenOptionsExt;
        const FILE_SHARE_READ: u32 = 1;
        options.share_mode(FILE_SHARE_READ)
    };

    options
}
494
/// Configures `options` for opening a file with no sharing.
///
/// On Windows this sets the share mode to 0. On other platforms `options` is returned
/// unchanged.
#[doc(hidden)]
pub fn open_options_exclusive(options: &mut OpenOptions) -> &mut OpenOptions {
    #[cfg(windows)]
    let options = {
        use std::os::windows::fs::OpenOptionsExt;
        options.share_mode(0)
    };

    options
}