ms_pdb_msf/
open.rs

1//! Code for opening or creating MSF files.
2
3use super::*;
4use sync_file::RandomAccessFile;
5use tracing::{trace, trace_span, warn};
6use zerocopy::IntoBytes;
7
/// Options for creating a new PDB/MSF file.
///
/// Use [`Default`] to obtain the standard settings (the default page size and a
/// stream-count limit that is compatible with existing PDB tools).
#[derive(Clone, Debug)]
pub struct CreateOptions {
    /// The page size to use. This must be in the range [`MIN_PAGE_SIZE..=MAX_PAGE_SIZE`].
    pub page_size: PageSize,

    /// The maximum number of streams that we will allow to be created using `new_stream` or
    /// `nil_stream`. The default value is 0xfffe, which prevents overflowing the 16-bit stream
    /// indexes that are used in PDB (or confusing them with the "nil" stream index).
    ///
    /// Applications may increase this value beyond the default, but this will produce MSF files
    /// that are not usable by most PDB tools.
    pub max_streams: u32,
}
22
/// The maximum number of streams that PDB can tolerate.
///
/// Stream indexes in PDB are 16-bit, and 0xffff is reserved as the "nil" stream
/// index, so 0xfffe is the largest count that cannot overflow or be confused
/// with nil.
const DEFAULT_MAX_STREAMS: u32 = 0xfffe;
25
26impl Default for CreateOptions {
27    fn default() -> Self {
28        Self {
29            page_size: DEFAULT_PAGE_SIZE,
30            max_streams: DEFAULT_MAX_STREAMS,
31        }
32    }
33}
34
35impl Msf<RandomAccessFile> {
36    /// Opens an MSF file for read access, given a file name.
37    pub fn open(file_name: &Path) -> anyhow::Result<Self> {
38        let file = File::open(file_name)?;
39        let random_file = RandomAccessFile::from(file);
40        Self::new_with_access_mode(random_file, AccessMode::Read)
41    }
42
43    /// Creates a new MSF file on disk (**truncating any existing file!**) and creates a new
44    /// [`Msf`] object in-memory object with read/write access.
45    ///
46    /// This function does not write anything to disk until stream data is written or
47    /// [`Self::commit`] is called.
48    pub fn create(file_name: &Path, options: CreateOptions) -> anyhow::Result<Self> {
49        let file = File::create(file_name)?;
50        let random_file = RandomAccessFile::from(file);
51        Self::create_with_file(random_file, options)
52    }
53
54    /// Opens an existing MSF file for read/write access, given a file name.
55    pub fn modify(file_name: &Path) -> anyhow::Result<Self> {
56        let file = File::options().read(true).write(true).open(file_name)?;
57        let random_file = RandomAccessFile::from(file);
58        Self::modify_with_file(random_file)
59    }
60}
61
impl<F: ReadAt> Msf<F> {
    /// Opens an MSF file for read access, given a [`File`] that has already been opened.
    pub fn open_with_file(file: F) -> anyhow::Result<Self> {
        Self::new_with_access_mode(file, AccessMode::Read)
    }

    /// Creates a new MSF file, given a file handle that has already been opened.
    ///
    /// **This function destroys the contents of the existing file.**
    pub fn create_with_file(file: F, options: CreateOptions) -> anyhow::Result<Self> {
        Self::create_for(file, options)
    }

    /// Opens an existing MSF file for read/write access, given an [`File`] that has already
    /// been opened.
    ///
    /// The `file` handle will be used for absolute reads and writes. The caller should never use
    /// this same file handle for reads (and especially not for writes) while also using [`Msf`]
    /// because the operating system's read/write file position may be updated by [`Msf`].
    pub fn modify_with_file(file: F) -> anyhow::Result<Self> {
        Self::new_with_access_mode(file, AccessMode::ReadWrite)
    }

    /// Reads the header of a PDB file and provides access to the streams contained within the
    /// PDB file.
    ///
    /// This function reads the MSF File Header, which is the header for the entire file.
    /// It also reads the stream directory, so it knows how to find each of the streams
    /// and the pages of the streams.
    ///
    /// Both header flavors are recognized ("Big MSF" and the obsolete "Small MSF"). Before
    /// returning, the FPM computed from the stream directory is compared against the FPM
    /// stored on disk, and read-write access is refused for Small MSF files.
    fn new_with_access_mode(file: F, access_mode: AccessMode) -> anyhow::Result<Self> {
        // Read the MSF File Header.

        let _span = trace_span!("Msf::new_with_access_mode").entered();

        const MIN_PAGE_SIZE_USIZE: usize = 1usize << MIN_PAGE_SIZE.exponent();

        let mut page0: [u8; MIN_PAGE_SIZE_USIZE] = [0; MIN_PAGE_SIZE_USIZE];

        // If this read fails, then the file is too small to be a valid PDB of any kind.
        file.read_exact_at(&mut page0, 0)?;

        // Header fields common to both MSF header flavors, extracted below.
        let msf_kind: MsfKind;
        let page_size: u32;
        let active_fpm: u32;
        let num_pages: u32;
        let stream_dir_size: u32;

        if page0.starts_with(&MSF_BIG_MAGIC) {
            // unwrap() cannot fail because page0 has a fixed size that is larger than MsfHeader
            let (msf_header, _) = MsfHeader::ref_from_prefix(page0.as_slice()).unwrap();
            page_size = msf_header.page_size.get();
            active_fpm = msf_header.active_fpm.get();
            num_pages = msf_header.num_pages.get();
            stream_dir_size = msf_header.stream_dir_size.get();
            msf_kind = MsfKind::Big;

            // The active FPM can only be 1 or 2.
            if !matches!(active_fpm, 1 | 2) {
                bail!("The PDB header is invalid.  The active FPM is invalid.");
            }
        } else if page0.starts_with(&MSF_SMALL_MAGIC) {
            // Found an "old" MSF header.
            // unwrap() cannot fail because page0 has a fixed size that is larger than SmallMsfHeader
            let (msf_header, _) = SmallMsfHeader::ref_from_prefix(page0.as_slice()).unwrap();
            page_size = msf_header.page_size.get();
            active_fpm = msf_header.active_fpm.get() as u32;
            num_pages = msf_header.num_pages.get() as u32;
            stream_dir_size = msf_header.stream_dir_size.get();
            msf_kind = MsfKind::Small;
            // NOTE(review): unlike the Big MSF path above, active_fpm is not validated
            // here -- confirm whether old files can legitimately carry other values.
        } else if page0[16..24] == *b"PDB v1.0" {
            bail!("This file is a Portable PDB, which is not supported.");
        } else {
            bail!("PDB file does not have the correct header (magic is wrong).");
        }

        let Ok(page_size_pow2) = PageSize::try_from(page_size) else {
            bail!("The PDB header is invalid. The page size ({page_size}) is not a power of 2.",);
        };

        if num_pages == 0 {
            bail!("PDB specifies invalid value for num_pages (zero).");
        }

        let mut stream_sizes: Vec<u32>;

        // The number of pages in the stream directory.
        let stream_dir_num_pages = stream_dir_size.div_round_up(page_size_pow2);

        // Create the PageAllocator. This initializes the fpm vector to "everything is free"
        // and then sets Page 0 and the FPM pages as "free". Nothing is marked as "freed".
        let mut page_allocator = PageAllocator::new(num_pages as usize, page_size_pow2);

        // Flattened page table: the pages of stream i end up in
        // committed_stream_pages[starts[i]..starts[i + 1]].
        let mut committed_stream_pages: Vec<Page>;
        let mut committed_stream_page_starts: Vec<u32>;

        match msf_kind {
            MsfKind::Big => {
                // "Big MSF" uses a 3-level hierarchy for the Stream Directory:
                //
                // stream_dir_map        <-- contains u32 pages to ↓
                // stream_dir_pages      <-- contains u32 pages to ↓
                // stream_dir_bytes      <-- bottom level, stored in pages
                //
                // stream_dir_map is an array of u32 page pointers. It is stored directly in
                // page 0, immediately after MsfHeader. These pointers point to pages that contain
                // the stream_dir_pages, which is the next level down.
                // The number of pages allocated to stream_dir_map = ceil(stream_dir_pages.len() * 4 / page_size).
                // The number of bytes used within stream_dir_map = stream_dir_pages.len() * 4.
                //
                // stream_dir_pages is a set of pages. When concatenated, they contain the page
                // pointers that point to the stream directory bytes.
                // The number of pages in stream_dir_pages = ceil(stream_dir_size / page_size).
                // The number of bytes used within stream_dir_pages is stream_dir_pages * 4.

                if stream_dir_size % 4 != 0 {
                    bail!("MSF Stream Directory has an invalid size; it is not a multiple of 4.");
                }

                // We are going to read the stream directory into this vector.
                let mut stream_dir: Vec<U32<LE>> = vec![U32::new(0); stream_dir_size as usize / 4];

                // Read the page map for the stream directory.
                let stream_dir_l1_num_pages =
                    num_pages_for_stream_size(4 * stream_dir_num_pages, page_size_pow2) as usize;
                let Ok((page_map_l1_ptrs, _)) = <[U32<LE>]>::ref_from_prefix_with_elems(
                    &page0[STREAM_DIR_PAGE_MAP_FILE_OFFSET as usize..],
                    stream_dir_l1_num_pages,
                ) else {
                    bail!("Stream dir size is invalid (exceeds design limits)");
                };

                let stream_dir_bytes: &mut [u8] = stream_dir.as_mut_bytes();
                let mut stream_dir_chunks = stream_dir_bytes.chunks_mut(page_size as usize);
                // Now read the stream pages for the stream dir.
                let mut l1_page: Vec<u8> = vec![0; page_size as usize];
                'l1_loop: for &page_map_l1_ptr in page_map_l1_ptrs.iter() {
                    let page_map_l1_ptr: u32 = page_map_l1_ptr.get();

                    page_allocator.init_mark_stream_dir_page_busy(page_map_l1_ptr)?;
                    if is_special_page_big_msf(page_size_pow2, page_map_l1_ptr) {
                        bail!(
                            "Stream dir contains invalid page number: {page_map_l1_ptr}. \
                             Page points to Page 0 or to an FPM page."
                        );
                    }

                    // Read the page pointers.
                    let file_offset = page_to_offset(page_map_l1_ptr, page_size_pow2);
                    file.read_exact_at(l1_page.as_mut_slice(), file_offset)?;

                    // Now read the individual pages, as long as we have more.
                    // NOTE(review): the unwrap assumes page_size is a multiple of 4, which
                    // holds for any power-of-two page size >= 4.
                    let l2_page_u32 = <[U32<LE>]>::ref_from_bytes(l1_page.as_slice()).unwrap();

                    for &l2_page in l2_page_u32.iter() {
                        let l2_page: u32 = l2_page.get();

                        // Stop as soon as the stream directory buffer is full; any
                        // remaining L2 entries are padding in the final map page.
                        let Some(stream_dir_chunk) = stream_dir_chunks.next() else {
                            break 'l1_loop;
                        };

                        page_allocator.init_mark_stream_dir_page_busy(l2_page)?;
                        if is_special_page_big_msf(page_size_pow2, l2_page) {
                            bail!(
                                "Stream dir contains invalid page number: {l2_page}. \
                                 Page points to Page 0 or to an FPM page."
                            );
                        }

                        let l2_file_offset = page_to_offset(l2_page, page_size_pow2);
                        file.read_exact_at(stream_dir_chunk, l2_file_offset)?;
                    }
                }

                if stream_dir.is_empty() {
                    bail!("Stream directory is invalid (zero-length)");
                }

                // The decoded stream directory layout is:
                //   [num_streams, stream_sizes[num_streams], page numbers...]
                let num_streams = stream_dir[0].get() as usize;

                // Stream 0 is special and must exist.
                if num_streams == 0 {
                    bail!("MSF file is invalid, because num_streams = 0.");
                }

                let Some(stream_sizes_src) = stream_dir.get(1..1 + num_streams) else {
                    bail!("Stream directory is invalid (num_streams is not consistent with size)");
                };
                stream_sizes = stream_sizes_src.iter().map(|size| size.get()).collect();

                let mut stream_pages_iter = &stream_dir[1 + num_streams..];

                // Build committed_stream_pages and committed_stream_page_starts.
                committed_stream_pages = Vec::with_capacity(stream_dir.len() - num_streams - 1);
                committed_stream_page_starts = Vec::with_capacity(num_streams + 1);

                for (stream, &stream_size) in stream_sizes_src.iter().enumerate() {
                    committed_stream_page_starts.push(committed_stream_pages.len() as u32);

                    let stream_size = stream_size.get();
                    // Nil streams occupy no pages and get an empty page range.
                    if stream_size != NIL_STREAM_SIZE {
                        let num_stream_pages =
                            num_pages_for_stream_size(stream_size, page_size_pow2) as usize;
                        if num_stream_pages > stream_pages_iter.len() {
                            bail!(
                                "Stream directory is invalid.  Stream {stream} has size {stream_size}, \
                                 which exceeds the size of the stream directory."
                            );
                        }
                        let (this_stream_pages, next) =
                            stream_pages_iter.split_at(num_stream_pages);
                        stream_pages_iter = next;
                        committed_stream_pages.extend(this_stream_pages.iter().map(|p| p.get()));
                    }
                }
                committed_stream_page_starts.push(committed_stream_pages.len() as u32);

                // Now that we have finished reading the stream directory, we set the length
                // of stream 0 (the "Old Stream Directory") to 0. Nothing should ever read Stream 0.
                // If we modify a PDB/MSF file, then we want to write no pages at all for Stream 0.
                // Doing this here is the most convenient way to handle this.
                stream_sizes[0] = 0;
            }

            MsfKind::Small => {
                // Before Big MSF files, the stream directory was stored in a set of pages.
                // These pages were listed directly within page 0. Keep in mind that page numbers
                // are 16-bit in old MSF files.
                let page_pointers_size_bytes = stream_dir_num_pages * 2;

                let mut pages_u16: Vec<U16<LE>> = vec![U16::new(0); stream_dir_num_pages as usize];
                if page_pointers_size_bytes + size_of::<SmallMsfHeader>() as u32 > page_size {
                    bail!(
                        "The MSF header is invalid. The page pointers for the stream directory \
                         exceed the range of the first page. \
                         Stream dir size (in bytes): {stream_dir_size}  Page size: {page_size}"
                    );
                }

                // The page pointer list immediately follows the header within page 0.
                file.read_exact_at(pages_u16.as_mut_bytes(), size_of::<SmallMsfHeader>() as u64)?;

                // Read the pages of the stream directory. Be careful with the last page.
                let mut page_iter = pages_u16.iter();
                let mut old_stream_dir_bytes: Vec<u8> = vec![0; stream_dir_size as usize];
                for stream_dir_chunk in old_stream_dir_bytes.chunks_mut(page_size as usize) {
                    // This unwrap should succeed because we computed the length of pages_u16
                    // based on the byte size of the stream directory.
                    let page = page_iter.next().unwrap().get() as u32;
                    page_allocator.init_mark_stream_dir_page_busy(page)?;
                    file.read_exact_at(stream_dir_chunk, page_to_offset(page, page_size_pow2))?;
                }

                let Ok((header, rest)) =
                    OldMsfStreamDirHeader::read_from_prefix(old_stream_dir_bytes.as_slice())
                else {
                    bail!("Invalid stream directory: too small");
                };

                let num_streams = header.num_streams.get() as usize;
                stream_sizes = Vec::with_capacity(num_streams);

                // Fixed-size entries for all streams come first; the page numbers for each
                // stream follow the entry table.
                let Ok((entries, mut rest)) =
                    <[OldMsfStreamEntry]>::ref_from_prefix_with_elems(rest, num_streams)
                else {
                    bail!("Invalid stream directory: too small")
                };

                for i in 0..num_streams {
                    let stream_size = entries[i].stream_size.get();
                    stream_sizes.push(stream_size);
                }

                committed_stream_page_starts = Vec::with_capacity(num_streams + 1);
                committed_stream_pages = Vec::new(); // TODO: precompute capacity

                for &stream_size in stream_sizes.iter() {
                    committed_stream_page_starts.push(committed_stream_pages.len() as u32);
                    if stream_size != NIL_STREAM_SIZE {
                        let num_pages = stream_size.div_round_up(page_size_pow2);

                        let Ok((pages, r)) =
                            <[U16<LE>]>::ref_from_prefix_with_elems(rest, num_pages as usize)
                        else {
                            bail!("Invalid stream directory: too small");
                        };

                        rest = r; // advance past this stream's page numbers
                        for page in pages.iter() {
                            committed_stream_pages.push(page.get() as u32);
                        }
                    }
                }

                committed_stream_page_starts.push(committed_stream_pages.len() as u32);

                if !rest.is_empty() {
                    warn!(
                        unused_bytes = rest.len(),
                        "old-style stream dir contained unused bytes"
                    );
                }
            }
        }

        // Mark the pages in all streams (except for stream 0) as busy. This will also detect
        // page numbers that are invalid (0 or FPM).
        {
            // pages is the list of the page numbers for all streams (except stream 0).
            let start = committed_stream_page_starts[1] as usize;
            let pages = &committed_stream_pages[start..];
            for &page in pages.iter() {
                page_allocator.init_mark_stream_page_busy(page, 0, 0)?;
            }
        }

        // We have finished building the in-memory FPM, including both the fpm and fpm_freed
        // vectors. We expect that every page is either FREE, BUSY, or DELETED. Check that now.
        page_allocator.check_vector_consistency()?;

        // Read the FPM from disk and compare it to the FPM that we just constructed. They should
        // be identical.
        // TODO: implement for small MSF
        // NOTE(review): this uses the Big MSF FPM layout even when msf_kind is Small --
        // confirm that Small MSF files store their FPM the same way (see TODO above).
        let fpm_on_disk = read_fpm_big_msf(&file, active_fpm, num_pages, page_size_pow2)?;

        assert_eq!(fpm_on_disk.len(), page_allocator.fpm.len()); // because num_pages defines both

        if page_allocator.fpm != fpm_on_disk {
            {
                use tracing::warn;

                warn!("FPM computed from Stream Directory is not equal to FPM found on disk.");
                warn!(
                    "Num pages = {num_pages} (0x{num_pages:x} bytes, bit offset: 0x{:x}:{})",
                    num_pages / 8,
                    num_pages % 8
                );

                // Log every bit that disagrees so the corruption can be diagnosed.
                for i in 0..num_pages as usize {
                    if fpm_on_disk[i] != page_allocator.fpm[i] {
                        warn!(
                            "  bit 0x{:04x} is different. disk = {}, computed = {}",
                            i, fpm_on_disk[i], page_allocator.fpm[i]
                        );
                    }
                }
            }
            bail!("FPM is corrupted; FPM computed from Stream Directory is not equal to FPM found on disk.");
        }

        // We have finished checking all the data that we have read from disk.
        // Now check the consistency of our in-memory data structures.
        page_allocator.assert_invariants();

        // Small MSF files are read-only; refuse ReadWrite for them.
        match (access_mode, msf_kind) {
            (AccessMode::ReadWrite, MsfKind::Small) => {
                bail!(
                    "This PDB file uses the obsolete 'Small MSF' encoding. \
                     This library does not support read-write mode with Small MSF files."
                );
            }

            (AccessMode::ReadWrite, MsfKind::Big) => {}

            (AccessMode::Read, _) => {}
        }

        Ok(Self {
            file,
            access_mode,
            active_fpm,
            committed_stream_pages,
            committed_stream_page_starts,
            stream_sizes,
            kind: msf_kind,
            pages: page_allocator,
            modified_streams: HashMap::new(),
            max_streams: DEFAULT_MAX_STREAMS,
        })
    }

    /// Creates a new MSF object in memory. The on-disk file is not modified until `commit()` is
    /// called.
    ///
    /// The new object always uses the Big MSF encoding and is opened in read/write mode.
    ///
    /// # Panics
    ///
    /// Panics if `options.page_size` is outside `MIN_PAGE_SIZE..=MAX_PAGE_SIZE`.
    pub fn create_for(file: F, options: CreateOptions) -> anyhow::Result<Self> {
        let _span = trace_span!("Msf::create_for").entered();

        assert!(options.page_size >= MIN_PAGE_SIZE);
        assert!(options.page_size <= MAX_PAGE_SIZE);

        // 3 pages: Page 0 (the MSF header) plus the two FPM pages of the first interval.
        let num_pages: usize = 3;

        let mut this = Self {
            file,
            access_mode: AccessMode::ReadWrite,
            committed_stream_pages: vec![],
            committed_stream_page_starts: vec![0; 2],
            kind: MsfKind::Big,
            pages: PageAllocator::new(num_pages, options.page_size),
            modified_streams: HashMap::new(),
            stream_sizes: vec![0],
            active_fpm: 2,
            max_streams: options.max_streams,
        };

        // Set up the 4 fixed-index streams. They are created as nil streams.
        for _ in 1..=4 {
            let _stream_index = this.nil_stream()?;
        }

        Ok(this)
    }
}
472
/// Read each page of the FPM. Each page of the FPM is stored in a different interval;
/// they are not contiguous.
///
/// num_pages is the total number of pages in the FPM.
///
/// In the returned bit vector there is one bit per page; a set bit means the page is
/// FREE and a clear bit means the page is BUSY.
fn read_fpm_big_msf<F: ReadAt>(
    file: &F,
    active_fpm: u32,
    num_pages: u32,
    page_size: PageSize,
) -> anyhow::Result<BitVec<u32, Lsb0>> {
    let _span = trace_span!("read_fpm_big_msf").entered();

    assert!(num_pages > 0);

    let mut free_page_map: BitVec<u32, Lsb0> = BitVec::new();
    free_page_map.resize(num_pages as usize, false);
    // View the bit vector's backing storage as bytes so FPM pages can be read into it
    // directly, one page-sized chunk per interval.
    let fpm_bytes: &mut [u8] = free_page_map.as_raw_mut_slice().as_mut_bytes();
    let page_size_usize = usize::from(page_size);

    // Each interval contributes one FPM page; the active page of interval `i` is at page
    // `interval_to_page(i) + active_fpm` (active_fpm is 1 or 2). The final chunk may be
    // shorter than a full page; read_exact_at reads only chunk.len() bytes.
    for (interval, fpm_page_bytes) in fpm_bytes.chunks_mut(page_size_usize).enumerate() {
        let interval_page = interval_to_page(interval as u32, page_size);
        let file_pos = page_to_offset(interval_page + active_fpm, page_size);

        trace!(
            interval,
            interval_page,
            file_pos,
            "reading FPM page, interval_page = 0x{interval_page:x}, file_pos = 0x{file_pos:x}"
        );
        file.read_exact_at(fpm_page_bytes, file_pos)?;
    }

    // Check our invariants for the FPM. If these checks fail then we return Err because we
    // are validating data that we read from disk. After these checks succeed, we switch to using
    // assert_invariants(), which uses assert!(). That verifies that we preserve our invariants.

    // Check that page 0, which stores the MSF File Header, is busy.
    // (A set bit means FREE, so the bit for page 0 must be clear.)
    if free_page_map[0] {
        bail!("FPM is invalid: Page 0 should always be BUSY");
    }

    // Check that the pages assigned to the FPM are marked "busy" in all intervals.

    let mut interval: u32 = 0;
    loop {
        let interval_page = interval_to_page(interval, page_size) as usize;
        // Pages 1 and 2 of every interval are reserved for FPM1 and FPM2.
        let fpm1_index = interval_page + 1;
        let fpm2_index = interval_page + 2;

        if fpm1_index < free_page_map.len() {
            if free_page_map[fpm1_index] {
                bail!("All FPM pages should be marked BUSY");
            }
        }

        if fpm2_index < free_page_map.len() {
            if free_page_map[fpm2_index] {
                bail!("All FPM pages should be marked BUSY");
            }
            interval += 1;
        } else {
            // This interval extends past the end of the map, so it is the last one.
            break;
        }
    }

    Ok(free_page_map)
}
540
541/// Computes the low-bits-on mask for the page mask.
542fn low_page_mask(page_size: PageSize) -> u32 {
543    (1u32 << page_size.exponent()).wrapping_sub(1u32)
544}
545
546/// Tests whether `page` contributes to either FPM1 or FPM2.
547fn is_fpm_page_big_msf(page_size: PageSize, page: u32) -> bool {
548    let page_within_interval = page & low_page_mask(page_size);
549    matches!(page_within_interval, 1 | 2)
550}
551
552/// Tests whether `page` is one of the special pages (Page 0, FPM1, or FPM2)
553fn is_special_page_big_msf(page_size: PageSize, page: u32) -> bool {
554    page == 0 || is_fpm_page_big_msf(page_size, page)
555}
556
/// Describes the "old" MSF Stream Directory Header.
///
/// This fixed-size, little-endian header sits at the start of the stream directory in
/// "Small MSF" (pre-Big-MSF) files.
#[derive(Clone, IntoBytes, FromBytes, Unaligned, KnownLayout, Immutable)]
#[repr(C)]
struct OldMsfStreamDirHeader {
    /// The number of streams described by the directory.
    num_streams: U16<LE>,
    /// Not used by this reader.
    ignored: U16<LE>,
}
564
/// An entry in the "old" MSF Stream Directory.
///
/// One entry per stream; the per-stream page numbers follow the entry table.
#[derive(Clone, IntoBytes, FromBytes, Unaligned, KnownLayout, Immutable)]
#[repr(C)]
struct OldMsfStreamEntry {
    /// The stream size in bytes; `NIL_STREAM_SIZE` marks a nil stream with no pages.
    stream_size: U32<LE>,
    /// Not used by this reader.
    ignored: U32<LE>,
}