ms_pdb/
names.rs

1//! Parses the Names Stream (`/names`).
2//!
3//! The Names Stream stores a set of unique strings (names). This allows other data structures to
4//! refer to strings using an integer index ([`NameIndex`]), rather than storing copies of the same
5//! string in many different places.
6//!
7//! The stream index for the Names Stream is found in the PDB Information Stream, in the Named
8//! Streams section.  The key is "/names".
9//!
10//! The Names Stream begins with `NamesStreamHeader`, which specifies the size in bytes of the
11//! string data substream. The string data substream immediately follows the stream header.
12//! It consists of NUL-terminated UTF-8 strings.
13//!
14//! After the string data there is a hash table. The hash table is an array, one for each string
15//! in the table. The value of each array entry is a byte offset that points into the string data.
16//! The index of each array entry is chosen using a hash of the corresponding string value.
17//!
18//! Hash collisions are resolved using linear probing. That is, during table construction, the
19//! hash table is allocated and initialized, with each entry pointing to nothing (nil). For each
20//! string, we compute the hash of the string (modulo the size of the hash table). If the
21//! corresponding entry in the hash table is empty, then we write the `NameIndex` value into that
22//! slot. If that slot is already busy, then we check the next slot; if we reach the end of the
23//! table then we wrap around to slot 0. For this reason, the number of hash entries must be
24//! greater than or equal to the number of strings in the table.
25//!
26//! The overall organization of the stream is:
27//!
28//! name             | type                 | usage
29//! -----------------|----------------------|------
30//! `signature`      | `u32`                | should always be 0xEFFE_EFFE
31//! `version`        | `u32`                | should be 1
32//! `strings_size`   | `u32`                | size of the string data
33//! `strings_data`   | `[u8; strings_size]` | contains the UTF-8 string data, with NUL terminators
34//! `num_hashes`     | `u32`                | specifies the number of hash entries
35//! `hashes`         | `[u32; num_hashes]`  | contains hash entries for all strings
36//! `num_strings`    | `u32`                | number of non-empty strings in the table
37
38#[cfg(test)]
39mod tests;
40
41use crate::parser::{Parser, ParserMut};
42use crate::utils::align_4;
43use crate::utils::iter::{HasRestLen, IteratorWithRangesExt};
44use crate::ReadAt;
45use anyhow::bail;
46use bstr::BStr;
47use std::ops::Range;
48use tracing::{debug, trace, trace_span, warn};
49use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout, Unaligned, LE, U32};
50
51/// The name of the `/names` stream. This identifies the stream in the Named Streams Table,
52/// in the PDB Information Stream.
53pub const NAMES_STREAM_NAME: &str = "/names";
54
55/// A byte offset into the Names Stream.
56///
57/// This value does not include the size of the stream header, so the size of the stream header
58/// must be added to it when dereferencing a string.
59#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash, Ord, PartialOrd)]
60pub struct NameIndex(pub u32);
61
62impl std::fmt::Display for NameIndex {
63    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64        std::fmt::Display::fmt(&self.0, f)
65    }
66}
67
68#[test]
69fn name_index_display() {
70    assert_eq!(format!("{}", NameIndex(42)), "42");
71}
72
73/// Represents a `NameIndex` value in LE byte order.
74#[derive(
75    Copy, Clone, Eq, PartialEq, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned,
76)]
77#[repr(transparent)]
78pub struct NameIndexLe(pub U32<LE>);
79
80impl NameIndexLe {
81    /// Converts the value to the in-memory byte order.
82    #[inline(always)]
83    pub fn get(self) -> NameIndex {
84        NameIndex(self.0.get())
85    }
86}
87
88/// Value for `NamesStreamHeader::signature`.
89pub const NAMES_STREAM_SIGNATURE: u32 = 0xEFFE_EFFE;
90
91/// Value for `NamesStreamHeader::version`.
92pub const NAMES_STREAM_VERSION_V1: u32 = 1;
93
94/// The header of the Names Stream.
95#[repr(C)]
96#[derive(IntoBytes, FromBytes, Immutable, KnownLayout, Unaligned)]
97pub struct NamesStreamHeader {
98    /// Signature identifies this as a Names Stream. Should always be `NAMES_STREAM_SIGNATURE`.
99    pub signature: U32<LE>,
100    /// Version of the Names Stream, which determines the hash function.
101    pub version: U32<LE>,
102    /// Size in bytes of the string data, which immediately follows this header.
103    pub strings_size: U32<LE>,
104}
105
106/// Stream data for an empty Names stream.
107pub static EMPTY_NAMES_STREAM_DATA: &[u8] = &[
108    0xFE, 0xEF, 0xFE, 0xEF, // signature
109    0x01, 0x00, 0x00, 0x00, // version
110    0x04, 0x00, 0x00, 0x00, // strings_size
111    0x00, 0x00, 0x00, 0x00, // string data
112    0x01, 0x00, 0x00, 0x00, // num_hashes
113    0x00, 0x00, 0x00, 0x00, // hash[0]
114    0x00, 0x00, 0x00, 0x00, // num_strings
115];
116
117#[test]
118fn parse_empty_names_stream() {
119    let names = NamesStream::parse(EMPTY_NAMES_STREAM_DATA).unwrap();
120    assert_eq!(names.num_strings, 0);
121    assert_eq!(names.num_hashes, 1);
122}
123
124/// The size of the Names Stream Header, in bytes.
125pub const NAMES_STREAM_HEADER_LEN: usize = 12;
126
127/// Reads the `/names` stream.
128pub struct NamesStream<StreamData>
129where
130    StreamData: AsRef<[u8]>,
131{
132    /// Contains the stream data of the `/names` stream.
133    pub stream_data: StreamData,
134
135    /// The size of the string data. This value comes from the stream header.
136    pub strings_size: usize,
137
138    /// The number of entries in the hash table.
139    pub num_hashes: usize,
140
141    /// The byte offset within `stream_data` where the hash records begin. Each hash record
142    /// contains a `NameIndex` value. The number of elements is `num_hashes`.
143    pub hashes_offset: usize,
144
145    /// The is the number of strings from the stream trailer. Nothing guarantees that this value
146    /// correctly reflects the number of strings in the string data.
147    pub num_strings: usize,
148}
149
150impl<StreamData> NamesStream<StreamData>
151where
152    StreamData: AsRef<[u8]>,
153{
154    /// Parses and validates the stream header.
155    ///
156    /// This function does not validate all of the strings in the table.
157    /// The `check()` function performs extensive checks.
158    pub fn parse(stream_data: StreamData) -> anyhow::Result<Self> {
159        let stream_data_slice: &[u8] = stream_data.as_ref();
160        let mut p = Parser::new(stream_data_slice);
161        let header: &NamesStreamHeader = p.get()?;
162
163        if header.signature.get() != NAMES_STREAM_SIGNATURE {
164            bail!(
165                "The `/names` stream has an invalid signature: 0x{:08x}.",
166                header.signature.get()
167            );
168        }
169
170        if header.version.get() != NAMES_STREAM_VERSION_V1 {
171            bail!(
172                "The `/names` stream is using an unsupported version: {}.",
173                header.version.get()
174            );
175        }
176
177        let strings_size = header.strings_size.get() as usize;
178        let _string_data = p.bytes(strings_size)?;
179
180        // Read the header of the hash table. The only value in the fixed-size portion is a u32
181        // that specifies the number of hashes in the table.
182        let num_hashes = p.u32()? as usize;
183
184        let hashes_offset = stream_data_slice.len() - p.len();
185        let _hashed_names: &[U32<LE>] = p.slice(num_hashes)?;
186
187        // The last item is a u32 that specifies the number of strings in the table.
188        let num_strings = p.u32()? as usize;
189
190        Ok(Self {
191            stream_data,
192            strings_size,
193            num_hashes,
194            hashes_offset,
195            num_strings,
196        })
197    }
198
199    /// Returns the byte range within the stream of the string data.
200    pub fn strings_range(&self) -> Range<usize> {
201        NAMES_STREAM_HEADER_LEN..NAMES_STREAM_HEADER_LEN + self.strings_size
202    }
203
204    /// Gets the strings data
205    pub fn strings_bytes(&self) -> &[u8] {
206        &self.stream_data.as_ref()[self.strings_range()]
207    }
208
209    /// Gets the hash table. Each entry contains NameIndex, or 0.  The entries are arranged in
210    /// the order of the hash of the strings.
211    pub fn hashes(&self) -> &[U32<LE>] {
212        let stream_data = self.stream_data.as_ref();
213        <[U32<LE>]>::ref_from_prefix_with_elems(&stream_data[self.hashes_offset..], self.num_hashes)
214            .unwrap()
215            .0
216    }
217
218    /// Retrieves one string from the string table.
219    pub fn get_string(&self, offset: NameIndex) -> anyhow::Result<&BStr> {
220        let strings_bytes = self.strings_bytes();
221        if let Some(s_bytes) = strings_bytes.get(offset.0 as usize..) {
222            let mut p = Parser::new(s_bytes);
223            let s = p.strz()?;
224            trace!("found string at {offset:?} : {s:?}");
225            Ok(s)
226        } else {
227            bail!("String offset {offset:?} is invalid (out of range)");
228        }
229    }
230
231    /// Iterates the strings in the table, by reading the character data directly.
232    ///
233    /// By convention, the string table usually begins with the empty string. However, this is not
234    /// a guarantee of this implementation.
235    ///
236    /// This iterator may iterate empty strings at the end of the sequence, due to alignment bytes
237    /// at the end of the string data.
238    pub fn iter(&self) -> IterNames<'_> {
239        IterNames {
240            rest: self.strings_bytes(),
241        }
242    }
243
244    /// Sorts the Names Stream and removes duplicates. This also eliminates duplicate strings.
245    ///
246    /// Returns `(remapping_table, new_stream_data)`. The `remapping_table` contains tuples of
247    /// `(old_offset, new_offset)` and is sorted by `old_offset`. The caller can use a binary
248    /// search to remap entries.
249    pub fn rebuild(&self) -> (NameIndexMapping, Vec<u8>) {
250        let _span = trace_span!("NamesStream::rebuild").entered();
251
252        let old_stream_data: &[u8] = self.stream_data.as_ref();
253        // We verified the length of the stream in NamesStream::parse().
254        let old_string_data = self.strings_bytes();
255
256        // Check for the degenerate case of an empty names table, which does not even contain
257        // the empty string. This should never happen, but protect against it anyway. Return
258        // a copy of the current table, such as it is. The remapping_table is empty.
259        if old_string_data.is_empty() {
260            return (
261                NameIndexMapping { table: Vec::new() },
262                old_stream_data.to_vec(),
263            );
264        }
265
266        // First pass, count the non-empty strings.
267        let num_strings = self.iter().filter(|s| !s.is_empty()).count();
268        debug!("Number of strings found: {num_strings}");
269
270        // Second pass, build a string table.
271        let mut strings: Vec<(Range<usize>, &BStr)> = Vec::with_capacity(num_strings);
272        strings.extend(self.iter().with_ranges().filter(|(_, s)| !s.is_empty()));
273
274        // Sort the strings.
275        strings.sort_unstable_by_key(|i| i.1);
276        strings.dedup_by_key(|i| i.1);
277
278        let num_unique_strings = strings.len();
279        if num_unique_strings != num_strings {
280            debug!(
281                "Removed {} duplicate strings.",
282                num_strings - num_unique_strings
283            );
284        } else {
285            debug!("Did not find duplicate strings.");
286        }
287
288        // Find the size of the new stream.
289        // The 1+ at the start is for the empty string.
290        let new_strings_len_unaligned = 1 + strings.iter().map(|(_, s)| s.len() + 1).sum::<usize>();
291        let new_strings_len = align_4(new_strings_len_unaligned);
292
293        // Choose the number of hashes.
294        let num_hashes = num_unique_strings * 6 / 4;
295        assert!(num_hashes >= num_unique_strings);
296        debug!(
297            "Using {} hashes for {} strings with linear probing.",
298            num_hashes, num_unique_strings
299        );
300
301        let new_hash_size_bytes = 4   // for the num_hashes field
302            + num_hashes * 4 // for the hashes array
303            + 4; // for the num_strings field
304
305        let new_stream_data_len = NAMES_STREAM_HEADER_LEN + new_strings_len + new_hash_size_bytes;
306        debug!(
307            "Old name stream size (strings only): {}",
308            old_string_data.len()
309        );
310        debug!("New name stream size (strings only): {}", new_strings_len);
311
312        let mut new_stream_data: Vec<u8> = vec![0; new_stream_data_len];
313        let mut p = ParserMut::new(&mut new_stream_data);
314        *p.get_mut().unwrap() = NamesStreamHeader {
315            signature: U32::new(NAMES_STREAM_SIGNATURE),
316            version: U32::new(NAMES_STREAM_VERSION_V1),
317            strings_size: U32::new(new_strings_len as u32),
318        };
319
320        // Write the string data into the output table, and build the remapping table as we go.
321        let mut remapping_table: Vec<(NameIndex, NameIndex)> = Vec::with_capacity(num_strings + 1);
322        // Add mapping for empty
323        remapping_table.push((NameIndex(0), NameIndex(0)));
324        {
325            let new_strings_data_with_alignment = p.bytes_mut(new_strings_len).unwrap();
326            let out_bytes = &mut new_strings_data_with_alignment[..new_strings_len_unaligned];
327            let out_bytes_len = out_bytes.len();
328            let mut out_iter = out_bytes;
329
330            // Write empty string.
331            out_iter[0] = 0;
332            out_iter = &mut out_iter[1..];
333
334            for (old_range, s) in strings.iter() {
335                let old_ni = NameIndex(old_range.start as u32);
336                let new_ni = NameIndex((out_bytes_len - out_iter.len()) as u32);
337                remapping_table.push((old_ni, new_ni));
338                let sb: &[u8] = s;
339
340                trace!(
341                    "string: old_ni: 0x{old_ni:08x}, new_ni: 0x{new_ni:08x}, old_range: {:08x}..{:08x} s: {:?}",
342                    old_range.start,
343                    old_range.end,
344                    s,
345                    old_ni = old_ni.0,
346                    new_ni = new_ni.0,
347                );
348
349                out_iter[..sb.len()].copy_from_slice(sb);
350                out_iter = &mut out_iter[sb.len() + 1..]; // +1 for NUL
351            }
352
353            assert!(out_iter.is_empty());
354            remapping_table.sort_unstable_by_key(|&(old, _)| old);
355        }
356
357        // Build the hash table. We rely on the table contain all zeroes before we begin writing.
358        // We iterate through the strings, in the sorted order, and compute their hashes. Then we
359        // insert the NameIndex into the table, using linear probing. If we get to the end, we
360        // wrap around.
361        let stream_offset_num_hashes = new_stream_data_len - p.len();
362        *p.get_mut::<U32<LE>>().unwrap() = U32::new(num_hashes as u32);
363        let stream_offset_hash_table = new_stream_data_len - p.len();
364
365        {
366            debug!("Building hash table, num_hashes = {}", num_hashes);
367            let hash_table: &mut [U32<LE>] = p.slice_mut(num_hashes).unwrap();
368            let mut new_ni: u32 = 1; // 1 is for empty string length
369            for &(_, sb) in strings.iter() {
370                let h = crate::hash::hash_mod_u32(sb, num_hashes as u32);
371                trace!("ni {:08x}, hash {:08x}, {:?}", new_ni, h, sb);
372
373                let mut hi = h;
374                let mut wrapped = false;
375                loop {
376                    let slot = &mut hash_table[hi as usize];
377                    if slot.get() == 0 {
378                        *slot = U32::new(new_ni);
379                        break;
380                    }
381                    hi += 1;
382                    if hi as usize == hash_table.len() {
383                        hi = 0;
384                        assert!(!wrapped, "should not wrap around the table more than once");
385                        wrapped = true;
386                    }
387                }
388
389                new_ni += (sb.len() + 1) as u32;
390            }
391        }
392
393        let stream_offset_num_strings = new_stream_data_len - p.len();
394        *p.get_mut::<U32<LE>>().unwrap() = U32::new(strings.len() as u32);
395
396        assert!(p.is_empty());
397
398        debug!("Stream offsets:");
399        debug!(
400            "    [{:08x}] - Names Stream header",
401            NAMES_STREAM_HEADER_LEN
402        );
403        debug!("    [{:08x}] - string data", NAMES_STREAM_HEADER_LEN);
404        debug!(
405            "    [{:08x}] - hash table header (num_hashes)",
406            stream_offset_num_hashes
407        );
408        debug!(
409            "    [{:08x}] - hash table, size in bytes = {}",
410            stream_offset_hash_table,
411            num_hashes * 4
412        );
413        debug!(
414            "    [{:08x}] - num_strings field",
415            stream_offset_num_strings
416        );
417        debug!("    [{:08x}] - (end)", new_stream_data_len);
418
419        (
420            NameIndexMapping {
421                table: remapping_table,
422            },
423            new_stream_data,
424        )
425    }
426}
427
428/// Contains a mapping from old `NameIndex` to new `NameIndex. The mapping is sparse.
429#[derive(Default)]
430pub struct NameIndexMapping {
431    /// the mapping table; use binary search for it
432    ///
433    /// This always starts with `(0, 0)`.
434    pub table: Vec<(NameIndex, NameIndex)>,
435}
436
437impl NameIndexMapping {
438    /// Looks up `name` in the mapping table and returns the mapping for it.
439    pub fn map_old_to_new(&self, name: NameIndex) -> anyhow::Result<NameIndex> {
440        // Perf optimization: Avoid the binary search for 0, which is never remapped.
441        if name.0 == 0 {
442            return Ok(name);
443        }
444
445        let table = self.table.as_slice();
446        match table.binary_search_by_key(&name, |(old, _)| *old) {
447            Ok(i) => Ok(table[i].1),
448            Err(_) => bail!("The NameIndex value 0x{:x} cannot be remapped because it was not present in the old Names stream.", name.0),
449        }
450    }
451}
452
453/// Given an index `i` into a hash table `hashes`, where `hashes[i]` is already known to be used
454/// (non-empty), find the range or ranges of contiguous non-empty entries in `hashes` that cover `i`.
455///
456/// The reason this function can return two ranges is that linear probing wraps around at the end
457/// of the hash table. We have to account for wrap-around at both the start and end of `hashes`.
458/// The unit tests (below) illustrate this.
459///
460/// We use the ranges returned from this function to verify that a given hash entry is at a legal
461/// index within the hash table. The hash table may place hash entries adjacent to each other either
462/// because the hash functions were numerically 1 different from each other (e.g. `foo` hashes to
463/// 42 and `bar` hashes to 43) or because a hash collision occurred. This function does not
464/// (cannot) distinguish between those two cases, because it does not have the original strings.
465/// Instead, it just computes the places where a given string could legally be. The caller then
466/// verifies that each hash entry is in a range that is valid for it.
467#[allow(dead_code)]
468fn find_collision_ranges(hashes: &[U32<LE>], i: usize) -> (Range<usize>, Range<usize>) {
469    assert!(i < hashes.len());
470    assert!(hashes[i].get() != 0);
471
472    let mut start = i;
473    while start > 0 && hashes[start - 1].get() != 0 {
474        start -= 1;
475    }
476
477    let mut end = i + 1;
478    while end < hashes.len() && hashes[end].get() != 0 {
479        end += 1;
480    }
481
482    if start == 0 {
483        // Special case: The entire hash table is one collision range.
484        // We check for this because there are no unused slots in the table.
485        if end == hashes.len() {
486            return (start..end, 0..0);
487        }
488
489        let mut r2_start = hashes.len();
490        while r2_start > 0 && hashes[r2_start - 1].get() != 0 {
491            r2_start -= 1;
492            assert!(r2_start > end); // prevent infinite loops
493        }
494        if r2_start != hashes.len() {
495            (start..end, r2_start..hashes.len())
496        } else {
497            (start..end, 0..0)
498        }
499    } else if end == hashes.len() {
500        // The end of the main range is aligned at the end of the buffer.
501        // Wrap around to the beginning and find the range at the beginning, if any.
502        let mut r2_end = 0;
503        while r2_end < hashes.len() && hashes[r2_end].get() != 0 {
504            assert!(r2_end < start); // prevent infinite loops
505            r2_end += 1;
506        }
507
508        (start..end, 0..r2_end)
509    } else {
510        (start..end, 0..0)
511    }
512}
513
514#[test]
515fn test_find_collision_range() {
516    const EMPTY: U32<LE> = U32::from_bytes([0; 4]);
517    const BUSY: U32<LE> = U32::from_bytes([0xff; 4]);
518
519    let hashes_full: Vec<U32<LE>> = vec![BUSY, BUSY, BUSY, BUSY, BUSY];
520    assert_eq!(find_collision_ranges(&hashes_full, 0), (0..5, 0..0));
521    assert_eq!(find_collision_ranges(&hashes_full, 2), (0..5, 0..0));
522
523    {
524        let hashes_2 = vec![
525            BUSY,  // 0 - wraps around
526            EMPTY, // 1
527            BUSY,  // 2
528            EMPTY, // 3
529            EMPTY, // 4
530            BUSY,  // 5
531            BUSY,  // 6 - wraps around
532        ];
533        assert_eq!(find_collision_ranges(&hashes_2, 0), (0..1, 5..7));
534        assert_eq!(find_collision_ranges(&hashes_2, 2), (2..3, 0..0));
535        assert_eq!(find_collision_ranges(&hashes_2, 5), (5..7, 0..1));
536    }
537
538    {
539        let hashes_3 = vec![
540            BUSY,  // 0 - wraps around
541            EMPTY, // 1
542            BUSY,  // 2
543            EMPTY, // 3
544            EMPTY, // 4
545            BUSY,  // 5
546            EMPTY, // 6 - no wrap around
547        ];
548        assert_eq!(find_collision_ranges(&hashes_3, 0), (0..1, 0..0));
549        assert_eq!(find_collision_ranges(&hashes_3, 2), (2..3, 0..0));
550        assert_eq!(find_collision_ranges(&hashes_3, 5), (5..6, 0..0));
551    }
552    {
553        let hashes_4 = vec![
554            EMPTY, // 0 - no wrap around
555            EMPTY, // 1
556            BUSY,  // 2
557            EMPTY, // 3
558            EMPTY, // 4
559            BUSY,  // 5
560            BUSY,  // 6 - wraps around
561        ];
562        assert_eq!(find_collision_ranges(&hashes_4, 2), (2..3, 0..0));
563        assert_eq!(find_collision_ranges(&hashes_4, 5), (5..7, 0..0));
564        assert_eq!(find_collision_ranges(&hashes_4, 6), (5..7, 0..0));
565    }
566}
567
568/// Iterator state
569pub struct IterNames<'a> {
570    rest: &'a [u8],
571}
572
573impl<'a> HasRestLen for IterNames<'a> {
574    fn rest_len(&self) -> usize {
575        self.rest.len()
576    }
577}
578
579impl<'a> Iterator for IterNames<'a> {
580    type Item = &'a BStr;
581
582    fn next(&mut self) -> Option<Self::Item> {
583        if self.rest.is_empty() {
584            return None;
585        }
586
587        let mut p = Parser::new(self.rest);
588        let Ok(s) = p.strz() else {
589            warn!(
590                rest_len = self.rest.len(),
591                "Found malformed string in /names stream"
592            );
593            return None;
594        };
595
596        self.rest = p.into_rest();
597        Some(s)
598    }
599}
600
601impl NamesStream<Vec<u8>> {
602    /// Reads the Names Stream and parses its header.
603    pub fn load_and_parse<F: ReadAt>(
604        pdb: &crate::msf::Msf<F>,
605        named_streams: &crate::pdbi::NamedStreams,
606    ) -> anyhow::Result<Self> {
607        let named_stream_index = named_streams.get_err(NAMES_STREAM_NAME)?;
608        let named_stream_data = pdb.read_stream_to_vec(named_stream_index)?;
609        Self::parse(named_stream_data)
610    }
611}