1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
// Copyright 2017 pdb Developers
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

use std::fmt;
use std::marker::PhantomData;
use std::result;

use crate::common::*;
use crate::msf::Stream;
use crate::FallibleIterator;

pub(crate) mod constants;
mod data;
mod header;
mod id;
mod primitive;

use self::header::*;
use self::primitive::type_data_for_primitive;

pub use self::data::*;
pub use self::id::*;
pub use self::primitive::{Indirection, PrimitiveKind, PrimitiveType};

/// Zero-copy access to a PDB type or id stream.
///
/// PDBs store two kinds of related streams with an identical internal structure:
///
///  - [`TypeInformation`] (TPI stream) contains information on primitive types, classes and
///    procedures, including their return type and arguments. Its contents are identified by
///    [`TypeIndex`].
///  - [`IdInformation`] (IPI stream) is a stricter version of the above stream that contains inline
///    functions, build infos and source references. Its contents are identified by [`IdIndex`].
///
/// Items in these streams are stored by their index in ascending order. Symbols declared in
/// [`ModuleInfo`](crate::ModuleInfo) can refer to items in both streams, as well as items to other
/// items with one exception: `Type`s cannot refer to `Id`s. Also, the PDB format requires that
/// items refer only to types with lower indexes. Thus, the stream of items forms a directed acyclic
/// graph.
///
/// Both streams can iterate by their index using [`ItemInformation::iter`]. Additionally,
/// [`ItemFinder`] is a secondary data structure to provide efficient backtracking for random
/// access.
///
/// There are type definitions for both streams:
///
///  - `ItemInformation`: [`TypeInformation`] and [`IdInformation`]
///  - [`ItemFinder`]: [`TypeFinder`] and [`IdFinder`]
///  - [`ItemIndex`]: [`TypeIndex`] and [`IdIndex`]
///  - [`ItemIter`]: [`TypeIter`] and [`IdIter`]
///  - [`Item`]: [`Type`] and [`Id`]
///
/// # Examples
///
/// Iterating over the types while building a `TypeFinder`:
///
/// ```
/// # use pdb::FallibleIterator;
/// #
/// # fn test() -> pdb::Result<usize> {
/// # let file = std::fs::File::open("fixtures/self/foo.pdb")?;
/// # let mut pdb = pdb::PDB::open(file)?;
///
/// let type_information = pdb.type_information()?;
/// let mut type_finder = type_information.finder();
///
/// # let expected_count = type_information.len();
/// # let mut count: usize = 0;
/// let mut iter = type_information.iter();
/// while let Some(typ) = iter.next()? {
///     // build the type finder as we go
///     type_finder.update(&iter);
///
///     // parse the type record
///     match typ.parse() {
///         Ok(pdb::TypeData::Class(pdb::ClassType {name, properties, fields: Some(fields), ..})) => {
///             // this Type describes a class-like type with fields
///             println!("type {} is a class named {}", typ.index(), name);
///
///             // `fields` is a TypeIndex which refers to a FieldList
///             // To find information about the fields, find and parse that Type
///             match type_finder.find(fields)?.parse()? {
///                 pdb::TypeData::FieldList(list) => {
///                     // `fields` is a Vec<TypeData>
///                     for field in list.fields {
///                         if let pdb::TypeData::Member(member) = field {
///                             // follow `member.field_type` as desired
///                             println!("  - field {} at offset {:x}", member.name, member.offset);
///                         } else {
///                             // handle member functions, nested types, etc.
///                         }
///                     }
///
///                     if let Some(more_fields) = list.continuation {
///                         // A FieldList can be split across multiple records
///                         // TODO: follow `more_fields` and handle the next FieldList
///                     }
///                 }
///                 _ => { }
///             }
///
///         },
///         Ok(_) => {
///             // ignore everything that's not a class-like type
///         },
///         Err(pdb::Error::UnimplementedTypeKind(_)) => {
///             // found an unhandled type record
///             // this probably isn't fatal in most use cases
///         },
///         Err(e) => {
///             // other error, probably is worth failing
///             return Err(e);
///         }
///     }
///     # count += 1;
/// }
///
/// # assert_eq!(expected_count, count);
/// # Ok(count)
/// # }
/// # assert!(test().expect("test") > 8000);
/// ```
#[derive(Debug)]
pub struct ItemInformation<'s, I> {
    stream: Stream<'s>,
    header: Header,
    _ph: PhantomData<&'s I>,
}

impl<'s, I> ItemInformation<'s, I>
where
    I: ItemIndex,
{
    /// Parses `TypeInformation` from raw stream data.
    pub(crate) fn parse(stream: Stream<'s>) -> Result<Self> {
        let mut buf = stream.parse_buffer();
        let header = Header::parse(&mut buf)?;
        let _ph = PhantomData;
        Ok(Self {
            stream,
            header,
            _ph,
        })
    }

    /// Returns an iterator that can traverse the type table in sequential order.
    pub fn iter(&self) -> ItemIter<'_, I> {
        // get a parse buffer
        let mut buf = self.stream.parse_buffer();

        // drop the header
        // this can't fail; we've already read this once
        buf.take(self.header.header_size as usize)
            .expect("dropping TPI header");

        ItemIter {
            buf,
            index: self.header.minimum_index,
            _ph: PhantomData,
        }
    }

    /// Returns the number of items contained in this `ItemInformation`.
    ///
    /// Note that in the case of the type stream ([`TypeInformation`]) primitive types are not
    /// stored in the PDB file. The number of distinct types reachable via this table will be higher
    /// than `len()`.
    pub fn len(&self) -> usize {
        (self.header.maximum_index - self.header.minimum_index) as usize
    }

    /// Returns whether this `ItemInformation` contains any data.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns an `ItemFinder` with a default time-space tradeoff useful for access by
    /// [`ItemIndex`].
    ///
    /// The `ItemFinder` is initially empty and must be populated by iterating. See the struct-level
    /// docs for an example.
    pub fn finder(&self) -> ItemFinder<'_, I> {
        ItemFinder::new(self, 3)
    }
}

/// This buffer is used when a `Type` refers to a primitive type. It doesn't contain anything
/// type-specific, but it does parse as `raw_type() == 0xffff`, which is a reserved value. Seems
/// like a reasonable thing to do.
const PRIMITIVE_TYPE: &[u8] = b"\xff\xff";

/// Represents an entry in the type or id stream.
///
/// An `Item` has been minimally processed and may not be correctly formed or even understood by
/// this library. To avoid copying, `Items`s exist as references to data owned by the parent
/// `ItemInformation`. Therefore, an `Item` may not outlive its parent.
///
/// The data held by items can be parsed:
///
///  - [`Type::parse`](Self::parse) returns [`TypeData`].
///  - [`Id::parse`](Self::parse) returns [`IdData`].
///
/// Depending on the stream, this can either be a [`Type`] or [`Id`].
#[derive(Copy, Clone, PartialEq)]
pub struct Item<'t, I> {
    index: I,
    data: &'t [u8],
}

impl<'t, I> Item<'t, I>
where
    I: ItemIndex,
{
    /// Returns this item's index.
    ///
    /// Depending on the stream, either a [`TypeIndex`] or [`IdIndex`].
    pub fn index(&self) -> I {
        self.index
    }

    /// Returns the the binary data length in the on-disk format.
    ///
    /// Items are prefixed by a 16-bit length number, which is not included in this length.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns whether this items's data is empty.
    ///
    /// Items are prefixed by a 16-bit length number, which is not included in this operation.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// Returns the identifier of the kind of data stored by this this `Item`.
    ///
    /// As a special case, if this is a primitive [`Type`], this function will return `0xffff`.
    #[inline]
    pub fn raw_kind(&self) -> u16 {
        debug_assert!(self.data.len() >= 2);

        // assemble a little-endian u16
        u16::from(self.data[0]) | (u16::from(self.data[1]) << 8)
    }
}

impl<'t, I> fmt::Debug for Item<'t, I>
where
    I: ItemIndex,
{
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "Type{{ kind: 0x{:04x} [{} bytes] }}",
            self.raw_kind(),
            self.data.len()
        )
    }
}

/// In-memory index for efficient random-access to [`Item`]s by index.
///
/// `ItemFinder` can be obtained via [`ItemInformation::finder`]. It starts out empty and must be
/// populated by calling [`ItemFinder::update`] while iterating. There are two typedefs for easier
/// use:
///
///  - [`TypeFinder`] for finding [`Type`]s in a [`TypeInformation`](crate::TypeInformation) (TPI stream).
///  - [`IdFinder`] for finding [`Id`]s in a [`IdInformation`](crate::IdInformation) (IPI stream).
///
/// `ItemFinder` allocates all the memory it needs when it is first created. The footprint is
/// directly proportional to the total number of types; see [`ItemInformation::len`].
///
/// # Time/space trade-off
///
/// The naïve approach is to store the position of each `Item` as they are covered in the stream.
/// The cost is memory: namely one `u32` per `Item`.
///
/// Compare this approach to an `ItemFinder` that stores the position of every Nth item. Memory
/// requirements would be reduced by a factor of N in exchange for requiring an average of (N-1)/2
/// iterations per lookup. However, iteration is cheap sequential memory access, and spending less
/// memory on `ItemFinder` means more of the data can fit in the cache, so this is likely a good
/// trade-off for small-ish values of N.
///
/// `ItemFinder` is parameterized by `shift` which controls this trade-off as powers of two:
///
///   * If `shift` is 0, `ItemFinder` stores 4 bytes per `Item` and always performs direct lookups.
///   * If `shift` is 1, `ItemFinder` stores 2 bytes per `Item` and averages 0.5 iterations per
///     lookup.
///   * If `shift` is 2, `ItemFinder` stores 1 byte per `Item` and averages 1.5 iterations per
///     lookup.
///   * If `shift` is 3, `ItemFinder` stores 4 bits per `Item` and averages 3.5 iterations per
///     lookup.
///   * If `shift` is 4, `ItemFinder` stores 2 bits per `Item` and averages 7.5 iterations per
///     lookup.
///   * If `shift` is 5, `ItemFinder` stores 1 bit per `Item` and averages 15.5 iterations per
///     lookup.
///
/// This list can continue but with rapidly diminishing returns. Iteration cost is proportional to
/// item size, which varies, but typical numbers from a large program are:
///
///   * 24% of items are    12 bytes
///   * 34% of items are <= 16 bytes
///   * 84% of items are <= 32 bytes
///
/// A `shift` of 2 or 3 is likely appropriate for most workloads. 500K items would require 1 MB or
/// 500 KB of memory respectively, and lookups -- though indirect -- would still usually need only
/// one or two 64-byte cache lines.
#[derive(Debug)]
pub struct ItemFinder<'t, I> {
    buffer: ParseBuffer<'t>,
    minimum_index: u32,
    maximum_index: u32,
    positions: Vec<u32>,
    shift: u8,
    _ph: PhantomData<&'t I>,
}

impl<'t, I> ItemFinder<'t, I>
where
    I: ItemIndex,
{
    fn new(info: &'t ItemInformation<'_, I>, shift: u8) -> Self {
        // maximum index is the highest index + 1.
        let count = info.header.maximum_index - info.header.minimum_index;

        let round_base = (1 << shift) - 1;
        let shifted_count = ((count + round_base) & !round_base) >> shift;
        let mut positions = Vec::with_capacity(shifted_count as usize);

        if shifted_count > 0 {
            // add record zero, which is identical regardless of shift
            positions.push(info.header.header_size);
        }

        Self {
            buffer: info.stream.parse_buffer(),
            minimum_index: info.header.minimum_index,
            maximum_index: info.header.maximum_index,
            positions,
            shift,
            _ph: PhantomData,
        }
    }

    /// Given an index, find which position in the Vec we should jump to and how many times we
    /// need to iterate to find the requested type.
    ///
    /// `shift` refers to the size of these bit shifts.
    #[inline]
    fn resolve(&self, type_index: u32) -> (usize, usize) {
        let raw = type_index - self.minimum_index;
        (
            (raw >> self.shift) as usize,
            (raw & ((1 << self.shift) - 1)) as usize,
        )
    }

    /// Returns the highest index which is currently served by this `ItemFinder`.
    ///
    /// When iterating through the stream, you shouldn't need to consider this. Items only ever
    /// reference lower indexes. However, when loading items referenced by the symbols stream, this
    /// can be useful to check whether iteration is required.
    #[inline]
    pub fn max_index(&self) -> I {
        I::from(match self.positions.len() {
            0 => 0, // special case for an empty type index
            len => (len << self.shift) as u32 + self.minimum_index - 1,
        })
    }

    /// Update this `ItemFinder` based on the current position of a [`ItemIter`].
    ///
    /// Do this each time you call `.next()`. See documentation of [`ItemInformation`] for an
    /// example.
    #[inline]
    pub fn update(&mut self, iterator: &ItemIter<'t, I>) {
        let (vec_index, iteration_count) = self.resolve(iterator.index);
        if iteration_count == 0 && vec_index == self.positions.len() {
            let pos = iterator.buf.pos();
            assert!(pos < u32::max_value() as usize);
            self.positions.push(pos as u32);
        }
    }

    /// Find an `Item` by its index.
    ///
    /// # Errors
    ///
    /// * `Error::TypeNotFound(index)` if you ask for an item that doesn't exist.
    /// * `Error::TypeNotIndexed(index, max_index)` if you ask for an item that is known to exist
    ///   but is not currently known by this `ItemFinder`.
    pub fn find(&self, index: I) -> Result<Item<'t, I>> {
        let index: u32 = index.into();
        if index < self.minimum_index {
            return Ok(Item {
                index: I::from(index),
                data: PRIMITIVE_TYPE,
            });
        } else if index > self.maximum_index {
            return Err(Error::TypeNotFound(index));
        }

        // figure out where we'd find this
        let (vec_index, iteration_count) = self.resolve(index);

        if let Some(pos) = self.positions.get(vec_index) {
            // hit
            let mut buf = self.buffer.clone();

            // jump forwards
            buf.take(*pos as usize)?;

            // skip some records
            for _ in 0..iteration_count {
                let length = buf.parse_u16()?;
                buf.take(length as usize)?;
            }

            // read the type
            let length = buf.parse_u16()?;

            Ok(Item {
                index: I::from(index),
                data: buf.take(length as usize)?,
            })
        } else {
            // miss
            Err(Error::TypeNotIndexed(index, self.max_index().into()))
        }
    }
}

/// An iterator over items in [`TypeInformation`](crate::TypeInformation) or
/// [`IdInformation`](crate::IdInformation).
///
/// The TPI and IPI streams are represented internally as a series of records, each of which have a
/// length, a kind, and a type-specific field layout. Iteration performance is therefore similar to
/// a linked list.
#[derive(Debug)]
pub struct ItemIter<'t, I> {
    buf: ParseBuffer<'t>,
    index: u32,
    _ph: PhantomData<&'t I>,
}

impl<'t, I> FallibleIterator for ItemIter<'t, I>
where
    I: ItemIndex,
{
    type Item = Item<'t, I>;
    type Error = Error;

    fn next(&mut self) -> result::Result<Option<Self::Item>, Self::Error> {
        // see if we're at EOF
        if self.buf.is_empty() {
            return Ok(None);
        }

        // read the length of the next type
        let length = self.buf.parse_u16()? as usize;

        // validate
        if length < 2 {
            // this can't be correct
            return Err(Error::TypeTooShort);
        }

        // grab the type itself
        let type_buf = self.buf.take(length)?;
        let index = self.index;

        self.index += 1;

        // Done
        Ok(Some(Item {
            index: I::from(index),
            data: type_buf,
        }))
    }
}

/// Zero-copy access to the PDB type stream (TPI).
///
/// This stream exposes types, the variants of which are enumerated by [`TypeData`]. See
/// [`ItemInformation`] for more information on accessing types.
pub type TypeInformation<'s> = ItemInformation<'s, TypeIndex>;

/// In-memory index for efficient random-access of [`Type`]s by index.
///
/// `TypeFinder` can be obtained via [`TypeInformation::finder`](ItemInformation::finder). See
/// [`ItemFinder`] for more information.
pub type TypeFinder<'t> = ItemFinder<'t, TypeIndex>;

/// An iterator over [`Type`]s returned by [`TypeInformation::iter`](ItemInformation::iter).
pub type TypeIter<'t> = ItemIter<'t, TypeIndex>;

/// Information on a primitive type, class, or procedure.
pub type Type<'t> = Item<'t, TypeIndex>;

impl<'t> Item<'t, TypeIndex> {
    /// Parse this `Type` into `TypeData`.
    ///
    /// # Errors
    ///
    /// * `Error::UnimplementedTypeKind(kind)` if the type record isn't currently understood by this
    ///   library
    /// * `Error::UnexpectedEof` if the type record is malformed
    pub fn parse(&self) -> Result<TypeData<'t>> {
        if self.index < TypeIndex(0x1000) {
            // Primitive type
            type_data_for_primitive(self.index)
        } else {
            let mut buf = ParseBuffer::from(self.data);
            parse_type_data(&mut buf)
        }
    }
}

/// Zero-copy access to the PDB type stream (TPI).
///
/// This stream exposes types, the variants of which are enumerated by [`IdData`]. See
/// [`ItemInformation`] for more information on accessing types.
pub type IdInformation<'s> = ItemInformation<'s, IdIndex>;

/// In-memory index for efficient random-access of [`Id`]s by index.
///
/// `IdFinder` can be obtained via [`IdInformation::finder`](ItemInformation::finder). See
/// [`ItemFinder`] for more information.
pub type IdFinder<'t> = ItemFinder<'t, IdIndex>;

/// An iterator over [`Id`]s returned by [`IdInformation::iter`](ItemInformation::iter).
pub type IdIter<'t> = ItemIter<'t, IdIndex>;

/// Information on an inline function, build infos or source references.
pub type Id<'t> = Item<'t, IdIndex>;

impl<'t> Item<'t, IdIndex> {
    /// Parse this `Id` into `IdData`.
    ///
    /// # Errors
    ///
    /// * `Error::UnimplementedTypeKind(kind)` if the id record isn't currently understood by this
    ///   library
    /// * `Error::UnexpectedEof` if the id record is malformed
    pub fn parse(&self) -> Result<IdData<'t>> {
        ParseBuffer::from(self.data).parse()
    }
}