symbolic_ppdb/format/
mod.rs

1mod metadata;
2mod raw;
3mod sequence_points;
4mod streams;
5mod utils;
6
7use std::{borrow::Cow, collections::BTreeMap, fmt, io::Read};
8
9use flate2::read::DeflateDecoder;
10use serde::Deserialize;
11use thiserror::Error;
12use watto::Pod;
13
14use symbolic_common::{DebugId, Language, SourceLinkMappings, Uuid};
15
16use metadata::{
17    CustomDebugInformation, CustomDebugInformationIterator, CustomDebugInformationTag,
18    MetadataStream, Table, TableType,
19};
20use streams::{BlobStream, GuidStream, PdbStream, StringStream, UsStream};
21
/// The kind of a [`FormatError`].
///
/// Each variant names one specific way in which reading a Portable PDB file can fail.
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[derive(Debug, Clone, Copy, Error)]
#[non_exhaustive]
pub enum FormatErrorKind {
    /// The header of the Portable PDB file could not be read.
    #[error("invalid header")]
    InvalidHeader,
    /// The header of the Portable PDB does not contain the correct signature.
    #[error("invalid signature")]
    InvalidSignature,
    /// The file ends prematurely.
    #[error("invalid length")]
    InvalidLength,
    /// The file does not contain a valid version string.
    #[error("invalid version string")]
    InvalidVersionString,
    /// A stream header could not be read.
    #[error("invalid stream header")]
    InvalidStreamHeader,
    /// A stream's name could not be read.
    #[error("invalid stream name")]
    InvalidStreamName,
    /// String data was requested, but the file does not contain a `#Strings` stream.
    #[error("file does not contain a #Strings stream")]
    NoStringsStream,
    /// The given offset is out of bounds for the string heap.
    #[error("invalid string offset")]
    InvalidStringOffset,
    /// Tried to read invalid string data.
    #[error("invalid string data")]
    InvalidStringData,
    /// An unrecognized stream name was encountered.
    #[error("unknown stream")]
    UnknownStream,
    /// GUID data was requested, but the file does not contain a `#GUID` stream.
    #[error("file does not contain a #Guid stream")]
    NoGuidStream,
    /// The given index is out of bounds for the GUID heap.
    #[error("invalid guid index")]
    InvalidGuidIndex,
    /// The table stream is too small to hold all claimed tables.
    ///
    /// The first field is the number of bytes required, the second the number of bytes
    /// the table stream actually contains.
    #[error(
        "insufficient table data: {0} bytes required, but table stream only contains {1} bytes"
    )]
    InsufficientTableData(usize, usize),
    /// The given offset is out of bounds for the `#Blob` heap.
    #[error("invalid blob offset")]
    InvalidBlobOffset,
    /// The given offset points to invalid blob data.
    #[error("invalid blob data")]
    InvalidBlobData,
    /// Blob data was requested, but the file does not contain a `#Blob` stream.
    #[error("file does not contain a #Blob stream")]
    NoBlobStream,
    /// Tried to read an invalid compressed unsigned number.
    #[error("invalid compressed unsigned number")]
    InvalidCompressedUnsigned,
    /// Tried to read an invalid compressed signed number.
    #[error("invalid compressed signed number")]
    InvalidCompressedSigned,
    /// Could not read a document name.
    #[error("invalid document name")]
    InvalidDocumentName,
    /// Failed to parse a sequence point.
    #[error("invalid sequence point")]
    InvalidSequencePoint,
    /// Table data was requested, but the file does not contain a `#~` stream.
    #[error("file does not contain a #~ stream")]
    NoMetadataStream,
    /// The given row index (second field) is out of bounds for the table (first field).
    #[error("row index {1} is out of bounds for table {0:?}")]
    RowIndexOutOfBounds(TableType, usize),
    /// The given column index (second field) is out of bounds for the table (first field).
    #[error("column index {1} is out of bounds for table {0:?}")]
    ColIndexOutOfBounds(TableType, usize),
    /// The given column in the table has an incompatible width.
    ///
    /// The fields are the table, the column index, and the offending width, in that order.
    #[error("column {1} in table {0:?} has incompatible width {2}")]
    ColumnWidth(TableType, usize, usize),
    /// Tried to read a custom debug information table item with an invalid tag.
    #[error("invalid custom debug information table item tag {0}")]
    InvalidCustomDebugInformationTag(u32),
    /// Tried to read contents of a blob in an unknown format.
    #[error("invalid blob format {0}")]
    InvalidBlobFormat(u32),
    /// Failed to parse Source Link JSON.
    #[error("invalid source link JSON")]
    InvalidSourceLinkJson,
}
110
/// An error encountered while parsing a [`PortablePdb`] file.
///
/// The `Display` output is that of the contained [`FormatErrorKind`]; an optional
/// underlying cause is exposed through the standard error `source` chain.
#[derive(Debug, Error)]
#[error("{kind}")]
pub struct FormatError {
    /// What went wrong.
    pub(crate) kind: FormatErrorKind,
    /// The underlying error that caused this one, if any.
    #[source]
    pub(crate) source: Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
}
119
120impl FormatError {
121    /// Creates a new FormatError error from a known kind of error as well as an
122    /// arbitrary error payload.
123    pub(crate) fn new<E>(kind: FormatErrorKind, source: E) -> Self
124    where
125        E: Into<Box<dyn std::error::Error + Send + Sync>>,
126    {
127        let source = Some(source.into());
128        Self { kind, source }
129    }
130
131    /// Returns the corresponding [`FormatErrorKind`] for this error.
132    pub fn kind(&self) -> FormatErrorKind {
133        self.kind
134    }
135}
136
137impl From<FormatErrorKind> for FormatError {
138    fn from(kind: FormatErrorKind) -> Self {
139        Self { kind, source: None }
140    }
141}
142
/// A parsed Portable PDB file.
///
/// This can be converted to a [`PortablePdbCache`](crate::PortablePdbCache) using the
/// [`PortablePdbCacheConverter::process_portable_pdb`](crate::PortablePdbCacheConverter::process_portable_pdb)
/// method.
///
/// The headers, the version string, and the stream contents borrow from the buffer
/// passed to [`PortablePdb::parse`]; only the source link mappings are owned.
#[derive(Clone)]
pub struct PortablePdb<'data> {
    /// First part of the metadata header.
    header: &'data raw::Header,
    /// The version string.
    version_string: &'data str,
    /// Second part of the metadata header.
    header2: &'data raw::HeaderPart2,
    /// The file's #PDB stream, if it exists.
    pdb_stream: Option<PdbStream<'data>>,
    /// The file's #~ stream, if it exists.
    metadata_stream: Option<MetadataStream<'data>>,
    /// The file's #Strings stream, if it exists.
    string_stream: Option<StringStream<'data>>,
    /// The file's #US stream, if it exists.
    us_stream: Option<UsStream<'data>>,
    /// The file's #Blob stream, if it exists.
    blob_stream: Option<BlobStream<'data>>,
    /// The file's #GUID stream, if it exists.
    guid_stream: Option<GuidStream<'data>>,
    /// Source link mappings collected from the file's Source Link custom debug
    /// information while parsing; empty if the file has none.
    source_link_mappings: SourceLinkMappings,
}
171
172impl fmt::Debug for PortablePdb<'_> {
173    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
174        f.debug_struct("PortablePdb")
175            .field("header", &self.header)
176            .field("version_string", &self.version_string)
177            .field("header2", &self.header2)
178            .field("has_pdb_stream", &self.pdb_stream.is_some())
179            .field("has_table_stream", &self.metadata_stream.is_some())
180            .field("has_string_stream", &self.string_stream.is_some())
181            .field("has_us_stream", &self.us_stream.is_some())
182            .field("has_blob_stream", &self.blob_stream.is_some())
183            .field("has_guid_stream", &self.guid_stream.is_some())
184            .finish()
185    }
186}
187
188impl<'data> PortablePdb<'data> {
189    /// Checks whether the provided buffer could potentially be a Portable PDB file,
190    /// without fully parsing it.
191    pub fn peek(buf: &[u8]) -> bool {
192        if let Some((header, _)) = raw::Header::ref_from_prefix(buf) {
193            return header.signature == raw::METADATA_SIGNATURE;
194        }
195        false
196    }
197
198    /// Parses the provided buffer into a Portable PDB file.
199    pub fn parse(buf: &'data [u8]) -> Result<Self, FormatError> {
200        let (header, rest) =
201            raw::Header::ref_from_prefix(buf).ok_or(FormatErrorKind::InvalidHeader)?;
202
203        if header.signature != raw::METADATA_SIGNATURE {
204            return Err(FormatErrorKind::InvalidSignature.into());
205        }
206
207        // TODO: verify major/minor version
208        // TODO: verify reserved
209        let version_length = header.version_length as usize;
210        let version_buf = rest
211            .get(..version_length)
212            .ok_or(FormatErrorKind::InvalidLength)?;
213        let version_buf = version_buf
214            .split(|c| *c == 0)
215            .next()
216            .ok_or(FormatErrorKind::InvalidVersionString)?;
217        let version = std::str::from_utf8(version_buf)
218            .map_err(|e| FormatError::new(FormatErrorKind::InvalidVersionString, e))?;
219
220        // We already know that buf is long enough.
221        let streams_buf = &rest[version_length..];
222        let (header2, mut streams_buf) =
223            raw::HeaderPart2::ref_from_prefix(streams_buf).ok_or(FormatErrorKind::InvalidHeader)?;
224
225        // TODO: validate flags
226
227        let stream_count = header2.streams;
228
229        let mut result = Self {
230            header,
231            version_string: version,
232            header2,
233            pdb_stream: None,
234            metadata_stream: None,
235            string_stream: None,
236            us_stream: None,
237            blob_stream: None,
238            guid_stream: None,
239            source_link_mappings: SourceLinkMappings::default(),
240        };
241
242        let mut metadata_stream = None;
243        for _ in 0..stream_count {
244            let (header, after_header_buf) = raw::StreamHeader::ref_from_prefix(streams_buf)
245                .ok_or(FormatErrorKind::InvalidStreamHeader)?;
246
247            let name_buf = after_header_buf.get(..32).unwrap_or(after_header_buf);
248            let name_buf = name_buf
249                .split(|c| *c == 0)
250                .next()
251                .ok_or(FormatErrorKind::InvalidStreamName)?;
252            let name = std::str::from_utf8(name_buf)
253                .map_err(|e| FormatError::new(FormatErrorKind::InvalidStreamName, e))?;
254
255            let mut rounded_name_len = name.len() + 1;
256            rounded_name_len = match rounded_name_len % 4 {
257                0 => rounded_name_len,
258                r => rounded_name_len + (4 - r),
259            };
260            streams_buf = after_header_buf
261                .get(rounded_name_len..)
262                .ok_or(FormatErrorKind::InvalidLength)?;
263
264            let offset = header.offset as usize;
265            let size = header.size as usize;
266            let stream_buf = buf
267                .get(offset..offset + size)
268                .ok_or(FormatErrorKind::InvalidLength)?;
269
270            match name {
271                "#Pdb" => result.pdb_stream = Some(PdbStream::parse(stream_buf)?),
272                // Save the #~ stream for last; it definitely must be parsed after the #Pdb stream.
273                "#~" => metadata_stream = Some(stream_buf),
274                "#Strings" => result.string_stream = Some(StringStream::new(stream_buf)),
275                "#US" => result.us_stream = Some(UsStream::new(stream_buf)),
276                "#Blob" => result.blob_stream = Some(BlobStream::new(stream_buf)),
277                "#GUID" => result.guid_stream = Some(GuidStream::parse(stream_buf)?),
278                _ => return Err(FormatErrorKind::UnknownStream.into()),
279            }
280        }
281
282        if let Some(stream_buf) = metadata_stream {
283            result.metadata_stream = Some(MetadataStream::parse(
284                stream_buf,
285                result
286                    .pdb_stream
287                    .as_ref()
288                    .map_or([0; 64], |s| s.referenced_table_sizes),
289            )?)
290        }
291
292        // Read source link mappings.
293        // https://github.com/dotnet/runtime/blob/main/docs/design/specs/PortablePdb-Metadata.md#source-link-c-and-vb-compilers
294        const SOURCE_LINK_KIND: Uuid = uuid::uuid!("CC110556-A091-4D38-9FEC-25AB9A351A6A");
295
296        #[derive(Debug, Clone, Deserialize)]
297        struct SourceLinkDocuments {
298            documents: BTreeMap<String, String>,
299        }
300
301        for cdi in CustomDebugInformationIterator::new(&result, SOURCE_LINK_KIND)? {
302            let cdi = cdi?;
303            // Note: only handle module #1 (do we actually handle multiple modules in any way??)
304            if let (CustomDebugInformationTag::Module, 1) = (cdi.tag, cdi.value) {
305                let docs: SourceLinkDocuments = serde_json::from_slice(result.get_blob(cdi.blob)?)
306                    .map_err(|e| FormatError::new(FormatErrorKind::InvalidSourceLinkJson, e))?;
307                result
308                    .source_link_mappings
309                    .extend(docs.documents.iter().map(|(k, v)| (&k[..], &v[..])));
310            }
311        }
312
313        Ok(result)
314    }
315
316    /// Reads the string starting at the given offset from this file's string heap.
317    #[allow(unused)]
318    fn get_string(&self, offset: u32) -> Result<&'data str, FormatError> {
319        self.string_stream
320            .as_ref()
321            .ok_or(FormatErrorKind::NoStringsStream)?
322            .get_string(offset)
323    }
324
325    /// Reads the GUID with the given index from this file's GUID heap.
326    ///
327    /// Note that the index is 1-based!
328    fn get_guid(&self, idx: u32) -> Result<Uuid, FormatError> {
329        self.guid_stream
330            .as_ref()
331            .ok_or(FormatErrorKind::NoGuidStream)?
332            .get_guid(idx)
333            .ok_or_else(|| FormatErrorKind::InvalidGuidIndex.into())
334    }
335
336    /// Reads the blob starting at the given offset from this file's blob heap.
337    fn get_blob(&self, offset: u32) -> Result<&'data [u8], FormatError> {
338        self.blob_stream
339            .as_ref()
340            .ok_or(FormatErrorKind::NoBlobStream)?
341            .get_blob(offset)
342    }
343
344    /// Reads this file's PDB ID from its #PDB stream.
345    pub fn pdb_id(&self) -> Option<DebugId> {
346        self.pdb_stream.as_ref().map(|stream| stream.id())
347    }
348
349    /// Reads the `(row, col)` cell in the given table as a `u32`.
350    ///
351    /// This returns an error if the indices are out of bounds for the table
352    /// or the cell is too wide for a `u32`.
353    ///
354    /// Note that row and column indices are 1-based!
355    pub(crate) fn get_table(&self, table: TableType) -> Result<Table<'_>, FormatError> {
356        let md_stream = self
357            .metadata_stream
358            .as_ref()
359            .ok_or(FormatErrorKind::NoMetadataStream)?;
360        Ok(md_stream[table])
361    }
362
363    /// Returns true if this portable pdb file contains method debug information.
364    pub fn has_debug_info(&self) -> bool {
365        self.metadata_stream
366            .as_ref()
367            .is_some_and(|md_stream| md_stream[TableType::MethodDebugInformation].rows > 0)
368    }
369
370    /// Get source file referenced by this PDB.
371    ///
372    /// Given index must be between 1 and get_documents_count().
373    pub fn get_document(&self, idx: usize) -> Result<Document, FormatError> {
374        let table = self.get_table(TableType::Document)?;
375        let row = table.get_row(idx)?;
376        let name_offset = row.get_col_u32(1)?;
377        let lang_offset = row.get_col_u32(4)?;
378
379        let name = self.get_document_name(name_offset)?;
380        let lang = self.get_document_lang(lang_offset)?;
381
382        Ok(Document { name, lang })
383    }
384
385    /// Get the number of source files referenced by this PDB.
386    pub fn get_documents_count(&self) -> Result<usize, FormatError> {
387        let table = self.get_table(TableType::Document)?;
388        Ok(table.rows)
389    }
390
391    /// An iterator over source files contents' embedded in this PDB.
392    pub fn get_embedded_sources(&self) -> Result<EmbeddedSourceIterator<'_, 'data>, FormatError> {
393        EmbeddedSourceIterator::new(self)
394    }
395
396    /// Whether this PPDB contains source-link mappings.
397    pub fn has_source_links(&self) -> Result<bool, FormatError> {
398        Ok(!self.source_link_mappings.is_empty() && self.get_documents_count()? > 0)
399    }
400
401    /// Tries to resolve given document as a source link (URL).
402    /// Make sure to try [Self::get_embedded_sources] first when looking for a source file, because
403    /// function may return a link that actually doesn't exist (e.g. file is in .gitignore).
404    /// In that case, it's usually the case that the file is embedded in the PPDB instead.
405    pub fn get_source_link(&self, document: &Document) -> Option<Cow<'_, str>> {
406        self.source_link_mappings
407            .resolve(&document.name)
408            .map(Cow::Owned)
409    }
410}
411
/// Represents a source file that is referenced by this PDB.
#[derive(Debug, Clone)]
pub struct Document {
    /// Document names are usually normalized full paths.
    pub name: String,
    /// The language of the document, as resolved from its row in the `Document` table.
    pub(crate) lang: Language,
}
419
/// An iterator over Embedded Sources.
///
/// Yields one [`EmbeddedSource`] (or an error) per embedded-source custom debug
/// information entry that is attached to a document.
#[derive(Debug, Clone)]
pub struct EmbeddedSourceIterator<'object, 'data> {
    /// The file that documents and blobs are looked up in.
    ppdb: &'object PortablePdb<'data>,
    /// Iterator over the custom debug information entries of the embedded-source kind.
    inner_it: CustomDebugInformationIterator<'data>,
}
426
427impl<'object, 'data> EmbeddedSourceIterator<'object, 'data> {
428    fn new(ppdb: &'object PortablePdb<'data>) -> Result<Self, FormatError> {
429        // https://github.com/dotnet/runtime/blob/main/docs/design/specs/PortablePdb-Metadata.md#embedded-source-c-and-vb-compilers
430        const EMBEDDED_SOURCES_KIND: Uuid = uuid::uuid!("0E8A571B-6926-466E-B4AD-8AB04611F5FE");
431        let inner_it = CustomDebugInformationIterator::new(ppdb, EMBEDDED_SOURCES_KIND)?;
432        Ok(EmbeddedSourceIterator { ppdb, inner_it })
433    }
434
435    fn get_source(
436        &mut self,
437        info: CustomDebugInformation,
438    ) -> Result<EmbeddedSource<'data>, FormatError> {
439        let document = self.ppdb.get_document(info.value as usize)?;
440        let blob = self.ppdb.get_blob(info.blob)?;
441        Ok(EmbeddedSource { document, blob })
442    }
443}
444
445impl<'data> Iterator for EmbeddedSourceIterator<'_, 'data> {
446    type Item = Result<EmbeddedSource<'data>, FormatError>;
447
448    fn next(&mut self) -> Option<Self::Item> {
449        // Skip rows that are not "Document". From the specs, it should always be the case but we've
450        // had a MethodDef (with an invalid 0 row index...) in the field so there's a test for it.
451        while let Some(row) = self.inner_it.next() {
452            match row {
453                Err(e) => return Some(Err(e)),
454                Ok(info) => {
455                    if let CustomDebugInformationTag::Document = info.tag {
456                        return Some(self.get_source(info));
457                    }
458                }
459            }
460        }
461        None
462    }
463}
464
/// Lazy Embedded Source file reader.
///
/// Holds the raw blob of an embedded source file; the contents are only decoded
/// when they are requested.
#[derive(Debug, Clone)]
pub struct EmbeddedSource<'data> {
    /// The document this source belongs to.
    document: Document,
    /// The raw blob: a four-byte format marker followed by the (possibly compressed) contents.
    blob: &'data [u8],
}
471
472impl<'data, 'object> EmbeddedSource<'data> {
473    /// Returns the build-time path associated with this source file.
474    pub fn get_path(&'object self) -> &'object str {
475        self.document.name.as_str()
476    }
477
478    /// Reads the source file contents from the Portable PDB.
479    pub fn get_contents(&self) -> Result<Cow<'data, [u8]>, FormatError> {
480        // The blob has the following structure: `Blob ::= format content`
481        // - format - int32 - Indicates how the content is serialized.
482        //     0 = raw bytes, uncompressed.
483        //     Positive value = compressed by deflate algorithm and value indicates uncompressed size.
484        //     Negative values reserved for future formats.
485        // - content - format-specific - The text of the document in the specified format. The length is implied by the length of the blob minus four bytes for the format.
486        if self.blob.len() < 4 {
487            return Err(FormatErrorKind::InvalidBlobData.into());
488        }
489        let (format_blob, data_blob) = self.blob.split_at(4);
490        let format = u32::from_ne_bytes(format_blob.try_into().unwrap());
491        match format {
492            0 => Ok(Cow::Borrowed(data_blob)),
493            x if x > 0 => self.inflate_contents(format as usize, data_blob),
494            _ => Err(FormatErrorKind::InvalidBlobFormat(format).into()),
495        }
496    }
497
498    fn inflate_contents(
499        &self,
500        size: usize,
501        data: &'data [u8],
502    ) -> Result<Cow<'data, [u8]>, FormatError> {
503        let mut decoder = DeflateDecoder::new(data);
504        let mut output = Vec::with_capacity(size);
505        let read_size = decoder
506            .read_to_end(&mut output)
507            .map_err(|e| FormatError::new(FormatErrorKind::InvalidBlobData, e))?;
508        if read_size != size {
509            return Err(FormatErrorKind::InvalidLength.into());
510        }
511        Ok(Cow::Owned(output))
512    }
513}