Skip to main content

padlock_dwarf/
pdb_reader.rs

1// padlock-dwarf/src/pdb_reader.rs
2//
3// Extract struct/union/enum layouts from a PDB (Program Database) file
4// produced by MSVC.
5//
6// PDB files encode type information in a TPI (Type Info) stream.  We iterate
7// the stream building a TypeFinder, then for every non-forward-reference
8// Class, Union, or Enumeration record we resolve its FieldList to collect
9// members.
10//
11// Limitations:
12//   - Bitfield members: consecutive members sharing a byte offset are grouped into synthetic [f1:3|f2:5] fields.
13//   - Virtual-base and base-class members are omitted (size comes from the
14//     struct's own `size` field, which is already correct).
15//   - Static members are skipped (no byte offset in a struct instance).
16//   - Source file/line: PDB stores source locations in symbol records (per
17//     function/variable), not in type records — they are not available here.
18
19use std::collections::HashMap;
20
21use anyhow::Context;
22use padlock_core::arch::ArchConfig;
23use padlock_core::ir::{AccessPattern, Field, StructLayout, TypeInfo};
24use pdb::{FallibleIterator, PrimitiveKind, TypeData, TypeFinder, TypeIndex};
25
26/// Extract struct/union/enum layouts from raw PDB file bytes.
27pub fn extract_from_pdb(
28    data: &[u8],
29    arch: &'static ArchConfig,
30) -> anyhow::Result<Vec<StructLayout>> {
31    let cursor = std::io::Cursor::new(data);
32    let mut pdb = pdb::PDB::open(cursor).context("failed to open PDB")?;
33    let type_info = pdb
34        .type_information()
35        .context("failed to read TPI stream")?;
36    let mut type_finder = type_info.finder();
37
38    // First pass: iterate all types, build the TypeFinder, and collect every
39    // Class/Union/Enumeration that is not a forward reference.
40    struct RawStruct {
41        name: String,
42        size: usize,
43        fields_idx: TypeIndex,
44        is_union: bool,
45        is_enum: bool,
46    }
47    let mut raw_structs: Vec<RawStruct> = Vec::new();
48
49    {
50        let mut iter = type_info.iter();
51        while let Some(typ) = iter.next()? {
52            type_finder.update(&iter);
53            match typ.parse() {
54                Ok(TypeData::Class(c)) => {
55                    if c.properties.forward_reference() {
56                        continue;
57                    }
58                    let Some(fields_idx) = c.fields else {
59                        continue;
60                    };
61                    raw_structs.push(RawStruct {
62                        name: c.name.to_string().into_owned(),
63                        size: c.size as usize,
64                        fields_idx,
65                        is_union: false,
66                        is_enum: false,
67                    });
68                }
69                Ok(TypeData::Union(u)) => {
70                    if u.properties.forward_reference() {
71                        continue;
72                    }
73                    raw_structs.push(RawStruct {
74                        name: u.name.to_string().into_owned(),
75                        size: u.size as usize,
76                        fields_idx: u.fields,
77                        is_union: true,
78                        is_enum: false,
79                    });
80                }
81                Ok(TypeData::Enumeration(e)) => {
82                    if e.properties.forward_reference() {
83                        continue;
84                    }
85                    let underlying_size = underlying_enum_size(e.underlying_type, arch);
86                    raw_structs.push(RawStruct {
87                        name: e.name.to_string().into_owned(),
88                        size: underlying_size,
89                        fields_idx: e.fields,
90                        is_union: false,
91                        is_enum: true,
92                    });
93                }
94                _ => {}
95            }
96        }
97    }
98
99    // Build a size cache: TypeIndex → (size, align) for fast field type lookup.
100    // For structs/unions we approximate alignment as the largest power-of-two
101    // that divides the size (capped at pointer_size).  This is conservative
102    // but avoids false-positives in padding detection; exact field-derived
103    // alignment is computed when we build the final StructLayout.
104    let mut size_cache: HashMap<TypeIndex, (usize, usize)> = HashMap::new();
105    {
106        let mut iter = type_info.iter();
107        while let Some(typ) = iter.next()? {
108            match typ.parse() {
109                Ok(TypeData::Class(c)) if !c.properties.forward_reference() => {
110                    let sz = c.size as usize;
111                    let al = approx_struct_align(sz, arch);
112                    size_cache.insert(typ.index(), (sz, al));
113                }
114                Ok(TypeData::Union(u)) if !u.properties.forward_reference() => {
115                    let sz = u.size as usize;
116                    let al = approx_struct_align(sz, arch);
117                    size_cache.insert(typ.index(), (sz, al));
118                }
119                Ok(TypeData::Primitive(p)) => {
120                    if let Some((sz, al)) = primitive_size(&p, arch) {
121                        size_cache.insert(typ.index(), (sz, al));
122                    }
123                }
124                _ => {}
125            }
126        }
127    }
128
129    let mut layouts = Vec::new();
130
131    for raw in raw_structs {
132        if raw.is_enum {
133            // Enums are represented as a single `__discriminant` field.
134            let sz = raw.size;
135            let al = sz.max(1);
136            let fields = vec![Field {
137                name: "__discriminant".to_string(),
138                ty: TypeInfo::Primitive {
139                    name: format!("uint{}_t", sz * 8),
140                    size: sz,
141                    align: al,
142                },
143                offset: 0,
144                size: sz,
145                align: al,
146                source_file: None,
147                source_line: None,
148                access: AccessPattern::Unknown,
149            }];
150            layouts.push(StructLayout {
151                name: raw.name,
152                total_size: sz,
153                align: al,
154                fields,
155                source_file: None,
156                source_line: None,
157                arch,
158                is_packed: false,
159                is_union: false,
160                is_repr_rust: false,
161                suppressed_findings: Vec::new(),
162                uncertain_fields: Vec::new(),
163            });
164            continue;
165        }
166
167        let (fields, uncertain_fields) = collect_fields(
168            &raw.fields_idx,
169            &type_finder,
170            &size_cache,
171            arch,
172            raw.is_union,
173        )?;
174
175        let align = fields.iter().map(|f| f.align).max().unwrap_or(1);
176
177        layouts.push(StructLayout {
178            name: raw.name,
179            total_size: raw.size,
180            align,
181            fields,
182            source_file: None,
183            source_line: None,
184            arch,
185            is_packed: false,
186            is_union: raw.is_union,
187            is_repr_rust: false,
188            suppressed_findings: Vec::new(),
189            uncertain_fields,
190        });
191    }
192
193    Ok(layouts)
194}
195
196/// Approximate the alignment of a struct/union from its total size.
197///
198/// Returns the largest power-of-two that is ≤ `sz` and ≤ `arch.pointer_size`.
199/// This is a conservative under-estimate; the true alignment is the max of
200/// field alignments, which we derive from the collected fields after parsing.
201fn approx_struct_align(sz: usize, arch: &ArchConfig) -> usize {
202    if sz == 0 {
203        return 1;
204    }
205    // Largest power-of-two that divides sz (i.e. is ≤ sz)
206    let pot = 1usize << sz.trailing_zeros();
207    pot.min(arch.pointer_size)
208}
209
210/// Resolve the byte size of an enum's underlying integer type.
211fn underlying_enum_size(idx: TypeIndex, arch: &'static ArchConfig) -> usize {
212    // We need a TypeFinder to resolve this, but during the first pass we only
213    // have the iterator. Re-open would be expensive, so we fall back to 4 bytes
214    // (the MSVC default for `int`-backed enums) as a safe approximation.
215    // In practice this is correct for the vast majority of MSVC-compiled enums.
216    let _ = (idx, arch);
217    4
218}
219
220/// Collect fields from a FieldList type index.
221/// Returns `(fields, uncertain_field_names)`.
222fn collect_fields(
223    fields_idx: &TypeIndex,
224    type_finder: &TypeFinder<'_>,
225    size_cache: &HashMap<TypeIndex, (usize, usize)>,
226    arch: &'static ArchConfig,
227    is_union: bool,
228) -> anyhow::Result<(Vec<Field>, Vec<String>)> {
229    let field_type = type_finder.find(*fields_idx)?.parse()?;
230    let field_list = match field_type {
231        TypeData::FieldList(fl) => fl,
232        _ => return Ok((Vec::new(), Vec::new())),
233    };
234
235    let mut fields: Vec<Field> = Vec::new();
236    let mut uncertain: Vec<String> = Vec::new();
237
238    // Accumulate consecutive bitfield members (same offset) before flushing.
239    struct BfGroup {
240        parts: Vec<String>,
241        offset: usize,
242        storage_bytes: usize,
243    }
244    let mut pending_bf: Option<BfGroup> = None;
245
246    let flush_bf = |group: BfGroup, fields: &mut Vec<Field>, uncertain: &mut Vec<String>| {
247        if group.storage_bytes == 0 {
248            uncertain.push(format!("[bf@{}]", group.offset));
249            return;
250        }
251        let name = if group.parts.is_empty() {
252            "[__pad]".to_string()
253        } else {
254            format!("[{}]", group.parts.join("|"))
255        };
256        fields.push(Field {
257            name,
258            ty: TypeInfo::Primitive {
259                name: format!("uint{}_t", group.storage_bytes * 8),
260                size: group.storage_bytes,
261                align: group.storage_bytes,
262            },
263            offset: group.offset,
264            size: group.storage_bytes,
265            align: group.storage_bytes,
266            source_file: None,
267            source_line: None,
268            access: AccessPattern::Unknown,
269        });
270    };
271
272    for field_data in &field_list.fields {
273        if let TypeData::Member(m) = field_data {
274            let offset = m.offset as usize;
275            let name = m.name.to_string().into_owned();
276
277            // Detect bitfield: the field_type will be a Bitfield type record.
278            let bitfield_info = type_finder
279                .find(m.field_type)
280                .ok()
281                .and_then(|t| t.parse().ok())
282                .and_then(|td| {
283                    if let TypeData::Bitfield(bf) = td {
284                        Some(bf)
285                    } else {
286                        None
287                    }
288                });
289
290            if let Some(bf) = bitfield_info {
291                // Flush pending group if byte offset changed.
292                if let Some(ref g) = pending_bf
293                    && g.offset != offset
294                {
295                    let g = pending_bf.take().unwrap();
296                    flush_bf(g, &mut fields, &mut uncertain);
297                }
298
299                // Resolve the underlying storage type size.
300                let storage_bytes = type_finder
301                    .find(bf.underlying_type)
302                    .ok()
303                    .and_then(|t| t.parse().ok())
304                    .and_then(|td| {
305                        if let TypeData::Primitive(p) = td {
306                            primitive_size(&p, arch).map(|(sz, _)| sz)
307                        } else {
308                            None
309                        }
310                    })
311                    .unwrap_or(0);
312
313                let group = pending_bf.get_or_insert(BfGroup {
314                    parts: Vec::new(),
315                    offset,
316                    storage_bytes: 0,
317                });
318                if !name.is_empty() && bf.length > 0 {
319                    group.parts.push(format!("{name}:{}", bf.length));
320                }
321                if storage_bytes > group.storage_bytes {
322                    group.storage_bytes = storage_bytes;
323                }
324            } else {
325                // Flush any pending bitfield group.
326                if let Some(g) = pending_bf.take() {
327                    flush_bf(g, &mut fields, &mut uncertain);
328                }
329
330                let (size, align) = resolve_type_size(m.field_type, type_finder, size_cache, arch);
331                let ty = TypeInfo::Opaque {
332                    name: format!("{}", m.field_type.0),
333                    size,
334                    align,
335                };
336
337                fields.push(Field {
338                    name,
339                    ty,
340                    offset,
341                    size,
342                    align,
343                    source_file: None,
344                    source_line: None,
345                    access: AccessPattern::Unknown,
346                });
347            }
348            // Skip static members, virtual-base records, base classes — they don't
349            // occupy a predictable slot in the struct's memory layout.
350        }
351    }
352
353    if let Some(g) = pending_bf.take() {
354        flush_bf(g, &mut fields, &mut uncertain);
355    }
356
357    if is_union {
358        for f in &mut fields {
359            f.offset = 0;
360        }
361    } else {
362        fields.sort_by_key(|f| f.offset);
363    }
364
365    Ok((fields, uncertain))
366}
367
368/// Resolve a TypeIndex to (size_bytes, align_bytes).
369fn resolve_type_size(
370    idx: TypeIndex,
371    type_finder: &TypeFinder<'_>,
372    size_cache: &HashMap<TypeIndex, (usize, usize)>,
373    arch: &'static ArchConfig,
374) -> (usize, usize) {
375    if let Some(&pair) = size_cache.get(&idx) {
376        return pair;
377    }
378    let td = match type_finder.find(idx).ok().and_then(|t| t.parse().ok()) {
379        Some(td) => td,
380        None => return (arch.pointer_size, arch.pointer_size),
381    };
382    match td {
383        TypeData::Primitive(p) => {
384            primitive_size(&p, arch).unwrap_or((arch.pointer_size, arch.pointer_size))
385        }
386        TypeData::Class(c) => {
387            let sz = c.size as usize;
388            (sz, approx_struct_align(sz, arch))
389        }
390        TypeData::Union(u) => {
391            let sz = u.size as usize;
392            (sz, approx_struct_align(sz, arch))
393        }
394        TypeData::Pointer(_) => (arch.pointer_size, arch.pointer_size),
395        TypeData::Array(a) => {
396            // `dimensions` holds cumulative byte lengths per dimension, NOT
397            // element counts.  E.g. `float[4][4]` → `[16, 64]`.  The total
398            // byte size is always the last (outermost) entry.
399            let total = a.dimensions.last().copied().unwrap_or(0) as usize;
400            let (_, elem_al) = resolve_type_size(a.element_type, type_finder, size_cache, arch);
401            (total, elem_al)
402        }
403        _ => (arch.pointer_size, arch.pointer_size),
404    }
405}
406
407/// Map a PDB `PrimitiveType` to (size_bytes, align_bytes). Returns `None` for
408/// void/notype. Pointer indirection is always arch pointer size.
409fn primitive_size(p: &pdb::PrimitiveType, arch: &ArchConfig) -> Option<(usize, usize)> {
410    if p.indirection.is_some() {
411        return Some((arch.pointer_size, arch.pointer_size));
412    }
413    let sz: usize = match p.kind {
414        PrimitiveKind::NoType | PrimitiveKind::Void => return None,
415        PrimitiveKind::Char
416        | PrimitiveKind::UChar
417        | PrimitiveKind::I8
418        | PrimitiveKind::U8
419        | PrimitiveKind::RChar
420        | PrimitiveKind::Bool8 => 1,
421        PrimitiveKind::WChar
422        | PrimitiveKind::Short
423        | PrimitiveKind::UShort
424        | PrimitiveKind::I16
425        | PrimitiveKind::U16
426        | PrimitiveKind::RChar16
427        | PrimitiveKind::F16
428        | PrimitiveKind::Bool16 => 2,
429        PrimitiveKind::Long
430        | PrimitiveKind::ULong
431        | PrimitiveKind::I32
432        | PrimitiveKind::U32
433        | PrimitiveKind::RChar32
434        | PrimitiveKind::F32
435        | PrimitiveKind::F32PP
436        | PrimitiveKind::HRESULT
437        | PrimitiveKind::Bool32 => 4,
438        PrimitiveKind::Quad
439        | PrimitiveKind::UQuad
440        | PrimitiveKind::I64
441        | PrimitiveKind::U64
442        | PrimitiveKind::F64
443        | PrimitiveKind::Complex32
444        | PrimitiveKind::Bool64 => 8,
445        PrimitiveKind::Octa
446        | PrimitiveKind::UOcta
447        | PrimitiveKind::I128
448        | PrimitiveKind::U128
449        | PrimitiveKind::F128
450        | PrimitiveKind::Complex64 => 16,
451        PrimitiveKind::F48 => 6,
452        PrimitiveKind::F80 => 10,
453        PrimitiveKind::Complex80 => 20,
454        PrimitiveKind::Complex128 => 32,
455        _ => return None,
456    };
457    Some((sz, sz))
458}
459
460// ── tests ─────────────────────────────────────────────────────────────────────
461
#[cfg(test)]
mod tests {
    use super::*;
    use padlock_core::arch::X86_64_SYSV;

    /// Check `approx_struct_align` on x86-64 for each `(size, expected)` pair.
    fn check_aligns(cases: &[(usize, usize)]) {
        for &(size, expected) in cases {
            assert_eq!(approx_struct_align(size, &X86_64_SYSV), expected);
        }
    }

    /// Shorthand for a primitive of the given kind without pointer indirection.
    fn plain(kind: PrimitiveKind) -> pdb::PrimitiveType {
        pdb::PrimitiveType {
            kind,
            indirection: None,
        }
    }

    // ── approx_struct_align ───────────────────────────────────────────────────

    #[test]
    fn approx_align_zero() {
        check_aligns(&[(0, 1)]);
    }

    #[test]
    fn approx_align_exact_powers_of_two() {
        // A power of two no larger than pointer_size is its own alignment.
        check_aligns(&[(1, 1), (2, 2), (4, 4), (8, 8)]);
    }

    #[test]
    fn approx_align_capped_at_pointer_size() {
        // Multiples of 8 beyond pointer_size are clamped to 8 on x86-64.
        check_aligns(&[(16, 8), (24, 8), (64, 8)]);
    }

    #[test]
    fn approx_align_non_power_of_two_uses_trailing_zeros() {
        // Alignment is the largest power-of-two factor of the size, capped at 8.
        check_aligns(&[
            (12, 4), // 12 = 4 × 3 → two trailing zero bits
            (6, 2),  //  6 = 2 × 3 → one trailing zero bit
            (3, 1),  //  3 is odd  → no trailing zero bits
            (20, 4), // 20 = 4 × 5 → two trailing zero bits
        ]);
    }

    // ── primitive_size ────────────────────────────────────────────────────────

    #[test]
    fn primitive_size_void_returns_none() {
        let void = plain(PrimitiveKind::Void);
        assert!(primitive_size(&void, &X86_64_SYSV).is_none());
    }

    #[test]
    fn primitive_size_i32_is_4() {
        let int32 = plain(PrimitiveKind::I32);
        assert_eq!(primitive_size(&int32, &X86_64_SYSV), Some((4, 4)));
    }

    #[test]
    fn primitive_size_pointer_indirection_uses_arch_size() {
        let int32_ptr = pdb::PrimitiveType {
            kind: PrimitiveKind::I32,
            indirection: Some(pdb::Indirection::Near32),
        };
        let ptr = X86_64_SYSV.pointer_size;
        assert_eq!(primitive_size(&int32_ptr, &X86_64_SYSV), Some((ptr, ptr)));
    }

    #[test]
    fn primitive_size_u64_is_8() {
        let uint64 = plain(PrimitiveKind::U64);
        assert_eq!(primitive_size(&uint64, &X86_64_SYSV), Some((8, 8)));
    }
}