Skip to main content

synth_core/
dwarf_line.rs

1//! VCR-DBG-001 step 3 — compose the source-line table (the join half).
2//!
3//! The DWARF Tier-1 bridge maps an ARM text offset back to a source `file:line`
4//! through three established facts:
5//!   1. each ARM instruction carries `source_line` = the wasm OP INDEX
6//!      (`ArmInstruction.source_line`);
7//!   2. step 1 (`FunctionOps.op_offsets`) maps op-index → the wasm code BYTE
8//!      OFFSET (module-relative);
9//!   3. step 2 parses the input wasm's `.debug_line` → (code-section-relative
10//!      address → `file:line`) rows.
11//!
12//! This module is the join for the wasm half — **op-index → source line** —
13//! which step 4 (emit) composes with the ARM layout (ARM-text-offset → op-index
14//! is just `source_line`). It is pure plain-data (no gimli, no backend): the
15//! caller parses the rows and supplies them, so the module is Bazel-clean and
16//! unwired (frozen-safe) until the emitter consumes it.
17//!
18//! The crux it encodes (validated on `scripts/repro/dwarf_coherent.wasm`,
19//! VCR-DBG-001 step-3 fixture): `op_offsets` are MODULE-relative while DWARF
20//! addresses are CODE-section-relative, and they differ by a single constant —
21//! the code section's payload start. So normalization is one subtraction:
22//! `dwarf_addr = op_offset - code_base`.
23
/// A single entry of the input `.debug_line` table, reduced to plain data.
///
/// `file` is whatever identifier the caller chose (for example an index into
/// the line program's file table); this module never interprets it, which is
/// what keeps the join free of any gimli dependency.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LineRow {
    /// Address in the code-section-relative (DWARF-for-wasm) address space.
    pub addr: u32,
    /// Source line in effect from `addr` onwards.
    pub line: u32,
    /// Opaque, caller-defined source-file identifier.
    pub file: u32,
}
34
/// The source position a wasm op resolves to: the `line`/`file` pair copied
/// from the line-table row that covers the op.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SourceLoc {
    /// Source line number.
    pub line: u32,
    /// Opaque source-file identifier, carried through from the covering row.
    pub file: u32,
}
41
42/// Map each wasm op (by its module-relative `op_offsets` byte offset) to a
43/// source location, by normalizing into the code-section-relative DWARF address
44/// space (`op_offset - code_base`) and taking the covering line-table row (the
45/// last row whose address is ≤ the op's address — standard line-table lookup).
46///
47/// Returns one entry per `op_offsets` element (parallel to a function's ops).
48/// `None` where the op precedes `code_base` (shouldn't happen for real code) or
49/// no row covers it (an op before the first line-table address).
50pub fn op_offsets_to_source(
51    op_offsets: &[u32],
52    code_base: u32,
53    rows: &[LineRow],
54) -> Vec<Option<SourceLoc>> {
55    let mut sorted: Vec<LineRow> = rows.to_vec();
56    sorted.sort_by_key(|r| r.addr);
57    op_offsets
58        .iter()
59        .map(|&off| {
60            let a = off.checked_sub(code_base)?;
61            // Largest row addr ≤ a (the line in effect at address a).
62            sorted
63                .iter()
64                .rev()
65                .find(|r| r.addr <= a)
66                .map(|r| SourceLoc {
67                    line: r.line,
68                    file: r.file,
69                })
70        })
71        .collect()
72}
73
74// ---------------------------------------------------------------------------
75// VCR-DBG-001 step 4 — PRODUCTION read + emit (the `--debug-line` feature).
76//
77// `read_input_dwarf_line` ports the read-side spike
78// (`tests/dwarf_line_read_spike.rs` + `dwarf_compose_step3.rs::code_base`):
79// pull the `.debug_*` custom sections out of the input wasm, parse `.debug_line`
80// with gimli, and also report the code-section payload start (`code_base`) the
81// compose normalizes against. `emit_debug_sections` ports the emit-side spike
82// (`tests/dwarf_emit_roundtrip_step4.rs::emit_dwarf`): take an address-ordered
83// (arm_addr → source line) table and produce a FULL debugger-readable DWARF unit
84// (`.debug_info`/`.debug_abbrev`/`.debug_str`/`.debug_line`) via gimli::write,
85// the CU's DW_AT_stmt_list pointing at the line table. Both are gated behind
86// `--debug-line`; when the input carries no DWARF, `read_input_dwarf_line`
87// returns empty rows (graceful no-op) and the emit is skipped, so the default
88// object stays bit-identical.
89
90use std::collections::HashMap;
91
92use gimli::{Dwarf, EndianSlice, LittleEndian, SectionId};
93use wasmparser::{Parser, Payload};
94
95/// Result of reading the input wasm's DWARF line table: the parsed rows plus the
96/// code-section payload start (`code_base`) the op-offset compose subtracts.
97#[derive(Debug, Default, Clone)]
98pub struct InputDwarfLine {
99    /// Code-section-relative `.debug_line` rows (`addr` is a wasm code byte
100    /// offset; for the synth bridge that equals the DWARF address space).
101    pub rows: Vec<LineRow>,
102    /// Module-relative byte offset of the code section payload start. Empty wasm
103    /// or a wasm with no code section reports 0.
104    pub code_base: u32,
105}
106
107/// Read the input wasm's `.debug_line` into code-section-relative
108/// `(addr → line)` rows and report `code_base`. Returns an empty table (rows
109/// empty, the feature a no-op) when the input carries no `.debug_*` sections or
110/// no parseable line program — never an error for a DWARF-free module.
111pub fn read_input_dwarf_line(wasm: &[u8]) -> InputDwarfLine {
112    // (a) extract every `.debug_*` custom section + find the code payload start.
113    let mut sections: HashMap<String, Vec<u8>> = HashMap::new();
114    let mut code_base = 0u32;
115    for payload in Parser::new(0).parse_all(wasm) {
116        match payload {
117            Ok(Payload::CustomSection(c)) if c.name().starts_with(".debug_") => {
118                sections.insert(c.name().to_string(), c.data().to_vec());
119            }
120            Ok(Payload::CodeSectionStart { range, .. }) => {
121                code_base = range.start as u32;
122            }
123            _ => {}
124        }
125    }
126    if !sections.contains_key(".debug_line") {
127        return InputDwarfLine {
128            rows: Vec::new(),
129            code_base,
130        };
131    }
132
133    // (b) parse `.debug_line` with gimli. A malformed line program degrades to
134    // an empty table (the feature no-ops) rather than failing the compile.
135    let rows = parse_debug_line_rows(&sections).unwrap_or_default();
136    InputDwarfLine { rows, code_base }
137}
138
139/// gimli read of `.debug_line` → rows. `file` is recorded as the line program's
140/// file index (kept opaque per `LineRow`'s contract; the compose carries it but
141/// only `addr`/`line` are load-bearing for the wasm-offset bridge).
142fn parse_debug_line_rows(
143    sections: &HashMap<String, Vec<u8>>,
144) -> Result<Vec<LineRow>, gimli::Error> {
145    let empty: &[u8] = &[];
146    let load = |id: SectionId| -> Result<EndianSlice<'_, LittleEndian>, gimli::Error> {
147        let data = sections.get(id.name()).map_or(empty, |v| v.as_slice());
148        Ok(EndianSlice::new(data, LittleEndian))
149    };
150    let dwarf = Dwarf::load(load)?;
151
152    let mut rows = Vec::new();
153    let mut units = dwarf.units();
154    while let Some(header) = units.next()? {
155        let unit = dwarf.unit(header)?;
156        let Some(program) = unit.line_program.clone() else {
157            continue;
158        };
159        let mut state = program.rows();
160        while let Some((_, row)) = state.next_row()? {
161            if row.end_sequence() {
162                continue;
163            }
164            rows.push(LineRow {
165                addr: row.address() as u32,
166                line: row.line().map(|l| l.get() as u32).unwrap_or(0),
167                file: row.file_index() as u32,
168            });
169        }
170    }
171    Ok(rows)
172}
173
174/// Emit an address-ordered `(arm_addr, line)` table as a FULL minimal DWARF unit
175/// (gimli::write) and return EVERY non-empty `.debug_*` section it produces —
176/// `.debug_info`, `.debug_abbrev`, `.debug_str`, `.debug_line` (and
177/// `.debug_line_str`/`.debug_ranges` etc. when non-empty). The caller composes
178/// the table (one address-sorted, de-duped sequence covering every function);
179/// this produces the section bytes for non-ALLOC ELF `PROGBITS` sections.
180/// Returns an empty `Vec` for an empty table (nothing to map ⇒ no sections ⇒
181/// output stays byte-identical).
182///
183/// Crucially this emits a real root `DW_TAG_compile_unit` DIE with `DW_AT_name`,
184/// `DW_AT_low_pc`/`DW_AT_high_pc` spanning the emitted text, and the line program
185/// attached — so the CU's `DW_AT_stmt_list` points at `.debug_line`. That makes
186/// the line table reachable via the NORMAL debugger walk (`.debug_info` → CU →
187/// `DW_AT_stmt_list` → line program), not just a standalone `.debug_line` parse.
188///
189/// Ports `tests/dwarf_emit_roundtrip_step4.rs::emit_dwarf` (which emits the same
190/// full unit and round-trips through `Dwarf::units()`).
191pub fn emit_debug_sections(table: &[(u64, u32)], text_sym: usize) -> Vec<EmittedDwarfSection> {
192    use gimli::write::{Address, AttributeValue, DwarfUnit, LineProgram, LineString, Sections};
193
194    if table.is_empty() {
195        return Vec::new();
196    }
197
198    let encoding = gimli::Encoding {
199        format: gimli::Format::Dwarf32,
200        version: 4,
201        address_size: 4,
202    };
203    let mut dwarf = DwarfUnit::new(encoding);
204
205    // The span of emitted text the unit describes: low_pc=`.text`+0 (text base),
206    // high_pc one past the last mapped address.
207    let high_pc = table.iter().map(|&(a, _)| a).max().unwrap_or(0) + 1;
208
209    // gimli 0.33 split the comp-dir/file args: (working_dir, source_dir,
210    // source_file, source_file_info). source_dir = None ⇒ the file sits in
211    // working_dir, matching the previous single-dir behaviour.
212    let mut program = LineProgram::new(
213        encoding,
214        gimli::LineEncoding::default(),
215        LineString::String(b"/synth".to_vec()),
216        None,
217        LineString::String(b"synth.wasm".to_vec()),
218        None,
219    );
220    let dir = program.default_directory();
221    let fid = program.add_file(LineString::String(b"synth.wasm".to_vec()), dir, None);
222
223    // The sequence base is `.text + 0` as a RELOCATABLE address (one
224    // `DW_LNE_set_address` against the `.text` symbol, addend 0); each row's
225    // `address_offset` stays a text-relative DELTA, so only this single site
226    // needs a relocation per section. Addend 0 ⇒ the in-place bytes are
227    // byte-identical to the previous `Address::Constant(0)` form.
228    let text_base = Address::Symbol {
229        symbol: text_sym,
230        addend: 0,
231    };
232    program.begin_sequence(Some(text_base));
233    for &(addr, line) in table {
234        let row = program.row();
235        row.address_offset = addr;
236        row.file = fid;
237        row.line = line as u64;
238        program.generate_row();
239    }
240    program.end_sequence(high_pc);
241    dwarf.unit.line_program = program;
242
243    // Populate the root DW_TAG_compile_unit DIE: a name, the text span, and (via
244    // gimli auto-wiring the attached line_program) DW_AT_stmt_list → .debug_line.
245    {
246        let name_id = dwarf.strings.add("synth.wasm");
247        let root = dwarf.unit.root();
248        let root_die = dwarf.unit.get_mut(root);
249        root_die.set(gimli::DW_AT_name, AttributeValue::StringRef(name_id));
250        root_die.set(gimli::DW_AT_low_pc, AttributeValue::Address(text_base));
251        root_die.set(gimli::DW_AT_high_pc, AttributeValue::Udata(high_pc));
252    }
253
254    let seed = RelocWriter {
255        inner: gimli::write::EndianVec::new(LittleEndian),
256        relocs: Vec::new(),
257    };
258    let mut sections = Sections::new(seed);
259    if dwarf.write(&mut sections).is_err() {
260        return Vec::new();
261    }
262
263    let mut out: Vec<EmittedDwarfSection> = Vec::new();
264    let _ = sections.for_each(|id, w: &RelocWriter| -> Result<(), ()> {
265        let bytes = w.inner.slice();
266        if !bytes.is_empty()
267            && let Some(name) = section_name(id)
268        {
269            let text_relocs = w
270                .relocs
271                .iter()
272                .map(|&(offset, _addend, size)| DwarfTextReloc {
273                    offset: offset as u32,
274                    size,
275                })
276                .collect();
277            out.push(EmittedDwarfSection {
278                name,
279                bytes: bytes.to_vec(),
280                text_relocs,
281            });
282        }
283        Ok(())
284    });
285    out
286}
287
/// One place inside a `.debug_*` section that holds a `.text` address and so
/// must be fixed up by the host linker once `.text` is placed.
///
/// These are REL-style: the addend already sits in the section bytes (always
/// `0` for our text-base references), so only the site (`offset`) and the
/// width (`size`) are recorded.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct DwarfTextReloc {
    /// Position of the relocated address word, in bytes from the section start.
    pub offset: u32,
    /// Width of the relocated value in bytes (always 4 for DWARF32 addresses).
    pub size: u8,
}
299
300/// One emitted `.debug_*` section: its ELF name, bytes, and the `.text`-symbol
301/// relocations it needs (empty for address-free sections like `.debug_str`).
302#[derive(Debug, Clone)]
303pub struct EmittedDwarfSection {
304    /// `'static` ELF section name (e.g. `.debug_line`).
305    pub name: &'static str,
306    /// Section payload bytes.
307    pub bytes: Vec<u8>,
308    /// `.text`-symbol relocations within this section (REL, in-place addend 0).
309    pub text_relocs: Vec<DwarfTextReloc>,
310}
311
312/// A gimli `write::Writer` that delegates to an inner `EndianVec` but records
313/// every `Address::Symbol` write as a relocation. ONLY `write_address` is
314/// overridden — `write_offset` (gimli's internal section-to-section references,
315/// e.g. `.debug_info` → `.debug_str`/`.debug_abbrev` and `DW_AT_stmt_list` →
316/// `.debug_line`) keeps the default, so those stay CONCRETE intra-file offsets
317/// and need no section symbols. The only relocations captured are the two
318/// `.text` references (the line program's `DW_LNE_set_address` and the CU's
319/// `DW_AT_low_pc`). `Clone` so `Sections::new` can seed each section writer.
320#[derive(Clone)]
321struct RelocWriter {
322    inner: gimli::write::EndianVec<LittleEndian>,
323    /// (offset within section, addend, size) for each `Address::Symbol` write.
324    relocs: Vec<(usize, i64, u8)>,
325}
326
327impl gimli::write::Writer for RelocWriter {
328    type Endian = LittleEndian;
329
330    fn endian(&self) -> Self::Endian {
331        self.inner.endian()
332    }
333
334    fn len(&self) -> usize {
335        self.inner.len()
336    }
337
338    fn write(&mut self, bytes: &[u8]) -> gimli::write::Result<()> {
339        self.inner.write(bytes)
340    }
341
342    fn write_at(&mut self, offset: usize, bytes: &[u8]) -> gimli::write::Result<()> {
343        self.inner.write_at(offset, bytes)
344    }
345
346    fn write_address(
347        &mut self,
348        address: gimli::write::Address,
349        size: u8,
350    ) -> gimli::write::Result<()> {
351        use gimli::write::Address;
352        match address {
353            Address::Constant(val) => self.inner.write_udata(val, size),
354            Address::Symbol { symbol: _, addend } => {
355                // REL: record the site and write the addend in place (0 ⇒ the
356                // bytes match the old `Address::Constant(0)` exactly).
357                let offset = self.inner.len();
358                self.relocs.push((offset, addend, size));
359                self.inner.write_udata(addend as u64, size)
360            }
361        }
362    }
363}
364
365/// `'static` ELF section name for the `.debug_*` sections the emitter can
366/// produce. Returns `None` for any section id we do not wire (none are expected
367/// for this minimal unit, but the match keeps the names `'static`).
368fn section_name(id: SectionId) -> Option<&'static str> {
369    Some(match id {
370        SectionId::DebugInfo => ".debug_info",
371        SectionId::DebugAbbrev => ".debug_abbrev",
372        SectionId::DebugStr => ".debug_str",
373        SectionId::DebugLine => ".debug_line",
374        SectionId::DebugLineStr => ".debug_line_str",
375        SectionId::DebugRanges => ".debug_ranges",
376        SectionId::DebugRngLists => ".debug_rnglists",
377        SectionId::DebugStrOffsets => ".debug_str_offsets",
378        SectionId::DebugAddr => ".debug_addr",
379        _ => return None,
380    })
381}
382
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a line-table row in file 1.
    fn row(addr: u32, line: u32) -> LineRow {
        LineRow {
            addr,
            line,
            file: 1,
        }
    }

    /// Resolve `op_offsets` and keep only the line numbers.
    fn lines_for(op_offsets: &[u32], code_base: u32, rows: &[LineRow]) -> Vec<Option<u32>> {
        op_offsets_to_source(op_offsets, code_base, rows)
            .into_iter()
            .map(|loc| loc.map(|s| s.line))
            .collect()
    }

    #[test]
    fn covering_row_lookup() {
        // Rows at code-relative 0 → line 10, 8 → line 11, 20 → line 12.
        let rows = [row(0, 10), row(8, 11), row(20, 12)];
        // With code_base 100 the ops land at code-relative 0, 4, 8 and 30:
        // exactly on row 0, inside row 0's span, exactly on row 8, and past
        // the final row (which still covers it).
        assert_eq!(
            lines_for(&[100, 104, 108, 130], 100, &rows),
            vec![Some(10), Some(10), Some(11), Some(12)]
        );
    }

    #[test]
    fn op_before_first_row_is_none() {
        // Module offset 100 is code-relative 0, ahead of the only row (addr 8).
        let resolved = op_offsets_to_source(&[100], 100, &[row(8, 11)]);
        assert_eq!(resolved[0], None);
    }

    #[test]
    fn op_before_code_base_is_none() {
        // Module offset 50 lies before code_base 100, so it cannot be normalized.
        let resolved = op_offsets_to_source(&[50], 100, &[row(0, 1)]);
        assert_eq!(resolved[0], None);
    }
}
437}