synth_core/dwarf_line.rs
1//! VCR-DBG-001 step 3 — compose the source-line table (the join half).
2//!
3//! The DWARF Tier-1 bridge maps an ARM text offset back to a source `file:line`
4//! through three established facts:
5//! 1. each ARM instruction carries `source_line` = the wasm OP INDEX
6//! (`ArmInstruction.source_line`);
7//! 2. step 1 (`FunctionOps.op_offsets`) maps op-index → the wasm code BYTE
8//! OFFSET (module-relative);
9//! 3. step 2 parses the input wasm's `.debug_line` → (code-section-relative
10//! address → `file:line`) rows.
11//!
12//! This module is the join for the wasm half — **op-index → source line** —
13//! which step 4 (emit) composes with the ARM layout (ARM-text-offset → op-index
14//! is just `source_line`). It is pure plain-data (no gimli, no backend): the
15//! caller parses the rows and supplies them, so the module is Bazel-clean and
16//! unwired (frozen-safe) until the emitter consumes it.
17//!
18//! The crux it encodes (validated on `scripts/repro/dwarf_coherent.wasm`,
19//! VCR-DBG-001 step-3 fixture): `op_offsets` are MODULE-relative while DWARF
20//! addresses are CODE-section-relative, and they differ by a single constant —
21//! the code section's payload start. So normalization is one subtraction:
22//! `dwarf_addr = op_offset - code_base`.
23
/// One `.debug_line` row: a code-section-relative address and its source line.
/// `file` is an opaque caller-supplied id (e.g. an index into the line
/// program's file table) so this stays gimli-free.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LineRow {
    /// Code-section-relative address (the DWARF-for-wasm address space).
    pub addr: u32,
    /// Source line recorded for `addr`. The gimli reader in this file stores
    /// `0` when a row carries no line number.
    pub line: u32,
    /// Opaque file id; lookups copy it through unchanged.
    pub file: u32,
}
34
/// A resolved source location for a wasm op.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SourceLoc {
    /// Source line, copied from the covering [`LineRow`]'s `line`.
    pub line: u32,
    /// Opaque file id, copied from the covering [`LineRow`]'s `file`.
    pub file: u32,
}
41
42/// Map each wasm op (by its module-relative `op_offsets` byte offset) to a
43/// source location, by normalizing into the code-section-relative DWARF address
44/// space (`op_offset - code_base`) and taking the covering line-table row (the
45/// last row whose address is ≤ the op's address — standard line-table lookup).
46///
47/// Returns one entry per `op_offsets` element (parallel to a function's ops).
48/// `None` where the op precedes `code_base` (shouldn't happen for real code) or
49/// no row covers it (an op before the first line-table address).
50pub fn op_offsets_to_source(
51 op_offsets: &[u32],
52 code_base: u32,
53 rows: &[LineRow],
54) -> Vec<Option<SourceLoc>> {
55 let mut sorted: Vec<LineRow> = rows.to_vec();
56 sorted.sort_by_key(|r| r.addr);
57 op_offsets
58 .iter()
59 .map(|&off| {
60 let a = off.checked_sub(code_base)?;
61 // Largest row addr ≤ a (the line in effect at address a).
62 sorted
63 .iter()
64 .rev()
65 .find(|r| r.addr <= a)
66 .map(|r| SourceLoc {
67 line: r.line,
68 file: r.file,
69 })
70 })
71 .collect()
72}
73
74// ---------------------------------------------------------------------------
75// VCR-DBG-001 step 4 — PRODUCTION read + emit (the `--debug-line` feature).
76//
77// `read_input_dwarf_line` ports the read-side spike
78// (`tests/dwarf_line_read_spike.rs` + `dwarf_compose_step3.rs::code_base`):
79// pull the `.debug_*` custom sections out of the input wasm, parse `.debug_line`
80// with gimli, and also report the code-section payload start (`code_base`) the
81// compose normalizes against. `emit_debug_sections` ports the emit-side spike
82// (`tests/dwarf_emit_roundtrip_step4.rs::emit_dwarf`): take an address-ordered
83// (arm_addr → source line) table and produce a FULL debugger-readable DWARF unit
84// (`.debug_info`/`.debug_abbrev`/`.debug_str`/`.debug_line`) via gimli::write,
85// the CU's DW_AT_stmt_list pointing at the line table. Both are gated behind
86// `--debug-line`; when the input carries no DWARF, `read_input_dwarf_line`
87// returns empty rows (graceful no-op) and the emit is skipped, so the default
88// object stays bit-identical.
89
90use std::collections::HashMap;
91
92use gimli::{Dwarf, EndianSlice, LittleEndian, SectionId};
93use wasmparser::{Parser, Payload};
94
/// Result of reading the input wasm's DWARF line table: the parsed rows plus the
/// code-section payload start (`code_base`) the op-offset compose subtracts.
///
/// `Default` yields empty rows and a `code_base` of 0.
#[derive(Debug, Default, Clone)]
pub struct InputDwarfLine {
    /// Code-section-relative `.debug_line` rows (`addr` is a wasm code byte
    /// offset; for the synth bridge that equals the DWARF address space).
    /// Empty when the input has no `.debug_line` or it fails to parse.
    pub rows: Vec<LineRow>,
    /// Module-relative byte offset of the code section payload start. Empty wasm
    /// or a wasm with no code section reports 0.
    pub code_base: u32,
}
106
107/// Read the input wasm's `.debug_line` into code-section-relative
108/// `(addr → line)` rows and report `code_base`. Returns an empty table (rows
109/// empty, the feature a no-op) when the input carries no `.debug_*` sections or
110/// no parseable line program — never an error for a DWARF-free module.
111pub fn read_input_dwarf_line(wasm: &[u8]) -> InputDwarfLine {
112 // (a) extract every `.debug_*` custom section + find the code payload start.
113 let mut sections: HashMap<String, Vec<u8>> = HashMap::new();
114 let mut code_base = 0u32;
115 for payload in Parser::new(0).parse_all(wasm) {
116 match payload {
117 Ok(Payload::CustomSection(c)) if c.name().starts_with(".debug_") => {
118 sections.insert(c.name().to_string(), c.data().to_vec());
119 }
120 Ok(Payload::CodeSectionStart { range, .. }) => {
121 code_base = range.start as u32;
122 }
123 _ => {}
124 }
125 }
126 if !sections.contains_key(".debug_line") {
127 return InputDwarfLine {
128 rows: Vec::new(),
129 code_base,
130 };
131 }
132
133 // (b) parse `.debug_line` with gimli. A malformed line program degrades to
134 // an empty table (the feature no-ops) rather than failing the compile.
135 let rows = parse_debug_line_rows(§ions).unwrap_or_default();
136 InputDwarfLine { rows, code_base }
137}
138
139/// gimli read of `.debug_line` → rows. `file` is recorded as the line program's
140/// file index (kept opaque per `LineRow`'s contract; the compose carries it but
141/// only `addr`/`line` are load-bearing for the wasm-offset bridge).
142fn parse_debug_line_rows(
143 sections: &HashMap<String, Vec<u8>>,
144) -> Result<Vec<LineRow>, gimli::Error> {
145 let empty: &[u8] = &[];
146 let load = |id: SectionId| -> Result<EndianSlice<'_, LittleEndian>, gimli::Error> {
147 let data = sections.get(id.name()).map_or(empty, |v| v.as_slice());
148 Ok(EndianSlice::new(data, LittleEndian))
149 };
150 let dwarf = Dwarf::load(load)?;
151
152 let mut rows = Vec::new();
153 let mut units = dwarf.units();
154 while let Some(header) = units.next()? {
155 let unit = dwarf.unit(header)?;
156 let Some(program) = unit.line_program.clone() else {
157 continue;
158 };
159 let mut state = program.rows();
160 while let Some((_, row)) = state.next_row()? {
161 if row.end_sequence() {
162 continue;
163 }
164 rows.push(LineRow {
165 addr: row.address() as u32,
166 line: row.line().map(|l| l.get() as u32).unwrap_or(0),
167 file: row.file_index() as u32,
168 });
169 }
170 }
171 Ok(rows)
172}
173
/// Emit an address-ordered `(arm_addr, line)` table as a FULL minimal DWARF unit
/// (gimli::write) and return every non-empty `.debug_*` section it produces that
/// `section_name` has a name for — `.debug_info`, `.debug_abbrev`, `.debug_str`,
/// `.debug_line` (and `.debug_line_str`/`.debug_ranges` etc. when non-empty).
/// The caller composes the table (one address-sorted, de-duped sequence covering
/// every function); this produces the section bytes for non-ALLOC ELF
/// `PROGBITS` sections.
///
/// * `table` — `(text-relative address, source line)` pairs, written as line
///   rows in the order given.
/// * `text_sym` — symbol index stored in the `Address::Symbol` used for the
///   text base; this function only passes it through to gimli.
///
/// Returns an empty `Vec` for an empty table (nothing to map ⇒ no sections ⇒
/// output stays byte-identical), and also when gimli fails to write the unit.
///
/// Crucially this emits a real root `DW_TAG_compile_unit` DIE with `DW_AT_name`,
/// `DW_AT_low_pc`/`DW_AT_high_pc` spanning the emitted text, and the line program
/// attached — so the CU's `DW_AT_stmt_list` points at `.debug_line`. That makes
/// the line table reachable via the NORMAL debugger walk (`.debug_info` → CU →
/// `DW_AT_stmt_list` → line program), not just a standalone `.debug_line` parse.
///
/// Ports `tests/dwarf_emit_roundtrip_step4.rs::emit_dwarf` (which emits the same
/// full unit and round-trips through `Dwarf::units()`).
pub fn emit_debug_sections(table: &[(u64, u32)], text_sym: usize) -> Vec<EmittedDwarfSection> {
    use gimli::write::{Address, AttributeValue, DwarfUnit, LineProgram, LineString, Sections};

    if table.is_empty() {
        return Vec::new();
    }

    // DWARF 4, 32-bit format, 4-byte addresses.
    let encoding = gimli::Encoding {
        format: gimli::Format::Dwarf32,
        version: 4,
        address_size: 4,
    };
    let mut dwarf = DwarfUnit::new(encoding);

    // The span of emitted text the unit describes: low_pc=`.text`+0 (text base),
    // high_pc one past the last mapped address. Taken as the maximum over the
    // table, so it does not depend on the table actually being sorted.
    // NOTE(review): `+ 1` overflows if an address equals `u64::MAX` — confirm
    // callers never pass one.
    let high_pc = table.iter().map(|&(a, _)| a).max().unwrap_or(0) + 1;

    // gimli 0.33 split the comp-dir/file args: (working_dir, source_dir,
    // source_file, source_file_info). source_dir = None ⇒ the file sits in
    // working_dir, matching the previous single-dir behaviour.
    let mut program = LineProgram::new(
        encoding,
        gimli::LineEncoding::default(),
        LineString::String(b"/synth".to_vec()),
        None,
        LineString::String(b"synth.wasm".to_vec()),
        None,
    );
    // Every row is attributed to this one file entry in the default directory.
    let dir = program.default_directory();
    let fid = program.add_file(LineString::String(b"synth.wasm".to_vec()), dir, None);

    // The sequence base is `.text + 0` as a RELOCATABLE address (one
    // `DW_LNE_set_address` against the `.text` symbol, addend 0); each row's
    // `address_offset` stays a text-relative DELTA, so only this single site
    // needs a relocation per section. Addend 0 ⇒ the in-place bytes are
    // byte-identical to the previous `Address::Constant(0)` form.
    let text_base = Address::Symbol {
        symbol: text_sym,
        addend: 0,
    };
    program.begin_sequence(Some(text_base));
    for &(addr, line) in table {
        let row = program.row();
        row.address_offset = addr;
        row.file = fid;
        row.line = line as u64;
        program.generate_row();
    }
    // Close the single sequence at the same offset used for DW_AT_high_pc.
    program.end_sequence(high_pc);
    dwarf.unit.line_program = program;

    // Populate the root DW_TAG_compile_unit DIE: a name, the text span, and (via
    // gimli auto-wiring the attached line_program) DW_AT_stmt_list → .debug_line.
    {
        let name_id = dwarf.strings.add("synth.wasm");
        let root = dwarf.unit.root();
        let root_die = dwarf.unit.get_mut(root);
        root_die.set(gimli::DW_AT_name, AttributeValue::StringRef(name_id));
        root_die.set(gimli::DW_AT_low_pc, AttributeValue::Address(text_base));
        // Constant-form high_pc; with low_pc at `.text + 0` it equals the span
        // length.
        root_die.set(gimli::DW_AT_high_pc, AttributeValue::Udata(high_pc));
    }

    // Seed writer handed to `Sections::new`; `RelocWriter` records every
    // `Address::Symbol` write so relocations can be reported per section.
    let seed = RelocWriter {
        inner: gimli::write::EndianVec::new(LittleEndian),
        relocs: Vec::new(),
    };
    let mut sections = Sections::new(seed);
    // A write failure degrades to "no sections" rather than an error.
    if dwarf.write(&mut sections).is_err() {
        return Vec::new();
    }

    let mut out: Vec<EmittedDwarfSection> = Vec::new();
    // The closure always returns `Ok`, so the discarded result carries nothing.
    let _ = sections.for_each(|id, w: &RelocWriter| -> Result<(), ()> {
        let bytes = w.inner.slice();
        // Skip empty sections and any section id `section_name` does not wire.
        if !bytes.is_empty()
            && let Some(name) = section_name(id)
        {
            // The recorded addend is dropped here: REL form keeps it in the
            // section bytes. NOTE(review): `offset as u32` truncates for
            // sections of 4 GiB or more — presumably unreachable; confirm.
            let text_relocs = w
                .relocs
                .iter()
                .map(|&(offset, _addend, size)| DwarfTextReloc {
                    offset: offset as u32,
                    size,
                })
                .collect();
            out.push(EmittedDwarfSection {
                name,
                bytes: bytes.to_vec(),
                text_relocs,
            });
        }
        Ok(())
    });
    out
}
287
/// A relocation a `.debug_*` section needs against the `.text` symbol so a host
/// linker fixes up the embedded `.text` address when `.text` is placed. REL
/// form: the in-place bytes already hold the addend (always `0` for our
/// text-base references), so only the site (`offset`) and `size` travel here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DwarfTextReloc {
    /// Byte offset within the section where the relocated address word sits.
    pub offset: u32,
    /// Size in bytes of the relocated value, as handed to the writer's
    /// `write_address`. Presumably always 4 here, since `emit_debug_sections`
    /// builds its encoding with `address_size: 4` — confirm against gimli.
    pub size: u8,
}
299
/// One emitted `.debug_*` section: its ELF name, bytes, and the `.text`-symbol
/// relocations it needs (empty for address-free sections like `.debug_str`).
/// Produced by `emit_debug_sections`, one value per named, non-empty section.
#[derive(Debug, Clone)]
pub struct EmittedDwarfSection {
    /// `'static` ELF section name (e.g. `.debug_line`).
    pub name: &'static str,
    /// Section payload bytes; relocated sites hold their addend in place.
    pub bytes: Vec<u8>,
    /// `.text`-symbol relocations within this section (REL, in-place addend 0).
    pub text_relocs: Vec<DwarfTextReloc>,
}
311
/// A gimli `write::Writer` that delegates to an inner `EndianVec` but records
/// every `Address::Symbol` write as a relocation. ONLY `write_address` is
/// overridden — `write_offset` (gimli's internal section-to-section references,
/// e.g. `.debug_info` → `.debug_str`/`.debug_abbrev` and `DW_AT_stmt_list` →
/// `.debug_line`) keeps the default, so those stay CONCRETE intra-file offsets
/// and need no section symbols. The only relocations captured are the two
/// `.text` references (the line program's `DW_LNE_set_address` and the CU's
/// `DW_AT_low_pc`). `Clone` so `Sections::new` can seed each section writer.
#[derive(Clone)]
struct RelocWriter {
    /// Backing little-endian byte buffer that receives every write.
    inner: gimli::write::EndianVec<LittleEndian>,
    /// (offset within section, addend, size) for each `Address::Symbol` write.
    /// The symbol index itself is not stored.
    relocs: Vec<(usize, i64, u8)>,
}
326
327impl gimli::write::Writer for RelocWriter {
328 type Endian = LittleEndian;
329
330 fn endian(&self) -> Self::Endian {
331 self.inner.endian()
332 }
333
334 fn len(&self) -> usize {
335 self.inner.len()
336 }
337
338 fn write(&mut self, bytes: &[u8]) -> gimli::write::Result<()> {
339 self.inner.write(bytes)
340 }
341
342 fn write_at(&mut self, offset: usize, bytes: &[u8]) -> gimli::write::Result<()> {
343 self.inner.write_at(offset, bytes)
344 }
345
346 fn write_address(
347 &mut self,
348 address: gimli::write::Address,
349 size: u8,
350 ) -> gimli::write::Result<()> {
351 use gimli::write::Address;
352 match address {
353 Address::Constant(val) => self.inner.write_udata(val, size),
354 Address::Symbol { symbol: _, addend } => {
355 // REL: record the site and write the addend in place (0 ⇒ the
356 // bytes match the old `Address::Constant(0)` exactly).
357 let offset = self.inner.len();
358 self.relocs.push((offset, addend, size));
359 self.inner.write_udata(addend as u64, size)
360 }
361 }
362 }
363}
364
365/// `'static` ELF section name for the `.debug_*` sections the emitter can
366/// produce. Returns `None` for any section id we do not wire (none are expected
367/// for this minimal unit, but the match keeps the names `'static`).
368fn section_name(id: SectionId) -> Option<&'static str> {
369 Some(match id {
370 SectionId::DebugInfo => ".debug_info",
371 SectionId::DebugAbbrev => ".debug_abbrev",
372 SectionId::DebugStr => ".debug_str",
373 SectionId::DebugLine => ".debug_line",
374 SectionId::DebugLineStr => ".debug_line_str",
375 SectionId::DebugRanges => ".debug_ranges",
376 SectionId::DebugRngLists => ".debug_rnglists",
377 SectionId::DebugStrOffsets => ".debug_str_offsets",
378 SectionId::DebugAddr => ".debug_addr",
379 _ => return None,
380 })
381}
382
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a line-table row in file 1.
    fn row(addr: u32, line: u32) -> LineRow {
        LineRow {
            addr,
            line,
            file: 1,
        }
    }

    /// Each op resolves to the last row at or below its code-relative address.
    #[test]
    fn covering_row_lookup() {
        // code_base 100; rows at code-rel 0→line10, 8→line11, 20→line12.
        let rows = [row(0, 10), row(8, 11), row(20, 12)];
        // ops at module 100 (→0), 104 (→4), 108 (→8), 130 (→30).
        let got = op_offsets_to_source(&[100, 104, 108, 130], 100, &rows);
        // addr 0 → row 0; addr 4 → still row 0; addr 8 → row 8;
        // addr 30 → row 20 (last ≤).
        let expected_lines = [10, 10, 11, 12];
        for (idx, want) in expected_lines.iter().enumerate() {
            assert_eq!(got[idx].map(|s| s.line), Some(*want));
        }
    }

    /// An op below the first row's address has no covering row.
    #[test]
    fn op_before_first_row_is_none() {
        let rows = [row(8, 11)];
        // op at module 100 → code-rel 0, before the first row (addr 8).
        let got = op_offsets_to_source(&[100], 100, &rows);
        assert_eq!(got[0], None);
    }

    /// An op offset below `code_base` cannot be normalized.
    #[test]
    fn op_before_code_base_is_none() {
        let rows = [row(0, 1)];
        let got = op_offsets_to_source(&[50], 100, &rows);
        assert_eq!(got[0], None);
    }
}