nornir 0.4.34

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
//! DWARF symbol extraction from compiled artifacts.
//!
//! Reads `DW_TAG_subprogram` entries out of a binary's `.debug_info`
//! section (gimli) and returns one [`Symbol`] per defined function.
//! Both the raw mangled name and the rustc-demangled name (with and
//! without generic args) are kept, so an agent can look up either form.
//!
//! Symbols whose `DW_AT_decl_file` resolves outside `workspace_root`
//! are filtered out — we only care about *our* code, not libstd or
//! crates.io deps.

use std::borrow::Cow;
use std::fs::File;
use std::path::{Path, PathBuf};

use anyhow::{anyhow, Context, Result};
use gimli::{
    AttributeValue, DebuggingInformationEntry, Dwarf, EndianSlice, RunTimeEndian, Unit,
};
use memmap2::Mmap;
use object::{Object, ObjectSection};
use serde::{Deserialize, Serialize};

/// One workspace-local function recovered from a `DW_TAG_subprogram` entry.
///
/// Produced by [`extract_symbols`]; all three name forms are kept so callers
/// can match on whichever one they have at hand.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Symbol {
    /// Demangled name with every `<…>` group removed, e.g. `foo::Bar::new`.
    pub name: String,
    /// Rustc-demangled name with full generics, e.g. `foo::Bar<u32>::new`.
    /// Identical to `name_mangled` when the raw name cannot be demangled.
    pub name_demangled: String,
    /// Raw symbol from DWARF (linkage name when present, else `DW_AT_name`).
    pub name_mangled: String,
    /// Source path of `DW_AT_decl_file`, relative to the workspace root.
    /// Symbols whose file lies outside the root are never emitted, so this
    /// is always a root-relative path.
    pub file: String,
    /// Source line of `DW_AT_decl_line`. `None` if absent.
    pub line: Option<u32>,
    /// Code size: `DW_AT_high_pc` itself when it is encoded as an offset, or
    /// `DW_AT_high_pc - DW_AT_low_pc` when it is an address. `None` unless
    /// both attributes are present.
    pub size_bytes: Option<u64>,
    /// Crate name as guessed from the first path component of `file`, i.e.
    /// the top-level directory under the workspace root.
    pub krate: String,
}

/// Collect every workspace-local function symbol found in `binary_path`.
///
/// The binary is memory-mapped, parsed as an object file, and each
/// compilation unit in its DWARF data is walked for `DW_TAG_subprogram`
/// entries.
///
/// * `binary_path` — a build artifact that still carries DWARF debug info.
///   Sections that are missing (or fail to decompress) are treated as empty
///   rather than reported as errors.
/// * `workspace_root` — the dir whose children are repos. It is
///   canonicalized when possible; entries whose source file resolves outside
///   it are dropped.
///
/// # Errors
///
/// Fails if the file cannot be opened, mapped, or parsed as an object file,
/// or if gimli reports malformed DWARF while iterating units and entries.
pub fn extract_symbols(binary_path: &Path, workspace_root: &Path) -> Result<Vec<Symbol>> {
    let handle = File::open(binary_path)
        .with_context(|| format!("open binary {}", binary_path.display()))?;
    // SAFETY: the mapping is only read, and it is dropped before this
    // function returns. NOTE(review): like any file-backed map this assumes
    // nobody truncates or rewrites the binary while we are reading it.
    let mapped = unsafe { Mmap::map(&handle) }
        .with_context(|| format!("mmap {}", binary_path.display()))?;
    let obj = object::File::parse(&*mapped)
        .with_context(|| format!("parse object {}", binary_path.display()))?;

    let endian = match obj.is_little_endian() {
        true => RunTimeEndian::Little,
        false => RunTimeEndian::Big,
    };

    // Best-effort section loader: an absent section, or one whose contents
    // cannot be decompressed, is handed to gimli as an empty slice.
    let load_section = |id: gimli::SectionId| -> Result<Cow<[u8]>> {
        let data = obj
            .section_by_name(id.name())
            .and_then(|section| section.uncompressed_data().ok())
            .unwrap_or(Cow::Borrowed(&[]));
        Ok(data)
    };
    let sections = gimli::DwarfSections::load(load_section)?;
    let dwarf = sections.borrow(|bytes| EndianSlice::new(bytes, endian));

    // Fall back to the path as given when it cannot be canonicalized.
    let abs_root = match workspace_root.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) => workspace_root.to_path_buf(),
    };

    let mut symbols = Vec::new();
    let mut units = dwarf.units();
    while let Some(unit_header) = units.next()? {
        let unit = dwarf.unit(unit_header)?;
        let comp_dir = unit
            .comp_dir
            .as_ref()
            .map(|dir| PathBuf::from(dir.to_string_lossy().into_owned()));
        let file_table = build_file_table(&dwarf, &unit, comp_dir.as_deref())?;

        let mut cursor = unit.entries();
        while let Some(die) = cursor.next_dfs()? {
            if die.tag() == gimli::DW_TAG_subprogram {
                // `extract_one` yields `None` for entries we do not keep.
                symbols.extend(extract_one(&dwarf, &unit, die, &file_table, &abs_root)?);
            }
        }
    }
    Ok(symbols)
}

/// Source-file paths of one compilation unit, indexed directly by the raw
/// `DW_AT_decl_file` value. Slots without a usable path hold an empty
/// `PathBuf`.
type FileTable = Vec<PathBuf>;

/// Build the [`FileTable`] for `unit` from its line-program header.
///
/// Each file's directory is resolved as follows: an absolute directory is
/// used as-is, a relative one is joined onto `comp_dir` (when known), and a
/// file with no directory attribute is placed directly under `comp_dir`.
/// Directory or file-name strings that cannot be read are skipped, leaving
/// that part of the path out.
///
/// Returns an empty table when the unit has no line program.
fn build_file_table(
    dwarf: &Dwarf<EndianSlice<RunTimeEndian>>,
    unit: &Unit<EndianSlice<RunTimeEndian>>,
    comp_dir: Option<&Path>,
) -> Result<FileTable> {
    let Some(program) = unit.line_program.as_ref() else {
        return Ok(FileTable::new());
    };
    let header = program.header();
    let file_names = header.file_names();

    // The `file_names()` slice is zero-based, but DWARF <= 4 numbers files
    // from 1 (index 0 means "no file"), while DWARF 5 numbers them from 0.
    // Callers index this table with the raw DW_AT_decl_file value, so for
    // the older versions slot 0 is reserved as a blank entry; without it
    // every lookup would land on the *next* file in the table.
    let first_index = if header.version() <= 4 { 1 } else { 0 };
    let mut out: FileTable = Vec::with_capacity(first_index + file_names.len());
    out.resize(first_index, PathBuf::new());

    for file in file_names {
        let mut path = PathBuf::new();
        match file.directory(header) {
            Some(dir_attr) => {
                if let Ok(dir) = dwarf.attr_string(unit, dir_attr) {
                    let dir = dir.to_string_lossy().into_owned();
                    // Relative directories are relative to the unit's
                    // compilation directory.
                    if !Path::new(&dir).is_absolute() {
                        if let Some(cd) = comp_dir {
                            path.push(cd);
                        }
                    }
                    path.push(dir);
                }
            }
            None => {
                if let Some(cd) = comp_dir {
                    path.push(cd);
                }
            }
        }
        if let Ok(name) = dwarf.attr_string(unit, file.path_name()) {
            path.push(name.to_string_lossy().into_owned());
        }
        out.push(path);
    }
    Ok(out)
}

/// Turn one `DW_TAG_subprogram` entry into a [`Symbol`], if it is ours.
///
/// * `files` — the unit's file table, indexed by the raw `DW_AT_decl_file`.
/// * `abs_root` — workspace root; only entries whose source file lies under
///   it are kept.
///
/// Returns `Ok(None)` when the entry has neither a linkage name nor a plain
/// name, when its declaring file is missing or unresolved, or when that
/// file lives outside `abs_root`.
///
/// # Errors
///
/// Fails only if a name attribute is present but its string cannot be read.
fn extract_one(
    dwarf: &Dwarf<EndianSlice<RunTimeEndian>>,
    unit: &Unit<EndianSlice<RunTimeEndian>>,
    entry: &DebuggingInformationEntry<EndianSlice<RunTimeEndian>>,
    files: &FileTable,
    abs_root: &Path,
) -> Result<Option<Symbol>> {
    // Prefer DW_AT_linkage_name (mangled) for the source-of-truth name.
    let linkage = attr_string(dwarf, unit, entry, gimli::DW_AT_linkage_name)?;
    let plain = attr_string(dwarf, unit, entry, gimli::DW_AT_name)?;
    let Some(mangled) = linkage.or(plain) else {
        return Ok(None);
    };

    // Resolve the declaring file first: most entries belong to libstd or
    // dependencies and are rejected here, before any demangling work.
    let Some(AttributeValue::FileIndex(raw_idx)) = entry.attr_value(gimli::DW_AT_decl_file)
    else {
        return Ok(None);
    };
    let Some(fp) = usize::try_from(raw_idx).ok().and_then(|i| files.get(i)) else {
        return Ok(None);
    };
    if fp.as_os_str().is_empty() {
        return Ok(None);
    }
    // Only keep symbols whose source lives under workspace_root. If the file
    // no longer exists on disk, compare the path as recorded in DWARF.
    let canon = fp.canonicalize().unwrap_or_else(|_| fp.clone());
    let Ok(rel) = canon.strip_prefix(abs_root) else {
        return Ok(None);
    };
    let krate = rel
        .components()
        .next()
        .map(|c| c.as_os_str().to_string_lossy().into_owned())
        .unwrap_or_default();
    let file = rel.to_string_lossy().into_owned();

    // A line number that does not fit in u32 is reported as absent rather
    // than silently truncated.
    let line = match entry.attr_value(gimli::DW_AT_decl_line) {
        Some(AttributeValue::Udata(n)) => u32::try_from(n).ok(),
        _ => None,
    };

    let low = match entry.attr_value(gimli::DW_AT_low_pc) {
        Some(AttributeValue::Addr(a)) => Some(a),
        _ => None,
    };
    // DW_AT_high_pc is either an offset from low_pc (already the size) or an
    // absolute end address.
    let size_bytes = match (low, entry.attr_value(gimli::DW_AT_high_pc)) {
        (Some(_), Some(AttributeValue::Udata(n))) => Some(n),
        (Some(a), Some(AttributeValue::Addr(b))) => Some(b.saturating_sub(a)),
        _ => None,
    };

    // Names that are not valid Rust manglings are kept verbatim.
    let demangled = rustc_demangle::try_demangle(&mangled)
        .map(|d| format!("{:#}", d))
        .unwrap_or_else(|_| mangled.clone());
    let stripped = strip_generics(&demangled);

    Ok(Some(Symbol {
        name: stripped,
        name_demangled: demangled,
        name_mangled: mangled,
        file,
        line,
        size_bytes,
        krate,
    }))
}

/// Read attribute `name` of `entry` as an owned, lossily-decoded string.
///
/// Returns `Ok(None)` when the entry does not carry the attribute, and an
/// error when the attribute exists but gimli cannot resolve it to a string.
fn attr_string(
    dwarf: &Dwarf<EndianSlice<RunTimeEndian>>,
    unit: &Unit<EndianSlice<RunTimeEndian>>,
    entry: &DebuggingInformationEntry<EndianSlice<RunTimeEndian>>,
    name: gimli::DwAt,
) -> Result<Option<String>> {
    match entry.attr_value(name) {
        None => Ok(None),
        Some(value) => {
            let raw = dwarf
                .attr_string(unit, value)
                .map_err(|e| anyhow!("attr_string: {e}"))?;
            Ok(Some(raw.to_string_lossy().into_owned()))
        }
    }
}

/// Strip `<…>` generic args, e.g. `Vec<u32>::new` → `Vec::new`.
///
/// Everything between a `<` and its matching `>` is removed, including
/// nested groups. The `>` of a `->` return-type arrow is not a closing
/// bracket: inside a generic group it is skipped along with the rest of the
/// group, and outside one it is kept as ordinary text. An unmatched `>`
/// outside any group is dropped.
///
/// Note that a leading qualified path is a `<…>` group too, so
/// `<T as Trait>::method` becomes `::method`.
fn strip_generics(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut depth = 0usize;
    let mut prev: Option<char> = None;
    for ch in s.chars() {
        match ch {
            '<' => depth += 1,
            // Second half of `->`: must not close a bracket.
            '>' if prev == Some('-') => {
                if depth == 0 {
                    out.push(ch);
                }
            }
            '>' => depth = depth.saturating_sub(1),
            _ if depth == 0 => out.push(ch),
            _ => {}
        }
        prev = Some(ch);
    }
    out
}

/// Convenience: keep the symbols for which `pattern` occurs as a substring
/// of any of the three name forms — [`Symbol::name`],
/// [`Symbol::name_demangled`] or [`Symbol::name_mangled`].
///
/// Input order is preserved.
pub fn lookup<'a>(symbols: &'a [Symbol], pattern: &str) -> Vec<&'a Symbol> {
    let mut hits = Vec::new();
    for sym in symbols {
        let forms = [&sym.name, &sym.name_demangled, &sym.name_mangled];
        if forms.iter().any(|form| form.contains(pattern)) {
            hits.push(sym);
        }
    }
    hits
}

/// Convenience: keep the symbols whose [`Symbol::file`] ends with `suffix`.
///
/// The comparison is a plain string suffix test; input order is preserved.
pub fn defined_in<'a>(symbols: &'a [Symbol], suffix: &str) -> Vec<&'a Symbol> {
    let mut matches = Vec::new();
    for sym in symbols {
        if sym.file.ends_with(suffix) {
            matches.push(sym);
        }
    }
    matches
}

#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end smoke test that runs the extractor on our own debug build.
    ///
    /// If `target/debug/nornir` has not been built the test prints a note
    /// and returns early, so an arbitrary test run does not fail just
    /// because the binary is missing.
    #[test]
    fn extract_own_binary() {
        let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
        let bin = manifest_dir.join("target/debug/nornir");
        if !bin.exists() {
            eprintln!("skipping: {} not built", bin.display());
            return;
        }
        // The workspace root is the parent of the crate directory.
        let root = manifest_dir.parent().unwrap().to_path_buf();
        let syms = extract_symbols(&bin, &root).expect("extract");
        assert!(!syms.is_empty(), "expected symbols in {}", bin.display());

        let has_nornir = syms.iter().any(|s| s.krate == "nornir");
        assert!(
            has_nornir,
            "expected some symbols from crate `nornir` (got {} total)",
            syms.len()
        );

        // At least one symbol should mention `Index` in some name form.
        let index_hits = lookup(&syms, "Index");
        assert!(!index_hits.is_empty(), "expected `Index` symbols");
    }

    /// Single, nested, and absent generic groups.
    #[test]
    fn strip_generics_basic() {
        let cases = [
            ("Vec<u32>::new", "Vec::new"),
            ("foo::Bar<X, Y<Z>>::baz", "foo::Bar::baz"),
            ("plain", "plain"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(strip_generics(input), expected);
        }
    }
}