Skip to main content

rsleigh_fid/
lib.rs

1//! Function ID database — operand-masked body hashing for function
2//! identification across stripped binaries. Ghidra FID semantic clone
3//! with pure-Rust ingest/match pipeline.
4//!
5//! # Algorithm
6//! 1. Disassemble function linearly.
7//! 2. For each instruction: keep opcode + prefix bytes, zero operand slots
8//!    (registers, immediates, displacements) per arch mask table.
9//! 3. Hash masked byte stream with xxh3-64 → `full_hash`.
10//! 4. Combine with hashes of direct call targets → `specific_hash`.
11//! 5. Persist rows (full_hash, specific_hash, name, lib_id) in compact
12//!    binary format; ship as gzipped blob for runtime match pass.
13
14pub mod db;
15pub mod hash;
16pub mod ingest;
17pub mod mask;
18
19pub use db::{FidDb, FidEntry};
20pub use hash::FidHashQuad;
21
22/// Load all bundled FID databases matching the given architecture.
23/// Returns a list of (library_name, db) pairs. Empty if no bundled
24/// DBs ship for the architecture (e.g. ARM32, MIPS32, RISC-V today).
25pub fn bundled_dbs(arch: rsleigh_api::Architecture) -> Vec<(&'static str, FidDb)> {
26    let mut out = Vec::new();
27    let blobs: &[(&str, &[u8])] = match arch {
28        rsleigh_api::Architecture::X86_64 => &[
29            ("glibc", include_bytes!("../data/glibc-x86_64.fidb")),
30            ("libstdcxx", include_bytes!("../data/libstdcxx-x86_64.fidb")),
31            ("musl", include_bytes!("../data/musl-x86_64.fidb")),
32            ("zlib", include_bytes!("../data/zlib-x86_64.fidb")),
33            ("openssl", include_bytes!("../data/openssl-x86_64.fidb")),
34        ],
35        rsleigh_api::Architecture::AArch64 => &[
36            ("glibc", include_bytes!("../data/glibc-aarch64.fidb")),
37            (
38                "libstdcxx",
39                include_bytes!("../data/libstdcxx-aarch64.fidb"),
40            ),
41            ("musl", include_bytes!("../data/musl-aarch64.fidb")),
42            ("zlib", include_bytes!("../data/zlib-aarch64.fidb")),
43            ("openssl", include_bytes!("../data/openssl-aarch64.fidb")),
44        ],
45        _ => &[],
46    };
47    for (name, bytes) in blobs {
48        if let Ok(db) = FidDb::read(std::io::Cursor::new(bytes)) {
49            out.push((*name, db));
50        }
51    }
52    out
53}
54
55/// Convenience: fingerprint a function body and return matching name(s)
56/// from the database. Prefers `specific_hash` (callee-aware) matches;
57/// falls back to `full_hash` if specific yields nothing.
58///
59/// Returns `None` if the body is too small to fingerprint, or if no
60/// match exists. Returns `Some(&name)` when exactly one entry matches
61/// (unambiguous rename). Multi-match caller should use `FidDb` directly
62/// and apply additional disambiguation (e.g. library preference).
63pub fn identify<'a>(
64    arch: rsleigh_api::Architecture,
65    body: &[u8],
66    addr: u64,
67    db: &'a FidDb,
68) -> Option<&'a str> {
69    let hq = ingest::fingerprint(arch, body, addr, |_| None)?;
70    // Specific hash will only match when the callee graph lines up —
71    // without cross-function fingerprints during match, fall back to full.
72    // Accept multi-match when:
73    //  - every hit has the identical name (weak-alias duplicates), OR
74    //  - all hits are Itanium C++ ABI ctor/dtor variants of the same
75    //    function (C1/C2 complete/base ctors, D0/D1/D2 dtors share a body
76    //    by design). Prefer C1/D1 (complete-object) in that case.
77    let resolve = |idxs: &[usize]| -> Option<&'a str> {
78        if idxs.is_empty() {
79            return None;
80        }
81        let first = &db.entries[idxs[0]].name;
82        if idxs.iter().all(|i| &db.entries[*i].name == first) {
83            return Some(first);
84        }
85        if idxs
86            .iter()
87            .all(|i| is_cxx_abi_variant(&db.entries[*i].name, first))
88        {
89            // Pick complete-object variant if present, else first.
90            if let Some(i) = idxs.iter().find(|i| {
91                let n = &db.entries[**i].name;
92                n.contains("C1E") || n.contains("D1E")
93            }) {
94                return Some(&db.entries[*i].name);
95            }
96            return Some(first);
97        }
98        None
99    };
100    resolve(db.match_specific(hq.specific)).or_else(|| resolve(db.match_full(hq.full)))
101}
102
/// Reports whether `a` and `b` look like Itanium C++ ABI ctor/dtor
/// variants of the same function.
///
/// The recognised variant tags are:
///   - `C1` (complete), `C2` (base), `C3` (allocating) constructors
///   - `D0` (deleting), `D1` (complete), `D2` (base) destructors
///
/// Two names qualify when they are identical, or when they have the same
/// length and differ in exactly one byte, that byte being the digit of a
/// `C<digit>E` / `D<digit>E` tag from the lists above in both names.
///
/// This is a textual heuristic on the mangled name, not a demangler.
/// Anchoring on the byte where the names differ (rather than on the first
/// tag-like substring) keeps names with an earlier `C1E`-style substring
/// from being rejected, e.g. a template argument that is a class named
/// `C1`.
fn is_cxx_abi_variant(a: &str, b: &str) -> bool {
    if a == b {
        return true;
    }

    // Work on bytes: the tag characters are ASCII, and byte indexing
    // avoids any char-boundary concerns with unusual symbol names.
    let (lhs, rhs) = (a.as_bytes(), b.as_bytes());
    if lhs.len() != rhs.len() {
        return false;
    }

    // The names must differ in exactly one position.
    let mut diffs = lhs
        .iter()
        .zip(rhs)
        .enumerate()
        .filter(|(_, (x, y))| x != y)
        .map(|(i, _)| i);
    let pos = match (diffs.next(), diffs.next()) {
        (Some(p), None) => p,
        _ => return false,
    };

    // The differing byte must sit inside a `<C|D><digit>E` tag, so it
    // needs one byte before it and one byte after it.
    if pos == 0 || pos + 1 >= lhs.len() {
        return false;
    }
    if lhs[pos + 1] != b'E' {
        return false;
    }

    // Bytes around `pos` are equal in both names, so checking `lhs` is
    // enough to determine the group.
    let allowed: &[u8] = match lhs[pos - 1] {
        b'C' => b"123",
        b'D' => b"012",
        _ => return false,
    };
    allowed.contains(&lhs[pos]) && allowed.contains(&rhs[pos])
}