compiledfiles 0.2.0

Parse native binary symbols for list of source files used to compile binary
Documentation
//! # compiledfiles
//!
//! A library to get a list of all files that were used to compile the given
//! binary.
//!
//! This library currently only supports the following formats:
//!
//! * ELF files
//! * PDB files
//!
//! The following file formats are a work in progress:
//!
//! * Mach-O files
//!
//! This library currently only supports files generated by the following compilers:
//!
//! * GCC
//! * LLVM
//! * MSVC
//!
//! This library has currently only been tested with the following languages:
//!
//! * C/C++
//!
//! The following languages are a work in progress:
//!
//! * Rust
//! * Go
//!
//! Help is welcome for supporting any future formats.
//!
//! # Examples
//!
//! ```no_run
//! let elf_file = std::fs::File::open("path_to_binary").unwrap();
//! let files = compiledfiles::parse(elf_file).unwrap();
//! for file in files {
//!     println!("{:?}", file);
//! }
//! ```
use gimli::Dwarf;
use object::Object;
use pdb::FallibleIterator;

use std::borrow::Cow;
use std::cmp::Ordering;
use std::error::Error as StdError;
use std::fmt;
use std::io::Read;
use std::io::Seek;
use std::io::SeekFrom;
use std::path::PathBuf;
use std::vec::Vec;

/// Checksum of the source file's content
///
/// Each variant carries the raw digest bytes for the named algorithm.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum FileCheckSum {
    /// MD5 digest (16 bytes)
    Md5([u8; 16]),
    /// SHA-1 digest (20 bytes)
    Sha1([u8; 20]),
    /// SHA-256 digest (32 bytes)
    Sha256([u8; 32]),
}

/// Basic information stored for each source file. Only the path is required.
///
/// Equality and ordering both take every field into account, so that
/// `sort()` followed by `dedup()` removes all exact duplicates.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FileInfo {
    /// Recorded path to the source file
    pub path: PathBuf,

    /// Size of the source file in bytes
    pub size: Option<u64>,

    /// Last modified timestamp of the source file
    pub timestamp: Option<u64>,

    /// Checksum of the source file
    pub checksum: Option<FileCheckSum>,
}

impl Ord for FileInfo {
    /// Orders by path first, then by size, timestamp and checksum.
    ///
    /// The tie-breakers keep `cmp` consistent with `==`: two values compare
    /// as `Ordering::Equal` only when all of their fields are equal.
    fn cmp(&self, other: &Self) -> Ordering {
        self.path
            .cmp(&other.path)
            .then_with(|| self.size.cmp(&other.size))
            .then_with(|| self.timestamp.cmp(&other.timestamp))
            .then_with(|| self.checksum.cmp(&other.checksum))
    }
}

impl PartialOrd for FileInfo {
    /// Delegates to [`Ord::cmp`] so both orderings always agree.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// Possible errors for attempting to list all sources
#[derive(Debug)]
pub enum Error {
    /// The binary file is a valid file format, but does not contain debug
    /// symbols.
    MissingDebugSymbols,

    /// The file passed in is of an unknown format
    UnrecognizedFileFormat,

    /// An IO error occurred
    Io(std::io::Error),

    /// There was an error parsing the Dwarf information
    Dwarf(gimli::Error),

    /// There was an error parsing an ELF or Mach-O file; the payload is the
    /// static message describing the failure
    Object(&'static str),

    /// There was an error parsing a PDB file
    Pdb(pdb::Error),
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
        match *self {
            Error::MissingDebugSymbols => write!(f, "{}", self.description()),
            Error::UnrecognizedFileFormat => write!(f, "{}", self.description()),
            Error::Object(s) => write!(f, "{}", s),
            Error::Io(ref e) => write!(f, "{}", e),
            Error::Dwarf(ref e) => write!(f, "{}", e),
            Error::Pdb(ref p) => write!(f, "{}", p),
        }
    }
}

impl StdError for Error {
    fn description(&self) -> &str {
        match *self {
            Error::MissingDebugSymbols => "Binary missing debug symbols",
            Error::UnrecognizedFileFormat => "File was not a recognized file format",
            Error::Object(s) => s,
            Error::Io(ref e) => e.description(),
            Error::Dwarf(ref e) => e.description(),
            Error::Pdb(ref p) => p.description(),
        }
    }

    fn source(&self) -> Option<&(dyn StdError + 'static)> {
        match *self {
            Error::Dwarf(ref err) => Some(err),
            Error::Io(ref err) => Some(err),
            Error::Pdb(ref err) => Some(err),
            Error::MissingDebugSymbols | Error::UnrecognizedFileFormat | Error::Object(_) => None,
        }
    }
}

impl From<std::io::Error> for Error {
    fn from(e: std::io::Error) -> Error {
        Error::Io(e)
    }
}

impl From<gimli::Error> for Error {
    fn from(e: gimli::Error) -> Error {
        Error::Dwarf(e)
    }
}

impl From<&'static str> for Error {
    fn from(s: &'static str) -> Error {
        Error::Object(s)
    }
}

impl From<pdb::Error> for Error {
    fn from(p: pdb::Error) -> Error {
        Error::Pdb(p)
    }
}

/// Shorthand for a `std::result::Result` whose error type is this module's
/// [`Error`].
type Result<T> = ::std::result::Result<T, Error>;

/// Converts a checksum read from a PDB into this crate's [`FileCheckSum`].
///
/// Returns `None` when the PDB records no checksum, or when the recorded
/// bytes do not have the exact length of the declared algorithm's digest
/// (16 for MD5, 20 for SHA-1, 32 for SHA-256).
fn convert_pdb_checksum_to_checksum(pdb_checksum: pdb::FileChecksum) -> Option<FileCheckSum> {
    // The digest bytes come from the PDB file, so their length is checked
    // here: a slice-to-array conversion fails on a mismatch instead of
    // panicking the way `copy_from_slice` would.
    match pdb_checksum {
        pdb::FileChecksum::Md5(data) => {
            <[u8; 16] as std::convert::TryFrom<&[u8]>>::try_from(data)
                .ok()
                .map(FileCheckSum::Md5)
        }
        pdb::FileChecksum::Sha1(data) => {
            <[u8; 20] as std::convert::TryFrom<&[u8]>>::try_from(data)
                .ok()
                .map(FileCheckSum::Sha1)
        }
        pdb::FileChecksum::Sha256(data) => {
            <[u8; 32] as std::convert::TryFrom<&[u8]>>::try_from(data)
                .ok()
                .map(FileCheckSum::Sha256)
        }
        pdb::FileChecksum::None => None,
    }
}

/// Parses out the source file information from a file
///
/// # Arguments
///
/// * `file` - The opened file we want to parse
///
/// # Example
///
/// ```no_run
/// let elf_file = std::fs::File::open("path_to_binary").unwrap();
/// let files = compiledfiles::parse(elf_file).unwrap();
/// for file in files {
///     println!("{:?}", file);
/// }
/// ```
pub fn parse(mut file: std::fs::File) -> Result<Vec<FileInfo>> {
    // try parsing a PDB first
    match pdb::PDB::open(&mut file) {
        Ok(pdb) => return parse_pdb(pdb),
        Err(e) => match e {
            pdb::Error::UnrecognizedFileFormat => {
                // continue
            }
            pdb::Error::IoError(i) if i.kind() == std::io::ErrorKind::UnexpectedEof => {
                // continue
                // workaround for https://github.com/willglynn/pdb/issues/75
            }
            _ => return Err(Error::Pdb(e)),
        },
    };

    file.seek(SeekFrom::Start(0))?;

    // Now try elf or mach-o
    let mut contents = vec![];
    file.read_to_end(&mut contents)?;

    match object::File::parse(&contents[..]) {
        Ok(obj) => return parse_object(&obj),
        Err(e) => match e {
            "Unknown file magic" => {
                // continue
            }
            _ => return Err(Error::Object(e)),
        },
    };

    Err(Error::UnrecognizedFileFormat)
}

/// Collects the source files listed in the line programs of every module of
/// a PDB.
///
/// Each entry gets its path and, when the PDB records one, its checksum;
/// size and timestamp are always `None`. The result is sorted and
/// de-duplicated.
fn parse_pdb<'s, S: pdb::Source<'s> + 's>(mut pdb: pdb::PDB<'s, S>) -> Result<Vec<FileInfo>> {
    let debug_info = pdb.debug_information()?;
    let strings = pdb.string_table()?;

    let mut sources = Vec::new();
    let mut module_iter = debug_info.modules()?;

    while let Some(module) = module_iter.next()? {
        // Modules for which the PDB yields no module info are skipped.
        let module_info = match pdb.module_info(&module)? {
            Some(found) => found,
            None => continue,
        };

        let program = module_info.line_program()?;
        let mut entries = program.files();
        while let Some(entry) = entries.next()? {
            // File names are references into the PDB string table.
            let raw_name = entry.name.to_raw_string(&strings)?;
            sources.push(FileInfo {
                path: PathBuf::from(raw_name.to_string().as_ref()),
                size: None,
                timestamp: None,
                checksum: convert_pdb_checksum_to_checksum(entry.checksum),
            });
        }
    }

    sources.sort();
    sources.dedup();

    Ok(sources)
}

fn parse_object(file: &object::File) -> Result<Vec<FileInfo>> {
    let endianness = if file.is_little_endian() {
        gimli::RunTimeEndian::Little
    } else {
        gimli::RunTimeEndian::Big
    };

    if file.has_debug_symbols() {
        match file.format() {
            object::target_lexicon::BinaryFormat::Elf => parse_elf_file(file, endianness),
            object::target_lexicon::BinaryFormat::Coff => Err(Error::MissingDebugSymbols),
            object::target_lexicon::BinaryFormat::Macho => {
                unimplemented!();
            }
            _ => Err(Error::UnrecognizedFileFormat),
        }
    } else {
        Err(Error::MissingDebugSymbols)
    }
}

/// Collects the source files named by the DWARF line programs of an ELF file.
///
/// # Arguments
///
/// * `file` - The parsed object file whose debug sections are read
/// * `endianness` - Byte order used to decode the DWARF sections
///
/// # Returns
///
/// A sorted, de-duplicated list with one entry per file name found in any
/// unit's line program. Timestamp, size and MD5 are filled in only when the
/// line program header says they are present; a timestamp or size of `0` is
/// reported as `None`. Pseudo file names starting with `<` are left out.
///
/// # Errors
///
/// Returns [`Error::Dwarf`] when a unit cannot be parsed or a string
/// attribute cannot be resolved.
fn parse_elf_file(file: &object::File, endianness: gimli::RunTimeEndian) -> Result<Vec<FileInfo>> {
    // Load a section and return as `Cow<[u8]>`. A section that is not present
    // is treated as empty.
    let load_section = |id: gimli::SectionId| -> Result<Cow<[u8]>> {
        Ok(file
            .section_data_by_name(id.name())
            .unwrap_or(Cow::Borrowed(&[][..])))
    };
    // Load a supplementary section. We don't have a supplementary object file,
    // so always return an empty slice.
    let load_section_sup = |_| Ok(Cow::Borrowed(&[][..]));

    // Load all of the sections.
    let dwarf_cow = Dwarf::load(&load_section, &load_section_sup)?;

    // Borrow a `Cow<[u8]>` to create an `EndianSlice`.
    let borrow_section: &dyn for<'a> Fn(
        &'a Cow<[u8]>,
    ) -> gimli::EndianSlice<'a, gimli::RunTimeEndian> =
        &|section| gimli::EndianSlice::new(&*section, endianness);

    // Create `EndianSlice`s for all of the sections.
    let dwarf = dwarf_cow.borrow(&borrow_section);

    // Iterate over the compilation units.
    let mut iter = dwarf.units();

    let mut files = vec![];

    while let Some(unit_header) = iter.next()? {
        let unit = dwarf.unit(unit_header)?;

        if let Some(ref program) = unit.line_program {
            let header = program.header();

            for entry in header.file_names() {
                let filename = dwarf
                    .attr_string(&unit, entry.path_name())?
                    .to_string_lossy();

                // GCC will stick in a pseudo filename "<built-in>" for source
                // built into GCC. Skip those before doing any further work.
                if filename.starts_with('<') {
                    continue;
                }

                // The directory may be unresolvable (for example an index
                // that does not exist in the header). Fall back to the bare
                // file name in that case rather than panicking.
                let mut path = match entry.directory(header) {
                    Some(dir_attr) => {
                        let dir = dwarf.attr_string(&unit, dir_attr)?.to_string_lossy();
                        PathBuf::from(&*dir)
                    }
                    None => PathBuf::new(),
                };
                path.push(&*filename);

                let mut info = FileInfo {
                    path,
                    size: None,
                    timestamp: None,
                    checksum: None,
                };

                if header.file_has_timestamp() {
                    info.timestamp = match entry.timestamp() {
                        0 => None,
                        x => Some(x),
                    };
                }

                if header.file_has_size() {
                    info.size = match entry.size() {
                        0 => None,
                        x => Some(x),
                    };
                }

                if header.file_has_md5() {
                    info.checksum = Some(FileCheckSum::Md5(*entry.md5()));
                }

                files.push(info);
            }
        }
    }

    files.sort();
    files.dedup();
    Ok(files)
}

#[cfg(test)]
mod tests {
    // Placeholder sanity check only: it does not call any of this module's
    // parsing functions.
    #[test]
    fn it_works() {
        assert_eq!(2 + 2, 4);
    }
}