//! Implements a simple parser and extractor for a Unix Archive.
//!
//! There are two "common" formats: BSD and SysV
//!
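//! The SysV version essentially postfixes all names in the archive with a `/` as a sigil for
//! the end of the name, and uses a special symbol index for looking up symbols faster.
//! This module primarily targets the SysV version, but it also understands BSD-style extended
//! member names (`#1/<len>`) and the BSD `__.SYMDEF` symbol table.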
use scroll::{self, Pread};
use strtab;
use error::{Result, Error};
use std::usize;
use std::collections::HashMap;
/// The size, in bytes, of the archive magic number
pub const SIZEOF_MAGIC: usize = 8;
/// The magic number of a Unix Archive
pub const MAGIC: &'static [u8; SIZEOF_MAGIC] = b"!<arch>\x0A";
/// The size, in bytes, of the file identifier (name) field of a member header
const SIZEOF_FILE_IDENTIFER: usize = 16;
/// The size, in bytes, of the file size field of a member header
const SIZEOF_FILE_SIZE: usize = 10;
#[repr(C)]
#[derive(Debug, Clone, PartialEq, Pread, Pwrite, SizeWith)]
/// A Unix Archive Header - meta data for the file/byte blob/whatever that follows exactly after.
/// All data is right-padded with spaces ASCII `0x20`. The Binary layout is as follows:
///
/// |Offset|Length|Name                       |Format     |
/// |:-----|:-----|:--------------------------|:----------|
/// |0     |16    |File identifier            |ASCII      |
/// |16    |12    |File modification timestamp|Decimal    |
/// |28    |6     |Owner ID                   |Decimal    |
/// |34    |6     |Group ID                   |Decimal    |
/// |40    |8     |File mode                  |Octal      |
/// |48    |10    |Filesize in bytes          |Decimal    |
/// |58    |2     |Ending characters          |`0x60 0x0A`|
///
/// Byte alignment is according to the following:
/// > Each archive file member begins on an even byte boundary; a newline is inserted between files
/// > if necessary. Nevertheless, the size given reflects the actual size of the file exclusive
/// > of padding.
pub struct MemberHeader {
/// The identifier, or name for this file/whatever.
pub identifier: [u8; 16],
/// The timestamp for when this file was last modified. Base 10 string number
pub timestamp: [u8; 12],
/// The file's owner's id. Base 10 string number
pub owner_id: [u8; 6],
/// The file's group id. Base 10 string number
pub group_id: [u8; 6],
/// The file's permissions mode. Base 8 string number
pub mode: [u8; 8],
/// The size of this file. Base 10 string number
pub file_size: [u8; 10],
/// The file header's terminator, always `0x60 0x0A`
pub terminator: [u8; 2],
}
#[derive(Debug, Clone, Copy, PartialEq)]
/// The parsed, borrowed subset of a `MemberHeader` that the rest of this module works with.
pub struct Header<'a> {
/// The member's name as read from the 16-byte identifier field, i.e., still including any
/// padding and end-of-name sigil.
pub name: &'a str,
/// The member's size in bytes, parsed from the decimal size field of the header.
pub size: usize,
}
/// The size, in bytes, of a member header: the sum of the lengths of all of its fields
/// (identifier, timestamp, owner id, group id, mode, file size and terminator).
pub const SIZEOF_HEADER: usize = SIZEOF_FILE_IDENTIFER + 12 + 6 + 6 + 8 + SIZEOF_FILE_SIZE + 2;
impl MemberHeader {
pub fn name(&self) -> Result<&str> {
Ok(self.identifier.pread_with::<&str>(0, ::scroll::ctx::StrCtx::Length(SIZEOF_FILE_IDENTIFER))?)
}
pub fn size(&self) -> Result<usize> {
match usize::from_str_radix(self.file_size.pread_with::<&str>(0, ::scroll::ctx::StrCtx::Length(self.file_size.len()))?.trim_right(), 10) {
Ok(file_size) => Ok(file_size),
Err(err) => Err(Error::Malformed(format!("{:?} Bad file_size in header: {:?}", err, self)))
}
}
}
#[derive(Debug, Clone, PartialEq)]
/// Represents a single entry in the archive
pub struct Member<'a> {
/// The entry header
pub header: Header<'a>,
/// File offset from the start of the archive to where the header begins
pub header_offset: u64,
/// File offset from the start of the archive to where the file begins
pub offset: u64,
/// BSD `ar` members store the filename separately, right after the header; when present it is
/// kept here with trailing NULs removed
bsd_name: Option<&'a str>,
/// SysV `ar` members store long filenames in a string table; when this member's name was found
/// there, a reference to it is held here
sysv_name: Option<&'a str>,
}
impl<'a> Member<'a> {
/// Tries to parse the header in `R`, as well as the offset in `R.
/// **NOTE** the Seek will be pointing at the first byte of whatever the file is, skipping padding.
/// This is because just like members in the archive, the data section is 2-byte aligned.
pub fn parse(buffer: &'a [u8], offset: &mut usize) -> Result<Member<'a>> {
let header_offset = *offset;
let name = buffer.pread_with::<&str>(*offset, ::scroll::ctx::StrCtx::Length(SIZEOF_FILE_IDENTIFER))?;
let archive_header = buffer.gread::<MemberHeader>(offset)?;
let mut header = Header { name: name, size: archive_header.size()? };
// skip newline padding if we're on an uneven byte boundary
if *offset & 1 == 1 {
*offset += 1;
}
let bsd_name = if let Some(len) = Self::bsd_filename_length(name) {
// there's a filename of length `len` right after the header
let name = buffer.pread_with::<&str>(header_offset + SIZEOF_HEADER, ::scroll::ctx::StrCtx::Length(len))?;
// adjust the offset and size accordingly
*offset = header_offset + SIZEOF_HEADER + len;
header.size -= len;
// the name may have trailing NULs which we don't really want to keep
Some(name.trim_right_matches('\0'))
} else {
None
};
Ok(Member {
header: header,
header_offset: header_offset as u64,
offset: *offset as u64,
bsd_name: bsd_name,
sysv_name: None,
})
}
/// The size of the Member's content, in bytes. Does **not** include newline padding,
/// nor the size of the file header.
pub fn size(&self) -> usize {
self.header.size
}
/// Parse `#1/123` as `Some(123)`
fn bsd_filename_length(name: &str) -> Option<usize> {
use core::str::FromStr;
if name.len() > 3 && &name[0..3] == "#1/" {
let trimmed_name = &name[3..].trim_right_matches(' ');
if let Ok(len) = usize::from_str(trimmed_name) {
Some(len)
} else {
None
}
} else {
None
}
}
/// The member name, accounting for SysV and BSD `ar` filename extensions
pub fn extended_name(&self) -> &'a str {
if let Some(bsd_name) = self.bsd_name {
bsd_name
} else if let Some(ref sysv_name) = self.sysv_name {
sysv_name
} else {
self.header.name.trim_right_matches(' ').trim_right_matches('/')
}
}
/// The untrimmed raw member name, i.e., includes right-aligned space padding and `'/'` end-of-string
/// identifier
pub fn raw_name(&self) -> &'a str {
self.header.name
}
}
#[derive(Debug, Default)]
/// The special index member signified by the name `'/'`.
/// The data element contains a list of symbol indexes and symbol names, giving their offsets
/// into the archive for a given name.
pub struct Index<'a> {
/// Number of symbol_indexes and strings
pub size: usize,
/// u32 offset into the archive for this symbol's member (the position in this array is the
/// position of the symbol's name in `strtab`)
pub symbol_indexes: Vec<u32>,
/// Symbol names, in the same order as `symbol_indexes`
pub strtab: Vec<&'a str>,
}
/// SysV Archive Variant Symbol Lookup Table "Magic" Name.
///
/// This is compared against the raw 16-byte identifier field, so it must be exactly 16 bytes
/// long: `/` followed by 15 spaces of padding.
const INDEX_NAME: &'static str = "/               ";
/// SysV Archive Variant Extended Filename String Table Name.
///
/// This is compared against the raw 16-byte identifier field, so it must be exactly 16 bytes
/// long: `//` followed by 14 spaces of padding.
const NAME_INDEX_NAME: &'static str = "//              ";
/// BSD symbol definitions
const BSD_SYMDEF_NAME: &'static str = "__.SYMDEF";
/// BSD symbol definitions, sorted variant
const BSD_SYMDEF_SORTED_NAME: &'static str = "__.SYMDEF SORTED";
impl<'a> Index<'a> {
/// Parses the given byte buffer into an Index. NB: the buffer must be the start of the index
pub fn parse_sysv_index(buffer: &'a [u8]) -> Result<Self> {
let offset = &mut 0;
let sizeof_table = buffer.gread_with::<u32>(offset, scroll::BE)? as usize;
let mut indexes = Vec::with_capacity(sizeof_table);
for _ in 0..sizeof_table {
indexes.push(buffer.gread_with::<u32>(offset, scroll::BE)?);
}
let sizeof_strtab = buffer.len() - ((sizeof_table * 4) + 4);
let strtab = strtab::Strtab::parse(buffer, *offset, sizeof_strtab, 0x0)?;
Ok (Index {
size: sizeof_table,
symbol_indexes: indexes,
strtab: strtab.to_vec()?, // because i'm lazy
})
}
/// Parses the given byte buffer into an Index, in BSD style archives
pub fn parse_bsd_symdef(buffer: &'a [u8]) -> Result<Self> {
// `llvm-ar` is a suitable reference:
// https://github.com/llvm-mirror/llvm/blob/6ea9891f9310510c621be562d1c5cdfcf5575678/lib/Object/Archive.cpp#L842-L870
// BSD __.SYMDEF files look like:
//
// ┌─────────────┐
// entries: │ # bytes │
// ├─────────────┼─────────────┐
// │ name offset │ .o offset │
// ├─────────────┼─────────────┤
// │ name offset │ .o offset │
// ├─────────────┼─────────────┤
// │ name offset │ .o offset │
// ├─────────────┼─────────────┤
// │ name offset │ .o offset │
// ├─────────────┼─────────────┘
// strings: │ # bytes │
// ├─────────────┴───────────────────┐
// │ _symbol\0 │
// ├─────────────────────────────────┴─────────────────────┐
// │ _longer_symbol\0 │
// ├────────────────┬──────────────────────────────────────┘
// │ _baz\0 │
// ├────────────────┴───┐
// │ _quxx\0 │
// └────────────────────┘
//
// All numeric values are u32s. Name offsets are relative to the start of the string table,
// and .o offsets are relative to the the start of the archive.
// Read the number of entries, which is at the start of the symdef (offset 0)
let entries_bytes = buffer.pread_with::<u32>(0, scroll::LE)? as usize;
let entries = entries_bytes / 8;
// Set up the string table, the length of which is recorded after the entire entries table,
// (`entries_bytes + 4`), and which starts immediately after that (`entries_bytes + 8`).
let strtab_bytes = buffer.pread_with::<u32>(entries_bytes + 4, scroll::LE)? as usize;
let strtab = strtab::Strtab::parse(buffer, entries_bytes + 8, strtab_bytes, 0x0)?;
// build the index
let mut indexes = Vec::with_capacity(entries);
let mut strings = Vec::with_capacity(entries);
for i in 0..entries {
// The entries table starts after the original length value (offset 4), and each entry
// has two u32 values, making them 8 bytes long.
//
// Therefore, the `i`th entry starts at offset `(i*8)+4`. The first u32 is at that
// address, and the second u32 follows 4 bytes later.
let string_offset: u32 = buffer.pread_with(i * 8 + 4, scroll::LE)?;
let archive_member: u32 = buffer.pread_with(i * 8 + 8, scroll::LE)?;
let string = match strtab.get(string_offset as usize) {
Some(result) => result,
None => Err(Error::Malformed(format!("{} entry {} has string offset {}, which is out of bounds", BSD_SYMDEF_NAME, i, string_offset)))
}?;
indexes.push(archive_member);
strings.push(string);
}
Ok (Index {
size: entries,
symbol_indexes: indexes,
strtab: strings,
})
}
}
/// Member names greater than 16 bytes are indirectly referenced using a `/<idx>` schema,
/// where `idx` is an offset into a newline delimited string table directly following the `//` member
/// of the archive.
#[derive(Debug, Default)]
struct NameIndex<'a> {
/// The newline delimited table of extended member names
strtab: strtab::Strtab<'a>
}
impl<'a> NameIndex<'a> {
pub fn parse(buffer: &'a [u8], offset: &mut usize, size: usize) -> Result<NameIndex<'a>> {
// This is a total hack, because strtab returns "" if idx == 0, need to change
// but previous behavior might rely on this, as ELF strtab's have "" at 0th index...
let hacked_size = size + 1;
let strtab = strtab::Strtab::parse(buffer, *offset-1, hacked_size, '\n' as u8)?;
// precious time was lost when refactoring because strtab::parse doesn't update the mutable seek...
*offset += hacked_size - 2;
Ok (NameIndex {
strtab: strtab
})
}
pub fn get(&self, name: &str) -> Result<&'a str> {
let idx = name.trim_left_matches('/').trim_right();
match usize::from_str_radix(idx, 10) {
Ok(idx) => {
let name = match self.strtab.get(idx+1) {
Some(result) => result,
None => Err(Error::Malformed(format!("Name {} is out of range in archive NameIndex", name)))
}?;
if name != "" {
Ok(name.trim_right_matches('/'))
} else {
return Err(Error::Malformed(format!("Could not find {:?} in index", name).into()));
}
},
Err (_) => {
return Err(Error::Malformed(format!("Bad name index {:?} in index", name).into()));
}
}
}
}
// TODO: add pretty printer fmt::Display with number of members, and names of members, along with
// the values of the index symbols once implemented
#[derive(Debug)]
/// An in-memory representation of a parsed Unix Archive
pub struct Archive<'a> {
// we can chuck this because the symbol index is a better representation, but we keep for
// debugging
/// The symbol index as it was read from the archive
index: Index<'a>,
/// The SysV extended filename table, used to resolve `/<idx>` member names
sysv_name_index: NameIndex<'a>,
// the array of members, which are indexed by the members hash and symbol index
/// Every regular member of the archive, in the order they were found
member_array: Vec<Member<'a>>,
/// Extended member name -> position in `member_array`
members: HashMap<&'a str, usize>,
// symbol -> member
/// Symbol name -> position in `member_array` of the member the symbol index maps it to
symbol_index: HashMap<&'a str, usize>
}
impl<'a> Archive<'a> {
pub fn parse(buffer: &'a [u8]) -> Result<Archive<'a>> {
let mut magic = [0u8; SIZEOF_MAGIC];
let offset = &mut 0usize;
buffer.gread_inout(offset, &mut magic)?;
if &magic != MAGIC {
use scroll::Pread;
return Err(Error::BadMagic(magic.pread(0)?).into());
}
let mut member_array = Vec::new();
let mut index = Index::default();
let mut sysv_name_index = NameIndex::default();
while *offset < buffer.len() {
// realign the cursor to a word boundary, if it's not on one already
if *offset & 1 == 1 {
*offset += 1;
}
let member = Member::parse(buffer, offset)?;
// advance to the next record
*offset = member.offset as usize + member.size() as usize;
let name = member.raw_name();
if name == INDEX_NAME {
let data: &[u8] = buffer.pread_with(member.offset as usize, member.size())?;
index = Index::parse_sysv_index(data)?;
} else if member.bsd_name == Some(BSD_SYMDEF_NAME) || member.bsd_name == Some(BSD_SYMDEF_SORTED_NAME) {
let data: &[u8] = buffer.pread_with(member.offset as usize, member.size())?;
index = Index::parse_bsd_symdef(data)?;
} else if name == NAME_INDEX_NAME {
let mut name_index_offset: usize = member.offset as usize;
sysv_name_index = NameIndex::parse(buffer, &mut name_index_offset, member.size())?;
} else {
// record this as an archive member
member_array.push(member);
}
}
// preprocess member names
let mut members = HashMap::new();
let mut member_index_by_offset: HashMap<u32, usize> = HashMap::with_capacity(member_array.len());
for (i, member) in member_array.iter_mut().enumerate() {
// copy in any SysV extended names
if let Ok(sysv_name) = sysv_name_index.get(member.raw_name()) {
member.sysv_name = Some(sysv_name);
}
// build a hashmap by extended name
let key = member.extended_name();
members.insert(key, i);
// build a hashmap translating archive offset into member index
member_index_by_offset.insert(member.header_offset as u32, i);
}
// build the symbol index, translating symbol names into member indexes
let mut symbol_index: HashMap<&str, usize> = HashMap::new();
for (member_offset, name) in index.symbol_indexes.iter().zip(index.strtab.iter()) {
let name = name.clone();
let member_index = member_index_by_offset[member_offset];
symbol_index.insert(name, member_index);
}
let archive = Archive {
index: index,
member_array: member_array,
sysv_name_index: sysv_name_index,
members: members,
symbol_index: symbol_index,
};
Ok(archive)
}
/// Get the member named `member` in this archive, if any
pub fn get (&self, member: &str) -> Option<&Member> {
if let Some(idx) = self.members.get(member) {
Some(&self.member_array[*idx])
} else {
None
}
}
/// Returns a slice of the raw bytes for the given `member` in the scrollable `buffer`
pub fn extract<'b>(&self, member: &str, buffer: &'b [u8]) -> Result<&'b [u8]> {
if let Some(member) = self.get(member) {
let bytes = buffer.pread_with(member.offset as usize, member.size())?;
Ok(bytes)
} else {
Err(Error::Malformed(format!("Cannot extract member {:?}", member).into()))
}
}
/// Gets a summary of this archive, returning a list of membername, the member, and the list of symbols the member contains
pub fn summarize(&self) -> Vec<(&str, &Member, Vec<&'a str>)> {
// build a result array, with indexes matching the member indexes
let mut result = self.member_array.iter()
.map(|ref member| {
(member.extended_name(), *member, Vec::new())
})
.collect::<Vec<_>>();
// walk the symbol index once, adding each symbol to the appropriate result Vec
for (symbol_name, member_index) in self.symbol_index.iter() {
result[*member_index].2.push(*symbol_name);
}
result
}
/// Get the list of member names in this archive
pub fn members(&self) -> Vec<&'a str> {
self.members.keys().map(|s| *s).collect()
}
/// Returns the member's name which contains the given `symbol`, if it is in the archive
pub fn member_of_symbol (&self, symbol: &str) -> Option<&'a str> {
if let Some(idx) = self.symbol_index.get(symbol) {
Some(self.member_array[*idx].extended_name())
} else {
None
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `Member::bsd_filename_length` against names it must reject and names it must
    /// parse.
    #[test]
    fn test_member_bsd_filename_length() {
        // non-BSD names should fall through
        let not_bsd = ["", "123", "#1", "#1/", "#2/1", INDEX_NAME, NAME_INDEX_NAME];
        for name in not_bsd.iter() {
            assert_eq!(Member::bsd_filename_length(name), None);
        }

        // #1/<len> should be parsed as Some(len), with or without whitespace
        let bsd = [
            ("#1/1", 1),
            ("#1/22", 22),
            ("#1/333", 333),
            ("#1/1 ", 1),
            ("#1/22 ", 22),
            ("#1/333 ", 333),
        ];
        for &(name, len) in bsd.iter() {
            assert_eq!(Member::bsd_filename_length(name), Some(len));
        }

        // #1/<len><trailing garbage> should be None
        let garbage = ["#1/1A", "#1/1 A"];
        for name in garbage.iter() {
            assert_eq!(Member::bsd_filename_length(name), None);
        }
    }
}