llvm-bitstream 0.0.3

A content-agnostic parser for LLVM's bitstream container format
Documentation
//! Abbreviation definition and abbreviated record parsing and handling for `llvm-bitstream`.

use std::convert::{From, TryFrom, TryInto};

use llvm_bitcursor::BitCursor;
use llvm_support::bitcodes::{AbbrevOpEnc, ReservedAbbrevId};
use llvm_support::CHAR6_ALPHABET;

use crate::error::Error;
use crate::record::Fields;

/// An abbreviation ID, whether reserved or defined by the stream itself.
#[derive(Clone, Copy, Debug)]
pub enum AbbrevId {
    /// A reserved abbreviation ID.
    Reserved(ReservedAbbrevId),
    /// An abbreviation ID that's been defined within the stream.
    Defined(u64),
}

impl From<u64> for AbbrevId {
    fn from(value: u64) -> Self {
        ReservedAbbrevId::try_from(value)
            .map_or_else(|_| AbbrevId::Defined(value), AbbrevId::Reserved)
    }
}

/// The valid abbreviation operand forms.
#[derive(Clone, Debug, PartialEq)]
pub enum AbbrevOp {
    /// A literal, constant operand.
    Literal(u64),
    /// A VBR whose width is is associated as extra data.
    Vbr(u64),
    /// A fixed-width field whose width is associated as extra data.
    Fixed(u64),
    /// A fixed-length array whose member elements are specified.
    Array(Box<AbbrevOp>),
    /// A single Char6.
    Char6,
    /// A fixed-length blob of bytes.
    Blob,
}

impl AbbrevOp {
    /// Given a Char6 value, map it back to its ASCII printable equivalent.
    ///
    /// This function is private because it requires caller-upheld invariants
    /// for panic safety.
    fn decode_char6(char6: u8) -> u8 {
        // Panic safety: the caller is expected to constrain char6 to a valid
        // index within CHAR6_ALPHABET.
        CHAR6_ALPHABET[char6 as usize]
    }

    /// Parse a single abbreviation operand from the stream, returning a
    /// vector of one or more fields for that operand.
    pub(self) fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
        // A sad thing happens in this function: we parse by iterating over
        // each operand, collecting the field(s) in the bitstream that correspond to it.
        // Operands are typed and carry detailed information about their semantics:
        // for example, an `AbbrevOp::Char6` is exactly 6 bits and maps directly
        // to a printable character. It would be really nice if we could expose this structure
        // at a higher level, i.e. by returning a `Value` enum with different variants
        // for each operand, and higher levels could take advantage of it.
        // Unfortunately, LLVM does not let us do this: bitstream consumers **must**
        // be agnostic to how the bitstream is emitted, which means that an emitter's
        // decision to use a Char6 vs. a VBR6 cannot affect later, higher-level interpretation.
        // As a result, we have to discard all of our nice structure here in favor of
        // sequences of "fields," which are really just individual `u64`s.
        Ok(match self {
            AbbrevOp::Literal(val) => vec![*val],
            AbbrevOp::Vbr(width) => vec![cur.read_vbr(*width as usize)?],
            AbbrevOp::Fixed(width) => vec![cur.read_as::<u64>(*width as usize)?],
            AbbrevOp::Array(elem) => {
                // An array operand is encoded as a length (VBR6), followed by
                // each encoded element of the array.
                // TODO(ww): Sanity check array_len here.
                let array_len = cur.read_vbr(6)? as usize;

                let mut fields: Fields = Vec::with_capacity(array_len);
                for _ in 0..array_len {
                    fields.extend(elem.parse(cur)?);
                }

                fields
            }
            AbbrevOp::Char6 => vec![Self::decode_char6(cur.read_as::<u8>(6)?).into()],
            AbbrevOp::Blob => {
                // A blob operand is encoded as a length (VBR6), followed by a 32-bit aligned
                // sequence of bytes, followed by another alignment back to 32 bits.

                // TODO(ww): Sanity check blob_len here: it probably shouldn't be 0,
                // and it definitely can't be longer than the stream.
                let blob_len = cur.read_vbr(6)? as usize;
                cur.align32();

                // TODO(ww): This read loop is probably slower than it needs to be;
                // `BitCursor` could probably learn a `read_bytes` API that's
                // only allowed when the stream is byte-aligned.
                let mut fields: Fields = Vec::with_capacity(blob_len);
                for _ in 0..blob_len {
                    fields.push(cur.read_exact::<u8>()?.into());
                }
                cur.align32();

                fields
            }
        })
    }
}

/// Represents a defined abbreviation, as specified by a `DEFINE_ABBREV` record.
#[derive(Clone, Debug)]
pub struct Abbrev {
    /// The abstract operands for this abbreviation definition.
    pub operands: Vec<AbbrevOp>,
}

impl Abbrev {
    /// Parse a new `Abbrev` from the stream.
    ///
    /// Assumes that the `DEFINE_ABBREV` ID has already been consumed.
    pub fn new<T: AsRef<[u8]>>(cur: &mut BitCursor<T>) -> Result<Self, Error> {
        // TODO(ww): This and other structures should probably implement a `FromStream`
        // trait instead, for construction.

        // Per the LLVM docs: abbreviation records look like this:
        // [DEFINE_ABBREV, VBR5:numabbrevops, abbrevop0, abbrevop1, ...]
        // Our surrounding parse context should have consumed the DEFINE_ABBREV
        // already, so we start with numabbrevops.
        let num_abbrev_opnds = cur.read_vbr(5)?;
        if num_abbrev_opnds < 1 {
            return Err(Error::AbbrevParse(
                "expected at least one abbrev operand".into(),
            ));
        }

        log::debug!("expecting {} operands", num_abbrev_opnds);

        // Abbreviated records must have at least one operand.
        if num_abbrev_opnds < 1 {
            return Err(Error::AbbrevParse(
                "expected abbrev operand count to be nonzero".into(),
            ));
        }

        // Decode each abbreviation operand.
        let mut operands = vec![];
        let mut done_early = false;
        for idx in 0..num_abbrev_opnds {
            // Each operand starts with a single bit that indicates whether
            // the operand is "literal" (i.e., a VBR8) or an "encoded" operand.
            let operand_kind = cur.read(1)?;

            // If this operand is a literal, then we read it as a VBR8.
            if operand_kind == 1 {
                let val = cur.read_vbr(8)?;

                // NOTE(ww): This error is exceedingly unlikely (usize would have to be larger
                // than u64). But you never know.
                operands.push(AbbrevOp::Literal(val));

                continue;
            }

            // Otherwise, we need to suss the encoding representation out of it.
            // This is always a 3-bit field (**not** a VBR3), which in turn tells us whether the
            // operand encoding includes extra data.
            let enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
            let opnd = match enc {
                AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
                AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
                AbbrevOpEnc::Array => {
                    // There is only ever one array operand in an abbreviation definition,
                    // and it is always the second-to-last operand. Anything else is an error.
                    if idx != num_abbrev_opnds - 2 {
                        return Err(Error::AbbrevParse("array operand at invalid index".into()));
                    }

                    // NOTE(ww): We get a little clever here: instead of parsing
                    // the inner array operand on its own, we steal it here and set
                    // `done_early` to indicate that we're done with operand parsing.
                    // This works since array operands are guaranteed to be second-to-last,
                    // followed only by their element operand encoding.
                    cur.read(1)?;
                    let elem_enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
                    done_early = true;

                    let elem = match elem_enc {
                        AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
                        AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
                        AbbrevOpEnc::Char6 => AbbrevOp::Char6,
                        _ => {
                            // Blobs and arrays cannot themselves be member types.
                            return Err(Error::AbbrevParse(format!(
                                "invalid element type for an array: {:?}",
                                elem_enc
                            )));
                        }
                    };

                    AbbrevOp::Array(Box::new(elem))
                }
                AbbrevOpEnc::Char6 => AbbrevOp::Char6,
                AbbrevOpEnc::Blob => {
                    // Similarly to arrays: there is only ever one blob operand.
                    // Blobs don't have an element type, so they're always the last operand.
                    if idx != num_abbrev_opnds - 1 {
                        return Err(Error::AbbrevParse("blob operand at invalid index".into()));
                    }

                    AbbrevOp::Blob
                }
            };

            operands.push(opnd);

            // See above: don't complete the entire operand parsing loop if we've successfully
            // stolen the last operand as part of an array.
            if done_early {
                break;
            }
        }

        Ok(Self { operands: operands })
    }

    /// Parse an abbreviated record from this stream, returning its fields.
    pub fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
        Ok(self
            .operands
            .iter()
            .map(|opnd| opnd.parse(cur))
            .collect::<Result<Vec<_>, _>>()?
            .into_iter()
            .flatten()
            .collect())
    }
}