1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
//! Abbreviation definition and abbreviated record parsing and handling for `llvm-bitstream`.

use std::convert::{From, TryFrom, TryInto};

use llvm_bitcursor::BitCursor;
use llvm_support::bitcodes::{AbbrevOpEnc, ReservedAbbrevId};
use llvm_support::CHAR6_ALPHABET;

use crate::error::Error;
use crate::record::Fields;

/// An abbreviation ID, whether reserved or defined by the stream itself.
#[derive(Clone, Copy, Debug)]
pub enum AbbrevId {
    /// A reserved abbreviation ID.
    Reserved(ReservedAbbrevId),
    /// An abbreviation ID that's been defined within the stream.
    Defined(u64),
}

impl From<u64> for AbbrevId {
    fn from(value: u64) -> Self {
        ReservedAbbrevId::try_from(value)
            .map_or_else(|_| AbbrevId::Defined(value), AbbrevId::Reserved)
    }
}

/// The valid abbreviation operand forms.
#[derive(Clone, Debug, PartialEq)]
pub enum AbbrevOp {
    /// A literal, constant operand.
    Literal(u64),
    /// A VBR whose width is is associated as extra data.
    Vbr(u64),
    /// A fixed-width field whose width is associated as extra data.
    Fixed(u64),
    /// A fixed-length array whose member elements are specified.
    Array(Box<AbbrevOp>),
    /// A single Char6.
    Char6,
    /// A fixed-length blob of bytes.
    Blob,
}

impl AbbrevOp {
    /// Given a Char6 value, map it back to its ASCII printable equivalent.
    ///
    /// This function is private because it requires caller-upheld invariants
    /// for panic safety.
    fn decode_char6(char6: u8) -> u8 {
        // Panic safety: the caller is expected to constrain char6 to a valid
        // index within CHAR6_ALPHABET.
        CHAR6_ALPHABET[char6 as usize]
    }

    /// Parse a single abbreviation operand from the stream, returning a
    /// vector of one or more fields for that operand.
    pub(self) fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
        // A sad thing happens in this function: we parse by iterating over
        // each operand, collecting the field(s) in the bitstream that correspond to it.
        // Operands are typed and carry detailed information about their semantics:
        // for example, an `AbbrevOp::Char6` is exactly 6 bits and maps directly
        // to a printable character. It would be really nice if we could expose this structure
        // at a higher level, i.e. by returning a `Value` enum with different variants
        // for each operand, and higher levels could take advantage of it.
        // Unfortunately, LLVM does not let us do this: bitstream consumers **must**
        // be agnostic to how the bitstream is emitted, which means that an emitter's
        // decision to use a Char6 vs. a VBR6 cannot affect later, higher-level interpretation.
        // As a result, we have to discard all of our nice structure here in favor of
        // sequences of "fields," which are really just individual `u64`s.
        Ok(match self {
            AbbrevOp::Literal(val) => vec![*val],
            AbbrevOp::Vbr(width) => vec![cur.read_vbr(*width as usize)?],
            AbbrevOp::Fixed(width) => vec![cur.read_as::<u64>(*width as usize)?],
            AbbrevOp::Array(elem) => {
                // An array operand is encoded as a length (VBR6), followed by
                // each encoded element of the array.
                // TODO(ww): Sanity check array_len here.
                let array_len = cur.read_vbr(6)? as usize;

                let mut fields: Fields = Vec::with_capacity(array_len);
                for _ in 0..array_len {
                    fields.extend(elem.parse(cur)?);
                }

                fields
            }
            AbbrevOp::Char6 => vec![Self::decode_char6(cur.read_as::<u8>(6)?).into()],
            AbbrevOp::Blob => {
                // A blob operand is encoded as a length (VBR6), followed by a 32-bit aligned
                // sequence of bytes, followed by another alignment back to 32 bits.

                // TODO(ww): Sanity check blob_len here: it probably shouldn't be 0,
                // and it definitely can't be longer than the stream.
                let blob_len = cur.read_vbr(6)? as usize;
                cur.align32();

                // TODO(ww): This read loop is probably slower than it needs to be;
                // `BitCursor` could probably learn a `read_bytes` API that's
                // only allowed when the stream is byte-aligned.
                let mut fields: Fields = Vec::with_capacity(blob_len);
                for _ in 0..blob_len {
                    fields.push(cur.read_exact::<u8>()?.into());
                }
                cur.align32();

                fields
            }
        })
    }
}

/// Represents a defined abbreviation, as specified by a `DEFINE_ABBREV` record.
#[derive(Clone, Debug)]
pub struct Abbrev {
    /// The abstract operands for this abbreviation definition.
    pub operands: Vec<AbbrevOp>,
}

impl Abbrev {
    /// Parse a new `Abbrev` from the stream.
    ///
    /// Assumes that the `DEFINE_ABBREV` ID has already been consumed.
    pub fn new<T: AsRef<[u8]>>(cur: &mut BitCursor<T>) -> Result<Self, Error> {
        // TODO(ww): This and other structures should probably implement a `FromStream`
        // trait instead, for construction.

        // Per the LLVM docs: abbreviation records look like this:
        // [DEFINE_ABBREV, VBR5:numabbrevops, abbrevop0, abbrevop1, ...]
        // Our surrounding parse context should have consumed the DEFINE_ABBREV
        // already, so we start with numabbrevops.
        let num_abbrev_opnds = cur.read_vbr(5)?;
        if num_abbrev_opnds < 1 {
            return Err(Error::AbbrevParse(
                "expected at least one abbrev operand".into(),
            ));
        }

        log::debug!("expecting {} operands", num_abbrev_opnds);

        // Abbreviated records must have at least one operand.
        if num_abbrev_opnds < 1 {
            return Err(Error::AbbrevParse(
                "expected abbrev operand count to be nonzero".into(),
            ));
        }

        // Decode each abbreviation operand.
        let mut operands = vec![];
        let mut done_early = false;
        for idx in 0..num_abbrev_opnds {
            // Each operand starts with a single bit that indicates whether
            // the operand is "literal" (i.e., a VBR8) or an "encoded" operand.
            let operand_kind = cur.read(1)?;

            // If this operand is a literal, then we read it as a VBR8.
            if operand_kind == 1 {
                let val = cur.read_vbr(8)?;

                // NOTE(ww): This error is exceedingly unlikely (usize would have to be larger
                // than u64). But you never know.
                operands.push(AbbrevOp::Literal(val));

                continue;
            }

            // Otherwise, we need to suss the encoding representation out of it.
            // This is always a 3-bit field (**not** a VBR3), which in turn tells us whether the
            // operand encoding includes extra data.
            let enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
            let opnd = match enc {
                AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
                AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
                AbbrevOpEnc::Array => {
                    // There is only ever one array operand in an abbreviation definition,
                    // and it is always the second-to-last operand. Anything else is an error.
                    if idx != num_abbrev_opnds - 2 {
                        return Err(Error::AbbrevParse("array operand at invalid index".into()));
                    }

                    // NOTE(ww): We get a little clever here: instead of parsing
                    // the inner array operand on its own, we steal it here and set
                    // `done_early` to indicate that we're done with operand parsing.
                    // This works since array operands are guaranteed to be second-to-last,
                    // followed only by their element operand encoding.
                    cur.read(1)?;
                    let elem_enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
                    done_early = true;

                    let elem = match elem_enc {
                        AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
                        AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
                        AbbrevOpEnc::Char6 => AbbrevOp::Char6,
                        _ => {
                            // Blobs and arrays cannot themselves be member types.
                            return Err(Error::AbbrevParse(format!(
                                "invalid element type for an array: {:?}",
                                elem_enc
                            )));
                        }
                    };

                    AbbrevOp::Array(Box::new(elem))
                }
                AbbrevOpEnc::Char6 => AbbrevOp::Char6,
                AbbrevOpEnc::Blob => {
                    // Similarly to arrays: there is only ever one blob operand.
                    // Blobs don't have an element type, so they're always the last operand.
                    if idx != num_abbrev_opnds - 1 {
                        return Err(Error::AbbrevParse("blob operand at invalid index".into()));
                    }

                    AbbrevOp::Blob
                }
            };

            operands.push(opnd);

            // See above: don't complete the entire operand parsing loop if we've successfully
            // stolen the last operand as part of an array.
            if done_early {
                break;
            }
        }

        Ok(Self { operands: operands })
    }

    /// Parse an abbreviated record from this stream, returning its fields.
    pub fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
        Ok(self
            .operands
            .iter()
            .map(|opnd| opnd.parse(cur))
            .collect::<Result<Vec<_>, _>>()?
            .into_iter()
            .flatten()
            .collect())
    }
}