llvm_bitstream/
abbrev.rs

1//! Abbreviation definition and abbreviated record parsing and handling for `llvm-bitstream`.
2
3use std::convert::{From, TryFrom, TryInto};
4
5use llvm_bitcursor::BitCursor;
6use llvm_support::bitcodes::{AbbrevOpEnc, ReservedAbbrevId};
7use llvm_support::CHAR6_ALPHABET;
8
9use crate::error::Error;
10use crate::record::Fields;
11
12/// An abbreviation ID, whether reserved or defined by the stream itself.
13#[derive(Clone, Copy, Debug)]
14pub enum AbbrevId {
15    /// A reserved abbreviation ID.
16    Reserved(ReservedAbbrevId),
17    /// An abbreviation ID that's been defined within the stream.
18    Defined(u64),
19}
20
21impl From<u64> for AbbrevId {
22    fn from(value: u64) -> Self {
23        ReservedAbbrevId::try_from(value)
24            .map_or_else(|_| AbbrevId::Defined(value), AbbrevId::Reserved)
25    }
26}
27
/// The valid abbreviation operand forms.
///
/// Each variant describes how one operand of an abbreviated record is laid
/// out in the bitstream.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum AbbrevOp {
    /// A literal, constant operand; the value is stored in the abbreviation
    /// itself rather than in the record.
    Literal(u64),
    /// A VBR whose width is associated as extra data.
    Vbr(u64),
    /// A fixed-width field whose width is associated as extra data.
    Fixed(u64),
    /// A length-prefixed array; the boxed operand describes how each member
    /// element is encoded.
    Array(Box<AbbrevOp>),
    /// A single Char6.
    Char6,
    /// A length-prefixed blob of bytes.
    Blob,
}
44
45impl AbbrevOp {
46    /// Given a Char6 value, map it back to its ASCII printable equivalent.
47    ///
48    /// This function is private because it requires caller-upheld invariants
49    /// for panic safety.
50    fn decode_char6(char6: u8) -> u8 {
51        // Panic safety: the caller is expected to constrain char6 to a valid
52        // index within CHAR6_ALPHABET.
53        CHAR6_ALPHABET[char6 as usize]
54    }
55
56    /// Parse a single abbreviation operand from the stream, returning a
57    /// vector of one or more fields for that operand.
58    pub(self) fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
59        // A sad thing happens in this function: we parse by iterating over
60        // each operand, collecting the field(s) in the bitstream that correspond to it.
61        // Operands are typed and carry detailed information about their semantics:
62        // for example, an `AbbrevOp::Char6` is exactly 6 bits and maps directly
63        // to a printable character. It would be really nice if we could expose this structure
64        // at a higher level, i.e. by returning a `Value` enum with different variants
65        // for each operand, and higher levels could take advantage of it.
66        // Unfortunately, LLVM does not let us do this: bitstream consumers **must**
67        // be agnostic to how the bitstream is emitted, which means that an emitter's
68        // decision to use a Char6 vs. a VBR6 cannot affect later, higher-level interpretation.
69        // As a result, we have to discard all of our nice structure here in favor of
70        // sequences of "fields," which are really just individual `u64`s.
71        Ok(match self {
72            AbbrevOp::Literal(val) => vec![*val],
73            AbbrevOp::Vbr(width) => vec![cur.read_vbr(*width as usize)?],
74            AbbrevOp::Fixed(width) => vec![cur.read_as::<u64>(*width as usize)?],
75            AbbrevOp::Array(elem) => {
76                // An array operand is encoded as a length (VBR6), followed by
77                // each encoded element of the array.
78                // TODO(ww): Sanity check array_len here.
79                let array_len = cur.read_vbr(6)? as usize;
80
81                let mut fields: Fields = Vec::with_capacity(array_len);
82                for _ in 0..array_len {
83                    fields.extend(elem.parse(cur)?);
84                }
85
86                fields
87            }
88            AbbrevOp::Char6 => vec![Self::decode_char6(cur.read_as::<u8>(6)?).into()],
89            AbbrevOp::Blob => {
90                // A blob operand is encoded as a length (VBR6), followed by a 32-bit aligned
91                // sequence of bytes, followed by another alignment back to 32 bits.
92
93                // TODO(ww): Sanity check blob_len here: it probably shouldn't be 0,
94                // and it definitely can't be longer than the stream.
95                let blob_len = cur.read_vbr(6)? as usize;
96                cur.align32();
97
98                // TODO(ww): This read loop is probably slower than it needs to be;
99                // `BitCursor` could probably learn a `read_bytes` API that's
100                // only allowed when the stream is byte-aligned.
101                let mut fields: Fields = Vec::with_capacity(blob_len);
102                for _ in 0..blob_len {
103                    fields.push(cur.read_exact::<u8>()?.into());
104                }
105                cur.align32();
106
107                fields
108            }
109        })
110    }
111}
112
113/// Represents a defined abbreviation, as specified by a `DEFINE_ABBREV` record.
114#[derive(Clone, Debug)]
115pub struct Abbrev {
116    /// The abstract operands for this abbreviation definition.
117    pub operands: Vec<AbbrevOp>,
118}
119
120impl Abbrev {
121    /// Parse a new `Abbrev` from the stream.
122    ///
123    /// Assumes that the `DEFINE_ABBREV` ID has already been consumed.
124    pub fn new<T: AsRef<[u8]>>(cur: &mut BitCursor<T>) -> Result<Self, Error> {
125        // TODO(ww): This and other structures should probably implement a `FromStream`
126        // trait instead, for construction.
127
128        // Per the LLVM docs: abbreviation records look like this:
129        // [DEFINE_ABBREV, VBR5:numabbrevops, abbrevop0, abbrevop1, ...]
130        // Our surrounding parse context should have consumed the DEFINE_ABBREV
131        // already, so we start with numabbrevops.
132        let num_abbrev_opnds = cur.read_vbr(5)?;
133        if num_abbrev_opnds < 1 {
134            return Err(Error::AbbrevParse(
135                "expected at least one abbrev operand".into(),
136            ));
137        }
138
139        log::debug!("expecting {} operands", num_abbrev_opnds);
140
141        // Abbreviated records must have at least one operand.
142        if num_abbrev_opnds < 1 {
143            return Err(Error::AbbrevParse(
144                "expected abbrev operand count to be nonzero".into(),
145            ));
146        }
147
148        // Decode each abbreviation operand.
149        let mut operands = vec![];
150        let mut done_early = false;
151        for idx in 0..num_abbrev_opnds {
152            // Each operand starts with a single bit that indicates whether
153            // the operand is "literal" (i.e., a VBR8) or an "encoded" operand.
154            let operand_kind = cur.read(1)?;
155
156            // If this operand is a literal, then we read it as a VBR8.
157            if operand_kind == 1 {
158                let val = cur.read_vbr(8)?;
159
160                // NOTE(ww): This error is exceedingly unlikely (usize would have to be larger
161                // than u64). But you never know.
162                operands.push(AbbrevOp::Literal(val));
163
164                continue;
165            }
166
167            // Otherwise, we need to suss the encoding representation out of it.
168            // This is always a 3-bit field (**not** a VBR3), which in turn tells us whether the
169            // operand encoding includes extra data.
170            let enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
171            let opnd = match enc {
172                AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
173                AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
174                AbbrevOpEnc::Array => {
175                    // There is only ever one array operand in an abbreviation definition,
176                    // and it is always the second-to-last operand. Anything else is an error.
177                    if idx != num_abbrev_opnds - 2 {
178                        return Err(Error::AbbrevParse("array operand at invalid index".into()));
179                    }
180
181                    // NOTE(ww): We get a little clever here: instead of parsing
182                    // the inner array operand on its own, we steal it here and set
183                    // `done_early` to indicate that we're done with operand parsing.
184                    // This works since array operands are guaranteed to be second-to-last,
185                    // followed only by their element operand encoding.
186                    cur.read(1)?;
187                    let elem_enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
188                    done_early = true;
189
190                    let elem = match elem_enc {
191                        AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
192                        AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
193                        AbbrevOpEnc::Char6 => AbbrevOp::Char6,
194                        _ => {
195                            // Blobs and arrays cannot themselves be member types.
196                            return Err(Error::AbbrevParse(format!(
197                                "invalid element type for an array: {:?}",
198                                elem_enc
199                            )));
200                        }
201                    };
202
203                    AbbrevOp::Array(Box::new(elem))
204                }
205                AbbrevOpEnc::Char6 => AbbrevOp::Char6,
206                AbbrevOpEnc::Blob => {
207                    // Similarly to arrays: there is only ever one blob operand.
208                    // Blobs don't have an element type, so they're always the last operand.
209                    if idx != num_abbrev_opnds - 1 {
210                        return Err(Error::AbbrevParse("blob operand at invalid index".into()));
211                    }
212
213                    AbbrevOp::Blob
214                }
215            };
216
217            operands.push(opnd);
218
219            // See above: don't complete the entire operand parsing loop if we've successfully
220            // stolen the last operand as part of an array.
221            if done_early {
222                break;
223            }
224        }
225
226        Ok(Self { operands: operands })
227    }
228
229    /// Parse an abbreviated record from this stream, returning its fields.
230    pub fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
231        Ok(self
232            .operands
233            .iter()
234            .map(|opnd| opnd.parse(cur))
235            .collect::<Result<Vec<_>, _>>()?
236            .into_iter()
237            .flatten()
238            .collect())
239    }
240}