llvm_bitstream/abbrev.rs
1//! Abbreviation definition and abbreviated record parsing and handling for `llvm-bitstream`.
2
3use std::convert::{From, TryFrom, TryInto};
4
5use llvm_bitcursor::BitCursor;
6use llvm_support::bitcodes::{AbbrevOpEnc, ReservedAbbrevId};
7use llvm_support::CHAR6_ALPHABET;
8
9use crate::error::Error;
10use crate::record::Fields;
11
12/// An abbreviation ID, whether reserved or defined by the stream itself.
13#[derive(Clone, Copy, Debug)]
14pub enum AbbrevId {
15 /// A reserved abbreviation ID.
16 Reserved(ReservedAbbrevId),
17 /// An abbreviation ID that's been defined within the stream.
18 Defined(u64),
19}
20
21impl From<u64> for AbbrevId {
22 fn from(value: u64) -> Self {
23 ReservedAbbrevId::try_from(value)
24 .map_or_else(|_| AbbrevId::Defined(value), AbbrevId::Reserved)
25 }
26}
27
/// The valid abbreviation operand forms.
///
/// Equality is total for every variant (all payloads are integers or nested
/// operands), so the type implements `Eq` in addition to `PartialEq`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum AbbrevOp {
    /// A literal, constant operand; the value is carried by the definition itself.
    Literal(u64),
    /// A VBR whose width is associated as extra data.
    Vbr(u64),
    /// A fixed-width field whose width is associated as extra data.
    Fixed(u64),
    /// An array whose element encoding is given by the boxed operand.
    ///
    /// The number of elements is not part of the definition: it is read from
    /// the stream (as a VBR6) each time a record is parsed.
    Array(Box<AbbrevOp>),
    /// A single Char6.
    Char6,
    /// A blob of bytes, whose length is read from the stream (as a VBR6) each
    /// time a record is parsed.
    Blob,
}
44
45impl AbbrevOp {
46 /// Given a Char6 value, map it back to its ASCII printable equivalent.
47 ///
48 /// This function is private because it requires caller-upheld invariants
49 /// for panic safety.
50 fn decode_char6(char6: u8) -> u8 {
51 // Panic safety: the caller is expected to constrain char6 to a valid
52 // index within CHAR6_ALPHABET.
53 CHAR6_ALPHABET[char6 as usize]
54 }
55
56 /// Parse a single abbreviation operand from the stream, returning a
57 /// vector of one or more fields for that operand.
58 pub(self) fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
59 // A sad thing happens in this function: we parse by iterating over
60 // each operand, collecting the field(s) in the bitstream that correspond to it.
61 // Operands are typed and carry detailed information about their semantics:
62 // for example, an `AbbrevOp::Char6` is exactly 6 bits and maps directly
63 // to a printable character. It would be really nice if we could expose this structure
64 // at a higher level, i.e. by returning a `Value` enum with different variants
65 // for each operand, and higher levels could take advantage of it.
66 // Unfortunately, LLVM does not let us do this: bitstream consumers **must**
67 // be agnostic to how the bitstream is emitted, which means that an emitter's
68 // decision to use a Char6 vs. a VBR6 cannot affect later, higher-level interpretation.
69 // As a result, we have to discard all of our nice structure here in favor of
70 // sequences of "fields," which are really just individual `u64`s.
71 Ok(match self {
72 AbbrevOp::Literal(val) => vec![*val],
73 AbbrevOp::Vbr(width) => vec![cur.read_vbr(*width as usize)?],
74 AbbrevOp::Fixed(width) => vec![cur.read_as::<u64>(*width as usize)?],
75 AbbrevOp::Array(elem) => {
76 // An array operand is encoded as a length (VBR6), followed by
77 // each encoded element of the array.
78 // TODO(ww): Sanity check array_len here.
79 let array_len = cur.read_vbr(6)? as usize;
80
81 let mut fields: Fields = Vec::with_capacity(array_len);
82 for _ in 0..array_len {
83 fields.extend(elem.parse(cur)?);
84 }
85
86 fields
87 }
88 AbbrevOp::Char6 => vec![Self::decode_char6(cur.read_as::<u8>(6)?).into()],
89 AbbrevOp::Blob => {
90 // A blob operand is encoded as a length (VBR6), followed by a 32-bit aligned
91 // sequence of bytes, followed by another alignment back to 32 bits.
92
93 // TODO(ww): Sanity check blob_len here: it probably shouldn't be 0,
94 // and it definitely can't be longer than the stream.
95 let blob_len = cur.read_vbr(6)? as usize;
96 cur.align32();
97
98 // TODO(ww): This read loop is probably slower than it needs to be;
99 // `BitCursor` could probably learn a `read_bytes` API that's
100 // only allowed when the stream is byte-aligned.
101 let mut fields: Fields = Vec::with_capacity(blob_len);
102 for _ in 0..blob_len {
103 fields.push(cur.read_exact::<u8>()?.into());
104 }
105 cur.align32();
106
107 fields
108 }
109 })
110 }
111}
112
113/// Represents a defined abbreviation, as specified by a `DEFINE_ABBREV` record.
114#[derive(Clone, Debug)]
115pub struct Abbrev {
116 /// The abstract operands for this abbreviation definition.
117 pub operands: Vec<AbbrevOp>,
118}
119
120impl Abbrev {
121 /// Parse a new `Abbrev` from the stream.
122 ///
123 /// Assumes that the `DEFINE_ABBREV` ID has already been consumed.
124 pub fn new<T: AsRef<[u8]>>(cur: &mut BitCursor<T>) -> Result<Self, Error> {
125 // TODO(ww): This and other structures should probably implement a `FromStream`
126 // trait instead, for construction.
127
128 // Per the LLVM docs: abbreviation records look like this:
129 // [DEFINE_ABBREV, VBR5:numabbrevops, abbrevop0, abbrevop1, ...]
130 // Our surrounding parse context should have consumed the DEFINE_ABBREV
131 // already, so we start with numabbrevops.
132 let num_abbrev_opnds = cur.read_vbr(5)?;
133 if num_abbrev_opnds < 1 {
134 return Err(Error::AbbrevParse(
135 "expected at least one abbrev operand".into(),
136 ));
137 }
138
139 log::debug!("expecting {} operands", num_abbrev_opnds);
140
141 // Abbreviated records must have at least one operand.
142 if num_abbrev_opnds < 1 {
143 return Err(Error::AbbrevParse(
144 "expected abbrev operand count to be nonzero".into(),
145 ));
146 }
147
148 // Decode each abbreviation operand.
149 let mut operands = vec![];
150 let mut done_early = false;
151 for idx in 0..num_abbrev_opnds {
152 // Each operand starts with a single bit that indicates whether
153 // the operand is "literal" (i.e., a VBR8) or an "encoded" operand.
154 let operand_kind = cur.read(1)?;
155
156 // If this operand is a literal, then we read it as a VBR8.
157 if operand_kind == 1 {
158 let val = cur.read_vbr(8)?;
159
160 // NOTE(ww): This error is exceedingly unlikely (usize would have to be larger
161 // than u64). But you never know.
162 operands.push(AbbrevOp::Literal(val));
163
164 continue;
165 }
166
167 // Otherwise, we need to suss the encoding representation out of it.
168 // This is always a 3-bit field (**not** a VBR3), which in turn tells us whether the
169 // operand encoding includes extra data.
170 let enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
171 let opnd = match enc {
172 AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
173 AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
174 AbbrevOpEnc::Array => {
175 // There is only ever one array operand in an abbreviation definition,
176 // and it is always the second-to-last operand. Anything else is an error.
177 if idx != num_abbrev_opnds - 2 {
178 return Err(Error::AbbrevParse("array operand at invalid index".into()));
179 }
180
181 // NOTE(ww): We get a little clever here: instead of parsing
182 // the inner array operand on its own, we steal it here and set
183 // `done_early` to indicate that we're done with operand parsing.
184 // This works since array operands are guaranteed to be second-to-last,
185 // followed only by their element operand encoding.
186 cur.read(1)?;
187 let elem_enc: AbbrevOpEnc = cur.read(3)?.try_into()?;
188 done_early = true;
189
190 let elem = match elem_enc {
191 AbbrevOpEnc::Fixed => AbbrevOp::Fixed(cur.read_vbr(5)?),
192 AbbrevOpEnc::Vbr => AbbrevOp::Vbr(cur.read_vbr(5)?),
193 AbbrevOpEnc::Char6 => AbbrevOp::Char6,
194 _ => {
195 // Blobs and arrays cannot themselves be member types.
196 return Err(Error::AbbrevParse(format!(
197 "invalid element type for an array: {:?}",
198 elem_enc
199 )));
200 }
201 };
202
203 AbbrevOp::Array(Box::new(elem))
204 }
205 AbbrevOpEnc::Char6 => AbbrevOp::Char6,
206 AbbrevOpEnc::Blob => {
207 // Similarly to arrays: there is only ever one blob operand.
208 // Blobs don't have an element type, so they're always the last operand.
209 if idx != num_abbrev_opnds - 1 {
210 return Err(Error::AbbrevParse("blob operand at invalid index".into()));
211 }
212
213 AbbrevOp::Blob
214 }
215 };
216
217 operands.push(opnd);
218
219 // See above: don't complete the entire operand parsing loop if we've successfully
220 // stolen the last operand as part of an array.
221 if done_early {
222 break;
223 }
224 }
225
226 Ok(Self { operands: operands })
227 }
228
229 /// Parse an abbreviated record from this stream, returning its fields.
230 pub fn parse<T: AsRef<[u8]>>(&self, cur: &mut BitCursor<T>) -> Result<Fields, Error> {
231 Ok(self
232 .operands
233 .iter()
234 .map(|opnd| opnd.parse(cur))
235 .collect::<Result<Vec<_>, _>>()?
236 .into_iter()
237 .flatten()
238 .collect())
239 }
240}