llvm_bitstream/parser.rs
1//! Core parsing functionality for `llvm-bitstream`.
2
3use std::collections::HashMap;
4use std::convert::TryInto;
5use std::iter;
6
7use llvm_bitcursor::BitCursor;
8use llvm_support::bitcodes::{BlockInfoCode, ReservedAbbrevId, ReservedBlockId};
9use llvm_support::{FIRST_APPLICATION_ABBREV_ID, INITIAL_ABBREV_ID_WIDTH};
10
11use crate::abbrev::{self, AbbrevId};
12use crate::error::Error;
13use crate::record::{Block, Fields, Record};
14
15/// The kinds of entries we can see while advancing through the bitstream.
16/// Abbreviations are handled transparently by the parser, and thus are
17/// never surfaced as `StreamEntry` values.
18#[derive(Debug)]
19pub enum StreamEntry {
20 /// The end of a block scope.
21 EndBlock,
22 /// The beginning of a new block scope, for a block with the given ID.
23 SubBlock(Block),
24 /// The beginning of a new record within the current scope, with the given
25 /// abbreviation ID.
26 Record(Record),
27}
28
29impl StreamEntry {
30 /// Consumes this `StreamEntry` and returns its inner [Block](crate::record::Block), if it is
31 /// in fact a block.
32 ///
33 /// If the entry is not a block, returns `None.
34 pub fn as_block(self) -> Option<Block> {
35 match self {
36 StreamEntry::SubBlock(block) => Some(block),
37 _ => None,
38 }
39 }
40}
41
42/// Represents the necessary parse state for a particular scope in the bitstream.
43///
44/// Note that a scope does not *necessarily* correspond to a block: every
45/// parser begins with an initial non-block scope before the first block is encountered.
46#[derive(Debug)]
47enum Scope {
48 Initial,
49 Block {
50 abbrev_id_width: u64,
51 block_id: u64,
52 blockinfo_block_id: Option<u64>,
53 abbrevs: Vec<abbrev::Abbrev>,
54 },
55}
56
57impl Default for Scope {
58 fn default() -> Self {
59 Self::Initial
60 }
61}
62
63impl Scope {
64 /// Returns a new (block) scope.
65 pub(self) fn new(abbrev_id_width: u64, block_id: u64) -> Self {
66 Self::Block {
67 abbrev_id_width: abbrev_id_width,
68 block_id: block_id,
69 blockinfo_block_id: None,
70 abbrevs: vec![],
71 }
72 }
73
74 /// Returns the current width used for abbreviation IDs.
75 pub(self) fn abbrev_id_width(&self) -> u64 {
76 match self {
77 Scope::Initial => INITIAL_ABBREV_ID_WIDTH,
78 Scope::Block {
79 abbrev_id_width, ..
80 } => *abbrev_id_width,
81 }
82 }
83
84 /// Extend the current (block) scope's abbreviation definition list with the given
85 /// iterator.
86 ///
87 /// Returns an error if used on a non-block scope.
88 pub(self) fn extend_abbrevs(
89 &mut self,
90 new_abbrevs: impl iter::IntoIterator<Item = abbrev::Abbrev>,
91 ) -> Result<(), Error> {
92 match self {
93 Scope::Initial => Err(Error::BadScope(
94 "non-block scope cannot reference abbreviations".into(),
95 )),
96 Scope::Block { abbrevs, .. } => {
97 abbrevs.extend(new_abbrevs);
98 Ok(())
99 }
100 }
101 }
102
103 /// Return a reference to the abbreviation definition with the given `abbrev_id`.
104 ///
105 /// Returns an error if the scope cannot contain abbreviation definitions or does
106 /// not have one for the given ID.
107 pub(self) fn get_abbrev(&self, abbrev_id: u64) -> Result<&abbrev::Abbrev, Error> {
108 match self {
109 Scope::Initial => Err(Error::BadScope(
110 "non-block scope cannot contain records".into(),
111 )),
112 Scope::Block { abbrevs, .. } => {
113 let idx = (abbrev_id as usize) - FIRST_APPLICATION_ABBREV_ID;
114 abbrevs.get(idx).ok_or(Error::BadAbbrev(abbrev_id))
115 }
116 }
117 }
118
119 /// Returns `true` if this scope corresponds to a `BLOCKINFO` block.
120 ///
121 /// This keeps the [`StreamParser`](StreamParser) honest when determining
122 /// which blocks and/or records to emit entries for.
123 pub(self) fn is_blockinfo(&self) -> bool {
124 match self {
125 Scope::Initial => false,
126 Scope::Block { block_id, .. } => *block_id == ReservedBlockId::BlockInfo as u64,
127 }
128 }
129
130 /// Returns the last block ID recorded with `SETBID` in the `BLOCKINFO` block.
131 ///
132 /// This function's return is only sensible in the context of a scope corresponding
133 /// to `BLOCKINFO`. Use on any other scope constitutes API misuse.
134 pub(self) fn blockinfo_block_id(&self) -> Option<u64> {
135 match self {
136 Scope::Initial => None,
137 Scope::Block {
138 blockinfo_block_id, ..
139 } => *blockinfo_block_id,
140 }
141 }
142
143 /// Sets the current block ID for the `BLOCKINFO` block's state machine.
144 ///
145 /// Returns an error if requested in a nonsense context, such as on any
146 /// non-`BLOCKINFO` scope.
147 pub(self) fn set_blockinfo_block_id(&mut self, new_bid: u64) -> Result<(), Error> {
148 if let Scope::Block {
149 blockinfo_block_id, ..
150 } = self
151 {
152 *blockinfo_block_id = Some(new_bid);
153 return Ok(());
154 }
155
156 Err(Error::BadScope(
157 "can't set BLOCKINFO block ID for non-BLOCKINFO scope".into(),
158 ))
159 }
160}
161
162/// A parser for individual bitstream entries.
163///
164/// This structure is **not** a general-purpose parser for bitstream inputs:
165/// it expects to be given a prepared [`BitCursor`](BitCursor) whose internal
166/// state is correct (i.e., has been advanced past the initial input magic).
167///
168/// For a general-purpose parser with the correct state management, see
169/// [`Bitstream`](crate::Bitstream).
170#[derive(Debug)]
171pub struct StreamParser<T: AsRef<[u8]>> {
172 cursor: BitCursor<T>,
173 scopes: Vec<Scope>,
174 blockinfo: HashMap<u64, Vec<abbrev::Abbrev>>,
175}
176
177impl<T: AsRef<[u8]>> StreamParser<T> {
178 /// Create a new `StreamParser` from the given `BitCursor`.
179 ///
180 /// See the struct-level documentation for caveats.
181 pub(crate) fn new(cur: BitCursor<T>) -> Self {
182 Self {
183 cursor: cur,
184 scopes: vec![Scope::default()],
185 blockinfo: Default::default(),
186 }
187 }
188
189 /// Returns the current scope.
190 fn scope(&self) -> &Scope {
191 // Unwrap safety: `scopes` is always created with at least one scope, so
192 // `last()` cannot fail.
193 #[allow(clippy::unwrap_used)]
194 self.scopes.last().unwrap()
195 }
196
197 /// Returns the current scope as a mutable reference.
198 fn scope_mut(&mut self) -> &mut Scope {
199 // Unwrap safety: `scopes` is always created with at least one scope, so
200 // `last()` cannot fail.
201 #[allow(clippy::unwrap_used)]
202 self.scopes.last_mut().unwrap()
203 }
204
205 /// Enter a block, creating the appropriate scope state for interpreting
206 /// records within the block.
207 ///
208 /// If this block is a "metadata" one (e.g., `BLOCKINFO`), returns `None`.
209 fn enter_block(&mut self) -> Result<Option<StreamEntry>, Error> {
210 let block_id = self.cursor.read_vbr(8)?;
211 let new_width = self.cursor.read_vbr(4)?;
212
213 self.cursor.align32();
214
215 if new_width < 1 {
216 return Err(Error::BadScope(format!(
217 "can't enter block: invalid code side: {}",
218 new_width
219 )));
220 }
221
222 // The encoded block length is measured in 32-bit words, so our
223 // actual block length in bytes is the word count times the bytesize
224 // of each word.
225 let block_len = self.cursor.read(32)? * 4;
226 log::debug!(
227 "entered block: ID={}, new abbrev width={}, block_len={} @ bit position {}",
228 block_id,
229 new_width,
230 block_len,
231 self.cursor.tell_bit()
232 );
233
234 // Create a new scope for the block we've just entered.
235 self.scopes.push(Scope::new(new_width, block_id));
236
237 // If our blockinfo map contains any abbrevs for the current block ID, add them here.
238 if let Some(abbrevs) = self.blockinfo.get(&block_id).map(|a| a.to_vec()) {
239 self.scope_mut().extend_abbrevs(abbrevs)?;
240 }
241
242 // If we've just entered a BLOCKINFO block, return `None` to avoid
243 // surfacing parse details to the `advance()` API.
244 if self.scope().is_blockinfo() {
245 return Ok(None);
246 }
247
248 // Otherwise, return an appropriate entry.
249 Ok(Some(StreamEntry::SubBlock(Block {
250 block_id: block_id,
251 len: block_len,
252 })))
253 }
254
255 /// Exit a block, returning the scope to the appropriate state for the parent block.
256 fn exit_block(&mut self) -> Result<Option<StreamEntry>, Error> {
257 // An END_BLOCK record just aligns the stream.
258 self.cursor.align32();
259
260 // NOTE(ww): We never allow an END_BLOCK to pop the last scope,
261 // since the last scope is synthetic and does not correspond to a real block.
262 if self.scopes.len() <= 1 {
263 return Err(Error::BadScope(
264 "malformed stream: cannot perform END_BLOCK because scope stack is empty".into(),
265 ));
266 }
267
268 // Unwrap safety: we check for at least one scope above, so this cannot fail.
269 #[allow(clippy::unwrap_used)]
270 let scope = self.scopes.pop().unwrap();
271
272 log::debug!("exit_block: new active scope is {:?}", self.scope());
273
274 // If we're exiting a BLOCKINFO, we have nothing to return.
275 if scope.is_blockinfo() {
276 return Ok(None);
277 }
278
279 Ok(Some(StreamEntry::EndBlock))
280 }
281
282 /// Interpret a `DEFINE_ABBREV` record.
283 fn define_abbrev(&mut self) -> Result<(), Error> {
284 let abbrev = abbrev::Abbrev::new(&mut self.cursor)?;
285 log::debug!("new abbrev: {:?}", abbrev);
286
287 // `DEFINE_ABBREV` occurs in two contexts: either in a `BLOCKINFO`
288 // block (where it affects all blocks with block ID defined by the current `SETBID`),
289 // or in any other block, where it affects only the current scope.
290 // For the latter case we assume that any `BLOCKINFO`-defined abbrevs have
291 // already been loaded into the current scope.
292 if self.scope().is_blockinfo() {
293 let block_id = self.scope().blockinfo_block_id().ok_or_else(|| {
294 Error::StreamParse("DEFINE_ABBREV in BLOCKINFO but no preceding SETBID".into())
295 })?;
296 self.blockinfo
297 .entry(block_id)
298 .or_insert_with(Vec::new)
299 .push(abbrev);
300 } else {
301 self.scope_mut().extend_abbrevs(iter::once(abbrev))?;
302 }
303
304 Ok(())
305 }
306
307 /// Interpret an `UNABBREV_RECORD` record.
308 fn parse_unabbrev(&mut self) -> Result<Option<StreamEntry>, Error> {
309 // Sanity check: `UNABBREV_RECORD` can only occur inside a block,
310 // so the current scope must be a block.
311 if matches!(self.scope(), Scope::Initial) {
312 return Err(Error::StreamParse(
313 "UNABBREV_RECORD outside of any block scope".into(),
314 ));
315 }
316
317 // An unabbrev record looks like this:
318 // [code:VBR6, numops:VBR6, op0:VBR6, op1:VBR6, ...]
319 // This isn't worth generalizing, so do it all in the body here.
320 let code: u64 = self.cursor.read_vbr(6)?;
321 let num_opnds = self.cursor.read_vbr(6)?;
322
323 log::debug!("unabbrev record code={}, num_opnds={}", code, num_opnds);
324
325 let mut fields: Fields = Vec::with_capacity(num_opnds as usize);
326 for _ in 0..num_opnds {
327 fields.push(self.cursor.read_vbr(6)?);
328 }
329
330 let record = Record::from_unabbrev(code, fields);
331 if self.scope().is_blockinfo() {
332 let code: BlockInfoCode = record.code.try_into()?;
333 match code {
334 BlockInfoCode::SetBid => {
335 let block_id: u64 = record.fields[0];
336 log::debug!("SETBID: BLOCKINFO block ID is now {}", block_id);
337 self.scope_mut().set_blockinfo_block_id(block_id)?;
338 }
339 BlockInfoCode::BlockName => log::debug!("skipping BLOCKNAME code in BLOCKINFO"),
340 BlockInfoCode::SetRecordName => {
341 log::debug!("skipping SETRECORDNAME code in BLOCKINFO")
342 }
343 o => log::debug!("skipping unsupported record {:?} in BLOCKINFO", o),
344 };
345 return Ok(None);
346 }
347
348 Ok(Some(StreamEntry::Record(record)))
349 }
350
351 /// Interpret a record using its corresponding abbreviation definition.
352 fn parse_with_abbrev(&mut self, abbrev_id: u64) -> Result<Option<StreamEntry>, Error> {
353 // To parse a record according to an abbreviation definition, we
354 // fetch the corresponding abbreviation (failing if we don't have one),
355 // then use the abbreviation for the parse.
356 // TODO(ww): The clone at the end here is a little annoying, but we
357 // need it to avoid mixing mutable and immutable borrows here.
358 // There is absolutely a better way to do that.
359 let abbrev = self.scope().get_abbrev(abbrev_id)?.clone();
360
361 let mut fields = abbrev.parse(&mut self.cursor)?;
362 log::debug!("parsed fields: {:?}", fields);
363
364 // Panic safety: every abbrev contains at least one operand, so this cannot panic.
365 // We also expect the first operand to always be a u64, indicating the record code.
366 let code: u64 = fields.remove(0);
367
368 if self.scope().is_blockinfo() {
369 return Ok(None);
370 }
371
372 Ok(Some(StreamEntry::Record(Record {
373 abbrev_id: Some(abbrev_id),
374 code: code,
375 fields: fields,
376 })))
377 }
378
379 /// Return the next [`StreamEntry`](StreamEntry) in this bitstream.
380 ///
381 /// Returns an error on any parsing error, *or* the special
382 /// [`Error::Exhausted`](Error::Exhausted) if the bitstream has
383 /// been fully consumed.
384 pub fn advance(&mut self) -> Result<StreamEntry, Error> {
385 if self.cursor.exhausted() {
386 return Err(Error::Exhausted);
387 }
388
389 log::debug!(
390 "advancing, current scope: {:?} @ bit position {}",
391 self.scope(),
392 self.cursor.tell_bit()
393 );
394
395 // To return the next stream entry, we read the next abbreviation ID using
396 // our current width. The abbreviation ID we read determines our subsequent
397 // parse strategy and the kind of entry we return.
398 let id: abbrev::AbbrevId = self
399 .cursor
400 .read(self.scope().abbrev_id_width() as usize)?
401 .into();
402 log::debug!("next entry ID: {:?}", id);
403
404 // NOTE(ww): The strange `map` + `unwrap_or_else` pattern below is to keep the parser
405 // generalized without having to return `StreamEntries` that correspond to
406 // parse details that a stream consumer shouldn't have to be aware of
407 // (such as abbrev definitions and the BLOCKINFO block).
408 match id {
409 AbbrevId::Reserved(ReservedAbbrevId::EndBlock) => {
410 self.exit_block()?.map(Ok).unwrap_or_else(|| self.advance())
411 }
412 AbbrevId::Reserved(ReservedAbbrevId::EnterSubBlock) => self
413 .enter_block()?
414 .map(Ok)
415 .unwrap_or_else(|| self.advance()),
416 AbbrevId::Reserved(ReservedAbbrevId::DefineAbbrev) => {
417 // DEFINE_ABBREV is always a parse detail, so we don't even bother
418 // trying to return a StreamEntry for it.
419 self.define_abbrev()?;
420 self.advance()
421 }
422 AbbrevId::Reserved(ReservedAbbrevId::UnabbrevRecord) => self
423 .parse_unabbrev()?
424 .map(Ok)
425 .unwrap_or_else(|| self.advance()),
426 AbbrevId::Defined(abbrev_id) => self
427 .parse_with_abbrev(abbrev_id)?
428 .map(Ok)
429 .unwrap_or_else(|| self.advance()),
430 }
431 }
432}