llvm_bitstream/
parser.rs

1//! Core parsing functionality for `llvm-bitstream`.
2
3use std::collections::HashMap;
4use std::convert::TryInto;
5use std::iter;
6
7use llvm_bitcursor::BitCursor;
8use llvm_support::bitcodes::{BlockInfoCode, ReservedAbbrevId, ReservedBlockId};
9use llvm_support::{FIRST_APPLICATION_ABBREV_ID, INITIAL_ABBREV_ID_WIDTH};
10
11use crate::abbrev::{self, AbbrevId};
12use crate::error::Error;
13use crate::record::{Block, Fields, Record};
14
/// The kinds of entries we can see while advancing through the bitstream.
///
/// Abbreviation definitions are handled transparently by the parser, and thus
/// are never surfaced as `StreamEntry` values. The same holds for `BLOCKINFO`
/// blocks and their contents.
#[derive(Debug)]
pub enum StreamEntry {
    /// The end of a block scope.
    EndBlock,
    /// The beginning of a new block scope, described by the given
    /// [Block](crate::record::Block) (its block ID and length).
    SubBlock(Block),
    /// A record within the current scope.
    Record(Record),
}
28
29impl StreamEntry {
30    /// Consumes this `StreamEntry` and returns its inner [Block](crate::record::Block), if it is
31    /// in fact a block.
32    ///
33    /// If the entry is not a block, returns `None.
34    pub fn as_block(self) -> Option<Block> {
35        match self {
36            StreamEntry::SubBlock(block) => Some(block),
37            _ => None,
38        }
39    }
40}
41
/// Represents the necessary parse state for a particular scope in the bitstream.
///
/// Note that a scope does not *necessarily* correspond to a block: every
/// parser begins with an initial non-block scope before the first block is encountered.
#[derive(Debug)]
enum Scope {
    /// The synthetic scope that is active before any block has been entered.
    Initial,
    /// The scope of a block that has been entered.
    Block {
        /// The width, in bits, used to read abbreviation IDs inside this block.
        abbrev_id_width: u64,
        /// The ID of the block this scope was created for.
        block_id: u64,
        /// The block ID most recently recorded via `SETBID`; only meaningful
        /// when this scope belongs to a `BLOCKINFO` block.
        blockinfo_block_id: Option<u64>,
        /// The abbreviation definitions visible in this scope, indexed by
        /// abbreviation ID minus `FIRST_APPLICATION_ABBREV_ID`.
        abbrevs: Vec<abbrev::Abbrev>,
    },
}
56
57impl Default for Scope {
58    fn default() -> Self {
59        Self::Initial
60    }
61}
62
63impl Scope {
64    /// Returns a new (block) scope.
65    pub(self) fn new(abbrev_id_width: u64, block_id: u64) -> Self {
66        Self::Block {
67            abbrev_id_width: abbrev_id_width,
68            block_id: block_id,
69            blockinfo_block_id: None,
70            abbrevs: vec![],
71        }
72    }
73
74    /// Returns the current width used for abbreviation IDs.
75    pub(self) fn abbrev_id_width(&self) -> u64 {
76        match self {
77            Scope::Initial => INITIAL_ABBREV_ID_WIDTH,
78            Scope::Block {
79                abbrev_id_width, ..
80            } => *abbrev_id_width,
81        }
82    }
83
84    /// Extend the current (block) scope's abbreviation definition list with the given
85    /// iterator.
86    ///
87    /// Returns an error if used on a non-block scope.
88    pub(self) fn extend_abbrevs(
89        &mut self,
90        new_abbrevs: impl iter::IntoIterator<Item = abbrev::Abbrev>,
91    ) -> Result<(), Error> {
92        match self {
93            Scope::Initial => Err(Error::BadScope(
94                "non-block scope cannot reference abbreviations".into(),
95            )),
96            Scope::Block { abbrevs, .. } => {
97                abbrevs.extend(new_abbrevs);
98                Ok(())
99            }
100        }
101    }
102
103    /// Return a reference to the abbreviation definition with the given `abbrev_id`.
104    ///
105    /// Returns an error if the scope cannot contain abbreviation definitions or does
106    /// not have one for the given ID.
107    pub(self) fn get_abbrev(&self, abbrev_id: u64) -> Result<&abbrev::Abbrev, Error> {
108        match self {
109            Scope::Initial => Err(Error::BadScope(
110                "non-block scope cannot contain records".into(),
111            )),
112            Scope::Block { abbrevs, .. } => {
113                let idx = (abbrev_id as usize) - FIRST_APPLICATION_ABBREV_ID;
114                abbrevs.get(idx).ok_or(Error::BadAbbrev(abbrev_id))
115            }
116        }
117    }
118
119    /// Returns `true` if this scope corresponds to a `BLOCKINFO` block.
120    ///
121    /// This keeps the [`StreamParser`](StreamParser) honest when determining
122    /// which blocks and/or records to emit entries for.
123    pub(self) fn is_blockinfo(&self) -> bool {
124        match self {
125            Scope::Initial => false,
126            Scope::Block { block_id, .. } => *block_id == ReservedBlockId::BlockInfo as u64,
127        }
128    }
129
130    /// Returns the last block ID recorded with `SETBID` in the `BLOCKINFO` block.
131    ///
132    /// This function's return is only sensible in the context of a scope corresponding
133    /// to `BLOCKINFO`. Use on any other scope constitutes API misuse.
134    pub(self) fn blockinfo_block_id(&self) -> Option<u64> {
135        match self {
136            Scope::Initial => None,
137            Scope::Block {
138                blockinfo_block_id, ..
139            } => *blockinfo_block_id,
140        }
141    }
142
143    /// Sets the current block ID for the `BLOCKINFO` block's state machine.
144    ///
145    /// Returns an error if requested in a nonsense context, such as on any
146    /// non-`BLOCKINFO` scope.
147    pub(self) fn set_blockinfo_block_id(&mut self, new_bid: u64) -> Result<(), Error> {
148        if let Scope::Block {
149            blockinfo_block_id, ..
150        } = self
151        {
152            *blockinfo_block_id = Some(new_bid);
153            return Ok(());
154        }
155
156        Err(Error::BadScope(
157            "can't set BLOCKINFO block ID for non-BLOCKINFO scope".into(),
158        ))
159    }
160}
161
/// A parser for individual bitstream entries.
///
/// This structure is **not** a general-purpose parser for bitstream inputs:
/// it expects to be given a prepared [`BitCursor`](BitCursor) whose internal
/// state is correct (i.e., has been advanced past the initial input magic).
///
/// For a general-purpose parser with the correct state management, see
/// [`Bitstream`](crate::Bitstream).
#[derive(Debug)]
pub struct StreamParser<T: AsRef<[u8]>> {
    /// The cursor that all entries are read from.
    cursor: BitCursor<T>,
    /// The stack of active scopes. The bottom entry is always the synthetic
    /// initial scope; one block scope is pushed per entered block.
    scopes: Vec<Scope>,
    /// Abbreviation definitions collected from `BLOCKINFO`, keyed by the ID
    /// of the block they apply to.
    blockinfo: HashMap<u64, Vec<abbrev::Abbrev>>,
}
176
177impl<T: AsRef<[u8]>> StreamParser<T> {
178    /// Create a new `StreamParser` from the given `BitCursor`.
179    ///
180    /// See the struct-level documentation for caveats.
181    pub(crate) fn new(cur: BitCursor<T>) -> Self {
182        Self {
183            cursor: cur,
184            scopes: vec![Scope::default()],
185            blockinfo: Default::default(),
186        }
187    }
188
    /// Returns the current (innermost) scope, i.e. the top of the scope stack.
    fn scope(&self) -> &Scope {
        // Unwrap safety: `scopes` is always created with at least one scope, and
        // `exit_block` refuses to pop that final scope, so `last()` cannot fail.
        #[allow(clippy::unwrap_used)]
        self.scopes.last().unwrap()
    }
196
    /// Returns the current (innermost) scope as a mutable reference.
    fn scope_mut(&mut self) -> &mut Scope {
        // Unwrap safety: `scopes` is always created with at least one scope, and
        // `exit_block` refuses to pop that final scope, so `last_mut()` cannot fail.
        #[allow(clippy::unwrap_used)]
        self.scopes.last_mut().unwrap()
    }
204
205    /// Enter a block, creating the appropriate scope state for interpreting
206    /// records within the block.
207    ///
208    /// If this block is a "metadata" one (e.g., `BLOCKINFO`), returns `None`.
209    fn enter_block(&mut self) -> Result<Option<StreamEntry>, Error> {
210        let block_id = self.cursor.read_vbr(8)?;
211        let new_width = self.cursor.read_vbr(4)?;
212
213        self.cursor.align32();
214
215        if new_width < 1 {
216            return Err(Error::BadScope(format!(
217                "can't enter block: invalid code side: {}",
218                new_width
219            )));
220        }
221
222        // The encoded block length is measured in 32-bit words, so our
223        // actual block length in bytes is the word count times the bytesize
224        // of each word.
225        let block_len = self.cursor.read(32)? * 4;
226        log::debug!(
227            "entered block: ID={}, new abbrev width={}, block_len={} @ bit position {}",
228            block_id,
229            new_width,
230            block_len,
231            self.cursor.tell_bit()
232        );
233
234        // Create a new scope for the block we've just entered.
235        self.scopes.push(Scope::new(new_width, block_id));
236
237        // If our blockinfo map contains any abbrevs for the current block ID, add them here.
238        if let Some(abbrevs) = self.blockinfo.get(&block_id).map(|a| a.to_vec()) {
239            self.scope_mut().extend_abbrevs(abbrevs)?;
240        }
241
242        // If we've just entered a BLOCKINFO block, return `None` to avoid
243        // surfacing parse details to the `advance()` API.
244        if self.scope().is_blockinfo() {
245            return Ok(None);
246        }
247
248        // Otherwise, return an appropriate entry.
249        Ok(Some(StreamEntry::SubBlock(Block {
250            block_id: block_id,
251            len: block_len,
252        })))
253    }
254
255    /// Exit a block, returning the scope to the appropriate state for the parent block.
256    fn exit_block(&mut self) -> Result<Option<StreamEntry>, Error> {
257        // An END_BLOCK record just aligns the stream.
258        self.cursor.align32();
259
260        // NOTE(ww): We never allow an END_BLOCK to pop the last scope,
261        // since the last scope is synthetic and does not correspond to a real block.
262        if self.scopes.len() <= 1 {
263            return Err(Error::BadScope(
264                "malformed stream: cannot perform END_BLOCK because scope stack is empty".into(),
265            ));
266        }
267
268        // Unwrap safety: we check for at least one scope above, so this cannot fail.
269        #[allow(clippy::unwrap_used)]
270        let scope = self.scopes.pop().unwrap();
271
272        log::debug!("exit_block: new active scope is {:?}", self.scope());
273
274        // If we're exiting a BLOCKINFO, we have nothing to return.
275        if scope.is_blockinfo() {
276            return Ok(None);
277        }
278
279        Ok(Some(StreamEntry::EndBlock))
280    }
281
282    /// Interpret a `DEFINE_ABBREV` record.
283    fn define_abbrev(&mut self) -> Result<(), Error> {
284        let abbrev = abbrev::Abbrev::new(&mut self.cursor)?;
285        log::debug!("new abbrev: {:?}", abbrev);
286
287        // `DEFINE_ABBREV` occurs in two contexts: either in a `BLOCKINFO`
288        // block (where it affects all blocks with block ID defined by the current `SETBID`),
289        // or in any other block, where it affects only the current scope.
290        // For the latter case we assume that any `BLOCKINFO`-defined abbrevs have
291        // already been loaded into the current scope.
292        if self.scope().is_blockinfo() {
293            let block_id = self.scope().blockinfo_block_id().ok_or_else(|| {
294                Error::StreamParse("DEFINE_ABBREV in BLOCKINFO but no preceding SETBID".into())
295            })?;
296            self.blockinfo
297                .entry(block_id)
298                .or_insert_with(Vec::new)
299                .push(abbrev);
300        } else {
301            self.scope_mut().extend_abbrevs(iter::once(abbrev))?;
302        }
303
304        Ok(())
305    }
306
307    /// Interpret an `UNABBREV_RECORD` record.
308    fn parse_unabbrev(&mut self) -> Result<Option<StreamEntry>, Error> {
309        // Sanity check: `UNABBREV_RECORD` can only occur inside a block,
310        // so the current scope must be a block.
311        if matches!(self.scope(), Scope::Initial) {
312            return Err(Error::StreamParse(
313                "UNABBREV_RECORD outside of any block scope".into(),
314            ));
315        }
316
317        // An unabbrev record looks like this:
318        // [code:VBR6, numops:VBR6, op0:VBR6, op1:VBR6, ...]
319        // This isn't worth generalizing, so do it all in the body here.
320        let code: u64 = self.cursor.read_vbr(6)?;
321        let num_opnds = self.cursor.read_vbr(6)?;
322
323        log::debug!("unabbrev record code={}, num_opnds={}", code, num_opnds);
324
325        let mut fields: Fields = Vec::with_capacity(num_opnds as usize);
326        for _ in 0..num_opnds {
327            fields.push(self.cursor.read_vbr(6)?);
328        }
329
330        let record = Record::from_unabbrev(code, fields);
331        if self.scope().is_blockinfo() {
332            let code: BlockInfoCode = record.code.try_into()?;
333            match code {
334                BlockInfoCode::SetBid => {
335                    let block_id: u64 = record.fields[0];
336                    log::debug!("SETBID: BLOCKINFO block ID is now {}", block_id);
337                    self.scope_mut().set_blockinfo_block_id(block_id)?;
338                }
339                BlockInfoCode::BlockName => log::debug!("skipping BLOCKNAME code in BLOCKINFO"),
340                BlockInfoCode::SetRecordName => {
341                    log::debug!("skipping SETRECORDNAME code in BLOCKINFO")
342                }
343                o => log::debug!("skipping unsupported record {:?} in BLOCKINFO", o),
344            };
345            return Ok(None);
346        }
347
348        Ok(Some(StreamEntry::Record(record)))
349    }
350
351    /// Interpret a record using its corresponding abbreviation definition.
352    fn parse_with_abbrev(&mut self, abbrev_id: u64) -> Result<Option<StreamEntry>, Error> {
353        // To parse a record according to an abbreviation definition, we
354        // fetch the corresponding abbreviation (failing if we don't have one),
355        // then use the abbreviation for the parse.
356        // TODO(ww): The clone at the end here is a little annoying, but we
357        // need it to avoid mixing mutable and immutable borrows here.
358        // There is absolutely a better way to do that.
359        let abbrev = self.scope().get_abbrev(abbrev_id)?.clone();
360
361        let mut fields = abbrev.parse(&mut self.cursor)?;
362        log::debug!("parsed fields: {:?}", fields);
363
364        // Panic safety: every abbrev contains at least one operand, so this cannot panic.
365        // We also expect the first operand to always be a u64, indicating the record code.
366        let code: u64 = fields.remove(0);
367
368        if self.scope().is_blockinfo() {
369            return Ok(None);
370        }
371
372        Ok(Some(StreamEntry::Record(Record {
373            abbrev_id: Some(abbrev_id),
374            code: code,
375            fields: fields,
376        })))
377    }
378
379    /// Return the next [`StreamEntry`](StreamEntry) in this bitstream.
380    ///
381    /// Returns an error on any parsing error, *or* the special
382    /// [`Error::Exhausted`](Error::Exhausted) if the bitstream has
383    /// been fully consumed.
384    pub fn advance(&mut self) -> Result<StreamEntry, Error> {
385        if self.cursor.exhausted() {
386            return Err(Error::Exhausted);
387        }
388
389        log::debug!(
390            "advancing, current scope: {:?} @ bit position {}",
391            self.scope(),
392            self.cursor.tell_bit()
393        );
394
395        // To return the next stream entry, we read the next abbreviation ID using
396        // our current width. The abbreviation ID we read determines our subsequent
397        // parse strategy and the kind of entry we return.
398        let id: abbrev::AbbrevId = self
399            .cursor
400            .read(self.scope().abbrev_id_width() as usize)?
401            .into();
402        log::debug!("next entry ID: {:?}", id);
403
404        // NOTE(ww): The strange `map` + `unwrap_or_else` pattern below is to keep the parser
405        // generalized without having to return `StreamEntries` that correspond to
406        // parse details that a stream consumer shouldn't have to be aware of
407        // (such as abbrev definitions and the BLOCKINFO block).
408        match id {
409            AbbrevId::Reserved(ReservedAbbrevId::EndBlock) => {
410                self.exit_block()?.map(Ok).unwrap_or_else(|| self.advance())
411            }
412            AbbrevId::Reserved(ReservedAbbrevId::EnterSubBlock) => self
413                .enter_block()?
414                .map(Ok)
415                .unwrap_or_else(|| self.advance()),
416            AbbrevId::Reserved(ReservedAbbrevId::DefineAbbrev) => {
417                // DEFINE_ABBREV is always a parse detail, so we don't even bother
418                // trying to return a StreamEntry for it.
419                self.define_abbrev()?;
420                self.advance()
421            }
422            AbbrevId::Reserved(ReservedAbbrevId::UnabbrevRecord) => self
423                .parse_unabbrev()?
424                .map(Ok)
425                .unwrap_or_else(|| self.advance()),
426            AbbrevId::Defined(abbrev_id) => self
427                .parse_with_abbrev(abbrev_id)?
428                .map(Ok)
429                .unwrap_or_else(|| self.advance()),
430        }
431    }
432}