1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
//! Core parsing functionality for `llvm-bitstream`.

use std::collections::HashMap;
use std::convert::TryInto;
use std::iter;

use llvm_bitcursor::BitCursor;
use llvm_constants::{
    BlockInfoCode, ReservedAbbrevId, ReservedBlockId, FIRST_APPLICATION_ABBREV_ID,
    INITIAL_ABBREV_ID_WIDTH,
};

use crate::abbrev::{self, AbbrevId};
use crate::error::Error;
use crate::record::{Block, Fields, Record};

/// The kinds of entries we can see while advancing through the bitstream.
///
/// Abbreviation definitions are handled transparently by the parser, and thus
/// are never surfaced as `StreamEntry` values. The same holds for `BLOCKINFO`
/// blocks and the records inside them: the parser consumes them internally.
#[derive(Debug)]
pub enum StreamEntry {
    /// The end of a block scope.
    EndBlock,
    /// The beginning of a new block scope; the payload carries the block's ID
    /// and its encoded length.
    SubBlock(Block),
    /// A record within the current scope. Records parsed through an
    /// abbreviation definition carry the abbreviation ID that was used.
    Record(Record),
}

/// Represents the necessary parse state for a particular scope in the bitstream.
///
/// Note that a scope does not *necessarily* correspond to a block: every
/// parser begins with an initial non-block scope before the first block is encountered.
#[derive(Debug)]
enum Scope {
    /// The synthetic scope that is active before any block has been entered.
    /// It holds no abbreviation definitions and uses `INITIAL_ABBREV_ID_WIDTH`.
    Initial,
    /// The scope of a block that has been entered.
    Block {
        /// Width, in bits, used to read abbreviation IDs inside this block.
        abbrev_id_width: u64,
        /// The ID of the block this scope belongs to.
        block_id: u64,
        /// The block ID most recently selected via `SETBID`; only meaningful
        /// when this scope is a `BLOCKINFO` block, and `None` until set.
        blockinfo_block_id: Option<u64>,
        /// Abbreviation definitions visible in this scope, in definition order.
        /// The definition for abbreviation ID `n` lives at index
        /// `n - FIRST_APPLICATION_ABBREV_ID`.
        abbrevs: Vec<abbrev::Abbrev>,
    },
}

impl Default for Scope {
    fn default() -> Self {
        Self::Initial
    }
}

impl Scope {
    /// Returns a new (block) scope.
    pub(self) fn new(abbrev_id_width: u64, block_id: u64) -> Self {
        Self::Block {
            abbrev_id_width: abbrev_id_width,
            block_id: block_id,
            blockinfo_block_id: None,
            abbrevs: vec![],
        }
    }

    /// Returns the current width used for abbreviation IDs.
    pub(self) fn abbrev_id_width(&self) -> u64 {
        match self {
            Scope::Initial => INITIAL_ABBREV_ID_WIDTH,
            Scope::Block {
                abbrev_id_width, ..
            } => *abbrev_id_width,
        }
    }

    /// Extend the current (block) scope's abbreviation definition list with the given
    /// iterator.
    ///
    /// Returns an error if used on a non-block scope.
    pub(self) fn extend_abbrevs(
        &mut self,
        new_abbrevs: impl iter::IntoIterator<Item = abbrev::Abbrev>,
    ) -> Result<(), Error> {
        match self {
            Scope::Initial => Err(Error::BadScope(
                "non-block scope cannot reference abbreviations".into(),
            )),
            Scope::Block { abbrevs, .. } => {
                abbrevs.extend(new_abbrevs);
                Ok(())
            }
        }
    }

    /// Return a reference to the abbreviation definition with the given `abbrev_id`.
    ///
    /// Returns an error if the scope cannot contain abbreviation definitions or does
    /// not have one for the given ID.
    pub(self) fn get_abbrev(&self, abbrev_id: u64) -> Result<&abbrev::Abbrev, Error> {
        match self {
            Scope::Initial => Err(Error::BadScope(
                "non-block scope cannot contain records".into(),
            )),
            Scope::Block { abbrevs, .. } => {
                let idx = (abbrev_id as usize) - FIRST_APPLICATION_ABBREV_ID;
                abbrevs.get(idx).ok_or(Error::BadAbbrev(abbrev_id))
            }
        }
    }

    /// Returns `true` if this scope corresponds to a `BLOCKINFO` block.
    ///
    /// This keeps the [`StreamParser`](StreamParser) honest when determining
    /// which blocks and/or records to emit entries for.
    pub(self) fn is_blockinfo(&self) -> bool {
        match self {
            Scope::Initial => false,
            Scope::Block { block_id, .. } => *block_id == ReservedBlockId::BlockInfo as u64,
        }
    }

    /// Returns the last block ID recorded with `SETBID` in the `BLOCKINFO` block.
    ///
    /// This function's return is only sensible in the context of a scope corresponding
    /// to `BLOCKINFO`. Use on any other scope constitutes API misuse.
    pub(self) fn blockinfo_block_id(&self) -> Option<u64> {
        match self {
            Scope::Initial => None,
            Scope::Block {
                blockinfo_block_id, ..
            } => *blockinfo_block_id,
        }
    }

    /// Sets the current block ID for the `BLOCKINFO` block's state machine.
    ///
    /// Returns an error if requested in a nonsense context, such as on any
    /// non-`BLOCKINFO` scope.
    pub(self) fn set_blockinfo_block_id(&mut self, new_bid: u64) -> Result<(), Error> {
        if let Scope::Block {
            blockinfo_block_id, ..
        } = self
        {
            *blockinfo_block_id = Some(new_bid);
            return Ok(());
        }

        Err(Error::BadScope(
            "can't set BLOCKINFO block ID for non-BLOCKINFO scope".into(),
        ))
    }
}

/// A parser for individual bitstream entries.
///
/// This structure is **not** a general-purpose parser for bitstream inputs:
/// it expects to be given a prepared [`BitCursor`](BitCursor) whose internal
/// state is correct (i.e., has been advanced past the initial input magic).
///
/// For a general-purpose parser with the correct state management, see
/// [`Bitstream`](crate::Bitstream).
#[derive(Debug)]
pub struct StreamParser<T: AsRef<[u8]>> {
    /// The cursor from which all entries are read.
    cursor: BitCursor<T>,
    /// The stack of active scopes. The bottom element is always the synthetic
    /// initial scope, so the stack is never empty.
    scopes: Vec<Scope>,
    /// Abbreviation definitions collected from `BLOCKINFO`, keyed by the block
    /// ID they apply to. They are copied into a block's scope when a block
    /// with a matching ID is entered.
    blockinfo: HashMap<u64, Vec<abbrev::Abbrev>>,
}

impl<T: AsRef<[u8]>> StreamParser<T> {
    /// Create a new `StreamParser` from the given `BitCursor`.
    ///
    /// See the struct-level documentation for caveats.
    pub(crate) fn new(cur: BitCursor<T>) -> Self {
        Self {
            cursor: cur,
            scopes: vec![Scope::default()],
            blockinfo: Default::default(),
        }
    }

    /// Returns the current scope.
    fn scope(&self) -> &Scope {
        // Unwrap safety: `scopes` is always created with at least one scope, so
        // `last()` cannot fail.
        #[allow(clippy::unwrap_used)]
        self.scopes.last().unwrap()
    }

    /// Returns the current scope as a mutable reference.
    fn scope_mut(&mut self) -> &mut Scope {
        // Unwrap safety: `scopes` is always created with at least one scope, so
        // `last()` cannot fail.
        #[allow(clippy::unwrap_used)]
        self.scopes.last_mut().unwrap()
    }

    /// Enter a block, creating the appropriate scope state for interpreting
    /// records within the block.
    ///
    /// If this block is a "metadata" one (e.g., `BLOCKINFO`), returns `None`.
    fn enter_block(&mut self) -> Result<Option<StreamEntry>, Error> {
        let block_id = self.cursor.read_vbr(8)?;
        let new_width = self.cursor.read_vbr(4)?;

        self.cursor.align32();

        if new_width < 1 {
            return Err(Error::BadScope(format!(
                "can't enter block: invalid code side: {}",
                new_width
            )));
        }

        // The encoded block length is measured in 32-bit words, so our
        // actual block length in bytes is the word count times the bytesize
        // of each word.
        let block_len = self.cursor.read(32)? * 4;
        log::debug!(
            "entered block: ID={}, new abbrev width={}, block_len={} @ bit position {}",
            block_id,
            new_width,
            block_len,
            self.cursor.tell_bit()
        );

        // Create a new scope for the block we've just entered.
        self.scopes.push(Scope::new(new_width, block_id));

        // If our blockinfo map contains any abbrevs for the current block ID, add them here.
        if let Some(abbrevs) = self.blockinfo.get(&block_id).map(|a| a.to_vec()) {
            self.scope_mut().extend_abbrevs(abbrevs)?;
        }

        // If we've just entered a BLOCKINFO block, return `None` to avoid
        // surfacing parse details to the `advance()` API.
        if self.scope().is_blockinfo() {
            return Ok(None);
        }

        // Otherwise, return an appropriate entry.
        Ok(Some(StreamEntry::SubBlock(Block {
            block_id: block_id,
            len: block_len,
        })))
    }

    /// Exit a block, returning the scope to the appropriate state for the parent block.
    fn exit_block(&mut self) -> Result<Option<StreamEntry>, Error> {
        // An END_BLOCK record just aligns the stream.
        self.cursor.align32();

        // NOTE(ww): We never allow an END_BLOCK to pop the last scope,
        // since the last scope is synthetic and does not correspond to a real block.
        if self.scopes.len() <= 1 {
            return Err(Error::BadScope(
                "malformed stream: cannot perform END_BLOCK because scope stack is empty".into(),
            ));
        }

        // Unwrap safety: we check for at least one scope above, so this cannot fail.
        #[allow(clippy::unwrap_used)]
        let scope = self.scopes.pop().unwrap();

        log::debug!("exit_block: new active scope is {:?}", self.scope());

        // If we're exiting a BLOCKINFO, we have nothing to return.
        if scope.is_blockinfo() {
            return Ok(None);
        }

        Ok(Some(StreamEntry::EndBlock))
    }

    /// Interpret a `DEFINE_ABBREV` record.
    fn define_abbrev(&mut self) -> Result<(), Error> {
        let abbrev = abbrev::Abbrev::new(&mut self.cursor)?;
        log::debug!("new abbrev: {:?}", abbrev);

        // `DEFINE_ABBREV` occurs in two contexts: either in a `BLOCKINFO`
        // block (where it affects all blocks with block ID defined by the current `SETBID`),
        // or in any other block, where it affects only the current scope.
        // For the latter case we assume that any `BLOCKINFO`-defined abbrevs have
        // already been loaded into the current scope.
        if self.scope().is_blockinfo() {
            let block_id = self.scope().blockinfo_block_id().ok_or_else(|| {
                Error::StreamParse("DEFINE_ABBREV in BLOCKINFO but no preceding SETBID".into())
            })?;
            self.blockinfo
                .entry(block_id)
                .or_insert_with(Vec::new)
                .push(abbrev);
        } else {
            self.scope_mut().extend_abbrevs(iter::once(abbrev))?;
        }

        Ok(())
    }

    /// Interpret an `UNABBREV_RECORD` record.
    fn parse_unabbrev(&mut self) -> Result<Option<StreamEntry>, Error> {
        // Sanity check: `UNABBREV_RECORD` can only occur inside a block,
        // so the current scope must be a block.
        if matches!(self.scope(), Scope::Initial) {
            return Err(Error::StreamParse(
                "UNABBREV_RECORD outside of any block scope".into(),
            ));
        }

        // An unabbrev record looks like this:
        // [code:VBR6, numops:VBR6, op0:VBR6, op1:VBR6, ...]
        // This isn't worth generalizing, so do it all in the body here.
        let code: u64 = self.cursor.read_vbr(6)?;
        let num_opnds = self.cursor.read_vbr(6)?;

        log::debug!("unabbrev record code={}, num_opnds={}", code, num_opnds);

        let mut fields: Fields = Vec::with_capacity(num_opnds as usize);
        for _ in 0..num_opnds {
            fields.push(self.cursor.read_vbr(6)?);
        }

        let record = Record::from_unabbrev(code, fields);
        if self.scope().is_blockinfo() {
            let code: BlockInfoCode = record.code.try_into()?;
            match code {
                BlockInfoCode::SetBid => {
                    let block_id: u64 = record.fields[0];
                    log::debug!("SETBID: BLOCKINFO block ID is now {}", block_id);
                    self.scope_mut().set_blockinfo_block_id(block_id)?;
                }
                BlockInfoCode::BlockName => log::debug!("skipping BLOCKNAME code in BLOCKINFO"),
                BlockInfoCode::SetRecordName => {
                    log::debug!("skipping SETRECORDNAME code in BLOCKINFO")
                }
            };
            return Ok(None);
        }

        Ok(Some(StreamEntry::Record(record)))
    }

    /// Interpret a record using its corresponding abbreviation definition.
    fn parse_with_abbrev(&mut self, abbrev_id: u64) -> Result<Option<StreamEntry>, Error> {
        // To parse a record according to an abbreviation definition, we
        // fetch the corresponding abbreviation (failing if we don't have one),
        // then use the abbreviation for the parse.
        // TODO(ww): The clone at the end here is a little annoying, but we
        // need it to avoid mixing mutable and immutable borrows here.
        // There is absolutely a better way to do that.
        let abbrev = self.scope().get_abbrev(abbrev_id)?.clone();

        let mut fields = abbrev.parse(&mut self.cursor)?;
        log::debug!("parsed fields: {:?}", fields);

        // Panic safety: every abbrev contains at least one operand, so this cannot panic.
        // We also expect the first operand to always be a u64, indicating the record code.
        let code: u64 = fields.remove(0);

        if self.scope().is_blockinfo() {
            return Ok(None);
        }

        Ok(Some(StreamEntry::Record(Record {
            abbrev_id: Some(abbrev_id),
            code: code,
            fields: fields,
        })))
    }

    /// Return the next [`StreamEntry`](StreamEntry) in this bitstream.
    ///
    /// Returns an error on any parsing error, *or* the special
    /// [`Error::Exhausted`](Error::Exhausted) if the bitstream has
    /// been fully consumed.
    pub fn advance(&mut self) -> Result<StreamEntry, Error> {
        if self.cursor.exhausted() {
            return Err(Error::Exhausted);
        }

        log::debug!(
            "advancing, current scope: {:?} @ bit position {}",
            self.scope(),
            self.cursor.tell_bit()
        );

        // To return the next stream entry, we read the next abbreviation ID using
        // our current width. The abbreviation ID we read determines our subsequent
        // parse strategy and the kind of entry we return.
        let id: abbrev::AbbrevId = self
            .cursor
            .read(self.scope().abbrev_id_width() as usize)?
            .into();
        log::debug!("next entry ID: {:?}", id);

        // NOTE(ww): The strange `map` + `unwrap_or_else` pattern below is to keep the parser
        // generalized without having to return `StreamEntries` that correspond to
        // parse details that a stream consumer shouldn't have to be aware of
        // (such as abbrev definitions and the BLOCKINFO block).
        match id {
            AbbrevId::Reserved(ReservedAbbrevId::EndBlock) => {
                self.exit_block()?.map(Ok).unwrap_or_else(|| self.advance())
            }
            AbbrevId::Reserved(ReservedAbbrevId::EnterSubBlock) => self
                .enter_block()?
                .map(Ok)
                .unwrap_or_else(|| self.advance()),
            AbbrevId::Reserved(ReservedAbbrevId::DefineAbbrev) => {
                // DEFINE_ABBREV is always a parse detail, so we don't even bother
                // trying to return a StreamEntry for it.
                self.define_abbrev()?;
                self.advance()
            }
            AbbrevId::Reserved(ReservedAbbrevId::UnabbrevRecord) => self
                .parse_unabbrev()?
                .map(Ok)
                .unwrap_or_else(|| self.advance()),
            AbbrevId::Defined(abbrev_id) => self
                .parse_with_abbrev(abbrev_id)?
                .map(Ok)
                .unwrap_or_else(|| self.advance()),
        }
    }
}