llvm_mapper/
unroll.rs

1//! Routines and structures for "unrolling" a [`Bitstream`](llvm_bitstream::Bitstream)
2//! into a block-and-record hierarchy.
3
4use std::convert::{TryFrom, TryInto};
5
6use indexmap::IndexMap;
7use llvm_bitstream::parser::StreamEntry;
8use llvm_bitstream::record::{Block, Record};
9use llvm_bitstream::Bitstream;
10use llvm_support::bitcodes::IrBlockId;
11use thiserror::Error;
12
13use crate::block::{BlockId, BlockMapError, Identification, Module, Strtab, Symtab};
14use crate::error::Error;
15use crate::map::{PartialCtxMappable, PartialMapCtx};
16use crate::record::{RecordBlobError, RecordStringError};
17
18/// An "unrolled" record. This is internally indistinguishable from a raw bitstream
19/// [`Record`](llvm_bitstream::record::Record), but is newtyped to enforce proper
20/// isolation of concerns.
21#[derive(Clone, Debug)]
22pub struct UnrolledRecord(Record);
23
24impl UnrolledRecord {
25    /// Returns this record's code.
26    pub fn code(&self) -> u64 {
27        self.0.code
28    }
29
30    /// Attempt to pull a UTF-8 string from this record's fields.
31    ///
32    /// Strings are always the last fields in a record, so only the start
33    /// index is required.
34    pub fn try_string(&self, idx: usize) -> Result<String, RecordStringError> {
35        // If our start index lies beyond the record fields or would produce
36        // an empty string, it's invalid.
37        if idx >= self.0.fields.len() - 1 {
38            return Err(RecordStringError::BadIndex(idx, self.0.fields.len()));
39        }
40
41        // Each individual field in our string must fit into a byte.
42        let raw = self.0.fields[idx..]
43            .iter()
44            .map(|f| u8::try_from(*f))
45            .collect::<Result<Vec<_>, _>>()?;
46
47        // Finally, the buffer itself must decode correctly.
48        String::from_utf8(raw).map_err(RecordStringError::from)
49    }
50
51    /// Attempt to pull a blob of bytes from this record's fields.
52    ///
53    /// Blobs are always the last fields in a record, so only the start index is required.
54    pub fn try_blob(&self, idx: usize) -> Result<Vec<u8>, RecordBlobError> {
55        // If our start index lies beyond the record fields or would produce
56        // an empty string, it's invalid.
57        if idx >= self.0.fields.len() - 1 {
58            return Err(RecordBlobError::BadIndex(idx, self.0.fields.len()));
59        }
60
61        // Each individual field in our blob must fit into a byte.
62        Ok(self.0.fields[idx..]
63            .iter()
64            .map(|f| u8::try_from(*f))
65            .collect::<Result<Vec<_>, _>>()?)
66    }
67
68    /// Returns a reference to this record's fields.
69    pub fn fields(&self) -> &[u64] {
70        &self.0.fields
71    }
72}
73
74/// Errors that can occur when attempting to search for blocks and records within
75/// an unrolled bitstream.
76#[derive(Debug, Error)]
77pub enum ConsistencyError {
78    /// We expected a (sub-)block of the given ID, but couldn't find one.
79    #[error("expected a block with {0:?} but not present")]
80    MissingBlock(BlockId),
81
82    /// We expected exactly one (sub-)block of the given ID, but found more than one.
83    #[error("expected exactly one block with {0:?} but got more than one")]
84    TooManyBlocks(BlockId),
85
86    /// We expected a record of the given code, but couldn't find one.
87    #[error("expected a record of code {0} but not present")]
88    MissingRecord(u64),
89
90    /// We expected exactly one record of the given code, but found more than one.
91    #[error("expected exactly one record of code {0} but got more than one")]
92    TooManyRecords(u64),
93}
94
95/// Represents a collection of unrolled records.
96#[derive(Clone, Debug, Default)]
97pub struct UnrolledRecords(Vec<UnrolledRecord>);
98
99impl UnrolledRecords {
100    /// Return an iterator for all records that share the given code. Records
101    /// are iterated in the order of insertion.
102    ///
103    /// The returned iterator is empty if the block doesn't have any matching records.
104    pub(crate) fn by_code<'a>(
105        &'a self,
106        code: impl Into<u64> + 'a,
107    ) -> impl Iterator<Item = &UnrolledRecord> + 'a {
108        let code = code.into();
109        self.0.iter().filter(move |r| r.code() == code)
110    }
111
112    /// Returns the first record matching the given code, or `None` if there are
113    /// no matches.
114    ///
115    /// Ignores any subsequent matches.
116    pub(crate) fn one<'a>(&'a self, code: impl Into<u64> + 'a) -> Option<&UnrolledRecord> {
117        self.by_code(code).next()
118    }
119
120    /// Returns exactly one record matching the given code, or a variant indicating
121    /// the error condition (no matching records, or too many records).
122    pub(crate) fn exactly_one<'a>(
123        &'a self,
124        code: impl Into<u64> + 'a,
125    ) -> Result<&UnrolledRecord, ConsistencyError> {
126        let code = code.into();
127        let mut records = self.by_code(code);
128
129        match records.next() {
130            None => Err(ConsistencyError::MissingRecord(code)),
131            Some(r) => match records.next() {
132                None => Ok(r),
133                Some(_) => Err(ConsistencyError::TooManyRecords(code)),
134            },
135        }
136    }
137
138    /// Return an option of one record matching the given code or `None`, or an
139    /// `Err` variant if more than one matching record is present.
140    pub(crate) fn one_or_none<'a>(
141        &'a self,
142        code: impl Into<u64> + 'a,
143    ) -> Result<Option<&UnrolledRecord>, ConsistencyError> {
144        let code = code.into();
145        let mut records = self.by_code(code);
146
147        match records.next() {
148            None => Ok(None),
149            Some(r) => match records.next() {
150                None => Ok(Some(r)),
151                Some(_) => Err(ConsistencyError::TooManyRecords(code)),
152            },
153        }
154    }
155}
156
157impl<'a> IntoIterator for &'a UnrolledRecords {
158    type Item = &'a UnrolledRecord;
159    type IntoIter = std::slice::Iter<'a, UnrolledRecord>;
160
161    fn into_iter(self) -> Self::IntoIter {
162        self.0.iter()
163    }
164}
165
166/// Represents a collection of unrolled blocks.
167#[derive(Clone, Debug, Default)]
168pub struct UnrolledBlocks(IndexMap<BlockId, Vec<UnrolledBlock>>);
169
170impl UnrolledBlocks {
171    /// Return an iterator over all sub-blocks within this block that share the given ID.
172    ///
173    /// The returned iterator is empty if the block doesn't have any matching sub-blocks.
174    pub(crate) fn by_id(&self, id: BlockId) -> impl Iterator<Item = &UnrolledBlock> + '_ {
175        self.0.get(&id).into_iter().flatten()
176    }
177
178    pub(crate) fn exactly_one(&self, id: BlockId) -> Result<&UnrolledBlock, ConsistencyError> {
179        let mut blocks = self.by_id(id);
180
181        match blocks.next() {
182            None => Err(ConsistencyError::MissingBlock(id)),
183            Some(b) => match blocks.next() {
184                None => Ok(b),
185                Some(_) => Err(ConsistencyError::TooManyBlocks(id)),
186            },
187        }
188    }
189
190    /// Return an option of one block matching the given code or `None`, or an
191    /// `Err` variant if more than one matching block is present.
192    pub(crate) fn one_or_none(
193        &self,
194        id: BlockId,
195    ) -> Result<Option<&UnrolledBlock>, ConsistencyError> {
196        let mut blocks = self.by_id(id);
197
198        match blocks.next() {
199            None => Ok(None),
200            Some(b) => match blocks.next() {
201                None => Ok(Some(b)),
202                Some(_) => Err(ConsistencyError::TooManyBlocks(id)),
203            },
204        }
205    }
206}
207
208/// A fully unrolled block within the bitstream, with potential records
209/// and sub-blocks.
210#[derive(Clone, Debug)]
211pub struct UnrolledBlock {
212    /// This block's ID.
213    pub id: BlockId,
214    /// The [`UnrolledRecord`](UnrolledRecord)s directly contained by this block.
215    // NOTE(ww): It would be nice if we could map this list of records by their codes,
216    // since that would save us some time when scanning blocks for particular
217    // kinds of records. Doing so correctly is tricky: even with an order-preserving
218    // structure like IndexMap, we'd lose the correct order as we insert each record
219    // into its bucket.
220    records: UnrolledRecords,
221    /// The blocks directly contained by this block, mapped by their IDs. Like with records,
222    /// a block can contain multiple sub-blocks of the same ID.
223    blocks: UnrolledBlocks,
224}
225
226impl UnrolledBlock {
227    pub(self) fn new(id: u64) -> Self {
228        Self {
229            id: id.into(),
230            records: UnrolledRecords::default(),
231            // TODO(ww): Figure out a default capacity here.
232            blocks: UnrolledBlocks::default(),
233        }
234    }
235
236    /// Return a reference to all of the records in this block.
237    pub fn records(&self) -> &UnrolledRecords {
238        &self.records
239    }
240
241    /// Return a reference to all of the sub-blocks of this block.
242    pub fn blocks(&self) -> &UnrolledBlocks {
243        &self.blocks
244    }
245}
246
247/// A fully unrolled bitcode structure, taken from a bitstream.
248///
249/// Every `UnrolledBitcode` has a list of `BitstreamModule`s that it contains, each of
250/// which corresponds to a single LLVM IR module. In the simplest case, there will only be one.
251#[derive(Debug)]
252pub struct UnrolledBitcode {
253    /// The modules present in this bitcode stream.
254    pub modules: Vec<BitcodeModule>,
255}
256
257impl TryFrom<&[u8]> for UnrolledBitcode {
258    type Error = Error;
259
260    fn try_from(buf: &[u8]) -> Result<UnrolledBitcode, Self::Error> {
261        let (_, bitstream) = Bitstream::from(buf)?;
262
263        bitstream.try_into()
264    }
265}
266
267impl<T: AsRef<[u8]>> TryFrom<Bitstream<T>> for UnrolledBitcode {
268    type Error = Error;
269
270    fn try_from(mut bitstream: Bitstream<T>) -> Result<UnrolledBitcode, Self::Error> {
271        fn enter_block<T: AsRef<[u8]>>(
272            bitstream: &mut Bitstream<T>,
273            block: Block,
274        ) -> Result<UnrolledBlock, Error> {
275            let mut unrolled_block = UnrolledBlock::new(block.block_id);
276
277            // Once we're in a block, we do the following:
278            // 1. Take records, and add them to the current unrolled block;
279            // 2. Take sub-blocks, and enter them, adding them to our sub-block map;
280            // 3. Visit the end of our own block and return so that the caller
281            //    (which is either the bitstream context or another parent block)
282            //    can add us to its block map.
283            loop {
284                let entry = bitstream
285                    .next()
286                    .ok_or_else(|| Error::Unroll("unexpected stream end during unroll".into()))?;
287
288                match entry? {
289                    StreamEntry::Record(record) => {
290                        unrolled_block.records.0.push(UnrolledRecord(record))
291                    }
292                    StreamEntry::SubBlock(block) => {
293                        let unrolled_child = enter_block(bitstream, block)?;
294                        unrolled_block
295                            .blocks
296                            .0
297                            .entry(unrolled_child.id)
298                            .or_insert_with(Vec::new)
299                            .push(unrolled_child);
300                    }
301                    StreamEntry::EndBlock => {
302                        // End our current block scope.
303                        break;
304                    }
305                }
306            }
307
308            Ok(unrolled_block)
309        }
310
311        let mut partial_modules = Vec::new();
312
313        // Unrolling a bitstream into an `UnrolledBitcode` is a little involved:
314        //
315        // 1. There are multiple top-level blocks, each of which needs to be consumed.
316        // 2. Certain top-level blocks need to be grouped together to form a single BitcodeModule.
317        // 3. There can be multiple BitcodeModules-worth of top-level blocks in the stream.
318        loop {
319            // `None` means that we've exhausted the bitstream; we're done.
320            let entry = bitstream.next();
321            if entry.is_none() {
322                break;
323            }
324
325            // Take a top-level block from the stream.
326            let top_block = {
327                // Unwrap safety: we explicitly check the `None` case above.
328                // NOTE(ww): Other parts of the parser should be defensive against a malformed
329                // bitstream here, but it's difficult to represent that at the type level during unrolling.
330                #[allow(clippy::unwrap_used)]
331                let block = entry.unwrap()?.as_block().ok_or_else(|| {
332                    Error::Unroll("bitstream has non-blocks at the top-level scope".into())
333                })?;
334
335                enter_block(&mut bitstream, block)?
336            };
337
338            // Our top-level block can be one of four cases, if it's valid.
339            //
340            // Handle each accordingly.
341            match top_block.id {
342                BlockId::Ir(IrBlockId::Identification) => {
343                    // We've unrolled an IDENTIFICATION_BLOCK; this indicates the start of a new
344                    // bitcode module. Create a fresh PartialBitcodeModule to fill in, as more
345                    // top-level blocks become available.
346                    partial_modules.push(PartialBitcodeModule::new(top_block));
347                }
348                BlockId::Ir(IrBlockId::Module) => {
349                    // We've unrolled a MODULE_BLOCK; this contains the vast majority of the
350                    // state associated with an LLVM IR module. Grab the most recent
351                    // PartialBitcodeModule and fill it in, erroring appropriately if it already
352                    // has a module.
353                    //
354                    // NOTE(ww): We could encounter a top-level sequence that looks like this:
355                    //   [IDENTIFICATION_BLOCK, IDENTIFICATION_BLOCK, MODULE_BLOCK]
356                    // This would be malformed and in principle we should catch it here by searching
357                    // for the first PartialBitcodeModule lacking a module instead of taking
358                    // the most recent one, but the PartialBitcodeModule -> BitcodeModule reification
359                    // step will take care of that for us.
360                    let last_partial = partial_modules.last_mut().ok_or_else(|| {
361                        Error::Unroll("malformed bitstream: MODULE_BLOCK with no preceding IDENTIFICATION_BLOCK".into())
362                    })?;
363
364                    match &last_partial.module {
365                        Some(_) => {
366                            return Err(Error::Unroll(
367                                "malformed bitstream: adjacent MODULE_BLOCKs".into(),
368                            ))
369                        }
370                        None => last_partial.module = Some(top_block),
371                    }
372                }
373                BlockId::Ir(IrBlockId::Strtab) => {
374                    // We've unrolled a STRTAB_BLOCK; this contains the string table for one or
375                    // more preceding modules. Any modules that don't already have their own string
376                    // table are given their own copy of this one.
377                    //
378                    // NOTE(ww): Again, we could encounter a sequence that looks like this:
379                    //   [..., STRTAB_BLOCK, STRTAB_BLOCK]
380                    // This actually wouldn't be malformed, but is *is* nonsense: the second
381                    // STRTAB_BLOCK would have no effect on any BitcodeModule, since the first one
382                    // in sequence would already have been used for every prior module.
383                    // We don't bother catching this at the moment since LLVM's own reader doesn't
384                    // and it isn't erroneous per se (just pointless).
385                    for prev_partial in partial_modules
386                        .iter_mut()
387                        .rev()
388                        .take_while(|p| p.strtab.is_none())
389                    {
390                        prev_partial.strtab = Some(top_block.clone());
391                    }
392                }
393                BlockId::Ir(IrBlockId::Symtab) => {
394                    // We've unrolled a SYMTAB_BLOCK; this contains the symbol table (which, in
395                    // turn, references the string table) for one or more preceding modules. Any
396                    // modules that don't already have their own symbol table are given their own
397                    // copy of this one.
398                    //
399                    // NOTE(ww): The same nonsense layout with STRTAB_BLOCK applies here.
400                    for prev_partial in partial_modules
401                        .iter_mut()
402                        .rev()
403                        .take_while(|p| p.symtab.is_none())
404                    {
405                        prev_partial.symtab = Some(top_block.clone());
406                    }
407                }
408                _ => {
409                    return Err(Error::Unroll(format!(
410                        "unexpected top-level block: {:?}",
411                        top_block.id
412                    )))
413                }
414            }
415        }
416
417        let modules = partial_modules
418            .into_iter()
419            .map(|p| p.reify())
420            .collect::<Result<Vec<_>, _>>()?;
421        let unrolled = UnrolledBitcode { modules };
422
423        Ok(unrolled)
424    }
425}
426
427/// An internal, partial representation of a bitcode module, used when parsing each bitcode module
428/// to avoid polluting the `BitcodeModule` structure with optional types.
429#[derive(Debug)]
430struct PartialBitcodeModule {
431    identification: UnrolledBlock,
432    module: Option<UnrolledBlock>,
433    strtab: Option<UnrolledBlock>,
434    symtab: Option<UnrolledBlock>,
435}
436
437impl PartialBitcodeModule {
438    /// Create a new `PartialBitcodeModule`.
439    pub(self) fn new(identification: UnrolledBlock) -> Self {
440        Self {
441            identification: identification,
442            module: None,
443            strtab: None,
444            symtab: None,
445        }
446    }
447
448    /// Reify this `PartialBitcodeModule into a concrete `BitcodeModule`, mapping
449    /// each block along the way.
450    ///
451    /// Returns an error if the `PartialBitcodeModule` is lacking necessary state, or if
452    /// block and record mapping fails for any reason.
453    pub(self) fn reify(self) -> Result<BitcodeModule, Error> {
454        let mut ctx = PartialMapCtx::default();
455
456        // Grab the string table early, so that we can move it into our mapping context and
457        // use it for the remainder of the mapping phase.
458        let strtab = Strtab::try_map(
459            &self
460                .strtab
461                .ok_or_else(|| Error::Unroll("missing STRTAB_BLOCK for bitcode module".into()))?,
462            &mut ctx,
463        )
464        .map_err(BlockMapError::Strtab)?;
465
466        ctx.strtab = strtab;
467
468        let identification = Identification::try_map(&self.identification, &mut ctx)
469            .map_err(BlockMapError::Identification)?;
470
471        let module = Module::try_map(
472            &self
473                .module
474                .ok_or_else(|| Error::Unroll("missing MODULE_BLOCK for bitcode module".into()))?,
475            &mut ctx,
476        )
477        .map_err(BlockMapError::Module)?;
478
479        let symtab = self
480            .symtab
481            .map(|s| Symtab::try_map(&s, &mut ctx))
482            .transpose()
483            .map_err(BlockMapError::Symtab)?;
484
485        #[allow(clippy::unwrap_used)]
486        Ok(BitcodeModule {
487            identification: identification,
488            module: module,
489            strtab: ctx.strtab,
490            symtab: symtab,
491        })
492    }
493}
494
495/// A `BitcodeModule` encapsulates the top-level pieces of bitstream state needed for
496/// a single LLVM bitcode module: the `IDENTIFICATION_BLOCK`, the `MODULE_BLOCK` itself,
497/// a `STRTAB_BLOCK`, and a `SYMTAB_BLOCK` (if the last is present). A bitstream can
498/// contain multiple LLVM modules (e.g. if produced by `llvm-cat -b`), so parsing a bitstream
499/// can result in multiple `BitcodeModule`s.
500#[derive(Debug)]
501pub struct BitcodeModule {
502    /// The identification block associated with this module.
503    pub identification: Identification,
504
505    /// The module block associated with this module.
506    pub module: Module,
507
508    /// The string table associated with this module.
509    pub strtab: Strtab,
510
511    /// The symbol table associated with this module, if it has one.
512    pub symtab: Option<Symtab>,
513}
514
515#[cfg(test)]
516mod tests {
517    use super::*;
518
519    #[test]
520    fn test_unrolled_record_try_string() {
521        let record = UnrolledRecord(Record {
522            abbrev_id: None,
523            code: 0,
524            fields: b"\xff\xffvalid string!".iter().map(|b| *b as u64).collect(),
525        });
526
527        assert_eq!(record.try_string(2).unwrap(), "valid string!");
528        assert_eq!(record.try_string(8).unwrap(), "string!");
529
530        assert!(record.try_string(0).is_err());
531        assert!(record.try_string(record.0.fields.len()).is_err());
532        assert!(record.try_string(record.0.fields.len() - 1).is_err());
533    }
534
535    #[test]
536    fn test_unrolled_record_try_blob() {
537        let record = UnrolledRecord(Record {
538            abbrev_id: None,
539            code: 0,
540            fields: b"\xff\xffvalid string!".iter().map(|b| *b as u64).collect(),
541        });
542
543        assert_eq!(record.try_blob(0).unwrap(), b"\xff\xffvalid string!");
544        assert_eq!(record.try_blob(8).unwrap(), b"string!");
545
546        assert!(record.try_blob(record.0.fields.len()).is_err());
547        assert!(record.try_blob(record.0.fields.len() - 1).is_err());
548    }
549}