llvm_mapper/unroll.rs
1//! Routines and structures for "unrolling" a [`Bitstream`](llvm_bitstream::Bitstream)
2//! into a block-and-record hierarchy.
3
4use std::convert::{TryFrom, TryInto};
5
6use indexmap::IndexMap;
7use llvm_bitstream::parser::StreamEntry;
8use llvm_bitstream::record::{Block, Record};
9use llvm_bitstream::Bitstream;
10use llvm_support::bitcodes::IrBlockId;
11use thiserror::Error;
12
13use crate::block::{BlockId, BlockMapError, Identification, Module, Strtab, Symtab};
14use crate::error::Error;
15use crate::map::{PartialCtxMappable, PartialMapCtx};
16use crate::record::{RecordBlobError, RecordStringError};
17
18/// An "unrolled" record. This is internally indistinguishable from a raw bitstream
19/// [`Record`](llvm_bitstream::record::Record), but is newtyped to enforce proper
20/// isolation of concerns.
21#[derive(Clone, Debug)]
22pub struct UnrolledRecord(Record);
23
24impl UnrolledRecord {
25 /// Returns this record's code.
26 pub fn code(&self) -> u64 {
27 self.0.code
28 }
29
30 /// Attempt to pull a UTF-8 string from this record's fields.
31 ///
32 /// Strings are always the last fields in a record, so only the start
33 /// index is required.
34 pub fn try_string(&self, idx: usize) -> Result<String, RecordStringError> {
35 // If our start index lies beyond the record fields or would produce
36 // an empty string, it's invalid.
37 if idx >= self.0.fields.len() - 1 {
38 return Err(RecordStringError::BadIndex(idx, self.0.fields.len()));
39 }
40
41 // Each individual field in our string must fit into a byte.
42 let raw = self.0.fields[idx..]
43 .iter()
44 .map(|f| u8::try_from(*f))
45 .collect::<Result<Vec<_>, _>>()?;
46
47 // Finally, the buffer itself must decode correctly.
48 String::from_utf8(raw).map_err(RecordStringError::from)
49 }
50
51 /// Attempt to pull a blob of bytes from this record's fields.
52 ///
53 /// Blobs are always the last fields in a record, so only the start index is required.
54 pub fn try_blob(&self, idx: usize) -> Result<Vec<u8>, RecordBlobError> {
55 // If our start index lies beyond the record fields or would produce
56 // an empty string, it's invalid.
57 if idx >= self.0.fields.len() - 1 {
58 return Err(RecordBlobError::BadIndex(idx, self.0.fields.len()));
59 }
60
61 // Each individual field in our blob must fit into a byte.
62 Ok(self.0.fields[idx..]
63 .iter()
64 .map(|f| u8::try_from(*f))
65 .collect::<Result<Vec<_>, _>>()?)
66 }
67
68 /// Returns a reference to this record's fields.
69 pub fn fields(&self) -> &[u64] {
70 &self.0.fields
71 }
72}
73
74/// Errors that can occur when attempting to search for blocks and records within
75/// an unrolled bitstream.
76#[derive(Debug, Error)]
77pub enum ConsistencyError {
78 /// We expected a (sub-)block of the given ID, but couldn't find one.
79 #[error("expected a block with {0:?} but not present")]
80 MissingBlock(BlockId),
81
82 /// We expected exactly one (sub-)block of the given ID, but found more than one.
83 #[error("expected exactly one block with {0:?} but got more than one")]
84 TooManyBlocks(BlockId),
85
86 /// We expected a record of the given code, but couldn't find one.
87 #[error("expected a record of code {0} but not present")]
88 MissingRecord(u64),
89
90 /// We expected exactly one record of the given code, but found more than one.
91 #[error("expected exactly one record of code {0} but got more than one")]
92 TooManyRecords(u64),
93}
94
95/// Represents a collection of unrolled records.
96#[derive(Clone, Debug, Default)]
97pub struct UnrolledRecords(Vec<UnrolledRecord>);
98
99impl UnrolledRecords {
100 /// Return an iterator for all records that share the given code. Records
101 /// are iterated in the order of insertion.
102 ///
103 /// The returned iterator is empty if the block doesn't have any matching records.
104 pub(crate) fn by_code<'a>(
105 &'a self,
106 code: impl Into<u64> + 'a,
107 ) -> impl Iterator<Item = &UnrolledRecord> + 'a {
108 let code = code.into();
109 self.0.iter().filter(move |r| r.code() == code)
110 }
111
112 /// Returns the first record matching the given code, or `None` if there are
113 /// no matches.
114 ///
115 /// Ignores any subsequent matches.
116 pub(crate) fn one<'a>(&'a self, code: impl Into<u64> + 'a) -> Option<&UnrolledRecord> {
117 self.by_code(code).next()
118 }
119
120 /// Returns exactly one record matching the given code, or a variant indicating
121 /// the error condition (no matching records, or too many records).
122 pub(crate) fn exactly_one<'a>(
123 &'a self,
124 code: impl Into<u64> + 'a,
125 ) -> Result<&UnrolledRecord, ConsistencyError> {
126 let code = code.into();
127 let mut records = self.by_code(code);
128
129 match records.next() {
130 None => Err(ConsistencyError::MissingRecord(code)),
131 Some(r) => match records.next() {
132 None => Ok(r),
133 Some(_) => Err(ConsistencyError::TooManyRecords(code)),
134 },
135 }
136 }
137
138 /// Return an option of one record matching the given code or `None`, or an
139 /// `Err` variant if more than one matching record is present.
140 pub(crate) fn one_or_none<'a>(
141 &'a self,
142 code: impl Into<u64> + 'a,
143 ) -> Result<Option<&UnrolledRecord>, ConsistencyError> {
144 let code = code.into();
145 let mut records = self.by_code(code);
146
147 match records.next() {
148 None => Ok(None),
149 Some(r) => match records.next() {
150 None => Ok(Some(r)),
151 Some(_) => Err(ConsistencyError::TooManyRecords(code)),
152 },
153 }
154 }
155}
156
157impl<'a> IntoIterator for &'a UnrolledRecords {
158 type Item = &'a UnrolledRecord;
159 type IntoIter = std::slice::Iter<'a, UnrolledRecord>;
160
161 fn into_iter(self) -> Self::IntoIter {
162 self.0.iter()
163 }
164}
165
166/// Represents a collection of unrolled blocks.
167#[derive(Clone, Debug, Default)]
168pub struct UnrolledBlocks(IndexMap<BlockId, Vec<UnrolledBlock>>);
169
170impl UnrolledBlocks {
171 /// Return an iterator over all sub-blocks within this block that share the given ID.
172 ///
173 /// The returned iterator is empty if the block doesn't have any matching sub-blocks.
174 pub(crate) fn by_id(&self, id: BlockId) -> impl Iterator<Item = &UnrolledBlock> + '_ {
175 self.0.get(&id).into_iter().flatten()
176 }
177
178 pub(crate) fn exactly_one(&self, id: BlockId) -> Result<&UnrolledBlock, ConsistencyError> {
179 let mut blocks = self.by_id(id);
180
181 match blocks.next() {
182 None => Err(ConsistencyError::MissingBlock(id)),
183 Some(b) => match blocks.next() {
184 None => Ok(b),
185 Some(_) => Err(ConsistencyError::TooManyBlocks(id)),
186 },
187 }
188 }
189
190 /// Return an option of one block matching the given code or `None`, or an
191 /// `Err` variant if more than one matching block is present.
192 pub(crate) fn one_or_none(
193 &self,
194 id: BlockId,
195 ) -> Result<Option<&UnrolledBlock>, ConsistencyError> {
196 let mut blocks = self.by_id(id);
197
198 match blocks.next() {
199 None => Ok(None),
200 Some(b) => match blocks.next() {
201 None => Ok(Some(b)),
202 Some(_) => Err(ConsistencyError::TooManyBlocks(id)),
203 },
204 }
205 }
206}
207
208/// A fully unrolled block within the bitstream, with potential records
209/// and sub-blocks.
210#[derive(Clone, Debug)]
211pub struct UnrolledBlock {
212 /// This block's ID.
213 pub id: BlockId,
214 /// The [`UnrolledRecord`](UnrolledRecord)s directly contained by this block.
215 // NOTE(ww): It would be nice if we could map this list of records by their codes,
216 // since that would save us some time when scanning blocks for particular
217 // kinds of records. Doing so correctly is tricky: even with an order-preserving
218 // structure like IndexMap, we'd lose the correct order as we insert each record
219 // into its bucket.
220 records: UnrolledRecords,
221 /// The blocks directly contained by this block, mapped by their IDs. Like with records,
222 /// a block can contain multiple sub-blocks of the same ID.
223 blocks: UnrolledBlocks,
224}
225
226impl UnrolledBlock {
227 pub(self) fn new(id: u64) -> Self {
228 Self {
229 id: id.into(),
230 records: UnrolledRecords::default(),
231 // TODO(ww): Figure out a default capacity here.
232 blocks: UnrolledBlocks::default(),
233 }
234 }
235
236 /// Return a reference to all of the records in this block.
237 pub fn records(&self) -> &UnrolledRecords {
238 &self.records
239 }
240
241 /// Return a reference to all of the sub-blocks of this block.
242 pub fn blocks(&self) -> &UnrolledBlocks {
243 &self.blocks
244 }
245}
246
247/// A fully unrolled bitcode structure, taken from a bitstream.
248///
249/// Every `UnrolledBitcode` has a list of `BitstreamModule`s that it contains, each of
250/// which corresponds to a single LLVM IR module. In the simplest case, there will only be one.
251#[derive(Debug)]
252pub struct UnrolledBitcode {
253 /// The modules present in this bitcode stream.
254 pub modules: Vec<BitcodeModule>,
255}
256
257impl TryFrom<&[u8]> for UnrolledBitcode {
258 type Error = Error;
259
260 fn try_from(buf: &[u8]) -> Result<UnrolledBitcode, Self::Error> {
261 let (_, bitstream) = Bitstream::from(buf)?;
262
263 bitstream.try_into()
264 }
265}
266
267impl<T: AsRef<[u8]>> TryFrom<Bitstream<T>> for UnrolledBitcode {
268 type Error = Error;
269
270 fn try_from(mut bitstream: Bitstream<T>) -> Result<UnrolledBitcode, Self::Error> {
271 fn enter_block<T: AsRef<[u8]>>(
272 bitstream: &mut Bitstream<T>,
273 block: Block,
274 ) -> Result<UnrolledBlock, Error> {
275 let mut unrolled_block = UnrolledBlock::new(block.block_id);
276
277 // Once we're in a block, we do the following:
278 // 1. Take records, and add them to the current unrolled block;
279 // 2. Take sub-blocks, and enter them, adding them to our sub-block map;
280 // 3. Visit the end of our own block and return so that the caller
281 // (which is either the bitstream context or another parent block)
282 // can add us to its block map.
283 loop {
284 let entry = bitstream
285 .next()
286 .ok_or_else(|| Error::Unroll("unexpected stream end during unroll".into()))?;
287
288 match entry? {
289 StreamEntry::Record(record) => {
290 unrolled_block.records.0.push(UnrolledRecord(record))
291 }
292 StreamEntry::SubBlock(block) => {
293 let unrolled_child = enter_block(bitstream, block)?;
294 unrolled_block
295 .blocks
296 .0
297 .entry(unrolled_child.id)
298 .or_insert_with(Vec::new)
299 .push(unrolled_child);
300 }
301 StreamEntry::EndBlock => {
302 // End our current block scope.
303 break;
304 }
305 }
306 }
307
308 Ok(unrolled_block)
309 }
310
311 let mut partial_modules = Vec::new();
312
313 // Unrolling a bitstream into an `UnrolledBitcode` is a little involved:
314 //
315 // 1. There are multiple top-level blocks, each of which needs to be consumed.
316 // 2. Certain top-level blocks need to be grouped together to form a single BitcodeModule.
317 // 3. There can be multiple BitcodeModules-worth of top-level blocks in the stream.
318 loop {
319 // `None` means that we've exhausted the bitstream; we're done.
320 let entry = bitstream.next();
321 if entry.is_none() {
322 break;
323 }
324
325 // Take a top-level block from the stream.
326 let top_block = {
327 // Unwrap safety: we explicitly check the `None` case above.
328 // NOTE(ww): Other parts of the parser should be defensive against a malformed
329 // bitstream here, but it's difficult to represent that at the type level during unrolling.
330 #[allow(clippy::unwrap_used)]
331 let block = entry.unwrap()?.as_block().ok_or_else(|| {
332 Error::Unroll("bitstream has non-blocks at the top-level scope".into())
333 })?;
334
335 enter_block(&mut bitstream, block)?
336 };
337
338 // Our top-level block can be one of four cases, if it's valid.
339 //
340 // Handle each accordingly.
341 match top_block.id {
342 BlockId::Ir(IrBlockId::Identification) => {
343 // We've unrolled an IDENTIFICATION_BLOCK; this indicates the start of a new
344 // bitcode module. Create a fresh PartialBitcodeModule to fill in, as more
345 // top-level blocks become available.
346 partial_modules.push(PartialBitcodeModule::new(top_block));
347 }
348 BlockId::Ir(IrBlockId::Module) => {
349 // We've unrolled a MODULE_BLOCK; this contains the vast majority of the
350 // state associated with an LLVM IR module. Grab the most recent
351 // PartialBitcodeModule and fill it in, erroring appropriately if it already
352 // has a module.
353 //
354 // NOTE(ww): We could encounter a top-level sequence that looks like this:
355 // [IDENTIFICATION_BLOCK, IDENTIFICATION_BLOCK, MODULE_BLOCK]
356 // This would be malformed and in principle we should catch it here by searching
357 // for the first PartialBitcodeModule lacking a module instead of taking
358 // the most recent one, but the PartialBitcodeModule -> BitcodeModule reification
359 // step will take care of that for us.
360 let last_partial = partial_modules.last_mut().ok_or_else(|| {
361 Error::Unroll("malformed bitstream: MODULE_BLOCK with no preceding IDENTIFICATION_BLOCK".into())
362 })?;
363
364 match &last_partial.module {
365 Some(_) => {
366 return Err(Error::Unroll(
367 "malformed bitstream: adjacent MODULE_BLOCKs".into(),
368 ))
369 }
370 None => last_partial.module = Some(top_block),
371 }
372 }
373 BlockId::Ir(IrBlockId::Strtab) => {
374 // We've unrolled a STRTAB_BLOCK; this contains the string table for one or
375 // more preceding modules. Any modules that don't already have their own string
376 // table are given their own copy of this one.
377 //
378 // NOTE(ww): Again, we could encounter a sequence that looks like this:
379 // [..., STRTAB_BLOCK, STRTAB_BLOCK]
380 // This actually wouldn't be malformed, but is *is* nonsense: the second
381 // STRTAB_BLOCK would have no effect on any BitcodeModule, since the first one
382 // in sequence would already have been used for every prior module.
383 // We don't bother catching this at the moment since LLVM's own reader doesn't
384 // and it isn't erroneous per se (just pointless).
385 for prev_partial in partial_modules
386 .iter_mut()
387 .rev()
388 .take_while(|p| p.strtab.is_none())
389 {
390 prev_partial.strtab = Some(top_block.clone());
391 }
392 }
393 BlockId::Ir(IrBlockId::Symtab) => {
394 // We've unrolled a SYMTAB_BLOCK; this contains the symbol table (which, in
395 // turn, references the string table) for one or more preceding modules. Any
396 // modules that don't already have their own symbol table are given their own
397 // copy of this one.
398 //
399 // NOTE(ww): The same nonsense layout with STRTAB_BLOCK applies here.
400 for prev_partial in partial_modules
401 .iter_mut()
402 .rev()
403 .take_while(|p| p.symtab.is_none())
404 {
405 prev_partial.symtab = Some(top_block.clone());
406 }
407 }
408 _ => {
409 return Err(Error::Unroll(format!(
410 "unexpected top-level block: {:?}",
411 top_block.id
412 )))
413 }
414 }
415 }
416
417 let modules = partial_modules
418 .into_iter()
419 .map(|p| p.reify())
420 .collect::<Result<Vec<_>, _>>()?;
421 let unrolled = UnrolledBitcode { modules };
422
423 Ok(unrolled)
424 }
425}
426
427/// An internal, partial representation of a bitcode module, used when parsing each bitcode module
428/// to avoid polluting the `BitcodeModule` structure with optional types.
429#[derive(Debug)]
430struct PartialBitcodeModule {
431 identification: UnrolledBlock,
432 module: Option<UnrolledBlock>,
433 strtab: Option<UnrolledBlock>,
434 symtab: Option<UnrolledBlock>,
435}
436
437impl PartialBitcodeModule {
438 /// Create a new `PartialBitcodeModule`.
439 pub(self) fn new(identification: UnrolledBlock) -> Self {
440 Self {
441 identification: identification,
442 module: None,
443 strtab: None,
444 symtab: None,
445 }
446 }
447
448 /// Reify this `PartialBitcodeModule into a concrete `BitcodeModule`, mapping
449 /// each block along the way.
450 ///
451 /// Returns an error if the `PartialBitcodeModule` is lacking necessary state, or if
452 /// block and record mapping fails for any reason.
453 pub(self) fn reify(self) -> Result<BitcodeModule, Error> {
454 let mut ctx = PartialMapCtx::default();
455
456 // Grab the string table early, so that we can move it into our mapping context and
457 // use it for the remainder of the mapping phase.
458 let strtab = Strtab::try_map(
459 &self
460 .strtab
461 .ok_or_else(|| Error::Unroll("missing STRTAB_BLOCK for bitcode module".into()))?,
462 &mut ctx,
463 )
464 .map_err(BlockMapError::Strtab)?;
465
466 ctx.strtab = strtab;
467
468 let identification = Identification::try_map(&self.identification, &mut ctx)
469 .map_err(BlockMapError::Identification)?;
470
471 let module = Module::try_map(
472 &self
473 .module
474 .ok_or_else(|| Error::Unroll("missing MODULE_BLOCK for bitcode module".into()))?,
475 &mut ctx,
476 )
477 .map_err(BlockMapError::Module)?;
478
479 let symtab = self
480 .symtab
481 .map(|s| Symtab::try_map(&s, &mut ctx))
482 .transpose()
483 .map_err(BlockMapError::Symtab)?;
484
485 #[allow(clippy::unwrap_used)]
486 Ok(BitcodeModule {
487 identification: identification,
488 module: module,
489 strtab: ctx.strtab,
490 symtab: symtab,
491 })
492 }
493}
494
495/// A `BitcodeModule` encapsulates the top-level pieces of bitstream state needed for
496/// a single LLVM bitcode module: the `IDENTIFICATION_BLOCK`, the `MODULE_BLOCK` itself,
497/// a `STRTAB_BLOCK`, and a `SYMTAB_BLOCK` (if the last is present). A bitstream can
498/// contain multiple LLVM modules (e.g. if produced by `llvm-cat -b`), so parsing a bitstream
499/// can result in multiple `BitcodeModule`s.
500#[derive(Debug)]
501pub struct BitcodeModule {
502 /// The identification block associated with this module.
503 pub identification: Identification,
504
505 /// The module block associated with this module.
506 pub module: Module,
507
508 /// The string table associated with this module.
509 pub strtab: Strtab,
510
511 /// The symbol table associated with this module, if it has one.
512 pub symtab: Option<Symtab>,
513}
514
515#[cfg(test)]
516mod tests {
517 use super::*;
518
519 #[test]
520 fn test_unrolled_record_try_string() {
521 let record = UnrolledRecord(Record {
522 abbrev_id: None,
523 code: 0,
524 fields: b"\xff\xffvalid string!".iter().map(|b| *b as u64).collect(),
525 });
526
527 assert_eq!(record.try_string(2).unwrap(), "valid string!");
528 assert_eq!(record.try_string(8).unwrap(), "string!");
529
530 assert!(record.try_string(0).is_err());
531 assert!(record.try_string(record.0.fields.len()).is_err());
532 assert!(record.try_string(record.0.fields.len() - 1).is_err());
533 }
534
535 #[test]
536 fn test_unrolled_record_try_blob() {
537 let record = UnrolledRecord(Record {
538 abbrev_id: None,
539 code: 0,
540 fields: b"\xff\xffvalid string!".iter().map(|b| *b as u64).collect(),
541 });
542
543 assert_eq!(record.try_blob(0).unwrap(), b"\xff\xffvalid string!");
544 assert_eq!(record.try_blob(8).unwrap(), b"string!");
545
546 assert!(record.try_blob(record.0.fields.len()).is_err());
547 assert!(record.try_blob(record.0.fields.len() - 1).is_err());
548 }
549}