Skip to main content

llvm_bitcode/
bitcode.rs

1use crate::bits::Cursor;
2use crate::bitstream::{Abbreviation, Operand, PayloadOperand, ScalarOperand};
3use std::cell::RefCell;
4use std::collections::HashMap;
5use std::fmt;
6use std::num::NonZero;
7use std::ops::Range;
8use std::sync::Arc;
9
10use crate::read::{BitStreamReader, Error};
11use crate::visitor::{BitStreamVisitor, CollectingVisitor};
12
13const LLVM_BITCODE_WRAPPER_MAGIC: u32 = 0x0B17C0DE;
14
/// Represents the contents of a file encoded using the
/// [LLVM bitstream container format](https://llvm.org/docs/BitCodeFormat.html#bitstream-container-format)
#[derive(Debug, Clone)]
pub struct Bitcode {
    /// Header information produced by [`Signature::parse`]
    pub signature: Signature,
    /// Top-level elements collected while reading the stream
    pub elements: Vec<BitcodeElement>,
    /// Block metadata gathered by the reader
    /// (keys are presumably block IDs — confirm against `BitStreamReader`)
    pub block_info: HashMap<u32, BlockInfo>,
}
23
/// Blocks in a bitstream denote nested regions of the stream,
/// and are identified by a content-specific id number
///
/// Block IDs 0-7 are reserved for [standard blocks](https://llvm.org/docs/BitCodeFormat.html#standard-blocks)
/// whose meaning is defined by Bitcode;
/// block IDs 8 and greater are application specific.
#[derive(Debug, Clone)]
pub struct Block {
    /// Block ID
    pub id: u32,
    /// Elements contained in this block: records and nested blocks,
    /// stored as [`BitcodeElement`]s
    pub elements: Vec<BitcodeElement>,
}
37
/// Variable-length data that can follow the scalar fields of a record
#[derive(Debug, Clone)]
pub enum Payload {
    /// Array of integer elements
    Array(Vec<u64>),
    /// Array of char6 elements, stored as text
    Char6String(String),
    /// Raw bytes
    Blob(Vec<u8>),
}

/// Data records consist of a record code and a number of (up to) 64-bit integer values
///
/// The interpretation of the code and values is application specific and may vary between different block types.
#[derive(Debug, Clone)]
pub struct Record {
    /// Record code
    pub id: u64,
    /// Scalar field values of the record, in stream order
    fields: Vec<u64>,
    /// Array and Blob encodings carry an additional payload
    payload: Option<Payload>,
}

impl Record {
    /// Scalar field values of this record (the payload is not included).
    #[must_use]
    pub fn fields(&self) -> &[u64] {
        self.fields.as_slice()
    }

    /// Moves the payload out of the record, leaving `None` behind.
    ///
    /// Later calls therefore return `None`.
    pub fn take_payload(&mut self) -> Option<Payload> {
        std::mem::take(&mut self.payload)
    }
}
68
/// Decoding state for the operands of the record currently being read
#[derive(Debug, Clone)]
enum Ops {
    /// Record whose layout is described by an abbreviation
    Abbrev {
        /// If under `abbrev.fields.len()`, then it's the next op to read
        /// If equals `abbrev.fields.len()`, then payload is next
        /// If greater than `abbrev.fields.len()`, then payload has been read
        state: usize,
        /// Abbreviation providing the field encodings and the optional payload operand
        abbrev: Arc<Abbreviation>,
    },
    /// Record without an abbreviation: number of operands left to read
    /// (each one is read with `read_vbr_fixed::<6>`)
    Full(usize),
}
81
/// Lazily decodes the fields of a single record straight from the cursor
///
/// Fields are pulled one at a time with [`RecordIter::try_next`] or one of the typed helpers.
/// Dropping the iterator reads whatever was left unread, so the cursor ends up after the record.
pub struct RecordIter<'cursor, 'input> {
    /// Record code
    pub id: u64,
    /// Cursor the remaining fields are read from
    cursor: &'cursor mut Cursor<'input>,
    /// What is left to decode, and how to decode it
    ops: Ops,
}
91
92impl<'cursor, 'input> RecordIter<'cursor, 'input> {
93    pub(crate) fn into_record(mut self) -> Result<Record, Error> {
94        let mut fields = Vec::with_capacity(self.len());
95        while let Some(f) = self.try_next()? {
96            fields.push(f);
97        }
98        Ok(Record {
99            id: self.id,
100            fields,
101            payload: self.payload().ok().flatten(),
102        })
103    }
104
105    fn read_scalar_operand(cursor: &mut Cursor<'_>, operand: ScalarOperand) -> Result<u64, Error> {
106        match operand {
107            ScalarOperand::Char6 => {
108                let value = cursor.read(6)? as u8;
109                Ok(u64::from(match value {
110                    0..=25 => value + b'a',
111                    26..=51 => value + (b'A' - 26),
112                    52..=61 => value - (52 - b'0'),
113                    62 => b'.',
114                    63 => b'_',
115                    _ => return Err(Error::InvalidAbbrev),
116                }))
117            }
118            ScalarOperand::Literal(value) => Ok(value),
119            ScalarOperand::Fixed(width) => Ok(cursor.read(width)?),
120            ScalarOperand::Vbr(width) => Ok(cursor.read_vbr(width)?),
121        }
122    }
123
124    pub(crate) fn from_cursor_abbrev(
125        cursor: &'cursor mut Cursor<'input>,
126        abbrev: Arc<Abbreviation>,
127    ) -> Result<Self, Error> {
128        let id =
129            Self::read_scalar_operand(cursor, *abbrev.fields.first().ok_or(Error::InvalidAbbrev)?)?;
130        Ok(Self {
131            id,
132            cursor,
133            ops: Ops::Abbrev { state: 1, abbrev },
134        })
135    }
136
137    pub(crate) fn from_cursor(cursor: &'cursor mut Cursor<'input>) -> Result<Self, Error> {
138        let id = cursor.read_vbr_fixed::<6>()?;
139        let num_ops = cursor.read_vbr_fixed::<6>()? as usize;
140        Ok(Self {
141            id,
142            cursor,
143            ops: Ops::Full(num_ops),
144        })
145    }
146
147    pub fn payload(&mut self) -> Result<Option<Payload>, Error> {
148        match &mut self.ops {
149            Ops::Abbrev { state, abbrev } => {
150                if *state > abbrev.fields.len() {
151                    return Ok(None);
152                }
153                Ok(match abbrev.payload {
154                    Some(PayloadOperand::Blob) => Some(Payload::Blob(self.blob()?.to_vec())),
155                    Some(PayloadOperand::Array(ScalarOperand::Char6)) => {
156                        Some(Payload::Char6String(
157                            String::from_utf8(self.string()?).map_err(|_| Error::InvalidAbbrev)?,
158                        ))
159                    }
160                    Some(PayloadOperand::Array(_)) => Some(Payload::Array(self.array()?)),
161                    None => None,
162                })
163            }
164            Ops::Full(_) => Ok(None),
165        }
166    }
167
168    /// Number of unread fields, excludes string/array/blob payload
169    #[must_use]
170    pub fn len(&self) -> usize {
171        match &self.ops {
172            Ops::Abbrev { state, abbrev } => abbrev.fields.len().saturating_sub(*state),
173            Ops::Full(num_ops) => *num_ops,
174        }
175    }
176
177    /// Matches len, excludes string/array/blob payload
178    #[must_use]
179    pub fn is_empty(&self) -> bool {
180        self.len() == 0
181    }
182
183    #[doc(hidden)]
184    #[deprecated(note = "renamed to `try_next()` to avoid confusion with `Iterator::next`")]
185    #[allow(clippy::should_implement_trait)]
186    pub fn next(&mut self) -> Result<Option<u64>, Error> {
187        self.try_next()
188    }
189
190    /// Consume next record
191    #[doc(alias = "next")]
192    pub fn try_next(&mut self) -> Result<Option<u64>, Error> {
193        match &mut self.ops {
194            Ops::Abbrev { state, abbrev } => {
195                let Some(&op) = abbrev.fields.get(*state) else {
196                    return Ok(None);
197                };
198                *state += 1;
199                Ok(Some(Self::read_scalar_operand(self.cursor, op)?))
200            }
201            Ops::Full(num_ops) => {
202                if *num_ops == 0 {
203                    return Ok(None);
204                }
205                *num_ops -= 1;
206                Ok(Some(self.cursor.read_vbr_fixed::<6>()?))
207            }
208        }
209    }
210
211    #[cfg_attr(debug_assertions, track_caller)]
212    pub fn u64(&mut self) -> Result<u64, Error> {
213        match self.try_next()? {
214            Some(v) => Ok(v),
215            None => {
216                debug_assert!(false, "unexpected end of record");
217                Err(Error::EndOfRecord)
218            }
219        }
220    }
221
222    pub fn nzu64(&mut self) -> Result<Option<NonZero<u64>>, Error> {
223        self.u64().map(NonZero::new)
224    }
225
226    pub fn i64(&mut self) -> Result<i64, Error> {
227        let v = self.u64()?;
228        let shifted = (v >> 1) as i64;
229        Ok(if (v & 1) == 0 {
230            shifted
231        } else if v != 1 {
232            -shifted
233        } else {
234            1 << 63
235        })
236    }
237
238    #[cfg_attr(debug_assertions, track_caller)]
239    pub fn u16(&mut self) -> Result<u16, Error> {
240        let val = self.u64()?;
241        match val.try_into() {
242            Ok(v) => Ok(v),
243            Err(_) => {
244                debug_assert!(false, "{val} overflows u16");
245                Err(Error::ValueOverflow)
246            }
247        }
248    }
249
250    #[cfg_attr(debug_assertions, track_caller)]
251    pub fn u32(&mut self) -> Result<u32, Error> {
252        let val = self.u64()?;
253        match val.try_into() {
254            Ok(v) => Ok(v),
255            Err(_) => {
256                debug_assert!(false, "{val} overflows u32");
257                Err(Error::ValueOverflow)
258            }
259        }
260    }
261
262    pub fn nzu32(&mut self) -> Result<Option<NonZero<u32>>, Error> {
263        self.u32().map(NonZero::new)
264    }
265
266    #[cfg_attr(debug_assertions, track_caller)]
267    pub fn u8(&mut self) -> Result<u8, Error> {
268        let val = self.u64()?;
269        match val.try_into() {
270            Ok(v) => Ok(v),
271            Err(_) => {
272                debug_assert!(false, "{val} overflows u8");
273                Err(Error::ValueOverflow)
274            }
275        }
276    }
277
278    #[cfg_attr(debug_assertions, track_caller)]
279    #[inline]
280    pub fn try_from<U: TryFrom<u64>, T: TryFrom<U>>(&mut self) -> Result<T, Error> {
281        self.try_next_from::<U, T>()?.ok_or(Error::EndOfRecord)
282    }
283
284    #[cfg_attr(debug_assertions, track_caller)]
285    pub fn try_next_from<U: TryFrom<u64>, T: TryFrom<U>>(&mut self) -> Result<Option<T>, Error> {
286        match self.try_next()? {
287            Some(val) => {
288                if let Some(val) = val.try_into().ok().and_then(|v| T::try_from(v).ok()) {
289                    Ok(Some(val))
290                } else {
291                    debug_assert!(
292                        false,
293                        "{} can't be made from {val} as {}",
294                        std::any::type_name::<T>(),
295                        std::any::type_name::<U>()
296                    );
297                    Err(Error::ValueOverflow)
298                }
299            }
300            None => Ok(None),
301        }
302    }
303
304    pub fn nzu8(&mut self) -> Result<Option<NonZero<u8>>, Error> {
305        self.u8().map(NonZero::new)
306    }
307
308    #[cfg_attr(debug_assertions, track_caller)]
309    pub fn bool(&mut self) -> Result<bool, Error> {
310        match self.u64()? {
311            0 => Ok(false),
312            1 => Ok(true),
313            val => {
314                debug_assert!(false, "{val} overflows bool");
315                Err(Error::ValueOverflow)
316            }
317        }
318    }
319
320    /// Reads `start` and `len` into Rust's `start..end`
321    pub fn range(&mut self) -> Result<Range<usize>, Error> {
322        let start = self.u64()? as usize;
323        Ok(Range {
324            start,
325            end: start
326                .checked_add(self.u64()? as usize)
327                .ok_or(Error::ValueOverflow)?,
328        })
329    }
330
331    pub fn blob(&mut self) -> Result<&'input [u8], Error> {
332        match &mut self.ops {
333            Ops::Abbrev { state, abbrev } => match Self::take_payload_operand(state, abbrev)? {
334                Some(PayloadOperand::Blob) => {
335                    let length = self.cursor.read_vbr_fixed::<6>()? as usize;
336                    self.cursor.align32()?;
337                    let data = self.cursor.read_bytes(length)?;
338                    self.cursor.align32()?;
339                    Ok(data)
340                }
341                other => Err(Error::UnexpectedOperand(other.map(Operand::Payload))),
342            },
343            Ops::Full(_) => Err(Error::UnexpectedOperand(None)),
344        }
345    }
346
347    pub fn array(&mut self) -> Result<Vec<u64>, Error> {
348        match &mut self.ops {
349            Ops::Abbrev { state, abbrev } => match Self::take_payload_operand(state, abbrev)? {
350                Some(PayloadOperand::Array(op)) => {
351                    let len = self.cursor.read_vbr_fixed::<6>()? as usize;
352                    let mut out = Vec::with_capacity(len);
353                    for _ in 0..len {
354                        if out.len() == out.capacity() {
355                            debug_assert!(false);
356                            break;
357                        }
358                        out.push(Self::read_scalar_operand(self.cursor, op)?);
359                    }
360                    Ok(out)
361                }
362                other => Err(Error::UnexpectedOperand(other.map(Operand::Payload))),
363            },
364            // Not a proper array payload, but this fallback pattern is used by LLVM
365            Ops::Full(num_ops) => {
366                let len = *num_ops;
367                *num_ops = 0;
368                let mut out = Vec::with_capacity(len);
369                for _ in 0..len {
370                    if out.len() == out.capacity() {
371                        debug_assert!(false);
372                        break;
373                    }
374                    out.push(self.cursor.read_vbr_fixed::<6>()?);
375                }
376                Ok(out)
377            }
378        }
379    }
380
381    /// Mark payload as read, if there is one
382    fn take_payload_operand(
383        state: &mut usize,
384        abbrev: &Abbreviation,
385    ) -> Result<Option<PayloadOperand>, Error> {
386        if *state == abbrev.fields.len() {
387            if abbrev.payload.is_some() {
388                *state += 1;
389            }
390            Ok(abbrev.payload)
391        } else {
392            Err(Error::UnexpectedOperand(
393                abbrev.fields.get(*state).copied().map(Operand::Scalar),
394            ))
395        }
396    }
397
398    /// Read remainder of the fields as string chars.
399    ///
400    /// Interpret data as UTF-8.
401    /// The string may contain NUL terminator, depending on context.
402    pub fn string_utf8(&mut self) -> Result<String, Error> {
403        String::from_utf8(self.string()?).map_err(Error::Encoding)
404    }
405
406    /// Read remainder of the fields as string chars
407    ///
408    /// The strings are just binary blobs. LLVM doesn't guarantee any encoding.
409    /// The string may contain NUL terminator, depending on context.
410    #[cfg_attr(debug_assertions, track_caller)]
411    pub fn string(&mut self) -> Result<Vec<u8>, Error> {
412        match &mut self.ops {
413            Ops::Abbrev { state, abbrev } => match Self::take_payload_operand(state, abbrev)? {
414                Some(PayloadOperand::Array(el)) => {
415                    *state += 1;
416                    let len = self.cursor.read_vbr_fixed::<6>()? as usize;
417                    let mut out = Vec::with_capacity(len);
418
419                    match el {
420                        ScalarOperand::Char6 => {
421                            for _ in 0..len {
422                                if out.len() == out.capacity() {
423                                    debug_assert!(false);
424                                    break;
425                                }
426                                let ch = match self.cursor.read(6)? as u8 {
427                                    value @ 0..=25 => value + b'a',
428                                    value @ 26..=51 => value + (b'A' - 26),
429                                    value @ 52..=61 => value - (52 - b'0'),
430                                    62 => b'.',
431                                    63 => b'_',
432                                    _ => return Err(Error::InvalidAbbrev),
433                                };
434                                out.push(ch);
435                            }
436                        }
437                        ScalarOperand::Fixed(width @ 6..=8) => {
438                            for _ in 0..len {
439                                if out.len() == out.capacity() {
440                                    debug_assert!(false);
441                                    break;
442                                }
443                                out.push(self.cursor.read(width)? as u8);
444                            }
445                        }
446                        other => {
447                            return Err(Error::UnexpectedOperand(Some(Operand::Scalar(other))));
448                        }
449                    }
450                    Ok(out)
451                }
452                other => Err(Error::UnexpectedOperand(other.map(Operand::Payload))),
453            },
454            Ops::Full(num_ops) => {
455                let len = std::mem::replace(num_ops, 0);
456                let mut out = Vec::with_capacity(len);
457                for _ in 0..len {
458                    let ch = self.cursor.read_vbr_fixed::<6>()?;
459                    out.push(match u8::try_from(ch) {
460                        Ok(c) => c,
461                        Err(_) => {
462                            debug_assert!(false, "{ch} too big for char");
463                            return Err(Error::ValueOverflow);
464                        }
465                    });
466                }
467                Ok(out)
468            }
469        }
470    }
471
472    /// Zero-terminated string, assumes latin1 encoding
473    pub fn zstring(&mut self) -> Result<String, Error> {
474        let mut s = String::new();
475        while let Some(b) = self.nzu8()? {
476            s.push(b.get() as char);
477        }
478        Ok(s)
479    }
480
481    /// Internal ID of this record's abbreviation, if any.
482    ///
483    /// This is intended only for debugging and data dumps.
484    /// This isn't a stable identifier, and may be block-specific.
485    #[must_use]
486    pub fn debug_abbrev_id(&self) -> Option<u32> {
487        match &self.ops {
488            Ops::Abbrev { abbrev, .. } => Some(abbrev.id),
489            Ops::Full(_) => None,
490        }
491    }
492
493    /// For debug printing
494    fn with_cloned_cursor<'new_cursor>(
495        &self,
496        cursor: &'new_cursor mut Cursor<'input>,
497    ) -> RecordIter<'new_cursor, 'input> {
498        RecordIter {
499            id: self.id,
500            ops: self.ops.clone(),
501            cursor,
502        }
503    }
504}
505
506impl Iterator for RecordIter<'_, '_> {
507    type Item = Result<u64, Error>;
508    fn next(&mut self) -> Option<Self::Item> {
509        self.try_next().transpose()
510    }
511}
512
513impl Drop for RecordIter<'_, '_> {
514    /// Must drain the remaining records to advance the cursor to the next record
515    fn drop(&mut self) {
516        while let Ok(Some(_)) = self.try_next() {}
517        if let Ops::Abbrev { abbrev, .. } = &self.ops
518            && abbrev.payload.is_some()
519        {
520            let _ = self.payload();
521        }
522    }
523}
524
/// Debug helper: holds a [`RecordIter`] in a `RefCell` so it can be advanced from `Debug::fmt`, which only gets `&self`
struct RecordIterDebugFields<'c, 'i>(RefCell<RecordIter<'c, 'i>>);
/// Debug helper: formats the `Ok` or `Err` value directly, without the `Result` wrapper
struct RecordIterDebugResult<T, E>(Result<T, E>);
527
528impl fmt::Debug for RecordIter<'_, '_> {
529    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
530        let mut c = self.cursor.clone();
531        let fields = RecordIterDebugFields(RefCell::new(self.with_cloned_cursor(&mut c)));
532
533        f.debug_struct("RecordIter")
534            .field("id", &self.id)
535            .field("fields", &fields)
536            .field("ops", &self.ops)
537            .field("cursor", &self.cursor)
538            .finish()
539    }
540}
541
542impl fmt::Debug for RecordIterDebugFields<'_, '_> {
543    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
544        let mut iter = self.0.borrow_mut();
545        let mut d = f.debug_list();
546        d.entries(iter.by_ref().map(RecordIterDebugResult));
547        if let Some(p) = iter.payload().transpose() {
548            d.entries([RecordIterDebugResult(p)]);
549        }
550        d.finish()
551    }
552}
553
554impl<T: fmt::Debug, E: fmt::Debug> fmt::Debug for RecordIterDebugResult<T, E> {
555    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
556        match &self.0 {
557            Ok(t) => t.fmt(f),
558            Err(e) => e.fmt(f),
559        }
560    }
561}
562
563/// Bitcode element
564#[derive(Debug, Clone)]
565pub enum BitcodeElement {
566    /// Block
567    Block(Block),
568    /// Data record
569    Record(Record),
570}
571
572impl BitcodeElement {
573    /// Returns true if it is a `Block`
574    #[must_use]
575    pub fn is_block(&self) -> bool {
576        matches!(self, Self::Block(_))
577    }
578
579    /// If it is a `Block`, returns the associated block. Returns `None` otherwise.
580    #[must_use]
581    pub fn as_block(&self) -> Option<&Block> {
582        match self {
583            Self::Block(block) => Some(block),
584            Self::Record(_) => None,
585        }
586    }
587
588    /// If it is a `Block`, returns the associated mutable block. Returns `None` otherwise.
589    pub fn as_block_mut(&mut self) -> Option<&mut Block> {
590        match self {
591            Self::Block(block) => Some(block),
592            Self::Record(_) => None,
593        }
594    }
595
596    /// Returns true if it is a `Record`
597    #[must_use]
598    pub fn is_record(&self) -> bool {
599        matches!(self, Self::Record(_))
600    }
601
602    /// If it is a `Record`, returns the associated record. Returns `None` otherwise.
603    #[must_use]
604    pub fn as_record(&self) -> Option<&Record> {
605        match self {
606            Self::Block(_) => None,
607            Self::Record(record) => Some(record),
608        }
609    }
610
611    /// If it is a `Record`, returns the associated mutable record. Returns `None` otherwise.
612    pub fn as_record_mut(&mut self) -> Option<&mut Record> {
613        match self {
614            Self::Block(_) => None,
615            Self::Record(record) => Some(record),
616        }
617    }
618}
619
/// Block information: names for a block and for its records
#[derive(Debug, Clone, Default)]
pub struct BlockInfo {
    /// Block name
    pub name: String,
    /// Data record names (keys are presumably record codes — confirm against the reader)
    pub record_names: HashMap<u64, String>,
}
628
629/// aka. Magic number
630#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
631pub struct Signature {
632    pub magic: u32,
633    pub magic2: u32,
634    pub version: u32,
635    pub offset: u32,
636    pub size: u32,
637    pub cpu_type: u32,
638}
639
640impl Signature {
641    #[must_use]
642    pub fn parse(data: &[u8]) -> Option<(Self, &[u8])> {
643        let (signature, remaining_data) = data.split_first_chunk::<4>()?;
644        let magic = u32::from_le_bytes(*signature);
645        if magic != LLVM_BITCODE_WRAPPER_MAGIC {
646            Some((
647                Self {
648                    version: 0,
649                    magic,
650                    magic2: 0,
651                    offset: 4,
652                    size: remaining_data.len() as _,
653                    cpu_type: 0,
654                },
655                remaining_data,
656            ))
657        } else {
658            // It is a LLVM Bitcode wrapper, remove wrapper header
659            if data.len() < 20 {
660                return None;
661            }
662            let mut words = data
663                .chunks_exact(4)
664                .skip(1)
665                .map(|w| u32::from_le_bytes(w.try_into().unwrap()));
666            let version = words.next()?;
667            let offset = words.next()?;
668            let size = words.next()?;
669            let cpu_id = words.next()?;
670            let data = data.get(offset as usize..offset as usize + size as usize)?;
671            let (magic2, remaining_data) = data.split_first_chunk::<4>()?;
672            let magic2 = u32::from_le_bytes(*magic2);
673            Some((
674                Self {
675                    version,
676                    magic,
677                    magic2,
678                    offset,
679                    size,
680                    cpu_type: cpu_id,
681                },
682                remaining_data,
683            ))
684        }
685    }
686}
687
688impl Bitcode {
689    /// Parse bitcode from bytes
690    ///
691    /// Accepts both LLVM bitcode and bitcode wrapper formats
692    pub fn new(data: &[u8]) -> Result<Self, Error> {
693        let (signature, stream) = Signature::parse(data).ok_or(Error::InvalidSignature(0))?;
694        let mut reader = BitStreamReader::new();
695        let mut visitor = CollectingVisitor::new();
696        reader.read_block(
697            Cursor::new(stream),
698            BitStreamReader::TOP_LEVEL_BLOCK_ID,
699            2,
700            &mut visitor,
701        )?;
702        Ok(Self {
703            signature,
704            elements: visitor.finalize_top_level_elements(),
705            block_info: reader.block_info,
706        })
707    }
708
709    /// Read bitcode from bytes with a visitor
710    ///
711    /// Accepts both LLVM bitcode and bitcode wrapper formats
712    pub fn read<V>(data: &[u8], visitor: &mut V) -> Result<(), Error>
713    where
714        V: BitStreamVisitor,
715    {
716        let (header, stream) = Signature::parse(data).ok_or(Error::InvalidSignature(0))?;
717        if !visitor.validate(header) {
718            return Err(Error::InvalidSignature(header.magic));
719        }
720        let mut reader = BitStreamReader::new();
721        reader.read_block(
722            Cursor::new(stream),
723            BitStreamReader::TOP_LEVEL_BLOCK_ID,
724            2,
725            visitor,
726        )
727    }
728}