Skip to main content

llvm_bitcode/
bitcode.rs

1use crate::bits::Cursor;
2use crate::bitstream::{Abbreviation, Operand};
3use crate::bitstream::{PayloadOperand, ScalarOperand};
4use std::cell::RefCell;
5use std::collections::HashMap;
6use std::fmt;
7use std::num::NonZero;
8use std::ops::Range;
9use std::sync::Arc;
10
11use crate::read::{BitStreamReader, Error};
12use crate::visitor::{BitStreamVisitor, CollectingVisitor};
13
/// Magic number that marks an LLVM bitcode wrapper header.
///
/// `Signature::parse` compares the first four input bytes (read as a
/// little-endian `u32`) against this value to decide whether a wrapper header
/// has to be stripped before the bitstream starts.
const LLVM_BITCODE_WRAPPER_MAGIC: u32 = 0x0B17C0DE;
15
/// Represents the contents of a file encoded using the
/// [LLVM bitstream container format](https://llvm.org/docs/BitCodeFormat.html#bitstream-container-format)
///
/// Built by [`Bitcode::new`], which parses the signature and then collects
/// every top-level element of the stream.
#[derive(Debug, Clone)]
pub struct Bitcode {
    /// Header information returned by [`Signature::parse`]
    pub signature: Signature,
    /// Top-level elements collected while reading the stream
    pub elements: Vec<BitcodeElement>,
    /// Block metadata gathered by the reader
    /// (presumably keyed by block ID — confirm against `BitStreamReader`)
    pub block_info: HashMap<u32, BlockInfo>,
}
24
/// Blocks in a bitstream denote nested regions of the stream,
/// and are identified by a content-specific ID number.
///
/// Block IDs 0-7 are reserved for [standard blocks](https://llvm.org/docs/BitCodeFormat.html#standard-blocks)
/// whose meaning is defined by Bitcode;
/// block IDs 8 and greater are application specific.
#[derive(Debug, Clone)]
pub struct Block {
    /// Block ID
    pub id: u32,
    /// Elements nested inside this block: records and further blocks
    pub elements: Vec<BitcodeElement>,
}
38
/// Trailing variable-length data of an abbreviated record.
///
/// The variant is chosen by `RecordIter::payload` from the payload operand of
/// the record's abbreviation.
#[derive(Debug, Clone)]
pub enum Payload {
    /// Elements of an array operand whose element type is not `Char6`
    Array(Vec<u64>),
    /// Text decoded from an array operand whose element type is `Char6`
    Char6String(String),
    /// Raw bytes of a blob operand
    Blob(Vec<u8>),
}
45
/// Data records consist of a record code and a number of (up to) 64-bit integer values
///
/// The interpretation of the code and values is application specific and may vary between different block types.
#[derive(Debug, Clone)]
pub struct Record {
    /// Record code
    pub id: u64,
    /// Scalar operand values that follow the record code.
    ///
    /// The array/blob payload, if any, is stored separately in `payload`.
    fields: Vec<u64>,
    /// Array, char6-string or blob payload of an abbreviated record;
    /// `None` when the record has none or after [`Record::take_payload`] removed it
    payload: Option<Payload>,
}
58
59impl Record {
60    #[must_use]
61    pub fn fields(&self) -> &[u64] {
62        &self.fields
63    }
64
65    pub fn take_payload(&mut self) -> Option<Payload> {
66        self.payload.take()
67    }
68}
69
/// Read progress of a [`RecordIter`] through the operands of its record.
#[derive(Debug, Clone)]
enum Ops {
    /// Record encoded with an abbreviation
    Abbrev {
        /// Index into `abbrev.fields`:
        ///
        /// - if under `abbrev.fields.len()`, it is the next op to read;
        /// - if equal to `abbrev.fields.len()`, the payload is next;
        /// - if greater than `abbrev.fields.len()`, the payload has been read.
        state: usize,
        /// Abbreviation that describes the operands of this record
        abbrev: Arc<Abbreviation>,
    },
    /// Unabbreviated record: number of ops left to read
    Full(usize),
}
82
/// Reader over the operands of a single data record.
///
/// Values are decoded from the borrowed cursor on demand, one operand per call.
/// Data records consist of a record code and a number of (up to) 64-bit integer values;
/// the interpretation of the code and values is application specific and may vary between different block types.
///
/// Dropping the iterator reads whatever operands are still pending so that the
/// cursor ends up after the record.
pub struct RecordIter<'cursor, 'input> {
    /// Record code
    pub id: u64,
    /// Cursor the operands are read from; it is advanced by every read
    cursor: &'cursor mut Cursor<'input>,
    /// Which operands are still unread
    ops: Ops,
}
92
93impl<'cursor, 'input> RecordIter<'cursor, 'input> {
94    pub(crate) fn into_record(mut self) -> Result<Record, Error> {
95        let mut fields = Vec::with_capacity(self.len());
96        while let Some(f) = self.try_next()? {
97            fields.push(f);
98        }
99        Ok(Record {
100            id: self.id,
101            fields,
102            payload: self.payload().ok().flatten(),
103        })
104    }
105
106    fn read_scalar_operand(cursor: &mut Cursor<'_>, operand: ScalarOperand) -> Result<u64, Error> {
107        match operand {
108            ScalarOperand::Char6 => {
109                let value = cursor.read(6)? as u8;
110                Ok(u64::from(match value {
111                    0..=25 => value + b'a',
112                    26..=51 => value + (b'A' - 26),
113                    52..=61 => value - (52 - b'0'),
114                    62 => b'.',
115                    63 => b'_',
116                    _ => return Err(Error::InvalidAbbrev),
117                }))
118            }
119            ScalarOperand::Literal(value) => Ok(value),
120            ScalarOperand::Fixed(width) => Ok(cursor.read(width)?),
121            ScalarOperand::Vbr(width) => Ok(cursor.read_vbr(width)?),
122        }
123    }
124
125    pub(crate) fn from_cursor_abbrev(
126        cursor: &'cursor mut Cursor<'input>,
127        abbrev: Arc<Abbreviation>,
128    ) -> Result<Self, Error> {
129        let id =
130            Self::read_scalar_operand(cursor, *abbrev.fields.first().ok_or(Error::InvalidAbbrev)?)?;
131        Ok(Self {
132            id,
133            cursor,
134            ops: Ops::Abbrev { state: 1, abbrev },
135        })
136    }
137
138    pub(crate) fn from_cursor(cursor: &'cursor mut Cursor<'input>) -> Result<Self, Error> {
139        let id = cursor.read_vbr(6)?;
140        let num_ops = cursor.read_vbr(6)? as usize;
141        Ok(Self {
142            id,
143            cursor,
144            ops: Ops::Full(num_ops),
145        })
146    }
147
148    pub fn payload(&mut self) -> Result<Option<Payload>, Error> {
149        match &mut self.ops {
150            Ops::Abbrev { state, abbrev } => {
151                if *state > abbrev.fields.len() {
152                    return Ok(None);
153                }
154                Ok(match abbrev.payload {
155                    Some(PayloadOperand::Blob) => Some(Payload::Blob(self.blob()?.to_vec())),
156                    Some(PayloadOperand::Array(ScalarOperand::Char6)) => {
157                        Some(Payload::Char6String(
158                            String::from_utf8(self.string()?).map_err(|_| Error::InvalidAbbrev)?,
159                        ))
160                    }
161                    Some(PayloadOperand::Array(_)) => Some(Payload::Array(self.array()?)),
162                    None => None,
163                })
164            }
165            Ops::Full(_) => Ok(None),
166        }
167    }
168
169    /// Number of unread fields, excludes string/array/blob payload
170    #[must_use]
171    pub fn len(&self) -> usize {
172        match &self.ops {
173            Ops::Abbrev { state, abbrev } => abbrev.fields.len().saturating_sub(*state),
174            Ops::Full(num_ops) => *num_ops,
175        }
176    }
177
178    /// Matches len, excludes string/array/blob payload
179    #[must_use]
180    pub fn is_empty(&self) -> bool {
181        self.len() == 0
182    }
183
184    #[doc(hidden)]
185    #[deprecated(note = "renamed to `try_next()` to avoid confusion with `Iterator::next`")]
186    #[allow(clippy::should_implement_trait)]
187    pub fn next(&mut self) -> Result<Option<u64>, Error> {
188        self.try_next()
189    }
190
191    /// Consume next record
192    #[doc(alias = "next")]
193    pub fn try_next(&mut self) -> Result<Option<u64>, Error> {
194        match &mut self.ops {
195            Ops::Abbrev { state, abbrev } => {
196                let Some(&op) = abbrev.fields.get(*state) else {
197                    return Ok(None);
198                };
199                *state += 1;
200                Ok(Some(Self::read_scalar_operand(self.cursor, op)?))
201            }
202            Ops::Full(num_ops) => {
203                if *num_ops == 0 {
204                    return Ok(None);
205                }
206                *num_ops -= 1;
207                Ok(Some(self.cursor.read_vbr(6)?))
208            }
209        }
210    }
211
212    #[cfg_attr(debug_assertions, track_caller)]
213    pub fn u64(&mut self) -> Result<u64, Error> {
214        match self.try_next()? {
215            Some(v) => Ok(v),
216            None => {
217                debug_assert!(false, "unexpected end of record");
218                Err(Error::EndOfRecord)
219            }
220        }
221    }
222
223    pub fn nzu64(&mut self) -> Result<Option<NonZero<u64>>, Error> {
224        self.u64().map(NonZero::new)
225    }
226
227    pub fn i64(&mut self) -> Result<i64, Error> {
228        let v = self.u64()?;
229        let shifted = (v >> 1) as i64;
230        Ok(if (v & 1) == 0 {
231            shifted
232        } else if v != 1 {
233            -shifted
234        } else {
235            1 << 63
236        })
237    }
238
239    #[cfg_attr(debug_assertions, track_caller)]
240    pub fn u32(&mut self) -> Result<u32, Error> {
241        let val = self.u64()?;
242        match val.try_into() {
243            Ok(v) => Ok(v),
244            Err(_) => {
245                debug_assert!(false, "{val} overflows u32");
246                Err(Error::ValueOverflow)
247            }
248        }
249    }
250
251    pub fn nzu32(&mut self) -> Result<Option<NonZero<u32>>, Error> {
252        self.u32().map(NonZero::new)
253    }
254
255    #[cfg_attr(debug_assertions, track_caller)]
256    pub fn u8(&mut self) -> Result<u8, Error> {
257        let val = self.u64()?;
258        match val.try_into() {
259            Ok(v) => Ok(v),
260            Err(_) => {
261                debug_assert!(false, "{val} overflows u8");
262                Err(Error::ValueOverflow)
263            }
264        }
265    }
266
267    #[cfg_attr(debug_assertions, track_caller)]
268    pub fn try_from<U: TryFrom<u64>, T: TryFrom<U>>(&mut self) -> Result<T, Error> {
269        let val = self.u64()?;
270        match val.try_into().ok().and_then(|v| T::try_from(v).ok()) {
271            Some(val) => Ok(val),
272            None => {
273                debug_assert!(false, "{val} overflows {}", std::any::type_name::<U>());
274                Err(Error::ValueOverflow)
275            }
276        }
277    }
278
279    pub fn nzu8(&mut self) -> Result<Option<NonZero<u8>>, Error> {
280        self.u8().map(NonZero::new)
281    }
282
283    #[cfg_attr(debug_assertions, track_caller)]
284    pub fn bool(&mut self) -> Result<bool, Error> {
285        match self.u64()? {
286            0 => Ok(false),
287            1 => Ok(true),
288            val => {
289                debug_assert!(false, "{val} overflows bool");
290                Err(Error::ValueOverflow)
291            }
292        }
293    }
294
295    pub fn range(&mut self) -> Result<Range<usize>, Error> {
296        let start = self.u64()? as usize;
297        Ok(Range {
298            start,
299            end: start + self.u64()? as usize,
300        })
301    }
302
303    pub fn blob(&mut self) -> Result<&'input [u8], Error> {
304        match &mut self.ops {
305            Ops::Abbrev { state, abbrev } => match Self::take_payload_operand(state, abbrev)? {
306                Some(PayloadOperand::Blob) => {
307                    let length = self.cursor.read_vbr(6)? as usize;
308                    self.cursor.align32()?;
309                    let data = self.cursor.read_bytes(length)?;
310                    self.cursor.align32()?;
311                    Ok(data)
312                }
313                other => Err(Error::UnexpectedOperand(other.map(Operand::Payload))),
314            },
315            Ops::Full(_) => Err(Error::UnexpectedOperand(None)),
316        }
317    }
318
319    pub fn array(&mut self) -> Result<Vec<u64>, Error> {
320        match &mut self.ops {
321            Ops::Abbrev { state, abbrev } => match Self::take_payload_operand(state, abbrev)? {
322                Some(PayloadOperand::Array(op)) => {
323                    let len = self.cursor.read_vbr(6)? as usize;
324                    let mut out = Vec::with_capacity(len);
325                    for _ in 0..len {
326                        if out.len() == out.capacity() {
327                            debug_assert!(false);
328                            break;
329                        }
330                        out.push(Self::read_scalar_operand(self.cursor, op)?);
331                    }
332                    Ok(out)
333                }
334                other => Err(Error::UnexpectedOperand(other.map(Operand::Payload))),
335            },
336            // Not a proper array payload, but this fallback pattern is used by LLVM
337            Ops::Full(num_ops) => {
338                let len = *num_ops;
339                *num_ops = 0;
340                let mut out = Vec::with_capacity(len);
341                for _ in 0..len {
342                    if out.len() == out.capacity() {
343                        debug_assert!(false);
344                        break;
345                    }
346                    out.push(self.cursor.read_vbr(6)?);
347                }
348                Ok(out)
349            }
350        }
351    }
352
353    /// Mark payload as read, if there is one
354    fn take_payload_operand(
355        state: &mut usize,
356        abbrev: &Abbreviation,
357    ) -> Result<Option<PayloadOperand>, Error> {
358        if *state == abbrev.fields.len() {
359            if abbrev.payload.is_some() {
360                *state += 1;
361            }
362            Ok(abbrev.payload)
363        } else {
364            Err(Error::UnexpectedOperand(
365                abbrev.fields.get(*state).copied().map(Operand::Scalar),
366            ))
367        }
368    }
369
370    /// Read remainder of the fields as string chars.
371    ///
372    /// Interpret data as UTF-8.
373    /// The string may contain NUL terminator, depending on context.
374    pub fn string_utf8(&mut self) -> Result<String, Error> {
375        String::from_utf8(self.string()?).map_err(Error::Encoding)
376    }
377
378    /// Read remainder of the fields as string chars
379    ///
380    /// The strings are just binary blobs. LLVM doesn't guarantee any encoding.
381    /// The string may contain NUL terminator, depending on context.
382    #[cfg_attr(debug_assertions, track_caller)]
383    pub fn string(&mut self) -> Result<Vec<u8>, Error> {
384        match &mut self.ops {
385            Ops::Abbrev { state, abbrev } => match Self::take_payload_operand(state, abbrev)? {
386                Some(PayloadOperand::Array(el)) => {
387                    *state += 1;
388                    let len = self.cursor.read_vbr(6)? as usize;
389                    let mut out = Vec::with_capacity(len);
390
391                    match el {
392                        ScalarOperand::Char6 => {
393                            for _ in 0..len {
394                                if out.len() == out.capacity() {
395                                    debug_assert!(false);
396                                    break;
397                                }
398                                let ch = match self.cursor.read(6)? as u8 {
399                                    value @ 0..=25 => value + b'a',
400                                    value @ 26..=51 => value + (b'A' - 26),
401                                    value @ 52..=61 => value - (52 - b'0'),
402                                    62 => b'.',
403                                    63 => b'_',
404                                    _ => return Err(Error::InvalidAbbrev),
405                                };
406                                out.push(ch);
407                            }
408                        }
409                        ScalarOperand::Fixed(width @ 6..=8) => {
410                            for _ in 0..len {
411                                if out.len() == out.capacity() {
412                                    debug_assert!(false);
413                                    break;
414                                }
415                                out.push(self.cursor.read(width)? as u8);
416                            }
417                        }
418                        other => {
419                            return Err(Error::UnexpectedOperand(Some(Operand::Scalar(other))));
420                        }
421                    }
422                    Ok(out)
423                }
424                other => Err(Error::UnexpectedOperand(other.map(Operand::Payload))),
425            },
426            Ops::Full(num_ops) => {
427                let len = std::mem::replace(num_ops, 0);
428                let mut out = Vec::with_capacity(len);
429                for _ in 0..len {
430                    let ch = self.cursor.read_vbr(6)?;
431                    out.push(match u8::try_from(ch) {
432                        Ok(c) => c,
433                        Err(_) => {
434                            debug_assert!(false, "{ch} too big for char");
435                            return Err(Error::ValueOverflow);
436                        }
437                    });
438                }
439                Ok(out)
440            }
441        }
442    }
443
444    /// Zero-terminated string, assumes latin1 encoding
445    pub fn zstring(&mut self) -> Result<String, Error> {
446        let mut s = String::new();
447        while let Some(b) = self.nzu8()? {
448            s.push(b.get() as char);
449        }
450        Ok(s)
451    }
452
453    /// Internal ID of this record's abbreviation, if any.
454    ///
455    /// This is intended only for debugging and data dumps.
456    /// This isn't a stable identifier, and may be block-specific.
457    #[must_use]
458    pub fn debug_abbrev_id(&self) -> Option<u32> {
459        match &self.ops {
460            Ops::Abbrev { abbrev, .. } => Some(abbrev.id),
461            Ops::Full(_) => None,
462        }
463    }
464
465    /// For debug printing
466    fn with_cloned_cursor<'new_cursor>(
467        &self,
468        cursor: &'new_cursor mut Cursor<'input>,
469    ) -> RecordIter<'new_cursor, 'input> {
470        RecordIter {
471            id: self.id,
472            ops: self.ops.clone(),
473            cursor,
474        }
475    }
476}
477
478impl Iterator for RecordIter<'_, '_> {
479    type Item = Result<u64, Error>;
480    fn next(&mut self) -> Option<Self::Item> {
481        self.try_next().transpose()
482    }
483}
484
485impl Drop for RecordIter<'_, '_> {
486    /// Must drain the remaining records to advance the cursor to the next record
487    fn drop(&mut self) {
488        while let Ok(Some(_)) = self.try_next() {}
489        if let Ops::Abbrev { abbrev, .. } = &self.ops
490            && abbrev.payload.is_some()
491        {
492            let _ = self.payload();
493        }
494    }
495}
496
/// `Debug` helper that prints the remaining fields and payload of a record as a list.
///
/// Printing has to read from the iterator, hence the `RefCell` around it.
struct RecordIterDebugFields<'c, 'i>(RefCell<RecordIter<'c, 'i>>);
/// `Debug` helper that prints a `Result` as its `Ok` or `Err` content, without the wrapper.
struct RecordIterDebugResult<T, E>(Result<T, E>);
499
500impl fmt::Debug for RecordIter<'_, '_> {
501    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
502        let mut c = self.cursor.clone();
503        let fields = RecordIterDebugFields(RefCell::new(self.with_cloned_cursor(&mut c)));
504
505        f.debug_struct("RecordIter")
506            .field("id", &self.id)
507            .field("fields", &fields)
508            .field("ops", &self.ops)
509            .field("cursor", &self.cursor)
510            .finish()
511    }
512}
513
514impl fmt::Debug for RecordIterDebugFields<'_, '_> {
515    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
516        let mut iter = self.0.borrow_mut();
517        let mut d = f.debug_list();
518        d.entries(iter.by_ref().map(RecordIterDebugResult));
519        if let Some(p) = iter.payload().transpose() {
520            d.entries([RecordIterDebugResult(p)]);
521        }
522        d.finish()
523    }
524}
525
526impl<T: fmt::Debug, E: fmt::Debug> fmt::Debug for RecordIterDebugResult<T, E> {
527    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
528        match &self.0 {
529            Ok(t) => t.fmt(f),
530            Err(e) => e.fmt(f),
531        }
532    }
533}
534
/// Bitcode element: one entry of a block's (or the top level's) contents
#[derive(Debug, Clone)]
pub enum BitcodeElement {
    /// Nested block, which holds further elements
    Block(Block),
    /// Data record
    Record(Record),
}
543
544impl BitcodeElement {
545    /// Returns true if it is a `Block`
546    #[must_use]
547    pub fn is_block(&self) -> bool {
548        matches!(self, Self::Block(_))
549    }
550
551    /// If it is a `Block`, returns the associated block. Returns `None` otherwise.
552    #[must_use]
553    pub fn as_block(&self) -> Option<&Block> {
554        match self {
555            Self::Block(block) => Some(block),
556            Self::Record(_) => None,
557        }
558    }
559
560    /// If it is a `Block`, returns the associated mutable block. Returns `None` otherwise.
561    pub fn as_block_mut(&mut self) -> Option<&mut Block> {
562        match self {
563            Self::Block(block) => Some(block),
564            Self::Record(_) => None,
565        }
566    }
567
568    /// Returns true if it is a `Record`
569    #[must_use]
570    pub fn is_record(&self) -> bool {
571        matches!(self, Self::Record(_))
572    }
573
574    /// If it is a `Record`, returns the associated record. Returns `None` otherwise.
575    #[must_use]
576    pub fn as_record(&self) -> Option<&Record> {
577        match self {
578            Self::Block(_) => None,
579            Self::Record(record) => Some(record),
580        }
581    }
582
583    /// If it is a `Record`, returns the associated mutable record. Returns `None` otherwise.
584    pub fn as_record_mut(&mut self) -> Option<&mut Record> {
585        match self {
586            Self::Block(_) => None,
587            Self::Record(record) => Some(record),
588        }
589    }
590}
591
/// Block information: names that describe a block and its records
#[derive(Debug, Clone, Default)]
pub struct BlockInfo {
    /// Block name
    pub name: String,
    /// Data record names
    /// (presumably keyed by record code — confirm against the reader that fills this map)
    pub record_names: HashMap<u64, String>,
}
600
/// aka. Magic number
///
/// Describes the header found by [`Signature::parse`]. Input either starts
/// directly with the stream's magic number or with a bitcode wrapper header
/// that locates the stream inside the input.
#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub struct Signature {
    /// First four bytes of the input, read as a little-endian `u32`
    pub magic: u32,
    /// Wrapped input: the four bytes found at `offset`, read as a little-endian `u32`.
    /// Unwrapped input: `0`.
    pub magic2: u32,
    /// Wrapped input: version word of the wrapper header. Unwrapped input: `0`.
    pub version: u32,
    /// Wrapped input: offset word of the wrapper header, used as the byte
    /// position of the wrapped data within the input. Unwrapped input: `4`.
    pub offset: u32,
    /// Wrapped input: size word of the wrapper header, used as the byte length
    /// of the wrapped data. Unwrapped input: number of bytes after the magic.
    pub size: u32,
    /// Wrapped input: CPU type word of the wrapper header. Unwrapped input: `0`.
    pub cpu_type: u32,
}
611
612impl Signature {
613    #[must_use]
614    pub fn parse(data: &[u8]) -> Option<(Self, &[u8])> {
615        let (signature, remaining_data) = data.split_first_chunk::<4>()?;
616        let magic = u32::from_le_bytes(*signature);
617        if magic != LLVM_BITCODE_WRAPPER_MAGIC {
618            Some((
619                Self {
620                    version: 0,
621                    magic,
622                    magic2: 0,
623                    offset: 4,
624                    size: remaining_data.len() as _,
625                    cpu_type: 0,
626                },
627                remaining_data,
628            ))
629        } else {
630            // It is a LLVM Bitcode wrapper, remove wrapper header
631            if data.len() < 20 {
632                return None;
633            }
634            let mut words = data
635                .chunks_exact(4)
636                .skip(1)
637                .map(|w| u32::from_le_bytes(w.try_into().unwrap()));
638            let version = words.next()?;
639            let offset = words.next()?;
640            let size = words.next()?;
641            let cpu_id = words.next()?;
642            let data = data.get(offset as usize..offset as usize + size as usize)?;
643            let (magic2, remaining_data) = data.split_first_chunk::<4>()?;
644            let magic2 = u32::from_le_bytes(*magic2);
645            Some((
646                Self {
647                    version,
648                    magic,
649                    magic2,
650                    offset,
651                    size,
652                    cpu_type: cpu_id,
653                },
654                remaining_data,
655            ))
656        }
657    }
658}
659
660impl Bitcode {
661    /// Parse bitcode from bytes
662    ///
663    /// Accepts both LLVM bitcode and bitcode wrapper formats
664    pub fn new(data: &[u8]) -> Result<Self, Error> {
665        let (signature, stream) = Signature::parse(data).ok_or(Error::InvalidSignature(0))?;
666        let mut reader = BitStreamReader::new();
667        let mut visitor = CollectingVisitor::new();
668        reader.read_block(
669            Cursor::new(stream),
670            BitStreamReader::TOP_LEVEL_BLOCK_ID,
671            2,
672            &mut visitor,
673        )?;
674        Ok(Self {
675            signature,
676            elements: visitor.finalize_top_level_elements(),
677            block_info: reader.block_info,
678        })
679    }
680
681    /// Read bitcode from bytes with a visitor
682    ///
683    /// Accepts both LLVM bitcode and bitcode wrapper formats
684    pub fn read<V>(data: &[u8], visitor: &mut V) -> Result<(), Error>
685    where
686        V: BitStreamVisitor,
687    {
688        let (header, stream) = Signature::parse(data).ok_or(Error::InvalidSignature(0))?;
689        if !visitor.validate(header) {
690            return Err(Error::InvalidSignature(header.magic));
691        }
692        let mut reader = BitStreamReader::new();
693        reader.read_block(
694            Cursor::new(stream),
695            BitStreamReader::TOP_LEVEL_BLOCK_ID,
696            2,
697            visitor,
698        )
699    }
700}