marshal_parser/
parser.rs

1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::io::{self, Cursor, Read, Seek, SeekFrom, Write};
4
5use num_bigint::BigInt;
6
7use crate::magic::{pyc_header_length, python_version_from_magic};
8use crate::objects::{CodeObject, Object, ObjectType, StringType};
9
/// Custom error type for distinguishing different failure modes
///
/// Every fallible operation in this module reports its failure through one of
/// these variants.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// Invalid start of object and / or unknown type flag
    ///
    /// Produced when the type byte at the start of an object (after the
    /// reference flag bit has been cleared) does not map to an [`ObjectType`].
    #[error("Unknown type {byte:?} at offset {offset}.")]
    UnknownType {
        // the offending type byte, converted to a `char` for display
        #[allow(missing_docs)]
        byte: char,
        // parser offset of the offending type byte
        #[allow(missing_docs)]
        offset: usize,
    },
    /// Invalid file (premature end of file) or I/O error
    ///
    /// Any [`io::Error`] raised while reading or writing is converted into
    /// this variant automatically.
    #[error("{inner}")]
    Io {
        #[from]
        #[allow(missing_docs)]
        inner: io::Error,
    },
    /// Unsupported Python version (unhandled object type)
    ///
    /// The type byte was recognized, but the parser has no decoding logic
    /// for that object type.
    #[error("Handling for type {0:?} is not implemented.")]
    UnhandledType(ObjectType),
    /// Invalid file and / or unsupported Python version (unknown magic number)
    #[error("Cannot determine Python version from file header.")]
    UnknownVersion,
    /// Parsing error resulted in no known objects with this ID
    ///
    /// Produced after parsing when a reference object names an index for
    /// which no flagged object was recorded.
    #[error("Missing object for reference with ID: {index}")]
    UnknownReference {
        // the reference index that could not be resolved
        #[allow(missing_docs)]
        index: usize,
    },
}
41
/// Bookkeeping record for an object whose type byte had the reference flag set
///
/// One record is created per flagged object, in the order in which the
/// objects are encountered while parsing.
#[derive(Clone, Debug)]
pub(crate) struct ReferencedObject {
    /// Parser offset of the object's type byte (the byte carrying the flag)
    pub(crate) offset: usize,
    /// Reference index assigned to the object: the number of flagged objects
    /// that were recorded before it
    pub(crate) index: u32,
    /// Number of reference objects pointing at `index`; starts at zero and is
    /// filled in once the whole dump has been read
    pub(crate) usages: u32,
    /// Type of the flagged object
    pub(crate) typ: ObjectType,
}
49
/// Parsed contents of a `pyc` file or "marshal dump"
///
/// This data structure contains additional information about which objects are
/// referenced by reference objects. This data can be used to clean up unused
/// reference flags, which are, in general, not reproducible.
#[derive(Debug)]
pub struct MarshalObject {
    /// Root object of the parsed dump
    pub(crate) object: Object,
    /// For every reference index that is used: the offsets at which reference
    /// objects store that index
    pub(crate) references: HashMap<u32, Vec<usize>>,
    /// All objects whose type byte had the reference flag set, in parse order
    pub(crate) referenced: Vec<ReferencedObject>,
}
61
62impl MarshalObject {
63    /// Parse `pyc` file contents (header + marshal dump) from data
64    pub fn parse_pyc(data: &[u8]) -> Result<Self, Error> {
65        let mut reader = Cursor::new(data);
66
67        let mut buf = [0u8; 4];
68        reader.read_exact(&mut buf)?;
69
70        let Some((major, minor)) = python_version_from_magic(&buf) else {
71            return Err(Error::UnknownVersion);
72        };
73
74        let header_length = pyc_header_length((major, minor));
75        reader.seek_relative((header_length - 4) as i64)?;
76
77        let parser = Parser::new((major, minor), header_length);
78        let (object, references, referenced) = parser.read_marshal(&mut reader)?;
79
80        Ok(MarshalObject {
81            object,
82            references,
83            referenced,
84        })
85    }
86
87    /// Parse marshal dump contents from data
88    ///
89    /// Since plain "marshal dumps" do not contain a `pyc` file header, the
90    /// version of Python that was used to create the data must be specified.
91    pub fn parse_dump(data: &[u8], (major, minor): (u16, u16)) -> Result<Self, Error> {
92        let mut reader = Cursor::new(data);
93        let parser = Parser::new((major, minor), 0);
94        let (object, references, referenced) = parser.read_marshal(&mut reader)?;
95
96        Ok(MarshalObject {
97            object,
98            references,
99            referenced,
100        })
101    }
102
103    /// Clear unused reference flags from objects
104    ///
105    /// This method can be used to make `pyc` files more reproducible.
106    ///
107    /// Reference flags are removed from objects that are never referenced, and
108    /// remaining references are adjusted for the shuffled index numbers.
109    ///
110    /// If no changes are made, data is returned without modifications in a
111    /// [`Cow::Borrowed`], otherwise a [`Cow::Owned`] with new file contents is
112    /// returned.
113    pub fn clear_unused_ref_flags(self, data: &[u8]) -> Result<Cow<[u8]>, Error> {
114        // this method consumes self because it invalidates the unmarshaled object
115
116        let unreferenced: Vec<_> = self.referenced.iter().filter(|x| x.usages == 0).collect();
117        if unreferenced.is_empty() {
118            log::info!("No unused references found.");
119            return Ok(Cow::Borrowed(data));
120        }
121
122        let mut data = data.to_vec();
123
124        let mut dropped_indices = Vec::new();
125        for unref in &unreferenced {
126            log::info!(
127                "Clearing unused reference bit from object at offset {} with index {}",
128                unref.offset,
129                unref.index
130            );
131
132            data[unref.offset] = clear_bit(data[unref.offset], 7);
133            dropped_indices.push(unref.index);
134        }
135
136        let mut new_indices = Vec::new();
137        for (index, offsets) in &self.references {
138            let diff = dropped_indices.iter().filter(|x| **x < *index).count() as u32;
139
140            for offset in offsets {
141                new_indices.push((*offset, index - diff));
142            }
143        }
144
145        // sorting by offset costs more time than doing random memory accesses
146
147        let mut writer = Cursor::new(&mut data);
148        for (offset, new_index) in new_indices {
149            writer.seek(SeekFrom::Start(offset as u64))?;
150            writer.write_all(&new_index.to_le_bytes())?;
151        }
152
153        log::info!("Removed {} unused references.", unreferenced.len());
154        Ok(Cow::Owned(data))
155    }
156
157    /// Print objects with unused reference flags to stdout
158    pub fn print_unused_ref_flags(&self) {
159        for r in &self.referenced {
160            if r.usages == 0 {
161                println!(
162                    "Unused reference bit: {} object with reference index {} at offset {}",
163                    r.typ, r.index, r.offset
164                );
165            }
166        }
167    }
168
169    /// Obtain a reference to the inner [`Object`]
170    pub fn inner(&self) -> &Object {
171        &self.object
172    }
173
174    /// Consume this [`MarshalObject`] to obtain the inner [`Object`]
175    pub fn into_inner(self) -> Object {
176        self.object
177    }
178}
179
/// Maps a reference index to the offsets at which reference objects store it
type References = HashMap<u32, Vec<usize>>;
/// Objects whose type byte had the reference flag set, in parse order
type Referenced = Vec<ReferencedObject>;
182
/// Stateful reader that decodes a marshal dump into [`Object`] values
///
/// While reading, it records which objects carry the reference flag and where
/// reference objects point at them.
#[derive(Debug)]
pub(crate) struct Parser {
    /// Python `(major, minor)` version; selects the code object layout
    version: (u16, u16),
    /// Starting offset plus the number of bytes consumed so far
    offset: usize,
    /// Offsets of the index stored by each reference object, keyed by index
    references: References,
    /// Flagged objects in the order in which they were encountered
    referenced: Referenced,
}
190
191impl Parser {
192    fn new(version: (u16, u16), offset: usize) -> Self {
193        Parser {
194            version,
195            offset,
196            references: HashMap::new(),
197            referenced: Vec::new(),
198        }
199    }
200
201    fn read_marshal<T: Read>(mut self, reader: &mut T) -> Result<(Object, References, Referenced), Error> {
202        let object = self.read_object(reader)?;
203
204        for (index, usages) in &self.references {
205            let index = *index as usize;
206
207            if let Some(r) = self.referenced.get_mut(index) {
208                r.usages = usages.len() as u32;
209            } else {
210                return Err(Error::UnknownReference { index });
211            }
212        }
213
214        Ok((object, self.references, self.referenced))
215    }
216
217    fn read_object<T: Read>(&mut self, bytes: &mut T) -> Result<Object, Error> {
218        log::debug!("Reading object at offset {}", self.offset);
219
220        let offset = self.offset;
221        let mut byte = self.read_u8(bytes)?;
222
223        let mut ref_id = None;
224
225        // check if this object has the reference flag bit set
226        if test_bit(byte, 7) {
227            let index = self.referenced.len() as u32;
228            log::debug!("Object at offset {} assigned reference index {}", self.offset, index);
229
230            byte = clear_bit(byte, 7);
231            ref_id = Some(index);
232        }
233
234        let Some(typ) = ObjectType::try_from(byte).ok() else {
235            return Err(Error::UnknownType {
236                byte: byte.into(),
237                offset,
238            });
239        };
240
241        if let Some(index) = ref_id {
242            let obj = ReferencedObject {
243                offset,
244                index,
245                usages: 0,
246                typ,
247            };
248
249            self.referenced.push(obj);
250        }
251
252        let result = match typ {
253            // singleton objects
254            ObjectType::Null => Object::Null,
255            ObjectType::None => Object::None,
256            ObjectType::False => Object::False,
257            ObjectType::True => Object::True,
258            ObjectType::StopIteration => Object::StopIteration,
259            ObjectType::Ellipsis => Object::Ellipsis,
260
261            // simple objects
262            ObjectType::Int => Object::Int(self.read_u32(bytes)?),
263            ObjectType::BinaryFloat => Object::BinaryFloat(self.read_f64(bytes)?),
264            ObjectType::BinaryComplex => Object::BinaryComplex((self.read_f64(bytes)?, self.read_f64(bytes)?)),
265
266            // string objects
267            ObjectType::String => Object::String {
268                typ: StringType::String,
269                bytes: self.read_string(bytes, false)?,
270            },
271            ObjectType::Interned => Object::String {
272                typ: StringType::Interned,
273                bytes: self.read_string(bytes, false)?,
274            },
275            ObjectType::Unicode => Object::String {
276                typ: StringType::Unicode,
277                bytes: self.read_string(bytes, false)?,
278            },
279            ObjectType::Ascii => Object::String {
280                typ: StringType::Ascii,
281                bytes: self.read_string(bytes, false)?,
282            },
283            ObjectType::AsciiInterned => Object::String {
284                typ: StringType::AsciiInterned,
285                bytes: self.read_string(bytes, false)?,
286            },
287            ObjectType::ShortAscii => Object::String {
288                typ: StringType::Ascii,
289                bytes: self.read_string(bytes, true)?,
290            },
291            ObjectType::ShortAsciiInterned => Object::String {
292                typ: StringType::AsciiInterned,
293                bytes: self.read_string(bytes, true)?,
294            },
295
296            // collection objects
297            ObjectType::Tuple => Object::Tuple(self.read_collection(bytes, false)?),
298            ObjectType::List => Object::List(self.read_collection(bytes, false)?),
299            ObjectType::Set => Object::Set(self.read_collection(bytes, false)?),
300            ObjectType::FrozenSet => Object::FrozenSet(self.read_collection(bytes, false)?),
301            ObjectType::SmallTuple => Object::Tuple(self.read_collection(bytes, true)?),
302            ObjectType::Dict => Object::Dict(self.read_dict(bytes)?),
303
304            // special cases
305            ObjectType::Long => Object::Long(self.read_long(bytes)?),
306            ObjectType::Ref => Object::Ref(self.read_ref(bytes)?),
307            ObjectType::Code => Object::Code(Box::new(self.read_code_object(bytes)?)),
308
309            // unhandled types:
310            // ObjectType::{Int64,Float,Complex,Unknown}
311            x => return Err(Error::UnhandledType(x)),
312        };
313
314        Ok(result)
315    }
316
317    #[inline(always)]
318    fn read_bytes<T: Read>(&mut self, bytes: &mut T, n: usize) -> Result<Vec<u8>, Error> {
319        let mut buf = vec![0u8; n];
320        bytes.read_exact(&mut buf)?;
321        self.offset += n;
322        Ok(buf)
323    }
324
325    #[inline(always)]
326    fn read_bytes_const<T: Read, const N: usize>(&mut self, bytes: &mut T) -> Result<[u8; N], Error> {
327        let mut buf = [0u8; N];
328        bytes.read_exact(&mut buf)?;
329        self.offset += N;
330        Ok(buf)
331    }
332
333    #[inline(always)]
334    fn read_u8<T: Read>(&mut self, bytes: &mut T) -> Result<u8, Error> {
335        log::debug!("Reading u8 at offset {}", self.offset);
336        Ok(u8::from_le_bytes(self.read_bytes_const(bytes)?))
337    }
338
339    #[inline(always)]
340    fn read_u32<T: Read>(&mut self, bytes: &mut T) -> Result<u32, Error> {
341        log::debug!("Reading u32 at offset {}", self.offset);
342        Ok(u32::from_le_bytes(self.read_bytes_const(bytes)?))
343    }
344
345    #[inline(always)]
346    fn read_i32<T: Read>(&mut self, bytes: &mut T) -> Result<i32, Error> {
347        log::debug!("Reading i32 at offset {}", self.offset);
348        Ok(i32::from_le_bytes(self.read_bytes_const(bytes)?))
349    }
350
351    #[inline(always)]
352    fn read_f64<T: Read>(&mut self, bytes: &mut T) -> Result<f64, Error> {
353        log::debug!("Reading f64 at offset {}", self.offset);
354        Ok(f64::from_le_bytes(self.read_bytes_const(bytes)?))
355    }
356
357    fn read_string<T: Read>(&mut self, bytes: &mut T, short: bool) -> Result<Vec<u8>, Error> {
358        let size = if short {
359            log::debug!("Reading short string at offset {}", self.offset);
360            self.read_u8(bytes)? as usize
361        } else {
362            log::debug!("Reading string at offset {}", self.offset);
363            self.read_u32(bytes)? as usize
364        };
365
366        let bytes = self.read_bytes(bytes, size)?;
367        Ok(bytes)
368    }
369
370    fn read_collection<T: Read>(&mut self, bytes: &mut T, small: bool) -> Result<Vec<Object>, Error> {
371        let size = if small {
372            log::debug!("Reading small tuple at offset {}", self.offset);
373            self.read_u8(bytes)? as usize
374        } else {
375            log::debug!("Reading collection at offset {}", self.offset);
376            self.read_u32(bytes)? as usize
377        };
378
379        let mut result = Vec::with_capacity(size);
380        for _ in 0..size {
381            result.push(self.read_object(bytes)?);
382        }
383
384        Ok(result)
385    }
386
387    fn read_dict<T: Read>(&mut self, bytes: &mut T) -> Result<Vec<(Object, Object)>, Error> {
388        log::debug!("Reading collection at offset {}", self.offset);
389
390        let mut result = Vec::new();
391
392        loop {
393            let key = self.read_object(bytes)?;
394            if key == Object::Null {
395                break;
396            }
397
398            let value = self.read_object(bytes)?;
399            result.push((key, value));
400        }
401
402        Ok(result)
403    }
404
405    fn read_long<T: Read>(&mut self, bytes: &mut T) -> Result<BigInt, Error> {
406        log::debug!("Reading long at offset {}", self.offset);
407
408        let size = self.read_i32(bytes)?;
409
410        let mut result = BigInt::ZERO;
411        let mut shift = 0;
412
413        for _ in 0..size.abs() {
414            let x = {
415                let b = self.read_bytes_const::<T, 2>(bytes)?;
416
417                let mut x = b[0] as i16;
418                x |= (b[1] as i16) << 8;
419                x |= -(x & 0x8000u16 as i16);
420
421                BigInt::from(x)
422            };
423
424            result += x << shift;
425            shift += 15;
426        }
427
428        if size > 0 {
429            Ok(result)
430        } else {
431            Ok(-result)
432        }
433    }
434
435    fn read_ref<T: Read>(&mut self, bytes: &mut T) -> Result<u32, Error> {
436        log::debug!("Reading reference at offset {}", self.offset);
437
438        let offset = self.offset;
439        let index = self.read_u32(bytes)?;
440        log::debug!("Found reference at offset {} with index {}", offset, index);
441
442        self.references
443            .entry(index)
444            .and_modify(|x| x.push(offset))
445            .or_insert(vec![offset]);
446        Ok(index)
447    }
448
449    fn read_code_object<T: Read>(&mut self, bytes: &mut T) -> Result<CodeObject, Error> {
450        log::debug!("Reading codeobject at offset {}", self.offset);
451
452        let result = CodeObject {
453            argcount: self.read_u32(bytes)?,
454            posonlyargcount: if self.version >= (3, 8) {
455                Some(self.read_u32(bytes)?)
456            } else {
457                None
458            },
459            kwonlyargcount: self.read_u32(bytes)?,
460            nlocals: if self.version < (3, 11) {
461                Some(self.read_u32(bytes)?)
462            } else {
463                None
464            },
465            stacksize: self.read_u32(bytes)?,
466            flags: self.read_u32(bytes)?,
467            code: self.read_object(bytes)?,
468            consts: self.read_object(bytes)?,
469            names: self.read_object(bytes)?,
470            varnames: if self.version < (3, 11) {
471                Some(self.read_object(bytes)?)
472            } else {
473                None
474            },
475            freevars: if self.version < (3, 11) {
476                Some(self.read_object(bytes)?)
477            } else {
478                None
479            },
480            cellvars: if self.version < (3, 11) {
481                Some(self.read_object(bytes)?)
482            } else {
483                None
484            },
485            localsplusnames: if self.version >= (3, 11) {
486                Some(self.read_object(bytes)?)
487            } else {
488                None
489            },
490            localspluskinds: if self.version >= (3, 11) {
491                Some(self.read_object(bytes)?)
492            } else {
493                None
494            },
495            filename: self.read_object(bytes)?,
496            name: self.read_object(bytes)?,
497            qualname: if self.version >= (3, 11) {
498                Some(self.read_object(bytes)?)
499            } else {
500                None
501            },
502            firstlineno: self.read_u32(bytes)?,
503            linetable: self.read_object(bytes)?,
504            exceptiontable: if self.version >= (3, 11) {
505                Some(self.read_object(bytes)?)
506            } else {
507                None
508            },
509        };
510
511        Ok(result)
512    }
513}
514
/// Report whether bit `i` of `b` is set (bit 0 is the least significant bit)
#[inline(always)]
fn test_bit(b: u8, i: u8) -> bool {
    let shifted = b >> i;
    shifted & 1 == 1
}
519
/// Return `b` with bit `i` cleared (bit 0 is the least significant bit)
#[inline(always)]
fn clear_bit(b: u8, i: u8) -> u8 {
    let mask = 1u8 << i;
    b & (u8::MAX ^ mask)
}