marshal_parser/lib.rs
1//! # Parser for the "marshal" binary de/serialization format used by CPython
2//!
3//! This crate implements a parser and some utilities for reading files in the
4//! "marshal" de/serialization format used internally in CPython. The exact
5//! format is not stable and can change between minor versions of CPython.
6//!
7//! This crate supports parsing "marshal" dumps and `pyc` files that were
8//! written by CPython versions `>= 3.6` and `< 3.14`.
9//!
10//! There is a high-level and a low-level API, depending on how much access to
11//! the underlying data structures is needed. The low-level API also provides
12//! more flexibility since it does not require files, but can operate on plain
13//! bytes ([`Vec<u8>`]).
14//!
15//! Reading a `pyc` file from disk:
16//!
17//! ```no_run
18//! use marshal_parser::{MarshalFile, Object};
19//!
20//! let pyc = MarshalFile::from_pyc_path("mod.cpython-310.pyc").unwrap();
21//! let object: Object = pyc.into_inner();
22//! ```
23//!
24//! Reading a "marshal" dump (i.e. a file without `pyc` header):
25//!
26//! ```no_run
27//! use marshal_parser::{MarshalFile, Object};
28//!
29//! let dump = MarshalFile::from_dump_path("dump.marshal", (3, 11)).unwrap();
30//! let object: Object = dump.into_inner();
31//! ```
32
33use std::borrow::Cow;
34use std::fs::{File, OpenOptions};
35use std::io::{Read, Write};
36use std::path::Path;
37
38mod magic;
39mod objects;
40mod parser;
41
42pub use objects::{CodeObject, Object, ObjectType, StringType};
43pub use parser::{Error, MarshalObject};
44
45type Result<T> = std::result::Result<T, Error>;
46
47/// High-level parser for `pyc` and "marshal dump" files
48#[derive(Debug)]
49pub struct MarshalFile {
50 data: Vec<u8>,
51 marshal: MarshalObject,
52}
53
54impl MarshalFile {
55 /// Read and parse a `pyc` file at the specified path
56 pub fn from_pyc_path<S>(path: S) -> Result<Self>
57 where
58 S: AsRef<Path>,
59 {
60 let mut file = OpenOptions::new()
61 .read(true)
62 .write(false)
63 .create_new(false)
64 .open(path)?;
65
66 let mut data = Vec::new();
67 file.read_to_end(&mut data)?;
68
69 let marshal = MarshalObject::parse_pyc(&data)?;
70 Ok(MarshalFile { data, marshal })
71 }
72
73 /// Read and parse a "marshal dump" file at the specified path
74 pub fn from_dump_path<S>(path: S, (major, minor): (u16, u16)) -> Result<Self>
75 where
76 S: AsRef<Path>,
77 {
78 let mut file = OpenOptions::new()
79 .read(true)
80 .write(false)
81 .create_new(false)
82 .open(path)?;
83
84 let mut data = Vec::new();
85 file.read_to_end(&mut data)?;
86
87 let marshal = MarshalObject::parse_dump(&data, (major, minor))?;
88 Ok(MarshalFile { data, marshal })
89 }
90
91 /// Obtain a reference to the inner [`Object`]
92 pub fn inner(&self) -> &Object {
93 &self.marshal.object
94 }
95
96 /// Consume this [`MarshalFile`] to obtain the inner [`Object`]
97 pub fn into_inner(self) -> Object {
98 self.marshal.object
99 }
100
101 /// Print objects with unused reference flags to stdout
102 pub fn print_unused_ref_flags(&self) {
103 self.marshal.print_unused_ref_flags();
104 }
105
106 /// Rewrite file to remove unused reference flags
107 ///
108 /// This can be useful to generate `pyc` files that are reproducible across
109 /// different CPU architectures.
110 ///
111 /// If no unused reference flags are found, no file is written, and `false`
112 /// is returned. If a file is written, `true` is returned.
113 pub fn write_normalized<S>(self, path: S) -> Result<bool>
114 where
115 S: AsRef<Path>,
116 {
117 let marshal = self.marshal;
118 let result = marshal.clear_unused_ref_flags(&self.data)?;
119
120 if let Cow::Owned(x) = result {
121 let mut file = File::create_new(path)?;
122 file.write_all(&x)?;
123
124 Ok(true)
125 } else {
126 Ok(false)
127 }
128 }
129}