1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::io::{self, Cursor, Read, Seek, SeekFrom, Write};
4
5use num_bigint::BigInt;
6
7use crate::magic::{pyc_header_length, python_version_from_magic};
8use crate::objects::{CodeObject, Object, ObjectType, StringType};
9
/// Errors produced while parsing marshal data or rewriting reference flags.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// The type byte of an object (with the reference flag already cleared)
    /// could not be converted into an `ObjectType`.
    #[error("Unknown type {byte:?} at offset {offset}.")]
    UnknownType {
        /// The offending type byte, converted to a `char` for display.
        #[allow(missing_docs)]
        byte: char,
        /// Offset at which the object's type byte was read.
        #[allow(missing_docs)]
        offset: usize,
    },
    /// An I/O error raised by the underlying reader or writer, e.g. when the
    /// input ends before an object is complete.
    #[error("{inner}")]
    Io {
        /// The wrapped I/O error; `?` converts `io::Error` into this variant.
        #[from]
        #[allow(missing_docs)]
        inner: io::Error,
    },
    /// The type byte maps to a known `ObjectType`, but the parser has no
    /// reading logic for it.
    #[error("Handling for type {0:?} is not implemented.")]
    UnhandledType(ObjectType),
    /// The first four bytes of the input were not recognised as the magic
    /// number of a known Python version.
    #[error("Cannot determine Python version from file header.")]
    UnknownVersion,
    /// A reference object points at an index for which no flagged object was
    /// recorded during parsing.
    #[error("Missing object for reference with ID: {index}")]
    UnknownReference {
        /// The reference index that has no matching flagged object.
        #[allow(missing_docs)]
        index: usize,
    },
}
41
/// Bookkeeping record for an object whose type byte carries the reference
/// flag (bit 7), i.e. an object that reference objects may point at.
#[derive(Clone, Debug)]
pub(crate) struct ReferencedObject {
    /// Offset of the object's type byte, counted from the parser's starting
    /// offset (the header length when parsing a `.pyc` file).
    pub(crate) offset: usize,
    /// Reference index; assigned as the number of flagged objects seen before
    /// this one.
    pub(crate) index: u32,
    /// Number of reference objects pointing at this index. Starts at 0 and is
    /// filled in once the whole stream has been read.
    pub(crate) usages: u32,
    /// Type of the flagged object.
    pub(crate) typ: ObjectType,
}
49
/// A parsed marshal stream: the root object plus the reference bookkeeping
/// collected while reading it.
#[derive(Debug)]
pub struct MarshalObject {
    /// Root object of the stream.
    pub(crate) object: Object,
    /// For every reference index, the offsets at which a reference object
    /// stores that index (the position of the 4-byte index, after the type
    /// byte).
    pub(crate) references: HashMap<u32, Vec<usize>>,
    /// Every object whose type byte had the reference flag set, in the order
    /// the objects were encountered.
    pub(crate) referenced: Vec<ReferencedObject>,
}
61
62impl MarshalObject {
63 pub fn parse_pyc(data: &[u8]) -> Result<Self, Error> {
65 let mut reader = Cursor::new(data);
66
67 let mut buf = [0u8; 4];
68 reader.read_exact(&mut buf)?;
69
70 let Some((major, minor)) = python_version_from_magic(&buf) else {
71 return Err(Error::UnknownVersion);
72 };
73
74 let header_length = pyc_header_length((major, minor));
75 reader.seek_relative((header_length - 4) as i64)?;
76
77 let parser = Parser::new((major, minor), header_length);
78 let (object, references, referenced) = parser.read_marshal(&mut reader)?;
79
80 Ok(MarshalObject {
81 object,
82 references,
83 referenced,
84 })
85 }
86
87 pub fn parse_dump(data: &[u8], (major, minor): (u16, u16)) -> Result<Self, Error> {
92 let mut reader = Cursor::new(data);
93 let parser = Parser::new((major, minor), 0);
94 let (object, references, referenced) = parser.read_marshal(&mut reader)?;
95
96 Ok(MarshalObject {
97 object,
98 references,
99 referenced,
100 })
101 }
102
103 pub fn clear_unused_ref_flags(self, data: &[u8]) -> Result<Cow<[u8]>, Error> {
114 let unreferenced: Vec<_> = self.referenced.iter().filter(|x| x.usages == 0).collect();
117 if unreferenced.is_empty() {
118 log::info!("No unused references found.");
119 return Ok(Cow::Borrowed(data));
120 }
121
122 let mut data = data.to_vec();
123
124 let mut dropped_indices = Vec::new();
125 for unref in &unreferenced {
126 log::info!(
127 "Clearing unused reference bit from object at offset {} with index {}",
128 unref.offset,
129 unref.index
130 );
131
132 data[unref.offset] = clear_bit(data[unref.offset], 7);
133 dropped_indices.push(unref.index);
134 }
135
136 let mut new_indices = Vec::new();
137 for (index, offsets) in &self.references {
138 let diff = dropped_indices.iter().filter(|x| **x < *index).count() as u32;
139
140 for offset in offsets {
141 new_indices.push((*offset, index - diff));
142 }
143 }
144
145 let mut writer = Cursor::new(&mut data);
148 for (offset, new_index) in new_indices {
149 writer.seek(SeekFrom::Start(offset as u64))?;
150 writer.write_all(&new_index.to_le_bytes())?;
151 }
152
153 log::info!("Removed {} unused references.", unreferenced.len());
154 Ok(Cow::Owned(data))
155 }
156
157 pub fn print_unused_ref_flags(&self) {
159 for r in &self.referenced {
160 if r.usages == 0 {
161 println!(
162 "Unused reference bit: {} object with reference index {} at offset {}",
163 r.typ, r.index, r.offset
164 );
165 }
166 }
167 }
168
169 pub fn inner(&self) -> &Object {
171 &self.object
172 }
173
174 pub fn into_inner(self) -> Object {
176 self.object
177 }
178}
179
/// Maps a reference index to the offsets at which reference objects store it.
type References = HashMap<u32, Vec<usize>>;
/// Flagged (referenceable) objects in the order they were encountered.
type Referenced = Vec<ReferencedObject>;
182
/// Stateful reader for a marshal stream that tracks the current offset and
/// the reference bookkeeping while objects are read.
#[derive(Debug)]
pub(crate) struct Parser {
    /// Python `(major, minor)` version; selects the code-object layout.
    version: (u16, u16),
    /// Offset of the next unread byte, starting from the value passed to `new`.
    offset: usize,
    /// Reference index -> offsets of the reference objects that use it.
    references: References,
    /// Objects whose type byte carried the reference flag, in reading order.
    referenced: Referenced,
}
190
191impl Parser {
192 fn new(version: (u16, u16), offset: usize) -> Self {
193 Parser {
194 version,
195 offset,
196 references: HashMap::new(),
197 referenced: Vec::new(),
198 }
199 }
200
201 fn read_marshal<T: Read>(mut self, reader: &mut T) -> Result<(Object, References, Referenced), Error> {
202 let object = self.read_object(reader)?;
203
204 for (index, usages) in &self.references {
205 let index = *index as usize;
206
207 if let Some(r) = self.referenced.get_mut(index) {
208 r.usages = usages.len() as u32;
209 } else {
210 return Err(Error::UnknownReference { index });
211 }
212 }
213
214 Ok((object, self.references, self.referenced))
215 }
216
217 fn read_object<T: Read>(&mut self, bytes: &mut T) -> Result<Object, Error> {
218 log::debug!("Reading object at offset {}", self.offset);
219
220 let offset = self.offset;
221 let mut byte = self.read_u8(bytes)?;
222
223 let mut ref_id = None;
224
225 if test_bit(byte, 7) {
227 let index = self.referenced.len() as u32;
228 log::debug!("Object at offset {} assigned reference index {}", self.offset, index);
229
230 byte = clear_bit(byte, 7);
231 ref_id = Some(index);
232 }
233
234 let Some(typ) = ObjectType::try_from(byte).ok() else {
235 return Err(Error::UnknownType {
236 byte: byte.into(),
237 offset,
238 });
239 };
240
241 if let Some(index) = ref_id {
242 let obj = ReferencedObject {
243 offset,
244 index,
245 usages: 0,
246 typ,
247 };
248
249 self.referenced.push(obj);
250 }
251
252 let result = match typ {
253 ObjectType::Null => Object::Null,
255 ObjectType::None => Object::None,
256 ObjectType::False => Object::False,
257 ObjectType::True => Object::True,
258 ObjectType::StopIteration => Object::StopIteration,
259 ObjectType::Ellipsis => Object::Ellipsis,
260
261 ObjectType::Int => Object::Int(self.read_u32(bytes)?),
263 ObjectType::BinaryFloat => Object::BinaryFloat(self.read_f64(bytes)?),
264 ObjectType::BinaryComplex => Object::BinaryComplex((self.read_f64(bytes)?, self.read_f64(bytes)?)),
265
266 ObjectType::String => Object::String {
268 typ: StringType::String,
269 bytes: self.read_string(bytes, false)?,
270 },
271 ObjectType::Interned => Object::String {
272 typ: StringType::Interned,
273 bytes: self.read_string(bytes, false)?,
274 },
275 ObjectType::Unicode => Object::String {
276 typ: StringType::Unicode,
277 bytes: self.read_string(bytes, false)?,
278 },
279 ObjectType::Ascii => Object::String {
280 typ: StringType::Ascii,
281 bytes: self.read_string(bytes, false)?,
282 },
283 ObjectType::AsciiInterned => Object::String {
284 typ: StringType::AsciiInterned,
285 bytes: self.read_string(bytes, false)?,
286 },
287 ObjectType::ShortAscii => Object::String {
288 typ: StringType::Ascii,
289 bytes: self.read_string(bytes, true)?,
290 },
291 ObjectType::ShortAsciiInterned => Object::String {
292 typ: StringType::AsciiInterned,
293 bytes: self.read_string(bytes, true)?,
294 },
295
296 ObjectType::Tuple => Object::Tuple(self.read_collection(bytes, false)?),
298 ObjectType::List => Object::List(self.read_collection(bytes, false)?),
299 ObjectType::Set => Object::Set(self.read_collection(bytes, false)?),
300 ObjectType::FrozenSet => Object::FrozenSet(self.read_collection(bytes, false)?),
301 ObjectType::SmallTuple => Object::Tuple(self.read_collection(bytes, true)?),
302 ObjectType::Dict => Object::Dict(self.read_dict(bytes)?),
303
304 ObjectType::Long => Object::Long(self.read_long(bytes)?),
306 ObjectType::Ref => Object::Ref(self.read_ref(bytes)?),
307 ObjectType::Code => Object::Code(Box::new(self.read_code_object(bytes)?)),
308
309 x => return Err(Error::UnhandledType(x)),
312 };
313
314 Ok(result)
315 }
316
317 #[inline(always)]
318 fn read_bytes<T: Read>(&mut self, bytes: &mut T, n: usize) -> Result<Vec<u8>, Error> {
319 let mut buf = vec![0u8; n];
320 bytes.read_exact(&mut buf)?;
321 self.offset += n;
322 Ok(buf)
323 }
324
325 #[inline(always)]
326 fn read_bytes_const<T: Read, const N: usize>(&mut self, bytes: &mut T) -> Result<[u8; N], Error> {
327 let mut buf = [0u8; N];
328 bytes.read_exact(&mut buf)?;
329 self.offset += N;
330 Ok(buf)
331 }
332
333 #[inline(always)]
334 fn read_u8<T: Read>(&mut self, bytes: &mut T) -> Result<u8, Error> {
335 log::debug!("Reading u8 at offset {}", self.offset);
336 Ok(u8::from_le_bytes(self.read_bytes_const(bytes)?))
337 }
338
339 #[inline(always)]
340 fn read_u32<T: Read>(&mut self, bytes: &mut T) -> Result<u32, Error> {
341 log::debug!("Reading u32 at offset {}", self.offset);
342 Ok(u32::from_le_bytes(self.read_bytes_const(bytes)?))
343 }
344
345 #[inline(always)]
346 fn read_i32<T: Read>(&mut self, bytes: &mut T) -> Result<i32, Error> {
347 log::debug!("Reading i32 at offset {}", self.offset);
348 Ok(i32::from_le_bytes(self.read_bytes_const(bytes)?))
349 }
350
351 #[inline(always)]
352 fn read_f64<T: Read>(&mut self, bytes: &mut T) -> Result<f64, Error> {
353 log::debug!("Reading f64 at offset {}", self.offset);
354 Ok(f64::from_le_bytes(self.read_bytes_const(bytes)?))
355 }
356
357 fn read_string<T: Read>(&mut self, bytes: &mut T, short: bool) -> Result<Vec<u8>, Error> {
358 let size = if short {
359 log::debug!("Reading short string at offset {}", self.offset);
360 self.read_u8(bytes)? as usize
361 } else {
362 log::debug!("Reading string at offset {}", self.offset);
363 self.read_u32(bytes)? as usize
364 };
365
366 let bytes = self.read_bytes(bytes, size)?;
367 Ok(bytes)
368 }
369
370 fn read_collection<T: Read>(&mut self, bytes: &mut T, small: bool) -> Result<Vec<Object>, Error> {
371 let size = if small {
372 log::debug!("Reading small tuple at offset {}", self.offset);
373 self.read_u8(bytes)? as usize
374 } else {
375 log::debug!("Reading collection at offset {}", self.offset);
376 self.read_u32(bytes)? as usize
377 };
378
379 let mut result = Vec::with_capacity(size);
380 for _ in 0..size {
381 result.push(self.read_object(bytes)?);
382 }
383
384 Ok(result)
385 }
386
387 fn read_dict<T: Read>(&mut self, bytes: &mut T) -> Result<Vec<(Object, Object)>, Error> {
388 log::debug!("Reading collection at offset {}", self.offset);
389
390 let mut result = Vec::new();
391
392 loop {
393 let key = self.read_object(bytes)?;
394 if key == Object::Null {
395 break;
396 }
397
398 let value = self.read_object(bytes)?;
399 result.push((key, value));
400 }
401
402 Ok(result)
403 }
404
405 fn read_long<T: Read>(&mut self, bytes: &mut T) -> Result<BigInt, Error> {
406 log::debug!("Reading long at offset {}", self.offset);
407
408 let size = self.read_i32(bytes)?;
409
410 let mut result = BigInt::ZERO;
411 let mut shift = 0;
412
413 for _ in 0..size.abs() {
414 let x = {
415 let b = self.read_bytes_const::<T, 2>(bytes)?;
416
417 let mut x = b[0] as i16;
418 x |= (b[1] as i16) << 8;
419 x |= -(x & 0x8000u16 as i16);
420
421 BigInt::from(x)
422 };
423
424 result += x << shift;
425 shift += 15;
426 }
427
428 if size > 0 {
429 Ok(result)
430 } else {
431 Ok(-result)
432 }
433 }
434
435 fn read_ref<T: Read>(&mut self, bytes: &mut T) -> Result<u32, Error> {
436 log::debug!("Reading reference at offset {}", self.offset);
437
438 let offset = self.offset;
439 let index = self.read_u32(bytes)?;
440 log::debug!("Found reference at offset {} with index {}", offset, index);
441
442 self.references
443 .entry(index)
444 .and_modify(|x| x.push(offset))
445 .or_insert(vec![offset]);
446 Ok(index)
447 }
448
449 fn read_code_object<T: Read>(&mut self, bytes: &mut T) -> Result<CodeObject, Error> {
450 log::debug!("Reading codeobject at offset {}", self.offset);
451
452 let result = CodeObject {
453 argcount: self.read_u32(bytes)?,
454 posonlyargcount: if self.version >= (3, 8) {
455 Some(self.read_u32(bytes)?)
456 } else {
457 None
458 },
459 kwonlyargcount: self.read_u32(bytes)?,
460 nlocals: if self.version < (3, 11) {
461 Some(self.read_u32(bytes)?)
462 } else {
463 None
464 },
465 stacksize: self.read_u32(bytes)?,
466 flags: self.read_u32(bytes)?,
467 code: self.read_object(bytes)?,
468 consts: self.read_object(bytes)?,
469 names: self.read_object(bytes)?,
470 varnames: if self.version < (3, 11) {
471 Some(self.read_object(bytes)?)
472 } else {
473 None
474 },
475 freevars: if self.version < (3, 11) {
476 Some(self.read_object(bytes)?)
477 } else {
478 None
479 },
480 cellvars: if self.version < (3, 11) {
481 Some(self.read_object(bytes)?)
482 } else {
483 None
484 },
485 localsplusnames: if self.version >= (3, 11) {
486 Some(self.read_object(bytes)?)
487 } else {
488 None
489 },
490 localspluskinds: if self.version >= (3, 11) {
491 Some(self.read_object(bytes)?)
492 } else {
493 None
494 },
495 filename: self.read_object(bytes)?,
496 name: self.read_object(bytes)?,
497 qualname: if self.version >= (3, 11) {
498 Some(self.read_object(bytes)?)
499 } else {
500 None
501 },
502 firstlineno: self.read_u32(bytes)?,
503 linetable: self.read_object(bytes)?,
504 exceptiontable: if self.version >= (3, 11) {
505 Some(self.read_object(bytes)?)
506 } else {
507 None
508 },
509 };
510
511 Ok(result)
512 }
513}
514
/// Returns `true` when bit `i` (0 = least significant) of `b` is set.
#[inline(always)]
fn test_bit(b: u8, i: u8) -> bool {
    let mask: u8 = 1 << i;
    (b & mask) == mask
}
519
/// Returns `b` with bit `i` (0 = least significant) forced to zero.
#[inline(always)]
fn clear_bit(b: u8, i: u8) -> u8 {
    let mask: u8 = 1 << i;
    b & (u8::MAX ^ mask)
}