unfuck/
lib.rs

1#![feature(get_mut_unchecked)]
2#![feature(map_first_last)]
3
4use crate::error::Error;
5use code_graph::CodeGraph;
6use partial_execution::ExecutionPath;
7use petgraph::stable_graph::NodeIndex;
8use pydis::opcode::py27::{self, Standard};
9use pydis::prelude::{Instruction, Opcode};
10use rayon::prelude::*;
11
12use py27_marshal::{Code, Obj};
13use rayon::Scope;
14use smallvm::InstructionTracker;
15use std::collections::{HashMap, HashSet};
16use std::fmt::Debug;
17use std::marker::PhantomData;
18use std::path::Path;
19use std::sync::atomic::{AtomicUsize, Ordering};
20use std::sync::{Arc, Mutex, RwLock};
21use strings::CodeObjString;
22
23/// Representing code as a graph of basic blocks
24pub mod code_graph;
25/// Deobfuscation module
26pub mod deob;
27/// Errors
28pub mod error;
29/// Provides code for partially executing a code object and identifying const conditions
30pub mod partial_execution;
31/// Python VM
32pub mod smallvm;
33/// Management of Python strings for string dumping
34pub mod strings;
35
36pub struct Deobfuscator<'a, O: Opcode<Mnemonic = py27::Mnemonic> + PartialEq> {
37    /// Input stream.
38    input: &'a [u8],
39
40    /// Output to write dotviz graph to
41    enable_dotviz_graphs: bool,
42    files_processed: AtomicUsize,
43    graphviz_graphs: HashMap<String, String>,
44    on_graph_generated: Option<Box<dyn Fn(&str, &str) + Send + Sync>>,
45    on_store_to_named_var: Option<
46        Box<
47            dyn Fn(
48                    &Code,
49                    &HashSet<String>,
50                    &RwLock<&mut CodeGraph<O>>,
51                    &Instruction<O>,
52                    &(Option<Obj>, InstructionTracker<(NodeIndex<u32>, usize)>),
53                ) + Send
54                + Sync,
55        >,
56    >,
57    _opcode_phantom: PhantomData<O>,
58}
59
60impl<'a, O: Opcode<Mnemonic = py27::Mnemonic> + PartialEq> Debug for Deobfuscator<'a, O> {
61    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62        f.debug_struct("Deobfuscator")
63            .field("input", &self.input)
64            .field("enable_dotviz_graphs", &self.enable_dotviz_graphs)
65            .field("files_processed", &self.files_processed)
66            .field("graphviz_graphs", &self.graphviz_graphs)
67            .field(
68                "on_graph_generated",
69                if let Some(callback) = &self.on_graph_generated {
70                    &"Some(callback)"
71                } else {
72                    &"None"
73                },
74            )
75            .field(
76                "on_store_to_named_var",
77                if let Some(callback) = &self.on_store_to_named_var {
78                    &"Some(callback)"
79                } else {
80                    &"None"
81                },
82            )
83            .field("_opcode_phantom", &self._opcode_phantom)
84            .finish()
85    }
86}
87
88impl<'a, O: Opcode<Mnemonic = py27::Mnemonic> + PartialEq> Deobfuscator<'a, O> {
89    /// Creates a new instance of a deobfuscator
90    pub fn new(input: &'a [u8]) -> Deobfuscator<'a, O> {
91        Deobfuscator {
92            input,
93            enable_dotviz_graphs: false,
94            files_processed: AtomicUsize::new(0),
95            graphviz_graphs: HashMap::new(),
96            on_graph_generated: None,
97            on_store_to_named_var: None,
98            _opcode_phantom: Default::default(),
99        }
100    }
101
102    /// Consumes the current Deobfuscator object and returns a new one with graph
103    /// output enabled.
104    pub fn enable_graphs(mut self) -> Deobfuscator<'a, O> {
105        self.enable_dotviz_graphs = true;
106        self
107    }
108
109    /// Callback for when a new graph is generated. This may be useful if deobfuscation
110    /// fails/panics and graphs can't be written, you can use this functionality
111    /// to write graphs on-the-fly
112    pub fn on_graph_generated(
113        mut self,
114        callback: impl Fn(&str, &str) + 'static + Send + Sync,
115    ) -> Deobfuscator<'a, O> {
116        self.on_graph_generated = Some(Box::new(callback));
117        self
118    }
119
120    /// Callback for when a `STORE_NAME` or `STORE_FAST` is encountered. This
121    /// may be useful for mapping obfuscated module names to their "clean" name.
122    pub fn on_store_to_named_var(
123        mut self,
124        callback: impl Fn(
125                &Code,
126                &HashSet<String>,
127                &RwLock<&mut CodeGraph<O>>,
128                &Instruction<O>,
129                &(Option<Obj>, InstructionTracker<(NodeIndex<u32>, usize)>),
130            )
131            + 'static
132            + Send
133            + Sync,
134    ) -> Deobfuscator<'a, O> {
135        self.on_store_to_named_var = Some(Box::new(callback));
136        self
137    }
138
139    /// Returns the generated graphviz graphs after a [`deobfuscate`] has been called.
140    /// Keys are their filenames, values are the dot data.
141    pub fn graphs(&self) -> &HashMap<String, String> {
142        &self.graphviz_graphs
143    }
144
145    /// Deobfuscates the marshalled code object and returns either the deobfuscated code object
146    /// or the [`crate::errors::Error`] encountered during execution
147    pub fn deobfuscate(&self) -> Result<DeobfuscatedCodeObject, Error<O>> {
148        if let py27_marshal::Obj::Code(code) = py27_marshal::read::marshal_loads(&self.input)? {
149            // This vector will contain the input code object and all nested objects
150            let mut results = vec![];
151            let mut mapped_names = HashMap::new();
152            let mut graphs = HashMap::new();
153            let out_results = Arc::new(Mutex::new(vec![]));
154            rayon::scope(|scope| {
155                self.deobfuscate_nested_code_objects(
156                    Arc::clone(&code),
157                    scope,
158                    Arc::clone(&out_results),
159                );
160            });
161
162            let out_results = Arc::try_unwrap(out_results)
163                .unwrap_or_else(|_| panic!("failed to unwrap mapped names"))
164                .into_inner()
165                .unwrap();
166            for result in out_results {
167                let result = result?;
168                results.push((result.file_number, result.new_bytecode));
169                mapped_names.extend(result.mapped_function_names);
170                graphs.extend(result.graphviz_graphs);
171            }
172
173            // sort these items by their file number. ordering matters since our python code pulls the objects as a
174            // queue
175            results.sort_by(|a, b| a.0.cmp(&b.0));
176
177            let output_data = self
178                .rename_vars(
179                    &mut results.iter().map(|result| result.1.as_slice()),
180                    &mapped_names,
181                )
182                .unwrap();
183
184            Ok(DeobfuscatedCodeObject {
185                data: output_data,
186                graphs,
187            })
188        } else {
189            Err(Error::InvalidCodeObject)
190        }
191    }
192
193    pub(crate) fn deobfuscate_nested_code_objects(
194        &'a self,
195        code: Arc<Code>,
196        scope: &Scope<'a>,
197        out_results: Arc<Mutex<Vec<Result<DeobfuscatedBytecode, Error<O>>>>>,
198    ) {
199        let file_number = self.files_processed.fetch_add(1, Ordering::Relaxed);
200
201        let task_code = Arc::clone(&code);
202        let thread_results = Arc::clone(&out_results);
203        scope.spawn(move |_scope| {
204            let res = self.deobfuscate_code(task_code, file_number);
205            thread_results.lock().unwrap().push(res);
206        });
207
208        // We need to find and replace the code sections which may also be in the const data
209        for c in code.consts.iter() {
210            if let Obj::Code(const_code) = c {
211                let thread_results = Arc::clone(&out_results);
212                let thread_code = Arc::clone(const_code);
213
214                self.deobfuscate_nested_code_objects(thread_code, scope, thread_results);
215            }
216        }
217    }
218}
219
/// Result of a successful deobfuscation run.
///
/// Derives the common traits so callers can log, copy, and compare results.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DeobfuscatedCodeObject {
    /// Serialized code object with no header
    pub data: Vec<u8>,
    /// Graphs that were generated while deobfuscating this code object and any
    /// nested objects. Keys represent file names and their deobfuscation pass
    /// while the values represent the graphviz data in Dot format
    pub graphs: HashMap<String, String>,
}
228
/// Outcome of deobfuscating a single code object.
///
/// Derives the common traits; `Debug` in particular is needed for helpers such
/// as `Result::unwrap_err` when this type is the `Ok` side of a `Result`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub(crate) struct DeobfuscatedBytecode {
    /// Number assigned to this code object; used to order the results.
    pub(crate) file_number: usize,
    /// Bytecode produced for this code object.
    pub(crate) new_bytecode: Vec<u8>,
    /// Function-name mapping collected for this code object.
    // NOTE(review): presumably obfuscated name -> clean name; confirm against
    // the producer.
    pub(crate) mapped_function_names: HashMap<String, String>,
    /// Graphviz graphs generated for this code object.
    pub(crate) graphviz_graphs: HashMap<String, String>,
}
235
236/// Dumps all strings from a Code object. This will go over all of the `names`, variable names (`varnames`),
237/// `consts`, and all strings from any nested code objects.
238pub fn dump_strings<'a>(
239    pyc_filename: &'a Path,
240    data: &[u8],
241) -> Result<Vec<CodeObjString<'a>>, Error<Standard>> {
242    if let py27_marshal::Obj::Code(code) = py27_marshal::read::marshal_loads(data)? {
243        Ok(dump_codeobject_strings(pyc_filename, code))
244    } else {
245        Err(Error::InvalidCodeObject)
246    }
247}
248
249/// Dumps all strings from a Code object. This will go over all of the `names`, variable names (`varnames`),
250/// `consts`, and all strings from any nested code objects.
251fn dump_codeobject_strings(pyc_filename: &Path, code: Arc<Code>) -> Vec<CodeObjString> {
252    let new_strings = Mutex::new(vec![]);
253    code.names.par_iter().for_each(|name| {
254        new_strings.lock().unwrap().push(CodeObjString::new(
255            code.as_ref(),
256            pyc_filename,
257            crate::strings::StringType::Name,
258            name.to_string().as_ref(),
259        ))
260    });
261
262    code.varnames.par_iter().for_each(|name| {
263        new_strings.lock().unwrap().push(CodeObjString::new(
264            code.as_ref(),
265            pyc_filename,
266            crate::strings::StringType::VarName,
267            name.to_string().as_ref(),
268        ))
269    });
270
271    code.consts.as_ref().par_iter().for_each(|c| {
272        if let py27_marshal::Obj::String(s) = c {
273            new_strings.lock().unwrap().push(CodeObjString::new(
274                code.as_ref(),
275                pyc_filename,
276                crate::strings::StringType::Const,
277                s.to_string().as_ref(),
278            ))
279        }
280    });
281
282    // We need to find and replace the code sections which may also be in the const data
283    code.consts.par_iter().for_each(|c| {
284        if let Obj::Code(const_code) = c {
285            // Call deobfuscate_bytecode first since the bytecode comes before consts and other data
286            let mut strings = dump_codeobject_strings(pyc_filename, Arc::clone(&const_code));
287            new_strings.lock().unwrap().append(&mut strings);
288        }
289    });
290
291    new_strings.into_inner().unwrap()
292}