Skip to main content

malware_modeler/
unzip.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Functions for looking at large Zip files containing malware samples.
4//! Made with [VirusShare](https://virusshare.com) in mind.
5
6use crate::sorting::{FileTypeUnion, hash_depth};
7
8use std::collections::HashMap;
9use std::path::Path;
10
11use anyhow::{Result, ensure};
12use indicatif::{ProgressBar, ProgressStyle};
13use sha2::{Digest, Sha256};
14
15/// Unzip files of a specific type, or by all known types, from the specified ZIP archive to a destination directory
16/// Returns the number of files extracted.
17///
18/// # Errors
19///
20/// Errors result if the Zip file can't be read or if files can't be written in the destination directory.
21#[allow(clippy::too_many_lines)]
22pub fn unzip_files_by_type<P: AsRef<Path>>(
23    source: P,
24    destination: P,
25    password: Option<&String>,
26    depth: u8,
27    file_type: Option<FileTypeUnion>,
28    #[cfg(feature = "libmagic")] file_cmd: Option<&str>,
29    keep_unknowns: bool,
30) -> Result<usize> {
31    ensure!(source.as_ref().is_file(), "Source must be a file");
32    ensure!(
33        destination.as_ref().is_dir(),
34        "Destination must be a directory"
35    );
36
37    #[cfg(feature = "libmagic")]
38    let cookie = {
39        let cookie = magic::Cookie::open(magic::cookie::Flags::ERROR)?;
40        let database = magic::cookie::DatabasePaths::default();
41        cookie
42            .load(&database)
43            .map_err(|e| anyhow::anyhow!("Failed to load magic database: {e}"))?
44    };
45
46    let mut extracted_files = 0;
47    let file = std::fs::File::open(source)?;
48    let mut archive = zip::ZipArchive::new(file)?;
49    let pb = progress_bar_with_eta(archive.len() as u64);
50    for i in 0..archive.len() {
51        let mut file = if let Some(password) = password {
52            let Ok(f) = archive.by_index_decrypt(i, password.as_bytes()) else {
53                continue;
54            };
55            f
56        } else {
57            match archive.by_index(i) {
58                Ok(f) => f,
59                Err(e) => {
60                    eprintln!("ZipError: {e}");
61                    continue;
62                }
63            }
64        };
65
66        if (*file.name()).ends_with('/') {
67            continue;
68        }
69
70        let mut contents = Vec::new();
71        if let Err(e) = std::io::copy(&mut file, &mut contents) {
72            eprintln!("ZipError: {e}");
73            continue;
74        }
75
76        let hash = hex::encode(Sha256::digest(&contents));
77        #[cfg(not(feature = "libmagic"))]
78        let mut destination_directory = if let Some(file_type) = &file_type {
79            if file_type.matches(&contents) {
80                let mut dest = destination.as_ref().to_owned();
81                dest.push(file_type.to_string());
82                dest.push(hash_depth(&hash, depth));
83                dest
84            } else {
85                pb.inc(1);
86                continue;
87            }
88        } else {
89            let this_type = FileTypeUnion::from_bytes(&contents);
90            if !keep_unknowns && this_type.is_unknown() {
91                pb.inc(1);
92                continue;
93            }
94
95            let mut dest = destination.as_ref().to_owned();
96            dest.push(this_type.to_string());
97            dest.push(hash_depth(&hash, depth));
98            dest
99        };
100
101        #[cfg(feature = "libmagic")]
102        let mut destination_directory = if let Some(file_cmd) = file_cmd {
103            let file_cmd = file_cmd.to_lowercase();
104            let result = cookie.buffer(&contents)?.to_lowercase();
105            if result.contains(&file_cmd) {
106                let mut dest = destination.as_ref().to_owned();
107                dest.push(file_cmd);
108                dest.push(hash_depth(&hash, depth));
109                dest
110            } else {
111                pb.inc(1);
112                continue;
113            }
114        } else if let Some(file_type) = &file_type {
115            if file_type.matches(&contents) {
116                let mut dest = destination.as_ref().to_owned();
117                dest.push(file_type.to_string());
118                dest.push(hash_depth(&hash, depth));
119                dest
120            } else {
121                pb.inc(1);
122                continue;
123            }
124        } else {
125            let this_type = FileTypeUnion::from_bytes(&contents);
126            if !keep_unknowns && this_type.is_unknown() {
127                pb.inc(1);
128                continue;
129            }
130
131            let mut dest = destination.as_ref().to_owned();
132            dest.push(this_type.to_string());
133            dest.push(hash_depth(&hash, depth));
134            dest
135        };
136
137        if let Err(e) = std::fs::create_dir_all(&destination_directory) {
138            eprintln!(
139                "ZipError creating directories {}: {e}",
140                destination_directory.display()
141            );
142            return Err(e.into());
143        }
144        destination_directory.push(hash);
145        if let Err(e) = std::fs::write(&destination_directory, contents) {
146            eprintln!(
147                "ZipError writing file {}: {e}",
148                destination_directory.display()
149            );
150            return Err(e.into());
151        }
152
153        extracted_files += 1;
154        pb.inc(1);
155    }
156    pb.finish_and_clear();
157
158    Ok(extracted_files)
159}
160
161/// Summary of a Zip archive's contents
162pub struct ZipSummaryDetails {
163    /// Known file types and their occurrences in the Zip archive
164    pub file_type_counts: HashMap<FileTypeUnion, usize>,
165
166    /// Unknown file types where their first few bytes are captured
167    #[cfg(not(feature = "libmagic"))]
168    pub unknown_magic_counts: HashMap<Vec<u8>, usize>,
169
170    /// Unknown file types where their first few bytes are captured and the result from libmagic
171    #[cfg(feature = "libmagic")]
172    pub unknown_magic_counts: HashMap<Vec<u8>, (usize, String)>,
173
174    /// Total number of files in the Zip archive
175    pub total_files: usize,
176}
177
178/// Attempt to identify all file types contained within a zip file, returning a summary of file
179/// types and number of observations and total number of files.
180///
181/// # Errors
182///
183/// Returns errors if the Zip is malformed or if a password is required and missing.
184pub fn zip_file_type_counts<P: AsRef<Path>>(
185    source: P,
186    password: Option<&String>,
187    unknown_magic: usize,
188) -> Result<ZipSummaryDetails> {
189    ensure!(source.as_ref().is_file(), "Source must be a file");
190
191    #[cfg(feature = "libmagic")]
192    let (cookie, mut unknowns) = {
193        let cookie = magic::Cookie::open(magic::cookie::Flags::ERROR)?;
194        let database = &magic::cookie::DatabasePaths::default();
195        let cookie = cookie
196            .load(database)
197            .map_err(|e| anyhow::anyhow!("Failed to load magic database: {e}"))?;
198        (cookie, HashMap::<Vec<u8>, (usize, String)>::new())
199    };
200
201    #[cfg(not(feature = "libmagic"))]
202    let mut unknowns = HashMap::new();
203
204    let mut summary = HashMap::new();
205    let mut total_files = 0;
206    let file = std::fs::File::open(source)?;
207    let mut archive = zip::ZipArchive::new(file)?;
208    let pb = progress_bar_with_eta(archive.len() as u64);
209    for i in 0..archive.len() {
210        let mut file = if let Some(password) = password {
211            let Ok(f) = archive.by_index_decrypt(i, password.as_bytes()) else {
212                continue;
213            };
214            f
215        } else {
216            match archive.by_index(i) {
217                Ok(f) => f,
218                Err(e) => {
219                    eprintln!("ZipError: {e}");
220                    continue;
221                }
222            }
223        };
224
225        if (*file.name()).ends_with('/') {
226            continue;
227        }
228
229        let mut contents = Vec::new();
230        if let Err(e) = std::io::copy(&mut file, &mut contents) {
231            eprintln!("ZipError: {e}");
232            continue;
233        }
234
235        let this_type = FileTypeUnion::from_bytes(&contents);
236        summary
237            .entry(this_type)
238            .and_modify(|e| *e += 1)
239            .or_insert(1);
240
241        if this_type.is_unknown() && unknown_magic > 0 {
242            let first_bytes = contents
243                .iter()
244                .take(unknown_magic)
245                .copied()
246                .collect::<Vec<_>>();
247
248            #[cfg(not(feature = "libmagic"))]
249            unknowns
250                .entry(first_bytes)
251                .and_modify(|e| *e += 1)
252                .or_insert(1);
253
254            #[cfg(feature = "libmagic")]
255            {
256                if let Some(entry) = unknowns.get_mut(&first_bytes) {
257                    entry.0 += 1;
258                } else {
259                    let result = cookie.buffer(&contents)?;
260                    unknowns.insert(first_bytes, (1, result));
261                }
262            }
263        }
264
265        total_files += 1;
266        pb.inc(1);
267    }
268    pb.finish_and_clear();
269
270    Ok(ZipSummaryDetails {
271        file_type_counts: summary,
272        unknown_magic_counts: unknowns,
273        total_files,
274    })
275}
276
277/// Create a progress bar with an ETA, and it will not panic despite the `unwrap()`.
278fn progress_bar_with_eta(len: u64) -> ProgressBar {
279    ProgressBar::new(len)
280        .with_style(ProgressStyle::with_template("{wide_bar} {pos}/{len} {eta}").unwrap())
281}