malwaredb_server/db/
types.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use malwaredb_api::{SupportedFileType, SupportedFileTypes};
4use malwaredb_types::utils::EntropyCalc;
5
6use fuzzyhash::FuzzyHash;
7use human_hash::humanize;
8use magic::cookie::DatabasePaths;
9use malwaredb_lzjd::{LZDict, Murmur3HashState};
10use md5::Md5;
11use sha1::Sha1;
12use sha2::{Digest, Sha256, Sha384, Sha512};
13use tlsh_fixed::TlshBuilder;
14use tracing::error;
15use uuid::Uuid;
16
/// Metadata about a file for storing as a record in Malware DB
///
/// Built from the raw file bytes by [`FileMetadata::new`]. All cryptographic
/// hashes are stored as hex-encoded strings.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// File name, if one was supplied; [`FileMetadata::new`] stores it in ASCII lower case
    pub name: Option<String>,

    /// File size in bytes
    pub size: u64,

    /// Entropy of the file, as computed over the whole byte sequence
    pub entropy: f32,

    /// SHA-1 hash (hex-encoded)
    pub sha1: String,

    /// SHA-256 hash (hex-encoded)
    pub sha256: String,

    /// SHA-384 hash (hex-encoded)
    pub sha384: String,

    /// SHA-512 hash (hex-encoded)
    pub sha512: String,

    /// MD5 hash (hex-encoded)
    pub md5: String,

    /// `LZJD` similarity hash, in the string form produced by `LZDict`
    pub lzjd: Option<String>,

    /// `SSDeep` similarity hash, if the file is large enough
    ///
    /// NOTE(review): [`FileMetadata::new`] currently always stores `Some(..)` here,
    /// whatever the input size -- confirm whether a size check is intended.
    pub ssdeep: Option<String>,

    /// Trend Micro's similarity hash (distance metric); `None` when a TLSH hash
    /// could not be built for the contents
    pub tlsh: Option<String>,

    /// Human Hash, based on <https://github.com/zacharyvoase/humanhash>
    ///
    /// Derived from the file's MD5 digest.
    pub humanhash: String,

    /// File command (or libmagic) description of the file; left empty when
    /// libmagic could not produce a description
    pub file_command: String,
}
59
60impl FileMetadata {
61    /// Get the collection of file measurements given a byte sequence
62    ///
63    /// # Panics
64    ///
65    /// This won't actually panic despite a call to `.unwrap()` because the input to that function
66    /// is known to always be the correct size.
67    pub fn new(contents: &[u8], name: Option<&str>) -> Self {
68        let mut sha1 = Sha1::new();
69        sha1.update(contents);
70        let sha1 = sha1.finalize();
71
72        let mut sha256 = Sha256::new();
73        sha256.update(contents);
74        let sha256 = sha256.finalize();
75
76        let mut sha384 = Sha384::new();
77        sha384.update(contents);
78        let sha384 = sha384.finalize();
79
80        let mut sha512 = Sha512::new();
81        sha512.update(contents);
82        let sha512 = sha512.finalize();
83
84        let mut md5 = Md5::new();
85        md5.update(contents);
86        let md5 = md5.finalize();
87
88        let build_hasher = Murmur3HashState::default();
89        let lzjd_str =
90            LZDict::from_bytes_stream(contents.iter().copied(), &build_hasher).to_string();
91
92        let mut builder = TlshBuilder::new(
93            tlsh_fixed::BucketKind::Bucket256,
94            tlsh_fixed::ChecksumKind::ThreeByte,
95            tlsh_fixed::Version::Version4,
96        );
97
98        builder.update(contents);
99
100        let tlsh = if let Ok(hasher) = builder.build() {
101            Some(hasher.hash())
102        } else {
103            None
104        };
105
106        let md5 = hex::encode(md5);
107
108        // This won't panic since the MD5 hash is 16 bytes long
109        let uuid = Uuid::try_parse(&md5).unwrap();
110
111        let file_command = {
112            if let Ok(cookie) = magic::Cookie::open(magic::cookie::Flags::ERROR) {
113                let db_paths = DatabasePaths::default();
114                if let Ok(cookie) = cookie.load(&db_paths) {
115                    if let Ok(output) = cookie.buffer(contents) {
116                        output
117                    } else {
118                        error!("LibMagic: failed to get output for buffer");
119                        String::new()
120                    }
121                } else {
122                    error!("LibMagic: failed to load signature database");
123                    String::new()
124                }
125            } else {
126                error!("LibMagic: failed to get handle");
127                String::new()
128            }
129        };
130
131        Self {
132            name: name.map(str::to_ascii_lowercase),
133            size: contents.len() as u64,
134            entropy: contents.entropy(),
135            sha1: hex::encode(sha1),
136            sha256: hex::encode(sha256),
137            sha384: hex::encode(sha384),
138            sha512: hex::encode(sha512),
139            md5,
140            lzjd: Some(lzjd_str),
141            ssdeep: Some(FuzzyHash::new(contents).to_string()),
142            tlsh,
143            humanhash: humanize(&uuid, 4),
144            file_command,
145        }
146    }
147}
148
/// A file type known to Malware DB; a magic number has to be matched to a database ID.
#[derive(Debug, Clone)]
pub struct FileType {
    /// Database ID number
    pub id: u32,

    /// Friendly name
    pub name: String,

    /// Description of the type, if there is one
    pub description: Option<String>,

    /// Magic numbers as bytes
    /// These are the first few bytes of the file which identify its type.
    /// Some types have more than one possible magic number, though it's rare,
    /// so each inner `Vec<u8>` is one alternative.
    pub magic: Vec<Vec<u8>>,

    /// Whether or not this file is executable on some system
    /// Assumption: if not executable, it's a document
    pub executable: bool,
}
170
/// File types: a simple `Vec` wrapper around [`FileType`], which allows the whole
/// collection to be converted into [`SupportedFileTypes`].
pub struct FileTypes(pub Vec<FileType>);
173
174impl From<FileType> for SupportedFileType {
175    fn from(value: FileType) -> Self {
176        Self {
177            name: value.name,
178            magic: value.magic.iter().map(hex::encode).collect(),
179            is_executable: value.executable,
180            description: value.description,
181        }
182    }
183}
184
185impl From<FileTypes> for SupportedFileTypes {
186    fn from(value: FileTypes) -> Self {
187        Self {
188            types: value.0.into_iter().map(std::convert::Into::into).collect(),
189            message: None,
190        }
191    }
192}
193
#[cfg(test)]
mod test {
    use super::*;
    use std::str::FromStr;

    /// Every similarity hash must be produced for a real binary, and each hash
    /// compared against itself must report a perfect match.
    #[test]
    fn meta_and_sim_hashes() {
        let contents = include_bytes!("../../../types/testdata/elf/elf_haiku_x86").to_vec();
        let meta = FileMetadata::new(&contents, Some("elf_haiku_x86"));

        let ssdeep = meta.ssdeep.expect("SSDeep hash should be present");
        let tlsh = meta.tlsh.expect("TLSH hash should be present");
        let lzjd = meta.lzjd.expect("LZJD hash should be present");

        println!("LZJD: {lzjd}");
        println!("Tlsh: {tlsh}");
        println!("SSDeep: {ssdeep}");
        println!("Human hash: {}", meta.humanhash);
        println!("File command: {}", meta.file_command);

        assert_eq!(FuzzyHash::compare(ssdeep.clone(), ssdeep).unwrap(), 100);

        let tlsh =
            tlsh_fixed::Tlsh::from_str(&tlsh).expect("failed to convert tlsh string to object");
        assert_eq!(tlsh.diff(&tlsh, true), 0);

        let lzjd =
            LZDict::from_base64_string(&lzjd).expect("failed to convert lzjd string to object");
        // Compare the absolute difference: without `.abs()` any similarity below
        // 1.0 gives a negative difference and the check could never fail.
        assert!((lzjd.jaccard_similarity(&lzjd) - 1.0f64).abs() <= f64::EPSILON);
    }
}