1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
use malwaredb_api::{SupportedFileType, SupportedFileTypes};
use malwaredb_types::utils::EntropyCalc;

use fuzzyhash::FuzzyHash;
use human_hash::humanize;
use magic::cookie::DatabasePaths;
use malwaredb_lzjd::{LZDict, Murmur3HashState};
use md5::Md5;
use sha1::Sha1;
use sha2::{Digest, Sha256, Sha384, Sha512};
use tlsh_fixed::TlshBuilder;
use tracing::error;
use uuid::Uuid;

/// Metadata about a file for storing as a record in MalwareDB
///
/// Instances are produced from raw file contents by [`FileMetadata::new`]. All cryptographic
/// digests are stored as hex-encoded strings.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// File name, if one was provided; [`FileMetadata::new`] stores it ASCII-lowercased
    pub name: Option<String>,

    /// Size in bytes
    pub size: u32,

    /// Entropy of the file contents
    pub entropy: f32,

    /// SHA-1 hash, hex-encoded
    pub sha1: String,

    /// SHA-256 hash, hex-encoded
    pub sha256: String,

    /// SHA-384 hash, hex-encoded
    pub sha384: String,

    /// SHA-512 hash, hex-encoded
    pub sha512: String,

    /// MD5 hash, hex-encoded
    pub md5: String,

    /// LZJD similarity hash ([`FileMetadata::new`] always fills this in)
    pub lzjd: Option<String>,

    /// SSDeep similarity hash ([`FileMetadata::new`] always fills this in)
    pub ssdeep: Option<String>,

    /// Trend Micro's similarity hash (distance metric);
    /// `None` when the TLSH builder could not produce a hash for the contents
    pub tlsh: Option<String>,

    /// Human Hash derived from the MD5 digest, based on <https://github.com/zacharyvoase/humanhash>
    pub humanhash: String,

    /// File command (or libmagic) description of the file;
    /// left empty by [`FileMetadata::new`] when libmagic fails
    pub file_command: String,
}

impl FileMetadata {
    /// Get the collection of file measurements given a byte sequence
    pub fn new(contents: &[u8], name: Option<&str>) -> Self {
        let mut sha1 = Sha1::new();
        sha1.update(contents);
        let sha1 = sha1.finalize();

        let mut sha256 = Sha256::new();
        sha256.update(contents);
        let sha256 = sha256.finalize();

        let mut sha384 = Sha384::new();
        sha384.update(contents);
        let sha384 = sha384.finalize();

        let mut sha512 = Sha512::new();
        sha512.update(contents);
        let sha512 = sha512.finalize();

        let mut md5 = Md5::new();
        md5.update(contents);
        let md5 = md5.finalize();

        let build_hasher = Murmur3HashState::default();
        let lzjd_str =
            LZDict::from_bytes_stream(contents.iter().copied(), &build_hasher).to_string();

        let mut builder = TlshBuilder::new(
            tlsh_fixed::BucketKind::Bucket256,
            tlsh_fixed::ChecksumKind::ThreeByte,
            tlsh_fixed::Version::Version4,
        );

        builder.update(contents);

        let tlsh = if let Ok(hasher) = builder.build() {
            Some(hasher.hash())
        } else {
            None
        };

        let md5 = hex::encode(md5);
        let uuid = Uuid::parse_str(&md5).unwrap();

        let file_command = {
            if let Ok(cookie) = magic::Cookie::open(magic::cookie::Flags::ERROR) {
                let db_paths = DatabasePaths::default();
                if let Ok(cookie) = cookie.load(&db_paths) {
                    if let Ok(output) = cookie.buffer(contents) {
                        output
                    } else {
                        error!("LibMagic: failed to get output for buffer");
                        "".into()
                    }
                } else {
                    error!("LibMagic: failed to load signature database");
                    "".into()
                }
            } else {
                error!("LibMagic: failed to get handle");
                "".into()
            }
        };

        Self {
            name: name.map(|n| n.to_ascii_lowercase()),
            size: contents.len() as u32,
            entropy: contents.entropy(),
            sha1: hex::encode(sha1),
            sha256: hex::encode(sha256),
            sha384: hex::encode(sha384),
            sha512: hex::encode(sha512),
            md5,
            lzjd: Some(lzjd_str),
            ssdeep: Some(FuzzyHash::new(contents).to_string()),
            tlsh,
            humanhash: humanize(&uuid, 4),
            file_command,
        }
    }
}

/// File Types known to MalwareDB; a magic number has to be matched to a database ID.
#[derive(Debug, Clone)]
pub struct FileType {
    /// Database ID number
    pub id: i32,

    /// Friendly name
    pub name: String,

    /// Description of the type
    pub description: Option<String>,

    /// Magic numbers as bytes
    /// These are the first few bytes of the file which identify its type
    /// Some types have more than one possible magic number, though it's rare
    pub magic: Vec<Vec<u8>>,

    /// Whether or not this file is executable on some system
    /// Assumption: if not executable, it's a document
    pub executable: bool,
}

/// File types: a simple [`Vec`] wrapper around [`FileType`].
///
/// Derives the same traits as its element type so the collection can be logged and copied.
#[derive(Debug, Clone)]
pub struct FileTypes(pub Vec<FileType>);

impl From<FileType> for SupportedFileType {
    fn from(value: FileType) -> Self {
        Self {
            name: value.name,
            magic: value.magic.iter().map(hex::encode).collect(),
            is_executable: value.executable,
            description: value.description,
        }
    }
}

impl From<FileTypes> for SupportedFileTypes {
    fn from(value: FileTypes) -> Self {
        Self {
            types: value.0.into_iter().map(|t| t.into()).collect(),
            message: None,
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::str::FromStr;

    /// Builds metadata for a sample ELF binary and checks that each similarity hash is
    /// present, parses back from its string form, and compares as identical to itself.
    #[test]
    fn meta_and_sim_hashes() {
        let sample = include_bytes!("../../../types/testdata/elf/elf_haiku_x86").to_vec();
        let metadata = FileMetadata::new(&sample, Some("elf_haiku_x86"));

        // All three similarity hashes must be available for this sample.
        assert!(metadata.lzjd.is_some());
        assert!(metadata.tlsh.is_some());
        assert!(metadata.ssdeep.is_some());

        let ssdeep = metadata.ssdeep.unwrap();
        let tlsh = metadata.tlsh.unwrap();
        let lzjd = metadata.lzjd.unwrap();

        println!("LZJD: {lzjd}");
        println!("Tlsh: {tlsh}");
        println!("SSDeep: {ssdeep}");
        println!("Human hash: {}", metadata.humanhash);
        println!("File command: {}", metadata.file_command);

        // SSDeep: comparing a hash with itself yields the maximum score.
        let ssdeep_score = FuzzyHash::compare(ssdeep.clone(), ssdeep).unwrap();
        assert_eq!(ssdeep_score, 100);

        // TLSH: the distance from a hash to itself is zero.
        let tlsh_hash =
            tlsh_fixed::Tlsh::from_str(&tlsh).expect("failed to convert tlsh string to object");
        assert_eq!(tlsh_hash.diff(&tlsh_hash, true), 0);

        // LZJD: the Jaccard similarity of a dictionary with itself is one.
        let lzjd_dict =
            LZDict::from_base64_string(&lzjd).expect("failed to convert lzjd string to object");
        assert_eq!(lzjd_dict.jaccard_similarity(&lzjd_dict), 1.0);
    }
}