ssdeep/lib.rs
1// ssdeep-rs: A Rust wrapper for ssdeep.
2//
3// Copyright (c) 2016 Petr Zemek <s3rvac@petrzemek.net>
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18//! A Rust wrapper for [ssdeep by Jesse
19//! Kornblum](https://ssdeep-project.github.io/ssdeep/), which is a C library
20//! for computing [context triggered piecewise
21//! hashes](http://dfrws.org/2006/proceedings/12-Kornblum.pdf) (CTPH). Also
22//! called fuzzy hashes, CTPH can match inputs that have homologies. Such
23//! inputs have sequences of identical bytes in the same order, although bytes
24//! in between these sequences may be different in both content and length. In
25//! contrast to standard hashing algorithms, CTPH can be used to identify files
26//! that are highly similar but not identical.
27//!
28//! Usage
29//! -----
30//!
31//! To compute the fuzzy hash of the given bytes, use
32//! [`hash()`](fn.hash.html):
33//! ```
34//! extern crate ssdeep;
35//!
36//! let h = ssdeep::hash(b"Hello there!").unwrap();
37//! assert_eq!(h, "3:aNRn:aNRn");
38//! ```
39//!
40//! To obtain the fuzzy hash of the contents of a file, use
41//! [`hash_from_file()`](fn.hash_from_file.html):
42//! ```
43//! let h = ssdeep::hash_from_file("tests/file.txt").unwrap();
44//! ```
45//!
46//! To compare two fuzzy hashes, use [`compare()`](fn.compare.html), which
47//! returns an integer between 0 (no match) and 100:
48//! ```
49//! let h1 = "3:AXGBicFlgVNhBGcL6wCrFQEv:AXGHsNhxLsr2C";
50//! let h2 = "3:AXGBicFlIHBGcL6wCrFQEv:AXGH6xLsr2Cx";
51//! let score = ssdeep::compare(h1, h2).unwrap();
52//! assert_eq!(score, 22);
53//! ```
54//!
55//! Each of these functions returns a
56//! [`Result`](https://doc.rust-lang.org/std/result/enum.Result.html), where an
57//! error is returned when the underlying C function fails.
58
59extern crate libc;
60extern crate libfuzzy_sys as raw;
61
62use libc::c_char;
63use std::error;
64use std::ffi::CString;
65use std::fmt;
66use std::path::Path;
67
/// Errors that this library can report.
///
/// The type is cheap to duplicate and supports full equality, so callers can
/// store, clone, and compare errors freely.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// A function from the underlying C library reported a failure.
    CFunctionFailed {
        /// Name of the C function that failed.
        name: String,
        /// Return code produced by that function.
        return_code: i32,
    },
}

impl error::Error for Error {}

impl fmt::Display for Error {
    /// Writes a human-readable message that names the failed C function and
    /// includes its return code.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Deliberately no catch-all arm: adding a variant later must force
        // this match to be revisited.
        match self {
            Error::CFunctionFailed { name, return_code } => write!(
                f,
                "ssdeep C function {}() failed with return code {}",
                name, return_code
            ),
        }
    }
}
95
96/// The result type used by the library.
97pub type Result<T> = std::result::Result<T, Error>;
98
99/// Computes the match score between two fuzzy hashes.
100///
101/// Returns a value from 0 to 100 indicating the match score of the two hashes.
102/// A match score of zero indicates that the hashes did not match. When an
103/// error occurs, it returns [`Error`](enum.Error.html).
104///
105/// # Examples
106///
107/// When the hashes are identical, it returns 100:
108/// ```
109/// let h1 = "3:AXGBicFlgVNhBGcL6wCrFQEv:AXGHsNhxLsr2C";
110/// let h2 = "3:AXGBicFlgVNhBGcL6wCrFQEv:AXGHsNhxLsr2C";
111/// assert_eq!(ssdeep::compare(h1, h2), Ok(100));
112/// ```
113///
114/// When the hashes are similar, it returns a positive integer:
115/// ```
116/// let h1 = "3:AXGBicFlgVNhBGcL6wCrFQEv:AXGHsNhxLsr2C";
117/// let h2 = "3:AXGBicFlIHBGcL6wCrFQEv:AXGH6xLsr2Cx";
118/// assert_eq!(ssdeep::compare(h1, h2), Ok(22));
119/// ```
120///
121/// When the hashes have no similarity at all, it returns zero:
122/// ```
123/// let h1 = "3:u+N:u+N";
124/// let h2 = "3:OWIXTn:OWQ";
125/// assert_eq!(ssdeep::compare(h1, h2), Ok(0));
126/// ```
127///
128/// When either of the hashes is invalid, it returns an error:
129/// ```
130/// let h1 = "XYZ";
131/// let h2 = "3:tc:u";
132/// assert_eq!(
133/// ssdeep::compare(h1, h2),
134/// Err(ssdeep::Error::CFunctionFailed {
135/// name: "fuzzy_compare".to_string(),
136/// return_code: -1,
137/// })
138/// );
139///
140/// ```
141///
142/// # Panics
143///
144/// If either of the hashes contain a null byte. Note that
145/// [`hash()`](fn.hash.html) never returns a hash with a null byte, so this may
146/// happen only if you handcrafted the hashes or obtained them from other
147/// sources.
148///
149/// # Implementation details
150///
151/// Internally, it calls the `fuzzy_compare()` function from the underlying C
152/// library. The return value `-1` is translated into
153/// [`Error`](enum.Error.html).
154pub fn compare(hash1: &str, hash2: &str) -> Result<u8> {
155 let h1 = str_to_cstring(hash1);
156 let h2 = str_to_cstring(hash2);
157 let score = unsafe {
158 raw::fuzzy_compare(
159 h1.as_bytes_with_nul().as_ptr() as *const c_char,
160 h2.as_bytes_with_nul().as_ptr() as *const c_char,
161 )
162 };
163 if score == -1 {
164 Err(Error::CFunctionFailed {
165 name: "fuzzy_compare".to_string(),
166 return_code: -1,
167 })
168 } else {
169 Ok(score as u8)
170 }
171}
172
173/// Computes the fuzzy hash of bytes.
174///
175/// Returns the fuzzy hash of the given bytes. When an error occurs, it returns
176/// [`Error`](enum.Error.html).
177///
178/// # Examples
179///
180/// ```
181/// let h = ssdeep::hash(b"Hello there!").unwrap();
182/// assert_eq!(h, "3:aNRn:aNRn");
183/// ```
184///
185/// # Panics
186///
187/// * If the length of the bytes is strictly greater than `2^32 - 1` bytes. The
188/// reason for this is that the corresponding function from the underlying C
189/// library accepts the length of the input buffer as an unsigned 32b
190/// integer.
191/// * If the function from the underyling C library provides a non-ASCII hash.
192/// This would be a bug in the C library.
193///
194/// # Implementation details
195///
196/// Internally, it calls the `fuzzy_hash_buf()` function from the underlying C
197/// library. A non-zero return value is translated into
198/// [`Error`](enum.Error.html).
199pub fn hash(buf: &[u8]) -> Result<String> {
200 assert!(buf.len() <= u32::max_value() as usize);
201
202 let mut result = create_buffer_for_result();
203 let rc = unsafe {
204 raw::fuzzy_hash_buf(
205 buf.as_ptr(),
206 buf.len() as u32,
207 result.as_mut_ptr() as *mut c_char,
208 )
209 };
210 result_buffer_to_string("fuzzy_hash_buf", result, rc)
211}
212
213/// Computes the fuzzy hash of the contents of a file.
214///
215/// Returns the fuzzy hash of the given file. When an error occurs, it returns
216/// [`Error`](enum.Error.html).
217///
218/// # Examples
219///
220/// ```
221/// let h = ssdeep::hash_from_file("tests/file.txt").unwrap();
222/// assert_eq!(h, "48:9MABzSwnjpDeSrLp8+nagE4f3ZMvcDT0MIhqy6Ic:9XMwnjdeSHS+n5ZfScX0MJ7");
223/// ```
224///
225/// # Panics
226///
227/// * If the path to the file cannot be converted into a string or it contains
228/// a null byte.
229/// * If the function from the underyling C library provides a non-ASCII hash.
230/// This would be a bug in the C library.
231///
232/// # Implementation details
233///
234/// Internally, it calls the `fuzzy_hash_filename()` function from the
235/// underlying C library. A non-zero return value is translated into
236/// [`Error`](enum.Error.html).
237pub fn hash_from_file<P: AsRef<Path>>(file_path: P) -> Result<String> {
238 let mut result = create_buffer_for_result();
239 let fp = path_as_cstring(file_path);
240 let rc = unsafe {
241 raw::fuzzy_hash_filename(
242 fp.as_bytes_with_nul().as_ptr() as *const c_char,
243 result.as_mut_ptr() as *mut c_char,
244 )
245 };
246 result_buffer_to_string("fuzzy_hash_filename", result, rc)
247}
248
249fn path_as_cstring<P: AsRef<Path>>(path: P) -> CString {
250 // We can unwrap() the result because if the path cannot be converted into
251 // a string, we panic, as documented in functions that call this function.
252 str_to_cstring(path.as_ref().to_str().unwrap())
253}
254
/// Converts a string slice into a NUL-terminated C string.
///
/// Panics when the input contains a null byte.
fn str_to_cstring(s: &str) -> CString {
    // Panicking on an interior null byte is the documented behavior of the
    // functions that call this one, so unwrap() is intentional here.
    let converted = CString::new(s);
    converted.unwrap()
}
260
261fn create_buffer_for_result() -> Vec<u8> {
262 // From fuzzy.h: "The buffer into which the fuzzy hash is stored has to be
263 // allocated to hold at least FUZZY_MAX_RESULT bytes."
264 Vec::with_capacity(raw::FUZZY_MAX_RESULT)
265}
266
267fn result_buffer_to_string(libfuzzy_func: &str, mut result: Vec<u8>, rc: i32) -> Result<String> {
268 if rc != 0 {
269 // The function from libfuzzy failed, so there is no result.
270 return Err(Error::CFunctionFailed {
271 name: libfuzzy_func.to_string(),
272 return_code: rc,
273 });
274 }
275
276 // Since the resulting vector that holds the fuzzy hash was populated in
277 // the underlying C library, we have to adjust its length because at this
278 // point, the vector thinks that its length is zero. We do this by finding
279 // the first null byte.
280 unsafe {
281 // Resize the vector to the maximum length before fiding the first null
282 // byte because slice::get_unchecked() panics when the index is not
283 // within the slice. The length will be adjusted shortly.
284 result.set_len(raw::FUZZY_MAX_RESULT);
285
286 let mut len = 0;
287 for i in 0..raw::FUZZY_MAX_RESULT {
288 if *result.get_unchecked(i) == 0 {
289 break;
290 }
291 len += 1;
292 }
293 result.set_len(len);
294 }
295
296 // The result should only be composed of ASCII characters, i.e. the result
297 // should be convertible to UTF-8. The presence of non-ASCII character
298 // would be a bug in libfuzzy, in which case we panic.
299 Ok(String::from_utf8(result).unwrap())
300}