ssdeep/internals/generate/hashes/
rolling_hash.rs

1// SPDX-License-Identifier: GPL-2.0-or-later
2// SPDX-FileCopyrightText: Copyright Andrew Tridgell <tridge@samba.org> 2002
3// SPDX-FileCopyrightText: Copyright (C) 2006 ManTech International Corporation
4// SPDX-FileCopyrightText: Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
5// SPDX-FileCopyrightText: Copyright (C) 2017, 2023–2025 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
6
7//! A 32-bit rolling hash as used in the fuzzy hash generator.
8
9use core::ops::AddAssign;
10
11use crate::internals::macros::invariant;
12
13/// See [`RollingHash::WINDOW_SIZE`].
14pub const ROLLING_WINDOW: usize = 7;
15
16// grcov-excl-br-start:STRUCT_MEMBER
17
18/// Hasher which computes a variant of 32-bit rolling hash as used in ssdeep.
19///
20/// In ssdeep, this is the most important hash function to decide whether to
21/// trigger a context update based on the last 7 bytes it met.
22///
23/// Specifically, [`RollingHash`] implements the rolling hash implemented in
24/// ssdeep version 2.13 or later.  This is the first version that officially
25/// supported ≧4GiB files and implemented a true rolling hash function.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub struct RollingHash {
28    /// Current rolling window index.
29    ///
30    /// **Performance Analysis:**
31    /// Substituting this variable's type to `usize` (more semantically correct)
32    /// resulted in some slowdown (~10%).  Keeping this state for now.
33    index: u32,
34
35    /// Hash component 1.
36    ///
37    /// This is the sum of the last [`WINDOW_SIZE`](Self::WINDOW_SIZE) bytes.
38    ///
39    /// In the "ssdeep-compatible" configuration, this value is in the range
40    /// of `0..=1785` (`0..=(7*0xff)`).
41    h1: u32,
42
43    /// Hash component 2.
44    ///
45    /// This is the sum of the last [`WINDOW_SIZE`](Self::WINDOW_SIZE) bytes
46    /// but the more recent byte has a higher weight (the latest byte has a
47    /// weight of [`WINDOW_SIZE`](Self::WINDOW_SIZE) and the last (fading) byte
48    /// has a weight of 1).
49    ///
50    /// In the "ssdeep-compatible" configuration, this value is in the range
51    /// of `0..=7140` (`0..=(sum(1..=7)*0xff)`).
52    h2: u32,
53
54    /// Hash component 3.
55    ///
56    /// This is the only "big" component of the hash.
57    /// Each time it processes a byte, this value is left-shifted by
58    /// [`H3_LSHIFT`](Self::H3_LSHIFT) and xor-ed with the latest byte value.
59    ///
60    /// If it processes [`WINDOW_SIZE`](Self::WINDOW_SIZE) bytes,
61    /// older bytes are shifted out (larger than its MSB).
62    h3: u32,
63
64    /// The last [`WINDOW_SIZE`](Self::WINDOW_SIZE) bytes of the processed data.
65    window: [u8; ROLLING_WINDOW],
66}
67
68// grcov-excl-br-stop
69
70impl RollingHash {
71    /// The window size of the rolling hash.
72    ///
73    /// This is 7 bytes in ssdeep.
74    pub const WINDOW_SIZE: usize = ROLLING_WINDOW;
75
76    /// Left shift width of [`h3`](Self::h3) for each byte.
77    ///
78    /// This is 5 in ssdeep.
79    const H3_LSHIFT: usize = 5;
80
81    /// Creates a new [`RollingHash`] with the initial value.
82    pub fn new() -> Self {
83        RollingHash {
84            index: 0,
85            h1: 0,
86            h2: 0,
87            h3: 0,
88            window: [0; ROLLING_WINDOW],
89        }
90    }
91
92    /// Updates the hash value by processing a byte.
93    #[inline]
94    pub fn update_by_byte(&mut self, ch: u8) -> &mut Self {
95        invariant!((self.index as usize) < Self::WINDOW_SIZE);
96        self.h2 = self.h2.wrapping_sub(self.h1);
97        self.h2 = self
98            .h2
99            .wrapping_add(u32::wrapping_mul(ROLLING_WINDOW as u32, ch as u32));
100        self.h1 = self.h1.wrapping_add(ch as u32);
101        self.h1 = self
102            .h1
103            .wrapping_sub(self.window[self.index as usize] as u32); // grcov-excl-br-line:ARRAY
104        self.window[self.index as usize] = ch; // grcov-excl-br-line:ARRAY
105        self.index += 1;
106        if self.index as usize == ROLLING_WINDOW {
107            self.index = 0;
108        }
109        self.h3 <<= Self::H3_LSHIFT;
110        self.h3 ^= ch as u32;
111        self
112    }
113
114    /// Updates the hash value by processing an iterator of [`u8`].
115    pub fn update_by_iter(&mut self, iter: impl Iterator<Item = u8>) -> &mut Self {
116        for ch in iter {
117            self.update_by_byte(ch);
118        }
119        self
120    }
121
122    /// Updates the hash value by processing a slice of [`u8`].
123    pub fn update(&mut self, buf: &[u8]) -> &mut Self {
124        for &ch in buf.iter() {
125            self.update_by_byte(ch);
126        }
127        self
128    }
129
130    /// Returns the current hash value.
131    ///
132    /// Note that there's no "finalization" on this rolling hash.
133    /// You can even continue updating after reading the hash value.
134    ///
135    /// This is the sum of its three internal states (`h1`, `h2`, and `h3`).
136    /// See the source code and the private documentation for
137    /// its mathematical details.
138    #[inline]
139    pub fn value(&self) -> u32 {
140        self.h1.wrapping_add(self.h2).wrapping_add(self.h3)
141    }
142}
143
144impl AddAssign<&[u8]> for RollingHash {
145    /// Updates the hash value by processing a slice of [`u8`].
146    #[inline(always)]
147    fn add_assign(&mut self, buffer: &[u8]) {
148        self.update(buffer);
149    }
150}
151
152impl<const N: usize> AddAssign<&[u8; N]> for RollingHash {
153    /// Updates the hash value by processing an array of [`u8`].
154    #[inline(always)]
155    fn add_assign(&mut self, buffer: &[u8; N]) {
156        self.update(&buffer[..]);
157    }
158}
159
160impl AddAssign<u8> for RollingHash {
161    /// Updates the hash value by processing a byte.
162    #[inline(always)]
163    fn add_assign(&mut self, byte: u8) {
164        self.update_by_byte(byte);
165    }
166}
167
168impl Default for RollingHash {
169    fn default() -> Self {
170        Self::new()
171    }
172}
173
174/// Constant assertions related to this module.
175#[doc(hidden)]
176mod const_asserts {
177    use static_assertions::const_assert_eq;
178
179    use super::*;
180
181    // Compare with original ssdeep constants
182    // fuzzy.c: ROLLING_WINDOW
183    const_assert_eq!(ROLLING_WINDOW, 7);
184
185    // Consistency between module-local and struct-global one.
186    const_assert_eq!(ROLLING_WINDOW, RollingHash::WINDOW_SIZE);
187
188    // RollingHash::h3 is a rolling component and an input byte
189    // are vanished after processing more RollingHash::WINDOW_SIZE bytes.
190    // grcov-excl-tests-start
191    #[cfg(test)]
192    #[test]
193    fn rolling_hash_h3_shift_amount() {
194        assert!(RollingHash::WINDOW_SIZE
195            .checked_mul(RollingHash::H3_LSHIFT)
196            .and_then(|x| u32::try_from(x).ok())
197            .map(|x| x >= u32::BITS)
198            .unwrap_or(false));
199    }
200    // grcov-excl-tests-stop
201}
202
203pub(crate) mod test_utils;
204mod tests;