Skip to main content

cpd_core/
hash.rs

1// hash.rs
2// Attribution: inspired by jscpd-rs rolling hash approach; rewritten independently.
3
4use xxhash_rust::xxh3::xxh3_64;
5
6/// Fibonacci hashing constant for good bit distribution.
7pub const HASH_BASE: u64 = 0x9e3779b97f4a7c15;
8
9/// Compute a deterministic hash for a single token given its kind byte and value string.
10/// Uses xxh3_64 XORed with kind cast to u64.
11pub fn token_hash(kind: u8, value: &str) -> u64 {
12    xxh3_64(value.as_bytes()) ^ (kind as u64)
13}
14
15/// Hash a token with optional case folding.
16///
17/// Matches the jscpd-rs `hash_token` interface for cross-validation parity.
18/// `ignore_case = false` is equivalent to `token_hash(kind.discriminant(), value)`.
19#[inline]
20pub fn hash_token(kind_discriminant: u8, value: &str, ignore_case: bool) -> u64 {
21    if ignore_case {
22        xxh3_64(value.to_lowercase().as_bytes()) ^ (kind_discriminant as u64)
23    } else {
24        xxh3_64(value.as_bytes()) ^ (kind_discriminant as u64)
25    }
26}
27
28/// Compute the initial polynomial hash of a window of token hashes.
29/// hash = h[0]*BASE^(n-1) + h[1]*BASE^(n-2) + ... + h[n-1]*BASE^0
30/// Uses wrapping arithmetic throughout.
31pub fn hash_window(hashes: &[u64]) -> u64 {
32    hashes
33        .iter()
34        .fold(0u64, |acc, &h| acc.wrapping_mul(HASH_BASE).wrapping_add(h))
35}
36
37/// Roll the hash one position: remove `outgoing` (the token leaving the window),
38/// add `incoming` (the token entering the window).
39///
40/// `window_power` must be precomputed by the caller as `base_pow(window_size - 1)`
41/// **once per format group** before the sliding-window loop — not on every call.
42/// This eliminates an O(window_size) loop from the hot path.
43///
44/// If per-language min_tokens is introduced in future, recompute `window_power`
45/// per `detect_in_group` invocation using that group's min_tokens value.
46///
47/// new_hash = (current - outgoing * window_power) * BASE + incoming
48/// All arithmetic is wrapping.
49pub fn roll(current: u64, outgoing: u64, incoming: u64, window_power: u64) -> u64 {
50    current
51        .wrapping_sub(outgoing.wrapping_mul(window_power))
52        .wrapping_mul(HASH_BASE)
53        .wrapping_add(incoming)
54}
55
56/// Compute HASH_BASE^n using wrapping multiplication.
57/// Call once per format group to obtain the `window_power` argument for `roll()`.
58pub fn base_pow(n: usize) -> u64 {
59    let mut result = 1u64;
60    for _ in 0..n {
61        result = result.wrapping_mul(HASH_BASE);
62    }
63    result
64}
65
66#[cfg(test)]
67mod tests {
68    use super::*;
69
70    #[test]
71    fn token_hash_is_deterministic() {
72        let h1 = token_hash(1, "function");
73        let h2 = token_hash(1, "function");
74        assert_eq!(h1, h2);
75    }
76
77    #[test]
78    fn token_hash_differs_by_kind() {
79        let h1 = token_hash(1, "x");
80        let h2 = token_hash(2, "x");
81        assert_ne!(h1, h2);
82    }
83
84    #[test]
85    fn hash_window_single_element() {
86        let h = token_hash(0, "a");
87        // Window of 1: fold with initial 0 → 0 * BASE + h = h
88        assert_eq!(hash_window(&[h]), h);
89    }
90
91    #[test]
92    fn roll_matches_naive_recompute() {
93        let a = token_hash(0, "a");
94        let b = token_hash(0, "b");
95        let c = token_hash(0, "c");
96        let d = token_hash(0, "d");
97
98        let initial = hash_window(&[a, b, c]);
99        let wp = base_pow(3 - 1);
100        let rolled = roll(initial, a, d, wp);
101        let naive = hash_window(&[b, c, d]);
102        assert_eq!(rolled, naive, "rolled hash must match naive recomputation");
103    }
104
105    #[test]
106    fn roll_window_of_one() {
107        let a = token_hash(0, "hello");
108        let b = token_hash(0, "world");
109        let initial = hash_window(&[a]);
110        let wp = base_pow(1 - 1); // BASE^0 = 1
111        let rolled = roll(initial, a, b, wp);
112        let naive = hash_window(&[b]);
113        assert_eq!(rolled, naive);
114    }
115
116    #[test]
117    fn hash_window_empty_is_zero() {
118        assert_eq!(hash_window(&[]), 0u64);
119    }
120
121    #[test]
122    fn hash_token_case_insensitive_matches_different_case() {
123        let h1 = hash_token(1, "Function", true);
124        let h2 = hash_token(1, "function", true);
125        assert_eq!(h1, h2, "ignore_case=true must fold case before hashing");
126    }
127
128    #[test]
129    fn hash_token_case_sensitive_differs() {
130        let h1 = hash_token(1, "Function", false);
131        let h2 = hash_token(1, "function", false);
132        assert_ne!(h1, h2, "ignore_case=false must not fold case");
133    }
134
135    #[test]
136    fn hash_token_no_ignore_case_matches_token_hash() {
137        let h1 = hash_token(2, "hello", false);
138        let h2 = token_hash(2, "hello");
139        assert_eq!(
140            h1, h2,
141            "hash_token(ignore_case=false) must match token_hash"
142        );
143    }
144}