Skip to main content

cpd_core/
hash.rs

1// hash.rs
2
3use xxhash_rust::xxh3::xxh3_64;
4
5/// Fibonacci hashing constant for good bit distribution.
6pub const HASH_BASE: u64 = 0x9e3779b97f4a7c15;
7
8/// Compute a deterministic hash for a single token given its kind byte and value string.
9/// Uses xxh3_64 XORed with kind cast to u64.
10pub fn token_hash(kind: u8, value: &str) -> u64 {
11    xxh3_64(value.as_bytes()) ^ (kind as u64)
12}
13
14/// Hash a token with optional case folding.
15///
16/// `ignore_case = false` is equivalent to `token_hash(kind.discriminant(), value)`.
17#[inline]
18pub fn hash_token(kind_discriminant: u8, value: &str, ignore_case: bool) -> u64 {
19    if ignore_case {
20        xxh3_64(value.to_lowercase().as_bytes()) ^ (kind_discriminant as u64)
21    } else {
22        xxh3_64(value.as_bytes()) ^ (kind_discriminant as u64)
23    }
24}
25
26/// Compute the initial polynomial hash of a window of token hashes.
27/// hash = h[0]*BASE^(n-1) + h[1]*BASE^(n-2) + ... + h[n-1]*BASE^0
28/// Uses wrapping arithmetic throughout.
29pub fn hash_window(hashes: &[u64]) -> u64 {
30    hashes
31        .iter()
32        .fold(0u64, |acc, &h| acc.wrapping_mul(HASH_BASE).wrapping_add(h))
33}
34
35/// Roll the hash one position: remove `outgoing` (the token leaving the window),
36/// add `incoming` (the token entering the window).
37///
38/// `window_power` must be precomputed by the caller as `base_pow(window_size - 1)`
39/// **once per format group** before the sliding-window loop — not on every call.
40/// This eliminates an O(window_size) loop from the hot path.
41///
42/// If per-language min_tokens is introduced in future, recompute `window_power`
43/// per `detect_in_group` invocation using that group's min_tokens value.
44///
45/// new_hash = (current - outgoing * window_power) * BASE + incoming
46/// All arithmetic is wrapping.
47pub fn roll(current: u64, outgoing: u64, incoming: u64, window_power: u64) -> u64 {
48    current
49        .wrapping_sub(outgoing.wrapping_mul(window_power))
50        .wrapping_mul(HASH_BASE)
51        .wrapping_add(incoming)
52}
53
54/// Compute HASH_BASE^n using wrapping multiplication.
55/// Call once per format group to obtain the `window_power` argument for `roll()`.
56pub fn base_pow(n: usize) -> u64 {
57    let mut result = 1u64;
58    for _ in 0..n {
59        result = result.wrapping_mul(HASH_BASE);
60    }
61    result
62}
63
64#[cfg(test)]
65mod tests {
66    use super::*;
67
68    #[test]
69    fn token_hash_is_deterministic() {
70        let h1 = token_hash(1, "function");
71        let h2 = token_hash(1, "function");
72        assert_eq!(h1, h2);
73    }
74
75    #[test]
76    fn token_hash_differs_by_kind() {
77        let h1 = token_hash(1, "x");
78        let h2 = token_hash(2, "x");
79        assert_ne!(h1, h2);
80    }
81
82    #[test]
83    fn hash_window_single_element() {
84        let h = token_hash(0, "a");
85        // Window of 1: fold with initial 0 → 0 * BASE + h = h
86        assert_eq!(hash_window(&[h]), h);
87    }
88
89    #[test]
90    fn roll_matches_naive_recompute() {
91        let a = token_hash(0, "a");
92        let b = token_hash(0, "b");
93        let c = token_hash(0, "c");
94        let d = token_hash(0, "d");
95
96        let initial = hash_window(&[a, b, c]);
97        let wp = base_pow(3 - 1);
98        let rolled = roll(initial, a, d, wp);
99        let naive = hash_window(&[b, c, d]);
100        assert_eq!(rolled, naive, "rolled hash must match naive recomputation");
101    }
102
103    #[test]
104    fn roll_window_of_one() {
105        let a = token_hash(0, "hello");
106        let b = token_hash(0, "world");
107        let initial = hash_window(&[a]);
108        let wp = base_pow(1 - 1); // BASE^0 = 1
109        let rolled = roll(initial, a, b, wp);
110        let naive = hash_window(&[b]);
111        assert_eq!(rolled, naive);
112    }
113
114    #[test]
115    fn hash_window_empty_is_zero() {
116        assert_eq!(hash_window(&[]), 0u64);
117    }
118
119    #[test]
120    fn hash_token_case_insensitive_matches_different_case() {
121        let h1 = hash_token(1, "Function", true);
122        let h2 = hash_token(1, "function", true);
123        assert_eq!(h1, h2, "ignore_case=true must fold case before hashing");
124    }
125
126    #[test]
127    fn hash_token_case_sensitive_differs() {
128        let h1 = hash_token(1, "Function", false);
129        let h2 = hash_token(1, "function", false);
130        assert_ne!(h1, h2, "ignore_case=false must not fold case");
131    }
132
133    #[test]
134    fn hash_token_no_ignore_case_matches_token_hash() {
135        let h1 = hash_token(2, "hello", false);
136        let h2 = token_hash(2, "hello");
137        assert_eq!(
138            h1, h2,
139            "hash_token(ignore_case=false) must match token_hash"
140        );
141    }
142}