sqry_core/graph/body_hash.rs
1//! 128-bit deterministic body hashing using dual xxh64
2//!
3//! Uses fixed seeds for cross-process determinism.
4//! Seeds are versioned alongside the index schema.
5//!
6//! # Why xxh64 over siphasher?
7//!
8//! - **Already in codebase**: xxhash-rust used in 6+ sqry modules
9//! - **Seeding proven**: prewarm/store.rs, project/types.rs use seeded xxh64
10//! - **Faster**: ~10-15 GB/s vs ~3-5 GB/s for `SipHash`
11//! - **No new dependency**: Reduces build time and audit surface
12//! - **Same determinism**: Fixed seed = identical output across runs
13//!
14//! # Seed Registry
15//!
16//! All seeds in sqry follow the ASCII convention (8 chars = 64 bits):
17//!
18//! | Module | Seed Constant | Hex Value | ASCII Meaning | Purpose |
19//! |--------|---------------|-----------|---------------|---------|
20//! | `prewarm/store.rs` | `PAYLOAD_CHECKSUM_SEED` | `0x5351_5259_5041_594C` | "SQRYPAYL" | Checksum verification |
21//! | `project/types.rs` | `HASH_SEED` | `0x5351_5259_5041_5448` | "SQRYPATH" | Path hashing |
22//! | `body_hash.rs` | `HASH_SEED_0` | `0x5351_5259_4455_5030` | "SQRYDUP0" | Body hash high bits |
23//! | `body_hash.rs` | `HASH_SEED_1` | `0x5351_5259_4455_5031` | "SQRYDUP1" | Body hash low bits |
24//!
25//! # Rules for adding new seeds
26//!
27//! 1. All seeds MUST be unique across the codebase
28//! 2. Use ASCII encoding of meaningful names (8 chars = 64 bits)
29//! 3. Follow pattern: "SQRY" prefix + 4-char identifier
30//! 4. Register new seeds in this table before implementation
31//! 5. Changing seeds requires `INDEX_SCHEMA_VERSION` bump
32
33use serde::{Deserialize, Serialize};
34use xxhash_rust::xxh64::xxh64;
35
/// Fixed xxh64 seed used for the high 64 bits of a body hash.
///
/// The value is the big-endian ASCII encoding of the 8-character tag
/// `"SQRYDUP0"`, i.e. `0x5351_5259_4455_5030`. Deriving it from the tag
/// keeps the constant and its documented meaning from drifting apart.
///
/// Schema: `INDEX_SCHEMA_VERSION` = 2
pub const HASH_SEED_0: u64 = u64::from_be_bytes(*b"SQRYDUP0");
41
/// Fixed xxh64 seed used for the low 64 bits of a body hash.
///
/// The value is the big-endian ASCII encoding of the 8-character tag
/// `"SQRYDUP1"`, i.e. `0x5351_5259_4455_5031`. Deriving it from the tag
/// keeps the constant and its documented meaning from drifting apart.
///
/// Schema: `INDEX_SCHEMA_VERSION` = 2
pub const HASH_SEED_1: u64 = u64::from_be_bytes(*b"SQRYDUP1");
47
48/// 128-bit body hash for collision resistance
49///
50/// Using two xxh64 outputs with different seeds provides 128-bit security.
51/// This is sufficient to avoid birthday paradox collisions in codebases
52/// with millions of symbols.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
54pub struct BodyHash128 {
55 /// High 64 bits (computed with `HASH_SEED_0`)
56 pub high: u64,
57 /// Low 64 bits (computed with `HASH_SEED_1`)
58 pub low: u64,
59}
60
61impl BodyHash128 {
62 /// Compute deterministic 128-bit hash of normalized body content
63 ///
64 /// Uses two xxh64 calls with different seeds to achieve 128-bit collision resistance.
65 /// This follows the established pattern in sqry (see prewarm/store.rs:514).
66 ///
67 /// # Arguments
68 ///
69 /// * `content` - Normalized body content bytes
70 ///
71 /// # Returns
72 ///
73 /// A 128-bit hash value
74 #[must_use]
75 pub fn compute(content: &[u8]) -> Self {
76 Self {
77 high: xxh64(content, HASH_SEED_0),
78 low: xxh64(content, HASH_SEED_1),
79 }
80 }
81
82 /// Convert to u128 for comparison and storage
83 #[must_use]
84 pub fn as_u128(&self) -> u128 {
85 (u128::from(self.high) << 64) | u128::from(self.low)
86 }
87
88 /// Create from u128 value
89 #[must_use]
90 pub fn from_u128(value: u128) -> Self {
91 let high = u64::try_from(value >> 64).unwrap_or(u64::MAX);
92 let low = u64::try_from(value & u128::from(u64::MAX)).unwrap_or(u64::MAX);
93 Self { high, low }
94 }
95}
96
97impl std::fmt::Display for BodyHash128 {
98 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99 write!(f, "{:016x}{:016x}", self.high, self.low)
100 }
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing the same bytes twice must give equal results.
    #[test]
    fn test_body_hash_deterministic() {
        let body = b"fn example() { return 42; }";
        let first = BodyHash128::compute(body);
        let second = BodyHash128::compute(body);
        assert_eq!(first, second, "Hash must be deterministic");
    }

    /// Two bodies that differ by a single character hash differently.
    #[test]
    fn test_body_hash_different_content() {
        let returns_42 = BodyHash128::compute(b"fn example() { return 42; }");
        let returns_43 = BodyHash128::compute(b"fn example() { return 43; }");
        assert_ne!(
            returns_42, returns_43,
            "Different content should produce different hash"
        );
    }

    /// The empty input still hashes to non-zero halves.
    #[test]
    fn test_body_hash_empty() {
        let BodyHash128 { high, low } = BodyHash128::compute(b"");
        assert_ne!(high, 0, "Empty content hash high should not be zero");
        assert_ne!(low, 0, "Empty content hash low should not be zero");
    }

    /// `as_u128` followed by `from_u128` returns the original hash.
    #[test]
    fn test_body_hash_u128_roundtrip() {
        let original = BodyHash128::compute(b"test content for roundtrip");
        let restored = BodyHash128::from_u128(original.as_u128());
        assert_eq!(original, restored, "u128 roundtrip should preserve hash");
    }

    /// The seed constants equal the ASCII encodings listed in the seed registry.
    #[test]
    fn test_xxh64_fixed_seed() {
        // "SQRYDUP0": S=53, Q=51, R=52, Y=59, D=44, U=55, P=50, 0=30
        assert_eq!(HASH_SEED_0, 0x5351_5259_4455_5030);
        // "SQRYDUP1": S=53, Q=51, R=52, Y=59, D=44, U=55, P=50, 1=31
        assert_eq!(HASH_SEED_1, 0x5351_5259_4455_5031);
    }

    /// Smoke test for the two seeded halves on a fixed input.
    ///
    /// No reference digest is pinned here (that would couple the test to the
    /// xxhash implementation), so this only checks that both halves are
    /// non-zero and that the two seeds give different values. It does not by
    /// itself prove cross-platform or cross-endian stability.
    #[test]
    fn test_xxh64_cross_endian() {
        let sample = b"fn example() { return 42; }";
        let digest = BodyHash128::compute(sample);

        assert_ne!(
            digest.high, 0,
            "High bits should be non-zero for non-empty content"
        );
        assert_ne!(
            digest.low, 0,
            "Low bits should be non-zero for non-empty content"
        );

        // Different seeds should not collapse to the same 64-bit value.
        assert_ne!(
            digest.high, digest.low,
            "High and low should differ due to different seeds"
        );
    }

    /// `Display` prints `high` then `low` as zero-padded lowercase hex.
    #[test]
    fn test_body_hash_display() {
        let rendered = BodyHash128 {
            high: 0x1234_5678_90AB_CDEF,
            low: 0xFEDC_BA09_8765_4321,
        }
        .to_string();
        assert_eq!(rendered, "1234567890abcdeffedcba0987654321");
    }

    /// A hash survives a serialize/deserialize round trip through JSON.
    #[test]
    fn test_body_hash_serde_json() {
        let original = BodyHash128::compute(b"test");
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: BodyHash128 = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded, "JSON roundtrip should preserve hash");
    }
}