Skip to main content

sqry_core/graph/
body_hash.rs

1//! 128-bit deterministic body hashing using dual xxh64
2//!
3//! Uses fixed seeds for cross-process determinism.
4//! Seeds are versioned alongside the index schema.
5//!
6//! # Why xxh64 over siphasher?
7//!
8//! - **Already in codebase**: xxhash-rust used in 6+ sqry modules
9//! - **Seeding proven**: prewarm/store.rs, project/types.rs use seeded xxh64
10//! - **Faster**: ~10-15 GB/s vs ~3-5 GB/s for `SipHash`
11//! - **No new dependency**: Reduces build time and audit surface
12//! - **Same determinism**: Fixed seed = identical output across runs
13//!
14//! # Seed Registry
15//!
16//! All seeds in sqry follow the ASCII convention (8 chars = 64 bits):
17//!
18//! | Module | Seed Constant | Hex Value | ASCII Meaning | Purpose |
19//! |--------|---------------|-----------|---------------|---------|
20//! | `prewarm/store.rs` | `PAYLOAD_CHECKSUM_SEED` | `0x5351_5259_5041_594C` | "SQRYPAYL" | Checksum verification |
21//! | `project/types.rs` | `HASH_SEED` | `0x5351_5259_5041_5448` | "SQRYPATH" | Path hashing |
22//! | `body_hash.rs` | `HASH_SEED_0` | `0x5351_5259_4455_5030` | "SQRYDUP0" | Body hash high bits |
23//! | `body_hash.rs` | `HASH_SEED_1` | `0x5351_5259_4455_5031` | "SQRYDUP1" | Body hash low bits |
24//!
25//! # Rules for adding new seeds
26//!
27//! 1. All seeds MUST be unique across the codebase
28//! 2. Use ASCII encoding of meaningful names (8 chars = 64 bits)
29//! 3. Follow pattern: "SQRY" prefix + 4-char identifier
30//! 4. Register new seeds in this table before implementation
31//! 5. Changing seeds requires `INDEX_SCHEMA_VERSION` bump
32
33use serde::{Deserialize, Serialize};
34use xxhash_rust::xxh64::xxh64;
35
/// Seed for the xxh64 pass that produces the high 64 bits of a body hash.
///
/// The value is the big-endian encoding of the ASCII tag "SQRYDUP0",
/// i.e. `0x5351_5259_4455_5030`. Deriving it from the tag keeps the
/// documented name and the numeric value from drifting apart.
///
/// Schema: `INDEX_SCHEMA_VERSION` = 2
pub const HASH_SEED_0: u64 = u64::from_be_bytes(*b"SQRYDUP0");
41
/// Seed for the xxh64 pass that produces the low 64 bits of a body hash.
///
/// The value is the big-endian encoding of the ASCII tag "SQRYDUP1",
/// i.e. `0x5351_5259_4455_5031`. Deriving it from the tag keeps the
/// documented name and the numeric value from drifting apart.
///
/// Schema: `INDEX_SCHEMA_VERSION` = 2
pub const HASH_SEED_1: u64 = u64::from_be_bytes(*b"SQRYDUP1");
47
48/// 128-bit body hash for collision resistance
49///
50/// Using two xxh64 outputs with different seeds provides 128-bit security.
51/// This is sufficient to avoid birthday paradox collisions in codebases
52/// with millions of symbols.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
54pub struct BodyHash128 {
55    /// High 64 bits (computed with `HASH_SEED_0`)
56    pub high: u64,
57    /// Low 64 bits (computed with `HASH_SEED_1`)
58    pub low: u64,
59}
60
61impl BodyHash128 {
62    /// Compute deterministic 128-bit hash of normalized body content
63    ///
64    /// Uses two xxh64 calls with different seeds to achieve 128-bit collision resistance.
65    /// This follows the established pattern in sqry (see prewarm/store.rs:514).
66    ///
67    /// # Arguments
68    ///
69    /// * `content` - Normalized body content bytes
70    ///
71    /// # Returns
72    ///
73    /// A 128-bit hash value
74    #[must_use]
75    pub fn compute(content: &[u8]) -> Self {
76        Self {
77            high: xxh64(content, HASH_SEED_0),
78            low: xxh64(content, HASH_SEED_1),
79        }
80    }
81
82    /// Convert to u128 for comparison and storage
83    #[must_use]
84    pub fn as_u128(&self) -> u128 {
85        (u128::from(self.high) << 64) | u128::from(self.low)
86    }
87
88    /// Create from u128 value
89    #[must_use]
90    pub fn from_u128(value: u128) -> Self {
91        let high = u64::try_from(value >> 64).unwrap_or(u64::MAX);
92        let low = u64::try_from(value & u128::from(u64::MAX)).unwrap_or(u64::MAX);
93        Self { high, low }
94    }
95}
96
97impl std::fmt::Display for BodyHash128 {
98    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99        write!(f, "{:016x}{:016x}", self.high, self.low)
100    }
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing the same bytes twice must give the same result.
    #[test]
    fn test_body_hash_deterministic() {
        let content = b"fn example() { return 42; }";
        let hash1 = BodyHash128::compute(content);
        let hash2 = BodyHash128::compute(content);
        assert_eq!(hash1, hash2, "Hash must be deterministic");
    }

    /// A one-character change in the content must change the hash.
    #[test]
    fn test_body_hash_different_content() {
        let content1 = b"fn example() { return 42; }";
        let content2 = b"fn example() { return 43; }";
        let hash1 = BodyHash128::compute(content1);
        let hash2 = BodyHash128::compute(content2);
        assert_ne!(
            hash1, hash2,
            "Different content should produce different hash"
        );
    }

    /// Empty input is still hashed (the seeds contribute), so neither half
    /// should come out as zero.
    #[test]
    fn test_body_hash_empty() {
        let hash = BodyHash128::compute(b"");
        assert_ne!(hash.high, 0, "Empty content hash high should not be zero");
        assert_ne!(hash.low, 0, "Empty content hash low should not be zero");
    }

    /// `as_u128` followed by `from_u128` must return the original hash.
    #[test]
    fn test_body_hash_u128_roundtrip() {
        let content = b"test content for roundtrip";
        let hash = BodyHash128::compute(content);
        let as_u128 = hash.as_u128();
        let roundtrip = BodyHash128::from_u128(as_u128);
        assert_eq!(hash, roundtrip, "u128 roundtrip should preserve hash");
    }

    /// Pins the packed bit layout. A plain roundtrip would still pass if both
    /// conversions swapped the halves consistently, so check known values.
    #[test]
    fn test_body_hash_u128_layout() {
        let packed = (1u128 << 64) | 2;
        let hash = BodyHash128 { high: 1, low: 2 };
        assert_eq!(hash.as_u128(), packed, "high must occupy the upper 64 bits");
        assert_eq!(
            BodyHash128::from_u128(packed),
            hash,
            "upper 64 bits must map to high, lower 64 bits to low"
        );

        // Boundary: every bit set must survive in both directions.
        let max = BodyHash128::from_u128(u128::MAX);
        assert_eq!(
            max,
            BodyHash128 {
                high: u64::MAX,
                low: u64::MAX,
            }
        );
        assert_eq!(max.as_u128(), u128::MAX);
    }

    /// The seed constants must equal the ASCII tags registered in the module docs.
    #[test]
    fn test_xxh64_fixed_seed() {
        // "SQRYDUP0" in hex: S=53, Q=51, R=52, Y=59, D=44, U=55, P=50, 0=30
        assert_eq!(HASH_SEED_0, 0x5351_5259_4455_5030);
        // "SQRYDUP1" in hex: S=53, Q=51, R=52, Y=59, D=44, U=55, P=50, 1=31
        assert_eq!(HASH_SEED_1, 0x5351_5259_4455_5031);
    }

    /// Structural sanity check on a fixed input.
    ///
    /// NOTE: despite the name, this test does not pin reference digests, so it
    /// cannot detect an endianness regression on its own; it only checks that
    /// both halves are populated and differ from each other.
    /// TODO: pin digests produced on a reference platform if cross-platform
    /// stability of the stored hashes needs to be enforced here.
    #[test]
    fn test_xxh64_cross_endian() {
        let test_data = b"fn example() { return 42; }";
        let hash = BodyHash128::compute(test_data);

        assert_ne!(
            hash.high, 0,
            "High bits should be non-zero for non-empty content"
        );
        assert_ne!(
            hash.low, 0,
            "Low bits should be non-zero for non-empty content"
        );

        // Same content, different seeds: the two halves should not coincide.
        assert_ne!(
            hash.high, hash.low,
            "High and low should differ due to different seeds"
        );
    }

    /// `Display` renders `high` then `low` as zero-padded lowercase hex.
    #[test]
    fn test_body_hash_display() {
        let hash = BodyHash128 {
            high: 0x1234_5678_90AB_CDEF,
            low: 0xFEDC_BA09_8765_4321,
        };
        let display = format!("{hash}");
        assert_eq!(display, "1234567890abcdeffedcba0987654321");

        // Small values must keep their leading zeros so the output is always
        // 32 characters wide.
        let padded = format!("{}", BodyHash128 { high: 0, low: 1 });
        assert_eq!(padded, "00000000000000000000000000000001");
    }

    /// Serializing to JSON and back must return the original hash.
    #[test]
    fn test_body_hash_serde_json() {
        let hash = BodyHash128::compute(b"test");
        let json = serde_json::to_string(&hash).unwrap();
        let parsed: BodyHash128 = serde_json::from_str(&json).unwrap();
        assert_eq!(hash, parsed, "JSON roundtrip should preserve hash");
    }
}