Skip to main content

vectorpin/
hash.rs

// Copyright 2025 Jascha Wanger / Tarnover, LLC
// SPDX-License-Identifier: Apache-2.0
3
//! Canonical hashing for source text and embedding vectors.
//!
//! These three operations are the only places in the protocol where
//! semantic content gets turned into bytes. Any disagreement between
//! Python and Rust here breaks cross-language verification, so the
//! semantics are pinned down explicitly:
//!
//! * Vectors: little-endian, 1-D, packed `f32` or `f64` bytes.
//! * Text: UTF-8 of the NFC-normalized string.
//! * Output digests: prefixed with `"sha256:"` and lowercase hex.
14
15use sha2::{Digest, Sha256};
16use unicode_normalization::UnicodeNormalization;
17
/// Scalar element type under which a vector is canonicalized and hashed.
///
/// Each variant has a short wire identifier (`"f32"` / `"f64"`); see
/// [`VecDtype::as_str`] and [`VecDtype::parse`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VecDtype {
    /// 32-bit IEEE float, little endian.
    F32,
    /// 64-bit IEEE float, little endian.
    F64,
}
26
27impl VecDtype {
28    /// Wire form (`"f32"` or `"f64"`) used in the attestation JSON.
29    pub fn as_str(self) -> &'static str {
30        match self {
31            VecDtype::F32 => "f32",
32            VecDtype::F64 => "f64",
33        }
34    }
35
36    /// Parse a wire form back into a [`VecDtype`].
37    pub fn parse(s: &str) -> Result<Self, HashError> {
38        match s {
39            "f32" => Ok(VecDtype::F32),
40            "f64" => Ok(VecDtype::F64),
41            other => Err(HashError::UnsupportedDtype(other.to_string())),
42        }
43    }
44}
45
46impl std::str::FromStr for VecDtype {
47    type Err = HashError;
48    fn from_str(s: &str) -> Result<Self, HashError> {
49        Self::parse(s)
50    }
51}
52
53/// Errors produced by canonicalization helpers.
54#[derive(Debug, thiserror::Error)]
55pub enum HashError {
56    /// Vector dimensionality reported by the caller did not match the data.
57    #[error("vector dim mismatch: declared {declared}, actual {actual}")]
58    DimMismatch {
59        /// What the caller said.
60        declared: usize,
61        /// What the data actually contained.
62        actual: usize,
63    },
64    /// Unsupported scalar dtype identifier.
65    #[error("unsupported canonical dtype: {0}")]
66    UnsupportedDtype(String),
67}
68
/// Borrowed view of a vector that is either an `f32` or an `f64` slice.
///
/// Lets callers hand a vector to canonicalization without first converting
/// it to a fixed dtype at the call site. The hash is taken under whatever
/// dtype the caller specifies in the [`PinHeader`](crate::PinHeader), which
/// may differ from the slice's native element type.
#[derive(Debug, Clone, Copy)]
pub enum VectorRef<'a> {
    /// Borrowed `f32` slice.
    F32(&'a [f32]),
    /// Borrowed `f64` slice.
    F64(&'a [f64]),
}
81
82impl<'a> VectorRef<'a> {
83    /// Length of the underlying slice.
84    pub fn len(&self) -> usize {
85        match self {
86            VectorRef::F32(v) => v.len(),
87            VectorRef::F64(v) => v.len(),
88        }
89    }
90
91    /// True iff the underlying slice is empty.
92    pub fn is_empty(&self) -> bool {
93        self.len() == 0
94    }
95
96    /// Native dtype of the underlying slice.
97    pub fn native_dtype(&self) -> VecDtype {
98        match self {
99            VectorRef::F32(_) => VecDtype::F32,
100            VectorRef::F64(_) => VecDtype::F64,
101        }
102    }
103}
104
105impl<'a> From<&'a [f32]> for VectorRef<'a> {
106    fn from(v: &'a [f32]) -> Self {
107        VectorRef::F32(v)
108    }
109}
110
111impl<'a> From<&'a [f64]> for VectorRef<'a> {
112    fn from(v: &'a [f64]) -> Self {
113        VectorRef::F64(v)
114    }
115}
116
117/// Reproducible byte form of an embedding vector.
118///
119/// Always little-endian, always packed, always under the dtype
120/// requested by the caller. Two implementations must agree on these
121/// bytes byte-for-byte for cross-language verification to work.
122pub fn canonical_vector_bytes(vector: VectorRef<'_>, dtype: VecDtype) -> Vec<u8> {
123    match (vector, dtype) {
124        (VectorRef::F32(v), VecDtype::F32) => f32_le_bytes(v),
125        (VectorRef::F64(v), VecDtype::F32) => {
126            // Down-cast each f64 to f32 before packing.
127            let casted: Vec<f32> = v.iter().map(|&x| x as f32).collect();
128            f32_le_bytes(&casted)
129        }
130        (VectorRef::F32(v), VecDtype::F64) => {
131            // Up-cast each f32 to f64 before packing.
132            let casted: Vec<f64> = v.iter().map(|&x| x as f64).collect();
133            f64_le_bytes(&casted)
134        }
135        (VectorRef::F64(v), VecDtype::F64) => f64_le_bytes(v),
136    }
137}
138
/// Pack `f32` values back-to-back as little-endian bytes.
///
/// Each element contributes exactly 4 bytes, in slice order, with no
/// padding or length prefix; an empty slice yields an empty vector.
fn f32_le_bytes(v: &[f32]) -> Vec<u8> {
    // `size_of_val` on a slice is `len * size_of::<f32>()`, i.e. the exact
    // output size, so the buffer never reallocates.
    let mut packed = Vec::with_capacity(std::mem::size_of_val(v));
    packed.extend(v.iter().flat_map(|x| x.to_le_bytes()));
    packed
}
146
/// Pack `f64` values back-to-back as little-endian bytes.
///
/// Each element contributes exactly 8 bytes, in slice order, with no
/// padding or length prefix; an empty slice yields an empty vector.
fn f64_le_bytes(v: &[f64]) -> Vec<u8> {
    // `size_of_val` on a slice is `len * size_of::<f64>()`, i.e. the exact
    // output size, so the buffer never reallocates.
    let mut packed = Vec::with_capacity(std::mem::size_of_val(v));
    packed.extend(v.iter().flat_map(|x| x.to_le_bytes()));
    packed
}
154
155/// SHA-256 of a vector's canonical bytes, formatted as `"sha256:<hex>"`.
156pub fn hash_vector(vector: VectorRef<'_>, dtype: VecDtype) -> String {
157    sha256_prefixed(&canonical_vector_bytes(vector, dtype))
158}
159
160/// SHA-256 of a string after Unicode NFC normalization and UTF-8 encoding.
161pub fn hash_text(text: &str) -> String {
162    let normalized: String = text.nfc().collect();
163    sha256_prefixed(normalized.as_bytes())
164}
165
166/// SHA-256 over arbitrary bytes, formatted as `"sha256:<hex>"`.
167pub fn hash_bytes(data: &[u8]) -> String {
168    sha256_prefixed(data)
169}
170
171fn sha256_prefixed(data: &[u8]) -> String {
172    let mut hasher = Sha256::new();
173    hasher.update(data);
174    let digest = hasher.finalize();
175    format!("sha256:{}", hex::encode(digest))
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hash_text_is_stable() {
        assert_eq!(hash_text("hello"), hash_text("hello"));
    }

    #[test]
    fn hash_text_normalizes_nfc() {
        // Composed vs decomposed "café" must hash identically.
        let composed = "caf\u{00e9}";
        let decomposed = "cafe\u{0301}";
        assert_eq!(hash_text(composed), hash_text(decomposed));
    }

    #[test]
    fn hash_text_distinguishes_content() {
        assert_ne!(hash_text("hello"), hash_text("Hello"));
    }

    #[test]
    fn hash_bytes_matches_known_answer() {
        // Standard SHA-256 digest of the empty message.
        assert_eq!(
            hash_bytes(b""),
            "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    #[test]
    fn canonical_vector_bytes_endianness_is_explicit() {
        let v = [1.0_f32];
        let bytes = canonical_vector_bytes(VectorRef::F32(&v), VecDtype::F32);
        assert_eq!(bytes, 1.0_f32.to_le_bytes().to_vec());
    }

    #[test]
    fn canonical_vector_bytes_casts_to_requested_dtype() {
        // The requested dtype, not the native one, decides the encoding.
        let wide = [1.0_f64, -2.5];
        let narrow = [1.0_f32, -2.5];
        assert_eq!(
            canonical_vector_bytes(VectorRef::F64(&wide), VecDtype::F32),
            canonical_vector_bytes(VectorRef::F32(&narrow), VecDtype::F32)
        );
        assert_eq!(
            canonical_vector_bytes(VectorRef::F32(&narrow), VecDtype::F64),
            canonical_vector_bytes(VectorRef::F64(&wide), VecDtype::F64)
        );
    }

    #[test]
    fn vector_dtype_round_trip() {
        assert_eq!(VecDtype::parse("f32").unwrap(), VecDtype::F32);
        assert_eq!(VecDtype::parse("f64").unwrap(), VecDtype::F64);
        assert!(VecDtype::parse("f16").is_err());
    }

    #[test]
    fn hash_vector_format_is_sha256_hex() {
        let v: Vec<f32> = (0..8).map(|i| i as f32).collect();
        let h = hash_vector(VectorRef::F32(&v), VecDtype::F32);
        assert!(h.starts_with("sha256:"));
        assert_eq!(h.len(), "sha256:".len() + 64);
    }
}
221}