1use sha2::{Digest, Sha256};
16use unicode_normalization::UnicodeNormalization;
17
/// Floating-point element type that a vector is serialized as.
///
/// The textual name of each variant is produced by [`VecDtype::as_str`] and
/// accepted by [`VecDtype::parse`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum VecDtype {
    /// 32-bit floating-point elements (`f32`).
    F32,
    /// 64-bit floating-point elements (`f64`).
    F64,
}
26
27impl VecDtype {
28 pub fn as_str(self) -> &'static str {
30 match self {
31 VecDtype::F32 => "f32",
32 VecDtype::F64 => "f64",
33 }
34 }
35
36 pub fn parse(s: &str) -> Result<Self, HashError> {
38 match s {
39 "f32" => Ok(VecDtype::F32),
40 "f64" => Ok(VecDtype::F64),
41 other => Err(HashError::UnsupportedDtype(other.to_string())),
42 }
43 }
44}
45
46impl std::str::FromStr for VecDtype {
47 type Err = HashError;
48 fn from_str(s: &str) -> Result<Self, HashError> {
49 Self::parse(s)
50 }
51}
52
53#[derive(Debug, thiserror::Error)]
55pub enum HashError {
56 #[error("vector dim mismatch: declared {declared}, actual {actual}")]
58 DimMismatch {
59 declared: usize,
61 actual: usize,
63 },
64 #[error("unsupported canonical dtype: {0}")]
66 UnsupportedDtype(String),
67}
68
/// Borrowed view of a vector, tagged with the element type it is stored in.
///
/// The enum only holds a slice reference, so it is cheap to copy.
#[derive(Clone, Copy, Debug)]
pub enum VectorRef<'a> {
    /// A slice of `f32` elements.
    F32(&'a [f32]),
    /// A slice of `f64` elements.
    F64(&'a [f64]),
}
81
82impl<'a> VectorRef<'a> {
83 pub fn len(&self) -> usize {
85 match self {
86 VectorRef::F32(v) => v.len(),
87 VectorRef::F64(v) => v.len(),
88 }
89 }
90
91 pub fn is_empty(&self) -> bool {
93 self.len() == 0
94 }
95
96 pub fn native_dtype(&self) -> VecDtype {
98 match self {
99 VectorRef::F32(_) => VecDtype::F32,
100 VectorRef::F64(_) => VecDtype::F64,
101 }
102 }
103}
104
105impl<'a> From<&'a [f32]> for VectorRef<'a> {
106 fn from(v: &'a [f32]) -> Self {
107 VectorRef::F32(v)
108 }
109}
110
111impl<'a> From<&'a [f64]> for VectorRef<'a> {
112 fn from(v: &'a [f64]) -> Self {
113 VectorRef::F64(v)
114 }
115}
116
117pub fn canonical_vector_bytes(vector: VectorRef<'_>, dtype: VecDtype) -> Vec<u8> {
123 match (vector, dtype) {
124 (VectorRef::F32(v), VecDtype::F32) => f32_le_bytes(v),
125 (VectorRef::F64(v), VecDtype::F32) => {
126 let casted: Vec<f32> = v.iter().map(|&x| x as f32).collect();
128 f32_le_bytes(&casted)
129 }
130 (VectorRef::F32(v), VecDtype::F64) => {
131 let casted: Vec<f64> = v.iter().map(|&x| x as f64).collect();
133 f64_le_bytes(&casted)
134 }
135 (VectorRef::F64(v), VecDtype::F64) => f64_le_bytes(v),
136 }
137}
138
/// Concatenates the little-endian encoding of every `f32` in `v`, in order.
///
/// The result is exactly `4 * v.len()` bytes long.
fn f32_le_bytes(v: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(v.len() * 4);
    bytes.extend(v.iter().flat_map(|value| value.to_le_bytes()));
    bytes
}
146
/// Concatenates the little-endian encoding of every `f64` in `v`, in order.
///
/// The result is exactly `8 * v.len()` bytes long.
fn f64_le_bytes(v: &[f64]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(v.len() * 8);
    bytes.extend(v.iter().flat_map(|value| value.to_le_bytes()));
    bytes
}
154
155pub fn hash_vector(vector: VectorRef<'_>, dtype: VecDtype) -> String {
157 sha256_prefixed(&canonical_vector_bytes(vector, dtype))
158}
159
160pub fn hash_text(text: &str) -> String {
162 let normalized: String = text.nfc().collect();
163 sha256_prefixed(normalized.as_bytes())
164}
165
166pub fn hash_bytes(data: &[u8]) -> String {
168 sha256_prefixed(data)
169}
170
171fn sha256_prefixed(data: &[u8]) -> String {
172 let mut hasher = Sha256::new();
173 hasher.update(data);
174 let digest = hasher.finalize();
175 format!("sha256:{}", hex::encode(digest))
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing is deterministic and matches the published SHA-256 of "hello".
    #[test]
    fn hash_text_is_stable() {
        assert_eq!(hash_text("hello"), hash_text("hello"));
        // Comparing the function with itself only proves determinism; pinning
        // a known digest also catches changes to the algorithm, the prefix,
        // or the hex casing.
        assert_eq!(
            hash_text("hello"),
            "sha256:2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
        );
    }

    /// Precomposed and decomposed spellings of the same text hash identically.
    #[test]
    fn hash_text_normalizes_nfc() {
        let composed = "caf\u{00e9}";
        let decomposed = "cafe\u{0301}";
        assert_eq!(hash_text(composed), hash_text(decomposed));
    }

    /// Normalization does not fold case.
    #[test]
    fn hash_text_distinguishes_content() {
        assert_ne!(hash_text("hello"), hash_text("Hello"));
    }

    /// ASCII text is already NFC, so hashing it as text or as bytes agrees.
    #[test]
    fn hash_bytes_matches_hash_text_for_ascii() {
        assert_eq!(hash_bytes(b"hello"), hash_text("hello"));
    }

    /// The canonical encoding is little-endian regardless of the host.
    #[test]
    fn canonical_vector_bytes_endianness_is_explicit() {
        let v = [1.0_f32];
        let bytes = canonical_vector_bytes(VectorRef::F32(&v), VecDtype::F32);
        assert_eq!(bytes, 1.0_f32.to_le_bytes().to_vec());
        // 1.0_f32 is 0x3F80_0000; spell the bytes out so the check does not
        // depend on the same helper the implementation uses.
        assert_eq!(bytes, vec![0x00, 0x00, 0x80, 0x3f]);
    }

    /// The source element type does not matter once the target dtype is fixed
    /// (for values exactly representable in both widths).
    #[test]
    fn canonical_vector_bytes_casts_to_requested_dtype() {
        let narrow = [1.5_f32, -2.0];
        let wide = [1.5_f64, -2.0];
        for &dtype in &[VecDtype::F32, VecDtype::F64] {
            assert_eq!(
                canonical_vector_bytes(VectorRef::F32(&narrow), dtype),
                canonical_vector_bytes(VectorRef::F64(&wide), dtype)
            );
            assert_eq!(
                hash_vector(VectorRef::F32(&narrow), dtype),
                hash_vector(VectorRef::F64(&wide), dtype)
            );
        }
        assert_eq!(
            canonical_vector_bytes(VectorRef::F64(&wide), VecDtype::F32).len(),
            2 * 4
        );
        assert_eq!(
            canonical_vector_bytes(VectorRef::F32(&narrow), VecDtype::F64).len(),
            2 * 8
        );
    }

    /// Names parse to the expected variants, survive a round trip through
    /// `as_str`, and unknown names are rejected.
    #[test]
    fn vector_dtype_round_trip() {
        assert_eq!(VecDtype::parse("f32").unwrap(), VecDtype::F32);
        assert_eq!(VecDtype::parse("f64").unwrap(), VecDtype::F64);
        assert!(VecDtype::parse("f16").is_err());
        for &dtype in &[VecDtype::F32, VecDtype::F64] {
            assert_eq!(VecDtype::parse(dtype.as_str()).unwrap(), dtype);
            assert_eq!(dtype.as_str().parse::<VecDtype>().unwrap(), dtype);
        }
    }

    /// `len`, `is_empty` and `native_dtype` reflect the borrowed slice.
    #[test]
    fn vector_ref_accessors() {
        let narrow = [0.0_f32; 3];
        let wide: [f64; 0] = [];
        let a = VectorRef::from(&narrow[..]);
        let b = VectorRef::from(&wide[..]);
        assert_eq!(a.len(), 3);
        assert!(!a.is_empty());
        assert_eq!(a.native_dtype(), VecDtype::F32);
        assert_eq!(b.len(), 0);
        assert!(b.is_empty());
        assert_eq!(b.native_dtype(), VecDtype::F64);
    }

    /// The digest string is the `sha256:` prefix plus 64 hex characters.
    #[test]
    fn hash_vector_format_is_sha256_hex() {
        let v: Vec<f32> = (0..8).map(|i| i as f32).collect();
        let h = hash_vector(VectorRef::F32(&v), VecDtype::F32);
        assert!(h.starts_with("sha256:"));
        assert_eq!(h.len(), "sha256:".len() + 64);
    }
}