manifoldb_vector/encoding/
keys.rs

1//! Key encoding for vector storage.
2
3use manifoldb_core::EntityId;
4
5use crate::types::EmbeddingName;
6
7/// Key prefix for embedding space metadata.
8pub const PREFIX_EMBEDDING_SPACE: u8 = 0x10;
9
10/// Key prefix for entity embeddings.
11pub const PREFIX_EMBEDDING: u8 = 0x11;
12
13/// Key prefix for sparse embedding space metadata.
14pub const PREFIX_SPARSE_EMBEDDING_SPACE: u8 = 0x12;
15
16/// Key prefix for sparse entity embeddings.
17pub const PREFIX_SPARSE_EMBEDDING: u8 = 0x13;
18
19/// Key prefix for multi-vector embedding space metadata.
20pub const PREFIX_MULTI_VECTOR_SPACE: u8 = 0x14;
21
22/// Key prefix for multi-vector entity embeddings.
23pub const PREFIX_MULTI_VECTOR: u8 = 0x15;
24
25/// Compute a hash for an embedding name.
26///
27/// Uses FNV-1a hash for efficient computation.
28#[must_use]
29pub fn hash_name(name: &str) -> u64 {
30    const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
31    const FNV_PRIME: u64 = 0x0100_0000_01b3;
32
33    let mut hash = FNV_OFFSET;
34    for byte in name.as_bytes() {
35        hash ^= u64::from(*byte);
36        hash = hash.wrapping_mul(FNV_PRIME);
37    }
38    hash
39}
40
41/// Encode a key for embedding space metadata.
42///
43/// Key format: `[PREFIX_EMBEDDING_SPACE][name_hash]`
44#[must_use]
45pub fn encode_embedding_space_key(name: &EmbeddingName) -> Vec<u8> {
46    let mut key = Vec::with_capacity(9);
47    key.push(PREFIX_EMBEDDING_SPACE);
48    key.extend_from_slice(&hash_name(name.as_str()).to_be_bytes());
49    key
50}
51
52/// Decode an embedding space name hash from a space key.
53///
54/// Returns `None` if the key doesn't have the correct format.
55#[must_use]
56pub fn decode_embedding_space_key(key: &[u8]) -> Option<u64> {
57    if key.len() != 9 || key[0] != PREFIX_EMBEDDING_SPACE {
58        return None;
59    }
60    let bytes: [u8; 8] = key[1..9].try_into().ok()?;
61    Some(u64::from_be_bytes(bytes))
62}
63
64/// Encode a key for an entity's embedding in a space.
65///
66/// Key format: `[PREFIX_EMBEDDING][name_hash][entity_id]`
67///
68/// This enables efficient range scans for "all embeddings in space X" or
69/// "embedding for entity Y in space X".
70#[must_use]
71pub fn encode_embedding_key(name: &EmbeddingName, entity_id: EntityId) -> Vec<u8> {
72    let mut key = Vec::with_capacity(17);
73    key.push(PREFIX_EMBEDDING);
74    key.extend_from_slice(&hash_name(name.as_str()).to_be_bytes());
75    key.extend_from_slice(&entity_id.as_u64().to_be_bytes());
76    key
77}
78
79/// Encode a prefix for scanning all embeddings in a space.
80///
81/// Returns a key that can be used as the start of a range scan
82/// for all embeddings in the given space.
83#[must_use]
84pub fn encode_embedding_prefix(name: &EmbeddingName) -> Vec<u8> {
85    let mut key = Vec::with_capacity(9);
86    key.push(PREFIX_EMBEDDING);
87    key.extend_from_slice(&hash_name(name.as_str()).to_be_bytes());
88    key
89}
90
91/// A decoded embedding key.
92#[derive(Debug, Clone, Copy, PartialEq, Eq)]
93pub struct EmbeddingKey {
94    /// The hash of the embedding space name.
95    pub name_hash: u64,
96    /// The entity ID.
97    pub entity_id: EntityId,
98}
99
100/// Decode an embedding key.
101///
102/// Returns `None` if the key doesn't have the correct format.
103#[must_use]
104pub fn decode_embedding_key(key: &[u8]) -> Option<EmbeddingKey> {
105    if key.len() != 17 || key[0] != PREFIX_EMBEDDING {
106        return None;
107    }
108
109    let name_hash_bytes: [u8; 8] = key[1..9].try_into().ok()?;
110    let entity_id_bytes: [u8; 8] = key[9..17].try_into().ok()?;
111
112    Some(EmbeddingKey {
113        name_hash: u64::from_be_bytes(name_hash_bytes),
114        entity_id: EntityId::new(u64::from_be_bytes(entity_id_bytes)),
115    })
116}
117
118/// Decode an entity ID from an embedding key.
119///
120/// Returns `None` if the key doesn't have the correct format.
121#[must_use]
122pub fn decode_embedding_entity_id(key: &[u8]) -> Option<EntityId> {
123    decode_embedding_key(key).map(|k| k.entity_id)
124}
125
126// --- Sparse embedding keys ---
127
128/// Encode a key for sparse embedding space metadata.
129///
130/// Key format: `[PREFIX_SPARSE_EMBEDDING_SPACE][name_hash]`
131#[must_use]
132pub fn encode_sparse_embedding_space_key(name: &EmbeddingName) -> Vec<u8> {
133    let mut key = Vec::with_capacity(9);
134    key.push(PREFIX_SPARSE_EMBEDDING_SPACE);
135    key.extend_from_slice(&hash_name(name.as_str()).to_be_bytes());
136    key
137}
138
139/// Decode a sparse embedding space name hash from a space key.
140///
141/// Returns `None` if the key doesn't have the correct format.
142#[must_use]
143pub fn decode_sparse_embedding_space_key(key: &[u8]) -> Option<u64> {
144    if key.len() != 9 || key[0] != PREFIX_SPARSE_EMBEDDING_SPACE {
145        return None;
146    }
147    let bytes: [u8; 8] = key[1..9].try_into().ok()?;
148    Some(u64::from_be_bytes(bytes))
149}
150
151/// Encode a key for an entity's sparse embedding in a space.
152///
153/// Key format: `[PREFIX_SPARSE_EMBEDDING][name_hash][entity_id]`
154#[must_use]
155pub fn encode_sparse_embedding_key(name: &EmbeddingName, entity_id: EntityId) -> Vec<u8> {
156    let mut key = Vec::with_capacity(17);
157    key.push(PREFIX_SPARSE_EMBEDDING);
158    key.extend_from_slice(&hash_name(name.as_str()).to_be_bytes());
159    key.extend_from_slice(&entity_id.as_u64().to_be_bytes());
160    key
161}
162
163/// Encode a prefix for scanning all sparse embeddings in a space.
164#[must_use]
165pub fn encode_sparse_embedding_prefix(name: &EmbeddingName) -> Vec<u8> {
166    let mut key = Vec::with_capacity(9);
167    key.push(PREFIX_SPARSE_EMBEDDING);
168    key.extend_from_slice(&hash_name(name.as_str()).to_be_bytes());
169    key
170}
171
172/// A decoded sparse embedding key.
173#[derive(Debug, Clone, Copy, PartialEq, Eq)]
174pub struct SparseEmbeddingKey {
175    /// The hash of the embedding space name.
176    pub name_hash: u64,
177    /// The entity ID.
178    pub entity_id: EntityId,
179}
180
181/// Decode a sparse embedding key.
182///
183/// Returns `None` if the key doesn't have the correct format.
184#[must_use]
185pub fn decode_sparse_embedding_key(key: &[u8]) -> Option<SparseEmbeddingKey> {
186    if key.len() != 17 || key[0] != PREFIX_SPARSE_EMBEDDING {
187        return None;
188    }
189
190    let name_hash_bytes: [u8; 8] = key[1..9].try_into().ok()?;
191    let entity_id_bytes: [u8; 8] = key[9..17].try_into().ok()?;
192
193    Some(SparseEmbeddingKey {
194        name_hash: u64::from_be_bytes(name_hash_bytes),
195        entity_id: EntityId::new(u64::from_be_bytes(entity_id_bytes)),
196    })
197}
198
199/// Decode an entity ID from a sparse embedding key.
200///
201/// Returns `None` if the key doesn't have the correct format.
202#[must_use]
203pub fn decode_sparse_embedding_entity_id(key: &[u8]) -> Option<EntityId> {
204    decode_sparse_embedding_key(key).map(|k| k.entity_id)
205}
206
207#[cfg(test)]
208mod tests {
209    use super::*;
210
211    #[test]
212    fn embedding_space_key_roundtrip() {
213        let name = EmbeddingName::new("test_space").unwrap();
214        let key = encode_embedding_space_key(&name);
215
216        assert_eq!(key.len(), 9);
217        assert_eq!(key[0], PREFIX_EMBEDDING_SPACE);
218
219        let decoded = decode_embedding_space_key(&key);
220        assert_eq!(decoded, Some(hash_name("test_space")));
221    }
222
223    #[test]
224    fn embedding_key_roundtrip() {
225        let name = EmbeddingName::new("text_embedding").unwrap();
226        let entity_id = EntityId::new(12345);
227
228        let key = encode_embedding_key(&name, entity_id);
229        assert_eq!(key.len(), 17);
230        assert_eq!(key[0], PREFIX_EMBEDDING);
231
232        let decoded = decode_embedding_key(&key).unwrap();
233        assert_eq!(decoded.name_hash, hash_name("text_embedding"));
234        assert_eq!(decoded.entity_id, entity_id);
235    }
236
237    #[test]
238    fn embedding_keys_ordered_by_space_then_entity() {
239        let space1 = EmbeddingName::new("aaa").unwrap();
240        let space2 = EmbeddingName::new("zzz").unwrap();
241
242        let key1 = encode_embedding_key(&space1, EntityId::new(1));
243        let key2 = encode_embedding_key(&space1, EntityId::new(2));
244        let key3 = encode_embedding_key(&space2, EntityId::new(1));
245
246        // Same space: ordered by entity ID
247        assert!(key1 < key2);
248
249        // Note: Different spaces may not be lexicographically ordered by name
250        // due to hashing, but embeddings within a space ARE ordered by entity ID
251        let prefix1 = encode_embedding_prefix(&space1);
252        assert!(key1.starts_with(&prefix1));
253        assert!(key2.starts_with(&prefix1));
254        assert!(!key3.starts_with(&prefix1));
255    }
256
257    #[test]
258    fn embedding_prefix_groups_embeddings_by_space() {
259        let name = EmbeddingName::new("my_space").unwrap();
260        let prefix = encode_embedding_prefix(&name);
261
262        for id in [0u64, 1, 100, u64::MAX] {
263            let key = encode_embedding_key(&name, EntityId::new(id));
264            assert!(key.starts_with(&prefix));
265        }
266    }
267
268    #[test]
269    fn decode_invalid_embedding_key() {
270        // Wrong prefix
271        assert!(decode_embedding_key(&[PREFIX_EMBEDDING_SPACE; 17]).is_none());
272
273        // Wrong length
274        assert!(decode_embedding_key(&[PREFIX_EMBEDDING; 10]).is_none());
275
276        // Empty
277        assert!(decode_embedding_key(&[]).is_none());
278    }
279
280    #[test]
281    fn decode_invalid_space_key() {
282        // Wrong prefix
283        assert!(decode_embedding_space_key(&[PREFIX_EMBEDDING; 9]).is_none());
284
285        // Wrong length
286        assert!(decode_embedding_space_key(&[PREFIX_EMBEDDING_SPACE; 5]).is_none());
287
288        // Empty
289        assert!(decode_embedding_space_key(&[]).is_none());
290    }
291
292    #[test]
293    fn key_prefixes_partition_keyspace() {
294        let name = EmbeddingName::new("test").unwrap();
295        let space_key = encode_embedding_space_key(&name);
296        let embedding_key = encode_embedding_key(&name, EntityId::new(1));
297
298        assert!(space_key[0] != embedding_key[0]);
299    }
300
301    #[test]
302    fn hash_consistency() {
303        // Same name should produce same hash
304        assert_eq!(hash_name("test"), hash_name("test"));
305
306        // Different names should (almost certainly) produce different hashes
307        assert_ne!(hash_name("test1"), hash_name("test2"));
308    }
309}