manifoldb_vector/encoding/inverted_keys.rs

//! Key encoding for inverted index storage.
//!
//! This module provides key encoding functions for sparse vector inverted indexes,
//! supporting efficient posting list lookups and range scans.
//!
//! # Key Format
//!
//! All keys use big-endian encoding for proper sort order in range scans.
//!
//! ## Posting list entry
//! `[PREFIX_POSTING][collection_hash][vector_name_hash][token_id]` → `[(point_id, weight), ...]`
//!
//! ## Index metadata
//! `[PREFIX_INVERTED_META][collection_hash][vector_name_hash]` → `InvertedIndexMeta`
//!
//! ## Point tokens (reverse mapping for deletion)
//! `[PREFIX_POINT_TOKENS][collection_hash][vector_name_hash][point_id]` → `[token_ids]`
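//!
//! # Example
//!
//! A minimal sketch of encoding and decoding a posting key. The import path below is
//! an assumption about how the crate exposes this module (hence the `ignore` fence);
//! adjust it to the actual re-export.
//!
//! ```ignore
//! use manifoldb_vector::encoding::inverted_keys::{
//!     decode_posting_key, encode_posting_key, encode_posting_prefix,
//! };
//!
//! let key = encode_posting_key("documents", "keywords", 42);
//! assert_eq!(key.len(), 21);
//!
//! // Only the name hashes and the token id are recoverable from the key.
//! let decoded = decode_posting_key(&key).unwrap();
//! assert_eq!(decoded.token_id, 42);
//!
//! // Prefix keys are byte-wise prefixes of full keys, so a storage-level range
//! // scan over the prefix visits every posting list for this vector.
//! assert!(key.starts_with(&encode_posting_prefix("documents", "keywords")));
//! ```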

use manifoldb_core::PointId;

use super::point_keys::hash_name;

/// Key prefix for posting lists.
pub const PREFIX_POSTING: u8 = 0x30;

/// Key prefix for inverted index metadata.
pub const PREFIX_INVERTED_META: u8 = 0x31;

/// Key prefix for point-to-tokens reverse mapping.
pub const PREFIX_POINT_TOKENS: u8 = 0x32;

// ============================================================================
// Posting list keys
// ============================================================================

/// Encode a key for a posting list entry.
///
/// Key format: `[PREFIX_POSTING][collection_hash][vector_name_hash][token_id]`
#[must_use]
pub fn encode_posting_key(collection_name: &str, vector_name: &str, token_id: u32) -> Vec<u8> {
    let mut key = Vec::with_capacity(21);
    key.push(PREFIX_POSTING);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key.extend_from_slice(&hash_name(vector_name).to_be_bytes());
    key.extend_from_slice(&token_id.to_be_bytes());
    key
}

/// Encode a prefix for scanning all posting lists for a vector.
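///
/// # Example
///
/// A sketch of how the prefix relates to full posting keys (import paths assumed,
/// as in the module-level example):
///
/// ```ignore
/// # use manifoldb_vector::encoding::inverted_keys::{encode_posting_key, encode_posting_prefix};
/// let prefix = encode_posting_prefix("documents", "keywords");
/// // Every posting key for this (collection, vector) pair starts with the prefix,
/// // so the prefix bounds a contiguous key range for a scan.
/// assert!(encode_posting_key("documents", "keywords", 7).starts_with(&prefix));
/// ```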
#[must_use]
pub fn encode_posting_prefix(collection_name: &str, vector_name: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(17);
    key.push(PREFIX_POSTING);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key.extend_from_slice(&hash_name(vector_name).to_be_bytes());
    key
}

/// Encode a prefix for scanning all posting lists in a collection.
#[must_use]
pub fn encode_posting_collection_prefix(collection_name: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(9);
    key.push(PREFIX_POSTING);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key
}

/// A decoded posting key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PostingKey {
    /// The hash of the collection name.
    pub collection_name_hash: u64,
    /// The hash of the vector name.
    pub vector_name_hash: u64,
    /// The token ID.
    pub token_id: u32,
}

/// Decode a posting key.
///
/// Returns `None` if the key doesn't have the correct format.
#[must_use]
pub fn decode_posting_key(key: &[u8]) -> Option<PostingKey> {
    if key.len() != 21 || key[0] != PREFIX_POSTING {
        return None;
    }
    let collection_hash_bytes: [u8; 8] = key[1..9].try_into().ok()?;
    let vector_hash_bytes: [u8; 8] = key[9..17].try_into().ok()?;
    let token_id_bytes: [u8; 4] = key[17..21].try_into().ok()?;
    Some(PostingKey {
        collection_name_hash: u64::from_be_bytes(collection_hash_bytes),
        vector_name_hash: u64::from_be_bytes(vector_hash_bytes),
        token_id: u32::from_be_bytes(token_id_bytes),
    })
}

// ============================================================================
// Index metadata keys
// ============================================================================

/// Encode a key for inverted index metadata.
///
/// Key format: `[PREFIX_INVERTED_META][collection_hash][vector_name_hash]`
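///
/// # Example
///
/// A sketch of the metadata-key round trip (import paths assumed, as in the
/// module-level example):
///
/// ```ignore
/// # use manifoldb_vector::encoding::inverted_keys::{decode_inverted_meta_key, encode_inverted_meta_key};
/// let key = encode_inverted_meta_key("documents", "keywords");
/// assert_eq!(key.len(), 17);
/// // Only the name hashes are recoverable; the original strings are not stored.
/// let decoded = decode_inverted_meta_key(&key).unwrap();
/// println!("collection hash: {:016x}", decoded.collection_name_hash);
/// ```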
#[must_use]
pub fn encode_inverted_meta_key(collection_name: &str, vector_name: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(17);
    key.push(PREFIX_INVERTED_META);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key.extend_from_slice(&hash_name(vector_name).to_be_bytes());
    key
}

/// Encode a prefix for scanning all inverted indexes in a collection.
#[must_use]
pub fn encode_inverted_meta_collection_prefix(collection_name: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(9);
    key.push(PREFIX_INVERTED_META);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key
}

/// A decoded inverted meta key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InvertedMetaKey {
    /// The hash of the collection name.
    pub collection_name_hash: u64,
    /// The hash of the vector name.
    pub vector_name_hash: u64,
}

/// Decode an inverted meta key.
///
/// Returns `None` if the key doesn't have the correct format.
#[must_use]
pub fn decode_inverted_meta_key(key: &[u8]) -> Option<InvertedMetaKey> {
    if key.len() != 17 || key[0] != PREFIX_INVERTED_META {
        return None;
    }
    let collection_hash_bytes: [u8; 8] = key[1..9].try_into().ok()?;
    let vector_hash_bytes: [u8; 8] = key[9..17].try_into().ok()?;
    Some(InvertedMetaKey {
        collection_name_hash: u64::from_be_bytes(collection_hash_bytes),
        vector_name_hash: u64::from_be_bytes(vector_hash_bytes),
    })
}

// ============================================================================
// Point tokens keys (reverse mapping)
// ============================================================================

/// Encode a key for point-to-tokens reverse mapping.
///
/// Key format: `[PREFIX_POINT_TOKENS][collection_hash][vector_name_hash][point_id]`
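///
/// # Example
///
/// A sketch of the reverse-mapping round trip used when deleting a point (import
/// paths assumed, as in the module-level example):
///
/// ```ignore
/// # use manifoldb_core::PointId;
/// # use manifoldb_vector::encoding::inverted_keys::{decode_point_tokens_key, encode_point_tokens_key};
/// let key = encode_point_tokens_key("documents", "keywords", PointId::new(42));
/// assert_eq!(key.len(), 25);
/// // Looking this key up yields the token ids whose posting lists still
/// // reference the point, so they can be cleaned up on deletion.
/// let decoded = decode_point_tokens_key(&key).unwrap();
/// assert_eq!(decoded.point_id, PointId::new(42));
/// ```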
#[must_use]
pub fn encode_point_tokens_key(
    collection_name: &str,
    vector_name: &str,
    point_id: PointId,
) -> Vec<u8> {
    let mut key = Vec::with_capacity(25);
    key.push(PREFIX_POINT_TOKENS);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key.extend_from_slice(&hash_name(vector_name).to_be_bytes());
    key.extend_from_slice(&point_id.as_u64().to_be_bytes());
    key
}

/// Encode a prefix for scanning all point tokens for a vector.
#[must_use]
pub fn encode_point_tokens_prefix(collection_name: &str, vector_name: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(17);
    key.push(PREFIX_POINT_TOKENS);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key.extend_from_slice(&hash_name(vector_name).to_be_bytes());
    key
}

/// Encode a prefix for scanning all point tokens in a collection.
#[must_use]
pub fn encode_point_tokens_collection_prefix(collection_name: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(9);
    key.push(PREFIX_POINT_TOKENS);
    key.extend_from_slice(&hash_name(collection_name).to_be_bytes());
    key
}

/// A decoded point tokens key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PointTokensKey {
    /// The hash of the collection name.
    pub collection_name_hash: u64,
    /// The hash of the vector name.
    pub vector_name_hash: u64,
    /// The point ID.
    pub point_id: PointId,
}

/// Decode a point tokens key.
///
/// Returns `None` if the key doesn't have the correct format.
#[must_use]
pub fn decode_point_tokens_key(key: &[u8]) -> Option<PointTokensKey> {
    if key.len() != 25 || key[0] != PREFIX_POINT_TOKENS {
        return None;
    }
    let collection_hash_bytes: [u8; 8] = key[1..9].try_into().ok()?;
    let vector_hash_bytes: [u8; 8] = key[9..17].try_into().ok()?;
    let point_id_bytes: [u8; 8] = key[17..25].try_into().ok()?;
    Some(PointTokensKey {
        collection_name_hash: u64::from_be_bytes(collection_hash_bytes),
        vector_name_hash: u64::from_be_bytes(vector_hash_bytes),
        point_id: PointId::new(u64::from_be_bytes(point_id_bytes)),
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn posting_key_roundtrip() {
        let collection = "documents";
        let vector = "keywords";
        let token_id = 12345u32;

        let key = encode_posting_key(collection, vector, token_id);
        assert_eq!(key.len(), 21);
        assert_eq!(key[0], PREFIX_POSTING);

        let decoded = decode_posting_key(&key).unwrap();
        assert_eq!(decoded.collection_name_hash, hash_name(collection));
        assert_eq!(decoded.vector_name_hash, hash_name(vector));
        assert_eq!(decoded.token_id, token_id);
    }

    #[test]
    fn posting_keys_sorted_by_token_id() {
        let key1 = encode_posting_key("col", "vec", 1);
        let key2 = encode_posting_key("col", "vec", 2);
        let key3 = encode_posting_key("col", "vec", 100);

        assert!(key1 < key2);
        assert!(key2 < key3);
    }

    #[test]
    fn posting_prefix_matches() {
        let collection = "documents";
        let vector = "keywords";
        let prefix = encode_posting_prefix(collection, vector);

        let key1 = encode_posting_key(collection, vector, 1);
        let key2 = encode_posting_key(collection, vector, 1000);

        assert!(key1.starts_with(&prefix));
        assert!(key2.starts_with(&prefix));

        // Different vector should not match
        let key_other = encode_posting_key(collection, "other_vector", 1);
        assert!(!key_other.starts_with(&prefix));
    }

    #[test]
    fn inverted_meta_key_roundtrip() {
        let collection = "documents";
        let vector = "keywords";

        let key = encode_inverted_meta_key(collection, vector);
        assert_eq!(key.len(), 17);
        assert_eq!(key[0], PREFIX_INVERTED_META);

        let decoded = decode_inverted_meta_key(&key).unwrap();
        assert_eq!(decoded.collection_name_hash, hash_name(collection));
        assert_eq!(decoded.vector_name_hash, hash_name(vector));
    }

    #[test]
    fn point_tokens_key_roundtrip() {
        let collection = "documents";
        let vector = "keywords";
        let point_id = PointId::new(42);

        let key = encode_point_tokens_key(collection, vector, point_id);
        assert_eq!(key.len(), 25);
        assert_eq!(key[0], PREFIX_POINT_TOKENS);

        let decoded = decode_point_tokens_key(&key).unwrap();
        assert_eq!(decoded.collection_name_hash, hash_name(collection));
        assert_eq!(decoded.vector_name_hash, hash_name(vector));
        assert_eq!(decoded.point_id, point_id);
    }

    #[test]
    fn decode_invalid_keys() {
        // Wrong prefix
        assert!(decode_posting_key(&[PREFIX_INVERTED_META; 21]).is_none());
        assert!(decode_inverted_meta_key(&[PREFIX_POSTING; 17]).is_none());
        assert!(decode_point_tokens_key(&[PREFIX_POSTING; 25]).is_none());

        // Wrong length
        assert!(decode_posting_key(&[PREFIX_POSTING; 10]).is_none());
        assert!(decode_inverted_meta_key(&[PREFIX_INVERTED_META; 10]).is_none());
        assert!(decode_point_tokens_key(&[PREFIX_POINT_TOKENS; 10]).is_none());

        // Empty
        assert!(decode_posting_key(&[]).is_none());
        assert!(decode_inverted_meta_key(&[]).is_none());
        assert!(decode_point_tokens_key(&[]).is_none());
    }
}