vector/serde/id_dictionary.rs
1//! IdDictionary value encoding/decoding.
2//!
3//! Maps user-provided external IDs to system-assigned internal vector IDs.
4//!
5//! ## Why Two ID Spaces?
6//!
7//! Users provide **external IDs**—arbitrary strings up to 64 bytes—to identify
8//! their vectors. The system maps these to **internal IDs** (u64) because:
9//!
10//! 1. **Efficient bitmaps**: Fixed-width u64 keys enable RoaringTreemap operations
11//! 2. **Better compression**: Monotonically increasing IDs cluster well in bitmaps
12//! 3. **Lifecycle management**: System controls ID allocation and reuse
13//!
14//! ## Upsert Behavior
15//!
16//! When inserting a vector with an existing external ID:
17//!
18//! 1. Look up existing internal ID from `IdDictionary`
19//! 2. Delete old vector: add to deleted bitmap, tombstone data/metadata
20//! 3. Allocate new internal ID (from `SeqBlock`)
21//! 4. Write new vector with new internal ID
22//! 5. Update `IdDictionary` to point to new internal ID
23//!
24//! This "delete old + insert new" approach avoids expensive read-modify-write
25//! cycles to update every posting list and metadata index entry.
26//!
27//! ## Delete Operation
28//!
29//! Deleting a vector requires atomic operations via `WriteBatch`:
30//! 1. Add vector ID to deleted bitmap (centroid_id = 0 posting list)
31//! 2. Tombstone `VectorData` record
32//! 3. Tombstone `VectorMeta` record
33//! 4. Tombstone `IdDictionary` entry
34//!
35//! Metadata index cleanup happens during LIRE maintenance.
36
37use super::{Decode, Encode, EncodingError};
38use bytes::{Bytes, BytesMut};
39
/// Value half of an `IdDictionary` record: the internal vector ID that an
/// external ID currently resolves to.
///
/// The matching key is `IdDictionaryKey { external_id }`. That key is written
/// with `TerminatedBytes` encoding so that external IDs keep their
/// lexicographic order.
///
/// ## Value Layout (little-endian)
///
/// ```text
/// ┌────────────────────────────────────────────────────────────────┐
/// │  vector_id:  u64  (8 bytes, little-endian)                     │
/// └────────────────────────────────────────────────────────────────┘
/// ```
///
/// ## Usage
///
/// - **Insert**: resolve the external ID first; a hit means upsert, a miss
///   means a new internal ID is allocated.
/// - **Get by external ID**: a point lookup on this record yields the
///   external → internal mapping.
/// - **Get by internal ID**: go through `VectorMeta`, which holds the reverse
///   mapping.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IdDictionaryValue {
    /// System-assigned internal vector ID (monotonically increasing).
    ///
    /// The same ID appears in `VectorData`, `VectorMeta`, posting lists, and
    /// metadata indexes. IDs come from `SeqBlock`, whose block-based
    /// allocation provides crash safety.
    pub vector_id: u64,
}
67
68impl IdDictionaryValue {
69 pub fn new(vector_id: u64) -> Self {
70 Self { vector_id }
71 }
72
73 pub fn encode_to_bytes(&self) -> Bytes {
74 let mut buf = BytesMut::with_capacity(8);
75 self.vector_id.encode(&mut buf);
76 buf.freeze()
77 }
78
79 pub fn decode_from_bytes(buf: &[u8]) -> Result<Self, EncodingError> {
80 if buf.len() < 8 {
81 return Err(EncodingError {
82 message: format!(
83 "Buffer too short for IdDictionaryValue: need 8 bytes, have {}",
84 buf.len()
85 ),
86 });
87 }
88 let mut slice = buf;
89 let vector_id = u64::decode(&mut slice)?;
90 Ok(IdDictionaryValue { vector_id })
91 }
92}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// A typical ID survives an encode/decode round trip and encodes to
    /// exactly 8 bytes.
    #[test]
    fn should_encode_and_decode_id_dictionary_value() {
        // given
        let original = IdDictionaryValue::new(12345);

        // when
        let bytes = original.encode_to_bytes();
        let round_tripped = IdDictionaryValue::decode_from_bytes(&bytes).unwrap();

        // then
        assert_eq!(round_tripped, original);
        assert_eq!(bytes.len(), 8);
    }

    /// The largest representable ID round-trips without loss.
    #[test]
    fn should_encode_and_decode_max_value() {
        // given
        let original = IdDictionaryValue::new(u64::MAX);

        // when
        let bytes = original.encode_to_bytes();
        let round_tripped = IdDictionaryValue::decode_from_bytes(&bytes).unwrap();

        // then
        assert_eq!(round_tripped, original);
    }

    /// Input shorter than 8 bytes is rejected with a descriptive error.
    #[test]
    fn should_return_error_for_short_buffer() {
        // given: only 4 of the required 8 bytes
        let truncated = [0u8; 4];

        // when
        let outcome = IdDictionaryValue::decode_from_bytes(&truncated);

        // then
        assert!(outcome.is_err());
        let error = outcome.unwrap_err();
        assert!(error.message.contains("Buffer too short"));
    }
}
137}