Skip to main content

chroma_types/
segment.rs

1use super::{
2    CollectionUuid, Metadata, MetadataValueConversionError, SegmentScope,
3    SegmentScopeConversionError,
4};
5use crate::{chroma_proto, DatabaseUuid};
6use chroma_error::{ChromaError, ErrorCodes};
7use std::{collections::HashMap, str::FromStr};
8use thiserror::Error;
9use tonic::Status;
10use uuid::Uuid;
11
// String identifiers used by segments. Within this file only the SPANN ones
// (`POSTING_LIST_PATH` and three of the `QUANTIZED_SPANN_*` names) are
// referenced, as keys into `Segment::file_path` inside
// `Segment::filepaths_to_prefetch`. The rest are presumably `file_path` keys
// consumed by other modules — TODO confirm against their callers.

// Identifiers related to record storage (per their names: user id <-> offset id
// mappings, offset id -> data, and the maximum offset id).
pub const USER_ID_TO_OFFSET_ID: &str = "user_id_to_offset_id";
pub const OFFSET_ID_TO_USER_ID: &str = "offset_id_to_user_id";
pub const OFFSET_ID_TO_DATA: &str = "offset_id_to_data";
pub const MAX_OFFSET_ID: &str = "max_offset_id";

// Identifiers related to metadata storage: one name for full-text data and one
// per metadata value type (string, bool, f32, u32).
pub const FULL_TEXT_PLS: &str = "full_text_pls";
pub const STRING_METADATA: &str = "string_metadata";
pub const BOOL_METADATA: &str = "bool_metadata";
pub const F32_METADATA: &str = "f32_metadata";
pub const U32_METADATA: &str = "u32_metadata";

// Identifiers with a `sparse_` prefix; not referenced elsewhere in this file.
pub const SPARSE_MAX: &str = "sparse_max";
pub const SPARSE_OFFSET_VALUE: &str = "sparse_offset_value";

// Path-style identifiers. `POSTING_LIST_PATH` is the `file_path` key read for
// `SegmentType::Spann` segments in `Segment::filepaths_to_prefetch`.
pub const HNSW_PATH: &str = "hnsw_path";
pub const VERSION_MAP_PATH: &str = "version_map_path";
pub const POSTING_LIST_PATH: &str = "posting_list_path";
// NOTE(review): the constant name contains `_BF_` but the value does not. The
// value is presumably persisted, so do not "fix" it without checking stored data.
pub const MAX_HEAD_ID_BF_PATH: &str = "max_head_id_path";

// Identifiers for quantized SPANN segments. `Segment::filepaths_to_prefetch`
// reads the cluster, embedding-metadata and scalar-metadata keys; the two
// centroid keys are not read in this file.
pub const QUANTIZED_SPANN_CLUSTER: &str = "quantized_spann_cluster";
pub const QUANTIZED_SPANN_SCALAR_METADATA: &str = "quantized_spann_scalar_metadata";
pub const QUANTIZED_SPANN_EMBEDDING_METADATA: &str = "quantized_spann_embedding_metadata";
pub const QUANTIZED_SPANN_RAW_CENTROID: &str = "quantized_spann_raw_centroid";
pub const QUANTIZED_SPANN_QUANTIZED_CENTROID: &str = "quantized_spann_quantized_centroid";
36
/// Newtype wrapper around [`Uuid`] that gives segment ids their own distinct type.
///
/// The wrapped value is public and reachable as `.0`. Use [`SegmentUuid::new`] to
/// generate a fresh id, the [`FromStr`] impl to parse one from text, and the
/// `Display` impl to render it.
///
/// NOTE(review): the derived `Default` delegates to `Uuid::default()` (presumably
/// the nil UUID — confirm against the `uuid` crate docs), so it does not generate
/// a unique id the way `new` does.
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct SegmentUuid(pub Uuid);
40
41impl SegmentUuid {
42    pub fn new() -> Self {
43        SegmentUuid(Uuid::new_v4())
44    }
45}
46
47impl FromStr for SegmentUuid {
48    type Err = SegmentConversionError;
49
50    fn from_str(s: &str) -> Result<Self, SegmentConversionError> {
51        match Uuid::parse_str(s) {
52            Ok(uuid) => Ok(SegmentUuid(uuid)),
53            Err(_) => Err(SegmentConversionError::InvalidUuid),
54        }
55    }
56}
57
58impl std::fmt::Display for SegmentUuid {
59    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60        write!(f, "{}", self.0)
61    }
62}
63
/// The kinds of segment known to this module.
///
/// Every variant maps to exactly one URN string. The conversions live in the
/// `From<SegmentType> for String` and `TryFrom<&str> for SegmentType` impls; the
/// URN for each variant is listed on the variant below.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SegmentType {
    /// `urn:chroma:segment/metadata/blockfile`
    BlockfileMetadata,
    /// `urn:chroma:segment/record/blockfile`
    BlockfileRecord,
    /// `urn:chroma:segment/vector/hnsw-distributed`
    HnswDistributed,
    /// `urn:chroma:segment/vector/hnsw-local-memory`
    HnswLocalMemory,
    /// `urn:chroma:segment/vector/hnsw-local-persisted`
    HnswLocalPersisted,
    /// `urn:chroma:segment/metadata/sqlite`
    Sqlite,
    /// `urn:chroma:segment/vector/spann`
    Spann,
    /// `urn:chroma:segment/vector/quantized-spann`
    QuantizedSpann,
}
75
76impl From<SegmentType> for String {
77    fn from(segment_type: SegmentType) -> String {
78        match segment_type {
79            SegmentType::BlockfileMetadata => "urn:chroma:segment/metadata/blockfile".to_string(),
80            SegmentType::BlockfileRecord => "urn:chroma:segment/record/blockfile".to_string(),
81            SegmentType::HnswDistributed => {
82                "urn:chroma:segment/vector/hnsw-distributed".to_string()
83            }
84            SegmentType::HnswLocalMemory => {
85                "urn:chroma:segment/vector/hnsw-local-memory".to_string()
86            }
87            SegmentType::HnswLocalPersisted => {
88                "urn:chroma:segment/vector/hnsw-local-persisted".to_string()
89            }
90            SegmentType::Spann => "urn:chroma:segment/vector/spann".to_string(),
91            SegmentType::QuantizedSpann => "urn:chroma:segment/vector/quantized-spann".to_string(),
92            SegmentType::Sqlite => "urn:chroma:segment/metadata/sqlite".to_string(),
93        }
94    }
95}
96
97impl TryFrom<&str> for SegmentType {
98    type Error = SegmentConversionError;
99
100    fn try_from(segment_type: &str) -> Result<Self, Self::Error> {
101        match segment_type {
102            "urn:chroma:segment/metadata/blockfile" => Ok(SegmentType::BlockfileMetadata),
103            "urn:chroma:segment/record/blockfile" => Ok(SegmentType::BlockfileRecord),
104            "urn:chroma:segment/vector/hnsw-distributed" => Ok(SegmentType::HnswDistributed),
105            "urn:chroma:segment/vector/hnsw-local-memory" => Ok(SegmentType::HnswLocalMemory),
106            "urn:chroma:segment/vector/hnsw-local-persisted" => Ok(Self::HnswLocalPersisted),
107            "urn:chroma:segment/vector/spann" => Ok(SegmentType::Spann),
108            "urn:chroma:segment/vector/quantized-spann" => Ok(SegmentType::QuantizedSpann),
109            "urn:chroma:segment/metadata/sqlite" => Ok(SegmentType::Sqlite),
110            _ => Err(SegmentConversionError::InvalidSegmentType),
111        }
112    }
113}
114
/// In-memory representation of a segment.
///
/// Converts from `chroma_proto::Segment` via `TryFrom` and back via `From`.
#[derive(Clone, Debug, PartialEq)]
pub struct Segment {
    /// Id of this segment.
    pub id: SegmentUuid,
    /// Kind of segment; see [`SegmentType`].
    pub r#type: SegmentType,
    /// Scope of this segment.
    pub scope: SegmentScope,
    /// Id of the collection associated with this segment.
    pub collection: CollectionUuid,
    /// Optional metadata attached to the segment.
    pub metadata: Option<Metadata>,
    /// Paths grouped by a string key (for example `POSTING_LIST_PATH`); each key
    /// maps to the list of paths stored under it.
    pub file_path: HashMap<String, Vec<String>>,
}
124
125impl Segment {
126    pub fn prefetch_supported(&self) -> bool {
127        matches!(
128            self.r#type,
129            SegmentType::BlockfileMetadata
130                | SegmentType::BlockfileRecord
131                | SegmentType::QuantizedSpann
132                | SegmentType::Spann
133        )
134    }
135
136    pub fn filepaths_to_prefetch(&self) -> Vec<String> {
137        let mut res = Vec::new();
138        match self.r#type {
139            SegmentType::QuantizedSpann => {
140                for key in [
141                    QUANTIZED_SPANN_CLUSTER,
142                    QUANTIZED_SPANN_EMBEDDING_METADATA,
143                    QUANTIZED_SPANN_SCALAR_METADATA,
144                ] {
145                    if let Some(paths) = self.file_path.get(key) {
146                        res.extend(paths.iter().cloned());
147                    }
148                }
149            }
150            SegmentType::Spann => {
151                if let Some(pl_path) = self.file_path.get(POSTING_LIST_PATH) {
152                    res.extend(pl_path.iter().cloned());
153                }
154            }
155            SegmentType::BlockfileMetadata | SegmentType::BlockfileRecord => {
156                for paths in self.file_path.values() {
157                    res.extend(paths.iter().cloned());
158                }
159            }
160            _ => {}
161        }
162        res
163    }
164
165    pub fn extract_prefix_and_id(path: &str) -> Result<(&str, uuid::Uuid), uuid::Error> {
166        let (prefix, id) = match path.rfind('/') {
167            Some(pos) => (&path[..pos], &path[pos + 1..]),
168            None => ("", path),
169        };
170        match Uuid::try_parse(id) {
171            Ok(uid) => Ok((prefix, uid)),
172            Err(e) => Err(e),
173        }
174    }
175
176    pub fn construct_prefix_path(&self, tenant: &str, database_id: &DatabaseUuid) -> String {
177        format!(
178            "tenant/{}/database/{}/collection/{}/segment/{}",
179            tenant, database_id, self.collection, self.id
180        )
181    }
182}
183
/// Errors produced when converting external representations (strings, protobuf
/// messages) into the segment types defined in this module.
#[derive(Error, Debug)]
pub enum SegmentConversionError {
    /// A segment or collection id could not be parsed as a UUID.
    #[error("Invalid UUID")]
    InvalidUuid,
    /// Converting the segment's metadata failed; the message comes from the
    /// wrapped error.
    #[error(transparent)]
    MetadataValueConversionError(#[from] MetadataValueConversionError),
    /// Converting the segment's scope failed; the message comes from the wrapped
    /// error.
    #[error(transparent)]
    SegmentScopeConversionError(#[from] SegmentScopeConversionError),
    /// The segment type string is not one of the URNs accepted by
    /// `TryFrom<&str> for SegmentType`.
    #[error("Invalid segment type")]
    InvalidSegmentType,
}
195
196impl ChromaError for SegmentConversionError {
197    fn code(&self) -> ErrorCodes {
198        match self {
199            SegmentConversionError::InvalidUuid => ErrorCodes::InvalidArgument,
200            SegmentConversionError::InvalidSegmentType => ErrorCodes::InvalidArgument,
201            SegmentConversionError::SegmentScopeConversionError(e) => e.code(),
202            SegmentConversionError::MetadataValueConversionError(e) => e.code(),
203        }
204    }
205}
206
207impl From<SegmentConversionError> for Status {
208    fn from(value: SegmentConversionError) -> Self {
209        Status::invalid_argument(value.to_string())
210    }
211}
212
213impl TryFrom<chroma_proto::Segment> for Segment {
214    type Error = SegmentConversionError;
215
216    fn try_from(proto_segment: chroma_proto::Segment) -> Result<Self, Self::Error> {
217        let mut proto_segment = proto_segment;
218
219        let segment_uuid = match SegmentUuid::from_str(&proto_segment.id) {
220            Ok(uuid) => uuid,
221            Err(_) => return Err(SegmentConversionError::InvalidUuid),
222        };
223        let collection_uuid = match Uuid::try_parse(&proto_segment.collection) {
224            Ok(uuid) => uuid,
225            Err(_) => return Err(SegmentConversionError::InvalidUuid),
226        };
227        let collection_uuid = CollectionUuid(collection_uuid);
228        let segment_metadata: Option<Metadata> = match proto_segment.metadata {
229            Some(proto_metadata) => match proto_metadata.try_into() {
230                Ok(metadata) => Some(metadata),
231                Err(e) => return Err(SegmentConversionError::MetadataValueConversionError(e)),
232            },
233            None => None,
234        };
235        let scope: SegmentScope = match proto_segment.scope.try_into() {
236            Ok(scope) => scope,
237            Err(e) => return Err(SegmentConversionError::SegmentScopeConversionError(e)),
238        };
239
240        let segment_type: SegmentType = proto_segment.r#type.as_str().try_into()?;
241
242        let mut file_paths = HashMap::new();
243        let drain = proto_segment.file_paths.drain();
244        for (key, value) in drain {
245            file_paths.insert(key, value.paths);
246        }
247
248        Ok(Segment {
249            id: segment_uuid,
250            r#type: segment_type,
251            scope,
252            collection: collection_uuid,
253            metadata: segment_metadata,
254            file_path: file_paths,
255        })
256    }
257}
258
259impl From<Segment> for chroma_proto::Segment {
260    fn from(value: Segment) -> Self {
261        Self {
262            id: value.id.0.to_string(),
263            r#type: value.r#type.into(),
264            scope: chroma_proto::SegmentScope::from(value.scope) as i32,
265            collection: value.collection.0.to_string(),
266            metadata: value.metadata.map(Into::into),
267            file_paths: value
268                .file_path
269                .into_iter()
270                .map(|(name, paths)| (name, chroma_proto::FilePaths { paths }))
271                .collect(),
272        }
273    }
274}
275
276pub fn test_segment(collection_uuid: CollectionUuid, scope: SegmentScope) -> Segment {
277    let r#type = match scope {
278        SegmentScope::METADATA => SegmentType::BlockfileMetadata,
279        SegmentScope::RECORD => SegmentType::BlockfileRecord,
280        SegmentScope::VECTOR => SegmentType::HnswDistributed,
281        SegmentScope::SQLITE => unimplemented!("Sqlite segment is not implemented"),
282    };
283    Segment {
284        id: SegmentUuid::new(),
285        r#type,
286        scope,
287        collection: collection_uuid,
288        metadata: None,
289        file_path: HashMap::new(),
290    }
291}
292
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MetadataValue;

    /// A proto segment with a single integer metadata entry converts into a
    /// `Segment` with matching id, type, scope, collection and metadata.
    #[test]
    fn test_segment_try_from() {
        let int_value = chroma_proto::UpdateMetadataValue {
            value: Some(chroma_proto::update_metadata_value::Value::IntValue(42)),
        };
        let proto_metadata = chroma_proto::UpdateMetadata {
            metadata: std::iter::once(("foo".to_string(), int_value)).collect(),
        };
        let proto_segment = chroma_proto::Segment {
            id: "00000000-0000-0000-0000-000000000000".to_string(),
            r#type: "urn:chroma:segment/vector/hnsw-distributed".to_string(),
            scope: chroma_proto::SegmentScope::Vector as i32,
            collection: "00000000-0000-0000-0000-000000000000".to_string(),
            metadata: Some(proto_metadata),
            file_paths: HashMap::new(),
        };

        let segment = Segment::try_from(proto_segment).unwrap();

        assert_eq!(segment.id, SegmentUuid(Uuid::nil()));
        assert_eq!(segment.r#type, SegmentType::HnswDistributed);
        assert_eq!(segment.scope, SegmentScope::VECTOR);
        assert_eq!(segment.collection, CollectionUuid(Uuid::nil()));
        let converted_metadata = segment.metadata.unwrap();
        assert_eq!(converted_metadata.len(), 1);
        assert_eq!(
            converted_metadata.get("foo").unwrap(),
            &MetadataValue::Int(42)
        );
    }

    /// The prefix path embeds tenant, database, collection and segment ids.
    #[test]
    fn test_segment_construct_prefix_path() {
        let nil_segment = Segment {
            id: SegmentUuid(Uuid::nil()),
            r#type: SegmentType::BlockfileMetadata,
            scope: SegmentScope::METADATA,
            collection: CollectionUuid(Uuid::nil()),
            metadata: None,
            file_path: HashMap::new(),
        };
        let database_id = DatabaseUuid(Uuid::nil());

        let prefix_path = nil_segment.construct_prefix_path("test_tenant", &database_id);

        assert_eq!(
            prefix_path,
            "tenant/test_tenant/database/00000000-0000-0000-0000-000000000000/collection/00000000-0000-0000-0000-000000000000/segment/00000000-0000-0000-0000-000000000000"
        );
    }

    /// A full path splits into everything before the last `/` and the trailing id.
    #[test]
    fn test_segment_extract_prefix_and_id() {
        let expected_id =
            Uuid::from_str("00000000-0000-0000-0000-000000000001").expect("Cannot happen");
        let path = "tenant/test_tenant/database/00000000-0000-0000-0000-000000000000/collection/00000000-0000-0000-0000-000000000000/segment/00000000-0000-0000-0000-000000000000/00000000-0000-0000-0000-000000000001";

        let (prefix, id) =
            Segment::extract_prefix_and_id(path).expect("Failed to extract prefix and id");

        assert_eq!(
            prefix,
            "tenant/test_tenant/database/00000000-0000-0000-0000-000000000000/collection/00000000-0000-0000-0000-000000000000/segment/00000000-0000-0000-0000-000000000000"
        );
        assert_eq!(id, expected_id);
    }

    /// A bare id with no `/` yields an empty prefix.
    #[test]
    fn test_segment_extract_prefix_and_id_legacy() {
        let expected_id =
            Uuid::from_str("00000000-0000-0000-0000-000000000001").expect("Cannot happen");

        let (prefix, id) = Segment::extract_prefix_and_id("00000000-0000-0000-0000-000000000001")
            .expect("Failed to extract prefix and id");

        assert_eq!(prefix, "");
        assert_eq!(id, expected_id);
    }
}