Skip to main content

chroma_types/
segment.rs

1use super::{
2    CollectionUuid, Metadata, MetadataValueConversionError, SegmentScope,
3    SegmentScopeConversionError,
4};
5use crate::{chroma_proto, DatabaseUuid};
6use chroma_error::{ChromaError, ErrorCodes};
7use std::{collections::HashMap, str::FromStr};
8use thiserror::Error;
9use tonic::Status;
10use uuid::Uuid;
11
// String keys naming the individual files that belong to a segment.
//
// `Segment::filepaths_to_prefetch` (in this file) looks several of these up in
// `Segment::file_path`. The remaining constants are presumably used as
// `file_path` keys elsewhere in the crate — verify against the segment
// readers/writers before relying on that.
//
// NOTE(review): the string values look like persisted identifiers; renaming one
// would presumably break lookup of already-written segments — confirm.

// NOTE(review): names suggest these are the record-segment files — confirm.
pub const USER_ID_TO_OFFSET_ID: &str = "user_id_to_offset_id";
pub const OFFSET_ID_TO_USER_ID: &str = "offset_id_to_user_id";
pub const OFFSET_ID_TO_DATA: &str = "offset_id_to_data";
pub const MAX_OFFSET_ID: &str = "max_offset_id";
/// Entries under this key are skipped by `Segment::filepaths_to_prefetch` for
/// blockfile metadata/record segments.
pub const USER_ID_BLOOM_FILTER: &str = "user_id_bloom_filter";

// NOTE(review): names suggest these are the metadata-segment files — confirm.
pub const FULL_TEXT_PLS: &str = "full_text_pls";
pub const STRING_METADATA: &str = "string_metadata";
pub const BOOL_METADATA: &str = "bool_metadata";
pub const F32_METADATA: &str = "f32_metadata";
pub const U32_METADATA: &str = "u32_metadata";

// NOTE(review): names suggest these belong to a sparse index — confirm.
pub const SPARSE_MAX: &str = "sparse_max";
pub const SPARSE_OFFSET_VALUE: &str = "sparse_offset_value";

// NOTE(review): names suggest these are vector-index files — confirm.
pub const HNSW_PATH: &str = "hnsw_path";
pub const VERSION_MAP_PATH: &str = "version_map_path";
/// The only key `Segment::filepaths_to_prefetch` reads for `SegmentType::Spann`.
pub const POSTING_LIST_PATH: &str = "posting_list_path";
// NOTE(review): the constant name contains `BF` but the value does not
// ("max_head_id_path"); presumably the value is kept for compatibility — confirm
// before "fixing" either side.
pub const MAX_HEAD_ID_BF_PATH: &str = "max_head_id_path";

// Keys for `SegmentType::QuantizedSpann`. `Segment::filepaths_to_prefetch` reads
// the cluster, embedding-metadata and scalar-metadata keys; it does not read the
// two centroid keys.
pub const QUANTIZED_SPANN_CLUSTER: &str = "quantized_spann_cluster";
pub const QUANTIZED_SPANN_SCALAR_METADATA: &str = "quantized_spann_scalar_metadata";
pub const QUANTIZED_SPANN_EMBEDDING_METADATA: &str = "quantized_spann_embedding_metadata";
pub const QUANTIZED_SPANN_RAW_CENTROID: &str = "quantized_spann_raw_centroid";
pub const QUANTIZED_SPANN_QUANTIZED_CENTROID: &str = "quantized_spann_quantized_centroid";
37
38/// SegmentUuid is a wrapper around Uuid to provide a type for the segment id.
39#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
40pub struct SegmentUuid(pub Uuid);
41
42impl SegmentUuid {
43    pub fn new() -> Self {
44        SegmentUuid(Uuid::new_v4())
45    }
46}
47
48impl FromStr for SegmentUuid {
49    type Err = SegmentConversionError;
50
51    fn from_str(s: &str) -> Result<Self, SegmentConversionError> {
52        match Uuid::parse_str(s) {
53            Ok(uuid) => Ok(SegmentUuid(uuid)),
54            Err(_) => Err(SegmentConversionError::InvalidUuid),
55        }
56    }
57}
58
59impl std::fmt::Display for SegmentUuid {
60    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61        write!(f, "{}", self.0)
62    }
63}
64
/// The kinds of segment known to this module.
///
/// Every variant has exactly one `urn:chroma:segment/...` string form, produced by
/// `String::from(segment_type)`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SegmentType {
    BlockfileMetadata,
    BlockfileRecord,
    HnswDistributed,
    HnswLocalMemory,
    HnswLocalPersisted,
    Sqlite,
    Spann,
    QuantizedSpann,
}

impl From<SegmentType> for String {
    /// Returns the URN string that identifies `segment_type`.
    fn from(segment_type: SegmentType) -> String {
        // Pick the static URN first; allocate once at the end.
        let urn: &str = match segment_type {
            SegmentType::BlockfileMetadata => "urn:chroma:segment/metadata/blockfile",
            SegmentType::BlockfileRecord => "urn:chroma:segment/record/blockfile",
            SegmentType::HnswDistributed => "urn:chroma:segment/vector/hnsw-distributed",
            SegmentType::HnswLocalMemory => "urn:chroma:segment/vector/hnsw-local-memory",
            SegmentType::HnswLocalPersisted => "urn:chroma:segment/vector/hnsw-local-persisted",
            SegmentType::Sqlite => "urn:chroma:segment/metadata/sqlite",
            SegmentType::Spann => "urn:chroma:segment/vector/spann",
            SegmentType::QuantizedSpann => "urn:chroma:segment/vector/quantized-spann",
        };
        urn.to_owned()
    }
}
97
98impl TryFrom<&str> for SegmentType {
99    type Error = SegmentConversionError;
100
101    fn try_from(segment_type: &str) -> Result<Self, Self::Error> {
102        match segment_type {
103            "urn:chroma:segment/metadata/blockfile" => Ok(SegmentType::BlockfileMetadata),
104            "urn:chroma:segment/record/blockfile" => Ok(SegmentType::BlockfileRecord),
105            "urn:chroma:segment/vector/hnsw-distributed" => Ok(SegmentType::HnswDistributed),
106            "urn:chroma:segment/vector/hnsw-local-memory" => Ok(SegmentType::HnswLocalMemory),
107            "urn:chroma:segment/vector/hnsw-local-persisted" => Ok(Self::HnswLocalPersisted),
108            "urn:chroma:segment/vector/spann" => Ok(SegmentType::Spann),
109            "urn:chroma:segment/vector/quantized-spann" => Ok(SegmentType::QuantizedSpann),
110            "urn:chroma:segment/metadata/sqlite" => Ok(SegmentType::Sqlite),
111            _ => Err(SegmentConversionError::InvalidSegmentType),
112        }
113    }
114}
115
116#[derive(Clone, Debug, PartialEq)]
117pub struct Segment {
118    pub id: SegmentUuid,
119    pub r#type: SegmentType,
120    pub scope: SegmentScope,
121    pub collection: CollectionUuid,
122    pub metadata: Option<Metadata>,
123    pub file_path: HashMap<String, Vec<String>>,
124}
125
126impl Segment {
127    pub fn prefetch_supported(&self) -> bool {
128        matches!(
129            self.r#type,
130            SegmentType::BlockfileMetadata
131                | SegmentType::BlockfileRecord
132                | SegmentType::QuantizedSpann
133                | SegmentType::Spann
134        )
135    }
136
137    pub fn filepaths_to_prefetch(&self) -> Vec<String> {
138        let mut res = Vec::new();
139        match self.r#type {
140            SegmentType::QuantizedSpann => {
141                for key in [
142                    QUANTIZED_SPANN_CLUSTER,
143                    QUANTIZED_SPANN_EMBEDDING_METADATA,
144                    QUANTIZED_SPANN_SCALAR_METADATA,
145                ] {
146                    if let Some(paths) = self.file_path.get(key) {
147                        res.extend(paths.iter().cloned());
148                    }
149                }
150            }
151            SegmentType::Spann => {
152                if let Some(pl_path) = self.file_path.get(POSTING_LIST_PATH) {
153                    res.extend(pl_path.iter().cloned());
154                }
155            }
156            SegmentType::BlockfileMetadata | SegmentType::BlockfileRecord => {
157                for (key, paths) in &self.file_path {
158                    if key == USER_ID_BLOOM_FILTER {
159                        continue;
160                    }
161                    res.extend(paths.iter().cloned());
162                }
163            }
164            _ => {}
165        }
166        res
167    }
168
169    pub fn extract_prefix_and_id(path: &str) -> Result<(&str, uuid::Uuid), uuid::Error> {
170        let (prefix, id) = match path.rfind('/') {
171            Some(pos) => (&path[..pos], &path[pos + 1..]),
172            None => ("", path),
173        };
174        match Uuid::try_parse(id) {
175            Ok(uid) => Ok((prefix, uid)),
176            Err(e) => Err(e),
177        }
178    }
179
180    pub fn construct_prefix_path(&self, tenant: &str, database_id: &DatabaseUuid) -> String {
181        format!(
182            "tenant/{}/database/{}/collection/{}/segment/{}",
183            tenant, database_id, self.collection, self.id
184        )
185    }
186}
187
188#[derive(Error, Debug)]
189pub enum SegmentConversionError {
190    #[error("Invalid UUID")]
191    InvalidUuid,
192    #[error(transparent)]
193    MetadataValueConversionError(#[from] MetadataValueConversionError),
194    #[error(transparent)]
195    SegmentScopeConversionError(#[from] SegmentScopeConversionError),
196    #[error("Invalid segment type")]
197    InvalidSegmentType,
198}
199
200impl ChromaError for SegmentConversionError {
201    fn code(&self) -> ErrorCodes {
202        match self {
203            SegmentConversionError::InvalidUuid => ErrorCodes::InvalidArgument,
204            SegmentConversionError::InvalidSegmentType => ErrorCodes::InvalidArgument,
205            SegmentConversionError::SegmentScopeConversionError(e) => e.code(),
206            SegmentConversionError::MetadataValueConversionError(e) => e.code(),
207        }
208    }
209}
210
211impl From<SegmentConversionError> for Status {
212    fn from(value: SegmentConversionError) -> Self {
213        Status::invalid_argument(value.to_string())
214    }
215}
216
217impl TryFrom<chroma_proto::Segment> for Segment {
218    type Error = SegmentConversionError;
219
220    fn try_from(proto_segment: chroma_proto::Segment) -> Result<Self, Self::Error> {
221        let mut proto_segment = proto_segment;
222
223        let segment_uuid = match SegmentUuid::from_str(&proto_segment.id) {
224            Ok(uuid) => uuid,
225            Err(_) => return Err(SegmentConversionError::InvalidUuid),
226        };
227        let collection_uuid = match Uuid::try_parse(&proto_segment.collection) {
228            Ok(uuid) => uuid,
229            Err(_) => return Err(SegmentConversionError::InvalidUuid),
230        };
231        let collection_uuid = CollectionUuid(collection_uuid);
232        let segment_metadata: Option<Metadata> = match proto_segment.metadata {
233            Some(proto_metadata) => match proto_metadata.try_into() {
234                Ok(metadata) => Some(metadata),
235                Err(e) => return Err(SegmentConversionError::MetadataValueConversionError(e)),
236            },
237            None => None,
238        };
239        let scope: SegmentScope = match proto_segment.scope.try_into() {
240            Ok(scope) => scope,
241            Err(e) => return Err(SegmentConversionError::SegmentScopeConversionError(e)),
242        };
243
244        let segment_type: SegmentType = proto_segment.r#type.as_str().try_into()?;
245
246        let mut file_paths = HashMap::new();
247        let drain = proto_segment.file_paths.drain();
248        for (key, value) in drain {
249            file_paths.insert(key, value.paths);
250        }
251
252        Ok(Segment {
253            id: segment_uuid,
254            r#type: segment_type,
255            scope,
256            collection: collection_uuid,
257            metadata: segment_metadata,
258            file_path: file_paths,
259        })
260    }
261}
262
263impl From<Segment> for chroma_proto::Segment {
264    fn from(value: Segment) -> Self {
265        Self {
266            id: value.id.0.to_string(),
267            r#type: value.r#type.into(),
268            scope: chroma_proto::SegmentScope::from(value.scope) as i32,
269            collection: value.collection.0.to_string(),
270            metadata: value.metadata.map(Into::into),
271            file_paths: value
272                .file_path
273                .into_iter()
274                .map(|(name, paths)| (name, chroma_proto::FilePaths { paths }))
275                .collect(),
276        }
277    }
278}
279
280pub fn test_segment(collection_uuid: CollectionUuid, scope: SegmentScope) -> Segment {
281    let r#type = match scope {
282        SegmentScope::METADATA => SegmentType::BlockfileMetadata,
283        SegmentScope::RECORD => SegmentType::BlockfileRecord,
284        SegmentScope::VECTOR => SegmentType::HnswDistributed,
285        SegmentScope::SQLITE => unimplemented!("Sqlite segment is not implemented"),
286    };
287    Segment {
288        id: SegmentUuid::new(),
289        r#type,
290        scope,
291        collection: collection_uuid,
292        metadata: None,
293        file_path: HashMap::new(),
294    }
295}
296
#[cfg(test)]
mod tests {
    //! Unit tests for the segment types and their conversions.

    use super::*;
    use crate::MetadataValue;

    /// A well-formed protobuf segment converts field by field.
    #[test]
    fn test_segment_try_from() {
        let mut metadata = chroma_proto::UpdateMetadata {
            metadata: HashMap::new(),
        };
        metadata.metadata.insert(
            "foo".to_string(),
            chroma_proto::UpdateMetadataValue {
                value: Some(chroma_proto::update_metadata_value::Value::IntValue(42)),
            },
        );
        let proto_segment = chroma_proto::Segment {
            id: "00000000-0000-0000-0000-000000000000".to_string(),
            r#type: "urn:chroma:segment/vector/hnsw-distributed".to_string(),
            scope: chroma_proto::SegmentScope::Vector as i32,
            collection: "00000000-0000-0000-0000-000000000000".to_string(),
            metadata: Some(metadata),
            file_paths: HashMap::new(),
        };
        let converted_segment: Segment = proto_segment.try_into().unwrap();
        assert_eq!(converted_segment.id, SegmentUuid(Uuid::nil()));
        assert_eq!(converted_segment.r#type, SegmentType::HnswDistributed);
        assert_eq!(converted_segment.scope, SegmentScope::VECTOR);
        assert_eq!(converted_segment.collection, CollectionUuid(Uuid::nil()));
        let metadata = converted_segment.metadata.unwrap();
        assert_eq!(metadata.len(), 1);
        assert_eq!(metadata.get("foo").unwrap(), &MetadataValue::Int(42));
    }

    /// Every segment type survives a round trip through its URN string.
    #[test]
    fn test_segment_type_round_trip() {
        for segment_type in [
            SegmentType::BlockfileMetadata,
            SegmentType::BlockfileRecord,
            SegmentType::HnswDistributed,
            SegmentType::HnswLocalMemory,
            SegmentType::HnswLocalPersisted,
            SegmentType::Sqlite,
            SegmentType::Spann,
            SegmentType::QuantizedSpann,
        ] {
            let urn = String::from(segment_type);
            let parsed = SegmentType::try_from(urn.as_str())
                .expect("a URN produced by From<SegmentType> must parse back");
            assert_eq!(parsed, segment_type);
        }
    }

    /// An unrecognised URN is rejected with `InvalidSegmentType`.
    #[test]
    fn test_segment_type_try_from_unknown() {
        assert!(matches!(
            SegmentType::try_from("urn:chroma:segment/unknown"),
            Err(SegmentConversionError::InvalidSegmentType)
        ));
    }

    /// Text that is not a UUID is rejected with `InvalidUuid`.
    #[test]
    fn test_segment_uuid_from_str_invalid() {
        assert!(matches!(
            SegmentUuid::from_str("not-a-uuid"),
            Err(SegmentConversionError::InvalidUuid)
        ));
    }

    /// Blockfile segments prefetch everything except the bloom filter.
    #[test]
    fn test_filepaths_to_prefetch_skips_bloom_filter() {
        let mut file_path = HashMap::new();
        file_path.insert(USER_ID_BLOOM_FILTER.to_string(), vec!["bloom".to_string()]);
        file_path.insert(OFFSET_ID_TO_DATA.to_string(), vec!["data".to_string()]);
        let segment = Segment {
            id: SegmentUuid(Uuid::nil()),
            r#type: SegmentType::BlockfileRecord,
            scope: SegmentScope::RECORD,
            collection: CollectionUuid(Uuid::nil()),
            metadata: None,
            file_path,
        };
        assert!(segment.prefetch_supported());
        assert_eq!(segment.filepaths_to_prefetch(), vec!["data".to_string()]);
    }

    #[test]
    fn test_segment_construct_prefix_path() {
        let segment = Segment {
            id: SegmentUuid(Uuid::nil()),
            r#type: SegmentType::BlockfileMetadata,
            scope: SegmentScope::METADATA,
            collection: CollectionUuid(Uuid::nil()),
            metadata: None,
            file_path: HashMap::new(),
        };
        let tenant = "test_tenant";
        let database_id = &DatabaseUuid(Uuid::nil());
        let prefix_path = segment.construct_prefix_path(tenant, database_id);
        assert_eq!(
            prefix_path,
            "tenant/test_tenant/database/00000000-0000-0000-0000-000000000000/collection/00000000-0000-0000-0000-000000000000/segment/00000000-0000-0000-0000-000000000000"
        );
    }

    #[test]
    fn test_segment_extract_prefix_and_id() {
        let path = "tenant/test_tenant/database/00000000-0000-0000-0000-000000000000/collection/00000000-0000-0000-0000-000000000000/segment/00000000-0000-0000-0000-000000000000/00000000-0000-0000-0000-000000000001";
        let (prefix, id) =
            Segment::extract_prefix_and_id(path).expect("Failed to extract prefix and id");
        assert_eq!(
            prefix,
            "tenant/test_tenant/database/00000000-0000-0000-0000-000000000000/collection/00000000-0000-0000-0000-000000000000/segment/00000000-0000-0000-0000-000000000000"
        );
        assert_eq!(
            id,
            Uuid::from_str("00000000-0000-0000-0000-000000000001").expect("Cannot happen")
        );
    }

    /// A bare id without any `/` yields an empty prefix.
    #[test]
    fn test_segment_extract_prefix_and_id_legacy() {
        let path = "00000000-0000-0000-0000-000000000001";
        let (prefix, id) =
            Segment::extract_prefix_and_id(path).expect("Failed to extract prefix and id");
        assert_eq!(prefix, "");
        assert_eq!(
            id,
            Uuid::from_str("00000000-0000-0000-0000-000000000001").expect("Cannot happen")
        );
    }

    /// A trailing component that is not a UUID is an error, with or without a prefix.
    #[test]
    fn test_segment_extract_prefix_and_id_invalid() {
        assert!(Segment::extract_prefix_and_id("some/prefix/not-a-uuid").is_err());
        assert!(Segment::extract_prefix_and_id("not-a-uuid").is_err());
        assert!(Segment::extract_prefix_and_id("some/prefix/").is_err());
    }
}