Skip to main content

chroma_types/
segment.rs

1use super::{
2    CollectionUuid, Metadata, MetadataValueConversionError, SegmentScope,
3    SegmentScopeConversionError,
4};
5use crate::{chroma_proto, DatabaseUuid};
6use chroma_error::{ChromaError, ErrorCodes};
7use std::{collections::HashMap, str::FromStr};
8use thiserror::Error;
9use tonic::Status;
10use uuid::Uuid;
11
// String identifiers for the pieces of data a segment stores.
// NOTE(review): presumably these are the keys of `Segment::file_path`; within this
// file only `POSTING_LIST_PATH` is looked up that way — confirm against callers.

// Record data: id mappings, record payloads and the offset-id high-water mark.
pub const USER_ID_TO_OFFSET_ID: &str = "user_id_to_offset_id";
pub const OFFSET_ID_TO_USER_ID: &str = "offset_id_to_user_id";
pub const OFFSET_ID_TO_DATA: &str = "offset_id_to_data";
pub const MAX_OFFSET_ID: &str = "max_offset_id";

// Metadata data: full-text postings plus one entry per metadata value type.
pub const FULL_TEXT_PLS: &str = "full_text_pls";
pub const STRING_METADATA: &str = "string_metadata";
pub const BOOL_METADATA: &str = "bool_metadata";
pub const F32_METADATA: &str = "f32_metadata";
pub const U32_METADATA: &str = "u32_metadata";

// Sparse data.
pub const SPARSE_MAX: &str = "sparse_max";
pub const SPARSE_OFFSET_VALUE: &str = "sparse_offset_value";

// Vector data.
pub const HNSW_PATH: &str = "hnsw_path";
pub const VERSION_MAP_PATH: &str = "version_map_path";
pub const POSTING_LIST_PATH: &str = "posting_list_path";
// The identifier carries a `_BF` infix that the string value does not; the value
// is runtime data and must stay exactly as written.
pub const MAX_HEAD_ID_BF_PATH: &str = "max_head_id_path";
30
31/// SegmentUuid is a wrapper around Uuid to provide a type for the segment id.
32#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
33pub struct SegmentUuid(pub Uuid);
34
35impl SegmentUuid {
36    pub fn new() -> Self {
37        SegmentUuid(Uuid::new_v4())
38    }
39}
40
41impl FromStr for SegmentUuid {
42    type Err = SegmentConversionError;
43
44    fn from_str(s: &str) -> Result<Self, SegmentConversionError> {
45        match Uuid::parse_str(s) {
46            Ok(uuid) => Ok(SegmentUuid(uuid)),
47            Err(_) => Err(SegmentConversionError::InvalidUuid),
48        }
49    }
50}
51
52impl std::fmt::Display for SegmentUuid {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        write!(f, "{}", self.0)
55    }
56}
57
/// The kinds of segment, each identified on the wire by a URN string.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SegmentType {
    /// `urn:chroma:segment/metadata/blockfile`
    BlockfileMetadata,
    /// `urn:chroma:segment/record/blockfile`
    BlockfileRecord,
    /// `urn:chroma:segment/vector/hnsw-distributed`
    HnswDistributed,
    /// `urn:chroma:segment/vector/hnsw-local-memory`
    HnswLocalMemory,
    /// `urn:chroma:segment/vector/hnsw-local-persisted`
    HnswLocalPersisted,
    /// `urn:chroma:segment/metadata/sqlite`
    Sqlite,
    /// `urn:chroma:segment/vector/spann`
    Spann,
    /// `urn:chroma:segment/vector/quantized-spann`
    QuantizedSpann,
}

impl From<SegmentType> for String {
    /// Renders the segment type as its URN string.
    fn from(segment_type: SegmentType) -> String {
        // Pick the static URN first, then allocate once at the end.
        let urn: &'static str = match segment_type {
            SegmentType::BlockfileMetadata => "urn:chroma:segment/metadata/blockfile",
            SegmentType::BlockfileRecord => "urn:chroma:segment/record/blockfile",
            SegmentType::HnswDistributed => "urn:chroma:segment/vector/hnsw-distributed",
            SegmentType::HnswLocalMemory => "urn:chroma:segment/vector/hnsw-local-memory",
            SegmentType::HnswLocalPersisted => "urn:chroma:segment/vector/hnsw-local-persisted",
            SegmentType::Spann => "urn:chroma:segment/vector/spann",
            SegmentType::QuantizedSpann => "urn:chroma:segment/vector/quantized-spann",
            SegmentType::Sqlite => "urn:chroma:segment/metadata/sqlite",
        };
        urn.to_owned()
    }
}
90
91impl TryFrom<&str> for SegmentType {
92    type Error = SegmentConversionError;
93
94    fn try_from(segment_type: &str) -> Result<Self, Self::Error> {
95        match segment_type {
96            "urn:chroma:segment/metadata/blockfile" => Ok(SegmentType::BlockfileMetadata),
97            "urn:chroma:segment/record/blockfile" => Ok(SegmentType::BlockfileRecord),
98            "urn:chroma:segment/vector/hnsw-distributed" => Ok(SegmentType::HnswDistributed),
99            "urn:chroma:segment/vector/hnsw-local-memory" => Ok(SegmentType::HnswLocalMemory),
100            "urn:chroma:segment/vector/hnsw-local-persisted" => Ok(Self::HnswLocalPersisted),
101            "urn:chroma:segment/vector/spann" => Ok(SegmentType::Spann),
102            "urn:chroma:segment/vector/quantized-spann" => Ok(SegmentType::QuantizedSpann),
103            "urn:chroma:segment/metadata/sqlite" => Ok(SegmentType::Sqlite),
104            _ => Err(SegmentConversionError::InvalidSegmentType),
105        }
106    }
107}
108
109#[derive(Clone, Debug, PartialEq)]
110pub struct Segment {
111    pub id: SegmentUuid,
112    pub r#type: SegmentType,
113    pub scope: SegmentScope,
114    pub collection: CollectionUuid,
115    pub metadata: Option<Metadata>,
116    pub file_path: HashMap<String, Vec<String>>,
117}
118
119impl Segment {
120    // TODO(Sanket): Add QuantizedSpann to the prefetch supported list when
121    // we have intelligent prefetching.
122    pub fn prefetch_supported(&self) -> bool {
123        matches!(
124            self.r#type,
125            SegmentType::BlockfileMetadata | SegmentType::BlockfileRecord | SegmentType::Spann
126        )
127    }
128
129    // TODO(Sanket): Add file paths for QuantizedSpann when we have intelligent prefetching.
130    pub fn filepaths_to_prefetch(&self) -> Vec<String> {
131        let mut res = Vec::new();
132        match self.r#type {
133            SegmentType::Spann => {
134                if let Some(pl_path) = self.file_path.get(POSTING_LIST_PATH) {
135                    res.extend(pl_path.iter().cloned());
136                }
137            }
138            SegmentType::BlockfileMetadata | SegmentType::BlockfileRecord => {
139                for paths in self.file_path.values() {
140                    res.extend(paths.iter().cloned());
141                }
142            }
143            _ => {}
144        }
145        res
146    }
147
148    pub fn extract_prefix_and_id(path: &str) -> Result<(&str, uuid::Uuid), uuid::Error> {
149        let (prefix, id) = match path.rfind('/') {
150            Some(pos) => (&path[..pos], &path[pos + 1..]),
151            None => ("", path),
152        };
153        match Uuid::try_parse(id) {
154            Ok(uid) => Ok((prefix, uid)),
155            Err(e) => Err(e),
156        }
157    }
158
159    pub fn construct_prefix_path(&self, tenant: &str, database_id: &DatabaseUuid) -> String {
160        format!(
161            "tenant/{}/database/{}/collection/{}/segment/{}",
162            tenant, database_id, self.collection, self.id
163        )
164    }
165}
166
167#[derive(Error, Debug)]
168pub enum SegmentConversionError {
169    #[error("Invalid UUID")]
170    InvalidUuid,
171    #[error(transparent)]
172    MetadataValueConversionError(#[from] MetadataValueConversionError),
173    #[error(transparent)]
174    SegmentScopeConversionError(#[from] SegmentScopeConversionError),
175    #[error("Invalid segment type")]
176    InvalidSegmentType,
177}
178
179impl ChromaError for SegmentConversionError {
180    fn code(&self) -> ErrorCodes {
181        match self {
182            SegmentConversionError::InvalidUuid => ErrorCodes::InvalidArgument,
183            SegmentConversionError::InvalidSegmentType => ErrorCodes::InvalidArgument,
184            SegmentConversionError::SegmentScopeConversionError(e) => e.code(),
185            SegmentConversionError::MetadataValueConversionError(e) => e.code(),
186        }
187    }
188}
189
190impl From<SegmentConversionError> for Status {
191    fn from(value: SegmentConversionError) -> Self {
192        Status::invalid_argument(value.to_string())
193    }
194}
195
196impl TryFrom<chroma_proto::Segment> for Segment {
197    type Error = SegmentConversionError;
198
199    fn try_from(proto_segment: chroma_proto::Segment) -> Result<Self, Self::Error> {
200        let mut proto_segment = proto_segment;
201
202        let segment_uuid = match SegmentUuid::from_str(&proto_segment.id) {
203            Ok(uuid) => uuid,
204            Err(_) => return Err(SegmentConversionError::InvalidUuid),
205        };
206        let collection_uuid = match Uuid::try_parse(&proto_segment.collection) {
207            Ok(uuid) => uuid,
208            Err(_) => return Err(SegmentConversionError::InvalidUuid),
209        };
210        let collection_uuid = CollectionUuid(collection_uuid);
211        let segment_metadata: Option<Metadata> = match proto_segment.metadata {
212            Some(proto_metadata) => match proto_metadata.try_into() {
213                Ok(metadata) => Some(metadata),
214                Err(e) => return Err(SegmentConversionError::MetadataValueConversionError(e)),
215            },
216            None => None,
217        };
218        let scope: SegmentScope = match proto_segment.scope.try_into() {
219            Ok(scope) => scope,
220            Err(e) => return Err(SegmentConversionError::SegmentScopeConversionError(e)),
221        };
222
223        let segment_type: SegmentType = proto_segment.r#type.as_str().try_into()?;
224
225        let mut file_paths = HashMap::new();
226        let drain = proto_segment.file_paths.drain();
227        for (key, value) in drain {
228            file_paths.insert(key, value.paths);
229        }
230
231        Ok(Segment {
232            id: segment_uuid,
233            r#type: segment_type,
234            scope,
235            collection: collection_uuid,
236            metadata: segment_metadata,
237            file_path: file_paths,
238        })
239    }
240}
241
242impl From<Segment> for chroma_proto::Segment {
243    fn from(value: Segment) -> Self {
244        Self {
245            id: value.id.0.to_string(),
246            r#type: value.r#type.into(),
247            scope: chroma_proto::SegmentScope::from(value.scope) as i32,
248            collection: value.collection.0.to_string(),
249            metadata: value.metadata.map(Into::into),
250            file_paths: value
251                .file_path
252                .into_iter()
253                .map(|(name, paths)| (name, chroma_proto::FilePaths { paths }))
254                .collect(),
255        }
256    }
257}
258
259pub fn test_segment(collection_uuid: CollectionUuid, scope: SegmentScope) -> Segment {
260    let r#type = match scope {
261        SegmentScope::METADATA => SegmentType::BlockfileMetadata,
262        SegmentScope::RECORD => SegmentType::BlockfileRecord,
263        SegmentScope::VECTOR => SegmentType::HnswDistributed,
264        SegmentScope::SQLITE => unimplemented!("Sqlite segment is not implemented"),
265    };
266    Segment {
267        id: SegmentUuid::new(),
268        r#type,
269        scope,
270        collection: collection_uuid,
271        metadata: None,
272        file_path: HashMap::new(),
273    }
274}
275
#[cfg(test)]
mod tests {

    use super::*;
    use crate::MetadataValue;

    /// Textual form of the nil UUID used for every id in these tests.
    const NIL_UUID: &str = "00000000-0000-0000-0000-000000000000";
    /// Textual form of the UUID used as the trailing path component.
    const FILE_UUID: &str = "00000000-0000-0000-0000-000000000001";

    /// Prefix expected for tenant `test_tenant` with all-nil database, collection
    /// and segment ids.
    fn nil_segment_prefix() -> String {
        format!(
            "tenant/test_tenant/database/{0}/collection/{0}/segment/{0}",
            NIL_UUID
        )
    }

    // A protobuf segment with one integer metadata entry converts field by field.
    #[test]
    fn test_segment_try_from() {
        let int_value = chroma_proto::UpdateMetadataValue {
            value: Some(chroma_proto::update_metadata_value::Value::IntValue(42)),
        };
        let proto_metadata = chroma_proto::UpdateMetadata {
            metadata: HashMap::from([("foo".to_string(), int_value)]),
        };
        let proto_segment = chroma_proto::Segment {
            id: NIL_UUID.to_string(),
            r#type: "urn:chroma:segment/vector/hnsw-distributed".to_string(),
            scope: chroma_proto::SegmentScope::Vector as i32,
            collection: NIL_UUID.to_string(),
            metadata: Some(proto_metadata),
            file_paths: HashMap::new(),
        };

        let converted = Segment::try_from(proto_segment).unwrap();

        assert_eq!(converted.id, SegmentUuid(Uuid::nil()));
        assert_eq!(converted.r#type, SegmentType::HnswDistributed);
        assert_eq!(converted.scope, SegmentScope::VECTOR);
        assert_eq!(converted.collection, CollectionUuid(Uuid::nil()));
        let converted_metadata = converted.metadata.unwrap();
        assert_eq!(converted_metadata.len(), 1);
        assert_eq!(
            converted_metadata.get("foo").unwrap(),
            &MetadataValue::Int(42)
        );
    }

    // The prefix path interpolates tenant, database, collection and segment ids.
    #[test]
    fn test_segment_construct_prefix_path() {
        let segment = Segment {
            id: SegmentUuid(Uuid::nil()),
            r#type: SegmentType::BlockfileMetadata,
            scope: SegmentScope::METADATA,
            collection: CollectionUuid(Uuid::nil()),
            metadata: None,
            file_path: HashMap::new(),
        };
        let database_id = DatabaseUuid(Uuid::nil());

        let prefix_path = segment.construct_prefix_path("test_tenant", &database_id);

        assert_eq!(prefix_path, nil_segment_prefix());
    }

    // A full path splits into everything before the last `/` and the trailing UUID.
    #[test]
    fn test_segment_extract_prefix_and_id() {
        let expected_prefix = nil_segment_prefix();
        let path = format!("{}/{}", expected_prefix, FILE_UUID);

        let (prefix, id) =
            Segment::extract_prefix_and_id(&path).expect("Failed to extract prefix and id");

        assert_eq!(prefix, expected_prefix.as_str());
        assert_eq!(id, Uuid::from_str(FILE_UUID).expect("Cannot happen"));
    }

    // A bare UUID with no `/` yields an empty prefix.
    #[test]
    fn test_segment_extract_prefix_and_id_legacy() {
        let (prefix, id) =
            Segment::extract_prefix_and_id(FILE_UUID).expect("Failed to extract prefix and id");

        assert_eq!(prefix, "");
        assert_eq!(id, Uuid::from_str(FILE_UUID).expect("Cannot happen"));
    }
}