Skip to main content

cognee_vector/
models.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use uuid::Uuid;
4
5/// Vector point to be indexed
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct VectorPoint {
8    /// Data point ID
9    pub id: Uuid,
10
11    /// Embedding vector
12    pub vector: Vec<f32>,
13
14    /// Metadata (type, field, original data)
15    pub metadata: HashMap<String, serde_json::Value>,
16}
17
18/// Result from similarity search
19#[derive(Debug, Clone)]
20pub struct SearchResult {
21    /// Data point ID
22    pub id: Uuid,
23
24    /// Similarity score (higher = more similar)
25    pub score: f32,
26
27    /// Metadata from the indexed point
28    pub metadata: HashMap<String, serde_json::Value>,
29}
30
31/// Configuration for vector collection
32#[derive(Debug, Clone)]
33pub struct CollectionConfig {
34    /// Collection name (e.g., "DocumentChunk_text")
35    pub name: String,
36
37    /// Vector dimension
38    pub dimension: usize,
39
40    /// Distance metric (Cosine, Euclidean, Dot)
41    pub distance: DistanceMetric,
42}
43
/// Distance metric used for vector similarity comparisons.
///
/// The enum is a plain, fieldless value type, so besides `Debug`/`Clone`/`Copy`
/// it also derives `PartialEq`, `Eq` and `Hash`. That lets callers compare
/// metrics with `==` (instead of `matches!`) and use them as keys in hashed
/// collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistanceMetric {
    /// Cosine similarity (angle-based, ignores magnitude).
    Cosine,
    /// Euclidean (L2) distance.
    Euclidean,
    /// Dot-product similarity.
    Dot,
}
54
55impl VectorPoint {
56    /// Create a new vector point
57    pub fn new(id: Uuid, vector: Vec<f32>) -> Self {
58        Self {
59            id,
60            vector,
61            metadata: HashMap::new(),
62        }
63    }
64
65    /// Add metadata field
66    pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
67        self.metadata.insert(key.into(), value);
68        self
69    }
70
71    /// Accumulate the dataset membership recorded on a `previous` point (an
72    /// existing point with the same id) into `self`'s [`DATASET_IDS_KEY`] array.
73    ///
74    /// Point IDs are content-addressed (UUID v5 of the content), so the *same*
75    /// point is indexed once per dataset that contains that content. Vector
76    /// adapters upsert by id with full replacement, so a plain replace keeps
77    /// only the last dataset's scalar `dataset_id` and silently drops the
78    /// earlier datasets' membership — making the content unretrievable when a
79    /// search is scoped to one of those earlier datasets (the cross-dataset
80    /// dedup bug). Calling this in `index_points` upsert paths before replacing
81    /// an existing point keeps `dataset_ids` as the union of every dataset the
82    /// content belongs to, mirroring Python's `belongs_to_set` union semantics.
83    pub fn merge_dataset_membership(&mut self, previous: &VectorPoint) {
84        let mut ids: Vec<String> = Vec::new();
85        // `previous` first so membership order is stable (oldest dataset first).
86        collect_dataset_ids(previous, &mut ids);
87        collect_dataset_ids(self, &mut ids);
88        if !ids.is_empty() {
89            self.metadata.insert(
90                DATASET_IDS_KEY.to_string(),
91                serde_json::Value::Array(ids.into_iter().map(serde_json::Value::String).collect()),
92            );
93        }
94    }
95}
96
/// Name of the metadata entry that stores, as an array of strings, the IDs of
/// all datasets a point belongs to. The array is the union built up over every
/// dataset under which the content-addressed point has been indexed (see
/// [`VectorPoint::merge_dataset_membership`]).
pub const DATASET_IDS_KEY: &str = "dataset_ids";

/// Name of the scalar metadata entry the cognify indexer writes for the one
/// dataset being indexed at the moment. Kept for backward compatibility only;
/// the union stored under [`DATASET_IDS_KEY`] is the authoritative membership.
pub const DATASET_ID_KEY: &str = "dataset_id";
106
107/// Append every dataset-ID string recorded on `point` (from both the
108/// [`DATASET_IDS_KEY`] array and the scalar [`DATASET_ID_KEY`]) into `out`,
109/// skipping empties and duplicates.
110fn collect_dataset_ids(point: &VectorPoint, out: &mut Vec<String>) {
111    if let Some(arr) = point
112        .metadata
113        .get(DATASET_IDS_KEY)
114        .and_then(|v| v.as_array())
115    {
116        for v in arr {
117            if let Some(s) = v.as_str()
118                && !s.is_empty()
119                && !out.iter().any(|x| x == s)
120            {
121                out.push(s.to_string());
122            }
123        }
124    }
125    if let Some(s) = point.metadata.get(DATASET_ID_KEY).and_then(|v| v.as_str())
126        && !s.is_empty()
127        && !out.iter().any(|x| x == s)
128    {
129        out.push(s.to_string());
130    }
131}