cognee_vector/models.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use uuid::Uuid;
4
/// Vector point to be indexed: an identifier, its embedding, and free-form
/// metadata.
///
/// Derives `Serialize`/`Deserialize`, so a point can be serialized and
/// deserialized with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorPoint {
    /// Data point ID.
    pub id: Uuid,

    /// Embedding vector.
    pub vector: Vec<f32>,

    /// Metadata (type, field, original data), stored as arbitrary JSON values
    /// keyed by name. Dataset membership is also read from this map, under
    /// the `dataset_id` and `dataset_ids` keys.
    pub metadata: HashMap<String, serde_json::Value>,
}
17
/// Result from similarity search: one matched point and how well it matched.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Data point ID of the matched point.
    pub id: Uuid,

    /// Similarity score (higher = more similar).
    // NOTE(review): the scale/range of the score presumably depends on the
    // distance metric of the collection — confirm against the adapters.
    pub score: f32,

    /// Metadata from the indexed point, as arbitrary JSON values keyed by name.
    pub metadata: HashMap<String, serde_json::Value>,
}
30
/// Configuration for vector collection: its name, vector size, and the metric
/// used to compare vectors.
#[derive(Debug, Clone)]
pub struct CollectionConfig {
    /// Collection name (e.g., "DocumentChunk_text").
    pub name: String,

    /// Vector dimension.
    // NOTE(review): presumably the length every vector stored in the
    // collection must have — confirm against the adapters that consume this.
    pub dimension: usize,

    /// Distance metric (Cosine, Euclidean, Dot).
    pub distance: DistanceMetric,
}
43
/// Distance metric used for vector similarity comparisons.
///
/// This is a plain field-less tag, so besides being `Copy` it is comparable
/// with `==` and usable as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistanceMetric {
    /// Cosine similarity (angle-based, ignores magnitude).
    Cosine,
    /// Euclidean (L2) distance.
    Euclidean,
    /// Dot-product similarity.
    Dot,
}
54
55impl VectorPoint {
56 /// Create a new vector point
57 pub fn new(id: Uuid, vector: Vec<f32>) -> Self {
58 Self {
59 id,
60 vector,
61 metadata: HashMap::new(),
62 }
63 }
64
65 /// Add metadata field
66 pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
67 self.metadata.insert(key.into(), value);
68 self
69 }
70
71 /// Accumulate the dataset membership recorded on a `previous` point (an
72 /// existing point with the same id) into `self`'s [`DATASET_IDS_KEY`] array.
73 ///
74 /// Point IDs are content-addressed (UUID v5 of the content), so the *same*
75 /// point is indexed once per dataset that contains that content. Vector
76 /// adapters upsert by id with full replacement, so a plain replace keeps
77 /// only the last dataset's scalar `dataset_id` and silently drops the
78 /// earlier datasets' membership — making the content unretrievable when a
79 /// search is scoped to one of those earlier datasets (the cross-dataset
80 /// dedup bug). Calling this in `index_points` upsert paths before replacing
81 /// an existing point keeps `dataset_ids` as the union of every dataset the
82 /// content belongs to, mirroring Python's `belongs_to_set` union semantics.
83 pub fn merge_dataset_membership(&mut self, previous: &VectorPoint) {
84 let mut ids: Vec<String> = Vec::new();
85 // `previous` first so membership order is stable (oldest dataset first).
86 collect_dataset_ids(previous, &mut ids);
87 collect_dataset_ids(self, &mut ids);
88 if !ids.is_empty() {
89 self.metadata.insert(
90 DATASET_IDS_KEY.to_string(),
91 serde_json::Value::Array(ids.into_iter().map(serde_json::Value::String).collect()),
92 );
93 }
94 }
95}
96
/// Metadata key holding the array of dataset-ID strings a point belongs to.
///
/// The array is the union accumulated across every dataset the
/// content-addressed point has been indexed under; it is written by
/// [`VectorPoint::merge_dataset_membership`].
pub const DATASET_IDS_KEY: &str = "dataset_ids";
101
/// Scalar metadata key written by the cognify indexer for the single dataset
/// currently being indexed.
///
/// Retained for back-compat; the authoritative membership is the union in
/// [`DATASET_IDS_KEY`]. The scalar value is still read when dataset IDs are
/// merged, so its dataset is included in that union.
pub const DATASET_ID_KEY: &str = "dataset_id";
106
107/// Append every dataset-ID string recorded on `point` (from both the
108/// [`DATASET_IDS_KEY`] array and the scalar [`DATASET_ID_KEY`]) into `out`,
109/// skipping empties and duplicates.
110fn collect_dataset_ids(point: &VectorPoint, out: &mut Vec<String>) {
111 if let Some(arr) = point
112 .metadata
113 .get(DATASET_IDS_KEY)
114 .and_then(|v| v.as_array())
115 {
116 for v in arr {
117 if let Some(s) = v.as_str()
118 && !s.is_empty()
119 && !out.iter().any(|x| x == s)
120 {
121 out.push(s.to_string());
122 }
123 }
124 }
125 if let Some(s) = point.metadata.get(DATASET_ID_KEY).and_then(|v| v.as_str())
126 && !s.is_empty()
127 && !out.iter().any(|x| x == s)
128 {
129 out.push(s.to_string());
130 }
131}