1use anyhow::Result;
4use std::collections::HashMap;
5
6use crate::embeddings;
7use crate::vector_index::{MemoryVectorIndex, VectorIndex};
8use crate::{BatchSearchResult, Vector, VectorId, VectorStoreTrait};
9
10#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
12pub struct VectorStoreConfig {
13 pub auto_embed: bool,
14 pub cache_embeddings: bool,
15 pub similarity_threshold: f32,
16 pub max_results: usize,
17}
18
19impl Default for VectorStoreConfig {
20 fn default() -> Self {
21 Self {
22 auto_embed: true,
23 cache_embeddings: true,
24 similarity_threshold: 0.7,
25 max_results: 100,
26 }
27 }
28}
29
30pub struct VectorStore {
32 index: Box<dyn VectorIndex>,
33 embedding_manager: Option<embeddings::EmbeddingManager>,
34 config: VectorStoreConfig,
35}
36
37impl VectorStore {
38 pub fn new() -> Self {
40 Self {
41 index: Box::new(MemoryVectorIndex::new()),
42 embedding_manager: None,
43 config: VectorStoreConfig::default(),
44 }
45 }
46
47 pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
49 let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
50
51 Ok(Self {
52 index: Box::new(MemoryVectorIndex::new()),
53 embedding_manager: Some(embedding_manager),
54 config: VectorStoreConfig::default(),
55 })
56 }
57
58 pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
60 Self {
61 index,
62 embedding_manager: None,
63 config: VectorStoreConfig::default(),
64 }
65 }
66
67 pub fn with_index_and_embeddings(
69 index: Box<dyn VectorIndex>,
70 strategy: embeddings::EmbeddingStrategy,
71 ) -> Result<Self> {
72 let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
73
74 Ok(Self {
75 index,
76 embedding_manager: Some(embedding_manager),
77 config: VectorStoreConfig::default(),
78 })
79 }
80
81 pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
83 self.config = config;
84 self
85 }
86
87 pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
89 if let Some(ref mut embedding_manager) = self.embedding_manager {
90 let embeddable_content = embeddings::EmbeddableContent::Text(content.to_string());
91 let vector = embedding_manager.get_embedding(&embeddable_content)?;
92 self.index.insert(uri, vector)
93 } else {
94 let vector = self.generate_fallback_vector(content);
96 self.index.insert(uri, vector)
97 }
98 }
99
100 pub fn index_rdf_resource(
102 &mut self,
103 uri: String,
104 label: Option<String>,
105 description: Option<String>,
106 properties: std::collections::HashMap<String, Vec<String>>,
107 ) -> Result<()> {
108 if let Some(ref mut embedding_manager) = self.embedding_manager {
109 let embeddable_content = embeddings::EmbeddableContent::RdfResource {
110 uri: uri.clone(),
111 label,
112 description,
113 properties,
114 };
115 let vector = embedding_manager.get_embedding(&embeddable_content)?;
116 self.index.insert(uri, vector)
117 } else {
118 Err(anyhow::anyhow!(
119 "Embedding manager required for RDF resource indexing"
120 ))
121 }
122 }
123
124 pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
126 self.index.insert(uri, vector)
127 }
128
129 pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
131 let query_vector = if let Some(ref _embedding_manager) = self.embedding_manager {
132 let _embeddable_content = embeddings::EmbeddableContent::Text(query.to_string());
133 self.generate_fallback_vector(query)
136 } else {
137 self.generate_fallback_vector(query)
138 };
139
140 self.index.search_knn(&query_vector, limit)
141 }
142
143 pub fn similarity_search_vector(
145 &self,
146 query: &Vector,
147 limit: usize,
148 ) -> Result<Vec<(String, f32)>> {
149 self.index.search_knn(query, limit)
150 }
151
152 pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
154 let query_vector = self.generate_fallback_vector(query);
155 self.index.search_threshold(&query_vector, threshold)
156 }
157
158 pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
160 let query_vector = match options.query {
161 SearchQuery::Text(text) => self.generate_fallback_vector(&text),
162 SearchQuery::Vector(vector) => vector,
163 };
164
165 let results = match options.search_type {
166 SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
167 SearchType::Threshold(threshold) => {
168 self.index.search_threshold(&query_vector, threshold)?
169 }
170 };
171
172 Ok(results)
173 }
174
175 fn generate_fallback_vector(&self, text: &str) -> Vector {
176 use std::collections::hash_map::DefaultHasher;
178 use std::hash::{Hash, Hasher};
179
180 let mut hasher = DefaultHasher::new();
181 text.hash(&mut hasher);
182 let hash = hasher.finish();
183
184 let mut values = Vec::with_capacity(384); let mut seed = hash;
186
187 for _ in 0..384 {
188 seed = seed.wrapping_mul(1103515245).wrapping_add(12345);
189 let normalized = (seed as f32) / (u64::MAX as f32);
190 values.push((normalized - 0.5) * 2.0); }
192
193 Vector::new(values)
194 }
195
196 pub fn embedding_stats(&self) -> Option<(usize, usize)> {
198 self.embedding_manager.as_ref().map(|em| em.cache_stats())
199 }
200
201 pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
203 if let Some(ref mut embedding_manager) = self.embedding_manager {
204 embedding_manager.build_vocabulary(documents)
205 } else {
206 Ok(()) }
208 }
209
210 pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
212 if uri1 == uri2 {
214 return Ok(1.0);
215 }
216
217 let vector1 = self
219 .index
220 .get_vector(uri1)
221 .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri1))?;
222
223 let vector2 = self
224 .index
225 .get_vector(uri2)
226 .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri2))?;
227
228 vector1.cosine_similarity(vector2)
230 }
231
232 pub fn get_vector(&self, id: &str) -> Option<&Vector> {
234 self.index.get_vector(id)
235 }
236
237 pub fn iter_vectors(&self) -> Vec<(String, Vector)> {
243 self.index.iter_vectors()
244 }
245
246 pub fn index_vector_with_metadata(
248 &mut self,
249 uri: String,
250 vector: Vector,
251 _metadata: HashMap<String, String>,
252 ) -> Result<()> {
253 self.index_vector(uri, vector)
256 }
257
258 pub fn index_resource_with_metadata(
260 &mut self,
261 uri: String,
262 content: &str,
263 _metadata: HashMap<String, String>,
264 ) -> Result<()> {
265 self.index_resource(uri, content)
268 }
269
270 pub fn similarity_search_with_params(
272 &self,
273 query: &str,
274 limit: usize,
275 _params: HashMap<String, String>,
276 ) -> Result<Vec<(String, f32)>> {
277 self.similarity_search(query, limit)
280 }
281
282 pub fn vector_search_with_params(
284 &self,
285 query: &Vector,
286 limit: usize,
287 _params: HashMap<String, String>,
288 ) -> Result<Vec<(String, f32)>> {
289 self.similarity_search_vector(query, limit)
292 }
293
294 pub fn get_vector_ids(&self) -> Result<Vec<String>> {
296 Ok(Vec::new())
299 }
300
301 pub fn remove_vector(&mut self, uri: &str) -> Result<()> {
303 self.index.remove_vector(uri.to_string())
305 }
306
307 pub fn get_statistics(&self) -> Result<HashMap<String, String>> {
309 let mut stats = HashMap::new();
312 stats.insert("type".to_string(), "VectorStore".to_string());
313
314 if let Some((cache_size, cache_capacity)) = self.embedding_stats() {
315 stats.insert("embedding_cache_size".to_string(), cache_size.to_string());
316 stats.insert(
317 "embedding_cache_capacity".to_string(),
318 cache_capacity.to_string(),
319 );
320 }
321
322 Ok(stats)
323 }
324
325 pub fn save_to_disk(&self, path: &str) -> Result<()> {
336 use anyhow::Context as _;
337
338 std::fs::create_dir_all(path)
339 .with_context(|| format!("Failed to create directory: {}", path))?;
340
341 let vectors = self.index.iter_vectors();
343 let metadata = serde_json::json!({
344 "config": self.config,
345 "vector_count": vectors.len(),
346 "index_type": "memory",
347 });
348 let metadata_path = std::path::Path::new(path).join("metadata.json");
349 let metadata_str = serde_json::to_string_pretty(&metadata)
350 .with_context(|| "Failed to serialize VectorStore metadata")?;
351 std::fs::write(&metadata_path, metadata_str)
352 .with_context(|| format!("Failed to write {}", metadata_path.display()))?;
353
354 let vectors_path = std::path::Path::new(path).join("vectors.json");
356 let vectors_str = serde_json::to_string_pretty(&vectors)
357 .with_context(|| "Failed to serialize VectorStore vectors")?;
358 std::fs::write(&vectors_path, vectors_str)
359 .with_context(|| format!("Failed to write {}", vectors_path.display()))?;
360
361 Ok(())
362 }
363
364 pub fn load_from_disk(path: &str) -> Result<Self> {
371 use anyhow::Context as _;
372
373 let metadata_path = std::path::Path::new(path).join("metadata.json");
375 let metadata_str = std::fs::read_to_string(&metadata_path)
376 .with_context(|| format!("Failed to read {}", metadata_path.display()))?;
377 let metadata: serde_json::Value = serde_json::from_str(&metadata_str)
378 .with_context(|| "Failed to parse VectorStore metadata")?;
379
380 let config: VectorStoreConfig = serde_json::from_value(metadata["config"].clone())
381 .with_context(|| "Failed to deserialize VectorStoreConfig from metadata")?;
382
383 let vectors_path = std::path::Path::new(path).join("vectors.json");
385 let vectors_str = std::fs::read_to_string(&vectors_path)
386 .with_context(|| format!("Failed to read {}", vectors_path.display()))?;
387 let entries: Vec<(String, Vector)> = serde_json::from_str(&vectors_str)
388 .with_context(|| "Failed to deserialize VectorStore vectors")?;
389
390 let mut store = Self {
392 index: Box::new(MemoryVectorIndex::new()),
393 embedding_manager: None,
394 config,
395 };
396
397 for (id, vector) in entries {
398 store
399 .index
400 .insert(id.clone(), vector)
401 .with_context(|| format!("Failed to re-insert vector '{}'", id))?;
402 }
403
404 Ok(store)
405 }
406
407 pub fn optimize_index(&mut self) -> Result<()> {
409 Ok(())
412 }
413}
414
415impl Default for VectorStore {
416 fn default() -> Self {
417 Self::new()
418 }
419}
420
421impl VectorStoreTrait for VectorStore {
422 fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
423 self.index.insert(id, vector)
424 }
425
426 fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
427 let id = format!("vec_{}", uuid::Uuid::new_v4());
429 self.index.insert(id.clone(), vector)?;
430 Ok(id)
431 }
432
433 fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
434 Ok(self.index.get_vector(id).cloned())
435 }
436
437 fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
438 Ok(Vec::new())
441 }
442
443 fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
444 self.index.search_knn(query, k)
445 }
446
447 fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
448 let _ = id;
451 Ok(false)
452 }
453
454 fn len(&self) -> usize {
455 0
458 }
459}
460
461#[derive(Debug, Clone)]
463pub enum SearchQuery {
464 Text(String),
465 Vector(Vector),
466}
467
/// The search mode of a [`SearchOptions`] request.
#[derive(Debug, Clone)]
pub enum SearchType {
    /// k-nearest-neighbour search; the payload is passed to the index's
    /// `search_knn` as the result count.
    KNN(usize),
    /// Threshold search; the payload is passed to the index's
    /// `search_threshold`.
    Threshold(f32),
}
474
475#[derive(Debug, Clone)]
477pub struct SearchOptions {
478 pub query: SearchQuery,
479 pub search_type: SearchType,
480}
481
482#[derive(Debug, Clone)]
484pub struct VectorOperationResult {
485 pub uri: String,
486 pub similarity: f32,
487 pub vector: Option<Vector>,
488 pub metadata: Option<std::collections::HashMap<String, String>>,
489 pub rank: usize,
490}
491
492pub struct DocumentBatchProcessor;
494
495impl DocumentBatchProcessor {
496 pub fn batch_index(
498 store: &mut VectorStore,
499 documents: &[(String, String)], ) -> Result<Vec<Result<()>>> {
501 let mut results = Vec::new();
502
503 for (uri, content) in documents {
504 let result = store.index_resource(uri.clone(), content);
505 results.push(result);
506 }
507
508 Ok(results)
509 }
510
511 pub fn batch_search(
513 store: &VectorStore,
514 queries: &[String],
515 limit: usize,
516 ) -> Result<BatchSearchResult> {
517 let mut results = Vec::new();
518
519 for query in queries {
520 let result = store.similarity_search(query, limit);
521 results.push(result);
522 }
523
524 Ok(results)
525 }
526}