use anyhow::Result;
use std::collections::HashMap;
use crate::embeddings;
use crate::vector_index::{MemoryVectorIndex, VectorIndex};
use crate::{BatchSearchResult, Vector, VectorId, VectorStoreTrait};
/// Tunable settings carried by a `VectorStore`.
///
/// Derives `Serialize`/`Deserialize`: `VectorStore::save_to_disk` embeds it under the
/// `"config"` key of `metadata.json` and `VectorStore::load_from_disk` reads it back.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct VectorStoreConfig {
/// Defaults to `true`. Presumably enables automatic embedding of indexed content.
/// NOTE(review): no code visible in this file reads this flag — confirm where it is honored.
pub auto_embed: bool,
/// Defaults to `true`. Presumably enables caching of computed embeddings.
/// NOTE(review): not read by the code visible in this file.
pub cache_embeddings: bool,
/// Defaults to `0.7`. Presumably a minimum similarity score for results.
/// NOTE(review): the search methods visible in this file take their own threshold
/// argument and do not read this field.
pub similarity_threshold: f32,
/// Defaults to `100`. Presumably an upper bound on the number of results.
/// NOTE(review): the search methods visible in this file use their `limit`/`k`
/// arguments and do not read this field.
pub max_results: usize,
}
impl Default for VectorStoreConfig {
fn default() -> Self {
Self {
auto_embed: true,
cache_embeddings: true,
similarity_threshold: 0.7,
max_results: 100,
}
}
}
/// Vector store that pairs a pluggable similarity index with an optional embedding manager.
pub struct VectorStore {
/// Backing index; inserts, lookups, removals and searches in this file are delegated to it.
index: Box<dyn VectorIndex>,
/// Converts text / RDF content into vectors when present. When `None`, `index_resource`
/// falls back to a hash-derived vector and `index_rdf_resource` returns an error.
embedding_manager: Option<embeddings::EmbeddingManager>,
/// Settings for this store; replaced via `with_config` and written out by `save_to_disk`.
config: VectorStoreConfig,
}
impl VectorStore {
pub fn new() -> Self {
Self {
index: Box::new(MemoryVectorIndex::new()),
embedding_manager: None,
config: VectorStoreConfig::default(),
}
}
pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
Ok(Self {
index: Box::new(MemoryVectorIndex::new()),
embedding_manager: Some(embedding_manager),
config: VectorStoreConfig::default(),
})
}
pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
Self {
index,
embedding_manager: None,
config: VectorStoreConfig::default(),
}
}
pub fn with_index_and_embeddings(
index: Box<dyn VectorIndex>,
strategy: embeddings::EmbeddingStrategy,
) -> Result<Self> {
let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
Ok(Self {
index,
embedding_manager: Some(embedding_manager),
config: VectorStoreConfig::default(),
})
}
/// Builder-style setter: swaps in `config` and hands the store back.
pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
    // The previous configuration is simply discarded.
    let _previous = std::mem::replace(&mut self.config, config);
    self
}
/// Indexes textual `content` under `uri`.
///
/// With an embedding manager the text is embedded as
/// `EmbeddableContent::Text`; without one, a deterministic hash-derived
/// fallback vector is used instead.
///
/// # Errors
/// Propagates failures from the embedding manager and from the index insert.
pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
    let vector = match self.embedding_manager.as_mut() {
        Some(manager) => {
            let payload = embeddings::EmbeddableContent::Text(content.to_string());
            manager.get_embedding(&payload)?
        }
        None => self.generate_fallback_vector(content),
    };
    self.index.insert(uri, vector)
}
/// Indexes an RDF resource (URI plus optional label, description and
/// properties) under `uri`.
///
/// Unlike `index_resource` there is no fallback: an embedding manager must
/// be configured.
///
/// # Errors
/// Returns an error when the store has no embedding manager, and propagates
/// failures from the embedding manager and from the index insert.
pub fn index_rdf_resource(
    &mut self,
    uri: String,
    label: Option<String>,
    description: Option<String>,
    properties: std::collections::HashMap<String, Vec<String>>,
) -> Result<()> {
    // Guard first: nothing is built or moved until we know a manager exists.
    let manager = self.embedding_manager.as_mut().ok_or_else(|| {
        anyhow::anyhow!("Embedding manager required for RDF resource indexing")
    })?;
    let resource = embeddings::EmbeddableContent::RdfResource {
        uri: uri.clone(),
        label,
        description,
        properties,
    };
    let vector = manager.get_embedding(&resource)?;
    self.index.insert(uri, vector)
}
/// Inserts a precomputed `vector` into the index under `uri`.
///
/// # Errors
/// Propagates the index's insert error.
pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
    let index = &mut self.index;
    index.insert(uri, vector)
}
pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
let query_vector = if let Some(ref _embedding_manager) = self.embedding_manager {
let _embeddable_content = embeddings::EmbeddableContent::Text(query.to_string());
self.generate_fallback_vector(query)
} else {
self.generate_fallback_vector(query)
};
self.index.search_knn(&query_vector, limit)
}
/// Runs a k-nearest-neighbour search for an already-computed `query` vector,
/// forwarding `limit` to the index's `search_knn`.
///
/// # Errors
/// Propagates the index's search error.
pub fn similarity_search_vector(
    &self,
    query: &Vector,
    limit: usize,
) -> Result<Vec<(String, f32)>> {
    let index = &self.index;
    index.search_knn(query, limit)
}
/// Runs the index's threshold search for the text `query`.
///
/// The query is vectorized with the hash-derived fallback vector; the
/// embedding manager is not consulted.
///
/// # Errors
/// Propagates the index's search error.
pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
    self.index
        .search_threshold(&self.generate_fallback_vector(query), threshold)
}
pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
let query_vector = match options.query {
SearchQuery::Text(text) => self.generate_fallback_vector(&text),
SearchQuery::Vector(vector) => vector,
};
let results = match options.search_type {
SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
SearchType::Threshold(threshold) => {
self.index.search_threshold(&query_vector, threshold)?
}
};
Ok(results)
}
/// Derives a deterministic 384-component pseudo-random vector from `text`.
///
/// The text is hashed with `DefaultHasher`; the hash seeds a linear
/// congruential generator whose successive states are scaled into `[-1, 1]`.
///
/// NOTE(review): the standard library does not guarantee `DefaultHasher`'s
/// algorithm across Rust releases, so vectors persisted by one build may not
/// match vectors generated for the same text by another.
fn generate_fallback_vector(&self, text: &str) -> Vector {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    const DIMENSIONS: usize = 384;
    const LCG_MULTIPLIER: u64 = 1103515245;
    const LCG_INCREMENT: u64 = 12345;

    let mut state = {
        let mut hasher = DefaultHasher::new();
        text.hash(&mut hasher);
        hasher.finish()
    };

    let components: Vec<f32> = (0..DIMENSIONS)
        .map(|_| {
            // Advance the generator, then map the full u64 range onto [-1, 1].
            state = state.wrapping_mul(LCG_MULTIPLIER).wrapping_add(LCG_INCREMENT);
            let unit = (state as f32) / (u64::MAX as f32);
            (unit - 0.5) * 2.0
        })
        .collect();
    Vector::new(components)
}
/// Returns the embedding manager's `cache_stats()` pair, or `None` when no
/// embedding manager is configured.
pub fn embedding_stats(&self) -> Option<(usize, usize)> {
    let manager = self.embedding_manager.as_ref()?;
    Some(manager.cache_stats())
}
pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
if let Some(ref mut embedding_manager) = self.embedding_manager {
embedding_manager.build_vocabulary(documents)
} else {
Ok(()) }
}
/// Computes the cosine similarity between the vectors stored under `uri1`
/// and `uri2`.
///
/// Identical URIs short-circuit to `1.0` without touching the index, so the
/// result is `Ok(1.0)` even if that URI was never indexed.
///
/// # Errors
/// Returns an error when either URI has no stored vector, and propagates any
/// error from `Vector::cosine_similarity`.
pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
    if uri1 != uri2 {
        let first = match self.index.get_vector(uri1) {
            Some(vector) => vector,
            None => return Err(anyhow::anyhow!("Vector not found for URI: {}", uri1)),
        };
        let second = match self.index.get_vector(uri2) {
            Some(vector) => vector,
            None => return Err(anyhow::anyhow!("Vector not found for URI: {}", uri2)),
        };
        first.cosine_similarity(second)
    } else {
        Ok(1.0)
    }
}
/// Looks up the vector stored under `id` in the index, if any.
pub fn get_vector(&self, id: &str) -> Option<&Vector> {
    let index = &self.index;
    index.get_vector(id)
}
/// Returns the `(id, vector)` pairs reported by the index's `iter_vectors`.
pub fn iter_vectors(&self) -> Vec<(String, Vector)> {
    let index = &self.index;
    index.iter_vectors()
}
pub fn index_vector_with_metadata(
&mut self,
uri: String,
vector: Vector,
_metadata: HashMap<String, String>,
) -> Result<()> {
self.index_vector(uri, vector)
}
pub fn index_resource_with_metadata(
&mut self,
uri: String,
content: &str,
_metadata: HashMap<String, String>,
) -> Result<()> {
self.index_resource(uri, content)
}
pub fn similarity_search_with_params(
&self,
query: &str,
limit: usize,
_params: HashMap<String, String>,
) -> Result<Vec<(String, f32)>> {
self.similarity_search(query, limit)
}
pub fn vector_search_with_params(
&self,
query: &Vector,
limit: usize,
_params: HashMap<String, String>,
) -> Result<Vec<(String, f32)>> {
self.similarity_search_vector(query, limit)
}
/// Returns the ids of all vectors currently held by the index.
///
/// The ids are taken from the index's `iter_vectors()` in whatever order it
/// reports them. Previously this was a stub that always returned an empty
/// list, even for a populated store.
///
/// NOTE(review): `iter_vectors()` returns owned `(id, vector)` pairs, so this
/// materializes every vector just to read its id. If `VectorIndex` offers a
/// cheaper id listing, prefer it.
pub fn get_vector_ids(&self) -> Result<Vec<String>> {
    let ids: Vec<String> = self
        .index
        .iter_vectors()
        .into_iter()
        .map(|(id, _vector)| id)
        .collect();
    Ok(ids)
}
/// Asks the index to remove the vector stored under `uri`.
///
/// # Errors
/// Propagates the index's removal error.
pub fn remove_vector(&mut self, uri: &str) -> Result<()> {
    let owned_uri = uri.to_owned();
    self.index.remove_vector(owned_uri)
}
/// Collects descriptive statistics about the store as string key/value pairs.
///
/// Always contains `"type" = "VectorStore"`. When an embedding manager is
/// configured, the two values of its `cache_stats()` are added as
/// `"embedding_cache_size"` and `"embedding_cache_capacity"`.
pub fn get_statistics(&self) -> Result<HashMap<String, String>> {
    let mut stats: HashMap<String, String> = HashMap::new();
    stats.insert(String::from("type"), String::from("VectorStore"));

    if let Some(manager) = self.embedding_manager.as_ref() {
        let (cache_size, cache_capacity) = manager.cache_stats();
        stats.insert(String::from("embedding_cache_size"), cache_size.to_string());
        stats.insert(
            String::from("embedding_cache_capacity"),
            cache_capacity.to_string(),
        );
    }

    Ok(stats)
}
/// Persists the store into the directory `path` (created if missing).
///
/// Two pretty-printed JSON files are written, in this order:
/// - `metadata.json`: the configuration, the vector count and an
///   `"index_type"` of `"memory"`;
/// - `vectors.json`: the `(id, vector)` pairs reported by the index.
///
/// The embedding manager is not persisted.
///
/// # Errors
/// Returns a contextualized error if the directory cannot be created, or if
/// serializing or writing either file fails.
pub fn save_to_disk(&self, path: &str) -> Result<()> {
    use anyhow::Context as _;
    use std::path::Path;

    let dir = Path::new(path);
    std::fs::create_dir_all(path)
        .with_context(|| format!("Failed to create directory: {}", path))?;

    let vectors = self.index.iter_vectors();

    // Metadata goes first, so it is already on disk if the vector dump fails.
    let metadata_path = dir.join("metadata.json");
    let metadata_str = serde_json::to_string_pretty(&serde_json::json!({
        "config": self.config,
        "vector_count": vectors.len(),
        "index_type": "memory",
    }))
    .context("Failed to serialize VectorStore metadata")?;
    std::fs::write(&metadata_path, metadata_str)
        .with_context(|| format!("Failed to write {}", metadata_path.display()))?;

    let vectors_path = dir.join("vectors.json");
    let vectors_str = serde_json::to_string_pretty(&vectors)
        .context("Failed to serialize VectorStore vectors")?;
    std::fs::write(&vectors_path, vectors_str)
        .with_context(|| format!("Failed to write {}", vectors_path.display()))?;

    Ok(())
}
/// Rebuilds a store from the `metadata.json` and `vectors.json` files found
/// in the directory `path`.
///
/// The restored store always uses a fresh `MemoryVectorIndex` and has no
/// embedding manager; only the configuration and the vectors are restored.
///
/// # Errors
/// Returns a contextualized error if either file cannot be read or parsed,
/// or if re-inserting any vector into the index fails.
pub fn load_from_disk(path: &str) -> Result<Self> {
    use anyhow::Context as _;
    use std::path::Path;

    let dir = Path::new(path);

    let metadata_path = dir.join("metadata.json");
    let raw_metadata = std::fs::read_to_string(&metadata_path)
        .with_context(|| format!("Failed to read {}", metadata_path.display()))?;
    let metadata: serde_json::Value =
        serde_json::from_str(&raw_metadata).context("Failed to parse VectorStore metadata")?;
    // Indexing a missing key yields `Null`, which then fails deserialization below.
    let config: VectorStoreConfig = serde_json::from_value(metadata["config"].clone())
        .context("Failed to deserialize VectorStoreConfig from metadata")?;

    let vectors_path = dir.join("vectors.json");
    let raw_vectors = std::fs::read_to_string(&vectors_path)
        .with_context(|| format!("Failed to read {}", vectors_path.display()))?;
    let entries: Vec<(String, Vector)> =
        serde_json::from_str(&raw_vectors).context("Failed to deserialize VectorStore vectors")?;

    let mut store = Self {
        config,
        embedding_manager: None,
        index: Box::new(MemoryVectorIndex::new()),
    };
    entries.into_iter().try_for_each(|(id, vector)| {
        store
            .index
            .insert(id.clone(), vector)
            .with_context(|| format!("Failed to re-insert vector '{}'", id))
    })?;
    Ok(store)
}
/// Hook for index optimization.
///
/// Currently a no-op: it performs no work on the index and always returns `Ok(())`.
pub fn optimize_index(&mut self) -> Result<()> {
// Nothing to optimize yet; kept so callers have a stable entry point.
Ok(())
}
}
impl Default for VectorStore {
fn default() -> Self {
Self::new()
}
}
impl VectorStoreTrait for VectorStore {
    /// Stores `vector` in the index under the caller-supplied `id`.
    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
        self.index.insert(id, vector)
    }

    /// Stores `vector` under a freshly generated `vec_<uuid-v4>` id and
    /// returns that id.
    fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
        let id = format!("vec_{}", uuid::Uuid::new_v4());
        self.index.insert(id.clone(), vector)?;
        Ok(id)
    }

    /// Returns a clone of the vector stored under `id`, or `None` if the
    /// index has no such vector.
    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
        Ok(self.index.get_vector(id).cloned())
    }

    /// Returns the ids of all vectors reported by the index's
    /// `iter_vectors()`.
    ///
    /// Previously a stub that always returned an empty list.
    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
        let ids = self
            .index
            .iter_vectors()
            .into_iter()
            .map(|(id, _vector)| id)
            .collect();
        Ok(ids)
    }

    /// Forwards `query` and `k` to the index's `search_knn`.
    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
        self.index.search_knn(query, k)
    }

    /// Removes the vector stored under `id`.
    ///
    /// Returns `Ok(true)` when a vector was present and the index removed it,
    /// and `Ok(false)` when the index reports no vector for `id`. Previously
    /// this was a stub that never removed anything and always returned
    /// `Ok(false)`.
    ///
    /// # Errors
    /// Propagates the index's removal error.
    fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
        // Check presence first: the index's behaviour for unknown ids is not
        // visible here, and callers expect `false` rather than an error.
        if self.index.get_vector(id).is_none() {
            return Ok(false);
        }
        self.index.remove_vector(id.clone())?;
        Ok(true)
    }

    /// Number of vectors reported by the index.
    ///
    /// Previously a stub that always returned `0`.
    ///
    /// NOTE(review): this counts via `iter_vectors()`, which materializes all
    /// pairs. If `VectorIndex` exposes a cheaper size query, prefer it.
    fn len(&self) -> usize {
        self.index.iter_vectors().len()
    }
}
/// Query input accepted by `VectorStore::advanced_search`.
#[derive(Debug, Clone)]
pub enum SearchQuery {
/// Raw text; `advanced_search` converts it with the store's hash-derived fallback vector.
Text(String),
/// A ready-made query vector; `advanced_search` uses it as-is.
Vector(Vector),
}
/// Search strategy selected for `VectorStore::advanced_search`.
#[derive(Debug, Clone)]
pub enum SearchType {
/// Forwarded to the index's `search_knn` as its count argument.
KNN(usize),
/// Forwarded to the index's `search_threshold` as its threshold argument.
Threshold(f32),
}
/// Bundle of inputs consumed by `VectorStore::advanced_search`.
#[derive(Debug, Clone)]
pub struct SearchOptions {
/// What to search for: text or a precomputed vector.
pub query: SearchQuery,
/// How to search: k-nearest-neighbours or threshold search.
pub search_type: SearchType,
}
/// Record describing one result of a vector operation.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads this
/// type; the field descriptions below are inferred from names only — confirm against callers.
#[derive(Debug, Clone)]
pub struct VectorOperationResult {
/// Presumably the identifier (URI) of the matched resource.
pub uri: String,
/// Presumably the similarity score of the match.
pub similarity: f32,
/// Optionally the matched vector itself.
pub vector: Option<Vector>,
/// Optional string key/value metadata attached to the result.
pub metadata: Option<std::collections::HashMap<String, String>>,
/// Presumably the position of this result in the ranking — TODO confirm whether 0- or 1-based.
pub rank: usize,
}
/// Stateless helpers that apply `VectorStore` operations to whole batches.
pub struct DocumentBatchProcessor;

impl DocumentBatchProcessor {
    /// Indexes every `(uri, content)` pair through `VectorStore::index_resource`.
    ///
    /// A failing document does not stop the batch: the returned vector holds
    /// one result per input document, in input order. The outer `Result` is
    /// always `Ok`.
    pub fn batch_index(
        store: &mut VectorStore,
        documents: &[(String, String)],
    ) -> Result<Vec<Result<()>>> {
        let outcomes: Vec<Result<()>> = documents
            .iter()
            .map(|(uri, content)| store.index_resource(uri.clone(), content))
            .collect();
        Ok(outcomes)
    }

    /// Runs `VectorStore::similarity_search` for every query with the same
    /// `limit`.
    ///
    /// A failing query does not stop the batch: the result holds one entry
    /// per query, in input order. The outer `Result` is always `Ok`.
    pub fn batch_search(
        store: &VectorStore,
        queries: &[String],
        limit: usize,
    ) -> Result<BatchSearchResult> {
        let outcomes: Vec<_> = queries
            .iter()
            .map(|query| store.similarity_search(query, limit))
            .collect();
        Ok(outcomes)
    }
}