use anyhow::{anyhow, Result};
use serde_json::Value as JsonValue;
use std::collections::HashMap;
use std::path::Path;
use uuid::Uuid;
use crate::storage::engine::{BlobStorage, JsonStorage};
use crate::storage::embedding::EmbeddingEngine;
use crate::storage::vector::{SearchResult, VectorEngine};
/// Document store that keeps each document in three backends under one UUID:
/// JSON metadata, raw content bytes, and vector-engine entries.
#[derive(Clone)]
pub struct DocumentStorage {
/// JSON metadata store, keyed by the document UUID.
meta: JsonStorage,
/// Raw document content, keyed by the same UUID as the metadata.
blobs: BlobStorage,
/// Vector engine holding the `<uuid>:meta` and `<uuid>:content` entries.
vectors: VectorEngine,
}
impl DocumentStorage {
pub fn with_embedding(root: &str, engine: EmbeddingEngine) -> Result<Self> {
let paths = Paths::from(root)?;
Ok(Self {
meta: JsonStorage::new(&paths.metadata_db, 4, "doc")?,
blobs: BlobStorage::new(&paths.blobs_db, 4)?,
vectors: VectorEngine::with_embedding(&paths.vec, engine)?,
})
}
pub fn add_document(&self, metadata: JsonValue, content: &[u8]) -> Result<Uuid> {
let id = Uuid::now_v7();
let id_str = id.to_string();
self.meta.add_json_with_id(id, metadata.clone())?;
self.blobs.add_blob_with_key(id, content)?;
let content_text = String::from_utf8_lossy(content).into_owned();
self.vectors.store_documents_batch(&[
(&format!("{id_str}:meta"), metadata),
(&format!("{id_str}:content"), serde_json::json!(content_text)),
])?;
Ok(id)
}
pub fn add_document_no_embed(&self, metadata: JsonValue, content: &[u8]) -> Result<Uuid> {
let id = Uuid::now_v7();
self.meta.add_json_with_id(id, metadata)?;
self.blobs.add_blob_with_key(id, content)?;
Ok(id)
}
pub fn update_metadata(&self, id: Uuid, metadata: JsonValue) -> Result<()> {
self.meta.update_json(id, metadata)
}
pub fn update_content(&self, id: Uuid, content: &[u8]) -> Result<()> {
self.blobs.update_blob(id, content)
}
pub fn delete_document(&self, id: Uuid) -> Result<()> {
let id_str = id.to_string();
self.meta.drop_json(id)?;
self.blobs.drop_blob(id)?;
self.vectors.delete_vector(&format!("{id_str}:meta"))?;
self.vectors.delete_vector(&format!("{id_str}:content"))?;
Ok(())
}
pub fn reembed_document(&self, id: Uuid) -> Result<()> {
let id_str = id.to_string();
if let Some(metadata) = self.meta.get_json(id)? {
self.vectors
.store_document(&format!("{id_str}:meta"), metadata)?;
}
if let Some(bytes) = self.blobs.get_blob(id)? {
let text = String::from_utf8_lossy(&bytes).into_owned();
self.vectors.store_document(
&format!("{id_str}:content"),
serde_json::json!(text),
)?;
}
Ok(())
}
pub fn get_content(&self, id: Uuid) -> Result<Option<Vec<u8>>> {
self.blobs.get_blob(id)
}
pub fn list_metadata(&self) -> Result<Vec<(Uuid, JsonValue)>> {
self.meta.list_all()
}
pub fn search_document_text(&self, query: &str, limit: usize) -> Result<Vec<JsonValue>> {
let pool = limit.max(1) * 4;
let candidates = self
.vectors
.search_json(&serde_json::json!(query), pool)?;
self.build_results(candidates, limit)
}
pub fn sync(&self) -> Result<()> {
self.vectors.sync()
}
pub fn checkpoint(&self) -> Result<()> {
self.meta.checkpoint()?;
self.blobs.checkpoint()?;
Ok(())
}
fn build_results(
&self,
candidates: Vec<SearchResult>,
limit: usize,
) -> Result<Vec<JsonValue>> {
let mut best: HashMap<String, f32> = HashMap::new();
for r in &candidates {
let uuid_str = strip_suffix(&r.id).to_string();
let entry = best.entry(uuid_str).or_insert(f32::NEG_INFINITY);
if r.score > *entry {
*entry = r.score;
}
}
let mut ranked: Vec<(String, f32)> = best.into_iter().collect();
ranked.sort_by(|a, b| {
b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
});
ranked.truncate(limit);
let mut out = Vec::with_capacity(ranked.len());
for (uuid_str, score) in ranked {
let uuid = Uuid::parse_str(&uuid_str)
.map_err(|e| anyhow!("invalid UUID in vector index: {e}"))?;
let metadata = self.meta.get_json(uuid)?.unwrap_or(JsonValue::Null);
let content_bytes = self.blobs.get_blob(uuid)?.unwrap_or_default();
let document = String::from_utf8_lossy(&content_bytes).into_owned();
out.push(serde_json::json!({
"id": uuid_str,
"metadata": metadata,
"document": document,
"score": score,
}));
}
Ok(out)
}
}
/// Returns `id` without a trailing `:meta` or `:content` marker.
///
/// At most one marker is removed, with `:meta` checked first; an `id` that ends
/// in neither is returned unchanged.
fn strip_suffix(id: &str) -> &str {
    for marker in [":meta", ":content"] {
        if let Some(stem) = id.strip_suffix(marker) {
            return stem;
        }
    }
    id
}
/// Filesystem locations of the three backends, all derived from one root directory.
struct Paths {
/// `<root>/metadata.db`, passed to the JSON metadata store.
metadata_db: String,
/// `<root>/blobs.db`, passed to the blob store.
blobs_db: String,
/// `<root>/vectors`, passed to the vector engine.
vec: String,
}
impl Paths {
    /// Derives the backend paths from `root`, creating `root` and `root/vectors`
    /// on disk if they do not exist yet.
    ///
    /// # Errors
    /// Fails if either directory cannot be created.
    fn from(root: &str) -> Result<Self> {
        let root = Path::new(root);
        let vectors_dir = root.join("vectors");

        if let Err(e) = std::fs::create_dir_all(root) {
            return Err(anyhow!("cannot create root dir {root:?}: {e}"));
        }
        if let Err(e) = std::fs::create_dir_all(&vectors_dir) {
            return Err(anyhow!("cannot create vectors dir: {e}"));
        }

        // The backends take their locations as strings.
        let as_string = |p: &Path| p.to_string_lossy().into_owned();
        Ok(Self {
            metadata_db: as_string(&root.join("metadata.db")),
            blobs_db: as_string(&root.join("blobs.db")),
            vec: as_string(&vectors_dir),
        })
    }
}