#![allow(dead_code)]
use super::{CodeEmbedding, SemanticConfig};
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use tokio::fs;
use tracing::{info, debug, warn};
/// Sidecar record identifying which embedding backend produced the vectors
/// currently on disk, so two incompatible vector spaces are never mixed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingManifest {
    /// Stable identifier of the embedder, e.g. `openai:text-embedding-3-small@384`.
    pub embedder_id: String,
    /// Timestamp of the last manifest write (UTC).
    pub saved_at: chrono::DateTime<chrono::Utc>,
}
/// Incremental-index cache: per-file entries (content hash + embeddings),
/// valid only for the embedder recorded in `embedder_id`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IndexCache {
    /// Embedder that produced every entry in `files`.
    pub embedder_id: String,
    /// Keyed by file path string; each entry caches that file's embeddings.
    pub files: std::collections::HashMap<String, FileCacheEntry>,
}
/// Cached result for a single source file in the incremental index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileCacheEntry {
    /// Hash of the file content when the embeddings were computed; used to
    /// detect staleness without re-embedding unchanged files.
    pub content_hash: String,
    /// Embeddings produced from this file's chunks.
    pub embeddings: Vec<CodeEmbedding>,
}
/// File-system backed persistence for code embeddings: one JSON file per
/// repository plus `config.json`, `manifest.json`, and `cache.json` metadata.
pub struct EmbeddingStorage {
    /// Root directory for all storage files (`<base>/embeddings`).
    base_path: PathBuf,
    /// Indexing configuration, persisted alongside the data for debugging.
    config: SemanticConfig,
}
impl EmbeddingStorage {
    /// Metadata files that live next to the per-repository embedding files
    /// and must never be parsed as `Vec<CodeEmbedding>`.
    const RESERVED_FILES: [&'static str; 3] = ["config.json", "manifest.json", "cache.json"];

    /// Creates storage rooted at `<base_path>/embeddings`, creating the
    /// directory if it does not exist.
    pub fn new(base_path: &Path, config: SemanticConfig) -> Result<Self> {
        let storage_path = base_path.join("embeddings");
        std::fs::create_dir_all(&storage_path)?;
        Ok(Self {
            base_path: storage_path,
            config,
        })
    }

    fn manifest_path(&self) -> PathBuf {
        self.base_path.join("manifest.json")
    }

    fn cache_path(&self) -> PathBuf {
        self.base_path.join("cache.json")
    }

    /// True when `path` looks like a per-repository embedding file: a `.json`
    /// file whose name is not one of the reserved metadata files.
    fn is_embedding_file(path: &Path) -> bool {
        if !path.extension().map(|e| e == "json").unwrap_or(false) {
            return false;
        }
        match path.file_name().and_then(|n| n.to_str()) {
            Some(name) => !Self::RESERVED_FILES.iter().any(|r| *r == name),
            None => false,
        }
    }

    /// Maps a repository name like `owner/repo` to its on-disk JSON file.
    ///
    /// NOTE(review): `/` -> `_` can collide (`a/b` vs `a_b`), and a repository
    /// literally named `config`, `manifest`, or `cache` would clobber the
    /// metadata files — worth sanitizing upstream; confirm repository names
    /// are always `owner/name` shaped.
    fn repo_file_path(&self, repository: &str) -> PathBuf {
        let safe_name = repository.replace('/', "_");
        self.base_path.join(format!("{}.json", safe_name))
    }

    /// A fresh, empty cache bound to `embedder_id`.
    fn empty_cache(embedder_id: &str) -> IndexCache {
        IndexCache {
            embedder_id: embedder_id.to_string(),
            files: Default::default(),
        }
    }

    /// Loads the incremental-index cache, returning an empty cache when the
    /// file is missing, unreadable, or was produced by a different embedder.
    /// Never fails: a bad cache just means a full re-index.
    pub async fn load_cache(&self, expected_id: &str) -> IndexCache {
        let path = self.cache_path();
        if !path.exists() {
            return Self::empty_cache(expected_id);
        }
        let content = match fs::read_to_string(&path).await {
            Ok(c) => c,
            Err(_) => return Self::empty_cache(expected_id),
        };
        let parsed: IndexCache = match serde_json::from_str(&content) {
            Ok(c) => c,
            Err(_) => {
                warn!("embedding cache at {} unreadable; rebuilding", path.display());
                return Self::empty_cache(expected_id);
            }
        };
        // A cache from a different embedder describes vectors in another
        // space; discard it rather than reuse stale entries.
        if parsed.embedder_id != expected_id {
            warn!(
                "embedding cache was produced by `{}`, current backend is `{}` — discarding",
                parsed.embedder_id, expected_id
            );
            return Self::empty_cache(expected_id);
        }
        parsed
    }

    /// Persists the incremental-index cache as `cache.json`.
    pub async fn save_cache(&self, cache: &IndexCache) -> Result<()> {
        let path = self.cache_path();
        let content = serde_json::to_string(cache)?;
        fs::write(path, content).await?;
        Ok(())
    }

    /// Reads the manifest, if any. Any read/parse failure is treated as
    /// "no manifest" (pre-manifest layouts are handled by the caller).
    pub async fn load_manifest(&self) -> Option<EmbeddingManifest> {
        let path = self.manifest_path();
        if !path.exists() {
            return None;
        }
        let content = fs::read_to_string(&path).await.ok()?;
        serde_json::from_str(&content).ok()
    }

    /// Writes a fresh manifest recording `embedder_id` and the current time.
    async fn write_manifest(&self, embedder_id: &str) -> Result<()> {
        let m = EmbeddingManifest {
            embedder_id: embedder_id.to_string(),
            saved_at: chrono::Utc::now(),
        };
        fs::write(self.manifest_path(), serde_json::to_string_pretty(&m)?).await?;
        Ok(())
    }

    /// Saves embeddings grouped into one JSON file per repository, then
    /// persists the indexing config alongside the data.
    ///
    /// Does not write a manifest; prefer [`Self::save_embeddings_with_id`]
    /// so the embedder identity is recorded.
    pub async fn save_embeddings(&self, embeddings: &[CodeEmbedding]) -> Result<()> {
        debug!("Saving {} embeddings", embeddings.len());
        let mut by_repo: std::collections::HashMap<String, Vec<CodeEmbedding>> =
            std::collections::HashMap::new();
        for embedding in embeddings {
            by_repo
                .entry(embedding.metadata.repository.clone())
                .or_default()
                .push(embedding.clone());
        }
        for (repo, repo_embeddings) in by_repo {
            let file_path = self.repo_file_path(&repo);
            let data = serde_json::to_string_pretty(&repo_embeddings)?;
            fs::write(&file_path, data).await?;
            info!("Saved {} embeddings for {}", repo_embeddings.len(), repo);
        }
        let config_path = self.base_path.join("config.json");
        let config_data = serde_json::to_string_pretty(&self.config)?;
        fs::write(&config_path, config_data).await?;
        Ok(())
    }

    /// Saves embeddings and records which embedder produced them, so later
    /// loads can refuse mismatched vector spaces.
    pub async fn save_embeddings_with_id(
        &self,
        embeddings: &[CodeEmbedding],
        embedder_id: &str,
    ) -> Result<()> {
        self.save_embeddings(embeddings).await?;
        self.write_manifest(embedder_id).await?;
        Ok(())
    }

    /// Loads every per-repository embedding file in the storage directory.
    ///
    /// Skips all reserved metadata files. In particular `cache.json` must be
    /// skipped: previously it was read as embeddings, so any call after
    /// `save_cache` failed with a deserialization error.
    pub async fn load_embeddings(&self) -> Result<Vec<CodeEmbedding>> {
        let mut all_embeddings = Vec::new();
        let mut entries = fs::read_dir(&self.base_path).await?;
        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if !Self::is_embedding_file(&path) {
                continue;
            }
            let content = fs::read_to_string(&path).await?;
            let embeddings: Vec<CodeEmbedding> = serde_json::from_str(&content)?;
            debug!("Loaded {} embeddings from {}", embeddings.len(), path.display());
            all_embeddings.extend(embeddings);
        }
        info!("Loaded {} total embeddings", all_embeddings.len());
        Ok(all_embeddings)
    }

    /// Loads all embeddings only when the on-disk manifest matches
    /// `expected_id`; otherwise returns an empty vector (with a warning)
    /// rather than mixing incompatible vector spaces.
    pub async fn load_embeddings_for(&self, expected_id: &str) -> Result<Vec<CodeEmbedding>> {
        match self.load_manifest().await {
            Some(m) if m.embedder_id == expected_id => self.load_embeddings().await,
            Some(m) => {
                warn!(
                    "embeddings on disk were produced by `{}` but the current embedder is `{}`. \
                     Refusing to mix vector spaces — semantic search will return empty until you \
                     re-run `i-self index <path>`.",
                    m.embedder_id, expected_id
                );
                Ok(Vec::new())
            }
            None => {
                // Pre-manifest layout: embedding data but no manifest. Only
                // real embedding files count here — metadata such as a lone
                // `cache.json` previously triggered a false legacy warning.
                let mut has_any_embeddings = false;
                if let Ok(mut entries) = fs::read_dir(&self.base_path).await {
                    while let Ok(Some(entry)) = entries.next_entry().await {
                        if Self::is_embedding_file(&entry.path()) {
                            has_any_embeddings = true;
                            break;
                        }
                    }
                }
                if has_any_embeddings {
                    warn!(
                        "embeddings directory has data but no manifest (pre-manifest layout). \
                         Refusing to use to avoid mixing with current `{}` backend. \
                         Re-run `i-self index <path>` to rebuild.",
                        expected_id
                    );
                }
                Ok(Vec::new())
            }
        }
    }

    /// Loads the embeddings for one repository; missing file means empty.
    pub async fn load_repository_embeddings(&self, repository: &str) -> Result<Vec<CodeEmbedding>> {
        let file_path = self.repo_file_path(repository);
        if !file_path.exists() {
            return Ok(Vec::new());
        }
        let content = fs::read_to_string(&file_path).await?;
        let embeddings: Vec<CodeEmbedding> = serde_json::from_str(&content)?;
        Ok(embeddings)
    }

    /// Deletes a repository's embedding file if present (no-op otherwise).
    pub async fn delete_repository(&self, repository: &str) -> Result<()> {
        let file_path = self.repo_file_path(repository);
        if file_path.exists() {
            fs::remove_file(&file_path).await?;
            info!("Deleted embeddings for {}", repository);
        }
        Ok(())
    }

    /// Best-effort storage statistics. `total_files`/`total_size_bytes` cover
    /// every JSON file (including metadata); `total_embeddings` counts only
    /// parseable embedding files. I/O errors are silently skipped.
    pub async fn stats(&self) -> StorageStats {
        let mut total_files = 0;
        let mut total_size = 0;
        let mut total_embeddings = 0;
        if let Ok(mut entries) = fs::read_dir(&self.base_path).await {
            while let Ok(Some(entry)) = entries.next_entry().await {
                let path = entry.path();
                if let Ok(metadata) = entry.metadata().await {
                    if metadata.is_file() && path.extension().map(|e| e == "json").unwrap_or(false) {
                        total_files += 1;
                        total_size += metadata.len();
                        // Only embedding files can contribute to the count;
                        // parsing metadata files here could never succeed.
                        if Self::is_embedding_file(&path) {
                            if let Ok(content) = fs::read_to_string(&path).await {
                                if let Ok(embeddings) =
                                    serde_json::from_str::<Vec<CodeEmbedding>>(&content)
                                {
                                    total_embeddings += embeddings.len();
                                }
                            }
                        }
                    }
                }
            }
        }
        StorageStats {
            total_files,
            total_size_bytes: total_size,
            total_embeddings,
        }
    }

    /// Exports every stored embedding to one pretty-printed JSON file.
    pub async fn export(&self, output_path: &Path) -> Result<()> {
        let embeddings = self.load_embeddings().await?;
        let data = serde_json::to_string_pretty(&embeddings)?;
        fs::write(output_path, data).await?;
        info!("Exported {} embeddings to {}", embeddings.len(), output_path.display());
        Ok(())
    }

    /// Imports embeddings from a JSON file produced by [`Self::export`].
    /// Note this does not write a manifest (same as `save_embeddings`).
    pub async fn import(&self, input_path: &Path) -> Result<()> {
        let content = fs::read_to_string(input_path).await?;
        let embeddings: Vec<CodeEmbedding> = serde_json::from_str(&content)?;
        self.save_embeddings(&embeddings).await?;
        info!("Imported {} embeddings from {}", embeddings.len(), input_path.display());
        Ok(())
    }
}
/// Summary of what is on disk, as reported by `EmbeddingStorage::stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageStats {
    /// Number of JSON files in the storage directory.
    pub total_files: usize,
    /// Combined size of those files in bytes.
    pub total_size_bytes: u64,
    /// Number of embeddings successfully parsed from the files.
    pub total_embeddings: usize,
}
/// Top-level index description across repositories.
///
/// NOTE(review): not referenced anywhere in this file (covered by the
/// file-level `allow(dead_code)`); presumably consumed elsewhere — confirm.
/// `Default` relies on chrono providing `DateTime<Utc>: Default` (epoch).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct IndexMetadata {
    /// One entry per indexed repository.
    pub repositories: Vec<RepositoryIndex>,
    /// When the index was last refreshed (UTC).
    pub last_updated: chrono::DateTime<chrono::Utc>,
}
/// Per-repository entry inside `IndexMetadata`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepositoryIndex {
    /// Repository name, e.g. `owner/repo`.
    pub name: String,
    /// Number of embeddings stored for this repository.
    pub embedding_count: usize,
    /// Languages seen while indexing this repository.
    pub languages: Vec<String>,
    /// When this repository was last indexed (UTC).
    pub last_indexed: chrono::DateTime<chrono::Utc>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::semantic::{CodeEmbedding, EmbeddingMetadata};

    /// Embedder id used by the happy-path tests.
    const OPENAI_ID: &str = "openai:text-embedding-3-small@384";
    /// A second, incompatible embedder id used to trigger mismatches.
    const HASH_ID: &str = "hash-bucket@384";

    /// Builds one minimal embedding belonging to `test/repo`.
    fn sample_embedding() -> CodeEmbedding {
        let metadata = EmbeddingMetadata {
            source_file: "main.rs".into(),
            repository: "test/repo".into(),
            language: "Rust".into(),
            start_line: 0,
            end_line: 1,
            function_name: Some("main".into()),
            tags: vec!["rust".into()],
        };
        CodeEmbedding {
            id: "1".to_string(),
            content: "fn main() {}".to_string(),
            embedding: vec![0.1; 384],
            metadata,
            created_at: chrono::Utc::now(),
        }
    }

    /// Fresh storage in a brand-new temp dir. The `TempDir` guard is returned
    /// as well so the directory outlives the test body.
    fn fresh_storage() -> (tempfile::TempDir, EmbeddingStorage) {
        let dir = tempfile::tempdir().unwrap();
        let storage = EmbeddingStorage::new(dir.path(), SemanticConfig::default()).unwrap();
        (dir, storage)
    }

    #[tokio::test]
    async fn manifest_round_trip_matches() {
        let (_guard, storage) = fresh_storage();
        storage
            .save_embeddings_with_id(&[sample_embedding()], OPENAI_ID)
            .await
            .unwrap();
        let restored = storage.load_embeddings_for(OPENAI_ID).await.unwrap();
        assert_eq!(restored.len(), 1);
    }

    #[tokio::test]
    async fn manifest_mismatch_returns_empty_with_warning() {
        let (_guard, storage) = fresh_storage();
        storage
            .save_embeddings_with_id(&[sample_embedding()], OPENAI_ID)
            .await
            .unwrap();
        let restored = storage.load_embeddings_for(HASH_ID).await.unwrap();
        assert!(restored.is_empty(), "mismatch should drop everything");
    }

    #[tokio::test]
    async fn legacy_index_without_manifest_is_refused() {
        let (_guard, storage) = fresh_storage();
        // Simulate a pre-manifest layout: data saved without an embedder id.
        storage.save_embeddings(&[sample_embedding()]).await.unwrap();
        let gated = storage.load_embeddings_for(HASH_ID).await.unwrap();
        assert!(gated.is_empty(), "legacy data should not be silently used");
        // The raw loader still sees the file itself.
        let raw = storage.load_embeddings().await.unwrap();
        assert_eq!(raw.len(), 1);
    }

    #[tokio::test]
    async fn empty_dir_is_not_a_warning_case() {
        let (_guard, storage) = fresh_storage();
        let restored = storage.load_embeddings_for(HASH_ID).await.unwrap();
        assert!(restored.is_empty());
    }
}