#![allow(dead_code)]
use super::{CodeEmbedding, EmbeddingGenerator, EmbeddingMetadata, SemanticConfig};
use anyhow::Result;
use std::path::Path;
use tokio::fs;
use tracing::{info, debug, warn};
use walkdir::WalkDir;
/// Orchestrates embedding-based indexing of source trees: finds code files,
/// chunks and embeds them via the wrapped [`EmbeddingGenerator`], and emits
/// [`CodeEmbedding`] records (see the `impl` below).
pub struct SemanticIndex {
// Produces code chunks and embedding vectors for file contents.
generator: EmbeddingGenerator,
// Configuration the generator was built from, retained for later use.
config: SemanticConfig,
}
impl SemanticIndex {
    /// Builds an index whose embeddings are produced by an
    /// [`EmbeddingGenerator`] constructed from `config`.
    ///
    /// # Errors
    /// Propagates any failure from `EmbeddingGenerator::new`.
    pub fn new(config: SemanticConfig) -> Result<Self> {
        let generator = EmbeddingGenerator::new(config.clone())?;
        Ok(Self { generator, config })
    }

    /// Recursively indexes every recognized code file under `path`, tagging
    /// each embedding with `repository`, and returns the flattened list of
    /// chunk embeddings.
    ///
    /// Per-file failures are logged with `warn!` and skipped so one bad file
    /// cannot abort the whole run.
    pub async fn index_directory(&self, path: &Path, repository: &str) -> Result<Vec<CodeEmbedding>> {
        info!("Indexing directory: {}", path.display());
        let mut embeddings = Vec::new();
        let code_files = self.find_code_files(path).await?;
        info!("Found {} code files to index", code_files.len());
        for file_path in code_files {
            match self.index_file(&file_path, repository).await {
                Ok(file_embeddings) => {
                    debug!("Indexed {} chunks from {}", file_embeddings.len(), file_path.display());
                    embeddings.extend(file_embeddings);
                }
                Err(e) => {
                    warn!("Failed to index {}: {}", file_path.display(), e);
                }
            }
        }
        info!("Generated {} total embeddings", embeddings.len());
        Ok(embeddings)
    }

    /// Like [`Self::index_directory`], but consults `cache` to skip files
    /// whose SHA-256 content hash is unchanged since the previous run.
    ///
    /// Returns `(embeddings, cache_hits, cache_misses)`. Cache entries for
    /// files under `path` that no longer exist are pruned.
    pub async fn index_directory_incremental(
        &self,
        path: &Path,
        repository: &str,
        cache: &mut crate::semantic::storage::IndexCache,
    ) -> Result<(Vec<CodeEmbedding>, usize, usize)> {
        info!("Incrementally indexing directory: {}", path.display());
        let code_files = self.find_code_files(path).await?;
        let mut embeddings = Vec::new();
        let mut hits = 0usize;
        let mut misses = 0usize;
        // Paths observed this run; used below to prune stale cache entries.
        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
        for file_path in &code_files {
            let path_key = file_path.to_string_lossy().to_string();
            seen.insert(path_key.clone());
            let content = match tokio::fs::read_to_string(file_path).await {
                Ok(c) => c,
                Err(e) => {
                    warn!("read {}: {}", file_path.display(), e);
                    continue;
                }
            };
            let hash = sha256_hex(&content);
            // Cache hit: identical content hash means the stored embeddings
            // are still valid — reuse them without re-embedding.
            if let Some(entry) = cache.files.get(&path_key) {
                if entry.content_hash == hash {
                    embeddings.extend(entry.embeddings.iter().cloned());
                    hits += 1;
                    continue;
                }
            }
            misses += 1;
            match self.index_file(file_path, repository).await {
                Ok(file_embeddings) => {
                    cache.files.insert(
                        path_key,
                        crate::semantic::storage::FileCacheEntry {
                            content_hash: hash,
                            embeddings: file_embeddings.clone(),
                        },
                    );
                    embeddings.extend(file_embeddings);
                }
                Err(e) => warn!("Failed to index {}: {}", file_path.display(), e),
            }
        }
        // Drop cache entries for files under this root that have vanished.
        // NOTE(review): plain string-prefix matching can also catch sibling
        // roots (e.g. "src" vs "src2"); Path::starts_with would be stricter.
        let root_str = path.to_string_lossy().to_string();
        cache
            .files
            .retain(|k, _| !k.starts_with(&root_str) || seen.contains(k));
        info!(
            "Generated {} embeddings — {} cached, {} fresh ({} files total)",
            embeddings.len(),
            hits,
            misses,
            code_files.len()
        );
        Ok((embeddings, hits, misses))
    }

    /// Reads `file_path`, chunks it, embeds all chunks in a single batch,
    /// and wraps each vector in a [`CodeEmbedding`] record.
    async fn index_file(&self, file_path: &Path, repository: &str) -> Result<Vec<CodeEmbedding>> {
        let content = fs::read_to_string(file_path).await?;
        // NOTE(review): stripping the immediate parent yields just the file
        // name, not a path relative to the indexed root — confirm intent.
        let relative_path = file_path.strip_prefix(file_path.parent().unwrap_or(file_path))
            .unwrap_or(file_path)
            .to_string_lossy()
            .to_string();
        let chunks = self.generator.chunk_code(&content, &relative_path);
        if chunks.is_empty() {
            return Ok(Vec::new());
        }
        let texts: Vec<String> = chunks.iter().map(|c| c.content.clone()).collect();
        let vectors = self.generator.embed(&texts).await?;
        // Tags depend only on the file, not on the chunk — compute them once
        // instead of re-scanning `content` for every chunk.
        let tags = extract_tags(&relative_path, &content);
        let embeddings: Vec<CodeEmbedding> = chunks.into_iter()
            .zip(vectors)
            .map(|(chunk, embedding)| CodeEmbedding {
                id: uuid::Uuid::new_v4().to_string(),
                content: chunk.content,
                embedding,
                metadata: EmbeddingMetadata {
                    source_file: relative_path.clone(),
                    repository: repository.to_string(),
                    language: chunk.language,
                    start_line: chunk.start_line,
                    end_line: chunk.end_line,
                    function_name: chunk.function_name,
                    tags: tags.clone(),
                },
                created_at: chrono::Utc::now(),
            })
            .collect();
        Ok(embeddings)
    }

    /// Collects indexable files under `path` (depth ≤ 10, symlinks not
    /// followed), filtering out skip-listed directories and non-code files.
    ///
    /// NOTE(review): `WalkDir` performs blocking I/O inside an async fn;
    /// consider `spawn_blocking` if this runs on a busy runtime thread.
    async fn find_code_files(&self, path: &Path) -> Result<Vec<std::path::PathBuf>> {
        let mut files = Vec::new();
        for entry in WalkDir::new(path)
            .follow_links(false)
            .max_depth(10)
            .into_iter()
            .filter_map(|e| e.ok())
        {
            let entry_path = entry.path();
            if entry_path.is_file() && !should_skip_path(entry_path) && is_code_file(entry_path) {
                files.push(entry_path.to_path_buf());
            }
        }
        Ok(files)
    }

    /// Shallow-clones `owner/repo` into a temp directory and indexes it.
    ///
    /// Bug fix: the original appended `--origin origin <auth_url>` AFTER the
    /// positional `<url> <dir>` arguments whenever a token was supplied,
    /// handing `git clone` two repositories and breaking authenticated
    /// clones. The token now replaces the userinfo of the single clone URL.
    pub async fn index_github_repo(&self, owner: &str, repo: &str, token: Option<&str>) -> Result<Vec<CodeEmbedding>> {
        use std::process::Command;
        let temp_dir = tempfile::tempdir()?;
        let public_url = format!("https://github.com/{}/{}.git", owner, repo);
        // NOTE(review): a token embedded in the URL is visible in the process
        // list; a git credential helper would be more robust hardening.
        let clone_url = match token {
            Some(token) => format!("https://{}@github.com/{}/{}.git", token, owner, repo),
            None => public_url.clone(),
        };
        // Log only the token-free URL.
        info!("Cloning repository: {}", public_url);
        let output = Command::new("git")
            .args(["clone", "--depth", "1", clone_url.as_str()])
            // Passing the Path directly avoids panicking on non-UTF-8 paths
            // (the original used `to_str().unwrap()`).
            .arg(temp_dir.path())
            .output()?;
        if !output.status.success() {
            anyhow::bail!("Failed to clone repository: {}", String::from_utf8_lossy(&output.stderr));
        }
        // `temp_dir` stays alive until the end of scope, so the clone exists
        // for the whole indexing pass and is removed afterwards.
        let embeddings = self.index_directory(temp_dir.path(), &format!("{}/{}", owner, repo)).await?;
        Ok(embeddings)
    }
}
/// Returns the SHA-256 digest of `content` as a 64-character lowercase
/// hexadecimal string.
fn sha256_hex(content: &str) -> String {
    use sha2::{Digest, Sha256};
    use std::fmt::Write as _;
    let digest = Sha256::digest(content.as_bytes());
    let mut hex = String::with_capacity(digest.len() * 2);
    for byte in digest {
        // `{:02x}` emits each byte as two lowercase hex digits.
        let _ = write!(hex, "{:02x}", byte);
    }
    hex
}
/// Returns `true` when `path` carries a (case-insensitively matched) file
/// extension belonging to a known programming or markup language.
fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "kt", "scala",
        "rb", "php", "c", "cpp", "cc", "cxx", "h", "hpp", "cs", "swift",
        "m", "mm", "r", "pl", "pm", "lua", "sh", "bash", "zsh", "ps1",
        "sql", "html", "css", "scss", "sass", "less", "vue", "svelte",
    ];
    // Extensionless paths (e.g. "Makefile") are never treated as code.
    path.extension().map_or(false, |ext| {
        let ext = ext.to_string_lossy().to_lowercase();
        CODE_EXTENSIONS.contains(&ext.as_str())
    })
}
/// Returns `true` for paths inside directories that should not be indexed:
/// build output, dependency caches, VCS metadata, and any hidden
/// (dot-prefixed) file or directory name.
///
/// Bug fix: the original tested every path component with
/// `starts_with('.')`, which also matched the `"."`/`".."` *syntax*
/// components, so a relative root such as `./src` caused every file under it
/// to be skipped. Only `Component::Normal` name segments are inspected now.
fn should_skip_path(path: &Path) -> bool {
    const SKIP_DIRS: &[&str] = &[
        "node_modules", "target", "build", "dist", ".git",
        "__pycache__", ".next", "out", "vendor", "bin", "obj",
        ".idea", ".vscode", "coverage", ".nyc_output",
    ];
    path.components().any(|component| match component {
        // Real names: skip-listed directories and hidden (dot-prefixed) names.
        std::path::Component::Normal(name) => name
            .to_str()
            .map_or(false, |s| SKIP_DIRS.contains(&s) || s.starts_with('.')),
        // `.`/`..`/root/prefix are path syntax, not hidden names.
        _ => false,
    })
}
fn extract_tags(file_path: &str, content: &str) -> Vec<String> {
let mut tags = Vec::new();
let ext = std::path::Path::new(file_path)
.extension()
.and_then(|e| e.to_str())
.unwrap_or("");
match ext {
"rs" => tags.push("rust".to_string()),
"py" => tags.push("python".to_string()),
"js" => tags.push("javascript".to_string()),
"ts" => tags.push("typescript".to_string()),
"go" => tags.push("go".to_string()),
_ => {}
}
if content.contains("async fn") || content.contains("async def") {
tags.push("async".to_string());
}
if content.contains("test") || content.contains("#[test]") {
tags.push("testing".to_string());
}
if content.contains("TODO") || content.contains("FIXME") {
tags.push("has-todos".to_string());
}
tags
}