pub mod ast_chunker;
mod chunker;
mod file_walker;
mod hasher;
pub mod language;
pub mod regex_chunker;
pub use chunker::{Chunk, Chunker};
pub use file_walker::FileWalker;
pub use hasher::content_hash;
#[allow(unused_imports)]
pub use chunker::ChunkType;
#[allow(unused_imports)]
pub use language::{detect_language, detect_language_from_str, Language};
use crate::config::VyctorConfig;
use crate::embeddings::{create_provider, EmbeddingProvider};
use crate::storage::Storage;
use anyhow::{Context, Result};
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;
/// Drives indexing of a project tree: walks files, chunks them, embeds the chunks and writes
/// the results to storage.
pub struct Indexer {
/// Backing store for file records (path + content hash) and embedded chunks.
storage: Storage,
/// Provider used to turn chunk text into embeddings.
embedder: Arc<dyn EmbeddingProvider>,
/// Splits file content into chunks.
chunker: Chunker,
/// Enumerates candidate files and exposes the project root.
file_walker: FileWalker,
/// Taken from `config.embedding.batch_size`; `index_all` uses it as the number of files
/// handled per read/chunk batch.
batch_size: usize,
}
impl Indexer {
/// Builds an indexer for the project rooted at `root`.
///
/// The storage is opened at `<root>/.vyctor/index.duckdb`, and the embedding provider, chunker
/// and file walker are all configured from `config`.
///
/// # Errors
///
/// Returns an error if the storage cannot be opened or the embedding provider cannot be
/// created.
pub fn new(root: &Path, config: &VyctorConfig) -> Result<Self> {
    let embedding = &config.embedding;
    let indexing = &config.indexing;

    // Storage is opened first, then the provider; both may fail.
    let storage = Storage::new(
        &root.join(".vyctor").join("index.duckdb"),
        embedding.dimensions,
    )?;
    let embedder = create_provider(embedding, true)?;

    Ok(Self {
        storage,
        embedder,
        chunker: Chunker::with_options(
            indexing.chunk_size,
            indexing.chunk_overlap,
            indexing.max_chunk_size,
            indexing.semantic_chunking,
        ),
        file_walker: FileWalker::new(
            root.to_path_buf(),
            indexing.include.clone(),
            indexing.exclude.clone(),
        ),
        batch_size: embedding.batch_size,
    })
}
pub async fn index_all(&self, force: bool) -> Result<IndexResult> {
let files: Vec<_> = self.file_walker.walk().collect();
let total_files = files.len();
if total_files == 0 {
return Ok(IndexResult {
files_indexed: 0,
files_skipped: 0,
files_deleted: 0,
chunks_created: 0,
});
}
let pb = ProgressBar::new(total_files as u64);
pb.set_style(
ProgressStyle::default_bar()
.template(
"{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} files ({elapsed}) {msg}",
)
.unwrap()
.progress_chars("#>-"),
);
let mut total_embed_time = std::time::Duration::ZERO;
let mut files_indexed = 0;
let mut files_skipped = 0;
let mut chunks_created = 0;
let existing_hashes: std::collections::HashMap<String, String> = if !force {
self.storage.get_all_file_hashes()?.into_iter().collect()
} else {
std::collections::HashMap::new()
};
let root = self.file_walker.root().to_path_buf();
let chunker = self.chunker.clone();
for file_batch in files.chunks(self.batch_size) {
pb.set_message("Reading & chunking files...");
let processed: Vec<_> = file_batch
.par_iter()
.filter_map(|file_path| {
let relative_path = file_path
.strip_prefix(&root)
.unwrap_or(file_path)
.to_string_lossy()
.to_string();
let content = match std::fs::read_to_string(file_path) {
Ok(c) => c,
Err(_) => return None,
};
let hash = content_hash(&content);
if !force {
if let Some(existing_hash) = existing_hashes.get(&relative_path) {
if existing_hash == &hash {
return Some(Err(relative_path)); }
}
}
let chunks = chunker.chunk_with_path(&content, Some(&relative_path));
if chunks.is_empty() {
return None;
}
Some(Ok(ProcessedFile {
relative_path,
hash,
chunks,
}))
})
.collect();
let mut to_embed: Vec<ProcessedFile> = Vec::new();
for result in processed {
match result {
Ok(pf) => to_embed.push(pf),
Err(_) => {
files_skipped += 1;
pb.inc(1);
}
}
}
if to_embed.is_empty() {
continue;
}
let mut files_with_ids: Vec<(String, i64, Vec<Chunk>)> = Vec::new();
for pf in to_embed {
match self.storage.upsert_file(&pf.relative_path, &pf.hash) {
Ok(file_id) => {
files_with_ids.push((pf.relative_path, file_id, pf.chunks));
}
Err(e) => {
eprintln!("Warning: Could not upsert {}: {}", pf.relative_path, e);
pb.inc(1);
}
}
}
if !files_with_ids.is_empty() {
const EMBED_BATCH_SIZE: usize = 32;
for (_, file_id, chunks) in &files_with_ids {
let texts: Vec<_> = chunks.iter().map(|c| c.content.clone()).collect();
let mut all_embeddings = Vec::with_capacity(texts.len());
for (batch_idx, text_batch) in texts.chunks(EMBED_BATCH_SIZE).enumerate() {
let batch_num = batch_idx + 1;
let total_batches = texts.len().div_ceil(EMBED_BATCH_SIZE);
if total_batches > 1 {
pb.set_message(format!(
"Embedding batch {}/{} ({} chunks)...",
batch_num,
total_batches,
text_batch.len()
));
} else {
pb.set_message(format!("Embedding {} chunks...", text_batch.len()));
}
let embed_start = std::time::Instant::now();
let batch_embeddings = self.embedder.embed_batch(text_batch).await?;
total_embed_time += embed_start.elapsed();
all_embeddings.extend(batch_embeddings);
}
for (chunk_idx, (chunk, result)) in
chunks.iter().zip(all_embeddings.iter()).enumerate()
{
self.storage.insert_chunk(
*file_id,
chunk_idx as i32,
&chunk.content,
chunk.start_line as i32,
chunk.end_line as i32,
&result.embedding,
)?;
chunks_created += 1;
}
files_indexed += 1;
pb.inc(1);
}
}
}
pb.finish_with_message(format!("done (embedding: {:?})", total_embed_time));
let indexed_paths: HashSet<_> = self.storage.get_all_file_paths()?.into_iter().collect();
let current_paths: HashSet<_> = files
.iter()
.filter_map(|p| {
p.strip_prefix(self.file_walker.root())
.ok()
.map(|p| p.to_string_lossy().to_string())
})
.collect();
let deleted_paths: Vec<_> = indexed_paths.difference(¤t_paths).cloned().collect();
let files_deleted = self.storage.delete_files(&deleted_paths)?;
Ok(IndexResult {
files_indexed,
files_skipped,
files_deleted,
chunks_created,
})
}
/// Runs a non-forced indexing pass: delegates to `index_all` with `force` set to `false`, so
/// stored content hashes are consulted.
pub async fn sync(&self) -> Result<IndexResult> {
    let force = false;
    self.index_all(force).await
}
/// Clears the storage and then runs a forced indexing pass over all files.
///
/// # Errors
///
/// Returns an error if clearing the storage fails (indexing is then not attempted) or if the
/// indexing pass itself fails.
pub async fn reindex(&self) -> Result<IndexResult> {
    self.storage.clear()?;

    let force = true;
    self.index_all(force).await
}
pub async fn index_file(&self, path: &Path) -> Result<bool> {
let relative_path = path
.strip_prefix(self.file_walker.root())
.unwrap_or(path)
.to_string_lossy()
.to_string();
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read file: {}", path.display()))?;
let hash = content_hash(&content);
if let Ok(Some(existing)) = self.storage.get_file(&relative_path) {
if existing.content_hash == hash {
return Ok(false);
}
}
let chunks = self.chunker.chunk_with_path(&content, Some(&relative_path));
if chunks.is_empty() {
return Ok(false);
}
let file_id = self.storage.upsert_file(&relative_path, &hash)?;
let texts: Vec<_> = chunks.iter().map(|c| c.content.clone()).collect();
let embeddings = self.embedder.embed_batch(&texts).await?;
for (chunk_idx, (chunk, result)) in chunks.iter().zip(embeddings.iter()).enumerate() {
self.storage.insert_chunk(
file_id,
chunk_idx as i32,
&chunk.content,
chunk.start_line as i32,
chunk.end_line as i32,
&result.embedding,
)?;
}
Ok(true)
}
/// Deletes the stored record for `path`.
///
/// The storage key is `path` relative to the walker root, or `path` itself when it is not
/// under the root. The result of `Storage::delete_file` is returned unchanged.
pub fn remove_file(&self, path: &Path) -> Result<bool> {
    let relative = match path.strip_prefix(self.file_walker.root()) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let key = relative.to_string_lossy().to_string();
    self.storage.delete_file(&key)
}
#[allow(dead_code)]
pub fn storage(&self) -> &Storage {
&self.storage
}
#[allow(dead_code)]
pub fn embedder(&self) -> &Arc<dyn EmbeddingProvider> {
&self.embedder
}
}
/// Counters describing the outcome of one indexing pass.
///
/// `Default` yields the all-zero result; `PartialEq`/`Eq` allow results to be compared
/// directly (for example in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct IndexResult {
    /// Files whose chunks were embedded and written to storage.
    pub files_indexed: usize,
    /// Files left untouched because their content hash matched the stored one.
    pub files_skipped: usize,
    /// Stored files removed because they are no longer present.
    pub files_deleted: usize,
    /// Total number of chunks written to storage.
    pub chunks_created: usize,
}
/// A file that has been read, hashed and chunked and is waiting to be embedded and stored.
struct ProcessedFile {
/// Path used as the storage key (relative to the walker root when the file lies under it).
relative_path: String,
/// Result of `content_hash` over the file content.
hash: String,
/// Chunks produced by the chunker for this file; never empty when built by `index_all`.
chunks: Vec<Chunk>,
}