use crate::config::Config;
use crate::embedding::count_tokens;
use crate::indexer::contextual::{
build_enriched_embedding_input, generate_contextual_descriptions, FileContextMap,
};
use crate::mcp::logging::log_performance_metrics;
use crate::store::{CodeBlock, DocumentBlock, Store, TextBlock};
use anyhow::Result;
use std::collections::HashMap;
/// Accumulates per-file modification times that still have to be written to the store.
///
/// Entries are keyed by file path, so queuing the same path twice keeps only the
/// value that was queued last.
#[derive(Debug, Clone, Default)]
pub struct FileMetadataBatch {
    /// Queued `file path -> mtime` entries, exactly as supplied by the caller.
    pending_files: HashMap<String, u64>,
}
impl FileMetadataBatch {
    /// Creates an empty batch.
    pub fn new() -> Self {
        Self::default()
    }

    /// Queues `mtime` for `file_path`, replacing any value already queued for that path.
    pub fn add(&mut self, file_path: &str, mtime: u64) {
        self.pending_files.insert(file_path.to_string(), mtime);
    }

    /// Copies every entry of `other` into this batch.
    ///
    /// When both batches contain the same path, the value from `other` wins.
    pub fn extend(&mut self, other: &FileMetadataBatch) {
        // Clone entry by entry rather than cloning the whole map first: the end
        // result is the same, but no throw-away `HashMap` is allocated.
        self.pending_files.extend(
            other
                .pending_files
                .iter()
                .map(|(path, mtime)| (path.clone(), *mtime)),
        );
    }

    /// Returns `true` when no entries are queued.
    pub fn is_empty(&self) -> bool {
        self.pending_files.is_empty()
    }

    /// Drops all queued entries.
    pub fn clear(&mut self) {
        self.pending_files.clear();
    }

    /// Writes every queued entry to `store`, one `store_file_metadata` call per file.
    ///
    /// The batch is only read, never modified; entries remain queued afterwards
    /// until [`clear`](Self::clear) is called. Entries are visited in the map's
    /// iteration order, which is unspecified.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by `store_file_metadata`; entries that
    /// have not been visited yet are not attempted.
    pub async fn persist(&self, store: &Store) -> Result<()> {
        for (file_path, mtime) in &self.pending_files {
            store.store_file_metadata(file_path, *mtime).await?;
        }
        Ok(())
    }
}
/// Embeds a batch of code blocks and stores them together with the pending file metadata.
///
/// Steps, in order:
/// 1. When `config.index.contextual_descriptions` is set, contextual descriptions are
///    generated for the batch; otherwise an empty description map is used.
/// 2. Each block is turned into its enriched embedding input, using the description
///    stored under the block's index in `blocks` when one exists.
/// 3. Embeddings are generated for all inputs in one call, then the blocks are stored.
/// 4. `file_metadata` is persisted, which therefore only happens after the blocks
///    were stored successfully.
/// 5. Timing for the whole batch is reported under the `code_blocks_batch` label.
///
/// # Errors
///
/// Propagates the first error from description generation, embedding generation,
/// block storage, or metadata persistence. No metrics are logged in that case.
pub async fn process_code_blocks_batch(
    store: &Store,
    blocks: &[CodeBlock],
    config: &Config,
    file_metadata: &FileMetadataBatch,
    file_context: &FileContextMap,
) -> Result<()> {
    let timer = std::time::Instant::now();

    let descriptions = if config.index.contextual_descriptions {
        generate_contextual_descriptions(blocks, config, file_context).await?
    } else {
        HashMap::new()
    };

    // Descriptions are looked up by the block's position within `blocks`.
    let mut inputs: Vec<String> = Vec::with_capacity(blocks.len());
    for (position, block) in blocks.iter().enumerate() {
        let description = descriptions.get(&position).map(|s| s.as_str());
        inputs.push(build_enriched_embedding_input(block, description));
    }

    // NOTE(review): the `true` flag presumably selects the code embedding path
    // (the text/document variants pass `false`) — confirm against
    // `generate_embeddings_batch`.
    let vectors = crate::embedding::generate_embeddings_batch(
        inputs,
        true,
        config,
        crate::embedding::types::InputType::Document,
    )
    .await?;

    store.store_code_blocks(blocks, &vectors).await?;
    file_metadata.persist(store).await?;

    let elapsed_ms = timer.elapsed().as_millis() as u64;
    log_performance_metrics("code_blocks_batch", elapsed_ms, blocks.len(), None);
    Ok(())
}
/// Embeds a batch of text blocks and stores them together with the pending file metadata.
///
/// Every block is embedded as `# File: <path>`, a blank line, then the block content.
/// After the embeddings are generated the blocks are stored, `file_metadata` is
/// persisted, and the elapsed time is reported under the `text_blocks_batch` label.
///
/// # Errors
///
/// Propagates the first error from embedding generation, block storage, or metadata
/// persistence. No metrics are logged in that case.
pub async fn process_text_blocks_batch(
    store: &Store,
    blocks: &[TextBlock],
    config: &Config,
    file_metadata: &FileMetadataBatch,
) -> Result<()> {
    let timer = std::time::Instant::now();

    let mut inputs: Vec<String> = Vec::with_capacity(blocks.len());
    for block in blocks {
        inputs.push(format!("# File: {}\n\n{}", block.path, block.content));
    }

    let vectors = crate::embedding::generate_embeddings_batch(
        inputs,
        false,
        config,
        crate::embedding::types::InputType::Document,
    )
    .await?;

    // Metadata is written only once the blocks themselves were stored.
    store.store_text_blocks(blocks, &vectors).await?;
    file_metadata.persist(store).await?;

    let elapsed_ms = timer.elapsed().as_millis() as u64;
    log_performance_metrics("text_blocks_batch", elapsed_ms, blocks.len(), None);
    Ok(())
}
/// Embeds a batch of document blocks and stores them together with the pending file metadata.
///
/// The embedding input of each block is laid out as:
///
/// ```text
/// # File: <path>
/// <context entries, one per line — only when the block has context>
/// <blank line>
/// <content>
/// ```
///
/// After the embeddings are generated the blocks are stored, `file_metadata` is
/// persisted, and the elapsed time is reported under the `document_blocks_batch` label.
///
/// # Errors
///
/// Propagates the first error from embedding generation, block storage, or metadata
/// persistence. No metrics are logged in that case.
pub async fn process_document_blocks_batch(
    store: &Store,
    blocks: &[DocumentBlock],
    config: &Config,
    file_metadata: &FileMetadataBatch,
) -> Result<()> {
    let timer = std::time::Instant::now();

    let mut inputs: Vec<String> = Vec::with_capacity(blocks.len());
    for block in blocks {
        let mut text = format!("# File: {}", block.path);
        if !block.context.is_empty() {
            text.push('\n');
            text.push_str(&block.context.join("\n"));
        }
        // One newline ends the previous line, the second yields the blank separator.
        text.push_str("\n\n");
        text.push_str(&block.content);
        inputs.push(text);
    }

    let vectors = crate::embedding::generate_embeddings_batch(
        inputs,
        false,
        config,
        crate::embedding::types::InputType::Document,
    )
    .await?;

    // Metadata is written only once the blocks themselves were stored.
    store.store_document_blocks(blocks, &vectors).await?;
    file_metadata.persist(store).await?;

    let elapsed_ms = timer.elapsed().as_millis() as u64;
    log_performance_metrics("document_blocks_batch", elapsed_ms, blocks.len(), None);
    Ok(())
}
/// Decides whether an accumulated batch has reached one of the configured limits.
///
/// Returns `false` for an empty batch. Otherwise returns `true` when either
/// - the number of items has reached `config.index.embeddings_batch_size`, or
/// - the summed `count_tokens` of every item's content (obtained via `get_content`)
///   has reached `config.index.embeddings_max_tokens_per_batch`.
///
/// Tokens are only counted when the item-count limit has not been reached.
pub fn should_process_batch<T>(
    batch: &[T],
    get_content: impl Fn(&T) -> &str,
    config: &Config,
) -> bool {
    let limits = &config.index;
    match batch.len() {
        0 => false,
        item_count if item_count >= limits.embeddings_batch_size => true,
        _ => {
            let mut token_total: usize = 0;
            for item in batch {
                token_total += count_tokens(get_content(item));
            }
            token_total >= limits.embeddings_max_tokens_per_batch
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::indexer::contextual::build_enriched_embedding_input;

    /// Without a description, the enriched input must carry the file, language and
    /// symbol headers as well as the block's own content.
    #[test]
    fn test_code_block_enriched_formatting() {
        let sample = CodeBlock {
            path: String::from("src/main.rs"),
            language: String::from("rust"),
            content: String::from("fn main() {\n println!(\"Hello, world!\");\n}"),
            symbols: vec![String::from("main")],
            start_line: 1,
            end_line: 3,
            hash: String::from("test_hash"),
            distance: None,
        };

        let rendered = build_enriched_embedding_input(&sample, None);

        let expected_fragments = [
            "# File: src/main.rs",
            "# Language: rust",
            "# Defines: main",
            "fn main()",
            "Hello, world!",
        ];
        for fragment in expected_fragments.iter() {
            assert!(rendered.contains(*fragment));
        }
    }

    /// With a description, the enriched input must start with that description and
    /// still contain the file header and the block's content.
    #[test]
    fn test_code_block_enriched_with_description() {
        let sample = CodeBlock {
            path: String::from("src/utils.rs"),
            language: String::from("rust"),
            content: String::from("const VERSION: &str = \"1.0.0\";"),
            symbols: Vec::new(),
            start_line: 1,
            end_line: 1,
            hash: String::from("test_hash2"),
            distance: None,
        };

        let rendered =
            build_enriched_embedding_input(&sample, Some("Application version constant"));

        assert!(rendered.starts_with("Application version constant"));
        for fragment in ["# File: src/utils.rs", "const VERSION"].iter() {
            assert!(rendered.contains(*fragment));
        }
    }
}