use crate::config::Config;
use crate::embedding::{calculate_content_hash_with_lines, calculate_unique_content_hash};
use crate::indexer::code_region_extractor::extract_meaningful_regions;
use crate::indexer::file_processor::chunk_text;
use crate::indexer::languages;
use crate::indexer::markdown_processor::parse_markdown_content;
use crate::state::SharedState;
use crate::store::{CodeBlock, DocumentBlock, Store, TextBlock};
use anyhow::Result;
use std::collections::HashSet;
use tree_sitter::Parser;
/// Bundles the shared dependencies needed while processing a single file,
/// so the per-file entry points take one context instead of three arguments.
pub struct ProcessFileContext<'a> {
/// Persistent block store; queried for existing hashes and asked to remove stale blocks.
pub store: &'a Store,
/// Indexer configuration (read here for the `graphrag.enabled` toggle).
pub config: &'a Config,
/// Shared mutable indexing state (`force_reindex` flag, GraphRAG block counter).
pub state: SharedState,
}
/// Differentially index one source-code file.
///
/// Parses `contents` with tree-sitter for `language`, extracts meaningful
/// code regions, and appends only regions whose content hash is not already
/// stored to `code_blocks_batch`. Hashes that were stored for this file but
/// are absent from the new parse are removed from the store, so the index
/// converges to the file's current contents. When GraphRAG is enabled, both
/// new and unchanged blocks are collected into `all_code_blocks` and the
/// shared GraphRAG counter is bumped.
///
/// Returns `Ok(())` immediately (a silent skip, not an error) when the
/// language is unsupported. Errors propagate from parser setup and store I/O.
pub async fn process_file_differential(
    ctx: &ProcessFileContext<'_>,
    contents: &str,
    file_path: &str,
    language: &str,
    code_blocks_batch: &mut Vec<CodeBlock>,
    _text_blocks_batch: &mut [TextBlock],
    all_code_blocks: &mut Vec<CodeBlock>,
) -> Result<()> {
    let mut parser = Parser::new();
    let force_reindex = ctx.state.read().force_reindex;
    // Unsupported languages are skipped rather than treated as failures.
    let lang_impl = match languages::get_language(language) {
        Some(impl_) => impl_,
        None => return Ok(()),
    };
    parser.set_language(&lang_impl.get_ts_language())?;
    // If parsing the real contents fails, fall back to an empty tree so the
    // extraction below simply yields no regions.
    let tree = parser
        .parse(contents, None)
        .unwrap_or_else(|| {
            parser
                .parse("", None)
                .expect("parsing empty input cannot fail")
        });
    let mut code_regions = Vec::new();
    extract_meaningful_regions(
        tree.root_node(),
        contents,
        lang_impl.as_ref(),
        &mut code_regions,
    );
    // On a forced reindex there is no point fetching old hashes: everything
    // is re-emitted and nothing is diff-removed.
    let existing_hashes = if force_reindex {
        Vec::new()
    } else {
        ctx.store
            .get_file_blocks_metadata(file_path, "code_blocks")
            .await?
    };
    let mut new_hashes = HashSet::new();
    let mut graphrag_blocks_added = 0;
    for region in code_regions {
        // BUG FIX: was `®ion.content` (mojibake of `&region.content`).
        let content_hash = calculate_content_hash_with_lines(
            &region.content,
            file_path,
            region.start_line,
            region.end_line,
        );
        new_hashes.insert(content_hash.clone());
        let exists = !force_reindex
            && ctx
                .store
                .content_exists(&content_hash, "code_blocks")
                .await?;
        if !exists {
            let code_block = CodeBlock {
                path: file_path.to_string(),
                // Move instead of clone: `content_hash` is not used again in
                // this branch.
                hash: content_hash,
                language: lang_impl.name().to_string(),
                content: region.content.clone(),
                symbols: region.symbols.clone(),
                start_line: region.start_line,
                end_line: region.end_line,
                distance: None,
            };
            code_blocks_batch.push(code_block.clone());
            if ctx.config.graphrag.enabled {
                all_code_blocks.push(code_block);
                graphrag_blocks_added += 1;
            }
        } else if ctx.config.graphrag.enabled {
            // Unchanged blocks still feed GraphRAG so its view stays complete;
            // lookup failures are deliberately ignored (best-effort).
            if let Ok(existing_block) = ctx.store.get_code_block_by_hash(&content_hash).await {
                all_code_blocks.push(existing_block);
                graphrag_blocks_added += 1;
            }
        }
    }
    // Remove hashes that were stored before but vanished from the new parse.
    if !force_reindex && !existing_hashes.is_empty() {
        let hashes_to_remove: Vec<String> = existing_hashes
            .into_iter()
            .filter(|hash| !new_hashes.contains(hash))
            .collect();
        if !hashes_to_remove.is_empty() {
            ctx.store
                .remove_blocks_by_hashes(&hashes_to_remove, "code_blocks")
                .await?;
        }
    }
    if ctx.config.graphrag.enabled && graphrag_blocks_added > 0 {
        // Tight scope: the write guard is dropped before returning and is
        // never held across an await.
        let mut state_guard = ctx.state.write();
        state_guard.graphrag_blocks += graphrag_blocks_added;
    }
    Ok(())
}
/// Differentially index a plain-text file as overlapping fixed-size chunks.
///
/// Chunks whose hash is not yet present in the store are appended to
/// `text_blocks_batch`; hashes that were stored for this file but no longer
/// occur in the current chunking are removed (skipped entirely when a
/// forced reindex is active).
pub async fn process_text_file_differential(
    store: &Store,
    contents: &str,
    file_path: &str,
    text_blocks_batch: &mut Vec<TextBlock>,
    config: &Config,
    state: SharedState,
) -> Result<()> {
    let force_reindex = state.read().force_reindex;
    // No stale-hash bookkeeping is needed when everything is reindexed anyway.
    let known_hashes = if force_reindex {
        Vec::new()
    } else {
        store
            .get_file_blocks_metadata(file_path, "text_blocks")
            .await?
    };
    let chunks = chunk_text(
        contents,
        config.index.chunk_size,
        config.index.chunk_overlap,
    );
    let mut current_hashes = HashSet::new();
    for (idx, chunk) in chunks.iter().enumerate() {
        // The chunk index is folded into the hash input so identical text at
        // different positions in the same file still hashes uniquely.
        let hash =
            calculate_unique_content_hash(&chunk.content, &format!("{}#{}", file_path, idx));
        current_hashes.insert(hash.clone());
        let already_stored =
            !force_reindex && store.content_exists(&hash, "text_blocks").await?;
        if already_stored {
            continue;
        }
        text_blocks_batch.push(TextBlock {
            path: file_path.to_string(),
            language: "text".to_string(),
            content: chunk.content.clone(),
            start_line: chunk.start_line,
            end_line: chunk.end_line,
            hash,
            distance: None,
        });
    }
    // Drop previously stored hashes that no longer appear in the file.
    if !force_reindex && !known_hashes.is_empty() {
        let stale: Vec<String> = known_hashes
            .into_iter()
            .filter(|h| !current_hashes.contains(h))
            .collect();
        if !stale.is_empty() {
            store
                .remove_blocks_by_hashes(&stale, "text_blocks")
                .await?;
        }
    }
    Ok(())
}
/// Differentially index a markdown file as parsed document blocks.
///
/// Blocks produced by `parse_markdown_content` whose hash is not already in
/// the store are appended to `document_blocks_batch`; stored hashes missing
/// from the fresh parse are removed (skipped under a forced reindex).
pub async fn process_markdown_file_differential(
    store: &Store,
    contents: &str,
    file_path: &str,
    document_blocks_batch: &mut Vec<DocumentBlock>,
    config: &Config,
    state: SharedState,
) -> Result<()> {
    let force_reindex = state.read().force_reindex;
    // A forced reindex re-emits everything, so old hashes are irrelevant.
    let known_hashes = if force_reindex {
        Vec::new()
    } else {
        store
            .get_file_blocks_metadata(file_path, "document_blocks")
            .await?
    };
    // Unlike code/text processing, markdown blocks arrive pre-hashed.
    let parsed_blocks = parse_markdown_content(contents, file_path, config);
    let mut current_hashes = HashSet::new();
    for block in parsed_blocks {
        current_hashes.insert(block.hash.clone());
        let already_stored = !force_reindex
            && store
                .content_exists(&block.hash, "document_blocks")
                .await?;
        if already_stored {
            continue;
        }
        document_blocks_batch.push(block);
    }
    // Remove hashes stored earlier that the fresh parse no longer produces.
    if !force_reindex && !known_hashes.is_empty() {
        let stale: Vec<String> = known_hashes
            .into_iter()
            .filter(|h| !current_hashes.contains(h))
            .collect();
        if !stale.is_empty() {
            store
                .remove_blocks_by_hashes(&stale, "document_blocks")
                .await?;
        }
    }
    Ok(())
}
/// Index one source-code file without stale-block cleanup.
///
/// Like [`process_file_differential`] this parses `contents` with
/// tree-sitter, extracts meaningful regions, and appends unseen blocks to
/// `code_blocks_batch` — but it does not track or remove hashes that
/// disappeared from the file. When GraphRAG is enabled, both new and
/// already-stored blocks are gathered into `all_code_blocks` and the shared
/// GraphRAG counter is incremented.
///
/// Returns `Ok(())` immediately (a silent skip) for unsupported languages.
pub async fn process_file(
    ctx: &ProcessFileContext<'_>,
    contents: &str,
    file_path: &str,
    language: &str,
    code_blocks_batch: &mut Vec<CodeBlock>,
    _text_blocks_batch: &mut [TextBlock],
    all_code_blocks: &mut Vec<CodeBlock>,
) -> Result<()> {
    let mut parser = Parser::new();
    let force_reindex = ctx.state.read().force_reindex;
    // Unsupported languages are skipped rather than treated as failures.
    let lang_impl = match languages::get_language(language) {
        Some(impl_) => impl_,
        None => return Ok(()),
    };
    parser.set_language(&lang_impl.get_ts_language())?;
    // If parsing the real contents fails, fall back to an empty tree so the
    // extraction below simply yields no regions.
    let tree = parser
        .parse(contents, None)
        .unwrap_or_else(|| {
            parser
                .parse("", None)
                .expect("parsing empty input cannot fail")
        });
    let mut code_regions = Vec::new();
    extract_meaningful_regions(
        tree.root_node(),
        contents,
        lang_impl.as_ref(),
        &mut code_regions,
    );
    let mut graphrag_blocks_added = 0;
    for region in code_regions {
        // BUG FIX: was `®ion.content` (mojibake of `&region.content`).
        let content_hash = calculate_content_hash_with_lines(
            &region.content,
            file_path,
            region.start_line,
            region.end_line,
        );
        let exists = !force_reindex
            && ctx
                .store
                .content_exists(&content_hash, "code_blocks")
                .await?;
        if !exists {
            let code_block = CodeBlock {
                path: file_path.to_string(),
                hash: content_hash,
                language: lang_impl.name().to_string(),
                content: region.content.clone(),
                symbols: region.symbols.clone(),
                start_line: region.start_line,
                end_line: region.end_line,
                distance: None,
            };
            code_blocks_batch.push(code_block.clone());
            if ctx.config.graphrag.enabled {
                all_code_blocks.push(code_block);
                graphrag_blocks_added += 1;
            }
        } else if ctx.config.graphrag.enabled {
            // Unchanged blocks still feed GraphRAG; lookup failures are
            // deliberately ignored (best-effort).
            if let Ok(existing_block) = ctx.store.get_code_block_by_hash(&content_hash).await {
                all_code_blocks.push(existing_block);
                graphrag_blocks_added += 1;
            }
        }
    }
    if ctx.config.graphrag.enabled && graphrag_blocks_added > 0 {
        // Tight scope: the write guard is dropped before returning and is
        // never held across an await.
        let mut state_guard = ctx.state.write();
        state_guard.graphrag_blocks += graphrag_blocks_added;
    }
    Ok(())
}