use std::collections::HashSet;
use std::path::Path;
use anyhow::{bail, Context, Result};
use crate::bm25::Bm25Index;
use crate::chunking::chunk_source;
use crate::encoder::{SemanticIndex, StaticEncoder};
use crate::graph::DependencyGraph;
use crate::model::Chunk;
use crate::source_files::{filter_extensions, language_for_path, walk_source_files};
use crate::tokens::tokenize;
/// Largest file size, in bytes, that is read while collecting chunks.
///
/// `collect_chunks_and_graph` skips any file whose metadata length is greater
/// than this value.
pub(crate) const MAX_FILE_BYTES: u64 = 1_000_000;
/// Builds the text that is tokenized for BM25 from a chunk.
///
/// The result is the chunk content, followed by the file stem of the chunk's
/// path, followed by up to the last three directory names of that path
/// (space separated). For `markdown` and `text` chunks the stem is appended
/// twice.
///
/// Only ordinary directory names contribute: `.`, `..`, the filesystem root
/// and Windows drive prefixes are skipped, as are names that are not valid
/// UTF-8. When the path has no usable directory names the directory part is
/// empty, so the returned string ends with a single trailing space.
pub(crate) fn enrich_for_bm25(chunk: &Chunk) -> String {
    let path = Path::new(&chunk.file_path);
    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");

    // Match on the component kind rather than on its string form: comparing
    // against "." and "/" lets `..` and (on Windows) `\` or `C:` leak through.
    let dir_parts: Vec<&str> = path
        .parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        .filter_map(|component| match component {
            std::path::Component::Normal(name) => name.to_str(),
            _ => None,
        })
        .collect();

    // Keep only the (at most) three directories closest to the file.
    let tail_start = dir_parts.len().saturating_sub(3);
    let dir_text = dir_parts[tail_start..].join(" ");

    if matches!(chunk.language.as_deref(), Some("markdown") | Some("text")) {
        format!("{} {stem} {stem} {dir_text}", chunk.content)
    } else {
        format!("{} {stem} {dir_text}", chunk.content)
    }
}
/// Builds the BM25 index, the semantic index, the chunk list and the
/// dependency graph for the files under `path`.
///
/// Chunk collection, graph construction and BM25 indexing are delegated to
/// [`build_bm25_index_from_path`]. The raw `content` of every chunk is then
/// encoded in one batch with `encoder`, and the resulting embeddings are
/// wrapped in a [`SemanticIndex`].
///
/// # Errors
///
/// Propagates any error from `build_bm25_index_from_path`, and fails with the
/// context "Failed to encode chunks" when batch encoding fails.
pub fn build_index_from_path(
    path: &Path,
    encoder: &StaticEncoder,
    extensions: Option<&HashSet<String>>,
    ignore: Option<&HashSet<String>>,
    include_text_files: bool,
    display_root: &Path,
) -> Result<(Bm25Index, SemanticIndex, Vec<Chunk>, DependencyGraph)> {
    let (bm25_index, chunks, graph) =
        build_bm25_index_from_path(path, extensions, ignore, include_text_files, display_root)?;

    // Borrow every chunk's text so the encoder sees them in chunk order.
    let mut texts: Vec<&str> = Vec::with_capacity(chunks.len());
    for chunk in &chunks {
        texts.push(chunk.content.as_str());
    }

    let embeddings = encoder
        .encode_batch(&texts)
        .context("Failed to encode chunks")?;

    Ok((bm25_index, SemanticIndex::new(embeddings), chunks, graph))
}
/// Collects chunks and the dependency graph for the files under `path`, then
/// builds a BM25 index over the chunks.
///
/// The graph's dependencies are resolved before the index is built.
///
/// # Errors
///
/// Propagates any error from `collect_chunks_and_graph`, and fails with
/// "No supported files found under <path>" when no chunks were produced.
pub fn build_bm25_index_from_path(
    path: &Path,
    extensions: Option<&HashSet<String>>,
    ignore: Option<&HashSet<String>>,
    include_text_files: bool,
    display_root: &Path,
) -> Result<(Bm25Index, Vec<Chunk>, DependencyGraph)> {
    match collect_chunks_and_graph(path, extensions, ignore, include_text_files, display_root)? {
        // Nothing was chunked: report it instead of returning an empty index.
        (chunks, _) if chunks.is_empty() => {
            bail!("No supported files found under {}", path.display())
        }
        (chunks, mut graph) => {
            graph.resolve_dependencies();
            let bm25_index = build_bm25_index_from_chunks(&chunks);
            Ok((bm25_index, chunks, graph))
        }
    }
}
/// Builds a BM25 index with one document per chunk, in chunk order.
///
/// Each document is the token list obtained by running `tokenize` over the
/// chunk's enriched text (see `enrich_for_bm25`).
pub(crate) fn build_bm25_index_from_chunks(chunks: &[Chunk]) -> Bm25Index {
    let mut documents: Vec<Vec<String>> = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        let enriched = enrich_for_bm25(chunk);
        documents.push(tokenize(&enriched));
    }
    Bm25Index::new(&documents)
}
/// Builds the dependency graph for the files under `path` and resolves its
/// dependencies.
///
/// The chunks produced while walking the files are discarded; only the graph
/// is kept.
///
/// # Errors
///
/// Propagates any error from `collect_chunks_and_graph`, and fails with
/// "No supported files found under <path>" when the graph contains no files.
pub fn create_graph_from_path(
    path: &Path,
    extensions: Option<&HashSet<String>>,
    ignore: Option<&HashSet<String>>,
    include_text_files: bool,
    display_root: &Path,
) -> Result<DependencyGraph> {
    // Only the graph half of the pair is needed here.
    let mut graph =
        collect_chunks_and_graph(path, extensions, ignore, include_text_files, display_root)?.1;

    match graph.file_count() {
        0 => bail!("No supported files found under {}", path.display()),
        _ => {
            graph.resolve_dependencies();
            Ok(graph)
        }
    }
}
/// Walks the source files under `path`, chunking each one and registering it
/// in a fresh dependency graph.
///
/// Files are skipped, without raising an error, when their metadata cannot be
/// read, when they are larger than `MAX_FILE_BYTES`, or when they cannot be
/// read as a `String`. Chunk and graph paths are made relative to
/// `display_root` when the file lies under it (otherwise the path is used
/// as-is) and use `/` in place of `\`. A file is added to the graph only when
/// `language_for_path` reports a language for it.
///
/// The body has no failing path of its own; it always returns `Ok`.
fn collect_chunks_and_graph(
    path: &Path,
    extensions: Option<&HashSet<String>>,
    ignore: Option<&HashSet<String>>,
    include_text_files: bool,
    display_root: &Path,
) -> Result<(Vec<Chunk>, DependencyGraph)> {
    let allowed_exts = filter_extensions(extensions, include_text_files);
    let files = walk_source_files(path, &allowed_exts, ignore);

    let mut chunks: Vec<Chunk> = Vec::new();
    let mut graph = DependencyGraph::new();

    for file in &files {
        // Unreadable metadata and oversized files are both skipped.
        let small_enough = matches!(file.metadata(), Ok(meta) if meta.len() <= MAX_FILE_BYTES);
        if !small_enough {
            continue;
        }

        // Files that cannot be read as a `String` are skipped as well.
        if let Ok(source) = std::fs::read_to_string(file) {
            let language = language_for_path(file);

            let shown = file.strip_prefix(display_root).unwrap_or(file);
            let chunk_path = shown.to_string_lossy().replace('\\', "/");

            chunks.extend(chunk_source(&source, &chunk_path, language));
            if let Some(lang) = language {
                graph.add_file(&chunk_path, &source, lang);
            }
        }
    }

    Ok((chunks, graph))
}