use std::path::{Path, PathBuf};
use crate::chunking::source::chunk_source;
use crate::indexing::dense::{embed_chunks, Model, SelectableBasicBackend};
use crate::indexing::file_walker::walk_files;
use crate::indexing::files::{detect_language, get_extensions};
use crate::indexing::sparse::{enrich_for_bm25, Bm25Index};
use crate::tokens::tokenize;
use crate::types::{Chunk, ContentType};
pub const MAX_FILE_BYTES: u64 = 1_000_000;
pub struct CreateIndexOptions<'a> {
pub model: &'a Model,
pub extensions: Option<Vec<String>>,
pub content: Option<Vec<ContentType>>,
pub display_root: Option<PathBuf>,
}
#[derive(Debug)]
pub struct CreateIndexResult {
pub bm25_index: Bm25Index,
pub semantic_index: SelectableBasicBackend,
pub chunks: Vec<Chunk>,
}
pub fn create_index_from_path(
path: &Path,
options: &CreateIndexOptions,
) -> Result<CreateIndexResult, String> {
let content = options
.content
.clone()
.unwrap_or_else(|| vec![ContentType::Code]);
let resolved = get_extensions(&content, options.extensions.as_deref());
let ext_refs: Vec<&str> = resolved.iter().map(String::as_str).collect();
let mut chunks: Vec<Chunk> = Vec::new();
for file_path in walk_files(path, &ext_refs, &[]) {
let language = detect_language(&file_path.to_string_lossy());
let size = match std::fs::metadata(&file_path) {
Ok(meta) => meta.len(),
Err(_) => continue,
};
if size > MAX_FILE_BYTES {
continue;
}
let source = match std::fs::read(&file_path) {
Ok(bytes) => String::from_utf8_lossy(&bytes).into_owned(),
Err(_) => continue,
};
let chunk_path = match &options.display_root {
Some(root) => file_path
.strip_prefix(root)
.unwrap_or(&file_path)
.to_string_lossy()
.into_owned(),
None => file_path.to_string_lossy().into_owned(),
};
chunks.extend(chunk_source(&source, &chunk_path, language));
}
if chunks.is_empty() {
return Err(format!(
"No supported files found under {}.",
path.display()
));
}
let embeddings = embed_chunks(options.model, &chunks);
let documents: Vec<Vec<String>> = chunks
.iter()
.map(|c| tokenize(&enrich_for_bm25(c)))
.collect();
let bm25_index = Bm25Index::build(&documents);
let semantic_index = SelectableBasicBackend::from_vectors(embeddings)?;
Ok(CreateIndexResult {
bm25_index,
semantic_index,
chunks,
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::indexing::dense::make_stub_model;
use tempfile::tempdir;
fn opts(model: &Model, display_root: Option<PathBuf>) -> CreateIndexOptions<'_> {
CreateIndexOptions {
model,
extensions: None,
content: None,
display_root,
}
}
#[test]
fn builds_indexes_for_small_ts_file() {
let dir = tempdir().unwrap();
std::fs::write(
dir.path().join("sample.ts"),
"export function greet(name: string) {\n return `hi ${name}`\n}\n",
)
.unwrap();
let model = make_stub_model(4);
let result =
create_index_from_path(dir.path(), &opts(&model, Some(dir.path().to_path_buf())))
.unwrap();
assert!(!result.chunks.is_empty());
assert_eq!(result.chunks[0].file_path, "sample.ts");
assert_eq!(result.semantic_index.vectors.len(), result.chunks.len());
assert_eq!(result.bm25_index.num_docs(), result.chunks.len());
}
#[test]
fn errors_when_no_supported_files() {
let dir = tempdir().unwrap();
std::fs::write(dir.path().join("data.bin"), "binary").unwrap();
let model = make_stub_model(4);
let err = create_index_from_path(dir.path(), &opts(&model, None)).unwrap_err();
assert!(err.contains("No supported files found"));
}
#[test]
fn respects_extensions_override() {
let dir = tempdir().unwrap();
std::fs::write(dir.path().join("a.txt"), "hello world").unwrap();
let model = make_stub_model(4);
let options = CreateIndexOptions {
model: &model,
extensions: Some(vec![".txt".to_string()]),
content: Some(vec![ContentType::Docs]),
display_root: Some(dir.path().to_path_buf()),
};
let result = create_index_from_path(dir.path(), &options).unwrap();
assert_eq!(result.chunks.len(), 1);
assert_eq!(result.chunks[0].file_path, "a.txt");
}
#[test]
fn skips_files_over_max_bytes() {
let dir = tempdir().unwrap();
std::fs::write(dir.path().join("big.ts"), "a".repeat(2_000_000)).unwrap();
std::fs::write(dir.path().join("small.ts"), "export const x = 1\n").unwrap();
let model = make_stub_model(4);
let result =
create_index_from_path(dir.path(), &opts(&model, Some(dir.path().to_path_buf())))
.unwrap();
let paths: Vec<&str> = result.chunks.iter().map(|c| c.file_path.as_str()).collect();
assert!(paths.contains(&"small.ts"));
assert!(!paths.contains(&"big.ts"));
}
#[test]
fn descends_into_subdirectories() {
let dir = tempdir().unwrap();
std::fs::create_dir(dir.path().join("sub")).unwrap();
std::fs::write(dir.path().join("sub/nested.ts"), "const a = 1\n").unwrap();
let model = make_stub_model(4);
let result =
create_index_from_path(dir.path(), &opts(&model, Some(dir.path().to_path_buf())))
.unwrap();
assert!(result
.chunks
.iter()
.any(|c| c.file_path.ends_with("nested.ts")));
}
}