#![cfg(feature = "native")]
use crate::ansi_colors::Colorize;
use crate::oracle;
/// Abstraction over the chunk-indexing backend so the file-walking code in
/// this module can target any index implementation.
pub(crate) trait ChunkIndexer {
    /// Index a single chunk of text under the given chunk id.
    fn index_chunk(&mut self, chunk_id: &str, content: &str);
}
impl ChunkIndexer for oracle::rag::HybridRetriever {
    fn index_chunk(&mut self, chunk_id: &str, content: &str) {
        // Delegates straight to the retriever's document index; the chunk id
        // is treated as a document id at this level.
        self.index_document(chunk_id, content);
    }
}
/// Directories excluded from the generic (mixed-language) scan: hidden
/// directories plus Rust `target` and Python `__pycache__` build output.
fn should_skip_directory(name: &str) -> bool {
    matches!(name, "target" | "__pycache__") || name.starts_with('.')
}
/// Directories excluded from the Python scan: hidden dirs and `__pycache__`.
fn should_skip_python_dir(name: &str) -> bool {
    name == "__pycache__" || name.starts_with('.')
}
/// Directories excluded from the Rust scan: hidden dirs and `target`.
fn should_skip_rust_dir(name: &str) -> bool {
    name == "target" || name.starts_with('.')
}
/// Directories excluded from markdown/book scans: anything hidden
/// (name beginning with a dot).
fn should_skip_hidden_dir(name: &str) -> bool {
    name.as_bytes().first() == Some(&b'.')
}
/// True when `path` has a UTF-8 final component that `skip_fn` does not
/// reject. Paths without a file name (e.g. `/`) or with non-UTF-8 names are
/// never recursed into.
fn should_recurse_dir(path: &std::path::Path, skip_fn: fn(&str) -> bool) -> bool {
    match path.file_name().and_then(std::ffi::OsStr::to_str) {
        Some(name) => !skip_fn(name),
        None => false,
    }
}
/// Split `content` into semantic chunks and feed each one to `indexer`.
///
/// Chunk ids take the form `"{doc_id}#{start_line}"`. Without the `rag`
/// feature the raw chunk text is also cached in `chunk_contents`; with `rag`
/// enabled the map is intentionally untouched (the `let _` only silences the
/// unused-parameter lint for that configuration). `*total_chunks` is bumped
/// once per chunk, and the number of chunks produced is returned.
#[allow(clippy::too_many_arguments)]
fn index_file_chunks(
    content: &str,
    doc_id: &str,
    chunker: &oracle::rag::SemanticChunker,
    indexer: &mut dyn ChunkIndexer,
    total_chunks: &mut usize,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) -> usize {
    let chunks = chunker.split(content);
    let chunk_count = chunks.len();
    for chunk in &chunks {
        // The id embeds the starting line so results can point back into the file.
        let chunk_id = format!("{}#{}", doc_id, chunk.start_line);
        // Keep raw chunk text around only when the full RAG store is unavailable.
        #[cfg(not(feature = "rag"))]
        chunk_contents.insert(chunk_id.clone(), chunk.content.clone());
        #[cfg(feature = "rag")]
        let _ = &chunk_contents;
        indexer.index_chunk(&chunk_id, &chunk.content);
        *total_chunks += 1;
    }
    chunk_count
}
/// Print a per-file progress line (checkmark, cyan path, inline bar, count).
/// Files that produced only a single chunk are kept quiet to reduce noise.
fn print_file_indexed(relative_path: &std::path::Path, chunks_len: usize) {
    use oracle::rag::tui::inline;
    if chunks_len > 1 {
        // Bar scales the chunk count against a nominal max of 20, 15 cells wide.
        let bar = inline::bar(chunks_len as f64, 20.0, 15);
        println!(
            " {} {:40} {} ({} chunks)",
            "✓".bright_green(),
            relative_path.display().to_string().cyan(),
            bar,
            chunks_len
        );
    }
}
/// Print a progress line for an indexed markdown document. Unlike
/// [`print_file_indexed`], single-chunk documents are also reported.
fn print_markdown_indexed(display_path: &str, chunks_len: usize) {
    use oracle::rag::tui::inline;
    // Same scaling as source files: nominal max 20 chunks, 15-cell bar.
    let bar = inline::bar(chunks_len as f64, 20.0, 15);
    println!(" {} {:40} {} ({} chunks)", "✓".bright_green(), display_path.cyan(), bar, chunks_len);
}
/// A file is "trivial" — not worth chunking or indexing — when it is blank
/// or shorter than five lines.
fn is_trivial_content(content: &str) -> bool {
    if content.trim().is_empty() {
        return true;
    }
    content.lines().count() < 5
}
/// Read, fingerprint, chunk, and index a single documentation file.
///
/// `doc_id` keys the fingerprint map and chunk ids; `display_name` is used
/// only for the progress line. Unreadable files are silently skipped.
/// Bumps `*indexed_count` once and `*total_chunks` once per chunk.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_doc_file(
    file_path: &std::path::Path,
    doc_id: &str,
    display_name: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    let Ok(content) = std::fs::read_to_string(file_path) else {
        return;
    };
    // Record the fingerprint so later runs can tell whether this exact
    // content/chunker/model combination has already been indexed.
    fingerprints.insert(
        doc_id.to_string(),
        oracle::rag::DocumentFingerprint::new(content.as_bytes(), chunker_config, model_hash),
    );
    // NOTE(review): the third argument looks like a queue priority of 0 —
    // confirm its meaning against HeijunkaReindexer::enqueue.
    reindexer.enqueue(doc_id, file_path.to_path_buf(), 0);
    let chunk_count =
        index_file_chunks(&content, doc_id, chunker, indexer, total_chunks, chunk_contents);
    *indexed_count += 1;
    print_markdown_indexed(display_name, chunk_count);
}
pub(crate) fn doc_fingerprint_changed(
file_path: &std::path::Path,
doc_id: &str,
chunker_config: &oracle::rag::ChunkerConfig,
model_hash: [u8; 32],
existing_fingerprints: &std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
) -> bool {
if let Some(stored_fp) = existing_fingerprints.get(doc_id) {
if let Ok(meta) = std::fs::metadata(file_path) {
if let Ok(mtime) = meta.modified() {
let mtime_ms = mtime
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_millis() as u64)
.unwrap_or(0);
if mtime_ms < stored_fp.indexed_at {
return false;
}
}
}
}
let Ok(content) = std::fs::read_to_string(file_path) else {
return false;
};
let current_fp =
oracle::rag::DocumentFingerprint::new(content.as_bytes(), chunker_config, model_hash);
match existing_fingerprints.get(doc_id) {
Some(stored_fp) if !stored_fp.needs_reindex(¤t_fp) => false,
Some(_) => true,
None => true, }
}
/// Index one component rooted at `path`: its top-level `*.md` docs, its
/// source tree (`extension` selects `"rs"` or `"py"`), and optionally
/// `docs/specifications` and the mdBook under `book/src`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_component(
    path: &std::path::Path,
    component: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    extension: &str,
    include_specs: bool,
    include_book: bool,
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    // 1. Top-level markdown files, sorted by name for deterministic output.
    if let Ok(entries) = std::fs::read_dir(path) {
        let mut md_files: Vec<_> = entries
            .flatten()
            .filter(|e| e.path().extension().is_some_and(|ext| ext == "md") && e.path().is_file())
            .collect();
        md_files.sort_by_key(|e| e.file_name());
        for entry in md_files {
            let md_path = entry.path();
            let Some(fname) = md_path.file_name() else { continue };
            let file_name = fname.to_string_lossy();
            // doc_id doubles as the display name for top-level docs.
            let doc_id = format!("{}/{}", component, file_name);
            index_doc_file(
                &md_path,
                &doc_id,
                &doc_id,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            );
        }
    }
    // 2. Source tree. Prefer `src/`; Python projects without one are scanned
    // from the component root. `base` is stripped from paths to form doc ids.
    let src_dir = path.join("src");
    let (scan_dir, base) = if src_dir.exists() {
        (src_dir.clone(), src_dir.parent().unwrap_or(&src_dir).to_path_buf())
    } else if extension == "py" {
        (path.to_path_buf(), path.to_path_buf())
    } else {
        // Non-Python with no src/: scan_dir won't exist, so nothing is indexed.
        (src_dir.clone(), path.to_path_buf())
    };
    if scan_dir.exists() {
        match extension {
            "rs" => index_rust_files(
                &scan_dir,
                &base,
                component,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            ),
            "py" => index_python_files(
                &scan_dir,
                &base,
                component,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            ),
            _ => {}
        }
    }
    // 3. Optional specification documents (flat directory, non-recursive).
    if include_specs {
        let specs_dir = path.join("docs/specifications");
        if specs_dir.exists() {
            index_markdown_files(
                &specs_dir,
                component,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            );
        }
    }
    // 4. Optional mdBook sources, recursed; ids are rooted at `book/`.
    if include_book {
        let book_dir = path.join("book/src");
        if book_dir.exists() {
            index_markdown_files_recursive(
                &book_dir,
                book_dir.parent().unwrap_or(&book_dir),
                component,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            );
        }
    }
}
/// Index a group of component directories given as path strings.
///
/// Missing directories are skipped (optionally reported when
/// `show_not_found` is set). The component name is derived from the
/// canonicalized path's final component, falling back to `"unknown"`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_dir_group(
    dirs: &[String],
    show_not_found: bool,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    extension: &str,
    include_specs: bool,
    include_book: bool,
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    for dir in dirs {
        let path = std::path::Path::new(dir);
        if !path.exists() {
            if show_not_found {
                println!(" {} {} (not found)", "⊘".dimmed(), dir.dimmed());
            }
            continue;
        }
        // Canonicalize so relative paths like "." still yield a real dir name.
        let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
        let component = canonical.file_name().and_then(|n| n.to_str()).unwrap_or("unknown");
        index_component(
            path,
            component,
            chunker,
            chunker_config,
            model_hash,
            extension,
            include_specs,
            include_book,
            reindexer,
            indexer,
            indexed_count,
            total_chunks,
            fingerprints,
            chunk_contents,
        );
    }
}
/// Fingerprint, chunk, and index one source file, skipping unreadable or
/// trivial (blank / under five lines) content.
///
/// The doc id is `"{doc_id_prefix}/{path relative to base_dir}"`; when the
/// file is not under `base_dir` the full path is used instead.
#[allow(clippy::too_many_arguments)]
fn process_and_index_file(
    path: &std::path::Path,
    base_dir: &std::path::Path,
    doc_id_prefix: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    let Ok(content) = std::fs::read_to_string(path) else {
        return;
    };
    if is_trivial_content(&content) {
        return;
    }
    let relative_path = path.strip_prefix(base_dir).unwrap_or(path);
    let doc_id = format!("{}/{}", doc_id_prefix, relative_path.display());
    // Fingerprint first so change detection works on the next run even if
    // indexing below is interrupted mid-file.
    fingerprints.insert(
        doc_id.clone(),
        oracle::rag::DocumentFingerprint::new(content.as_bytes(), chunker_config, model_hash),
    );
    reindexer.enqueue(&doc_id, path.to_path_buf(), 0);
    let chunk_count =
        index_file_chunks(&content, &doc_id, chunker, indexer, total_chunks, chunk_contents);
    *indexed_count += 1;
    print_file_indexed(relative_path, chunk_count);
}
/// Decide whether one source file under `base_dir` needs re-indexing.
///
/// Returns `Some(false)` when the file is provably unchanged (mtime older
/// than the stored `indexed_at`, trivial content, or matching fingerprint),
/// `Some(true)` when the fingerprint differs or the doc id is unknown, and
/// `None` when the file cannot be read.
fn check_file_changed(
    path: &std::path::Path,
    base_dir: &std::path::Path,
    component: &str,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    existing_fingerprints: &std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
) -> Option<bool> {
    use oracle::rag::fingerprint::DocumentFingerprint;
    let relative_path = path.strip_prefix(base_dir).unwrap_or(path);
    let doc_id = format!("{}/{}", component, relative_path.display());
    // mtime fast path: older than the recorded index time => unchanged.
    if let Some(stored_fp) = existing_fingerprints.get(&doc_id) {
        if let Ok(meta) = std::fs::metadata(path) {
            if let Ok(mtime) = meta.modified() {
                let mtime_ms = mtime
                    .duration_since(std::time::UNIX_EPOCH)
                    .map(|d| d.as_millis() as u64)
                    .unwrap_or(0);
                if mtime_ms < stored_fp.indexed_at {
                    return Some(false);
                }
            }
        }
    }
    let content = std::fs::read_to_string(path).ok()?;
    if is_trivial_content(&content) {
        return Some(false);
    }
    let current_fp = DocumentFingerprint::new(content.as_bytes(), chunker_config, model_hash);
    // Bug fix: the argument here had been corrupted to `¤t_fp` (an
    // HTML-escaped `&current_fp`), which does not compile.
    match existing_fingerprints.get(&doc_id) {
        Some(stored_fp) => Some(stored_fp.needs_reindex(&current_fp)),
        None => Some(true),
    }
}
/// Evaluate one directory entry of the change scan: recurse into
/// non-skipped subdirectories, otherwise fingerprint-check a file whose
/// extension matches.
fn entry_has_changes(
    path: &std::path::Path,
    base_dir: &std::path::Path,
    component: &str,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    existing_fingerprints: &std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    extension: &str,
) -> bool {
    if path.is_dir() {
        // Skipped directories (hidden, target, __pycache__) contribute nothing.
        return should_recurse_dir(path, should_skip_directory)
            && check_dir_for_changes(
                path,
                base_dir,
                component,
                chunker_config,
                model_hash,
                existing_fingerprints,
                extension,
            );
    }
    if !path.extension().is_some_and(|ext| ext == extension) {
        return false;
    }
    check_file_changed(path, base_dir, component, chunker_config, model_hash, existing_fingerprints)
        == Some(true)
}
/// Recursively scan `dir` and report whether any file with the matching
/// extension has changed since it was last indexed. Unreadable directories
/// count as unchanged.
pub(crate) fn check_dir_for_changes(
    dir: &std::path::Path,
    base_dir: &std::path::Path,
    component: &str,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    existing_fingerprints: &std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    extension: &str,
) -> bool {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        Err(_) => return false,
    };
    // Short-circuit on the first changed entry.
    for entry in entries.flatten() {
        if entry_has_changes(
            &entry.path(),
            base_dir,
            component,
            chunker_config,
            model_hash,
            existing_fingerprints,
            extension,
        ) {
            return true;
        }
    }
    false
}
/// Recursively index every `*.py` file under `dir`, skipping hidden
/// directories and `__pycache__`. Doc ids are relative to `base_dir` and
/// prefixed with `component`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_python_files(
    dir: &std::path::Path,
    base_dir: &std::path::Path,
    component: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let entry_path = entry.path();
        if entry_path.is_dir() {
            if should_recurse_dir(&entry_path, should_skip_python_dir) {
                index_python_files(
                    &entry_path,
                    base_dir,
                    component,
                    chunker,
                    chunker_config,
                    model_hash,
                    reindexer,
                    indexer,
                    indexed_count,
                    total_chunks,
                    fingerprints,
                    chunk_contents,
                );
            }
            continue;
        }
        if entry_path.extension().is_some_and(|ext| ext == "py") {
            process_and_index_file(
                &entry_path,
                base_dir,
                component,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            );
        }
    }
}
/// Recursively index every `*.rs` file under `dir`, skipping hidden
/// directories and `target`. Doc ids are relative to `base_dir` and
/// prefixed with `component`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_rust_files(
    dir: &std::path::Path,
    base_dir: &std::path::Path,
    component: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let entry_path = entry.path();
        if entry_path.is_dir() {
            if should_recurse_dir(&entry_path, should_skip_rust_dir) {
                index_rust_files(
                    &entry_path,
                    base_dir,
                    component,
                    chunker,
                    chunker_config,
                    model_hash,
                    reindexer,
                    indexer,
                    indexed_count,
                    total_chunks,
                    fingerprints,
                    chunk_contents,
                );
            }
            continue;
        }
        if entry_path.extension().is_some_and(|ext| ext == "rs") {
            process_and_index_file(
                &entry_path,
                base_dir,
                component,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            );
        }
    }
}
/// Index every top-level `*.md` file in `dir` (non-recursive) under doc ids
/// of the form `"{component}/docs/{file}"`. Non-markdown entries,
/// unreadable files, and blank files are skipped.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_markdown_files(
    dir: &std::path::Path,
    component: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let md_path = entry.path();
        if !md_path.extension().is_some_and(|ext| ext == "md") {
            continue;
        }
        let Ok(text) = std::fs::read_to_string(&md_path) else {
            continue;
        };
        if text.trim().is_empty() {
            continue;
        }
        let name = md_path.file_name().and_then(|n| n.to_str()).unwrap_or("file.md");
        let doc_id = format!("{}/docs/{}", component, name);
        fingerprints.insert(
            doc_id.clone(),
            oracle::rag::DocumentFingerprint::new(text.as_bytes(), chunker_config, model_hash),
        );
        reindexer.enqueue(&doc_id, md_path.clone(), 0);
        let produced =
            index_file_chunks(&text, &doc_id, chunker, indexer, total_chunks, chunk_contents);
        *indexed_count += 1;
        // The display path is formatted identically to the doc id.
        print_markdown_indexed(&doc_id, produced);
    }
}
/// Recursively index `*.md` files under `dir` (an mdBook source tree),
/// skipping hidden directories. Doc ids are relative to `base_dir` and
/// prefixed with `"{component}/book"`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_markdown_files_recursive(
    dir: &std::path::Path,
    base_dir: &std::path::Path,
    component: &str,
    chunker: &oracle::rag::SemanticChunker,
    chunker_config: &oracle::rag::ChunkerConfig,
    model_hash: [u8; 32],
    reindexer: &mut oracle::rag::HeijunkaReindexer,
    indexer: &mut dyn ChunkIndexer,
    indexed_count: &mut usize,
    total_chunks: &mut usize,
    fingerprints: &mut std::collections::HashMap<String, oracle::rag::DocumentFingerprint>,
    chunk_contents: &mut std::collections::HashMap<String, String>,
) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let entry_path = entry.path();
        if entry_path.is_dir() {
            if should_recurse_dir(&entry_path, should_skip_hidden_dir) {
                index_markdown_files_recursive(
                    &entry_path,
                    base_dir,
                    component,
                    chunker,
                    chunker_config,
                    model_hash,
                    reindexer,
                    indexer,
                    indexed_count,
                    total_chunks,
                    fingerprints,
                    chunk_contents,
                );
            }
            continue;
        }
        if entry_path.extension().is_some_and(|ext| ext == "md") {
            let book_prefix = format!("{}/book", component);
            process_and_index_file(
                &entry_path,
                base_dir,
                &book_prefix,
                chunker,
                chunker_config,
                model_hash,
                reindexer,
                indexer,
                indexed_count,
                total_chunks,
                fingerprints,
                chunk_contents,
            );
        }
    }
}