use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
use ignore::WalkBuilder;
use crate::types::*;
use crate::{extract_source, parse_file};
/// A single symbol stored in a [`ProjectIndex`], together with where it was
/// found and the source text extracted for it.
#[derive(Debug, Clone)]
pub struct IndexedSymbol {
/// Path of the file that defines the symbol. It is relative to the index
/// root whenever the file path starts with that root (lossy UTF-8 conversion).
pub file_path: String,
/// The symbol as returned by `parse_file` (either a top-level symbol or one
/// of its direct children).
pub symbol: Symbol,
/// Text returned by `extract_source` for this symbol.
pub source: String,
}
/// A textual reference from one file to a symbol defined in another file.
///
/// References are discovered by whole-identifier text search, not by semantic
/// resolution.
#[derive(Debug, Clone)]
pub struct CrossReference {
/// Relative path of the file whose content mentions the target identifier.
pub from_file: String,
/// Name of a symbol in `from_file` whose source contains the identifier, or
/// the `from_file` path itself when no indexed symbol contains it.
pub from_symbol: String,
/// Relative path of the file that defines the referenced symbol.
pub to_file: String,
/// Name of the referenced symbol.
pub to_symbol: String,
}
/// In-memory index of the parsed source files, symbols and cross-references
/// found under a project root. Created by `ProjectIndex::build`.
#[derive(Debug)]
pub struct ProjectIndex {
/// Parsed files keyed by their path relative to `root`.
pub files: HashMap<String, ParsedFile>,
/// Indexed symbols. Top-level symbols are keyed as `"{file}::{name}"`,
/// direct children as `"{file}::{parent}::{child}"`.
pub symbols: HashMap<String, IndexedSymbol>,
/// Cross-file references; left empty when the build was truncated.
pub references: Vec<CrossReference>,
/// Root directory the index was built from.
pub root: PathBuf,
}
/// File extensions (without the dot) that the walker collects for indexing.
const EXTENSIONS: &[&str] = &["rs", "py", "ts", "tsx", "js", "jsx", "mjs", "cjs", "go"];
/// Maximum number of source files collected by the walk; hitting it marks the
/// build as truncated.
const MAX_FILES: usize = 5000;
/// Wall-clock budget shared by the walk, parse and cross-reference phases,
/// measured from the start of the build.
const MAX_WALL: Duration = Duration::from_secs(5);
/// Largest source file, in bytes, that is read into memory; bigger files are
/// reported as oversized and skipped.
const MAX_SOURCE_BYTES: usize = 1024 * 1024;
/// Directory names that are pruned during the walk (matched against the
/// directory's own name; the walk root itself is never pruned).
const SKIP_DIRS: &[&str] = &[
".cargo",
".git",
".next",
"__pycache__",
"bin",
"build",
"dist",
"node_modules",
"obj",
"target",
"vendor",
];
impl ProjectIndex {
    /// Builds an index of the source files found under `root`.
    ///
    /// The build is best-effort and bounded by `MAX_FILES`, `MAX_WALL` and
    /// `MAX_SOURCE_BYTES`. It runs in three phases:
    ///
    /// 1. walk `root` and collect candidate source files,
    /// 2. parse each file and record its symbols (and their direct children),
    /// 3. derive cross-file references by identifier search.
    ///
    /// If any budget is exceeded (or an oversized file is skipped) the build is
    /// considered truncated: files and symbols gathered so far are kept, but
    /// `references` is left empty and a warning is logged.
    pub fn build(root: &Path) -> Self {
        let start = Instant::now();
        let mut index = Self {
            files: HashMap::new(),
            symbols: HashMap::new(),
            references: Vec::new(),
            root: root.to_path_buf(),
        };
        let mut truncated = false;
        let files = collect_source_files(root, start, &mut truncated);

        for file_path in &files {
            if start.elapsed() >= MAX_WALL {
                truncated = true;
                Self::warn_parse_budget_exceeded(root, index.files.len());
                break;
            }
            let rel_path = Self::relative_path(root, file_path);
            let content = match read_source_file_bounded(file_path) {
                SourceRead::Content(content) => content,
                SourceRead::Oversized { bytes } => {
                    truncated = true;
                    tracing::warn!(
                        reason = "file_size",
                        phase = "parse",
                        root = %root.display(),
                        file = %file_path.display(),
                        file_bytes = bytes,
                        max_file_bytes = MAX_SOURCE_BYTES,
                        partial_files = index.files.len(),
                        cross_references_dropped = true,
                        "ProjectIndex::build skipped oversized source file during parse phase",
                    );
                    continue;
                }
                SourceRead::Unreadable => continue,
            };
            if let Some(parsed) = parse_file(&content, &rel_path) {
                // NOTE: keys are name-based, so a later symbol with the same
                // name in the same file (or under the same parent) replaces
                // the earlier entry.
                for sym in &parsed.symbols {
                    let key = format!("{}::{}", rel_path, sym.name);
                    index.symbols.insert(
                        key,
                        IndexedSymbol {
                            file_path: rel_path.clone(),
                            symbol: sym.clone(),
                            source: extract_source(sym, &content),
                        },
                    );
                    for child in &sym.children {
                        let child_key = format!("{}::{}::{}", rel_path, sym.name, child.name);
                        index.symbols.insert(
                            child_key,
                            IndexedSymbol {
                                file_path: rel_path.clone(),
                                symbol: child.clone(),
                                source: extract_source(child, &content),
                            },
                        );
                    }
                }
                index.files.insert(rel_path, parsed);
            }
            // Re-check after the (potentially slow) parse so an overrun is
            // detected even when this was the last file.
            if start.elapsed() >= MAX_WALL {
                truncated = true;
                Self::warn_parse_budget_exceeded(root, index.files.len());
                break;
            }
        }

        if !truncated {
            truncated = index.build_cross_references(&files, root, start);
        }
        if truncated {
            // A partial reference graph would be misleading, so drop it entirely.
            index.references.clear();
            tracing::warn!(
                reason = "truncated",
                phase = "cross_reference",
                root = %root.display(),
                partial_files = index.files.len(),
                cross_references_dropped = true,
                "ProjectIndex::build truncated; skipping cross-reference output",
            );
        }
        index
    }

    /// Returns every indexed symbol whose name is exactly `name`.
    ///
    /// The order of the result follows the map's iteration order and is not
    /// stable across runs.
    pub fn find(&self, name: &str) -> Vec<&IndexedSymbol> {
        self.symbols
            .values()
            .filter(|s| s.symbol.name == name)
            .collect()
    }

    /// Returns every indexed symbol whose name contains `query`, compared
    /// case-insensitively (both sides are lowercased).
    pub fn find_fuzzy(&self, query: &str) -> Vec<&IndexedSymbol> {
        let query_lower = query.to_lowercase();
        self.symbols
            .values()
            .filter(|s| s.symbol.name.to_lowercase().contains(&query_lower))
            .collect()
    }

    /// Returns the references whose target symbol is named `symbol_name`.
    pub fn callers_of(&self, symbol_name: &str) -> Vec<&CrossReference> {
        self.references
            .iter()
            .filter(|r| r.to_symbol == symbol_name)
            .collect()
    }

    /// Returns the references that originate from the symbol named `symbol_name`.
    pub fn callees_of(&self, symbol_name: &str) -> Vec<&CrossReference> {
        self.references
            .iter()
            .filter(|r| r.from_symbol == symbol_name)
            .collect()
    }

    /// Populates `self.references` by searching every file for identifiers
    /// defined in *other* files.
    ///
    /// Returns `true` when the phase had to stop early (wall-clock budget or an
    /// oversized file); the caller then discards the partial references.
    ///
    /// Targets are de-duplicated and processed in sorted order, and the
    /// referencing symbol is chosen deterministically (smallest source text
    /// containing the identifier, ties broken by name), so the output does not
    /// depend on hash-map iteration order.
    fn build_cross_references(&mut self, files: &[PathBuf], root: &Path, start: Instant) -> bool {
        // Candidate targets as (name, defining file). Imports and constants are
        // excluded, and names shorter than three bytes are too noisy to search
        // for. The same (name, file) pair can be indexed under several keys
        // (e.g. a method name shared by two parents), hence the dedup.
        let mut symbol_names: Vec<(String, String)> = self
            .symbols
            .values()
            .filter(|s| !matches!(s.symbol.kind, SymbolKind::Import | SymbolKind::Const))
            .filter(|s| s.symbol.name.len() >= 3)
            .map(|s| (s.symbol.name.clone(), s.file_path.clone()))
            .collect();
        symbol_names.sort_unstable();
        symbol_names.dedup();

        // Group symbols by file once instead of re-filtering the whole map for
        // every file.
        let mut symbols_by_file: HashMap<&str, Vec<&IndexedSymbol>> = HashMap::new();
        for indexed in self.symbols.values() {
            symbols_by_file
                .entry(indexed.file_path.as_str())
                .or_default()
                .push(indexed);
        }
        for group in symbols_by_file.values_mut() {
            group.sort_by(|a, b| {
                a.source
                    .len()
                    .cmp(&b.source.len())
                    .then_with(|| a.symbol.name.cmp(&b.symbol.name))
            });
        }

        for file_path in files {
            if cross_reference_budget_exceeded(start, root, self.files.len(), self.references.len())
            {
                return true;
            }
            let rel_path = Self::relative_path(root, file_path);
            let content = match read_source_file_bounded(file_path) {
                SourceRead::Content(content) => content,
                SourceRead::Oversized { bytes } => {
                    tracing::warn!(
                        reason = "file_size",
                        phase = "cross_reference",
                        root = %root.display(),
                        file = %file_path.display(),
                        file_bytes = bytes,
                        max_file_bytes = MAX_SOURCE_BYTES,
                        partial_files = self.files.len(),
                        partial_references = self.references.len(),
                        cross_references_dropped = true,
                        "ProjectIndex::build skipped oversized source file during cross-reference phase",
                    );
                    return true;
                }
                SourceRead::Unreadable => continue,
            };
            if cross_reference_budget_exceeded(start, root, self.files.len(), self.references.len())
            {
                return true;
            }
            let file_symbols: &[&IndexedSymbol] = symbols_by_file
                .get(rel_path.as_str())
                .map(|group| group.as_slice())
                .unwrap_or(&[]);

            for (target_name, target_file) in &symbol_names {
                if cross_reference_budget_exceeded(
                    start,
                    root,
                    self.files.len(),
                    self.references.len(),
                ) {
                    return true;
                }
                // Only cross-file references are recorded.
                if *target_file == rel_path {
                    continue;
                }
                let has_reference = match contains_identifier_bounded(&content, target_name, start)
                {
                    IdentifierSearch::Found(found) => found,
                    IdentifierSearch::BudgetExceeded => {
                        warn_cross_reference_budget_exceeded(
                            root,
                            self.files.len(),
                            self.references.len(),
                        );
                        return true;
                    }
                };
                if !has_reference {
                    continue;
                }
                // Attribute the reference to an enclosing symbol if one can be
                // found; otherwise fall back to the file itself.
                let mut referencing_symbol = None;
                for symbol in file_symbols {
                    match contains_identifier_bounded(&symbol.source, target_name, start) {
                        IdentifierSearch::Found(true) => {
                            referencing_symbol = Some(symbol.symbol.name.clone());
                            break;
                        }
                        IdentifierSearch::Found(false) => {}
                        IdentifierSearch::BudgetExceeded => {
                            warn_cross_reference_budget_exceeded(
                                root,
                                self.files.len(),
                                self.references.len(),
                            );
                            return true;
                        }
                    }
                }
                let referencing_symbol = referencing_symbol.unwrap_or_else(|| rel_path.clone());
                if cross_reference_budget_exceeded(
                    start,
                    root,
                    self.files.len(),
                    self.references.len(),
                ) {
                    return true;
                }
                self.references.push(CrossReference {
                    from_file: rel_path.clone(),
                    from_symbol: referencing_symbol,
                    to_file: target_file.clone(),
                    to_symbol: target_name.clone(),
                });
                if cross_reference_budget_exceeded(
                    start,
                    root,
                    self.files.len(),
                    self.references.len(),
                ) {
                    return true;
                }
            }
        }
        false
    }

    /// Returns the number of files, symbols and cross-references in the index.
    pub fn stats(&self) -> IndexStats {
        IndexStats {
            files: self.files.len(),
            symbols: self.symbols.len(),
            references: self.references.len(),
        }
    }

    /// Returns `file_path` relative to `root` as a (lossily converted) string,
    /// or the full path when it does not start with `root`.
    fn relative_path(root: &Path, file_path: &Path) -> String {
        file_path
            .strip_prefix(root)
            .unwrap_or(file_path)
            .to_string_lossy()
            .into_owned()
    }

    /// Logs the warning emitted when the wall-clock budget runs out while parsing.
    fn warn_parse_budget_exceeded(root: &Path, partial_files: usize) {
        tracing::warn!(
            reason = "wall_clock",
            phase = "parse",
            root = %root.display(),
            partial_files,
            cross_references_dropped = true,
            "ProjectIndex::build exceeded budget during parse phase",
        );
    }
}
/// Reports whether the wall-clock budget (`MAX_WALL`, measured from `start`)
/// has run out during the cross-reference phase.
///
/// When it has, the budget warning is logged with the supplied progress
/// counters before `true` is returned.
fn cross_reference_budget_exceeded(
    start: Instant,
    root: &Path,
    partial_files: usize,
    partial_references: usize,
) -> bool {
    let over_budget = start.elapsed() >= MAX_WALL;
    if over_budget {
        warn_cross_reference_budget_exceeded(root, partial_files, partial_references);
    }
    over_budget
}
/// Logs the warning emitted when the wall-clock budget runs out during the
/// cross-reference phase.
///
/// `partial_files` and `partial_references` are the counts gathered so far and
/// are recorded as structured fields on the event.
fn warn_cross_reference_budget_exceeded(
root: &Path,
partial_files: usize,
partial_references: usize,
) {
tracing::warn!(
reason = "wall_clock",
phase = "cross_reference",
root = %root.display(),
partial_files,
partial_references,
cross_references_dropped = true,
"ProjectIndex::build exceeded budget during cross-reference phase",
);
}
/// Summary counts for a project index.
#[derive(Debug, Clone)]
pub struct IndexStats {
    /// Number of parsed files.
    pub files: usize,
    /// Number of indexed symbols.
    pub symbols: usize,
    /// Number of recorded cross-references.
    pub references: usize,
}

/// Renders the counts as `"<files> files, <symbols> symbols, <references> cross-references"`.
impl std::fmt::Display for IndexStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            files,
            symbols,
            references,
        } = self;
        f.write_fmt(format_args!(
            "{} files, {} symbols, {} cross-references",
            files, symbols, references
        ))
    }
}
/// Walks `root` and returns the paths of all files with a supported extension.
///
/// The walk applies the walker's standard filters (without requiring a git
/// repository) and additionally prunes every directory whose name is listed in
/// `SKIP_DIRS`, except for `root` itself. Walk errors are skipped.
///
/// The walk stops early, sets `*truncated` and logs a warning when either the
/// wall-clock budget measured from `start` is exhausted or more than
/// `MAX_FILES` matching files exist.
fn collect_source_files(root: &Path, start: Instant, truncated: &mut bool) -> Vec<PathBuf> {
    let mut collected = Vec::new();
    let walk_root = root.to_path_buf();
    let walker = WalkBuilder::new(root)
        .standard_filters(true)
        .require_git(false)
        .filter_entry(move |entry| {
            let is_dir = matches!(entry.file_type(), Some(kind) if kind.is_dir());
            // Files always pass; the root is kept even if its name is in SKIP_DIRS.
            if !is_dir || entry.path() == walk_root {
                return true;
            }
            match entry.file_name().to_str() {
                Some(name) => !SKIP_DIRS.contains(&name),
                // Names that are not valid UTF-8 cannot match the skip list.
                None => true,
            }
        })
        .build();

    for entry in walker {
        if start.elapsed() >= MAX_WALL {
            *truncated = true;
            tracing::warn!(
                reason = "wall_clock",
                phase = "walk",
                root = %root.display(),
                partial_files = collected.len(),
                cross_references_dropped = true,
                "ProjectIndex::build exceeded budget during walk phase",
            );
            break;
        }
        let entry = match entry {
            Ok(entry) => entry,
            Err(_) => continue,
        };
        if !entry.file_type().map_or(false, |kind| kind.is_file()) {
            continue;
        }
        let has_supported_extension = entry
            .path()
            .extension()
            .and_then(|ext| ext.to_str())
            .map_or(false, |ext| EXTENSIONS.contains(&ext));
        if !has_supported_extension {
            continue;
        }
        // Only flag truncation once a file beyond the cap actually shows up.
        if collected.len() >= MAX_FILES {
            *truncated = true;
            tracing::warn!(
                reason = "file_count",
                phase = "walk",
                root = %root.display(),
                partial_files = collected.len(),
                cross_references_dropped = true,
                "ProjectIndex::build exceeded file budget during walk phase",
            );
            break;
        }
        collected.push(entry.into_path());
    }
    collected
}
/// Outcome of reading a source file under the `MAX_SOURCE_BYTES` limit.
enum SourceRead {
/// The file was read completely and is valid UTF-8.
Content(String),
/// The file exceeds the size limit. `bytes` is the size reported by the file
/// metadata, or the number of bytes read (limit + 1) when the overflow was
/// only detected while reading.
Oversized { bytes: u64 },
/// The file could not be opened or read, or is not valid UTF-8.
Unreadable,
}
/// Reads `path` as UTF-8 text without ever buffering more than
/// `MAX_SOURCE_BYTES + 1` bytes.
///
/// The size is checked twice: first via file metadata (cheap early exit), then
/// by capping the actual read, which also covers files whose metadata is
/// unavailable or stale.
fn read_source_file_bounded(path: &Path) -> SourceRead {
    let limit = MAX_SOURCE_BYTES as u64;
    let file = match File::open(path) {
        Ok(handle) => handle,
        Err(_) => return SourceRead::Unreadable,
    };
    match file.metadata() {
        Ok(meta) if meta.len() > limit => {
            return SourceRead::Oversized { bytes: meta.len() };
        }
        // Within the limit, or metadata unavailable: fall through to the capped read.
        _ => {}
    }
    let mut buffer = Vec::new();
    // Read one byte past the limit so an oversized file can be told apart from
    // one that is exactly at the limit.
    if file.take(limit + 1).read_to_end(&mut buffer).is_err() {
        return SourceRead::Unreadable;
    }
    if buffer.len() > MAX_SOURCE_BYTES {
        return SourceRead::Oversized {
            bytes: buffer.len() as u64,
        };
    }
    String::from_utf8(buffer).map_or(SourceRead::Unreadable, SourceRead::Content)
}
/// Result of a wall-clock-bounded identifier search.
enum IdentifierSearch {
/// The search ran to completion; the flag tells whether a whole-identifier
/// occurrence was found.
Found(bool),
/// The wall-clock budget ran out before the search finished.
BudgetExceeded,
}
/// Test-only wrapper around `contains_identifier_bounded` that starts a fresh
/// budget and treats an exhausted budget as "not found".
#[cfg(test)]
fn contains_identifier(source: &str, name: &str) -> bool {
    let outcome = contains_identifier_bounded(source, name, Instant::now());
    matches!(outcome, IdentifierSearch::Found(true))
}
/// Searches `source` for `name` as a whole identifier, i.e. an occurrence that
/// is neither preceded nor followed by an ASCII identifier byte
/// (`[A-Za-z0-9_]`).
///
/// Returns `IdentifierSearch::BudgetExceeded` as soon as `MAX_WALL` has elapsed
/// since `start`; otherwise `IdentifierSearch::Found` with the search result.
/// An empty `name` is never considered found.
fn contains_identifier_bounded(source: &str, name: &str, start: Instant) -> IdentifierSearch {
    // Width in bytes of the first character of `name`. An empty name is not an
    // identifier, so there is nothing to search for.
    let step = match name.chars().next() {
        Some(first) => first.len_utf8(),
        None => return IdentifierSearch::Found(false),
    };
    let bytes = source.as_bytes();
    let name_len = name.len();
    let mut pos = 0;
    while pos + name_len <= bytes.len() {
        if start.elapsed() >= MAX_WALL {
            return IdentifierSearch::BudgetExceeded;
        }
        let abs = match source[pos..].find(name) {
            Some(found) => pos + found,
            None => break,
        };
        // Treat the start and end of the text as non-identifier boundaries.
        let before = if abs > 0 { bytes[abs - 1] } else { b' ' };
        let after = bytes.get(abs + name_len).copied().unwrap_or(b' ');
        if !is_ident_char(before) && !is_ident_char(after) {
            return IdentifierSearch::Found(true);
        }
        // Resume after the first character of the rejected match. Advancing by
        // a single byte would land inside a multi-byte character when `name`
        // starts with one, and slicing `source` there would panic.
        pos = abs + step;
    }
    IdentifierSearch::Found(false)
}
/// Returns `true` for bytes that can be part of an ASCII identifier:
/// letters, digits and the underscore.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
// Tests for the project index: walker filtering, budget handling and
// identifier matching. Most tests build a throw-away project in a temp dir.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
use tracing_test::traced_test;
// Writes `content` to `path`, creating missing parent directories first.
fn write_file(path: &Path, content: impl AsRef<str>) {
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).unwrap();
}
fs::write(path, content.as_ref()).unwrap();
}
// Returns the source of an empty public Rust function called `name`.
fn rust_fn(name: &str) -> String {
format!("pub fn {}() {{}}\n", name)
}
// Joins path components with the platform separator so the result can be
// compared against the relative-path keys of `index.files`.
fn rel(parts: &[&str]) -> String {
let sep = std::path::MAIN_SEPARATOR.to_string();
parts.join(&sep)
}
// End-to-end check on a two-file project: stats, exact lookup, callers and
// fuzzy lookup.
#[test]
fn test_project_index() {
let dir = tempfile::tempdir().unwrap();
fs::write(
dir.path().join("lib.rs"),
r#"
pub struct Parser {
lang: String,
}
impl Parser {
pub fn new(lang: String) -> Self {
Self { lang }
}
pub fn parse(&self, source: &str) -> Vec<String> {
vec![]
}
}
pub fn create_parser() -> Parser {
Parser::new("rust".into())
}
"#,
)
.unwrap();
fs::write(
dir.path().join("main.rs"),
r#"
mod lib;
fn main() {
let p = create_parser();
let result = p.parse("fn foo() {}");
println!("{:?}", result);
}
"#,
)
.unwrap();
let index = ProjectIndex::build(dir.path());
let stats = index.stats();
assert_eq!(stats.files, 2);
assert!(stats.symbols >= 4);
let parsers = index.find("Parser");
assert!(!parsers.is_empty());
let refs = index.callers_of("create_parser");
assert!(!refs.is_empty(), "main.rs should reference create_parser");
assert!(refs.iter().any(|r| r.from_file == "main.rs"));
let fuzzy = index.find_fuzzy("pars");
assert!(fuzzy.len() >= 2); }
// A name only matches when it is not embedded in a longer identifier.
#[test]
fn test_contains_identifier() {
assert!(contains_identifier("let x = foo();", "foo"));
assert!(!contains_identifier("let x = foobar();", "foo"));
assert!(contains_identifier("use crate::Parser;", "Parser"));
assert!(!contains_identifier("use crate::ParserBuilder;", "Parser"));
}
// `.gitignore` rules apply even though the temp dir is not a git repository.
#[test]
fn test_walker_honors_gitignore_outside_git_repo() {
let dir = tempfile::tempdir().unwrap();
write_file(&dir.path().join(".gitignore"), "ignored-by-gitignore/\n");
write_file(&dir.path().join("src").join("lib.rs"), rust_fn("visible"));
write_file(
&dir.path().join("generated-local").join("generated.ts"),
"export function includedGenerated() {}\n",
);
write_file(
&dir.path().join("ignored-by-gitignore").join("generated.ts"),
"export function generated() {}\n",
);
let index = ProjectIndex::build(dir.path());
assert!(index.files.contains_key(&rel(&["src", "lib.rs"])));
assert!(index
.files
.contains_key(&rel(&["generated-local", "generated.ts"])));
assert!(!index
.files
.contains_key(&rel(&["ignored-by-gitignore", "generated.ts"])));
}
// Entries listed in a `.ignore` file are excluded from the index.
#[test]
fn test_walker_honors_ignore_file() {
let dir = tempfile::tempdir().unwrap();
write_file(&dir.path().join(".ignore"), "ignored.rs\n");
write_file(&dir.path().join("visible.rs"), rust_fn("visible"));
write_file(&dir.path().join("ignored.rs"), rust_fn("ignored"));
let index = ProjectIndex::build(dir.path());
assert!(index.files.contains_key("visible.rs"));
assert!(!index.files.contains_key("ignored.rs"));
}
// Entries listed in `.git/info/exclude` are excluded from the index.
#[test]
fn test_walker_honors_git_info_exclude_without_global_config() {
let dir = tempfile::tempdir().unwrap();
write_file(
&dir.path().join(".git").join("info").join("exclude"),
"excluded.rs\n",
);
write_file(&dir.path().join("visible.rs"), rust_fn("visible"));
write_file(&dir.path().join("excluded.rs"), rust_fn("excluded"));
let index = ProjectIndex::build(dir.path());
assert!(index.files.contains_key("visible.rs"));
assert!(!index.files.contains_key("excluded.rs"));
}
// Dot-files and the contents of dot-directories are not indexed.
#[test]
fn test_walker_skips_hidden_paths() {
let dir = tempfile::tempdir().unwrap();
write_file(&dir.path().join("visible.rs"), rust_fn("visible"));
write_file(&dir.path().join(".hidden.rs"), rust_fn("hidden_file"));
write_file(
&dir.path().join(".hidden_dir").join("secret.rs"),
rust_fn("hidden_dir"),
);
let index = ProjectIndex::build(dir.path());
assert!(index.files.contains_key("visible.rs"));
assert!(!index.files.contains_key(".hidden.rs"));
assert!(!index
.files
.contains_key(&rel(&[".hidden_dir", "secret.rs"])));
}
// Directories named in `SKIP_DIRS` are pruned together with their contents.
#[test]
fn test_walker_prunes_skip_dirs() {
let dir = tempfile::tempdir().unwrap();
write_file(&dir.path().join("src").join("lib.rs"), rust_fn("visible"));
write_file(
&dir.path().join("node_modules").join("pkg").join("index.js"),
"export function ignored() {}\n",
);
write_file(
&dir.path().join("target").join("debug").join("generated.rs"),
rust_fn("target_generated"),
);
write_file(&dir.path().join("bin").join("cli.rs"), rust_fn("bin_cli"));
write_file(
&dir.path().join("obj").join("generated.rs"),
rust_fn("obj_generated"),
);
let index = ProjectIndex::build(dir.path());
assert!(index.files.contains_key(&rel(&["src", "lib.rs"])));
assert!(!index
.files
.contains_key(&rel(&["node_modules", "pkg", "index.js"])));
assert!(!index
.files
.contains_key(&rel(&["target", "debug", "generated.rs"])));
assert!(!index.files.contains_key(&rel(&["bin", "cli.rs"])));
assert!(!index.files.contains_key(&rel(&["obj", "generated.rs"])));
}
// A `.gitignore` placed in a sub-project applies within that sub-project.
#[test]
fn test_walker_cascades_subdir_gitignore() {
let dir = tempfile::tempdir().unwrap();
for project in ["app_one", "app_two"] {
let project_dir = dir.path().join(project);
write_file(&project_dir.join(".gitignore"), "ignored-by-gitignore/\n");
write_file(&project_dir.join("src").join("lib.rs"), rust_fn("visible"));
write_file(
&project_dir.join("generated-local").join("generated.rs"),
rust_fn("included_generated"),
);
write_file(
&project_dir
.join("ignored-by-gitignore")
.join("generated.rs"),
rust_fn("generated"),
);
}
let index = ProjectIndex::build(dir.path());
assert!(index
.files
.contains_key(&rel(&["app_one", "src", "lib.rs"])));
assert!(index
.files
.contains_key(&rel(&["app_two", "src", "lib.rs"])));
assert!(index
.files
.contains_key(&rel(&["app_one", "generated-local", "generated.rs"])));
assert!(index
.files
.contains_key(&rel(&["app_two", "generated-local", "generated.rs"])));
assert!(!index.files.contains_key(&rel(&[
"app_one",
"ignored-by-gitignore",
"generated.rs"
])));
assert!(!index.files.contains_key(&rel(&[
"app_two",
"ignored-by-gitignore",
"generated.rs"
])));
}
// A root that is itself named like a skip dir is still walked, while a nested
// directory with that name is pruned.
#[test]
fn test_walker_includes_root_even_if_named_skip() {
let parent = tempfile::tempdir().unwrap();
let root = parent.path().join("node_modules");
write_file(&root.join("index.js"), "export function visible() {}\n");
write_file(
&root.join("node_modules").join("nested.js"),
"export function hidden() {}\n",
);
let index = ProjectIndex::build(&root);
assert!(index.files.contains_key("index.js"));
assert!(!index
.files
.contains_key(&rel(&["node_modules", "nested.js"])));
}
// A directory with mode 0o000 must not make the build panic. Permissions are
// restored before unwrapping so the temp dir can be cleaned up.
#[cfg(unix)]
#[test]
fn test_build_does_not_panic_on_unreadable_dir() {
let dir = tempfile::tempdir().unwrap();
let unreadable = dir.path().join("unreadable");
write_file(&dir.path().join("visible.rs"), rust_fn("visible"));
write_file(&unreadable.join("hidden.rs"), rust_fn("hidden"));
fs::set_permissions(&unreadable, fs::Permissions::from_mode(0o000)).unwrap();
let result = std::panic::catch_unwind(|| ProjectIndex::build(dir.path()));
fs::set_permissions(&unreadable, fs::Permissions::from_mode(0o755)).unwrap();
let index = result.unwrap();
assert!(index.files.contains_key("visible.rs"));
}
// With more than `MAX_FILES` candidates the walk stops at the cap, flags
// truncation and logs a `file_count` warning.
#[test]
#[traced_test]
fn test_build_caps_at_max_files_and_warns() {
let dir = tempfile::tempdir().unwrap();
for i in 0..(MAX_FILES + 50) {
write_file(&dir.path().join(format!("file_{i:05}.rs")), "");
}
let mut truncated = false;
let files = collect_source_files(dir.path(), Instant::now(), &mut truncated);
assert_eq!(files.len(), MAX_FILES);
assert!(truncated);
assert!(logs_contain("reason=\"file_count\""));
assert!(logs_contain("phase=\"walk\""));
}
// An oversized file is skipped with a `file_size` warning; the other files
// are still indexed but all cross-references are dropped.
#[test]
#[traced_test]
fn test_oversized_source_truncates_warns_and_drops_xrefs() {
let dir = tempfile::tempdir().unwrap();
write_file(&dir.path().join("a.rs"), "pub fn target_fn() {}\n");
write_file(
&dir.path().join("b.rs"),
"use crate::a::target_fn;\npub fn caller() { target_fn(); }\n",
);
write_file(
&dir.path().join("oversized.rs"),
format!(
"pub fn oversized() {{}}\n// {}\n",
"x".repeat(MAX_SOURCE_BYTES + 1)
),
);
let index = ProjectIndex::build(dir.path());
assert!(index.files.contains_key("a.rs"));
assert!(index.files.contains_key("b.rs"));
assert!(index.references.is_empty());
assert!(logs_contain("reason=\"file_size\""));
assert!(logs_contain("cross_references_dropped=true"));
}
// A build truncated by the file cap keeps `MAX_FILES` files, has no
// cross-references and logs the corresponding warnings.
#[test]
#[traced_test]
fn test_truncated_build_emits_xref_dropped_warn() {
let dir = tempfile::tempdir().unwrap();
for i in 0..(MAX_FILES + 1) {
write_file(
&dir.path().join(format!("file_{i:05}.rs")),
if i == 0 {
rust_fn("visible")
} else {
String::new()
},
);
}
let index = ProjectIndex::build(dir.path());
assert_eq!(index.files.len(), MAX_FILES);
assert!(index.references.is_empty());
assert!(logs_contain("reason=\"file_count\""));
assert!(logs_contain("phase=\"walk\""));
assert!(logs_contain("cross_references_dropped=true"));
}
}