1use super::graph::GraphBuilder;
6use super::types::{BuildError, BuildOptions, ParsedFile, ParsedSymbol, THREAD_PARSER};
7use crate::index::convert::{convert_symbol_kind, convert_visibility};
8use crate::index::patterns::{
9 GO_IMPORT, JAVA_IMPORT, JS_IMPORT, JS_IMPORT_MULTILINE, JS_REQUIRE, PYTHON_FROM_IMPORT,
10 PYTHON_IMPORT, RUST_USE,
11};
12use crate::index::types::{
13 DepGraph, FileEntry, FileId, Import, IndexSymbol, Language, Span, SymbolId, SymbolIndex,
14};
15use crate::parser::Language as ParserLanguage;
16use ignore::WalkBuilder;
17use rayon::prelude::*;
18use regex::Regex;
19use std::collections::HashMap;
20use std::fs;
21use std::path::{Path, PathBuf};
22use std::time::{SystemTime, UNIX_EPOCH};
23
24pub struct IndexBuilder {
26 pub(super) repo_root: PathBuf,
28 pub(super) options: BuildOptions,
30}
31
32impl IndexBuilder {
33 pub fn new(repo_root: impl AsRef<Path>) -> Self {
35 Self { repo_root: repo_root.as_ref().to_path_buf(), options: BuildOptions::default() }
36 }
37
38 pub fn with_options(mut self, options: BuildOptions) -> Self {
40 self.options = options;
41 self
42 }
43
44 #[must_use = "index should be used for context queries or saved to disk"]
54 pub fn build(&self) -> Result<(SymbolIndex, DepGraph), BuildError> {
55 use std::time::Instant;
56
57 if !self.repo_root.exists() {
58 return Err(BuildError::RepoNotFound(self.repo_root.clone()));
59 }
60
61 let repo_name = self
62 .repo_root
63 .file_name()
64 .and_then(|n| n.to_str())
65 .unwrap_or("unknown")
66 .to_owned();
67
68 let t0 = Instant::now();
70 let files = self.collect_files()?;
71 let collect_time = t0.elapsed();
72 tracing::info!("Found {} files to index", files.len());
73
74 let t1 = Instant::now();
76 let parsed_files = self.parse_files_parallel(&files)?;
77 let parse_time = t1.elapsed();
78 tracing::info!("Parsed {} files", parsed_files.len());
79
80 let show_timing = std::env::var("INFINILOOM_TIMING").is_ok();
82 if show_timing {
83 tracing::info!(" [timing] collect: {:?}", collect_time);
84 tracing::info!(" [timing] parse: {:?}", parse_time);
85 }
86
87 let mut index = SymbolIndex::new();
89 index.repo_name = repo_name;
90 index.created_at = SystemTime::now()
91 .duration_since(UNIX_EPOCH)
92 .map(|d| d.as_secs())
93 .unwrap_or(0);
94
95 index.commit_hash = self.get_current_commit();
97
98 let mut symbol_id_counter = 0u32;
100 let mut file_path_to_id: HashMap<String, u32> = HashMap::new();
101 let mut symbol_calls: Vec<(u32, Vec<String>)> = Vec::new();
102 let mut symbol_parents: Vec<(u32, String)> = Vec::new();
103
104 for (file_id, parsed) in parsed_files.into_iter().enumerate() {
105 let file_id = file_id as u32;
106 file_path_to_id.insert(parsed.path.clone(), file_id);
107
108 let symbol_start = symbol_id_counter;
109
110 for sym in parsed.symbols {
112 index.symbols.push(IndexSymbol {
113 id: SymbolId::new(symbol_id_counter),
114 name: sym.name.clone(),
115 kind: convert_symbol_kind(sym.kind),
116 file_id: FileId::new(file_id),
117 span: Span::new(sym.start_line, 0, sym.end_line, 0),
118 signature: sym.signature,
119 parent: None, visibility: convert_visibility(sym.visibility),
121 docstring: sym.docstring,
122 });
123 if !sym.calls.is_empty() {
125 symbol_calls.push((symbol_id_counter, sym.calls));
126 }
127 if let Some(parent_name) = sym.parent {
129 symbol_parents.push((symbol_id_counter, parent_name));
130 }
131 symbol_id_counter += 1;
132 }
133
134 index.files.push(FileEntry {
135 id: FileId::new(file_id),
136 path: parsed.path,
137 language: parsed.language,
138 content_hash: parsed.content_hash,
139 symbols: symbol_start..symbol_id_counter,
140 imports: parsed.imports,
141 lines: parsed.lines,
142 tokens: parsed.tokens,
143 });
144 }
145
146 let t2 = Instant::now();
148 index.rebuild_lookups();
149 let lookup_time = t2.elapsed();
150
151 for (symbol_id, parent_name) in &symbol_parents {
153 let symbol = &index.symbols[*symbol_id as usize];
155 let file_id = symbol.file_id;
156 if let Some(parent_sym) = index
157 .symbols
158 .iter()
159 .find(|s| s.file_id == file_id && s.name == *parent_name && s.kind.is_scope())
160 {
161 index.symbols[*symbol_id as usize].parent = Some(parent_sym.id);
162 }
163 }
164
165 let t3 = Instant::now();
167 let mut graph = DepGraph::new();
168 let graph_builder = GraphBuilder::new(&self.repo_root);
169 graph_builder.build_graph(&index, &file_path_to_id, &symbol_calls, &mut graph);
170 let graph_time = t3.elapsed();
171
172 let mut pagerank_time = std::time::Duration::ZERO;
174 if self.options.compute_pagerank {
175 let t4 = Instant::now();
176 graph_builder.compute_pagerank(&index, &mut graph);
177 pagerank_time = t4.elapsed();
178 }
179
180 if show_timing {
181 tracing::info!(" [timing] lookups: {:?}", lookup_time);
182 tracing::info!(" [timing] graph: {:?}", graph_time);
183 tracing::info!(" [timing] pagerank: {:?}", pagerank_time);
184 }
185
186 Ok((index, graph))
187 }
188
189 fn collect_files(&self) -> Result<Vec<PathBuf>, BuildError> {
191 let mut files = Vec::new();
192 let exclude_dirs = self.options.exclude_dirs.clone();
194
195 let walker = WalkBuilder::new(&self.repo_root)
197 .hidden(false) .git_ignore(self.options.respect_gitignore)
199 .git_global(self.options.respect_gitignore)
200 .git_exclude(self.options.respect_gitignore)
201 .filter_entry(move |entry| {
202 let path = entry.path();
203 if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
205 if name == ".git" {
206 return false;
207 }
208 if path.is_dir() && exclude_dirs.iter().any(|dir| dir == name) {
210 return false;
211 }
212 if path.is_dir() && name.starts_with('.') {
214 return false;
215 }
216 }
217 true
218 })
219 .build();
220
221 for entry in walker.flatten() {
222 let path = entry.path();
223 if path.is_file() && self.should_index_file(path) {
224 files.push(path.to_path_buf());
225 }
226 }
227
228 Ok(files)
229 }
230
231 fn should_index_file(&self, path: &Path) -> bool {
232 if let Ok(metadata) = fs::metadata(path) {
234 if metadata.len() > self.options.max_file_size {
235 return false;
236 }
237 }
238
239 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
241 let lang = Language::from_extension(ext);
242
243 if lang == Language::Unknown {
244 return false;
245 }
246
247 if !self.options.include_extensions.is_empty()
249 && !self
250 .options
251 .include_extensions
252 .iter()
253 .any(|entry| entry == ext)
254 {
255 return false;
256 }
257
258 true
259 }
260
261 fn parse_files_parallel(&self, files: &[PathBuf]) -> Result<Vec<ParsedFile>, BuildError> {
263 let results: Vec<Result<ParsedFile, BuildError>> =
264 files.par_iter().map(|path| self.parse_file(path)).collect();
265
266 let mut parsed = Vec::with_capacity(results.len());
268 for result in results {
269 match result {
270 Ok(f) => parsed.push(f),
271 Err(e) => tracing::warn!("Failed to parse file: {}", e),
272 }
273 }
274
275 Ok(parsed)
276 }
277
278 fn parse_file(&self, path: &Path) -> Result<ParsedFile, BuildError> {
280 let content = fs::read_to_string(path)?;
281 let relative_path = path
282 .strip_prefix(&self.repo_root)
283 .unwrap_or(path)
284 .to_string_lossy()
285 .to_string();
286
287 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
288 let language = Language::from_extension(ext);
289
290 let content_hash = blake3::hash(content.as_bytes());
292
293 let lines = content.lines().count() as u32;
295
296 let tokens = (content.len() / 4) as u32;
298
299 let parser_lang = match language {
301 Language::Rust => Some(ParserLanguage::Rust),
302 Language::Python => Some(ParserLanguage::Python),
303 Language::JavaScript => Some(ParserLanguage::JavaScript),
304 Language::TypeScript => Some(ParserLanguage::TypeScript),
305 Language::Go => Some(ParserLanguage::Go),
306 Language::Java => Some(ParserLanguage::Java),
307 Language::C => Some(ParserLanguage::C),
308 Language::Cpp => Some(ParserLanguage::Cpp),
309 Language::CSharp => Some(ParserLanguage::CSharp),
310 Language::Ruby => Some(ParserLanguage::Ruby),
311 Language::Bash => Some(ParserLanguage::Bash),
312 Language::Php => Some(ParserLanguage::Php),
313 Language::Kotlin => Some(ParserLanguage::Kotlin),
314 Language::Swift => Some(ParserLanguage::Swift),
315 Language::Scala => Some(ParserLanguage::Scala),
316 Language::Haskell => Some(ParserLanguage::Haskell),
317 Language::Elixir => Some(ParserLanguage::Elixir),
318 Language::Clojure => Some(ParserLanguage::Clojure),
319 Language::OCaml => Some(ParserLanguage::OCaml),
320 Language::Lua => Some(ParserLanguage::Lua),
321 Language::R => Some(ParserLanguage::R),
322 Language::Unknown => None,
323 };
324
325 let mut symbols = Vec::new();
326 let imports = self.extract_imports(&content, language);
327
328 if let Some(lang) = parser_lang {
329 THREAD_PARSER.with(|parser_cell| {
331 let mut parser = parser_cell.borrow_mut();
332 if let Ok(parsed_symbols) = parser.parse(&content, lang) {
333 for sym in parsed_symbols {
334 symbols.push(ParsedSymbol {
335 name: sym.name,
336 kind: sym.kind,
337 start_line: sym.start_line,
338 end_line: sym.end_line,
339 signature: sym.signature,
340 docstring: sym.docstring,
341 parent: sym.parent,
342 visibility: sym.visibility,
343 calls: sym.calls,
344 });
345 }
346 }
347 });
348 }
349
350 Ok(ParsedFile {
351 path: relative_path,
352 language,
353 content_hash: *content_hash.as_bytes(),
354 lines,
355 tokens,
356 symbols,
357 imports,
358 })
359 }
360
361 fn extract_imports(&self, content: &str, language: Language) -> Vec<Import> {
363 let mut imports = Vec::new();
364
365 if matches!(language, Language::JavaScript | Language::TypeScript) {
366 use std::collections::HashSet;
367
368 let mut seen_sources: HashSet<String> = HashSet::new();
369
370 let patterns: &[(&Regex, bool)] = &[(&JS_IMPORT, true), (&JS_REQUIRE, true)];
372 for (line_num, line) in content.lines().enumerate() {
373 for (re, check_external) in patterns {
374 if let Some(captures) = re.captures(line) {
375 if let Some(source) = captures.get(1) {
376 let source_str = source.as_str().to_owned();
377 if !seen_sources.insert(source_str.clone()) {
378 continue;
379 }
380 let is_external = if *check_external {
381 !source_str.starts_with('.')
382 && !source_str.starts_with('/')
383 && !source_str.starts_with("src/")
384 } else {
385 false
386 };
387 imports.push(Import {
388 source: source_str,
389 resolved_file: None,
390 symbols: vec![],
391 span: Span::new(line_num as u32 + 1, 0, line_num as u32 + 1, 0),
392 is_external,
393 });
394 }
395 }
396 }
397 }
398
399 for caps in JS_IMPORT_MULTILINE.captures_iter(content) {
401 if let Some(source) = caps.get(1) {
402 let source_str = source.as_str().to_owned();
403 if !seen_sources.insert(source_str.clone()) {
404 continue;
405 }
406 let line_num = content[..source.start()].matches('\n').count() as u32 + 1;
407 let is_external = !source_str.starts_with('.')
408 && !source_str.starts_with('/')
409 && !source_str.starts_with("src/");
410 imports.push(Import {
411 source: source_str,
412 resolved_file: None,
413 symbols: vec![],
414 span: Span::new(line_num, 0, line_num, 0),
415 is_external,
416 });
417 }
418 }
419
420 return imports;
421 }
422
423 let patterns: &[(&Regex, bool)] = match language {
425 Language::Python => &[(&PYTHON_IMPORT, false), (&PYTHON_FROM_IMPORT, false)],
426 Language::Rust => &[(&RUST_USE, false)],
427 Language::Go => &[(&GO_IMPORT, true)],
428 Language::Java => &[(&JAVA_IMPORT, false)],
429 _ => return imports, };
431
432 for (line_num, line) in content.lines().enumerate() {
433 for (re, check_external) in patterns {
434 if let Some(captures) = re.captures(line) {
435 if let Some(source) = captures.get(1) {
436 let source_str = source.as_str().to_owned();
437 let is_external = if *check_external {
438 !source_str.starts_with('.')
440 && !source_str.starts_with('/')
441 && !source_str.starts_with("src/")
442 } else {
443 false
444 };
445
446 imports.push(Import {
447 source: source_str,
448 resolved_file: None,
449 symbols: vec![],
450 span: Span::new(line_num as u32 + 1, 0, line_num as u32 + 1, 0),
451 is_external,
452 });
453 }
454 }
455 }
456 }
457
458 imports
459 }
460
461 pub(super) fn get_current_commit(&self) -> Option<String> {
463 let git_head = self.repo_root.join(".git/HEAD");
464 if let Ok(content) = fs::read_to_string(&git_head) {
465 if content.starts_with("ref: ") {
466 let ref_path = content.trim_start_matches("ref: ").trim();
468 let ref_file = self.repo_root.join(".git").join(ref_path);
469 if let Ok(hash) = fs::read_to_string(&ref_file) {
470 return Some(hash.trim().to_owned());
471 }
472 } else {
473 return Some(content.trim().to_owned());
475 }
476 }
477 None
478 }
479}