infiniloom_engine/index/builder/
core.rs

1//! Core index builder implementation.
2//!
3//! Contains the IndexBuilder struct and main build logic.
4
5use super::graph::GraphBuilder;
6use super::types::{BuildError, BuildOptions, ParsedFile, ParsedSymbol, THREAD_PARSER};
7use crate::index::convert::{convert_symbol_kind, convert_visibility};
8use crate::index::patterns::{
9    GO_IMPORT, JAVA_IMPORT, JS_IMPORT, JS_IMPORT_MULTILINE, JS_REQUIRE, PYTHON_FROM_IMPORT,
10    PYTHON_IMPORT, RUST_USE,
11};
12use crate::index::types::{
13    DepGraph, FileEntry, FileId, Import, IndexSymbol, Language, Span, SymbolId, SymbolIndex,
14};
15use crate::parser::Language as ParserLanguage;
16use ignore::WalkBuilder;
17use rayon::prelude::*;
18use regex::Regex;
19use std::collections::HashMap;
20use std::fs;
21use std::path::{Path, PathBuf};
22use std::time::{SystemTime, UNIX_EPOCH};
23
24/// Index builder
25pub struct IndexBuilder {
26    /// Repository root path
27    pub(super) repo_root: PathBuf,
28    /// Build options
29    pub(super) options: BuildOptions,
30}
31
32impl IndexBuilder {
33    /// Create a new index builder
34    pub fn new(repo_root: impl AsRef<Path>) -> Self {
35        Self { repo_root: repo_root.as_ref().to_path_buf(), options: BuildOptions::default() }
36    }
37
38    /// Set build options
39    pub fn with_options(mut self, options: BuildOptions) -> Self {
40        self.options = options;
41        self
42    }
43
44    /// Build the symbol index and dependency graph.
45    ///
46    /// This parses all source files in the repository, extracts symbols,
47    /// resolves imports, and computes PageRank scores.
48    ///
49    /// # Returns
50    ///
51    /// A tuple of (SymbolIndex, DepGraph) that can be used for fast
52    /// diff context generation.
53    #[must_use = "index should be used for context queries or saved to disk"]
54    pub fn build(&self) -> Result<(SymbolIndex, DepGraph), BuildError> {
55        use std::time::Instant;
56
57        if !self.repo_root.exists() {
58            return Err(BuildError::RepoNotFound(self.repo_root.clone()));
59        }
60
61        let repo_name = self
62            .repo_root
63            .file_name()
64            .and_then(|n| n.to_str())
65            .unwrap_or("unknown")
66            .to_owned();
67
68        // Collect files to index
69        let t0 = Instant::now();
70        let files = self.collect_files()?;
71        let collect_time = t0.elapsed();
72        tracing::info!("Found {} files to index", files.len());
73
74        // Parse files in parallel
75        let t1 = Instant::now();
76        let parsed_files = self.parse_files_parallel(&files)?;
77        let parse_time = t1.elapsed();
78        tracing::info!("Parsed {} files", parsed_files.len());
79
80        // Debug timing (when INFINILOOM_TIMING is set)
81        let show_timing = std::env::var("INFINILOOM_TIMING").is_ok();
82        if show_timing {
83            tracing::info!("  [timing] collect: {:?}", collect_time);
84            tracing::info!("  [timing] parse: {:?}", parse_time);
85        }
86
87        // Build the index
88        let mut index = SymbolIndex::new();
89        index.repo_name = repo_name;
90        index.created_at = SystemTime::now()
91            .duration_since(UNIX_EPOCH)
92            .map(|d| d.as_secs())
93            .unwrap_or(0);
94
95        // Try to get current git commit
96        index.commit_hash = self.get_current_commit();
97
98        // Assign IDs and build index
99        let mut symbol_id_counter = 0u32;
100        let mut file_path_to_id: HashMap<String, u32> = HashMap::new();
101        let mut symbol_calls: Vec<(u32, Vec<String>)> = Vec::new();
102        let mut symbol_parents: Vec<(u32, String)> = Vec::new();
103
104        for (file_id, parsed) in parsed_files.into_iter().enumerate() {
105            let file_id = file_id as u32;
106            file_path_to_id.insert(parsed.path.clone(), file_id);
107
108            let symbol_start = symbol_id_counter;
109
110            // Convert parsed symbols to index symbols
111            for sym in parsed.symbols {
112                index.symbols.push(IndexSymbol {
113                    id: SymbolId::new(symbol_id_counter),
114                    name: sym.name.clone(),
115                    kind: convert_symbol_kind(sym.kind),
116                    file_id: FileId::new(file_id),
117                    span: Span::new(sym.start_line, 0, sym.end_line, 0),
118                    signature: sym.signature,
119                    parent: None, // Will be resolved after all symbols are indexed
120                    visibility: convert_visibility(sym.visibility),
121                    docstring: sym.docstring,
122                });
123                // Store calls for later graph building (symbol_id -> call names)
124                if !sym.calls.is_empty() {
125                    symbol_calls.push((symbol_id_counter, sym.calls));
126                }
127                // Store parent name for later resolution
128                if let Some(parent_name) = sym.parent {
129                    symbol_parents.push((symbol_id_counter, parent_name));
130                }
131                symbol_id_counter += 1;
132            }
133
134            index.files.push(FileEntry {
135                id: FileId::new(file_id),
136                path: parsed.path,
137                language: parsed.language,
138                content_hash: parsed.content_hash,
139                symbols: symbol_start..symbol_id_counter,
140                imports: parsed.imports,
141                lines: parsed.lines,
142                tokens: parsed.tokens,
143            });
144        }
145
146        // Build lookup tables
147        let t2 = Instant::now();
148        index.rebuild_lookups();
149        let lookup_time = t2.elapsed();
150
151        // Resolve parent symbols
152        for (symbol_id, parent_name) in &symbol_parents {
153            // Find the parent symbol by name (in the same file)
154            let symbol = &index.symbols[*symbol_id as usize];
155            let file_id = symbol.file_id;
156            if let Some(parent_sym) = index
157                .symbols
158                .iter()
159                .find(|s| s.file_id == file_id && s.name == *parent_name && s.kind.is_scope())
160            {
161                index.symbols[*symbol_id as usize].parent = Some(parent_sym.id);
162            }
163        }
164
165        // Build dependency graph
166        let t3 = Instant::now();
167        let mut graph = DepGraph::new();
168        let graph_builder = GraphBuilder::new(&self.repo_root);
169        graph_builder.build_graph(&index, &file_path_to_id, &symbol_calls, &mut graph);
170        let graph_time = t3.elapsed();
171
172        // Compute PageRank if enabled
173        let mut pagerank_time = std::time::Duration::ZERO;
174        if self.options.compute_pagerank {
175            let t4 = Instant::now();
176            graph_builder.compute_pagerank(&index, &mut graph);
177            pagerank_time = t4.elapsed();
178        }
179
180        if show_timing {
181            tracing::info!("  [timing] lookups: {:?}", lookup_time);
182            tracing::info!("  [timing] graph: {:?}", graph_time);
183            tracing::info!("  [timing] pagerank: {:?}", pagerank_time);
184        }
185
186        Ok((index, graph))
187    }
188
189    /// Collect files to index using gitignore-aware walking
190    fn collect_files(&self) -> Result<Vec<PathBuf>, BuildError> {
191        let mut files = Vec::new();
192        // Clone exclude_dirs so the closure owns it (needs 'static lifetime for WalkBuilder)
193        let exclude_dirs = self.options.exclude_dirs.clone();
194
195        // Use ignore crate for gitignore-aware file walking
196        let walker = WalkBuilder::new(&self.repo_root)
197            .hidden(false) // Don't skip hidden files by default (we filter below)
198            .git_ignore(self.options.respect_gitignore)
199            .git_global(self.options.respect_gitignore)
200            .git_exclude(self.options.respect_gitignore)
201            .filter_entry(move |entry| {
202                let path = entry.path();
203                // Always skip .git directory
204                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
205                    if name == ".git" {
206                        return false;
207                    }
208                    // Skip excluded directories
209                    if path.is_dir() && exclude_dirs.iter().any(|dir| dir == name) {
210                        return false;
211                    }
212                    // Skip hidden directories (but not hidden files)
213                    if path.is_dir() && name.starts_with('.') {
214                        return false;
215                    }
216                }
217                true
218            })
219            .build();
220
221        for entry in walker.flatten() {
222            let path = entry.path();
223            if path.is_file() && self.should_index_file(path) {
224                files.push(path.to_path_buf());
225            }
226        }
227
228        Ok(files)
229    }
230
231    fn should_index_file(&self, path: &Path) -> bool {
232        // Check file size
233        if let Ok(metadata) = fs::metadata(path) {
234            if metadata.len() > self.options.max_file_size {
235                return false;
236            }
237        }
238
239        // Check extension
240        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
241        let lang = Language::from_extension(ext);
242
243        if lang == Language::Unknown {
244            return false;
245        }
246
247        // Check include filter
248        if !self.options.include_extensions.is_empty()
249            && !self
250                .options
251                .include_extensions
252                .iter()
253                .any(|entry| entry == ext)
254        {
255            return false;
256        }
257
258        true
259    }
260
261    /// Parse files in parallel
262    fn parse_files_parallel(&self, files: &[PathBuf]) -> Result<Vec<ParsedFile>, BuildError> {
263        let results: Vec<Result<ParsedFile, BuildError>> =
264            files.par_iter().map(|path| self.parse_file(path)).collect();
265
266        // Collect results, logging errors
267        let mut parsed = Vec::with_capacity(results.len());
268        for result in results {
269            match result {
270                Ok(f) => parsed.push(f),
271                Err(e) => tracing::warn!("Failed to parse file: {}", e),
272            }
273        }
274
275        Ok(parsed)
276    }
277
278    /// Parse a single file
279    fn parse_file(&self, path: &Path) -> Result<ParsedFile, BuildError> {
280        let content = fs::read_to_string(path)?;
281        let relative_path = path
282            .strip_prefix(&self.repo_root)
283            .unwrap_or(path)
284            .to_string_lossy()
285            .to_string();
286
287        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
288        let language = Language::from_extension(ext);
289
290        // Compute content hash
291        let content_hash = blake3::hash(content.as_bytes());
292
293        // Count lines
294        let lines = content.lines().count() as u32;
295
296        // Estimate tokens (simple approximation)
297        let tokens = (content.len() / 4) as u32;
298
299        // Parse symbols using tree-sitter - all 21 languages supported
300        let parser_lang = match language {
301            Language::Rust => Some(ParserLanguage::Rust),
302            Language::Python => Some(ParserLanguage::Python),
303            Language::JavaScript => Some(ParserLanguage::JavaScript),
304            Language::TypeScript => Some(ParserLanguage::TypeScript),
305            Language::Go => Some(ParserLanguage::Go),
306            Language::Java => Some(ParserLanguage::Java),
307            Language::C => Some(ParserLanguage::C),
308            Language::Cpp => Some(ParserLanguage::Cpp),
309            Language::CSharp => Some(ParserLanguage::CSharp),
310            Language::Ruby => Some(ParserLanguage::Ruby),
311            Language::Bash => Some(ParserLanguage::Bash),
312            Language::Php => Some(ParserLanguage::Php),
313            Language::Kotlin => Some(ParserLanguage::Kotlin),
314            Language::Swift => Some(ParserLanguage::Swift),
315            Language::Scala => Some(ParserLanguage::Scala),
316            Language::Haskell => Some(ParserLanguage::Haskell),
317            Language::Elixir => Some(ParserLanguage::Elixir),
318            Language::Clojure => Some(ParserLanguage::Clojure),
319            Language::OCaml => Some(ParserLanguage::OCaml),
320            Language::Lua => Some(ParserLanguage::Lua),
321            Language::R => Some(ParserLanguage::R),
322            Language::Unknown => None,
323        };
324
325        let mut symbols = Vec::new();
326        let imports = self.extract_imports(&content, language);
327
328        if let Some(lang) = parser_lang {
329            // Use thread-local parser to avoid re-initialization overhead
330            THREAD_PARSER.with(|parser_cell| {
331                let mut parser = parser_cell.borrow_mut();
332                if let Ok(parsed_symbols) = parser.parse(&content, lang) {
333                    for sym in parsed_symbols {
334                        symbols.push(ParsedSymbol {
335                            name: sym.name,
336                            kind: sym.kind,
337                            start_line: sym.start_line,
338                            end_line: sym.end_line,
339                            signature: sym.signature,
340                            docstring: sym.docstring,
341                            parent: sym.parent,
342                            visibility: sym.visibility,
343                            calls: sym.calls,
344                        });
345                    }
346                }
347            });
348        }
349
350        Ok(ParsedFile {
351            path: relative_path,
352            language,
353            content_hash: *content_hash.as_bytes(),
354            lines,
355            tokens,
356            symbols,
357            imports,
358        })
359    }
360
361    /// Extract import statements from source code using pre-compiled regexes
362    fn extract_imports(&self, content: &str, language: Language) -> Vec<Import> {
363        let mut imports = Vec::new();
364
365        if matches!(language, Language::JavaScript | Language::TypeScript) {
366            use std::collections::HashSet;
367
368            let mut seen_sources: HashSet<String> = HashSet::new();
369
370            // Line-based imports first (fast path)
371            let patterns: &[(&Regex, bool)] = &[(&JS_IMPORT, true), (&JS_REQUIRE, true)];
372            for (line_num, line) in content.lines().enumerate() {
373                for (re, check_external) in patterns {
374                    if let Some(captures) = re.captures(line) {
375                        if let Some(source) = captures.get(1) {
376                            let source_str = source.as_str().to_owned();
377                            if !seen_sources.insert(source_str.clone()) {
378                                continue;
379                            }
380                            let is_external = if *check_external {
381                                !source_str.starts_with('.')
382                                    && !source_str.starts_with('/')
383                                    && !source_str.starts_with("src/")
384                            } else {
385                                false
386                            };
387                            imports.push(Import {
388                                source: source_str,
389                                resolved_file: None,
390                                symbols: vec![],
391                                span: Span::new(line_num as u32 + 1, 0, line_num as u32 + 1, 0),
392                                is_external,
393                            });
394                        }
395                    }
396                }
397            }
398
399            // Multi-line imports (e.g., import { a, b } from 'x';)
400            for caps in JS_IMPORT_MULTILINE.captures_iter(content) {
401                if let Some(source) = caps.get(1) {
402                    let source_str = source.as_str().to_owned();
403                    if !seen_sources.insert(source_str.clone()) {
404                        continue;
405                    }
406                    let line_num = content[..source.start()].matches('\n').count() as u32 + 1;
407                    let is_external = !source_str.starts_with('.')
408                        && !source_str.starts_with('/')
409                        && !source_str.starts_with("src/");
410                    imports.push(Import {
411                        source: source_str,
412                        resolved_file: None,
413                        symbols: vec![],
414                        span: Span::new(line_num, 0, line_num, 0),
415                        is_external,
416                    });
417                }
418            }
419
420            return imports;
421        }
422
423        // Get pre-compiled regexes for this language (from shared patterns module)
424        let patterns: &[(&Regex, bool)] = match language {
425            Language::Python => &[(&PYTHON_IMPORT, false), (&PYTHON_FROM_IMPORT, false)],
426            Language::Rust => &[(&RUST_USE, false)],
427            Language::Go => &[(&GO_IMPORT, true)],
428            Language::Java => &[(&JAVA_IMPORT, false)],
429            _ => return imports, // Early return for unsupported languages
430        };
431
432        for (line_num, line) in content.lines().enumerate() {
433            for (re, check_external) in patterns {
434                if let Some(captures) = re.captures(line) {
435                    if let Some(source) = captures.get(1) {
436                        let source_str = source.as_str().to_owned();
437                        let is_external = if *check_external {
438                            // Check if it looks like an external package
439                            !source_str.starts_with('.')
440                                && !source_str.starts_with('/')
441                                && !source_str.starts_with("src/")
442                        } else {
443                            false
444                        };
445
446                        imports.push(Import {
447                            source: source_str,
448                            resolved_file: None,
449                            symbols: vec![],
450                            span: Span::new(line_num as u32 + 1, 0, line_num as u32 + 1, 0),
451                            is_external,
452                        });
453                    }
454                }
455            }
456        }
457
458        imports
459    }
460
461    /// Get current git commit hash
462    pub(super) fn get_current_commit(&self) -> Option<String> {
463        let git_head = self.repo_root.join(".git/HEAD");
464        if let Ok(content) = fs::read_to_string(&git_head) {
465            if content.starts_with("ref: ") {
466                // It's a reference to a branch
467                let ref_path = content.trim_start_matches("ref: ").trim();
468                let ref_file = self.repo_root.join(".git").join(ref_path);
469                if let Ok(hash) = fs::read_to_string(&ref_file) {
470                    return Some(hash.trim().to_owned());
471                }
472            } else {
473                // It's a direct commit hash
474                return Some(content.trim().to_owned());
475            }
476        }
477        None
478    }
479}