infiniloom_engine/index/
builder.rs

1//! Index builder - constructs the symbol index and dependency graph.
2//!
3//! This module handles parsing files, extracting symbols, resolving imports,
4//! and building the dependency graph.
5
6use super::convert::{convert_symbol_kind, convert_visibility};
7use super::patterns::{
8    GO_IMPORT, JAVA_IMPORT, JS_IMPORT, JS_IMPORT_MULTILINE, JS_REQUIRE, PYTHON_FROM_IMPORT,
9    PYTHON_IMPORT, RUST_USE,
10};
11use super::types::{
12    DepGraph, FileEntry, FileId, Import, IndexSymbol, Language, Span, SymbolId, SymbolIndex,
13};
14use crate::parser::{Language as ParserLanguage, Parser};
15use crate::SymbolKind;
16use ignore::WalkBuilder;
17use once_cell::sync::Lazy;
18use rayon::prelude::*;
19use regex::Regex;
20use std::cell::RefCell;
21use std::collections::{HashMap, HashSet};
22use std::fs;
23use std::path::{Path, PathBuf};
24use std::time::{SystemTime, UNIX_EPOCH};
25use thiserror::Error;
26
27// Thread-local parser storage to avoid re-initialization
28thread_local! {
29    static THREAD_PARSER: RefCell<Parser> = RefCell::new(Parser::new());
30}
31
32/// Errors that can occur during index building
33#[derive(Error, Debug)]
34pub enum BuildError {
35    #[error("IO error: {0}")]
36    Io(#[from] std::io::Error),
37
38    #[error("Parse error in {file}: {message}")]
39    Parse { file: String, message: String },
40
41    #[error("Repository not found: {0}")]
42    RepoNotFound(PathBuf),
43
44    #[error("Git error: {0}")]
45    Git(String),
46}
47
48static IDENT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[A-Za-z_][A-Za-z0-9_]*").unwrap());
49
50static COMMON_KEYWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
51    [
52        "if",
53        "else",
54        "for",
55        "while",
56        "return",
57        "break",
58        "continue",
59        "match",
60        "case",
61        "switch",
62        "default",
63        "try",
64        "catch",
65        "throw",
66        "finally",
67        "yield",
68        "await",
69        "async",
70        "new",
71        "in",
72        "of",
73        "do",
74        "fn",
75        "function",
76        "def",
77        "class",
78        "struct",
79        "enum",
80        "trait",
81        "interface",
82        "type",
83        "impl",
84        "let",
85        "var",
86        "const",
87        "static",
88        "public",
89        "private",
90        "protected",
91        "internal",
92        "use",
93        "import",
94        "from",
95        "package",
96        "module",
97        "export",
98        "super",
99        "self",
100        "this",
101        "crate",
102        "pub",
103        "mod",
104    ]
105    .into_iter()
106    .collect()
107});
108
109/// Options for index building
110#[derive(Debug, Clone)]
111pub struct BuildOptions {
112    /// Respect .gitignore files
113    pub respect_gitignore: bool,
114    /// Maximum file size to index (in bytes)
115    pub max_file_size: u64,
116    /// File extensions to include (empty = all supported)
117    pub include_extensions: Vec<String>,
118    /// Directories to exclude
119    pub exclude_dirs: Vec<String>,
120    /// Whether to compute PageRank
121    pub compute_pagerank: bool,
122}
123
124impl Default for BuildOptions {
125    fn default() -> Self {
126        Self {
127            respect_gitignore: true,
128            max_file_size: 10 * 1024 * 1024, // 10 MB
129            include_extensions: vec![],
130            exclude_dirs: vec![
131                "node_modules".into(),
132                ".git".into(),
133                "target".into(),
134                "build".into(),
135                "dist".into(),
136                "__pycache__".into(),
137                ".venv".into(),
138                "venv".into(),
139            ],
140            compute_pagerank: true,
141        }
142    }
143}
144
145/// Index builder
146pub struct IndexBuilder {
147    /// Repository root path
148    repo_root: PathBuf,
149    /// Build options
150    options: BuildOptions,
151}
152
153impl IndexBuilder {
154    /// Create a new index builder
155    pub fn new(repo_root: impl AsRef<Path>) -> Self {
156        Self { repo_root: repo_root.as_ref().to_path_buf(), options: BuildOptions::default() }
157    }
158
159    /// Set build options
160    pub fn with_options(mut self, options: BuildOptions) -> Self {
161        self.options = options;
162        self
163    }
164
165    /// Build the symbol index and dependency graph.
166    ///
167    /// This parses all source files in the repository, extracts symbols,
168    /// resolves imports, and computes PageRank scores.
169    ///
170    /// # Returns
171    ///
172    /// A tuple of (SymbolIndex, DepGraph) that can be used for fast
173    /// diff context generation.
174    #[must_use = "index should be used for context queries or saved to disk"]
175    pub fn build(&self) -> Result<(SymbolIndex, DepGraph), BuildError> {
176        use std::time::Instant;
177
178        if !self.repo_root.exists() {
179            return Err(BuildError::RepoNotFound(self.repo_root.clone()));
180        }
181
182        let repo_name = self
183            .repo_root
184            .file_name()
185            .and_then(|n| n.to_str())
186            .unwrap_or("unknown")
187            .to_owned();
188
189        // Collect files to index
190        let t0 = Instant::now();
191        let files = self.collect_files()?;
192        let collect_time = t0.elapsed();
193        log::info!("Found {} files to index", files.len());
194
195        // Parse files in parallel
196        let t1 = Instant::now();
197        let parsed_files = self.parse_files_parallel(&files)?;
198        let parse_time = t1.elapsed();
199        log::info!("Parsed {} files", parsed_files.len());
200
201        // Debug timing (when INFINILOOM_TIMING is set)
202        let show_timing = std::env::var("INFINILOOM_TIMING").is_ok();
203        if show_timing {
204            log::info!("  [timing] collect: {:?}", collect_time);
205            log::info!("  [timing] parse: {:?}", parse_time);
206        }
207
208        // Build the index
209        let mut index = SymbolIndex::new();
210        index.repo_name = repo_name;
211        index.created_at = SystemTime::now()
212            .duration_since(UNIX_EPOCH)
213            .map(|d| d.as_secs())
214            .unwrap_or(0);
215
216        // Try to get current git commit
217        index.commit_hash = self.get_current_commit();
218
219        // Assign IDs and build index
220        let mut symbol_id_counter = 0u32;
221        let mut file_path_to_id: HashMap<String, u32> = HashMap::new();
222        let mut symbol_calls: Vec<(u32, Vec<String>)> = Vec::new();
223        let mut symbol_parents: Vec<(u32, String)> = Vec::new();
224
225        for (file_id, parsed) in parsed_files.into_iter().enumerate() {
226            let file_id = file_id as u32;
227            file_path_to_id.insert(parsed.path.clone(), file_id);
228
229            let symbol_start = symbol_id_counter;
230
231            // Convert parsed symbols to index symbols
232            for sym in parsed.symbols {
233                index.symbols.push(IndexSymbol {
234                    id: SymbolId::new(symbol_id_counter),
235                    name: sym.name.clone(),
236                    kind: convert_symbol_kind(sym.kind),
237                    file_id: FileId::new(file_id),
238                    span: Span::new(sym.start_line, 0, sym.end_line, 0),
239                    signature: sym.signature,
240                    parent: None, // Will be resolved after all symbols are indexed
241                    visibility: convert_visibility(sym.visibility),
242                    docstring: sym.docstring,
243                });
244                // Store calls for later graph building (symbol_id -> call names)
245                if !sym.calls.is_empty() {
246                    symbol_calls.push((symbol_id_counter, sym.calls));
247                }
248                // Store parent name for later resolution
249                if let Some(parent_name) = sym.parent {
250                    symbol_parents.push((symbol_id_counter, parent_name));
251                }
252                symbol_id_counter += 1;
253            }
254
255            index.files.push(FileEntry {
256                id: FileId::new(file_id),
257                path: parsed.path,
258                language: parsed.language,
259                content_hash: parsed.content_hash,
260                symbols: symbol_start..symbol_id_counter,
261                imports: parsed.imports,
262                lines: parsed.lines,
263                tokens: parsed.tokens,
264            });
265        }
266
267        // Build lookup tables
268        let t2 = Instant::now();
269        index.rebuild_lookups();
270        let lookup_time = t2.elapsed();
271
272        // Resolve parent symbols
273        for (symbol_id, parent_name) in &symbol_parents {
274            // Find the parent symbol by name (in the same file)
275            let symbol = &index.symbols[*symbol_id as usize];
276            let file_id = symbol.file_id;
277            if let Some(parent_sym) = index
278                .symbols
279                .iter()
280                .find(|s| s.file_id == file_id && s.name == *parent_name && s.kind.is_scope())
281            {
282                index.symbols[*symbol_id as usize].parent = Some(parent_sym.id);
283            }
284        }
285
286        // Build dependency graph
287        let t3 = Instant::now();
288        let mut graph = DepGraph::new();
289        self.build_graph(&index, &file_path_to_id, &symbol_calls, &mut graph);
290        let graph_time = t3.elapsed();
291
292        // Compute PageRank if enabled
293        let mut pagerank_time = std::time::Duration::ZERO;
294        if self.options.compute_pagerank {
295            let t4 = Instant::now();
296            self.compute_pagerank(&index, &mut graph);
297            pagerank_time = t4.elapsed();
298        }
299
300        if show_timing {
301            log::info!("  [timing] lookups: {:?}", lookup_time);
302            log::info!("  [timing] graph: {:?}", graph_time);
303            log::info!("  [timing] pagerank: {:?}", pagerank_time);
304        }
305
306        Ok((index, graph))
307    }
308
309    /// Collect files to index using gitignore-aware walking
310    fn collect_files(&self) -> Result<Vec<PathBuf>, BuildError> {
311        let mut files = Vec::new();
312        // Clone exclude_dirs so the closure owns it (needs 'static lifetime for WalkBuilder)
313        let exclude_dirs = self.options.exclude_dirs.clone();
314
315        // Use ignore crate for gitignore-aware file walking
316        let walker = WalkBuilder::new(&self.repo_root)
317            .hidden(false) // Don't skip hidden files by default (we filter below)
318            .git_ignore(self.options.respect_gitignore)
319            .git_global(self.options.respect_gitignore)
320            .git_exclude(self.options.respect_gitignore)
321            .filter_entry(move |entry| {
322                let path = entry.path();
323                // Always skip .git directory
324                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
325                    if name == ".git" {
326                        return false;
327                    }
328                    // Skip excluded directories
329                    if path.is_dir() && exclude_dirs.iter().any(|dir| dir == name) {
330                        return false;
331                    }
332                    // Skip hidden directories (but not hidden files)
333                    if path.is_dir() && name.starts_with('.') {
334                        return false;
335                    }
336                }
337                true
338            })
339            .build();
340
341        for entry in walker.flatten() {
342            let path = entry.path();
343            if path.is_file() && self.should_index_file(path) {
344                files.push(path.to_path_buf());
345            }
346        }
347
348        Ok(files)
349    }
350
351    fn should_index_file(&self, path: &Path) -> bool {
352        // Check file size
353        if let Ok(metadata) = fs::metadata(path) {
354            if metadata.len() > self.options.max_file_size {
355                return false;
356            }
357        }
358
359        // Check extension
360        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
361        let lang = Language::from_extension(ext);
362
363        if lang == Language::Unknown {
364            return false;
365        }
366
367        // Check include filter
368        if !self.options.include_extensions.is_empty()
369            && !self
370                .options
371                .include_extensions
372                .iter()
373                .any(|entry| entry == ext)
374        {
375            return false;
376        }
377
378        true
379    }
380
381    /// Parse files in parallel
382    fn parse_files_parallel(&self, files: &[PathBuf]) -> Result<Vec<ParsedFile>, BuildError> {
383        let results: Vec<Result<ParsedFile, BuildError>> =
384            files.par_iter().map(|path| self.parse_file(path)).collect();
385
386        // Collect results, logging errors
387        let mut parsed = Vec::with_capacity(results.len());
388        for result in results {
389            match result {
390                Ok(f) => parsed.push(f),
391                Err(e) => log::warn!("Failed to parse file: {}", e),
392            }
393        }
394
395        Ok(parsed)
396    }
397
398    /// Parse a single file
399    fn parse_file(&self, path: &Path) -> Result<ParsedFile, BuildError> {
400        let content = fs::read_to_string(path)?;
401        let relative_path = path
402            .strip_prefix(&self.repo_root)
403            .unwrap_or(path)
404            .to_string_lossy()
405            .to_string();
406
407        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
408        let language = Language::from_extension(ext);
409
410        // Compute content hash
411        let content_hash = blake3::hash(content.as_bytes());
412
413        // Count lines
414        let lines = content.lines().count() as u32;
415
416        // Estimate tokens (simple approximation)
417        let tokens = (content.len() / 4) as u32;
418
419        // Parse symbols using tree-sitter - all 21 languages supported
420        let parser_lang = match language {
421            Language::Rust => Some(ParserLanguage::Rust),
422            Language::Python => Some(ParserLanguage::Python),
423            Language::JavaScript => Some(ParserLanguage::JavaScript),
424            Language::TypeScript => Some(ParserLanguage::TypeScript),
425            Language::Go => Some(ParserLanguage::Go),
426            Language::Java => Some(ParserLanguage::Java),
427            Language::C => Some(ParserLanguage::C),
428            Language::Cpp => Some(ParserLanguage::Cpp),
429            Language::CSharp => Some(ParserLanguage::CSharp),
430            Language::Ruby => Some(ParserLanguage::Ruby),
431            Language::Bash => Some(ParserLanguage::Bash),
432            Language::Php => Some(ParserLanguage::Php),
433            Language::Kotlin => Some(ParserLanguage::Kotlin),
434            Language::Swift => Some(ParserLanguage::Swift),
435            Language::Scala => Some(ParserLanguage::Scala),
436            Language::Haskell => Some(ParserLanguage::Haskell),
437            Language::Elixir => Some(ParserLanguage::Elixir),
438            Language::Clojure => Some(ParserLanguage::Clojure),
439            Language::OCaml => Some(ParserLanguage::OCaml),
440            Language::Lua => Some(ParserLanguage::Lua),
441            Language::R => Some(ParserLanguage::R),
442            Language::Unknown => None,
443        };
444
445        let mut symbols = Vec::new();
446        let imports = self.extract_imports(&content, language);
447
448        if let Some(lang) = parser_lang {
449            // Use thread-local parser to avoid re-initialization overhead
450            THREAD_PARSER.with(|parser_cell| {
451                let mut parser = parser_cell.borrow_mut();
452                if let Ok(parsed_symbols) = parser.parse(&content, lang) {
453                    for sym in parsed_symbols {
454                        symbols.push(ParsedSymbol {
455                            name: sym.name,
456                            kind: sym.kind,
457                            start_line: sym.start_line,
458                            end_line: sym.end_line,
459                            signature: sym.signature,
460                            docstring: sym.docstring,
461                            parent: sym.parent,
462                            visibility: sym.visibility,
463                            calls: sym.calls,
464                        });
465                    }
466                }
467            });
468        }
469
470        Ok(ParsedFile {
471            path: relative_path,
472            language,
473            content_hash: *content_hash.as_bytes(),
474            lines,
475            tokens,
476            symbols,
477            imports,
478        })
479    }
480
481    /// Extract import statements from source code using pre-compiled regexes
482    fn extract_imports(&self, content: &str, language: Language) -> Vec<Import> {
483        let mut imports = Vec::new();
484
485        if matches!(language, Language::JavaScript | Language::TypeScript) {
486            use std::collections::HashSet;
487
488            let mut seen_sources: HashSet<String> = HashSet::new();
489
490            // Line-based imports first (fast path)
491            let patterns: &[(&Regex, bool)] = &[(&JS_IMPORT, true), (&JS_REQUIRE, true)];
492            for (line_num, line) in content.lines().enumerate() {
493                for (re, check_external) in patterns {
494                    if let Some(captures) = re.captures(line) {
495                        if let Some(source) = captures.get(1) {
496                            let source_str = source.as_str().to_owned();
497                            if !seen_sources.insert(source_str.clone()) {
498                                continue;
499                            }
500                            let is_external = if *check_external {
501                                !source_str.starts_with('.')
502                                    && !source_str.starts_with('/')
503                                    && !source_str.starts_with("src/")
504                            } else {
505                                false
506                            };
507                            imports.push(Import {
508                                source: source_str,
509                                resolved_file: None,
510                                symbols: vec![],
511                                span: Span::new(line_num as u32 + 1, 0, line_num as u32 + 1, 0),
512                                is_external,
513                            });
514                        }
515                    }
516                }
517            }
518
519            // Multi-line imports (e.g., import { a, b } from 'x';)
520            for caps in JS_IMPORT_MULTILINE.captures_iter(content) {
521                if let Some(source) = caps.get(1) {
522                    let source_str = source.as_str().to_owned();
523                    if !seen_sources.insert(source_str.clone()) {
524                        continue;
525                    }
526                    let line_num = content[..source.start()].matches('\n').count() as u32 + 1;
527                    let is_external = !source_str.starts_with('.')
528                        && !source_str.starts_with('/')
529                        && !source_str.starts_with("src/");
530                    imports.push(Import {
531                        source: source_str,
532                        resolved_file: None,
533                        symbols: vec![],
534                        span: Span::new(line_num, 0, line_num, 0),
535                        is_external,
536                    });
537                }
538            }
539
540            return imports;
541        }
542
543        // Get pre-compiled regexes for this language (from shared patterns module)
544        let patterns: &[(&Regex, bool)] = match language {
545            Language::Python => &[(&PYTHON_IMPORT, false), (&PYTHON_FROM_IMPORT, false)],
546            Language::Rust => &[(&RUST_USE, false)],
547            Language::Go => &[(&GO_IMPORT, true)],
548            Language::Java => &[(&JAVA_IMPORT, false)],
549            _ => return imports, // Early return for unsupported languages
550        };
551
552        for (line_num, line) in content.lines().enumerate() {
553            for (re, check_external) in patterns {
554                if let Some(captures) = re.captures(line) {
555                    if let Some(source) = captures.get(1) {
556                        let source_str = source.as_str().to_owned();
557                        let is_external = if *check_external {
558                            // Check if it looks like an external package
559                            !source_str.starts_with('.')
560                                && !source_str.starts_with('/')
561                                && !source_str.starts_with("src/")
562                        } else {
563                            false
564                        };
565
566                        imports.push(Import {
567                            source: source_str,
568                            resolved_file: None,
569                            symbols: vec![],
570                            span: Span::new(line_num as u32 + 1, 0, line_num as u32 + 1, 0),
571                            is_external,
572                        });
573                    }
574                }
575            }
576        }
577
578        imports
579    }
580
581    /// Build dependency graph from index
582    fn build_graph(
583        &self,
584        index: &SymbolIndex,
585        file_path_to_id: &HashMap<String, u32>,
586        symbol_calls: &[(u32, Vec<String>)],
587        graph: &mut DepGraph,
588    ) {
589        let mut file_imports_by_file: HashMap<u32, Vec<u32>> = HashMap::new();
590
591        // Resolve file imports and build edges
592        for file in &index.files {
593            for import in &file.imports {
594                // Try to resolve the import to a file in the repository
595                if let Some(resolved_id) =
596                    self.resolve_import(&import.source, &file.path, file_path_to_id)
597                {
598                    graph.add_file_import(file.id.as_u32(), resolved_id);
599                    file_imports_by_file
600                        .entry(file.id.as_u32())
601                        .or_default()
602                        .push(resolved_id);
603                }
604            }
605        }
606
607        // Build symbol name to ID lookup for call resolution
608        let mut symbol_name_to_ids: HashMap<&str, Vec<u32>> = HashMap::new();
609        for sym in &index.symbols {
610            symbol_name_to_ids
611                .entry(&sym.name)
612                .or_default()
613                .push(sym.id.as_u32());
614        }
615
616        // Resolve function calls to symbol IDs and add call edges
617        for (caller_id, call_names) in symbol_calls {
618            let caller = &index.symbols[*caller_id as usize];
619            let caller_file_id = caller.file_id;
620            let imported_file_ids = file_imports_by_file
621                .get(&caller_file_id.as_u32())
622                .map(|ids| ids.iter().copied().collect::<HashSet<u32>>());
623
624            for call_name in call_names {
625                if let Some(callee_ids) = symbol_name_to_ids.get(call_name.as_str()) {
626                    // Prefer symbols in the same file, then any match
627                    let callee_id = callee_ids
628                        .iter()
629                        .find(|&&id| index.symbols[id as usize].file_id == caller_file_id)
630                        .or_else(|| {
631                            imported_file_ids.as_ref().and_then(|imports| {
632                                callee_ids.iter().find(|&&id| {
633                                    imports.contains(&index.symbols[id as usize].file_id.as_u32())
634                                })
635                            })
636                        })
637                        .or_else(|| callee_ids.first())
638                        .copied();
639
640                    if let Some(callee_id) = callee_id {
641                        // Don't add self-calls
642                        if callee_id != *caller_id {
643                            graph.add_call(*caller_id, callee_id);
644                        }
645                    }
646                }
647            }
648        }
649
650        self.add_symbol_reference_edges(index, &file_imports_by_file, &symbol_name_to_ids, graph);
651    }
652
653    fn add_symbol_reference_edges(
654        &self,
655        index: &SymbolIndex,
656        file_imports_by_file: &HashMap<u32, Vec<u32>>,
657        symbol_name_to_ids: &HashMap<&str, Vec<u32>>,
658        graph: &mut DepGraph,
659    ) {
660        let mut added: HashSet<(u32, u32)> = HashSet::new();
661
662        for file in &index.files {
663            let content = match fs::read_to_string(self.repo_root.join(&file.path)) {
664                Ok(content) => content,
665                Err(_) => continue,
666            };
667
668            let imported_file_ids = file_imports_by_file
669                .get(&file.id.as_u32())
670                .map(|ids| ids.iter().copied().collect::<HashSet<u32>>());
671
672            for (line_idx, line) in content.lines().enumerate() {
673                if self.should_skip_reference_line(line, file.language) {
674                    continue;
675                }
676                let line_no = line_idx as u32 + 1;
677                let referencer = match index.find_symbol_at_line(file.id, line_no) {
678                    Some(symbol) => symbol,
679                    None => continue,
680                };
681
682                for mat in IDENT_RE.find_iter(line) {
683                    let name = mat.as_str();
684                    if name.len() <= 1 || self.is_reference_keyword(name) {
685                        continue;
686                    }
687                    if referencer.span.start_line == line_no && referencer.name == name {
688                        continue;
689                    }
690
691                    let target_id = symbol_name_to_ids.get(name).and_then(|candidate_ids| {
692                        candidate_ids
693                            .iter()
694                            .find(|&&id| index.symbols[id as usize].file_id == file.id)
695                            .or_else(|| {
696                                imported_file_ids.as_ref().and_then(|imports| {
697                                    candidate_ids.iter().find(|&&id| {
698                                        imports
699                                            .contains(&index.symbols[id as usize].file_id.as_u32())
700                                    })
701                                })
702                            })
703                            .or_else(|| candidate_ids.first())
704                            .copied()
705                    });
706
707                    if let Some(target_id) = target_id {
708                        if target_id != referencer.id.as_u32()
709                            && added.insert((referencer.id.as_u32(), target_id))
710                        {
711                            graph.add_symbol_ref(referencer.id.as_u32(), target_id);
712                        }
713                    }
714                }
715            }
716        }
717    }
718
719    fn should_skip_reference_line(&self, line: &str, language: Language) -> bool {
720        let trimmed = line.trim_start();
721        if trimmed.is_empty() {
722            return true;
723        }
724
725        let comment_prefixes: &[&str] = match language {
726            Language::Python | Language::R => &["#"],
727            Language::Bash => &["#"],
728            Language::Ruby => &["#"],
729            Language::Lua => &["--"],
730            Language::JavaScript
731            | Language::TypeScript
732            | Language::C
733            | Language::Cpp
734            | Language::CSharp
735            | Language::Go
736            | Language::Java
737            | Language::Php
738            | Language::Kotlin
739            | Language::Swift
740            | Language::Scala => &["//"],
741            _ => &["//"],
742        };
743
744        if comment_prefixes.iter().any(|p| trimmed.starts_with(p)) {
745            return true;
746        }
747
748        let import_prefixes: &[&str] = match language {
749            Language::Python => &["import ", "from "],
750            Language::Rust => &["use "],
751            Language::Go => &["import "],
752            Language::Java => &["import "],
753            Language::JavaScript | Language::TypeScript => &["import ", "export ", "require("],
754            _ => &[],
755        };
756
757        import_prefixes.iter().any(|p| trimmed.starts_with(p))
758    }
759
760    fn is_reference_keyword(&self, name: &str) -> bool {
761        COMMON_KEYWORDS.contains(name)
762    }
763
764    /// Resolve an import path to a file ID
765    ///
766    /// Handles both absolute and relative imports:
767    /// - `./utils` resolves relative to the importing file's directory
768    /// - `../shared` resolves to parent directory
769    /// - `module` resolves using various strategies (src/, extensions, etc.)
770    fn resolve_import(
771        &self,
772        source: &str,
773        importing_file: &str,
774        file_path_to_id: &HashMap<String, u32>,
775    ) -> Option<u32> {
776        // Handle relative imports (./foo, ../bar)
777        if source.starts_with("./") || source.starts_with("../") {
778            let import_dir = Path::new(importing_file).parent().unwrap_or(Path::new(""));
779
780            // Strip leading ./ for resolution
781            let relative_source = source.strip_prefix("./").unwrap_or(source);
782
783            // Resolve the relative path
784            let resolved = import_dir.join(relative_source);
785            let resolved_str = resolved.to_string_lossy();
786            let resolved_str = resolved_str.as_ref();
787
788            // Try different extensions for the relative path
789            let relative_candidates = [
790                resolved_str.to_owned(),
791                format!("{}.ts", resolved_str),
792                format!("{}.js", resolved_str),
793                format!("{}.tsx", resolved_str),
794                format!("{}.jsx", resolved_str),
795                format!("{}/index.ts", resolved_str),
796                format!("{}/index.js", resolved_str),
797                format!("{}.py", resolved_str),
798                format!("{}/__init__.py", resolved_str),
799            ];
800
801            for candidate in relative_candidates {
802                // Normalize path (remove ../ segments)
803                let normalized = self.normalize_path(&candidate);
804                if let Some(&id) = file_path_to_id.get(&normalized) {
805                    return Some(id);
806                }
807            }
808        }
809
810        // Try absolute resolution strategies
811        let candidates = [
812            source.to_owned(),
813            format!("{}.rs", source.replace("::", "/")),
814            format!("{}/mod.rs", source.replace("::", "/")),
815            format!("{}.py", source.replace(".", "/")),
816            format!("{}/__init__.py", source.replace(".", "/")),
817            format!("{}.ts", source),
818            format!("{}.js", source),
819            format!("{}/index.ts", source),
820            format!("{}/index.js", source),
821            format!("src/{}.rs", source.replace("::", "/")),
822            format!("src/{}.py", source.replace(".", "/")),
823            format!("src/{}.ts", source),
824            format!("src/{}.js", source),
825        ];
826
827        for candidate in candidates {
828            if let Some(&id) = file_path_to_id.get(&candidate) {
829                return Some(id);
830            }
831        }
832
833        None
834    }
835
836    /// Normalize a path by resolving . and .. segments
837    fn normalize_path(&self, path: &str) -> String {
838        let mut parts: Vec<&str> = Vec::new();
839        for part in path.split('/') {
840            match part {
841                "" | "." => continue,
842                ".." => {
843                    parts.pop();
844                },
845                _ => parts.push(part),
846            }
847        }
848        parts.join("/")
849    }
850
851    /// Compute PageRank for files and symbols
852    fn compute_pagerank(&self, index: &SymbolIndex, graph: &mut DepGraph) {
853        // Compute file-level PageRank
854        self.compute_file_pagerank(index, graph);
855
856        // Compute symbol-level PageRank
857        self.compute_symbol_pagerank(index, graph);
858    }
859
860    /// Compute PageRank for files based on import graph
861    fn compute_file_pagerank(&self, index: &SymbolIndex, graph: &mut DepGraph) {
862        let n = index.files.len();
863        if n == 0 {
864            return;
865        }
866
867        let damping = 0.85f32;
868        let iterations = 20;
869        let initial_rank = 1.0 / n as f32;
870
871        let mut ranks: Vec<f32> = vec![initial_rank; n];
872        let mut new_ranks: Vec<f32> = vec![0.0; n];
873
874        // Build adjacency for PageRank
875        let mut outgoing: Vec<Vec<u32>> = vec![vec![]; n];
876        for &(from, to) in &graph.file_imports {
877            if (from as usize) < n && (to as usize) < n {
878                outgoing[from as usize].push(to);
879            }
880        }
881
882        for _ in 0..iterations {
883            // Reset new ranks
884            for r in &mut new_ranks {
885                *r = (1.0 - damping) / n as f32;
886            }
887
888            // Distribute rank
889            for (i, neighbors) in outgoing.iter().enumerate() {
890                if !neighbors.is_empty() {
891                    let contribution = damping * ranks[i] / neighbors.len() as f32;
892                    for &j in neighbors {
893                        new_ranks[j as usize] += contribution;
894                    }
895                } else {
896                    // Dangling node: distribute to all
897                    let contribution = damping * ranks[i] / n as f32;
898                    for r in &mut new_ranks {
899                        *r += contribution;
900                    }
901                }
902            }
903
904            std::mem::swap(&mut ranks, &mut new_ranks);
905        }
906
907        graph.file_pagerank = ranks;
908    }
909
910    /// Compute PageRank for symbols based on call graph
911    fn compute_symbol_pagerank(&self, index: &SymbolIndex, graph: &mut DepGraph) {
912        let n = index.symbols.len();
913        if n == 0 {
914            graph.symbol_pagerank = Vec::new();
915            return;
916        }
917
918        let damping = 0.85f32;
919        let iterations = 20;
920        let initial_rank = 1.0 / n as f32;
921
922        let mut ranks: Vec<f32> = vec![initial_rank; n];
923        let mut new_ranks: Vec<f32> = vec![0.0; n];
924
925        // Build adjacency for symbol PageRank using call graph
926        // A symbol's importance is determined by how many other symbols call it
927        let mut outgoing: Vec<Vec<u32>> = vec![vec![]; n];
928        for &(caller, callee) in &graph.calls {
929            if (caller as usize) < n && (callee as usize) < n {
930                outgoing[caller as usize].push(callee);
931            }
932        }
933
934        // Also consider symbol references
935        for &(from, to) in &graph.symbol_refs {
936            if (from as usize) < n && (to as usize) < n {
937                // Avoid duplicate edges
938                if !outgoing[from as usize].contains(&to) {
939                    outgoing[from as usize].push(to);
940                }
941            }
942        }
943
944        for _ in 0..iterations {
945            // Reset new ranks
946            for r in &mut new_ranks {
947                *r = (1.0 - damping) / n as f32;
948            }
949
950            // Distribute rank
951            for (i, neighbors) in outgoing.iter().enumerate() {
952                if !neighbors.is_empty() {
953                    let contribution = damping * ranks[i] / neighbors.len() as f32;
954                    for &j in neighbors {
955                        new_ranks[j as usize] += contribution;
956                    }
957                } else {
958                    // Dangling node: distribute to all (but with smaller contribution)
959                    let contribution = damping * ranks[i] / n as f32;
960                    for r in &mut new_ranks {
961                        *r += contribution;
962                    }
963                }
964            }
965
966            std::mem::swap(&mut ranks, &mut new_ranks);
967        }
968
969        graph.symbol_pagerank = ranks;
970    }
971
972    /// Get current git commit hash
973    fn get_current_commit(&self) -> Option<String> {
974        let git_head = self.repo_root.join(".git/HEAD");
975        if let Ok(content) = fs::read_to_string(&git_head) {
976            if content.starts_with("ref: ") {
977                // It's a reference to a branch
978                let ref_path = content.trim_start_matches("ref: ").trim();
979                let ref_file = self.repo_root.join(".git").join(ref_path);
980                if let Ok(hash) = fs::read_to_string(&ref_file) {
981                    return Some(hash.trim().to_owned());
982                }
983            } else {
984                // It's a direct commit hash
985                return Some(content.trim().to_owned());
986            }
987        }
988        None
989    }
990}
991
992/// Intermediate parsed file structure
993struct ParsedFile {
994    path: String,
995    language: Language,
996    content_hash: [u8; 32],
997    lines: u32,
998    tokens: u32,
999    symbols: Vec<ParsedSymbol>,
1000    imports: Vec<Import>,
1001}
1002
1003/// Intermediate parsed symbol
1004struct ParsedSymbol {
1005    name: String,
1006    kind: SymbolKind,
1007    start_line: u32,
1008    end_line: u32,
1009    signature: Option<String>,
1010    docstring: Option<String>,
1011    parent: Option<String>,
1012    visibility: crate::types::Visibility,
1013    calls: Vec<String>,
1014}
1015
1016#[cfg(test)]
1017mod tests {
1018    use super::*;
1019    use std::fs;
1020    use tempfile::TempDir;
1021
1022    #[test]
1023    fn test_build_simple_index() {
1024        let tmp = TempDir::new().unwrap();
1025
1026        // Create test files directly in tmp root (simpler test)
1027        fs::write(
1028            tmp.path().join("main.rs"),
1029            r#"fn main() {
1030    println!("Hello, world!");
1031    helper();
1032}
1033
1034fn helper() {
1035    // Do something
1036}
1037"#,
1038        )
1039        .unwrap();
1040
1041        fs::write(
1042            tmp.path().join("lib.rs"),
1043            r#"pub mod utils;
1044
1045pub fn public_fn() {}
1046"#,
1047        )
1048        .unwrap();
1049
1050        // Build index
1051        let builder = IndexBuilder::new(tmp.path());
1052        let (index, graph) = builder.build().unwrap();
1053
1054        // Verify index found the files
1055        assert_eq!(
1056            index.files.len(),
1057            2,
1058            "Expected 2 files, found {:?}",
1059            index.files.iter().map(|f| &f.path).collect::<Vec<_>>()
1060        );
1061
1062        // Verify symbols were extracted
1063        assert!(
1064            index.symbols.len() >= 3,
1065            "Expected at least 3 symbols, got {}",
1066            index.symbols.len()
1067        );
1068
1069        // Verify lookups work
1070        assert!(index.get_file("main.rs").is_some(), "main.rs not found in index");
1071        assert!(index.get_file("lib.rs").is_some(), "lib.rs not found in index");
1072
1073        // Verify PageRank was computed
1074        assert_eq!(graph.file_pagerank.len(), 2);
1075    }
1076
1077    #[test]
1078    fn test_symbol_reference_edges() {
1079        let tmp = TempDir::new().unwrap();
1080
1081        fs::write(
1082            tmp.path().join("lib.rs"),
1083            r#"pub struct Foo;
1084"#,
1085        )
1086        .unwrap();
1087
1088        fs::write(
1089            tmp.path().join("main.rs"),
1090            r#"mod lib;
1091
1092fn main() {
1093    let _value: Foo;
1094}
1095"#,
1096        )
1097        .unwrap();
1098
1099        let builder = IndexBuilder::new(tmp.path());
1100        let (index, graph) = builder.build().unwrap();
1101
1102        let foo = index.find_symbols("Foo");
1103        let main = index.find_symbols("main");
1104        assert!(!foo.is_empty(), "Expected Foo symbol");
1105        assert!(!main.is_empty(), "Expected main symbol");
1106
1107        let foo_id = foo[0].id.as_u32();
1108        let main_id = main[0].id.as_u32();
1109
1110        let referencers = graph.get_referencers(foo_id);
1111        assert!(referencers.contains(&main_id), "Expected main to reference Foo");
1112    }
1113
1114    #[test]
1115    fn test_language_detection() {
1116        // Original languages
1117        assert_eq!(Language::from_extension("rs"), Language::Rust);
1118        assert_eq!(Language::from_extension("py"), Language::Python);
1119        assert_eq!(Language::from_extension("ts"), Language::TypeScript);
1120        assert_eq!(Language::from_extension("tsx"), Language::TypeScript);
1121        assert_eq!(Language::from_extension("go"), Language::Go);
1122        assert_eq!(Language::from_extension("java"), Language::Java);
1123        assert_eq!(Language::from_extension("js"), Language::JavaScript);
1124        assert_eq!(Language::from_extension("c"), Language::C);
1125        assert_eq!(Language::from_extension("cpp"), Language::Cpp);
1126        assert_eq!(Language::from_extension("cs"), Language::CSharp);
1127        assert_eq!(Language::from_extension("rb"), Language::Ruby);
1128        assert_eq!(Language::from_extension("sh"), Language::Bash);
1129        // New languages
1130        assert_eq!(Language::from_extension("php"), Language::Php);
1131        assert_eq!(Language::from_extension("kt"), Language::Kotlin);
1132        assert_eq!(Language::from_extension("swift"), Language::Swift);
1133        assert_eq!(Language::from_extension("scala"), Language::Scala);
1134        assert_eq!(Language::from_extension("hs"), Language::Haskell);
1135        assert_eq!(Language::from_extension("ex"), Language::Elixir);
1136        assert_eq!(Language::from_extension("clj"), Language::Clojure);
1137        assert_eq!(Language::from_extension("ml"), Language::OCaml);
1138        assert_eq!(Language::from_extension("lua"), Language::Lua);
1139        assert_eq!(Language::from_extension("r"), Language::R);
1140    }
1141}