Skip to main content

codemem_engine/index/
indexer.rs

1//! Main indexing pipeline orchestrator.
2//!
3//! Walks a directory, filters supported files, checks for changes,
4//! and parses each file using the CodeParser.
5
6use crate::index::chunker::CodeChunk;
7use crate::index::incremental::ChangeDetector;
8use crate::index::parser::{CodeParser, ParseResult};
9use crate::index::resolver::{ReferenceResolver, ResolvedEdge, UnresolvedRef};
10use crate::index::symbol::{Reference, Symbol};
11use ignore::WalkBuilder;
12use std::collections::HashSet;
13use std::path::{Path, PathBuf};
14
/// Snapshot of indexing progress, emitted while a directory is being indexed.
///
/// All counters are cumulative for the current indexing run.
#[derive(Debug, Clone)]
pub struct IndexProgress {
    /// Count of files scanned up to this event.
    pub files_scanned: usize,
    /// Count of files parsed up to this event.
    pub files_parsed: usize,
    /// Running total of symbols extracted up to this event.
    pub total_symbols: usize,
    /// File this event refers to (relative path).
    pub current_file: String,
}
27
28/// Result of indexing a directory.
29#[derive(Debug)]
30pub struct IndexResult {
31    /// Total number of files scanned (walked).
32    pub files_scanned: usize,
33    /// Number of files successfully parsed.
34    pub files_parsed: usize,
35    /// Number of files skipped (unchanged since last index).
36    pub files_skipped: usize,
37    /// Total symbols extracted across all files.
38    pub total_symbols: usize,
39    /// Total references extracted across all files.
40    pub total_references: usize,
41    /// Total CST-aware chunks extracted across all files.
42    pub total_chunks: usize,
43    /// Individual parse results for each successfully parsed file.
44    pub parse_results: Vec<ParseResult>,
45}
46
47/// Result of `index_and_resolve()` — the complete indexing + resolution output.
48#[derive(Debug)]
49pub struct IndexAndResolveResult {
50    /// The underlying index result with per-file parse data.
51    pub index: IndexResult,
52    /// All symbols collected from all parsed files.
53    pub symbols: Vec<Symbol>,
54    /// All references collected from all parsed files.
55    pub references: Vec<Reference>,
56    /// All CST-aware chunks collected from all parsed files.
57    pub chunks: Vec<CodeChunk>,
58    /// All unique file paths that were parsed (relative to `root_path`).
59    pub file_paths: HashSet<String>,
60    /// Resolved edges from reference resolution.
61    pub edges: Vec<ResolvedEdge>,
62    /// References that could not be resolved to known symbols.
63    /// Preserved for deferred cross-repo linking.
64    pub unresolved: Vec<UnresolvedRef>,
65    /// The absolute root path that was indexed. Downstream code can use this
66    /// to reconstruct absolute paths (e.g. for `git -C` or file reading).
67    pub root_path: PathBuf,
68    /// SCIP graph build result (nodes, edges, memories). None if SCIP was skipped.
69    pub scip_build: Option<super::scip::graph_builder::ScipBuildResult>,
70}
71
72/// The main indexing pipeline.
73///
74/// Coordinates directory walking, change detection, and parsing
75/// to produce a complete index of a codebase.
76pub struct Indexer {
77    parser: CodeParser,
78    change_detector: ChangeDetector,
79}
80
81impl Indexer {
82    /// Create a new Indexer with default settings.
83    pub fn new() -> Self {
84        Self {
85            parser: CodeParser::new(),
86            change_detector: ChangeDetector::new(),
87        }
88    }
89
90    /// Create a new Indexer with a pre-loaded ChangeDetector.
91    pub fn with_change_detector(change_detector: ChangeDetector) -> Self {
92        Self {
93            parser: CodeParser::new(),
94            change_detector,
95        }
96    }
97
98    /// Get a reference to the change detector for persistence.
99    pub fn change_detector(&self) -> &ChangeDetector {
100        &self.change_detector
101    }
102
103    /// Get a mutable reference to the change detector.
104    pub fn change_detector_mut(&mut self) -> &mut ChangeDetector {
105        &mut self.change_detector
106    }
107
108    /// Index a directory, returning all parse results.
109    ///
110    /// Walks the directory respecting `.gitignore` rules (via the `ignore` crate),
111    /// filters by supported file extensions, checks incremental state, and parses
112    /// each changed file.
113    pub fn index_directory(
114        &mut self,
115        root: &Path,
116    ) -> Result<IndexResult, codemem_core::CodememError> {
117        self.index_directory_inner(root, None)
118    }
119
120    /// Index a directory with optional progress reporting.
121    ///
122    /// If a broadcast sender is provided, progress events are sent as files
123    /// are processed. This is useful for SSE streaming to the frontend.
124    pub fn index_directory_with_progress(
125        &mut self,
126        root: &Path,
127        tx: Option<&tokio::sync::broadcast::Sender<IndexProgress>>,
128    ) -> Result<IndexResult, codemem_core::CodememError> {
129        self.index_directory_inner(root, tx)
130    }
131
132    /// Common implementation for directory indexing with optional progress callback.
133    fn index_directory_inner(
134        &mut self,
135        root: &Path,
136        tx: Option<&tokio::sync::broadcast::Sender<IndexProgress>>,
137    ) -> Result<IndexResult, codemem_core::CodememError> {
138        let mut files_scanned = 0usize;
139        let mut files_parsed = 0usize;
140        let mut files_skipped = 0usize;
141        let mut total_symbols = 0usize;
142        let mut total_references = 0usize;
143        let mut total_chunks = 0usize;
144        let mut parse_results = Vec::new();
145
146        let walker = WalkBuilder::new(root)
147            .hidden(true) // skip hidden files/dirs
148            .git_ignore(true) // respect .gitignore
149            .git_global(true) // respect global gitignore
150            .git_exclude(true) // respect .git/info/exclude
151            .build();
152
153        for entry in walker {
154            let entry = match entry {
155                Ok(e) => e,
156                Err(err) => {
157                    tracing::warn!("Walk error: {}", err);
158                    continue;
159                }
160            };
161
162            // Skip directories
163            if !entry.file_type().is_some_and(|ft| ft.is_file()) {
164                continue;
165            }
166
167            let path = entry.path();
168
169            // Check if the file extension is supported
170            let ext = match path.extension().and_then(|e| e.to_str()) {
171                Some(e) => e,
172                None => continue,
173            };
174
175            if !self.parser.supports_extension(ext) {
176                continue;
177            }
178
179            files_scanned += 1;
180
181            // Read file content
182            let content = match std::fs::read(path) {
183                Ok(c) => c,
184                Err(err) => {
185                    tracing::warn!("Failed to read {}: {}", path.display(), err);
186                    continue;
187                }
188            };
189
190            // Use paths relative to root so node IDs are portable across machines.
191            let rel_path = path.strip_prefix(root).unwrap_or(path);
192            let path_str = rel_path.to_string_lossy().to_string();
193
194            // Check incremental state (returns pre-computed hash to avoid double-hashing)
195            let (changed, hash) = self.change_detector.check_changed(&path_str, &content);
196            if !changed {
197                files_skipped += 1;
198                continue;
199            }
200
201            // Parse the file
202            match self.parser.parse_file(&path_str, &content) {
203                Some(result) => {
204                    total_symbols += result.symbols.len();
205                    total_references += result.references.len();
206                    total_chunks += result.chunks.len();
207                    files_parsed += 1;
208
209                    // Record the pre-computed hash (avoids re-hashing)
210                    self.change_detector.record_hash(&path_str, hash);
211
212                    parse_results.push(result);
213
214                    // Send progress event if a sender is provided
215                    if let Some(tx) = tx {
216                        let _ = tx.send(IndexProgress {
217                            files_scanned,
218                            files_parsed,
219                            total_symbols,
220                            current_file: path_str.clone(),
221                        });
222                    }
223                }
224                None => {
225                    tracing::warn!("Failed to parse {}", path_str);
226                }
227            }
228        }
229
230        tracing::info!(
231            "Indexed {}: {} scanned, {} parsed, {} skipped, {} symbols, {} references, {} chunks",
232            root.display(),
233            files_scanned,
234            files_parsed,
235            files_skipped,
236            total_symbols,
237            total_references,
238            total_chunks,
239        );
240
241        Ok(IndexResult {
242            files_scanned,
243            files_parsed,
244            files_skipped,
245            total_symbols,
246            total_references,
247            total_chunks,
248            parse_results,
249        })
250    }
251
252    /// Index a directory, collect all symbols/references/chunks, and resolve
253    /// references into graph edges.
254    ///
255    /// This is the high-level entry point that combines `index_directory()`
256    /// with reference resolution — the common pipeline shared by the MCP
257    /// `index_codebase` tool and the CLI `index` command.
258    pub fn index_and_resolve(
259        &mut self,
260        root: &Path,
261    ) -> Result<IndexAndResolveResult, codemem_core::CodememError> {
262        self.index_and_resolve_with_scip(root, None, None)
263    }
264
265    /// Index a directory with optional SCIP integration.
266    ///
267    /// If `scip_covered_files` is provided, symbol/reference extraction is skipped
268    /// for those files (SCIP already handled them). Code chunking still runs for ALL files.
269    /// The `scip_build` result is attached to the output for persistence.
270    pub fn index_and_resolve_with_scip(
271        &mut self,
272        root: &Path,
273        scip_covered_files: Option<&HashSet<String>>,
274        scip_build: Option<super::scip::graph_builder::ScipBuildResult>,
275    ) -> Result<IndexAndResolveResult, codemem_core::CodememError> {
276        let result = self.index_directory(root)?;
277
278        let mut all_symbols = Vec::new();
279        let mut all_references = Vec::new();
280        let mut all_chunks = Vec::new();
281        let mut file_paths = HashSet::new();
282
283        // Consume parse_results by value to avoid cloning symbols/references/chunks
284        let IndexResult {
285            files_scanned,
286            files_parsed,
287            files_skipped,
288            total_symbols,
289            total_references,
290            total_chunks,
291            parse_results,
292        } = result;
293
294        for pr in parse_results {
295            file_paths.insert(pr.file_path.clone());
296            // For SCIP-covered files, only keep chunks — skip symbols/references
297            // since SCIP provides compiler-grade data for those.
298            if scip_covered_files.is_some_and(|s| s.contains(&pr.file_path)) {
299                all_chunks.extend(pr.chunks);
300            } else {
301                all_symbols.extend(pr.symbols);
302                all_references.extend(pr.references);
303                all_chunks.extend(pr.chunks);
304            }
305        }
306
307        let mut resolver = ReferenceResolver::new();
308        resolver.add_symbols(&all_symbols);
309        resolver.add_imports(&all_references);
310        let resolve_result = resolver.resolve_all_with_unresolved(&all_references);
311
312        // Canonicalize the root so downstream code can reconstruct absolute paths.
313        let root_path = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
314
315        Ok(IndexAndResolveResult {
316            index: IndexResult {
317                files_scanned,
318                files_parsed,
319                files_skipped,
320                total_symbols,
321                total_references,
322                total_chunks,
323                parse_results: Vec::new(),
324            },
325            symbols: all_symbols,
326            references: all_references,
327            chunks: all_chunks,
328            file_paths,
329            edges: resolve_result.edges,
330            unresolved: resolve_result.unresolved,
331            root_path,
332            scip_build,
333        })
334    }
335}
336
337impl Default for Indexer {
338    fn default() -> Self {
339        Self::new()
340    }
341}
342
// Unit tests for this module live in `tests/indexer_tests.rs` (located via
// `#[path]`) and are only compiled when `cfg(test)` is set.
#[cfg(test)]
#[path = "tests/indexer_tests.rs"]
mod tests;