Skip to main content

codemem_engine/index/
indexer.rs

1//! Main indexing pipeline orchestrator.
2//!
3//! Walks a directory, filters supported files, checks for changes,
4//! and parses each file using the CodeParser.
5
6use crate::index::chunker::CodeChunk;
7use crate::index::incremental::ChangeDetector;
8use crate::index::parser::{CodeParser, ParseResult};
9use crate::index::resolver::{ReferenceResolver, ResolvedEdge};
10use crate::index::symbol::{Reference, Symbol};
11use ignore::WalkBuilder;
12use std::collections::HashSet;
13use std::path::{Path, PathBuf};
14
/// Snapshot of indexing progress, broadcast while a directory is being indexed.
///
/// The counters are cumulative for the current indexing run.
#[derive(Clone, Debug)]
pub struct IndexProgress {
    /// Count of files with a supported extension encountered so far.
    pub files_scanned: usize,
    /// Count of files that have been parsed successfully so far.
    pub files_parsed: usize,
    /// Running total of symbols extracted from the parsed files.
    pub total_symbols: usize,
    /// Path (relative to the indexed root) of the file this event refers to.
    pub current_file: String,
}
27
28/// Result of indexing a directory.
29#[derive(Debug)]
30pub struct IndexResult {
31    /// Total number of files scanned (walked).
32    pub files_scanned: usize,
33    /// Number of files successfully parsed.
34    pub files_parsed: usize,
35    /// Number of files skipped (unchanged since last index).
36    pub files_skipped: usize,
37    /// Total symbols extracted across all files.
38    pub total_symbols: usize,
39    /// Total references extracted across all files.
40    pub total_references: usize,
41    /// Total CST-aware chunks extracted across all files.
42    pub total_chunks: usize,
43    /// Individual parse results for each successfully parsed file.
44    pub parse_results: Vec<ParseResult>,
45}
46
47/// Result of `index_and_resolve()` — the complete indexing + resolution output.
48#[derive(Debug)]
49pub struct IndexAndResolveResult {
50    /// The underlying index result with per-file parse data.
51    pub index: IndexResult,
52    /// All symbols collected from all parsed files.
53    pub symbols: Vec<Symbol>,
54    /// All references collected from all parsed files.
55    pub references: Vec<Reference>,
56    /// All CST-aware chunks collected from all parsed files.
57    pub chunks: Vec<CodeChunk>,
58    /// All unique file paths that were parsed (relative to `root_path`).
59    pub file_paths: HashSet<String>,
60    /// Resolved edges from reference resolution.
61    pub edges: Vec<ResolvedEdge>,
62    /// The absolute root path that was indexed. Downstream code can use this
63    /// to reconstruct absolute paths (e.g. for `git -C` or file reading).
64    pub root_path: PathBuf,
65}
66
67/// The main indexing pipeline.
68///
69/// Coordinates directory walking, change detection, and parsing
70/// to produce a complete index of a codebase.
71pub struct Indexer {
72    parser: CodeParser,
73    change_detector: ChangeDetector,
74}
75
76impl Indexer {
77    /// Create a new Indexer with default settings.
78    pub fn new() -> Self {
79        Self {
80            parser: CodeParser::new(),
81            change_detector: ChangeDetector::new(),
82        }
83    }
84
85    /// Create a new Indexer with a pre-loaded ChangeDetector.
86    pub fn with_change_detector(change_detector: ChangeDetector) -> Self {
87        Self {
88            parser: CodeParser::new(),
89            change_detector,
90        }
91    }
92
93    /// Get a reference to the change detector for persistence.
94    pub fn change_detector(&self) -> &ChangeDetector {
95        &self.change_detector
96    }
97
98    /// Get a mutable reference to the change detector.
99    pub fn change_detector_mut(&mut self) -> &mut ChangeDetector {
100        &mut self.change_detector
101    }
102
103    /// Index a directory, returning all parse results.
104    ///
105    /// Walks the directory respecting `.gitignore` rules (via the `ignore` crate),
106    /// filters by supported file extensions, checks incremental state, and parses
107    /// each changed file.
108    pub fn index_directory(
109        &mut self,
110        root: &Path,
111    ) -> Result<IndexResult, codemem_core::CodememError> {
112        self.index_directory_inner(root, None)
113    }
114
115    /// Index a directory with optional progress reporting.
116    ///
117    /// If a broadcast sender is provided, progress events are sent as files
118    /// are processed. This is useful for SSE streaming to the frontend.
119    pub fn index_directory_with_progress(
120        &mut self,
121        root: &Path,
122        tx: Option<&tokio::sync::broadcast::Sender<IndexProgress>>,
123    ) -> Result<IndexResult, codemem_core::CodememError> {
124        self.index_directory_inner(root, tx)
125    }
126
127    /// Common implementation for directory indexing with optional progress callback.
128    fn index_directory_inner(
129        &mut self,
130        root: &Path,
131        tx: Option<&tokio::sync::broadcast::Sender<IndexProgress>>,
132    ) -> Result<IndexResult, codemem_core::CodememError> {
133        let mut files_scanned = 0usize;
134        let mut files_parsed = 0usize;
135        let mut files_skipped = 0usize;
136        let mut total_symbols = 0usize;
137        let mut total_references = 0usize;
138        let mut total_chunks = 0usize;
139        let mut parse_results = Vec::new();
140
141        let walker = WalkBuilder::new(root)
142            .hidden(true) // skip hidden files/dirs
143            .git_ignore(true) // respect .gitignore
144            .git_global(true) // respect global gitignore
145            .git_exclude(true) // respect .git/info/exclude
146            .build();
147
148        for entry in walker {
149            let entry = match entry {
150                Ok(e) => e,
151                Err(err) => {
152                    tracing::warn!("Walk error: {}", err);
153                    continue;
154                }
155            };
156
157            // Skip directories
158            if !entry.file_type().is_some_and(|ft| ft.is_file()) {
159                continue;
160            }
161
162            let path = entry.path();
163
164            // Check if the file extension is supported
165            let ext = match path.extension().and_then(|e| e.to_str()) {
166                Some(e) => e,
167                None => continue,
168            };
169
170            if !self.parser.supports_extension(ext) {
171                continue;
172            }
173
174            files_scanned += 1;
175
176            // Read file content
177            let content = match std::fs::read(path) {
178                Ok(c) => c,
179                Err(err) => {
180                    tracing::warn!("Failed to read {}: {}", path.display(), err);
181                    continue;
182                }
183            };
184
185            // Use paths relative to root so node IDs are portable across machines.
186            let rel_path = path.strip_prefix(root).unwrap_or(path);
187            let path_str = rel_path.to_string_lossy().to_string();
188
189            // Check incremental state (returns pre-computed hash to avoid double-hashing)
190            let (changed, hash) = self.change_detector.check_changed(&path_str, &content);
191            if !changed {
192                files_skipped += 1;
193                continue;
194            }
195
196            // Parse the file
197            match self.parser.parse_file(&path_str, &content) {
198                Some(result) => {
199                    total_symbols += result.symbols.len();
200                    total_references += result.references.len();
201                    total_chunks += result.chunks.len();
202                    files_parsed += 1;
203
204                    // Record the pre-computed hash (avoids re-hashing)
205                    self.change_detector.record_hash(&path_str, hash);
206
207                    parse_results.push(result);
208
209                    // Send progress event if a sender is provided
210                    if let Some(tx) = tx {
211                        let _ = tx.send(IndexProgress {
212                            files_scanned,
213                            files_parsed,
214                            total_symbols,
215                            current_file: path_str.clone(),
216                        });
217                    }
218                }
219                None => {
220                    tracing::warn!("Failed to parse {}", path_str);
221                }
222            }
223        }
224
225        tracing::info!(
226            "Indexed {}: {} scanned, {} parsed, {} skipped, {} symbols, {} references, {} chunks",
227            root.display(),
228            files_scanned,
229            files_parsed,
230            files_skipped,
231            total_symbols,
232            total_references,
233            total_chunks,
234        );
235
236        Ok(IndexResult {
237            files_scanned,
238            files_parsed,
239            files_skipped,
240            total_symbols,
241            total_references,
242            total_chunks,
243            parse_results,
244        })
245    }
246
247    /// Index a directory, collect all symbols/references/chunks, and resolve
248    /// references into graph edges.
249    ///
250    /// This is the high-level entry point that combines `index_directory()`
251    /// with reference resolution — the common pipeline shared by the MCP
252    /// `index_codebase` tool and the CLI `index` command.
253    pub fn index_and_resolve(
254        &mut self,
255        root: &Path,
256    ) -> Result<IndexAndResolveResult, codemem_core::CodememError> {
257        let result = self.index_directory(root)?;
258
259        let mut all_symbols = Vec::new();
260        let mut all_references = Vec::new();
261        let mut all_chunks = Vec::new();
262        let mut file_paths = HashSet::new();
263
264        // Consume parse_results by value to avoid cloning symbols/references/chunks
265        let IndexResult {
266            files_scanned,
267            files_parsed,
268            files_skipped,
269            total_symbols,
270            total_references,
271            total_chunks,
272            parse_results,
273        } = result;
274
275        for pr in parse_results {
276            file_paths.insert(pr.file_path);
277            all_symbols.extend(pr.symbols);
278            all_references.extend(pr.references);
279            all_chunks.extend(pr.chunks);
280        }
281
282        let mut resolver = ReferenceResolver::new();
283        resolver.add_symbols(&all_symbols);
284        resolver.add_imports(&all_references);
285        let edges = resolver.resolve_all(&all_references);
286
287        // Canonicalize the root so downstream code can reconstruct absolute paths.
288        let root_path = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
289
290        Ok(IndexAndResolveResult {
291            index: IndexResult {
292                files_scanned,
293                files_parsed,
294                files_skipped,
295                total_symbols,
296                total_references,
297                total_chunks,
298                parse_results: Vec::new(),
299            },
300            symbols: all_symbols,
301            references: all_references,
302            chunks: all_chunks,
303            file_paths,
304            edges,
305            root_path,
306        })
307    }
308}
309
310impl Default for Indexer {
311    fn default() -> Self {
312        Self::new()
313    }
314}
315
316#[cfg(test)]
317#[path = "tests/indexer_tests.rs"]
318mod tests;