Skip to main content

codemem_engine/index/
indexer.rs

//! Main indexing pipeline orchestrator.
//!
//! Walks a directory, filters supported files, checks for changes,
//! and parses each file using the CodeParser.

6use crate::index::chunker::CodeChunk;
7use crate::index::incremental::ChangeDetector;
8use crate::index::parser::{CodeParser, ParseResult};
9use crate::index::resolver::{ReferenceResolver, ResolvedEdge};
10use crate::index::symbol::{Reference, Symbol};
11use ignore::WalkBuilder;
12use std::collections::HashSet;
13use std::path::Path;
14
/// Progress event emitted during directory indexing.
///
/// One event is sent after each file that parses successfully; skipped,
/// unreadable, and unparseable files do not produce an event. The counters
/// are running totals at the time the event is sent.
#[derive(Debug, Clone)]
pub struct IndexProgress {
    /// Number of files scanned so far.
    pub files_scanned: usize,
    /// Number of files parsed so far.
    pub files_parsed: usize,
    /// Total symbols extracted so far.
    pub total_symbols: usize,
    /// File that was just parsed, relative to the indexing root when the
    /// root is a prefix of its path (otherwise the path as walked).
    pub current_file: String,
}
27
28/// Result of indexing a directory.
29#[derive(Debug)]
30pub struct IndexResult {
31    /// Total number of files scanned (walked).
32    pub files_scanned: usize,
33    /// Number of files successfully parsed.
34    pub files_parsed: usize,
35    /// Number of files skipped (unchanged since last index).
36    pub files_skipped: usize,
37    /// Total symbols extracted across all files.
38    pub total_symbols: usize,
39    /// Total references extracted across all files.
40    pub total_references: usize,
41    /// Total CST-aware chunks extracted across all files.
42    pub total_chunks: usize,
43    /// Individual parse results for each successfully parsed file.
44    pub parse_results: Vec<ParseResult>,
45}
46
47/// Result of `index_and_resolve()` — the complete indexing + resolution output.
48#[derive(Debug)]
49pub struct IndexAndResolveResult {
50    /// The underlying index result with per-file parse data.
51    pub index: IndexResult,
52    /// All symbols collected from all parsed files.
53    pub symbols: Vec<Symbol>,
54    /// All references collected from all parsed files.
55    pub references: Vec<Reference>,
56    /// All CST-aware chunks collected from all parsed files.
57    pub chunks: Vec<CodeChunk>,
58    /// All unique file paths that were parsed.
59    pub file_paths: HashSet<String>,
60    /// Resolved edges from reference resolution.
61    pub edges: Vec<ResolvedEdge>,
62}
63
64/// The main indexing pipeline.
65///
66/// Coordinates directory walking, change detection, and parsing
67/// to produce a complete index of a codebase.
68pub struct Indexer {
69    parser: CodeParser,
70    change_detector: ChangeDetector,
71}
72
73impl Indexer {
74    /// Create a new Indexer with default settings.
75    pub fn new() -> Self {
76        Self {
77            parser: CodeParser::new(),
78            change_detector: ChangeDetector::new(),
79        }
80    }
81
82    /// Create a new Indexer with a pre-loaded ChangeDetector.
83    pub fn with_change_detector(change_detector: ChangeDetector) -> Self {
84        Self {
85            parser: CodeParser::new(),
86            change_detector,
87        }
88    }
89
90    /// Get a reference to the change detector for persistence.
91    pub fn change_detector(&self) -> &ChangeDetector {
92        &self.change_detector
93    }
94
95    /// Get a mutable reference to the change detector.
96    pub fn change_detector_mut(&mut self) -> &mut ChangeDetector {
97        &mut self.change_detector
98    }
99
100    /// Index a directory, returning all parse results.
101    ///
102    /// Walks the directory respecting `.gitignore` rules (via the `ignore` crate),
103    /// filters by supported file extensions, checks incremental state, and parses
104    /// each changed file.
105    pub fn index_directory(
106        &mut self,
107        root: &Path,
108    ) -> Result<IndexResult, codemem_core::CodememError> {
109        self.index_directory_inner(root, None)
110    }
111
112    /// Index a directory with optional progress reporting.
113    ///
114    /// If a broadcast sender is provided, progress events are sent as files
115    /// are processed. This is useful for SSE streaming to the frontend.
116    pub fn index_directory_with_progress(
117        &mut self,
118        root: &Path,
119        tx: Option<&tokio::sync::broadcast::Sender<IndexProgress>>,
120    ) -> Result<IndexResult, codemem_core::CodememError> {
121        self.index_directory_inner(root, tx)
122    }
123
124    /// Common implementation for directory indexing with optional progress callback.
125    fn index_directory_inner(
126        &mut self,
127        root: &Path,
128        tx: Option<&tokio::sync::broadcast::Sender<IndexProgress>>,
129    ) -> Result<IndexResult, codemem_core::CodememError> {
130        let mut files_scanned = 0usize;
131        let mut files_parsed = 0usize;
132        let mut files_skipped = 0usize;
133        let mut total_symbols = 0usize;
134        let mut total_references = 0usize;
135        let mut total_chunks = 0usize;
136        let mut parse_results = Vec::new();
137
138        let walker = WalkBuilder::new(root)
139            .hidden(true) // skip hidden files/dirs
140            .git_ignore(true) // respect .gitignore
141            .git_global(true) // respect global gitignore
142            .git_exclude(true) // respect .git/info/exclude
143            .build();
144
145        for entry in walker {
146            let entry = match entry {
147                Ok(e) => e,
148                Err(err) => {
149                    tracing::warn!("Walk error: {}", err);
150                    continue;
151                }
152            };
153
154            // Skip directories
155            if !entry.file_type().is_some_and(|ft| ft.is_file()) {
156                continue;
157            }
158
159            let path = entry.path();
160
161            // Check if the file extension is supported
162            let ext = match path.extension().and_then(|e| e.to_str()) {
163                Some(e) => e,
164                None => continue,
165            };
166
167            if !self.parser.supports_extension(ext) {
168                continue;
169            }
170
171            files_scanned += 1;
172
173            // Read file content
174            let content = match std::fs::read(path) {
175                Ok(c) => c,
176                Err(err) => {
177                    tracing::warn!("Failed to read {}: {}", path.display(), err);
178                    continue;
179                }
180            };
181
182            let path_str = path.to_string_lossy().to_string();
183
184            // Check incremental state (returns pre-computed hash to avoid double-hashing)
185            let (changed, hash) = self.change_detector.check_changed(&path_str, &content);
186            if !changed {
187                files_skipped += 1;
188                continue;
189            }
190
191            // Parse the file
192            match self.parser.parse_file(&path_str, &content) {
193                Some(result) => {
194                    total_symbols += result.symbols.len();
195                    total_references += result.references.len();
196                    total_chunks += result.chunks.len();
197                    files_parsed += 1;
198
199                    // Record the pre-computed hash (avoids re-hashing)
200                    self.change_detector.record_hash(&path_str, hash);
201
202                    parse_results.push(result);
203
204                    // Send progress event if a sender is provided
205                    if let Some(tx) = tx {
206                        let relative_path = path
207                            .strip_prefix(root)
208                            .unwrap_or(path)
209                            .to_string_lossy()
210                            .to_string();
211                        let _ = tx.send(IndexProgress {
212                            files_scanned,
213                            files_parsed,
214                            total_symbols,
215                            current_file: relative_path,
216                        });
217                    }
218                }
219                None => {
220                    tracing::warn!("Failed to parse {}", path_str);
221                }
222            }
223        }
224
225        tracing::info!(
226            "Indexed {}: {} scanned, {} parsed, {} skipped, {} symbols, {} references, {} chunks",
227            root.display(),
228            files_scanned,
229            files_parsed,
230            files_skipped,
231            total_symbols,
232            total_references,
233            total_chunks,
234        );
235
236        Ok(IndexResult {
237            files_scanned,
238            files_parsed,
239            files_skipped,
240            total_symbols,
241            total_references,
242            total_chunks,
243            parse_results,
244        })
245    }
246
247    /// Index a directory, collect all symbols/references/chunks, and resolve
248    /// references into graph edges.
249    ///
250    /// This is the high-level entry point that combines `index_directory()`
251    /// with reference resolution — the common pipeline shared by the MCP
252    /// `index_codebase` tool and the CLI `index` command.
253    pub fn index_and_resolve(
254        &mut self,
255        root: &Path,
256    ) -> Result<IndexAndResolveResult, codemem_core::CodememError> {
257        let result = self.index_directory(root)?;
258
259        let mut all_symbols = Vec::new();
260        let mut all_references = Vec::new();
261        let mut all_chunks = Vec::new();
262        let mut file_paths = HashSet::new();
263
264        // Consume parse_results by value to avoid cloning symbols/references/chunks
265        let IndexResult {
266            files_scanned,
267            files_parsed,
268            files_skipped,
269            total_symbols,
270            total_references,
271            total_chunks,
272            parse_results,
273        } = result;
274
275        for pr in parse_results {
276            file_paths.insert(pr.file_path);
277            all_symbols.extend(pr.symbols);
278            all_references.extend(pr.references);
279            all_chunks.extend(pr.chunks);
280        }
281
282        let mut resolver = ReferenceResolver::new();
283        resolver.add_symbols(&all_symbols);
284        let edges = resolver.resolve_all(&all_references);
285
286        Ok(IndexAndResolveResult {
287            index: IndexResult {
288                files_scanned,
289                files_parsed,
290                files_skipped,
291                total_symbols,
292                total_references,
293                total_chunks,
294                parse_results: Vec::new(),
295            },
296            symbols: all_symbols,
297            references: all_references,
298            chunks: all_chunks,
299            file_paths,
300            edges,
301        })
302    }
303}
304
305impl Default for Indexer {
306    fn default() -> Self {
307        Self::new()
308    }
309}
310
// Unit tests are kept in `tests/indexer_tests.rs` and compiled only for test builds.
#[cfg(test)]
#[path = "tests/indexer_tests.rs"]
mod tests;