greppy/cli/index.rs

//! Index command implementation
//!
//! Memory-safe parallel indexing:
//! - Phase 1: Collect file paths (small memory footprint)
//! - Phase 2: Parallel read + chunk with rayon (bounded by thread pool)
//! - Phase 3: Sequential write to Tantivy with periodic commits
//!   (phases 2 and 3 alternate per batch of files)
//! - Phase 4: Build semantic trace index (symbols, calls, references)
//! - Phase 5: Create an automatic snapshot of the trace index
//!
//! This avoids holding all file contents or chunks in memory at once.
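//!
//! A hypothetical invocation, assuming the binary is named `greppy` and that
//! its flags mirror the `IndexArgs` fields used below (not confirmed by this
//! file alone):
//!
//! ```text
//! greppy index /path/to/project --force
//! ```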

use crate::cli::IndexArgs;
use crate::core::config::Config;
use crate::core::error::Result;
use crate::core::project::Project;
use crate::index::{IndexWriter, TantivyIndex};
use crate::parse::{chunk_file, Chunk};
use crate::trace::{
    build_and_save_index, detect_language, find_dead_symbols, is_treesitter_supported, load_index,
    snapshots::create_snapshot, trace_index_path, SemanticIndex,
};
use ignore::WalkBuilder;
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::env;
use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;
use tracing::{debug, info};

/// Batch size for commits - prevents unbounded memory growth in Tantivy
const COMMIT_BATCH_SIZE: usize = 5000;
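// Larger batches commit less often (fewer, faster flushes) but hold more
// pending documents in the writer's memory buffer between commits.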

/// Run the index command
pub fn run(args: IndexArgs) -> Result<()> {
    // Determine project path
    let project_path = args
        .project
        .unwrap_or_else(|| env::current_dir().expect("Failed to get current directory"));

    // Detect project
    let project = Project::detect(&project_path)?;
    info!(project = %project.name, root = %project.root.display(), "Indexing project");

    let start = Instant::now();

    // Load config for ignore patterns
    let config = Config::load()?;

    // Create or open the index; a forced run deletes any existing index first
    if args.force {
        TantivyIndex::delete(&project.root)?;
    }
    let index = TantivyIndex::open_or_create(&project.root)?;

    // =========================================================================
    // PHASE 1: Collect file paths (memory-efficient - just PathBufs)
    // =========================================================================
    let walker = WalkBuilder::new(&project.root)
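        // Note: `hidden(true)` and the `git_*(true)` calls *enable skipping*:
        // hidden files and gitignored paths are excluded from the walk.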
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .max_filesize(Some(config.index.max_file_size))
        .build();

    let ignore_patterns = config.ignore.patterns.clone();

    let file_paths: Vec<PathBuf> = walker
        .flatten()
        .filter_map(|entry| {
            let path = entry.path();
            if path.is_dir() {
                return None;
            }
            if !is_code_file(path) {
                return None;
            }
            if should_ignore(path, &ignore_patterns) {
                debug!(path = %path.display(), "Skipping ignored file");
                return None;
            }
            Some(path.to_path_buf())
        })
        .collect();

    let total_files = file_paths.len();
    info!(files = total_files, "Found files to index");

    // =========================================================================
    // PHASES 2 & 3: Parallel read + chunk, then sequential write, per batch
    // Memory safety: each batch's contents and chunks are dropped before the
    // next batch begins (rayon's thread pool bounds the parallel reads)
    // =========================================================================
    let file_count = AtomicUsize::new(0);
    let chunk_count = AtomicUsize::new(0);

    // Process in batches to control memory - don't load all files at once
    let batch_size = 500; // Process 500 files at a time
    let mut writer = IndexWriter::new(&index)?;
    let mut total_chunks_written = 0usize;

    for batch in file_paths.chunks(batch_size) {
        // Parallel: read and chunk files in this batch
        let batch_chunks: Vec<Chunk> = batch
            .par_iter()
            .filter_map(|path| {
                let content = match std::fs::read_to_string(path) {
                    Ok(c) => c,
                    Err(e) => {
                        debug!(path = %path.display(), error = %e, "Failed to read file");
                        return None;
                    }
                };

                file_count.fetch_add(1, Ordering::Relaxed);
                let chunks = chunk_file(path, &content);
                chunk_count.fetch_add(chunks.len(), Ordering::Relaxed);

                // Return chunks; content is dropped here (memory freed)
                Some(chunks)
            })
            .flatten()
            .collect();

        // Sequential: write to Tantivy (writes are kept single-threaded)
        for chunk in &batch_chunks {
            writer.add_chunk(chunk)?;
            total_chunks_written += 1;

            // Periodic commit to prevent unbounded Tantivy buffer growth
            if total_chunks_written % COMMIT_BATCH_SIZE == 0 {
                debug!(chunks = total_chunks_written, "Intermediate commit");
                writer = writer.commit_and_reopen(&index)?;
            }
        }
        // batch_chunks dropped here - memory freed before next batch
    }

    // Final commit
    writer.commit()?;

    let tantivy_elapsed = start.elapsed();
    let final_file_count = file_count.load(Ordering::Relaxed);
    let final_chunk_count = chunk_count.load(Ordering::Relaxed);

    let chunks_per_sec = if tantivy_elapsed.as_secs_f64() > 0.0 {
        final_chunk_count as f64 / tantivy_elapsed.as_secs_f64()
    } else {
        0.0
    };

    info!(
        files = final_file_count,
        chunks = final_chunk_count,
        elapsed_ms = tantivy_elapsed.as_millis(),
        chunks_per_sec = chunks_per_sec as u64,
        "Text index complete"
    );

    println!(
        "Text index: {} files ({} chunks) in {:.2}s",
        final_file_count,
        final_chunk_count,
        tantivy_elapsed.as_secs_f64(),
    );

    // =========================================================================
    // PHASE 4: Build semantic trace index
    // =========================================================================
    let trace_start = Instant::now();
    info!("Building semantic trace index...");

    // Collect files that support tree-sitter for semantic indexing
    // We need to re-read files for semantic extraction (different from chunking)
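    // Unlike the batched loop above, this holds every supported file's
    // contents in memory at once before the trace index is built.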
    let semantic_files: Vec<(PathBuf, String)> = file_paths
        .par_iter()
        .filter_map(|path| {
            let lang = detect_language(path);
            if !is_treesitter_supported(lang) {
                return None;
            }
            match std::fs::read_to_string(path) {
                Ok(content) => Some((path.clone(), content)),
                Err(_) => None,
            }
        })
        .collect();

    let semantic_file_count = semantic_files.len();

    if semantic_file_count > 0 {
        match build_and_save_index(&project.root, &semantic_files) {
            Ok(stats) => {
                let trace_elapsed = trace_start.elapsed();
                info!(
                    files = stats.files,
                    symbols = stats.symbols,
                    tokens = stats.tokens,
                    edges = stats.edges,
                    elapsed_ms = trace_elapsed.as_millis(),
                    "Trace index complete"
                );
                println!(
                    "Trace index: {} files ({} symbols, {} edges) in {:.2}s",
                    stats.files,
                    stats.symbols,
                    stats.edges,
                    trace_elapsed.as_secs_f64(),
                );
            }
            Err(e) => {
                tracing::warn!("Failed to build trace index: {}", e);
                println!("Warning: Trace index build failed: {}", e);
            }
        }
    } else {
        println!("Trace index: skipped (no supported languages)");
    }

    let total_elapsed = start.elapsed();
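    // Note: the rate reported below is the text-index rate; total_elapsed
    // also includes the trace-index phase.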
    println!(
        "\nTotal: {:.2}s ({:.0} chunks/sec)",
        total_elapsed.as_secs_f64(),
        chunks_per_sec
    );

    // =========================================================================
    // PHASE 5: Create automatic snapshot
    // =========================================================================
    // Only snapshot when semantic files were indexed and the trace index
    // actually exists on disk
    if semantic_file_count > 0 {
        let trace_path = trace_index_path(&project.root);
        if trace_path.exists() {
            match load_index(&trace_path) {
                Ok(index) => {
                    let dead_symbols = find_dead_symbols(&index);
                    let cycles_count = count_cycles(&index) as u32;

                    match create_snapshot(
                        &index,
                        &project.root,
                        &project.name,
                        &dead_symbols.iter().map(|s| s.id).collect(),
                        cycles_count,
                        None, // Auto-generated, no custom name
                    ) {
                        Ok(_) => {
                            debug!("Auto-created snapshot after indexing");
                        }
                        Err(e) => {
                            debug!("Failed to create snapshot: {}", e);
                        }
                    }
                }
                Err(e) => {
                    debug!("Failed to load trace index for snapshot: {}", e);
                }
            }
        }
    }

    Ok(())
}

/// Count cycles in the file-level dependency graph using DFS (simplified:
/// counts back edges found during traversal rather than enumerating every
/// elementary cycle)
fn count_cycles(index: &SemanticIndex) -> usize {
    // Build a file-level dependency graph from cross-file symbol edges
    let mut graph: HashMap<u16, HashSet<u16>> = HashMap::new();

    for edge in &index.edges {
        if let (Some(from_sym), Some(to_sym)) =
            (index.symbol(edge.from_symbol), index.symbol(edge.to_symbol))
        {
            if from_sym.file_id != to_sym.file_id {
                graph
                    .entry(from_sym.file_id)
                    .or_default()
                    .insert(to_sym.file_id);
            }
        }
    }

    let mut cycles = 0;
    let mut visited = HashSet::new();
    let mut rec_stack = HashSet::new();

    for &node in graph.keys() {
        if !visited.contains(&node) {
            cycles += count_cycles_dfs(node, &graph, &mut visited, &mut rec_stack);
        }
    }

    cycles
}

fn count_cycles_dfs(
    node: u16,
    graph: &HashMap<u16, HashSet<u16>>,
    visited: &mut HashSet<u16>,
    rec_stack: &mut HashSet<u16>,
) -> usize {
    visited.insert(node);
    rec_stack.insert(node);

    let mut cycles = 0;

    if let Some(neighbors) = graph.get(&node) {
        for &neighbor in neighbors {
            if !visited.contains(&neighbor) {
                cycles += count_cycles_dfs(neighbor, graph, visited, rec_stack);
            } else if rec_stack.contains(&neighbor) {
                // A visited neighbor still on the recursion stack closes a cycle
                cycles += 1;
            }
        }
    }

    rec_stack.remove(&node);
    cycles
}
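
// Worked example: with file-level edges 1 -> 2, 2 -> 3, and 3 -> 1, the DFS
// visits 1, 2, 3, then sees 3 -> 1 while 1 is still on rec_stack and counts
// one cycle. Back-edge counting is a lower bound on the elementary cycles.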

/// Check if a file is a code file worth indexing
fn is_code_file(path: &std::path::Path) -> bool {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_lowercase();

    matches!(
        ext.as_str(),
        "ts" | "tsx"
            | "js"
            | "jsx"
            | "mjs"
            | "cjs"
            | "py"
            | "pyi"
            | "rs"
            | "go"
            | "java"
            | "kt"
            | "kts"
            | "scala"
            | "rb"
            | "php"
            | "c"
            | "h"
            | "cpp"
            | "cc"
            | "cxx"
            | "hpp"
            | "cs"
            | "swift"
            | "ex"
            | "exs"
            | "erl"
            | "hrl"
            | "hs"
            | "ml"
            | "mli"
            | "lua"
            | "sh"
            | "bash"
            | "zsh"
            | "sql"
            | "vue"
            | "svelte"
    )
}

/// Check if a path matches any ignore pattern
fn should_ignore(path: &std::path::Path, patterns: &[String]) -> bool {
    let path_str = path.to_string_lossy();

    for pattern in patterns {
        // Simple substring matching for now (see the glob-based sketch below)
        // TODO: Use proper glob matching
        let needle = pattern.trim_matches('*');
        // Guard against patterns like "*": an empty needle would match every
        // path, since `contains("")` is always true
        if !needle.is_empty() && path_str.contains(needle) {
            return true;
        }
    }

    false
}
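
// A sketch of the glob matching the TODO above calls for, assuming the
// `globset` crate were added as a dependency (it is not imported in this
// file). Illustrative only; not wired into the walk above.
#[allow(dead_code)]
fn should_ignore_glob(path: &std::path::Path, patterns: &[String]) -> bool {
    use globset::{Glob, GlobSetBuilder};

    // Built per call for brevity; a real implementation would compile the
    // GlobSet once and reuse it for every file in the walk.
    let mut builder = GlobSetBuilder::new();
    for pattern in patterns {
        if let Ok(glob) = Glob::new(pattern) {
            builder.add(glob);
        }
    }
    match builder.build() {
        Ok(set) => set.is_match(path),
        Err(_) => false,
    }
}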