greppy/cli/index.rs

//! Index command implementation
//!
//! Memory-safe parallel indexing:
//! - Phase 1: Collect file paths (small memory footprint)
//! - Phase 2: Parallel read + chunk with rayon (bounded by thread pool)
//! - Phase 3: Sequential write to Tantivy with periodic commits
//!
//! This avoids holding all file contents or chunks in memory at once.

use crate::cli::IndexArgs;
use crate::core::config::Config;
use crate::core::error::Result;
use crate::core::project::Project;
use crate::index::{IndexWriter, TantivyIndex};
use crate::parse::{chunk_file, Chunk};
use ignore::WalkBuilder;
use rayon::prelude::*;
use std::env;
use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;
use tracing::{debug, info};

/// Batch size for commits - prevents unbounded memory growth in Tantivy
const COMMIT_BATCH_SIZE: usize = 5000;

/// Run the index command
pub fn run(args: IndexArgs) -> Result<()> {
    // Determine project path
    let project_path = args
        .project
        .unwrap_or_else(|| env::current_dir().expect("Failed to get current directory"));

    // Detect project
    let project = Project::detect(&project_path)?;
    info!(project = %project.name, root = %project.root.display(), "Indexing project");

    let start = Instant::now();

    // Load config for ignore patterns
    let config = Config::load()?;

    // Create or open index
    if args.force {
        TantivyIndex::delete(&project.root)?;
    }
    let index = TantivyIndex::open_or_create(&project.root)?;

    // =========================================================================
    // PHASE 1: Collect file paths (memory-efficient - just PathBufs)
    // =========================================================================
    let walker = WalkBuilder::new(&project.root)
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .max_filesize(Some(config.index.max_file_size))
        .build();

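    // Custom ignore patterns from the user's config, applied on top of the gitignore rules above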
    let ignore_patterns = config.ignore.patterns.clone();

    let file_paths: Vec<PathBuf> = walker
        .flatten()
        .filter_map(|entry| {
            let path = entry.path();
            if path.is_dir() {
                return None;
            }
            if !is_code_file(path) {
                return None;
            }
            if should_ignore(path, &ignore_patterns) {
                debug!(path = %path.display(), "Skipping ignored file");
                return None;
            }
            Some(path.to_path_buf())
        })
        .collect();

    let total_files = file_paths.len();
    info!(files = total_files, "Found files to index");

    // =========================================================================
    // PHASE 2: Parallel read + chunk (rayon handles thread pool bounds)
    // Memory safety: Each file is read, chunked, and dropped before next batch
    // =========================================================================
    let file_count = AtomicUsize::new(0);
    let chunk_count = AtomicUsize::new(0);

    // Process in batches to control memory - don't load all files at once
    let batch_size = 500; // Process 500 files at a time
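    // Single writer reused across all batches; it is committed and reopened periodically below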
    let mut writer = IndexWriter::new(&index)?;
    let mut total_chunks_written = 0usize;

    for batch in file_paths.chunks(batch_size) {
        // Parallel: read and chunk files in this batch
        let batch_chunks: Vec<Chunk> = batch
            .par_iter()
            .filter_map(|path| {
                let content = match std::fs::read_to_string(path) {
                    Ok(c) => c,
                    Err(e) => {
                        debug!(path = %path.display(), error = %e, "Failed to read file");
                        return None;
                    }
                };

                file_count.fetch_add(1, Ordering::Relaxed);
                let chunks = chunk_file(path, &content);
                chunk_count.fetch_add(chunks.len(), Ordering::Relaxed);

                // Return chunks, content is dropped here (memory freed)
                Some(chunks)
            })
            .flatten()
            .collect();

        // Sequential: write to Tantivy through the single index writer
        for chunk in &batch_chunks {
            writer.add_chunk(chunk)?;
            total_chunks_written += 1;

            // Periodic commit to prevent unbounded Tantivy buffer growth
            if total_chunks_written % COMMIT_BATCH_SIZE == 0 {
                debug!(chunks = total_chunks_written, "Intermediate commit");
                writer = writer.commit_and_reopen(&index)?;
            }
        }
        // batch_chunks dropped here - memory freed before next batch
    }

    // Final commit
    writer.commit()?;

    let elapsed = start.elapsed();
    let final_file_count = file_count.load(Ordering::Relaxed);
    let final_chunk_count = chunk_count.load(Ordering::Relaxed);

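    // Avoid division by zero if the measured elapsed time is zero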
    let chunks_per_sec = if elapsed.as_secs_f64() > 0.0 {
        final_chunk_count as f64 / elapsed.as_secs_f64()
    } else {
        0.0
    };

    info!(
        files = final_file_count,
        chunks = final_chunk_count,
        elapsed_ms = elapsed.as_millis(),
        chunks_per_sec = chunks_per_sec as u64,
        "Indexing complete"
    );

    println!(
        "Indexed {} files ({} chunks) in {:.2}s ({:.0} chunks/sec)",
        final_file_count,
        final_chunk_count,
        elapsed.as_secs_f64(),
        chunks_per_sec
    );

    Ok(())
}

/// Check if a file is a code file worth indexing
fn is_code_file(path: &std::path::Path) -> bool {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_lowercase();

    matches!(
        ext.as_str(),
        "ts" | "tsx"
            | "js"
            | "jsx"
            | "mjs"
            | "cjs"
            | "py"
            | "pyi"
            | "rs"
            | "go"
            | "java"
            | "kt"
            | "kts"
            | "scala"
            | "rb"
            | "php"
            | "c"
            | "h"
            | "cpp"
            | "cc"
            | "cxx"
            | "hpp"
            | "cs"
            | "swift"
            | "ex"
            | "exs"
            | "erl"
            | "hrl"
            | "hs"
            | "ml"
            | "mli"
            | "lua"
            | "sh"
            | "bash"
            | "zsh"
            | "sql"
            | "vue"
            | "svelte"
    )
}

/// Check if a path matches any ignore pattern
fn should_ignore(path: &std::path::Path, patterns: &[String]) -> bool {
    let path_str = path.to_string_lossy();

    for pattern in patterns {
        // Simple substring matching for now
        // TODO: Use proper glob matching
        let needle = pattern.trim_matches('*');
        // Skip wildcard-only patterns; an empty needle would match every path
        if needle.is_empty() {
            continue;
        }
        if path_str.contains(needle) {
            return true;
        }
    }

    false
}