Skip to main content

st/mcp/
smart_background_searcher.rs

1// Smart Background Searcher - Intelligent file content searching with limits
2// "Like ripgrep but knows when to stop reading!" - Aye
3
4use anyhow::Result;
5use fuzzy_matcher::skim::SkimMatcherV2;
6use fuzzy_matcher::FuzzyMatcher;
7use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher};
8use serde::{Deserialize, Serialize};
9use serde_json::Value;
10use std::collections::HashMap;
11use std::fs::{self, File};
12use std::io::{BufRead, BufReader};
13use std::path::{Path, PathBuf};
14use std::sync::mpsc::{channel, Receiver, Sender};
15use std::sync::{Arc, Mutex};
16use std::thread;
17use std::time::{Duration, Instant};
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct SearchConfig {
21    pub max_lines_per_file: usize, // Default: 1000 for JSONL, 5000 for others
22    pub max_file_size_mb: u64,     // Skip files larger than this
23    pub search_timeout_ms: u64,    // Timeout per file
24    pub fuzzy_threshold: i64,      // Fuzzy match score threshold
25    pub smart_sampling: bool,      // Sample large files intelligently
26    pub watch_patterns: Vec<String>, // File patterns to watch
27}
28
29impl Default for SearchConfig {
30    fn default() -> Self {
31        Self {
32            max_lines_per_file: 1000,
33            max_file_size_mb: 50,
34            search_timeout_ms: 500,
35            fuzzy_threshold: 50,
36            smart_sampling: true,
37            watch_patterns: vec![
38                "*.json".to_string(),
39                "*.jsonl".to_string(),
40                "*.md".to_string(),
41                "*.log".to_string(),
42                "*.txt".to_string(),
43            ],
44        }
45    }
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct SearchResult {
50    pub file_path: PathBuf,
51    pub line_number: usize,
52    pub content: String,
53    pub score: i64,
54    pub context: Vec<String>, // Lines around the match
55    pub file_type: String,
56    pub timestamp: std::time::SystemTime,
57}
58
59pub struct SmartBackgroundSearcher {
60    config: SearchConfig,
61    search_index: Arc<Mutex<HashMap<PathBuf, Vec<SearchResult>>>>,
62    watcher: Option<RecommendedWatcher>,
63    sender: Sender<SearchEvent>,
64}
65
/// Messages delivered to the background worker thread.
enum SearchEvent {
    /// Scan every file in `paths` for `query` and cache the matches.
    Search {
        query: String,
        paths: Vec<PathBuf>,
    },
    /// A watched file was created or modified; its cached results are outdated.
    FileChanged(PathBuf),
    /// Ask the worker loop to exit.
    Stop,
}
71
72impl SmartBackgroundSearcher {
73    pub fn new(config: SearchConfig) -> Result<Self> {
74        let (sender, receiver) = channel();
75        let search_index = Arc::new(Mutex::new(HashMap::new()));
76
77        // Start background search thread
78        let index_clone = search_index.clone();
79        let config_clone = config.clone();
80
81        thread::spawn(move || {
82            Self::search_worker(receiver, index_clone, config_clone);
83        });
84
85        Ok(Self {
86            config,
87            search_index,
88            watcher: None,
89            sender,
90        })
91    }
92
93    fn search_worker(
94        receiver: Receiver<SearchEvent>,
95        index: Arc<Mutex<HashMap<PathBuf, Vec<SearchResult>>>>,
96        config: SearchConfig,
97    ) {
98        let fuzzy_matcher = SkimMatcherV2::default();
99
100        while let Ok(event) = receiver.recv() {
101            match event {
102                SearchEvent::Search { query, paths } => {
103                    for path in paths {
104                        if let Ok(results) =
105                            Self::search_file(&path, &query, &config, &fuzzy_matcher)
106                        {
107                            if !results.is_empty() {
108                                if let Ok(mut idx) = index.lock() {
109                                    idx.insert(path, results);
110                                }
111                            }
112                        }
113                    }
114                }
115                SearchEvent::FileChanged(path) => {
116                    // Re-index changed file
117                    if let Ok(mut idx) = index.lock() {
118                        idx.remove(&path);
119                    }
120                }
121                SearchEvent::Stop => break,
122            }
123        }
124    }
125
126    fn search_file(
127        path: &Path,
128        query: &str,
129        config: &SearchConfig,
130        matcher: &SkimMatcherV2,
131    ) -> Result<Vec<SearchResult>> {
132        let start = Instant::now();
133        let mut results = Vec::new();
134
135        // Check file size
136        let metadata = fs::metadata(path)?;
137        if metadata.len() > config.max_file_size_mb * 1024 * 1024 {
138            return Ok(results); // Skip large files
139        }
140
141        let file = File::open(path)?;
142        let reader = BufReader::new(file);
143
144        let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
145        let file_type = Self::detect_file_type(ext);
146
147        // Determine max lines based on file type
148        let max_lines = match ext {
149            "jsonl" => config.max_lines_per_file,
150            "log" => config.max_lines_per_file,
151            _ => config.max_lines_per_file * 5, // Allow more for regular files
152        };
153
154        let mut line_number = 0;
155        let mut lines_buffer: Vec<String> = Vec::with_capacity(5);
156
157        for line_result in reader.lines() {
158            // Check timeout
159            if start.elapsed().as_millis() > config.search_timeout_ms as u128 {
160                break;
161            }
162
163            line_number += 1;
164            if line_number > max_lines {
165                if config.smart_sampling {
166                    // Smart sampling: read every Nth line after limit
167                    if line_number % 10 != 0 {
168                        continue;
169                    }
170                } else {
171                    break;
172                }
173            }
174
175            if let Ok(line) = line_result {
176                // Keep a rolling buffer for context
177                lines_buffer.push(line.clone());
178                if lines_buffer.len() > 5 {
179                    lines_buffer.remove(0);
180                }
181
182                // Try fuzzy matching
183                if let Some(score) = matcher.fuzzy_match(&line, query) {
184                    if score >= config.fuzzy_threshold {
185                        // For JSONL, try to parse and extract relevant fields
186                        let content = if ext == "jsonl" {
187                            Self::extract_jsonl_content(&line).unwrap_or(line.clone())
188                        } else {
189                            line.clone()
190                        };
191
192                        results.push(SearchResult {
193                            file_path: path.to_path_buf(),
194                            line_number,
195                            content,
196                            score,
197                            context: lines_buffer.clone(),
198                            file_type: file_type.clone(),
199                            timestamp: std::time::SystemTime::now(),
200                        });
201                    }
202                }
203
204                // Also check for exact substring match (case insensitive)
205                if line.to_lowercase().contains(&query.to_lowercase()) {
206                    let content = if ext == "jsonl" {
207                        Self::extract_jsonl_content(&line).unwrap_or(line.clone())
208                    } else {
209                        line.clone()
210                    };
211
212                    results.push(SearchResult {
213                        file_path: path.to_path_buf(),
214                        line_number,
215                        content,
216                        score: 100, // High score for exact match
217                        context: lines_buffer.clone(),
218                        file_type: file_type.clone(),
219                        timestamp: std::time::SystemTime::now(),
220                    });
221                }
222            }
223        }
224
225        Ok(results)
226    }
227
228    fn extract_jsonl_content(line: &str) -> Option<String> {
229        // Try to parse JSONL and extract meaningful content
230        if let Ok(json) = serde_json::from_str::<Value>(line) {
231            // Extract common AI assistant fields
232            let mut parts = Vec::new();
233
234            if let Some(msg) = json.get("message").and_then(|v| v.as_str()) {
235                parts.push(msg.to_string());
236            }
237            if let Some(prompt) = json.get("prompt").and_then(|v| v.as_str()) {
238                parts.push(format!("Prompt: {}", prompt));
239            }
240            if let Some(response) = json.get("response").and_then(|v| v.as_str()) {
241                parts.push(format!("Response: {}", response));
242            }
243            if let Some(content) = json.get("content").and_then(|v| v.as_str()) {
244                parts.push(content.to_string());
245            }
246
247            if !parts.is_empty() {
248                return Some(parts.join(" | "));
249            }
250        }
251        None
252    }
253
254    fn detect_file_type(ext: &str) -> String {
255        match ext {
256            "json" => "json".to_string(),
257            "jsonl" => "jsonl_stream".to_string(),
258            "md" | "markdown" => "markdown".to_string(),
259            "log" => "log_file".to_string(),
260            "txt" => "text_file".to_string(),
261            _ => "unknown".to_string(),
262        }
263    }
264
265    pub async fn search(&self, query: &str, paths: Vec<PathBuf>) -> Vec<SearchResult> {
266        // Send search request to background thread
267        let _ = self.sender.send(SearchEvent::Search {
268            query: query.to_string(),
269            paths: paths.clone(),
270        });
271
272        // Wait a bit for results
273        thread::sleep(Duration::from_millis(self.config.search_timeout_ms));
274
275        // Collect results from index
276        let mut all_results = Vec::new();
277        if let Ok(idx) = self.search_index.lock() {
278            for path in paths {
279                if let Some(results) = idx.get(&path) {
280                    all_results.extend(results.clone());
281                }
282            }
283        }
284
285        // Sort by score
286        all_results.sort_by(|a, b| b.score.cmp(&a.score));
287        all_results
288    }
289
290    pub fn start_watching(&mut self, watch_paths: Vec<PathBuf>) -> Result<()> {
291        let sender = self.sender.clone();
292        let config = self.config.clone();
293
294        let mut watcher = RecommendedWatcher::new(
295            move |res: Result<Event, notify::Error>| {
296                if let Ok(event) = res {
297                    if matches!(event.kind, EventKind::Create(_) | EventKind::Modify(_)) {
298                        for path in event.paths {
299                            // Check if file matches our watch patterns
300                            if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
301                                let should_watch = config.watch_patterns.iter().any(|pattern| {
302                                    pattern.ends_with(&format!("*.{}", ext))
303                                        || pattern == &format!("*.{}", ext)
304                                });
305
306                                if should_watch {
307                                    println!("🔍 File changed, re-indexing: {}", path.display());
308                                    let _ = sender.send(SearchEvent::FileChanged(path));
309                                }
310                            }
311                        }
312                    }
313                }
314            },
315            Config::default(),
316        )?;
317
318        // Watch specified paths
319        for path in &watch_paths {
320            if path.exists() {
321                watcher.watch(path, RecursiveMode::Recursive)?;
322                println!("👁️  Watching for changes in: {}", path.display());
323            }
324        }
325
326        self.watcher = Some(watcher);
327
328        // Do initial indexing of existing files
329        self.initial_index(watch_paths)?;
330
331        Ok(())
332    }
333
334    fn initial_index(&self, watch_paths: Vec<PathBuf>) -> Result<()> {
335        println!("🔍 Initial indexing of watched directories...");
336
337        for watch_path in watch_paths {
338            if watch_path.is_dir() {
339                // Find all matching files in the directory
340                for pattern in &self.config.watch_patterns {
341                    let glob_pattern = format!("{}/{}", watch_path.display(), pattern);
342                    if let Ok(paths) = glob::glob(&glob_pattern) {
343                        let files: Vec<PathBuf> = paths
344                            .filter_map(|p| p.ok())
345                            .filter(|p| p.is_file())
346                            .collect();
347
348                        if !files.is_empty() {
349                            println!("  Found {} {} files", files.len(), pattern);
350                            // Trigger initial search/index for these files
351                            let _ = self.sender.send(SearchEvent::Search {
352                                query: String::new(), // Empty query for initial indexing
353                                paths: files,
354                            });
355                        }
356                    }
357                }
358            }
359        }
360
361        println!("✅ Initial indexing complete!");
362        Ok(())
363    }
364
365    pub fn get_cached_results(&self, path: &Path) -> Vec<SearchResult> {
366        if let Ok(idx) = self.search_index.lock() {
367            idx.get(path).cloned().unwrap_or_default()
368        } else {
369            Vec::new()
370        }
371    }
372
373    pub fn clear_cache(&self) {
374        if let Ok(mut idx) = self.search_index.lock() {
375            idx.clear();
376        }
377    }
378}
379
380// Integration with MCP
381pub async fn handle_smart_search(params: Value) -> Result<Value> {
382    let query = params["query"]
383        .as_str()
384        .ok_or_else(|| anyhow::anyhow!("Missing query parameter"))?;
385
386    let paths: Vec<PathBuf> = params["paths"]
387        .as_array()
388        .map(|arr| {
389            arr.iter()
390                .filter_map(|v| v.as_str())
391                .map(PathBuf::from)
392                .collect()
393        })
394        .unwrap_or_else(|| vec![std::env::current_dir().unwrap_or_default()]);
395
396    let config = SearchConfig::default();
397    let searcher = SmartBackgroundSearcher::new(config)?;
398
399    let results = searcher.search(query, paths).await;
400
401    // Format results for MCP
402    let formatted: Vec<Value> = results
403        .into_iter()
404        .take(20) // Limit results
405        .map(|r| {
406            serde_json::json!({
407                "file": r.file_path.to_string_lossy(),
408                "line": r.line_number,
409                "content": r.content,
410                "score": r.score,
411                "type": r.file_type,
412                "context": r.context,
413            })
414        })
415        .collect();
416
417    Ok(serde_json::json!({
418        "results": formatted,
419        "count": formatted.len(),
420        "message": format!("Found {} matches for '{}'", formatted.len(), query)
421    }))
422}