Skip to main content

forgekit_core/
indexing.rs

1//! Incremental indexing for processing file changes.
2//!
3//! This module provides change-based incremental indexing to avoid full
4//! re-scans when files are modified.
5
6use crate::storage::UnifiedGraphStore;
7use crate::watcher::WatchEvent;
8use std::collections::HashSet;
9use std::path::{Path, PathBuf};
10use std::sync::Arc;
11
/// Path filter for controlling which files get indexed.
///
/// A path is accepted when it matches no exclude pattern, matches at least one
/// include pattern, and (if any extensions are configured) carries one of the
/// allowed extensions. [`PathFilter::default`] accepts source files located in
/// `src/` and `tests/` directories only.
///
/// # Pattern syntax
///
/// Paths and patterns are split on `/` and compared component by component:
///
/// * `**` matches any number of whole components, including none. As the last
///   component of a pattern it means "everything inside", so it requires at
///   least one remaining path component.
/// * `*` matches any run of characters inside a single component and `?`
///   matches exactly one character. Neither crosses a `/`.
/// * A wildcard pattern that contains no `/` (for example `*.min.js`) is
///   compared against the file name only.
/// * A pattern without any wildcard matches when it occurs anywhere in the
///   path as a plain substring.
#[derive(Clone, Debug)]
pub struct PathFilter {
    /// Include patterns (files must match at least one)
    include_patterns: Vec<String>,
    /// Exclude patterns (files matching these are rejected)
    exclude_patterns: Vec<String>,
    /// File extensions to include, lowercase and without the dot (empty = all)
    extensions: Vec<String>,
}

impl Default for PathFilter {
    /// Equivalent to [`PathFilter::new_with_defaults`].
    fn default() -> Self {
        Self::new_with_defaults()
    }
}

impl PathFilter {
    /// Creates an empty path filter with no patterns.
    ///
    /// An empty filter rejects every path, because a path has to match at
    /// least one include pattern. Use `PathFilter::default()` or
    /// `PathFilter::new_with_defaults()` for the standard filter.
    pub fn new() -> Self {
        Self {
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            extensions: Vec::new(),
        }
    }

    /// Creates a path filter with default settings (src/ and tests/ only).
    pub fn new_with_defaults() -> Self {
        let owned = |items: &[&str]| -> Vec<String> {
            items.iter().map(|item| item.to_string()).collect()
        };

        Self {
            // Only index src/ and tests/ directories
            include_patterns: owned(&["**/src/**", "**/tests/**"]),
            // Exclude common non-source directories and files. Every entry is
            // prefixed with `**/` so it applies at any depth, including to
            // absolute paths.
            exclude_patterns: owned(&[
                "**/target/**",
                "**/node_modules/**",
                "**/.git/**",
                "**/.forge/**",
                "**/Cargo.lock",
                "**/package-lock.json",
                "**/yarn.lock",
                "**/*.min.js",
                "**/*.min.css",
            ]),
            // Only index source code files
            extensions: owned(&[
                "rs",   // Rust
                "py",   // Python
                "js",   // JavaScript
                "ts",   // TypeScript
                "jsx",  // React JSX
                "tsx",  // React TSX
                "go",   // Go
                "java", // Java
                "c",    // C
                "cpp",  // C++
                "h",    // C header
                "hpp",  // C++ header
                "mod",  // Go module
            ]),
        }
    }

    /// Creates a path filter that only includes specific directories.
    ///
    /// The exclude patterns and extensions of the default filter are kept;
    /// only the include patterns are replaced.
    ///
    /// # Arguments
    ///
    /// * `dirs` - Directories to include (e.g., ["src", "tests"])
    pub fn include_dirs(dirs: &[&str]) -> Self {
        Self {
            include_patterns: dirs.iter().map(|d| format!("**/{}/**", d)).collect(),
            ..Self::default()
        }
    }

    /// Checks if a path should be indexed.
    ///
    /// A path is indexed if:
    /// 1. It does NOT match any exclude pattern
    /// 2. It matches at least one include pattern
    /// 3. It has an allowed extension (if extensions are specified)
    ///
    /// The extension comparison is case-insensitive.
    ///
    /// # Arguments
    ///
    /// * `path` - The file path to check
    pub fn should_index(&self, path: &Path) -> bool {
        let path_str = path.to_string_lossy();

        // Excludes win over includes, so test them first.
        if Self::matches_any(&self.exclude_patterns, &path_str)
            || !Self::matches_any(&self.include_patterns, &path_str)
        {
            return false;
        }

        if self.extensions.is_empty() {
            return true;
        }

        match path.extension() {
            Some(ext) => {
                let ext = ext.to_string_lossy().to_lowercase();
                self.extensions.iter().any(|allowed| *allowed == ext)
            }
            // No extension and we require one
            None => false,
        }
    }

    /// Returns `true` when `path` matches at least one of `patterns`.
    fn matches_any(patterns: &[String], path: &str) -> bool {
        patterns
            .iter()
            .any(|pattern| Self::match_glob(path, pattern))
    }

    /// Simple glob matching (supports `*`, `?` and `**` wildcards).
    ///
    /// See the type-level documentation for the exact pattern semantics.
    fn match_glob(path: &str, pattern: &str) -> bool {
        // Wildcard-free patterns keep their historical meaning: a plain
        // substring test against the whole path.
        if !pattern.contains(|c: char| c == '*' || c == '?') {
            return path.contains(pattern);
        }

        // Empty components (leading '/', doubled '/') carry no information.
        let path_parts: Vec<&str> = path.split('/').filter(|part| !part.is_empty()).collect();

        // A wildcard pattern without a separator describes a file name.
        if !pattern.contains('/') {
            return path_parts
                .last()
                .map_or(false, |name| Self::match_component(name, pattern));
        }

        let pattern_parts: Vec<&str> = pattern
            .split('/')
            .filter(|part| !part.is_empty())
            .collect();
        Self::match_components(&path_parts, &pattern_parts)
    }

    /// Matches a list of path components against a list of pattern components.
    fn match_components(path: &[&str], pattern: &[&str]) -> bool {
        let (head, rest) = match pattern.split_first() {
            Some((&head, rest)) => (head, rest),
            // Pattern exhausted: only a fully consumed path is a match.
            None => return path.is_empty(),
        };

        if head == "**" {
            if rest.is_empty() {
                // Trailing `**` means "anything inside", never the directory itself.
                return !path.is_empty();
            }
            // Let `**` swallow 0..=len leading components and retry the rest.
            return (0..=path.len()).any(|skip| Self::match_components(&path[skip..], rest));
        }

        match path.split_first() {
            Some((&first, tail)) => {
                Self::match_component(first, head) && Self::match_components(tail, rest)
            }
            None => false,
        }
    }

    /// Matches one path component against one pattern component, where `*`
    /// stands for any run of characters and `?` for exactly one character.
    fn match_component(text: &str, pattern: &str) -> bool {
        let text: Vec<char> = text.chars().collect();
        let pattern: Vec<char> = pattern.chars().collect();

        let mut t = 0;
        let mut p = 0;
        // (pattern index just after the last `*`, text index that `*` currently ends at)
        let mut last_star: Option<(usize, usize)> = None;

        while t < text.len() {
            if p < pattern.len() && pattern[p] == '*' {
                // Start by letting the star match nothing; grow it on mismatch.
                last_star = Some((p + 1, t));
                p += 1;
            } else if p < pattern.len() && (pattern[p] == '?' || pattern[p] == text[t]) {
                t += 1;
                p += 1;
            } else if let Some((after_star, star_end)) = last_star {
                // Mismatch: extend the most recent star by one character.
                p = after_star;
                t = star_end + 1;
                last_star = Some((after_star, star_end + 1));
            } else {
                return false;
            }
        }

        // Text consumed; whatever is left of the pattern may only be stars.
        pattern[p..].iter().all(|&c| c == '*')
    }

    /// Adds an include pattern.
    pub fn add_include(&mut self, pattern: impl Into<String>) {
        self.include_patterns.push(pattern.into());
    }

    /// Adds an exclude pattern.
    pub fn add_exclude(&mut self, pattern: impl Into<String>) {
        self.exclude_patterns.push(pattern.into());
    }

    /// Adds an allowed extension.
    ///
    /// The extension is stored lowercase and without leading dots, so `"rs"`,
    /// `".rs"` and `"RS"` are equivalent.
    pub fn add_extension(&mut self, ext: impl Into<String>) {
        let ext = ext.into();
        self.extensions
            .push(ext.trim_start_matches('.').to_lowercase());
    }
}
208
209/// Incremental indexer for processing file changes.
210///
211/// The `IncrementalIndexer` batches file system events and processes
212/// them on flush, avoiding full re-indexing of the codebase.
213///
214/// # Examples
215///
216/// ```no_run
217/// use forgekit_core::indexing::IncrementalIndexer;
218/// use forgekit_core::watcher::WatchEvent;
219/// use std::path::PathBuf;
220///
221/// # #[tokio::main]
222/// # async fn main() -> anyhow::Result<()> {
223/// # use forgekit_core::BackendKind;
224/// # use std::sync::Arc;
225/// # let store = Arc::new(forgekit_core::storage::UnifiedGraphStore::open(".", BackendKind::SQLite).await?);
226/// let indexer = IncrementalIndexer::new(store);
227///
228/// // Queue some changes (only src/ and tests/ files will be indexed)
229/// indexer.queue(WatchEvent::Modified(PathBuf::from("src/lib.rs")));
230/// indexer.queue(WatchEvent::Created(PathBuf::from("tests/test.rs")));
231/// indexer.queue(WatchEvent::Modified(PathBuf::from("target/debug/build.rs"))); // Ignored
232///
233/// // Process changes
234/// indexer.flush().await?;
235/// # Ok(())
236/// # }
237/// ```
238#[derive(Clone, Debug)]
239pub struct IncrementalIndexer {
240    /// The graph store for writing index updates.
241    store: Arc<UnifiedGraphStore>,
242    /// Pending files to process.
243    pending: Arc<tokio::sync::Mutex<HashSet<PathBuf>>>,
244    /// Files to delete.
245    deleted: Arc<tokio::sync::Mutex<HashSet<PathBuf>>>,
246    /// Path filter for controlling which files get indexed.
247    filter: PathFilter,
248}
249
250impl IncrementalIndexer {
251    /// Creates a new incremental indexer with default path filtering.
252    ///
253    /// By default, only files in `src/` and `tests/` directories are indexed.
254    ///
255    /// # Arguments
256    ///
257    /// * `store` - The graph store for index updates
258    pub fn new(store: Arc<UnifiedGraphStore>) -> Self {
259        Self {
260            store,
261            pending: Arc::new(tokio::sync::Mutex::new(HashSet::new())),
262            deleted: Arc::new(tokio::sync::Mutex::new(HashSet::new())),
263            filter: PathFilter::default(),
264        }
265    }
266
267    /// Creates a new incremental indexer with a custom path filter.
268    ///
269    /// # Arguments
270    ///
271    /// * `store` - The graph store for index updates
272    /// * `filter` - Custom path filter
273    pub fn with_filter(store: Arc<UnifiedGraphStore>, filter: PathFilter) -> Self {
274        Self {
275            store,
276            pending: Arc::new(tokio::sync::Mutex::new(HashSet::new())),
277            deleted: Arc::new(tokio::sync::Mutex::new(HashSet::new())),
278            filter,
279        }
280    }
281
282    /// Returns a reference to the path filter.
283    pub fn filter(&self) -> &PathFilter {
284        &self.filter
285    }
286
287    /// Sets a new path filter.
288    pub fn set_filter(&mut self, filter: PathFilter) {
289        self.filter = filter;
290    }
291
292    /// Queues a watch event for processing.
293    ///
294    /// Only files matching the path filter will be queued.
295    ///
296    /// # Arguments
297    ///
298    /// * `event` - The watch event to queue
299    pub fn queue(&self, event: WatchEvent) {
300        match event {
301            WatchEvent::Created(path) | WatchEvent::Modified(path) => {
302                // Apply path filter
303                if !self.filter.should_index(&path) {
304                    return;
305                }
306
307                let pending = self.pending.clone();
308                tokio::spawn(async move {
309                    pending.lock().await.insert(path);
310                });
311            }
312            WatchEvent::Deleted(path) => {
313                // Apply path filter for deletions too
314                if !self.filter.should_index(&path) {
315                    return;
316                }
317
318                let deleted = self.deleted.clone();
319                tokio::spawn(async move {
320                    deleted.lock().await.insert(path);
321                });
322            }
323            WatchEvent::Error(_) => {
324                // Log error but don't fail
325            }
326        }
327    }
328
329    /// Flushes pending changes to the graph store.
330    ///
331    /// This method processes all queued file changes and updates
332    /// the index incrementally.
333    ///
334    /// # Returns
335    ///
336    /// `Ok(())` if flush succeeded, or an error.
337    ///
338    /// # Errors
339    ///
340    /// Returns an error if any file cannot be indexed.
341    pub async fn flush(&self) -> anyhow::Result<FlushStats> {
342        let mut pending = self.pending.lock().await;
343        let mut deleted = self.deleted.lock().await;
344
345        let mut stats = FlushStats::default();
346
347        // Process deletions first
348        for path in deleted.drain() {
349            if let Err(e) = self.delete_file(&path).await {
350                eprintln!("Error deleting {:?}: {}", path, e);
351            } else {
352                stats.deleted += 1;
353            }
354        }
355
356        // Process additions/updates
357        for path in pending.drain() {
358            if let Err(e) = self.index_file(&path).await {
359                eprintln!("Error indexing {:?}: {}", path, e);
360            } else {
361                stats.indexed += 1;
362            }
363        }
364
365        Ok(stats)
366    }
367
368    /// Performs a full rescan of the codebase.
369    ///
370    /// This clears all pending changes and re-indexes from scratch,
371    /// respecting the path filter.
372    ///
373    /// # Arguments
374    ///
375    /// * `root` - The root directory to scan
376    ///
377    /// # Returns
378    ///
379    /// `Ok(count)` with number of files indexed, or an error.
380    pub async fn full_rescan(&self, root: &Path) -> anyhow::Result<usize> {
381        // Clear pending
382        self.pending.lock().await.clear();
383        self.deleted.lock().await.clear();
384
385        let mut count = 0;
386
387        // Walk directory tree
388        if root.is_dir() {
389            self.scan_directory(root, &mut count).await?;
390        }
391
392        Ok(count)
393    }
394
395    /// Recursively scans a directory for files to index.
396    async fn scan_directory(&self, dir: &Path, count: &mut usize) -> anyhow::Result<()> {
397        let mut entries = tokio::fs::read_dir(dir).await?;
398
399        while let Some(entry) = entries.next_entry().await? {
400            let path = entry.path();
401
402            if path.is_dir() {
403                // Skip excluded directories early
404                let path_str = path.to_string_lossy();
405                if path_str.contains("/target/")
406                    || path_str.contains("/node_modules/")
407                    || path_str.contains("/.git/")
408                    || path_str.contains("/.forge/")
409                {
410                    continue;
411                }
412
413                // Recurse into allowed directories
414                Box::pin(self.scan_directory(&path, count)).await?;
415            } else if path.is_file() && self.filter.should_index(&path) {
416                self.pending.lock().await.insert(path);
417                *count += 1;
418            }
419        }
420
421        Ok(())
422    }
423
424    /// Returns the number of pending files to process.
425    pub async fn pending_count(&self) -> usize {
426        self.pending.lock().await.len() + self.deleted.lock().await.len()
427    }
428
429    /// Clears all pending changes without processing.
430    pub async fn clear_pending(&self) {
431        self.pending.lock().await.clear();
432        self.deleted.lock().await.clear();
433    }
434
435    /// Indexes a single file using magellan.
436    async fn index_file(&self, path: &Path) -> anyhow::Result<()> {
437        if !path.exists() || !path.is_file() {
438            return Ok(());
439        }
440
441        let db_path = self.store.db_path().join("graph.db");
442        if !db_path.exists() {
443            return Ok(());
444        }
445
446        {
447            let mut graph = magellan::CodeGraph::open(&db_path)?;
448            if let Some(parent) = path.parent() {
449                graph.scan_directory(parent, None)?;
450            }
451        }
452
453        Ok(())
454    }
455
456    /// Deletes a file from the index using magellan.
457    async fn delete_file(&self, path: &Path) -> anyhow::Result<()> {
458        let db_path = self.store.db_path().join("graph.db");
459        if !db_path.exists() {
460            return Ok(());
461        }
462
463        {
464            let mut graph = magellan::CodeGraph::open(&db_path)?;
465            let path_str = path.to_string_lossy();
466            let _ = graph.delete_file(&path_str);
467        }
468
469        Ok(())
470    }
471}
472
/// Statistics from a flush operation.
///
/// Only paths whose processing step returned without error are counted;
/// failed paths are reported on stderr by the flush and left out.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct FlushStats {
    /// Number of queued files whose indexing step completed without error.
    pub indexed: usize,
    /// Number of queued deletions that completed without error.
    pub deleted: usize,
}
481
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::{BackendKind, UnifiedGraphStore};

    /// Builds an indexer with the default filter on top of an in-memory store.
    async fn memory_indexer() -> IncrementalIndexer {
        let store = UnifiedGraphStore::memory().await.unwrap();
        IncrementalIndexer::new(Arc::new(store))
    }

    /// Gives background queue insertions time to complete.
    async fn settle() {
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
    }

    fn created(path: &str) -> WatchEvent {
        WatchEvent::Created(PathBuf::from(path))
    }

    fn modified(path: &str) -> WatchEvent {
        WatchEvent::Modified(PathBuf::from(path))
    }

    fn removed(path: &str) -> WatchEvent {
        WatchEvent::Deleted(PathBuf::from(path))
    }

    #[tokio::test]
    async fn test_indexer_creation() {
        let indexer = memory_indexer().await;

        assert_eq!(indexer.pending_count().await, 0);
    }

    #[tokio::test]
    async fn test_queue_events() {
        let indexer = memory_indexer().await;

        indexer.queue(created("src/a.rs"));
        indexer.queue(modified("src/b.rs"));
        indexer.queue(removed("src/c.rs"));
        settle().await;

        assert_eq!(indexer.pending_count().await, 3);
    }

    #[tokio::test]
    async fn test_queue_filtered_events() {
        let indexer = memory_indexer().await;

        // Accepted: inside src/ and tests/
        indexer.queue(created("src/a.rs"));
        indexer.queue(modified("tests/b.rs"));

        // Rejected: excluded directories, lock files, files outside src/ and tests/
        for rejected in [
            "target/debug/build.rs",
            "node_modules/foo/index.js",
            ".git/config",
            "Cargo.lock",
            "README.md",
        ] {
            indexer.queue(modified(rejected));
        }
        settle().await;

        // Only the two accepted paths may be queued
        assert_eq!(indexer.pending_count().await, 2);
    }

    #[test]
    fn test_path_filter_default() {
        let filter = PathFilter::default();
        let accepts = |p: &str| filter.should_index(Path::new(p));

        // src/ files, at any depth
        assert!(accepts("src/lib.rs"));
        assert!(accepts("src/main.rs"));
        assert!(accepts("project/src/module.rs"));

        // tests/ files, at any depth
        assert!(accepts("tests/test.rs"));
        assert!(accepts("project/tests/integration.rs"));

        // target/ is excluded
        assert!(!accepts("target/debug/build.rs"));
        assert!(!accepts("target/release/app"));

        // node_modules/ is excluded
        assert!(!accepts("node_modules/foo/index.js"));

        // .git/ is excluded
        assert!(!accepts(".git/config"));

        // Lock files are excluded
        assert!(!accepts("Cargo.lock"));

        // Anything outside src/ and tests/ is rejected
        assert!(!accepts("README.md"));
        assert!(!accepts("Cargo.toml"));
        assert!(!accepts("build.rs"));
    }

    #[test]
    fn test_path_filter_extensions() {
        let filter = PathFilter::default();
        let accepts = |p: &str| filter.should_index(Path::new(p));

        // Rust
        assert!(accepts("src/lib.rs"));
        assert!(accepts("tests/test.rs"));

        // Python
        assert!(accepts("src/main.py"));

        // JavaScript / TypeScript
        assert!(accepts("src/index.js"));
        assert!(accepts("src/index.ts"));
        assert!(accepts("src/App.jsx"));
        assert!(accepts("src/App.tsx"));

        // Non-source extensions are rejected
        assert!(!accepts("src/logo.png"));
        assert!(!accepts("src/data.bin"));
    }

    #[test]
    fn test_path_filter_custom() {
        let mut filter = PathFilter::new();
        filter.add_include("**/lib/**");
        filter.add_extension("go");

        assert!(filter.should_index(Path::new("lib/main.go")));
        // Outside lib/
        assert!(!filter.should_index(Path::new("src/main.go")));
        // Extension not allowed
        assert!(!filter.should_index(Path::new("lib/main.rs")));
    }

    #[tokio::test]
    async fn test_flush_clears_pending() {
        let indexer = memory_indexer().await;

        indexer.queue(modified("src/lib.rs"));
        settle().await;

        let stats = indexer.flush().await.unwrap();
        assert_eq!(stats.indexed, 1);
        assert_eq!(indexer.pending_count().await, 0);
    }

    #[tokio::test]
    async fn test_flush_stats() {
        let indexer = memory_indexer().await;

        indexer.queue(modified("src/a.rs"));
        indexer.queue(created("src/b.rs"));
        indexer.queue(removed("src/c.rs"));
        settle().await;

        let stats = indexer.flush().await.unwrap();

        assert_eq!(stats.indexed, 2);
        assert_eq!(stats.deleted, 1);
    }

    #[tokio::test]
    async fn test_clear_pending() {
        let indexer = memory_indexer().await;

        indexer.queue(modified("src/a.rs"));
        settle().await;
        assert_eq!(indexer.pending_count().await, 1);

        indexer.clear_pending().await;
        assert_eq!(indexer.pending_count().await, 0);
    }

    #[tokio::test]
    async fn test_full_rescan() {
        let temp = tempfile::tempdir().unwrap();
        let root = temp.path();
        let store = UnifiedGraphStore::open(root, BackendKind::default())
            .await
            .unwrap();
        let indexer = IncrementalIndexer::new(Arc::new(store));

        // Lay out src/, tests/ and target/ below the temporary root
        let src_dir = root.join("src");
        let tests_dir = root.join("tests");
        let target_dir = root.join("target");
        for dir in [&src_dir, &tests_dir, &target_dir] {
            tokio::fs::create_dir(dir).await.unwrap();
        }

        // Three files that must be found, two that must be ignored
        let fixtures = [
            (src_dir.join("lib.rs"), "pub fn foo() {}"),
            (src_dir.join("main.rs"), "fn main() {}"),
            (tests_dir.join("test.rs"), "#[test] fn test() {}"),
            (target_dir.join("build.rs"), "// build"),
            (root.join("README.md"), "# Project"),
        ];
        for (file, contents) in &fixtures {
            tokio::fs::write(file, contents).await.unwrap();
        }

        let count = indexer.full_rescan(root).await.unwrap();

        // Only the src/ and tests/ files count
        assert_eq!(count, 3);

        // The pending queue holds exactly those files
        let pending = indexer.pending.lock().await;
        assert!(pending.contains(&src_dir.join("lib.rs")));
        assert!(pending.contains(&src_dir.join("main.rs")));
        assert!(pending.contains(&tests_dir.join("test.rs")));
        assert!(!pending.contains(&target_dir.join("build.rs")));
        assert!(!pending.contains(&root.join("README.md")));
    }
}