codeprysm_core/
incremental.rs

1//! Incremental Updater for Code Graph System
2//!
3//! This module orchestrates incremental updates to the code graph based on
4//! file changes detected via Merkle trees. It enables efficient updates by
5//! only reprocessing modified, added, or deleted files.
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9
10use thiserror::Error;
11use tracing::{debug, info, warn};
12
13use crate::builder::{BuilderConfig, GraphBuilder};
14use crate::graph::PetCodeGraph;
15use crate::lazy::manager::LazyGraphManager;
16use crate::lazy::partitioner::GraphPartitioner;
17use crate::merkle::{ChangeSet, ExclusionFilter, MerkleTree, MerkleTreeManager};
18
19// ============================================================================
20// Errors
21// ============================================================================
22
23/// Errors that can occur during incremental updates.
24#[derive(Debug, Error)]
25pub enum UpdaterError {
26    /// IO error
27    #[error("IO error: {0}")]
28    Io(#[from] std::io::Error),
29
30    /// Lazy graph error
31    #[error("Graph error: {0}")]
32    LazyGraph(#[from] crate::lazy::manager::LazyGraphError),
33
34    /// Partition error
35    #[error("Partition error: {0}")]
36    Partition(#[from] crate::lazy::partitioner::PartitionerError),
37
38    /// Repository not found
39    #[error("Repository not found: {0}")]
40    RepoNotFound(PathBuf),
41
42    /// Queries directory not found
43    #[error("Queries directory not found: {0}")]
44    QueriesNotFound(PathBuf),
45
46    /// Builder error
47    #[error("Builder error: {0}")]
48    Builder(#[from] crate::builder::BuilderError),
49
50    /// Merkle tree error
51    #[error("Merkle tree error: {0}")]
52    Merkle(#[from] crate::merkle::MerkleError),
53}
54
55/// Result type for updater operations.
56pub type Result<T> = std::result::Result<T, UpdaterError>;
57
58// ============================================================================
59// Incremental Updater
60// ============================================================================
61
62/// Manages incremental updates to the code graph.
63///
64/// Uses Merkle tree change detection to identify modified files, then
65/// selectively updates only the affected entities in the graph.
66///
67/// ## Example
68///
69/// ```ignore
70/// use codeprysm_core::incremental::IncrementalUpdater;
71/// use std::path::Path;
72///
73/// let mut updater = IncrementalUpdater::new(
74///     Path::new("./my-repo"),
75///     Path::new("./.codeprysm"),
76///     Path::new("./queries"),
77/// )?;
78///
79/// // Perform incremental update
80/// let result = updater.update_repository(false)?;
81/// if result.has_changes() {
82///     println!("Updated {} files", result.changes.total_changes());
83/// }
84/// ```
85pub struct IncrementalUpdater {
86    /// Path to the repository being indexed
87    repo_path: PathBuf,
88    /// Path to the .codeprysm directory containing partitions
89    prism_dir: PathBuf,
90    /// Path to SCM query files (None = use embedded queries)
91    queries_dir: Option<PathBuf>,
92    /// Builder configuration
93    builder_config: BuilderConfig,
94    /// Merkle tree manager
95    merkle_manager: MerkleTreeManager,
96    /// Current graph state (loaded or built) - uses PetCodeGraph for efficient operations
97    graph: Option<PetCodeGraph>,
98    /// Current Merkle tree extracted from graph
99    current_merkle_tree: MerkleTree,
100}
101
102impl IncrementalUpdater {
103    /// Create a new incremental updater using embedded queries.
104    ///
105    /// This is the preferred constructor for production use as it doesn't require
106    /// external query files.
107    ///
108    /// # Arguments
109    ///
110    /// * `repo_path` - Path to the repository being indexed
111    /// * `prism_dir` - Path to the .codeprysm directory containing partitions
112    ///
113    /// # Errors
114    ///
115    /// Returns an error if the repository doesn't exist.
116    pub fn new_with_embedded_queries(repo_path: &Path, prism_dir: &Path) -> Result<Self> {
117        if !repo_path.exists() {
118            return Err(UpdaterError::RepoNotFound(repo_path.to_path_buf()));
119        }
120
121        let merkle_manager = MerkleTreeManager::default();
122
123        Ok(Self {
124            repo_path: repo_path.to_path_buf(),
125            prism_dir: prism_dir.to_path_buf(),
126            queries_dir: None,
127            builder_config: BuilderConfig::default(),
128            merkle_manager,
129            graph: None,
130            current_merkle_tree: HashMap::new(),
131        })
132    }
133
134    /// Create an updater with embedded queries and custom configuration.
135    pub fn with_embedded_queries(
136        repo_path: &Path,
137        prism_dir: &Path,
138        exclusion_filter: ExclusionFilter,
139        builder_config: BuilderConfig,
140    ) -> Result<Self> {
141        if !repo_path.exists() {
142            return Err(UpdaterError::RepoNotFound(repo_path.to_path_buf()));
143        }
144
145        let merkle_manager = MerkleTreeManager::new(exclusion_filter);
146
147        Ok(Self {
148            repo_path: repo_path.to_path_buf(),
149            prism_dir: prism_dir.to_path_buf(),
150            queries_dir: None,
151            builder_config,
152            merkle_manager,
153            graph: None,
154            current_merkle_tree: HashMap::new(),
155        })
156    }
157
158    /// Create a new incremental updater using query files.
159    ///
160    /// # Arguments
161    ///
162    /// * `repo_path` - Path to the repository being indexed
163    /// * `prism_dir` - Path to the .codeprysm directory containing partitions
164    /// * `queries_dir` - Path to directory containing SCM query files
165    ///
166    /// # Errors
167    ///
168    /// Returns an error if the repository or queries directory doesn't exist.
169    pub fn new(repo_path: &Path, prism_dir: &Path, queries_dir: &Path) -> Result<Self> {
170        // Validate paths
171        if !repo_path.exists() {
172            return Err(UpdaterError::RepoNotFound(repo_path.to_path_buf()));
173        }
174        if !queries_dir.exists() {
175            return Err(UpdaterError::QueriesNotFound(queries_dir.to_path_buf()));
176        }
177
178        let merkle_manager = MerkleTreeManager::default();
179
180        Ok(Self {
181            repo_path: repo_path.to_path_buf(),
182            prism_dir: prism_dir.to_path_buf(),
183            queries_dir: Some(queries_dir.to_path_buf()),
184            builder_config: BuilderConfig::default(),
185            merkle_manager,
186            graph: None,
187            current_merkle_tree: HashMap::new(),
188        })
189    }
190
191    /// Create an updater with custom configuration using query files.
192    pub fn with_config(
193        repo_path: &Path,
194        prism_dir: &Path,
195        queries_dir: &Path,
196        exclusion_filter: ExclusionFilter,
197        builder_config: BuilderConfig,
198    ) -> Result<Self> {
199        if !repo_path.exists() {
200            return Err(UpdaterError::RepoNotFound(repo_path.to_path_buf()));
201        }
202        if !queries_dir.exists() {
203            return Err(UpdaterError::QueriesNotFound(queries_dir.to_path_buf()));
204        }
205
206        let merkle_manager = MerkleTreeManager::new(exclusion_filter);
207
208        Ok(Self {
209            repo_path: repo_path.to_path_buf(),
210            prism_dir: prism_dir.to_path_buf(),
211            queries_dir: Some(queries_dir.to_path_buf()),
212            builder_config,
213            merkle_manager,
214            graph: None,
215            current_merkle_tree: HashMap::new(),
216        })
217    }
218
219    /// Load the existing graph state from partitions.
220    ///
221    /// # Returns
222    ///
223    /// `true` if graph was loaded successfully, `false` if prism_dir doesn't exist.
224    pub fn load_graph_state(&mut self) -> Result<bool> {
225        let manifest_path = self.prism_dir.join("manifest.json");
226        if !manifest_path.exists() {
227            info!("Partitioned graph not found: {:?}", self.prism_dir);
228            return Ok(false);
229        }
230
231        info!("Loading graph from {:?}", self.prism_dir);
232
233        // Open lazy graph manager and load all partitions
234        let manager = LazyGraphManager::open(&self.prism_dir)?;
235        manager.load_all_partitions()?;
236
237        // Clone the graph for our use (acquire read lock first)
238        let graph = manager.graph_read().clone();
239
240        info!(
241            "Loaded graph: {} nodes, {} edges",
242            graph.node_count(),
243            graph.edge_count()
244        );
245
246        // Extract Merkle tree from FILE entities
247        self.current_merkle_tree = self.extract_merkle_tree_from_graph(&graph);
248        info!(
249            "Extracted Merkle tree: {} files",
250            self.current_merkle_tree.len()
251        );
252
253        self.graph = Some(graph);
254        Ok(true)
255    }
256
257    /// Extract file hashes from file nodes in the graph.
258    /// Supports both legacy FILE type and Container with kind="file".
259    fn extract_merkle_tree_from_graph(&self, graph: &PetCodeGraph) -> MerkleTree {
260        let mut merkle_tree = HashMap::new();
261
262        for node in graph.iter_nodes().filter(|n| n.is_file()) {
263            if let Some(hash) = &node.hash {
264                merkle_tree.insert(node.file.clone(), hash.clone());
265            }
266        }
267
268        merkle_tree
269    }
270
271    /// Detect changes in the repository since last update.
272    ///
273    /// Builds a new Merkle tree from the current filesystem state and
274    /// compares it with the stored state.
275    pub fn detect_repository_changes(&mut self) -> Result<ChangeSet> {
276        info!("Detecting repository changes...");
277
278        // Build current Merkle tree
279        let new_merkle_tree = self.merkle_manager.build_merkle_tree(&self.repo_path)?;
280
281        // Compare with stored state
282        let changes = self
283            .merkle_manager
284            .detect_changes(&self.current_merkle_tree, &new_merkle_tree);
285
286        // Update current state
287        self.current_merkle_tree = new_merkle_tree;
288
289        Ok(changes)
290    }
291
292    /// Perform incremental update of the repository.
293    ///
294    /// # Arguments
295    ///
296    /// * `force_rebuild` - If true, rebuild everything regardless of changes
297    ///
298    /// # Returns
299    ///
300    /// `UpdateResult` with information about what was updated.
301    pub fn update_repository(&mut self, force_rebuild: bool) -> Result<UpdateResult> {
302        if force_rebuild {
303            info!("Performing force rebuild...");
304            return self.full_rebuild();
305        }
306
307        // Load existing state
308        if !self.load_graph_state()? {
309            info!("No existing graph found, performing initial build...");
310            return self.full_rebuild();
311        }
312
313        // Detect changes
314        let changes = self.detect_repository_changes()?;
315
316        if !changes.has_changes() {
317            info!("No changes detected, graph is up to date");
318            return Ok(UpdateResult {
319                success: true,
320                changes,
321                was_full_rebuild: false,
322            });
323        }
324
325        info!("Processing {} changed files...", changes.total_changes());
326
327        // Process changes
328        self.process_changes(&changes)?;
329
330        // Save updated graph
331        self.save_graph()?;
332
333        info!("Incremental update completed successfully");
334
335        Ok(UpdateResult {
336            success: true,
337            changes,
338            was_full_rebuild: false,
339        })
340    }
341
342    /// Process detected file changes.
343    fn process_changes(&mut self, changes: &ChangeSet) -> Result<()> {
344        let start = std::time::Instant::now();
345
346        // Handle deleted files first
347        if !changes.deleted.is_empty() {
348            self.process_deleted_files(&changes.deleted);
349        }
350
351        // Handle modified files - remove old nodes before reparsing
352        if !changes.modified.is_empty() {
353            self.process_modified_files(&changes.modified);
354        }
355
356        // Reparse modified and added files
357        let files_to_reparse: Vec<String> = changes
358            .modified
359            .iter()
360            .chain(changes.added.iter())
361            .cloned()
362            .collect();
363
364        if !files_to_reparse.is_empty() {
365            self.reparse_files(&files_to_reparse)?;
366        }
367
368        let elapsed = start.elapsed();
369        info!(
370            "Change processing completed in {:.2}s",
371            elapsed.as_secs_f64()
372        );
373
374        Ok(())
375    }
376
377    /// Remove nodes for deleted files.
378    fn process_deleted_files(&mut self, deleted_files: &[String]) {
379        info!("Processing {} deleted files...", deleted_files.len());
380
381        let graph = self
382            .graph
383            .as_mut()
384            .expect("Graph must be loaded before processing changes");
385
386        for file_path in deleted_files {
387            graph.remove_file_nodes(file_path);
388            debug!("Removed nodes for deleted file: {}", file_path);
389        }
390    }
391
392    /// Remove nodes for modified files (before reparsing).
393    fn process_modified_files(&mut self, modified_files: &[String]) {
394        let graph = self
395            .graph
396            .as_mut()
397            .expect("Graph must be loaded before processing changes");
398
399        for file_path in modified_files {
400            graph.remove_file_nodes(file_path);
401            debug!("Removed nodes for modified file: {}", file_path);
402        }
403    }
404
405    /// Reparse modified and added files.
406    fn reparse_files(&mut self, file_paths: &[String]) -> Result<()> {
407        info!("Reparsing {} files...", file_paths.len());
408
409        // Create a builder for parsing - use embedded queries or custom directory
410        let mut builder = match &self.queries_dir {
411            Some(dir) => GraphBuilder::with_config(dir, self.builder_config.clone())?,
412            None => GraphBuilder::with_embedded_queries(self.builder_config.clone()),
413        };
414
415        // Collect file graphs first to avoid borrow issues
416        let mut file_graphs = Vec::new();
417
418        for rel_path in file_paths {
419            let abs_path = self.repo_path.join(rel_path);
420
421            if !abs_path.exists() {
422                warn!("File not found during reparse: {}", rel_path);
423                continue;
424            }
425
426            // Parse single file with full entity extraction (returns PetCodeGraph directly)
427            match builder.parse_file(&abs_path, rel_path) {
428                Ok(file_graph) => {
429                    file_graphs.push((rel_path.clone(), file_graph));
430                }
431                Err(e) => {
432                    warn!("Error reparsing {}: {}", rel_path, e);
433                }
434            }
435        }
436
437        // Now merge all file graphs into the main graph
438        let graph = self
439            .graph
440            .as_mut()
441            .expect("Graph must be loaded before processing changes");
442
443        for (rel_path, file_graph) in file_graphs {
444            Self::merge_file_graph(graph, file_graph);
445            debug!("Reparsed file: {}", rel_path);
446        }
447
448        Ok(())
449    }
450
451    /// Merge a file's graph into the main graph.
452    fn merge_file_graph(main_graph: &mut PetCodeGraph, file_graph: PetCodeGraph) {
453        // Add all nodes from file graph
454        for node in file_graph.iter_nodes() {
455            if !main_graph.contains_node(&node.id) {
456                main_graph.add_node(node.clone());
457            }
458        }
459
460        // Add all edges from file graph
461        for edge in file_graph.iter_edges() {
462            main_graph.add_edge_from_struct(&edge);
463        }
464    }
465
466    /// Save the updated graph to partitions.
467    fn save_graph(&self) -> Result<()> {
468        let graph = self.graph.as_ref().expect("Graph must exist to save");
469
470        info!("Saving graph to {:?}", self.prism_dir);
471
472        // Determine root name from repo path
473        let root_name = self
474            .repo_path
475            .file_name()
476            .map(|s| s.to_string_lossy().to_string())
477            .unwrap_or_else(|| "default".to_string());
478
479        // Partition and save to prism directory
480        let (_, stats) =
481            GraphPartitioner::partition_with_stats(graph, &self.prism_dir, Some(&root_name))?;
482
483        info!(
484            "Saved graph: {} nodes, {} partitions, {} cross-partition edges",
485            stats.total_nodes, stats.partition_count, stats.cross_partition_edges
486        );
487
488        Ok(())
489    }
490
491    /// Perform a full rebuild of the repository.
492    fn full_rebuild(&mut self) -> Result<UpdateResult> {
493        info!("Performing full rebuild...");
494
495        // Build Merkle tree
496        let merkle_tree = self.merkle_manager.build_merkle_tree(&self.repo_path)?;
497
498        // Build graph using GraphBuilder - use embedded queries or custom directory
499        let mut builder = match &self.queries_dir {
500            Some(dir) => GraphBuilder::with_config(dir, self.builder_config.clone())?,
501            None => GraphBuilder::with_embedded_queries(self.builder_config.clone()),
502        };
503        let mut graph = builder.build_from_directory(&self.repo_path)?;
504
505        // Add file hashes to file entities (legacy FILE type or Container with kind="file")
506        // Collect file nodes first to avoid borrow issues
507        let file_nodes: Vec<(String, String)> = graph
508            .iter_nodes()
509            .filter(|n| n.is_file())
510            .map(|n| (n.id.clone(), n.file.clone()))
511            .collect();
512
513        for (node_id, file_path) in file_nodes {
514            if let Some(hash) = merkle_tree.get(&file_path) {
515                if let Some(node_mut) = graph.get_node_mut(&node_id) {
516                    node_mut.hash = Some(hash.clone());
517                }
518            }
519        }
520
521        // Store state
522        self.graph = Some(graph);
523        self.current_merkle_tree = merkle_tree.clone();
524
525        // Save graph
526        self.save_graph()?;
527
528        // Return result indicating full rebuild
529        let changes = ChangeSet {
530            added: merkle_tree.keys().cloned().collect(),
531            modified: vec![],
532            deleted: vec![],
533        };
534
535        info!("Full rebuild completed");
536
537        Ok(UpdateResult {
538            success: true,
539            changes,
540            was_full_rebuild: true,
541        })
542    }
543
544    /// Get a reference to the current graph.
545    pub fn graph(&self) -> Option<&PetCodeGraph> {
546        self.graph.as_ref()
547    }
548
549    /// Get a mutable reference to the current graph.
550    pub fn graph_mut(&mut self) -> Option<&mut PetCodeGraph> {
551        self.graph.as_mut()
552    }
553
554    /// Get the current Merkle tree.
555    pub fn merkle_tree(&self) -> &MerkleTree {
556        &self.current_merkle_tree
557    }
558}
559
560// ============================================================================
561// Update Result
562// ============================================================================
563
564/// Result of an incremental update operation.
565#[derive(Debug, Clone)]
566pub struct UpdateResult {
567    /// Whether the update was successful.
568    pub success: bool,
569    /// Changes that were processed.
570    pub changes: ChangeSet,
571    /// Whether this was a full rebuild.
572    pub was_full_rebuild: bool,
573}
574
575impl UpdateResult {
576    /// Check if any changes were made.
577    pub fn has_changes(&self) -> bool {
578        self.changes.has_changes() || self.was_full_rebuild
579    }
580}
581
582// ============================================================================
583// Tests
584// ============================================================================
585
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::TempDir;

    /// Create a temporary repository containing a single small Python file.
    ///
    /// The returned `TempDir` must be kept alive for as long as the path is used.
    fn setup_test_repo() -> (TempDir, PathBuf) {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path().to_path_buf();

        // Create a simple Python file
        let py_file = repo_path.join("test.py");
        let mut file = File::create(&py_file).unwrap();
        writeln!(file, "def hello():").unwrap();
        writeln!(file, "    print('Hello, World!')").unwrap();

        (temp_dir, repo_path)
    }

    /// Locate the SCM queries directory.
    ///
    /// Prefers `<crate>/queries`, then `<crate>/../../src/queries`. The
    /// returned path may not exist; callers that need real query files must
    /// check and skip. Never panics, even if the manifest directory has fewer
    /// than two ancestors.
    fn get_queries_dir() -> PathBuf {
        let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        let crate_queries = manifest_dir.join("queries");

        if crate_queries.exists() {
            return crate_queries;
        }

        // `ancestors()` yields the path itself, its parent, then its grandparent.
        match manifest_dir.ancestors().nth(2) {
            Some(workspace_root) => workspace_root.join("src").join("queries"),
            None => crate_queries,
        }
    }

    #[test]
    fn test_updater_creation() {
        let (_temp_dir, repo_path) = setup_test_repo();
        let prism_dir = repo_path.join(".codeprysm");
        std::fs::create_dir_all(&prism_dir).unwrap();
        let queries_dir = get_queries_dir();

        if !queries_dir.exists() {
            // Skip test if queries not available
            return;
        }

        let result = IncrementalUpdater::new(&repo_path, &prism_dir, &queries_dir);
        assert!(result.is_ok());
    }

    #[test]
    fn test_updater_creation_embedded_queries() {
        let (_temp_dir, repo_path) = setup_test_repo();
        let prism_dir = repo_path.join(".codeprysm");

        let updater = IncrementalUpdater::new_with_embedded_queries(&repo_path, &prism_dir)
            .unwrap();

        // A freshly constructed updater has no graph and an empty Merkle tree.
        assert!(updater.graph().is_none());
        assert!(updater.merkle_tree().is_empty());
    }

    #[test]
    fn test_updater_missing_repo() {
        let temp_dir = TempDir::new().unwrap();
        let missing_repo = temp_dir.path().join("missing-repo");
        let prism_dir = temp_dir.path().join(".codeprysm");

        // The repository is validated before the queries directory, so this
        // holds whether or not the queries directory exists.
        let result = IncrementalUpdater::new(&missing_repo, &prism_dir, &get_queries_dir());
        assert!(matches!(result, Err(UpdaterError::RepoNotFound(_))));

        let result = IncrementalUpdater::new_with_embedded_queries(&missing_repo, &prism_dir);
        assert!(matches!(result, Err(UpdaterError::RepoNotFound(_))));
    }

    #[test]
    fn test_updater_missing_queries_dir() {
        let (_temp_dir, repo_path) = setup_test_repo();
        let prism_dir = repo_path.join(".codeprysm");
        let missing_queries = repo_path.join("missing-queries");

        let result = IncrementalUpdater::new(&repo_path, &prism_dir, &missing_queries);

        assert!(matches!(result, Err(UpdaterError::QueriesNotFound(_))));
    }

    #[test]
    fn test_extract_merkle_tree_from_graph() {
        let (_temp_dir, repo_path) = setup_test_repo();
        let prism_dir = repo_path.join(".codeprysm");
        std::fs::create_dir_all(&prism_dir).unwrap();

        // Extraction does not touch query files, so the embedded-queries
        // constructor lets this test run unconditionally.
        let updater =
            IncrementalUpdater::new_with_embedded_queries(&repo_path, &prism_dir).unwrap();

        // Create a graph with file nodes (Container with kind="file")
        let mut graph = PetCodeGraph::new();
        graph.add_node(crate::graph::Node::source_file(
            "test.py".to_string(),
            "test.py".to_string(),
            "abc123".to_string(),
            100,
        ));
        graph.add_node(crate::graph::Node::source_file(
            "main.py".to_string(),
            "main.py".to_string(),
            "def456".to_string(),
            100,
        ));

        let merkle_tree = updater.extract_merkle_tree_from_graph(&graph);

        assert_eq!(merkle_tree.len(), 2);
        assert_eq!(merkle_tree.get("test.py"), Some(&"abc123".to_string()));
        assert_eq!(merkle_tree.get("main.py"), Some(&"def456".to_string()));
    }

    #[test]
    fn test_update_result() {
        let result = UpdateResult {
            success: true,
            changes: ChangeSet {
                modified: vec!["a.py".to_string()],
                added: vec![],
                deleted: vec![],
            },
            was_full_rebuild: false,
        };

        assert!(result.has_changes());

        let result_no_changes = UpdateResult {
            success: true,
            changes: ChangeSet::new(),
            was_full_rebuild: false,
        };

        assert!(!result_no_changes.has_changes());

        let result_full_rebuild = UpdateResult {
            success: true,
            changes: ChangeSet::new(),
            was_full_rebuild: true,
        };

        assert!(result_full_rebuild.has_changes());
    }
}