codeprysm_core/
builder.rs

//! Graph Builder for Code Graph Generation
//!
//! This module provides the `GraphBuilder` struct for constructing code graphs
//! from source files using tree-sitter parsing and SCM queries.
//!
//! ## Usage
//!
//! ```ignore
//! use codeprysm_core::builder::GraphBuilder;
//! use std::path::Path;
//!
//! // The build methods take `&mut self`, so the builder must be mutable.
//! let mut builder = GraphBuilder::new(Path::new("queries"))?;
//! let graph = builder.build_from_directory(Path::new("src"))?;
//! ```
15
16use std::collections::HashMap;
17use std::path::{Path, PathBuf};
18
19use ignore::WalkBuilder;
20use thiserror::Error;
21use tracing::{debug, info, warn};
22
23use crate::discovery::{DiscoveredRoot, RootDiscovery};
24use crate::graph::{
25    CallableKind, ContainerKind, DataKind, Edge, EdgeType, Node, NodeMetadata, NodeType,
26    PetCodeGraph,
27};
28use crate::manifest::{DependencyType, LocalDependency, ManifestInfo, ManifestParser};
29use crate::merkle::compute_file_hash;
30use crate::parser::{
31    ContainmentContext, ManifestLanguage, MetadataExtractor, SupportedLanguage, TagExtractor,
32    generate_node_id,
33};
34use crate::tags::{TagParseResult, parse_tag_string};
35
36// ============================================================================
37// Errors
38// ============================================================================
39
/// Errors that can occur during graph building.
///
/// `Io` and `Parser` carry `#[from]` conversions, so the `?` operator turns
/// the underlying error types into a `BuilderError` automatically. The other
/// variants are constructed explicitly by the builder and carry the path
/// that triggered the failure.
#[derive(Debug, Error)]
pub enum BuilderError {
    /// A filesystem operation failed; wraps the underlying `std::io::Error`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// An error reported by `crate::parser` (for example while creating a
    /// tag extractor or extracting tags from a file).
    #[error("Parser error: {0}")]
    Parser(#[from] crate::parser::ParserError),

    /// The queries directory given to the builder was not found.
    #[error("Query directory not found: {0}")]
    QueryDirNotFound(PathBuf),

    /// No supported source files were found under the directory being built.
    #[error("No supported files found in directory: {0}")]
    NoFilesFound(PathBuf),
}
59
60// ============================================================================
61// Builder Configuration
62// ============================================================================
63
/// Tunable options controlling how the graph builder constructs a graph.
#[derive(Debug, Clone)]
pub struct BuilderConfig {
    /// When `true`, Data nodes (parameters, locals, fields) are omitted to
    /// keep the resulting graph smaller.
    pub skip_data_nodes: bool,
    /// Upper bound on containment depth; `None` means unlimited.
    pub max_containment_depth: Option<usize>,
    /// Upper bound on the number of files processed; `None` means unlimited.
    pub max_files: Option<usize>,
    /// Glob patterns describing files that must be excluded.
    pub exclude_patterns: Vec<String>,
}

impl Default for BuilderConfig {
    /// Returns a configuration with no limits, Data nodes enabled, and a
    /// stock list of excluded VCS, dependency, and build-output directories.
    fn default() -> Self {
        // Directories that practically never contain first-party source code.
        const DEFAULT_EXCLUDES: [&str; 9] = [
            "**/.git/**",
            "**/node_modules/**",
            "**/target/**",
            "**/__pycache__/**",
            "**/.venv/**",
            "**/venv/**",
            "**/.tox/**",
            "**/dist/**",
            "**/build/**",
        ];

        let exclude_patterns = DEFAULT_EXCLUDES
            .iter()
            .map(|pattern| String::from(*pattern))
            .collect();

        Self {
            exclude_patterns,
            max_files: None,
            max_containment_depth: None,
            skip_data_nodes: false,
        }
    }
}
97
98// ============================================================================
99// Reference Info
100// ============================================================================
101
/// Information about a reference to be resolved later.
///
/// Instances are collected per referenced name while files are processed and
/// are handed to the reference-resolution step once all files are done.
#[derive(Debug, Clone)]
struct ReferenceInfo {
    /// Source node ID (where the reference comes from), i.e. the ID computed
    /// for the definition enclosing the reference.
    source_id: String,
    /// Line number of the reference
    line: usize,
}
110
111// ============================================================================
112// Graph Builder
113// ============================================================================
114
/// Builds code graphs from source directories.
///
/// The `GraphBuilder` walks a source directory, parses files using tree-sitter,
/// extracts code entities via SCM queries, and constructs a code graph with
/// nodes and edges representing code structure and dependencies.
///
/// ## Example
///
/// ```ignore
/// use codeprysm_core::builder::{GraphBuilder, BuilderConfig};
/// use std::path::Path;
///
/// let config = BuilderConfig::default();
/// // The build methods take `&mut self`, so the builder must be mutable.
/// let mut builder = GraphBuilder::with_config(Path::new("queries"), config)?;
/// let graph = builder.build_from_directory(Path::new("src"))?;
///
/// println!("Built graph with {} nodes", graph.node_count());
/// ```
pub struct GraphBuilder {
    /// Path to SCM query files (None = use embedded queries)
    queries_dir: Option<PathBuf>,
    /// Builder configuration
    config: BuilderConfig,
}
139
140impl GraphBuilder {
141    /// Create a new graph builder with default configuration using embedded queries.
142    ///
143    /// This is the preferred constructor for production use as it doesn't require
144    /// external query files.
145    pub fn new_with_embedded_queries() -> Self {
146        Self {
147            queries_dir: None,
148            config: BuilderConfig::default(),
149        }
150    }
151
152    /// Create a new graph builder with custom configuration using embedded queries.
153    ///
154    /// This is the preferred constructor for production use as it doesn't require
155    /// external query files.
156    pub fn with_embedded_queries(config: BuilderConfig) -> Self {
157        Self {
158            queries_dir: None,
159            config,
160        }
161    }
162
163    /// Create a new graph builder with default configuration using query files.
164    ///
165    /// # Arguments
166    ///
167    /// * `queries_dir` - Path to directory containing SCM query files
168    ///
169    /// # Errors
170    ///
171    /// Returns an error if the queries directory doesn't exist.
172    pub fn new(queries_dir: &Path) -> Result<Self, BuilderError> {
173        Self::with_config(queries_dir, BuilderConfig::default())
174    }
175
176    /// Create a new graph builder with custom configuration using query files.
177    ///
178    /// # Arguments
179    ///
180    /// * `queries_dir` - Path to directory containing SCM query files
181    /// * `config` - Builder configuration
182    ///
183    /// # Errors
184    ///
185    /// Returns an error if the queries directory doesn't exist.
186    pub fn with_config(queries_dir: &Path, config: BuilderConfig) -> Result<Self, BuilderError> {
187        if !queries_dir.exists() {
188            return Err(BuilderError::QueryDirNotFound(queries_dir.to_path_buf()));
189        }
190
191        Ok(Self {
192            queries_dir: Some(queries_dir.to_path_buf()),
193            config,
194        })
195    }
196
197    /// Build a code graph from a directory.
198    ///
199    /// Walks the directory, processes all supported source files, and
200    /// constructs a code graph with nodes and edges.
201    ///
202    /// # Arguments
203    ///
204    /// * `directory` - Root directory to process
205    ///
206    /// # Returns
207    ///
208    /// A `PetCodeGraph` containing all discovered code entities and relationships.
209    /// Uses petgraph::StableGraph internally for efficient traversal and algorithms.
210    pub fn build_from_directory(&mut self, directory: &Path) -> Result<PetCodeGraph, BuilderError> {
211        let mut graph = PetCodeGraph::new();
212
213        // Create Repository node as root of the hierarchy
214        let repo_name = get_repo_name(directory);
215        let (git_remote, git_branch, git_commit) = extract_git_metadata(directory);
216        let repo_metadata = NodeMetadata::default().with_git(git_remote, git_branch, git_commit);
217        let repo_node = Node::repository(repo_name.clone(), repo_metadata);
218        graph.add_node(repo_node);
219
220        info!("Created repository node: {}", repo_name);
221
222        // Track definitions and references for later resolution
223        let mut defines: HashMap<String, String> = HashMap::new();
224        let mut references: HashMap<String, Vec<ReferenceInfo>> = HashMap::new();
225
226        // Statistics
227        let mut file_count = 0;
228        let mut skipped_data_nodes = 0;
229        let mut skipped_depth_nodes = 0;
230
231        info!("Processing files in {}", directory.display());
232
233        // Collect files to process
234        let files: Vec<PathBuf> = self.collect_files(directory)?;
235
236        if files.is_empty() {
237            return Err(BuilderError::NoFilesFound(directory.to_path_buf()));
238        }
239
240        info!("Found {} files to process", files.len());
241
242        // Process each file
243        for file_path in files {
244            // Check max files limit
245            if let Some(max) = self.config.max_files {
246                if file_count >= max {
247                    info!("Reached maximum file limit of {}", max);
248                    break;
249                }
250            }
251
252            // Get relative path
253            let rel_path = file_path
254                .strip_prefix(directory)
255                .unwrap_or(&file_path)
256                .to_string_lossy()
257                .to_string();
258
259            // Process the file
260            match self.process_file(
261                &file_path,
262                &rel_path,
263                &repo_name,
264                &mut graph,
265                &mut defines,
266                &mut references,
267                &mut skipped_data_nodes,
268                &mut skipped_depth_nodes,
269            ) {
270                Ok(_) => {
271                    file_count += 1;
272                    if file_count % 100 == 0 {
273                        debug!("Processed {} files", file_count);
274                    }
275                }
276                Err(e) => {
277                    warn!("Error processing {}: {}", rel_path, e);
278                }
279            }
280        }
281
282        info!("Processed {} files", file_count);
283
284        // Resolve references and create USES edges
285        self.resolve_references(&mut graph, &defines, &references);
286
287        // Log statistics
288        let contains_count = graph.edges_by_type(EdgeType::Contains).count();
289        let uses_count = graph.edges_by_type(EdgeType::Uses).count();
290        let defines_count = graph.edges_by_type(EdgeType::Defines).count();
291
292        info!("Graph summary:");
293        info!("  - Nodes: {}", graph.node_count());
294        info!("  - CONTAINS edges: {}", contains_count);
295        info!("  - USES edges: {}", uses_count);
296        info!("  - DEFINES edges: {}", defines_count);
297        info!("  - Total edges: {}", graph.edge_count());
298
299        if skipped_data_nodes > 0 || skipped_depth_nodes > 0 {
300            info!("Performance filtering:");
301            if skipped_data_nodes > 0 {
302                info!("  - Skipped Data nodes: {}", skipped_data_nodes);
303            }
304            if skipped_depth_nodes > 0 {
305                info!("  - Skipped nodes (max depth): {}", skipped_depth_nodes);
306            }
307        }
308
309        Ok(graph)
310    }
311
312    /// Build a code graph from a workspace root that may contain multiple repositories.
313    ///
314    /// This method discovers all code roots (git repositories and code directories)
315    /// under the given workspace path and builds a unified graph.
316    ///
317    /// - If the root is a single repository or contains only one code root,
318    ///   returns a graph with that repository as the root (backward compatible).
319    /// - Otherwise, creates a workspace container with multiple repository children.
320    ///
321    /// # Arguments
322    ///
323    /// * `workspace_path` - Root directory to analyze (may contain multiple repos)
324    ///
325    /// # Returns
326    ///
327    /// A tuple containing:
328    /// - The unified `PetCodeGraph` with all discovered code entities
329    /// - A list of `DiscoveredRoot` describing each discovered root
330    ///
331    /// # Example
332    ///
333    /// ```ignore
334    /// use codeprysm_core::builder::GraphBuilder;
335    /// use std::path::Path;
336    ///
337    /// let builder = GraphBuilder::new(Path::new("queries"))?;
338    /// let (graph, roots) = builder.build_from_workspace(Path::new("/workspace"))?;
339    ///
340    /// println!("Found {} code roots", roots.len());
341    /// println!("Graph has {} nodes", graph.node_count());
342    /// ```
343    pub fn build_from_workspace(
344        &mut self,
345        workspace_path: &Path,
346    ) -> Result<(PetCodeGraph, Vec<DiscoveredRoot>), BuilderError> {
347        let workspace_path = workspace_path.canonicalize().map_err(BuilderError::Io)?;
348
349        info!("Building workspace graph from {:?}", workspace_path);
350
351        // Discover roots under the workspace
352        let discovery = RootDiscovery::with_defaults();
353        let roots = discovery
354            .discover(&workspace_path)
355            .map_err(|e| BuilderError::Io(std::io::Error::other(e.to_string())))?;
356
357        info!("Discovered {} code root(s)", roots.len());
358
359        // Single root case: use existing behavior for backward compatibility
360        // This means the workspace itself is the single root (git repo or code dir)
361        if roots.len() == 1 && roots[0].relative_path == "." {
362            info!("Single root at workspace path, using standard build");
363            let graph = self.build_from_directory(&workspace_path)?;
364            return Ok((graph, roots));
365        }
366
367        // Multi-root case: create workspace container and merge roots
368        let workspace_name = workspace_path
369            .file_name()
370            .map(|n| n.to_string_lossy().to_string())
371            .unwrap_or_else(|| "workspace".to_string());
372
373        info!(
374            "Creating workspace '{}' with {} roots",
375            workspace_name,
376            roots.len()
377        );
378
379        // Use PetCodeGraph directly for efficient construction and traversal
380        let mut workspace_graph = PetCodeGraph::new();
381
382        // Create workspace node as the root
383        let workspace_node = Node::workspace(workspace_name.clone());
384        let workspace_id = workspace_node.id.clone();
385        workspace_graph.add_node(workspace_node);
386
387        // Build and merge each discovered root
388        for root in &roots {
389            info!("Processing root: {} ({:?})", root.name, root.root_type);
390
391            // Build graph for this root (returns PetCodeGraph)
392            let root_graph = match self.build_from_directory(&root.path) {
393                Ok(g) => g,
394                Err(e) => {
395                    warn!("Failed to build graph for {}: {}", root.name, e);
396                    continue;
397                }
398            };
399
400            // Find the repository/directory node (root of this sub-graph)
401            let root_node_id = self.find_root_node_id(&root_graph, root);
402
403            // Merge the root graph into the workspace graph
404            self.merge_root_graph(
405                &mut workspace_graph,
406                root_graph,
407                &workspace_id,
408                &root_node_id,
409            );
410
411            info!("Merged root '{}' into workspace graph", root.name);
412        }
413
414        info!(
415            "Workspace graph complete: {} nodes, {} edges across {} roots",
416            workspace_graph.node_count(),
417            workspace_graph.edge_count(),
418            roots.len()
419        );
420
421        Ok((workspace_graph, roots))
422    }
423
424    /// Find the root node ID in a built graph (repository or first container)
425    fn find_root_node_id(&self, graph: &PetCodeGraph, root: &DiscoveredRoot) -> String {
426        // Look for repository node first
427        graph
428            .iter_nodes()
429            .find(|n| n.is_repository())
430            .map(|n| n.id.clone())
431            .unwrap_or_else(|| root.name.clone())
432    }
433
434    /// Merge a root graph into the workspace graph
435    fn merge_root_graph(
436        &self,
437        workspace_graph: &mut PetCodeGraph,
438        root_graph: PetCodeGraph,
439        workspace_id: &str,
440        root_node_id: &str,
441    ) {
442        // Add all nodes from the root graph
443        for node in root_graph.iter_nodes() {
444            workspace_graph.add_node(node.clone());
445        }
446
447        // Add all edges from the root graph
448        for edge in root_graph.iter_edges() {
449            workspace_graph.add_edge_from_struct(&edge);
450        }
451
452        // Add CONTAINS edge from workspace to this root
453        workspace_graph.add_edge_from_struct(&Edge::contains(
454            workspace_id.to_string(),
455            root_node_id.to_string(),
456        ));
457    }
458
459    /// Collect all supported source files from a directory.
460    ///
461    /// Uses the `ignore` crate to respect:
462    /// - `.gitignore` files
463    /// - `.codeprysmignore` files (custom exclusions for CodePrysm indexing)
464    /// - Global gitignore patterns
465    fn collect_files(&self, directory: &Path) -> Result<Vec<PathBuf>, BuilderError> {
466        let mut files = Vec::new();
467        let glob_set = self.build_exclude_glob_set();
468
469        // Use ignore::WalkBuilder which respects .gitignore and custom ignore files
470        let walker = WalkBuilder::new(directory)
471            .follow_links(false)
472            .hidden(true) // Skip hidden files/directories
473            .git_ignore(true) // Respect .gitignore
474            .git_global(true) // Respect global gitignore
475            .git_exclude(true) // Respect .git/info/exclude
476            .add_custom_ignore_filename(".codeprysmignore") // Respect .codeprysmignore
477            .build();
478
479        for entry in walker {
480            let entry = match entry {
481                Ok(e) => e,
482                Err(e) => {
483                    debug!("Error walking directory: {}", e);
484                    continue;
485                }
486            };
487
488            // Skip directories - we only want files
489            let file_type = match entry.file_type() {
490                Some(ft) => ft,
491                None => continue,
492            };
493            if !file_type.is_file() {
494                continue;
495            }
496
497            let path = entry.path();
498
499            // Check if file is supported
500            if SupportedLanguage::from_path(path).is_none() {
501                continue;
502            }
503
504            // Check additional exclude patterns from config (beyond .gitignore/.codeprysmignore)
505            let rel_path = path
506                .strip_prefix(directory)
507                .unwrap_or(path)
508                .to_string_lossy();
509            if glob_set.is_match(rel_path.as_ref()) {
510                continue;
511            }
512
513            files.push(path.to_path_buf());
514        }
515
516        // Sort for deterministic ordering
517        files.sort();
518
519        Ok(files)
520    }
521
522    /// Build a glob set from exclude patterns.
523    fn build_exclude_glob_set(&self) -> globset::GlobSet {
524        let mut builder = globset::GlobSetBuilder::new();
525        for pattern in &self.config.exclude_patterns {
526            if let Ok(glob) = globset::Glob::new(pattern) {
527                builder.add(glob);
528            }
529        }
530        builder
531            .build()
532            .unwrap_or_else(|_| globset::GlobSet::empty())
533    }
534
535    /// Process a single file and add its entities to the graph.
536    #[allow(clippy::too_many_arguments)]
537    fn process_file(
538        &mut self,
539        file_path: &Path,
540        rel_path: &str,
541        repo_name: &str,
542        graph: &mut PetCodeGraph,
543        defines: &mut HashMap<String, String>,
544        references: &mut HashMap<String, Vec<ReferenceInfo>>,
545        skipped_data_nodes: &mut usize,
546        skipped_depth_nodes: &mut usize,
547    ) -> Result<(), BuilderError> {
548        // Detect language
549        let language = match SupportedLanguage::from_path(file_path) {
550            Some(lang) => lang,
551            None => return Ok(()), // Skip unsupported files
552        };
553
554        // Read file content
555        let source = std::fs::read_to_string(file_path)?;
556
557        // Compute file hash
558        let file_hash = compute_file_hash(file_path)?;
559
560        // Count lines
561        let line_count = source.lines().count();
562
563        // Add file container node
564        let file_node = Node::source_file(
565            rel_path.to_string(),
566            rel_path.to_string(),
567            file_hash,
568            line_count,
569        );
570        graph.add_node(file_node);
571
572        // Add CONTAINS edge from Repository to File (if we have a repo context)
573        if !repo_name.is_empty() {
574            graph
575                .add_edge_from_struct(&Edge::contains(repo_name.to_string(), rel_path.to_string()));
576        }
577
578        // Get or create tag extractor for this language
579        let mut extractor = match &self.queries_dir {
580            Some(dir) => TagExtractor::from_queries_dir(language, dir)?,
581            None => TagExtractor::from_embedded(language)?,
582        };
583        let metadata_extractor = MetadataExtractor::new(language);
584
585        // Extract tags
586        let tags = extractor.extract(&source)?;
587
588        // Separate definition and reference tags
589        // IMPORTANT: Only use `name.` prefixed tags (e.g., @name.definition.X) which capture
590        // the identifier. Tags without `name.` prefix (e.g., @definition.X) capture the whole
591        // node body and should be skipped for node creation.
592        let mut definition_tags: Vec<_> = tags
593            .iter()
594            .filter(|t| t.tag.starts_with("name.") && t.tag.contains(".definition."))
595            .collect();
596
597        let reference_tags: Vec<_> = tags
598            .iter()
599            .filter(|t| t.tag.starts_with("name.") && t.tag.contains(".reference."))
600            .collect();
601
602        // Sort definition tags by line for proper containment tracking
603        definition_tags.sort_by_key(|t| (t.start_line, t.end_line));
604
605        // Initialize containment context
606        let mut containment_ctx = ContainmentContext::new();
607
608        // Process definitions
609        for tag in &definition_tags {
610            // Parse tag type
611            let tag_string = normalize_tag_string(&tag.tag);
612            let tag_info = match parse_tag_string(&tag_string) {
613                Ok(info) => info,
614                Err(e) => {
615                    warn!(
616                        "Could not parse tag type '{}' in {}:{}: {}",
617                        tag.tag,
618                        rel_path,
619                        tag.line_number(),
620                        e
621                    );
622                    continue;
623                }
624            };
625
626            // Skip Data nodes if configured
627            if self.config.skip_data_nodes && tag_info.node_type == NodeType::Data {
628                *skipped_data_nodes += 1;
629                continue;
630            }
631
632            // Update containment context (use parent line range for proper nesting)
633            containment_ctx.update(tag.containment_start_line());
634
635            // Check max containment depth
636            if let Some(max_depth) = self.config.max_containment_depth {
637                let current_depth = containment_ctx.depth();
638                if current_depth >= max_depth {
639                    *skipped_depth_nodes += 1;
640                    continue;
641                }
642            }
643
644            // Get containment path - special handling for Rust impl methods
645            let (containment_path, parent_id) = if let Some(impl_type) = &tag.impl_target {
646                // For Rust methods inside impl blocks, use the impl type as parent
647                let impl_type_id = format!("{}:{}", rel_path, impl_type);
648                (vec![impl_type.as_str()], impl_type_id)
649            } else {
650                // Normal containment tracking
651                let path = containment_ctx.get_containment_path();
652                let parent = containment_ctx
653                    .get_current_parent_id()
654                    .map(String::from)
655                    .unwrap_or_else(|| rel_path.to_string());
656                (path, parent)
657            };
658
659            // Skip self-referential containment
660            if containment_path.last() == Some(&tag.name.as_str()) {
661                continue;
662            }
663
664            // Generate node ID
665            let node_id = generate_node_id(
666                rel_path,
667                &containment_path,
668                &tag.name,
669                Some(tag.line_number()),
670            );
671
672            // Add to definitions dictionary
673            defines.insert(tag.name.clone(), node_id.clone());
674
675            // Create node
676            let node = self.create_node_from_tag(
677                &node_id,
678                &tag.name,
679                &tag_info,
680                rel_path,
681                tag.line_number(),
682                tag.end_line_number(),
683                &metadata_extractor,
684            );
685
686            // Skip if node already exists
687            if graph.contains_node(&node_id) {
688                continue;
689            }
690
691            // Add node to graph
692            graph.add_node(node);
693
694            // Add CONTAINS edge from parent
695            graph.add_edge_from_struct(&Edge::contains(parent_id.clone(), node_id.clone()));
696
697            // Add DEFINES edge for Data nodes (if parent is not the file)
698            if tag_info.node_type == NodeType::Data && parent_id != rel_path {
699                graph.add_edge_from_struct(&Edge::defines(parent_id.clone(), node_id.clone()));
700            }
701
702            // Push containers onto containment stack (use parent line range for proper nesting)
703            let node_type_str = tag_info.node_type.as_str();
704            if node_type_str == "Container" || node_type_str == "Callable" {
705                containment_ctx.push_container(
706                    node_id,
707                    node_type_str.to_string(),
708                    tag.containment_start_line(),
709                    tag.containment_end_line(),
710                    tag.name.clone(),
711                );
712            }
713        }
714
715        // Process references
716        for tag in &reference_tags {
717            let tag_string = normalize_tag_string(&tag.tag);
718            let _tag_info = match parse_tag_string(&tag_string) {
719                Ok(info) => info,
720                Err(_) => continue,
721            };
722
723            // Find source entity context
724            // Rebuild containment by finding enclosing definitions
725            let source_id = self.find_enclosing_context(&definition_tags, tag.start_line, rel_path);
726
727            // Store reference
728            references
729                .entry(tag.name.clone())
730                .or_default()
731                .push(ReferenceInfo {
732                    source_id,
733                    line: tag.line_number(),
734                });
735        }
736
737        Ok(())
738    }
739
740    /// Create a Node from a parsed tag.
741    #[allow(clippy::too_many_arguments)]
742    fn create_node_from_tag(
743        &self,
744        node_id: &str,
745        name: &str,
746        tag_info: &TagParseResult,
747        file: &str,
748        line: usize,
749        end_line: usize,
750        metadata_extractor: &MetadataExtractor,
751    ) -> Node {
752        // Extract metadata from name (for visibility conventions)
753        let metadata = metadata_extractor.extract_from_name(name);
754
755        match tag_info.node_type {
756            NodeType::Container => {
757                let kind = match &tag_info.kind {
758                    Some(crate::graph::NodeKind::Container(k)) => *k,
759                    _ => ContainerKind::Type,
760                };
761                // File containers use source_file() constructor
762                if kind == ContainerKind::File {
763                    Node::source_file(
764                        node_id.to_string(),
765                        file.to_string(),
766                        String::new(),
767                        end_line,
768                    )
769                } else {
770                    Node::container(
771                        node_id.to_string(),
772                        name.to_string(),
773                        kind,
774                        tag_info.subtype.clone(),
775                        file.to_string(),
776                        line,
777                        end_line,
778                    )
779                    .with_metadata(metadata)
780                }
781            }
782            NodeType::Callable => {
783                let kind = match &tag_info.kind {
784                    Some(crate::graph::NodeKind::Callable(k)) => *k,
785                    _ => CallableKind::Function,
786                };
787                let node = Node::callable(
788                    node_id.to_string(),
789                    name.to_string(),
790                    kind,
791                    file.to_string(),
792                    line,
793                    end_line,
794                );
795
796                // Add scope to metadata if present
797                let mut meta = metadata;
798                if let Some(scope) = &tag_info.scope {
799                    meta.scope = Some(scope.clone());
800                }
801                node.with_metadata(meta)
802            }
803            NodeType::Data => {
804                let kind = match &tag_info.kind {
805                    Some(crate::graph::NodeKind::Data(k)) => *k,
806                    _ => DataKind::Value,
807                };
808                Node::data(
809                    node_id.to_string(),
810                    name.to_string(),
811                    kind,
812                    tag_info.subtype.clone(),
813                    file.to_string(),
814                    line,
815                    end_line,
816                )
817                .with_metadata(metadata)
818            }
819        }
820    }
821
822    /// Find the enclosing context for a reference at a given line.
823    fn find_enclosing_context(
824        &self,
825        definition_tags: &[&crate::parser::ExtractedTag],
826        line: usize,
827        file: &str,
828    ) -> String {
829        // Find the innermost enclosing definition (use containment lines for proper nesting)
830        let mut enclosing: Option<&crate::parser::ExtractedTag> = None;
831
832        for tag in definition_tags {
833            // Check if this definition contains the reference line
834            let tag_start = tag.containment_start_line();
835            let tag_end = tag.containment_end_line();
836            if tag_start <= line && tag_end >= line {
837                // Parse tag to check if it's a Container or Callable
838                let tag_string = normalize_tag_string(&tag.tag);
839                if let Ok(info) = parse_tag_string(&tag_string) {
840                    if info.node_type == NodeType::Container || info.node_type == NodeType::Callable
841                    {
842                        // Check if this is more specific (narrower) than current enclosing
843                        if let Some(current) = enclosing {
844                            let current_start = current.containment_start_line();
845                            let current_end = current.containment_end_line();
846                            if tag_start >= current_start && tag_end <= current_end {
847                                enclosing = Some(tag);
848                            }
849                        } else {
850                            enclosing = Some(tag);
851                        }
852                    }
853                }
854            }
855        }
856
857        // Build containment path for the enclosing entity
858        if let Some(enc) = enclosing {
859            let mut path = Vec::new();
860            let enc_start = enc.containment_start_line();
861            let enc_end = enc.containment_end_line();
862
863            // Find all parents of the enclosing definition
864            for tag in definition_tags {
865                let tag_start = tag.containment_start_line();
866                let tag_end = tag.containment_end_line();
867                if tag_start < enc_start && tag_end >= enc_end {
868                    let tag_string = normalize_tag_string(&tag.tag);
869                    if let Ok(info) = parse_tag_string(&tag_string) {
870                        if info.node_type == NodeType::Container
871                            || info.node_type == NodeType::Callable
872                        {
873                            path.push(tag.name.as_str());
874                        }
875                    }
876                }
877            }
878
879            path.push(enc.name.as_str());
880            generate_node_id(file, &path[..path.len() - 1], path.last().unwrap(), None)
881        } else {
882            // Reference is at file level
883            file.to_string()
884        }
885    }
886
887    /// Resolve references and create USES edges.
888    fn resolve_references(
889        &self,
890        graph: &mut PetCodeGraph,
891        defines: &HashMap<String, String>,
892        references: &HashMap<String, Vec<ReferenceInfo>>,
893    ) {
894        info!("Creating USES relationships...");
895        let mut uses_count = 0;
896        let mut forward_refs = 0;
897        let mut skipped_missing_source = 0;
898
899        for (name, refs) in references {
900            if let Some(target_id) = defines.get(name) {
901                // Target definition found - create USES edges
902                for ref_info in refs {
903                    // Only create edge if source and target are different
904                    if ref_info.source_id != *target_id {
905                        // Only create edge if source node exists in the graph
906                        if graph.contains_node(&ref_info.source_id) {
907                            graph.add_edge_from_struct(&Edge::uses(
908                                ref_info.source_id.clone(),
909                                target_id.clone(),
910                                Some(ref_info.line),
911                                Some(name.clone()),
912                            ));
913                            uses_count += 1;
914                        } else {
915                            // Source node doesn't exist (e.g., reference inside impl block
916                            // for a type not defined in this codebase)
917                            skipped_missing_source += 1;
918                            debug!(
919                                "Skipped USES edge: source '{}' not found (ref to '{}')",
920                                ref_info.source_id, name
921                            );
922                        }
923                    }
924                }
925            } else {
926                // Forward reference or external dependency
927                forward_refs += refs.len();
928                debug!(
929                    "Forward/external reference to '{}' ({} occurrences)",
930                    name,
931                    refs.len()
932                );
933            }
934        }
935
936        info!("Created {} USES relationships", uses_count);
937        if forward_refs > 0 {
938            info!("Skipped {} forward/external references", forward_refs);
939        }
940        if skipped_missing_source > 0 {
941            info!(
942                "Skipped {} references with missing source nodes",
943                skipped_missing_source
944            );
945        }
946    }
947
948    /// Parse a single file and return a graph with its entities.
949    ///
950    /// This method is useful for incremental updates where only specific files
951    /// need to be reparsed. Returns a graph containing:
952    /// - FILE node with hash
953    /// - All definition nodes (Container, Callable, Data)
954    /// - CONTAINS edges (parent → child)
955    /// - DEFINES edges (Container/Callable → Data)
956    ///
957    /// Note: USES edges are NOT included because they require cross-file
958    /// reference resolution. Call `resolve_references_for_file()` after
959    /// merging into the main graph if needed.
960    ///
961    /// # Arguments
962    ///
963    /// * `file_path` - Absolute path to the file
964    /// * `rel_path` - Relative path (used for node IDs)
965    ///
966    /// # Returns
967    ///
968    /// A `PetCodeGraph` containing entities from this file only.
969    pub fn parse_file(
970        &mut self,
971        file_path: &Path,
972        rel_path: &str,
973    ) -> Result<PetCodeGraph, BuilderError> {
974        let mut graph = PetCodeGraph::new();
975        let mut defines = HashMap::new();
976        let mut references = HashMap::new();
977        let mut skipped_data = 0;
978        let mut skipped_depth = 0;
979
980        self.process_file(
981            file_path,
982            rel_path,
983            "", // No repository context for single file parsing
984            &mut graph,
985            &mut defines,
986            &mut references,
987            &mut skipped_data,
988            &mut skipped_depth,
989        )?;
990
991        debug!(
992            "Parsed {}: {} nodes, {} edges",
993            rel_path,
994            graph.node_count(),
995            graph.edge_count()
996        );
997
998        Ok(graph)
999    }
1000}
1001
1002// ============================================================================
1003// Component Builder
1004// ============================================================================
1005
1006/// Information about a discovered component from a manifest file.
1007#[derive(Debug, Clone)]
1008pub struct DiscoveredComponent {
1009    /// Node ID for the component (e.g., "my-repo:packages/core")
1010    pub node_id: String,
1011    /// Component name from manifest (e.g., "@myorg/core")
1012    pub name: String,
1013    /// Path to the manifest file relative to repo root
1014    pub manifest_path: String,
1015    /// Directory containing the manifest (relative to repo root)
1016    pub directory: String,
1017    /// Parsed manifest info
1018    pub info: ManifestInfo,
1019}
1020
1021/// Builds Component nodes and DependsOn edges from manifest files.
1022///
1023/// The `ComponentBuilder` discovers manifest files in a repository,
1024/// parses them to extract component metadata, and creates the component
1025/// graph with proper containment and dependency edges.
1026///
1027/// ## Usage
1028///
1029/// ```ignore
1030/// use codeprysm_core::builder::{ComponentBuilder, BuilderConfig};
1031/// use codeprysm_core::graph::PetCodeGraph;
1032/// use std::path::Path;
1033///
1034/// let mut graph = PetCodeGraph::new();
1035/// let mut builder = ComponentBuilder::new()?;
1036///
1037/// let components = builder.discover_components(Path::new("my-repo"), &[])?;
1038/// builder.add_to_graph(&mut graph, "my-repo", &components)?;
1039/// ```
1040pub struct ComponentBuilder {
1041    /// Reusable manifest parser
1042    parser: ManifestParser,
1043    /// Index from manifest directory path to component node ID
1044    path_index: HashMap<PathBuf, String>,
1045}
1046
1047impl ComponentBuilder {
1048    /// Create a new component builder.
1049    pub fn new() -> Result<Self, BuilderError> {
1050        let parser = ManifestParser::new()
1051            .map_err(|e| BuilderError::Io(std::io::Error::other(e.to_string())))?;
1052
1053        Ok(Self {
1054            parser,
1055            path_index: HashMap::new(),
1056        })
1057    }
1058
1059    /// Discover all components (manifest files) in a directory.
1060    ///
1061    /// Walks the directory tree, finds manifest files, parses them,
1062    /// and returns information about each discovered component.
1063    ///
1064    /// # Arguments
1065    ///
1066    /// * `root` - Root directory to search
1067    /// * `exclude_patterns` - Glob patterns to exclude (e.g., "node_modules", "target")
1068    ///
1069    /// # Returns
1070    ///
1071    /// A list of discovered components with their manifest information.
1072    pub fn discover_components(
1073        &mut self,
1074        root: &Path,
1075        exclude_patterns: &[String],
1076    ) -> Result<Vec<DiscoveredComponent>, BuilderError> {
1077        let root = root.canonicalize().map_err(BuilderError::Io)?;
1078        let repo_name = get_repo_name(&root);
1079
1080        let mut components = Vec::new();
1081        let glob_set = build_exclude_glob_set(exclude_patterns);
1082
1083        info!("Discovering components in {}", root.display());
1084
1085        // Use ignore::WalkBuilder which respects .gitignore and .codeprysmignore
1086        let walker = WalkBuilder::new(&root)
1087            .follow_links(false)
1088            .hidden(true) // Skip hidden files/directories
1089            .git_ignore(true) // Respect .gitignore
1090            .git_global(true) // Respect global gitignore
1091            .git_exclude(true) // Respect .git/info/exclude
1092            .add_custom_ignore_filename(".codeprysmignore") // Respect .codeprysmignore
1093            .build();
1094
1095        for entry in walker {
1096            let entry = match entry {
1097                Ok(e) => e,
1098                Err(e) => {
1099                    debug!("Error walking directory: {}", e);
1100                    continue;
1101                }
1102            };
1103
1104            // Skip directories - we only want files
1105            let file_type = match entry.file_type() {
1106                Some(ft) => ft,
1107                None => continue,
1108            };
1109            if !file_type.is_file() {
1110                continue;
1111            }
1112
1113            let path = entry.path();
1114
1115            // Check if this is a manifest file
1116            if ManifestLanguage::from_path(path).is_none() {
1117                continue;
1118            }
1119
1120            // Check additional exclude patterns from config (beyond .gitignore/.codeprysmignore)
1121            let rel_path = path.strip_prefix(&root).unwrap_or(path);
1122            let rel_path_str = rel_path.to_string_lossy();
1123            if glob_set.is_match(rel_path_str.as_ref()) {
1124                debug!("Skipping excluded manifest: {}", rel_path_str);
1125                continue;
1126            }
1127
1128            // Parse the manifest
1129            match self.parse_manifest_file(path, &root, &repo_name) {
1130                Ok(Some(component)) => {
1131                    debug!(
1132                        "Discovered component: {} at {}",
1133                        component.name, component.manifest_path
1134                    );
1135                    components.push(component);
1136                }
1137                Ok(None) => {
1138                    // Manifest parsed but no component info extracted
1139                    debug!("No component info in {}", rel_path_str);
1140                }
1141                Err(e) => {
1142                    warn!("Failed to parse manifest {}: {}", rel_path_str, e);
1143                }
1144            }
1145        }
1146
1147        info!("Discovered {} components", components.len());
1148        Ok(components)
1149    }
1150
1151    /// Parse a manifest file and create a DiscoveredComponent.
1152    fn parse_manifest_file(
1153        &mut self,
1154        path: &Path,
1155        root: &Path,
1156        repo_name: &str,
1157    ) -> Result<Option<DiscoveredComponent>, BuilderError> {
1158        let content = std::fs::read_to_string(path)?;
1159        let rel_path = path
1160            .strip_prefix(root)
1161            .unwrap_or(path)
1162            .to_string_lossy()
1163            .to_string();
1164
1165        let info = self
1166            .parser
1167            .parse(path, &content)
1168            .map_err(|e| BuilderError::Io(std::io::Error::other(e.to_string())))?;
1169
1170        // Get the manifest directory (relative to root)
1171        let manifest_dir = path
1172            .parent()
1173            .and_then(|p| p.strip_prefix(root).ok())
1174            .map(|p| p.to_string_lossy().to_string())
1175            .unwrap_or_default();
1176
1177        // Determine component name
1178        let name = info.component_name.clone().unwrap_or_else(|| {
1179            // Infer name from directory
1180            if manifest_dir.is_empty() {
1181                repo_name.to_string()
1182            } else {
1183                manifest_dir
1184                    .rsplit('/')
1185                    .find(|s| !s.is_empty())
1186                    .unwrap_or(&manifest_dir)
1187                    .to_string()
1188            }
1189        });
1190
1191        // Skip empty manifests (no name, no workspace, no deps)
1192        if info.is_empty() && info.component_name.is_none() {
1193            return Ok(None);
1194        }
1195
1196        // Generate node ID: repo_name:relative_dir or repo_name for root
1197        let node_id = if manifest_dir.is_empty() {
1198            format!("component:{}", repo_name)
1199        } else {
1200            format!(
1201                "component:{}:{}",
1202                repo_name,
1203                manifest_dir.replace('\\', "/")
1204            )
1205        };
1206
1207        Ok(Some(DiscoveredComponent {
1208            node_id,
1209            name,
1210            manifest_path: rel_path,
1211            directory: manifest_dir,
1212            info,
1213        }))
1214    }
1215
1216    /// Add discovered components to a graph.
1217    ///
1218    /// Creates Component nodes with proper metadata and builds the
1219    /// containment hierarchy and dependency edges.
1220    ///
1221    /// # Arguments
1222    ///
1223    /// * `graph` - The graph to add components to
1224    /// * `repo_name` - Name of the repository (for parent containment)
1225    /// * `components` - List of discovered components
1226    ///
1227    /// # Returns
1228    ///
1229    /// The number of Component nodes added to the graph.
1230    pub fn add_to_graph(
1231        &mut self,
1232        graph: &mut PetCodeGraph,
1233        repo_name: &str,
1234        components: &[DiscoveredComponent],
1235    ) -> Result<usize, BuilderError> {
1236        // Build path index for dependency resolution
1237        self.build_path_index(components);
1238
1239        // First pass: create all Component nodes
1240        let mut added = 0;
1241        for component in components {
1242            self.add_component_node(graph, component);
1243            added += 1;
1244        }
1245
1246        // Second pass: create CONTAINS edges for hierarchy
1247        self.build_containment_hierarchy(graph, repo_name, components);
1248
1249        // Third pass: create DependsOn edges
1250        self.create_dependency_edges(graph, components);
1251
1252        info!(
1253            "Added {} components with {} dependency edges",
1254            added,
1255            graph.edges_by_type(EdgeType::DependsOn).count()
1256        );
1257
1258        Ok(added)
1259    }
1260
1261    /// Build the path index for dependency resolution.
1262    fn build_path_index(&mut self, components: &[DiscoveredComponent]) {
1263        self.path_index.clear();
1264
1265        for component in components {
1266            // Index by directory path
1267            let dir_path = PathBuf::from(&component.directory);
1268            self.path_index
1269                .insert(dir_path.clone(), component.node_id.clone());
1270
1271            // Also index by normalized path (forward slashes)
1272            let normalized = component.directory.replace('\\', "/");
1273            if normalized != component.directory {
1274                self.path_index
1275                    .insert(PathBuf::from(normalized), component.node_id.clone());
1276            }
1277        }
1278
1279        debug!("Built path index with {} entries", self.path_index.len());
1280    }
1281
1282    /// Add a single Component node to the graph.
1283    fn add_component_node(&self, graph: &mut PetCodeGraph, component: &DiscoveredComponent) {
1284        let metadata = NodeMetadata::default().with_component(
1285            Some(component.info.is_workspace_root),
1286            Some(component.info.is_publishable()),
1287            Some(component.manifest_path.clone()),
1288        );
1289
1290        let node = Node::component(
1291            component.node_id.clone(),
1292            component.name.clone(),
1293            component.manifest_path.clone(),
1294            metadata,
1295        );
1296
1297        graph.add_node(node);
1298    }
1299
1300    /// Build the containment hierarchy for components.
1301    ///
1302    /// - Repository CONTAINS top-level components
1303    /// - Workspace root CONTAINS its member components
1304    fn build_containment_hierarchy(
1305        &self,
1306        graph: &mut PetCodeGraph,
1307        repo_name: &str,
1308        components: &[DiscoveredComponent],
1309    ) {
1310        // Find workspace roots and their members
1311        let workspace_roots: Vec<_> = components
1312            .iter()
1313            .filter(|c| c.info.is_workspace_root)
1314            .collect();
1315
1316        for component in components {
1317            // Determine the parent for this component
1318            let parent_id = self.find_parent_component(component, &workspace_roots, repo_name);
1319
1320            // Only add CONTAINS edge if parent exists in graph
1321            if graph.contains_node(&parent_id) && graph.contains_node(&component.node_id) {
1322                graph.add_edge_from_struct(&Edge::contains(parent_id, component.node_id.clone()));
1323            }
1324        }
1325    }
1326
1327    /// Find the parent for a component.
1328    ///
1329    /// Returns the workspace root if this component is a member,
1330    /// otherwise returns the repository node.
1331    fn find_parent_component(
1332        &self,
1333        component: &DiscoveredComponent,
1334        workspace_roots: &[&DiscoveredComponent],
1335        repo_name: &str,
1336    ) -> String {
1337        // Check if this component is a workspace member
1338        for root in workspace_roots {
1339            // Skip if this IS the workspace root
1340            if root.node_id == component.node_id {
1341                continue;
1342            }
1343
1344            // Check if component directory matches any workspace member pattern
1345            for pattern in &root.info.workspace_members {
1346                if self.matches_workspace_pattern(&component.directory, pattern, &root.directory) {
1347                    return root.node_id.clone();
1348                }
1349            }
1350        }
1351
1352        // Default to repository as parent
1353        repo_name.to_string()
1354    }
1355
1356    /// Check if a component directory matches a workspace member pattern.
1357    fn matches_workspace_pattern(
1358        &self,
1359        component_dir: &str,
1360        pattern: &str,
1361        workspace_dir: &str,
1362    ) -> bool {
1363        // Normalize paths
1364        let component_dir = component_dir.replace('\\', "/");
1365        let pattern = pattern.replace('\\', "/");
1366        let workspace_dir = workspace_dir.replace('\\', "/");
1367
1368        // Calculate the full pattern path relative to repo root
1369        let full_pattern = if workspace_dir.is_empty() {
1370            pattern.clone()
1371        } else {
1372            format!("{}/{}", workspace_dir, pattern)
1373        };
1374
1375        // Handle glob patterns (e.g., "packages/*", "crates/*")
1376        if full_pattern.ends_with("/*") {
1377            let prefix = full_pattern.trim_end_matches("/*");
1378            component_dir.starts_with(prefix) && component_dir != prefix
1379        } else if full_pattern.contains('*') {
1380            // More complex glob - use simple prefix matching for now
1381            let prefix = full_pattern.split('*').next().unwrap_or("");
1382            !prefix.is_empty() && component_dir.starts_with(prefix)
1383        } else {
1384            // Exact match
1385            component_dir == full_pattern
1386        }
1387    }
1388
1389    /// Create DependsOn edges for local dependencies.
1390    fn create_dependency_edges(
1391        &self,
1392        graph: &mut PetCodeGraph,
1393        components: &[DiscoveredComponent],
1394    ) {
1395        for component in components {
1396            for dep in &component.info.local_dependencies {
1397                if let Some(target_id) = self.resolve_dependency(component, dep) {
1398                    // Only create edge if both nodes exist
1399                    if graph.contains_node(&component.node_id) && graph.contains_node(&target_id) {
1400                        let version_spec = self.format_version_spec(dep);
1401                        let edge = Edge::depends_on(
1402                            component.node_id.clone(),
1403                            target_id,
1404                            Some(dep.name.clone()),
1405                            version_spec,
1406                            Some(dep.is_dev),
1407                        );
1408                        graph.add_edge_from_struct(&edge);
1409                    }
1410                } else {
1411                    debug!(
1412                        "Could not resolve dependency '{}' from {} (path: {:?})",
1413                        dep.name, component.node_id, dep.path
1414                    );
1415                }
1416            }
1417        }
1418    }
1419
1420    /// Resolve a local dependency to a component node ID.
1421    fn resolve_dependency(
1422        &self,
1423        from: &DiscoveredComponent,
1424        dep: &LocalDependency,
1425    ) -> Option<String> {
1426        // First, try to resolve by path
1427        if let Some(ref dep_path) = dep.path {
1428            let mut resolved = self.resolve_dependency_path(&from.directory, dep_path);
1429
1430            // For .NET ProjectReference, strip the .csproj/.vbproj/.fsproj filename
1431            // to get the directory containing the manifest
1432            if dep.dep_type == DependencyType::ProjectReference {
1433                if let Some(parent) = resolved.parent() {
1434                    resolved = parent.to_path_buf();
1435                }
1436            }
1437
1438            if let Some(id) = self.path_index.get(&resolved) {
1439                return Some(id.clone());
1440            }
1441
1442            // Try the path as-is
1443            if let Some(id) = self.path_index.get(&PathBuf::from(dep_path)) {
1444                return Some(id.clone());
1445            }
1446        }
1447
1448        // For workspace dependencies, search by name
1449        if dep.dep_type == DependencyType::Workspace {
1450            // Search for a component with matching name
1451            // This is a simple linear search - could be optimized with a name index
1452            for (path, id) in &self.path_index {
1453                let dir_name = path
1454                    .file_name()
1455                    .map(|n| n.to_string_lossy().to_string())
1456                    .unwrap_or_default();
1457                if dir_name == dep.name || dep.name.ends_with(&format!("/{}", dir_name)) {
1458                    return Some(id.clone());
1459                }
1460            }
1461        }
1462
1463        None
1464    }
1465
1466    /// Resolve a dependency path relative to a component directory.
1467    fn resolve_dependency_path(&self, from_dir: &str, dep_path: &str) -> PathBuf {
1468        let from_dir = from_dir.replace('\\', "/");
1469        let dep_path = dep_path.replace('\\', "/");
1470
1471        // Handle absolute paths
1472        if let Some(stripped) = dep_path.strip_prefix('/') {
1473            return PathBuf::from(stripped);
1474        }
1475
1476        // Handle relative paths
1477        let from_parts: Vec<&str> = from_dir.split('/').filter(|s| !s.is_empty()).collect();
1478        let dep_parts: Vec<&str> = dep_path.split('/').filter(|s| !s.is_empty()).collect();
1479
1480        let mut result: Vec<&str> = from_parts.clone();
1481
1482        for part in dep_parts {
1483            match part {
1484                ".." => {
1485                    result.pop();
1486                }
1487                "." => {}
1488                _ => {
1489                    result.push(part);
1490                }
1491            }
1492        }
1493
1494        PathBuf::from(result.join("/"))
1495    }
1496
1497    /// Format a version spec string for a dependency.
1498    fn format_version_spec(&self, dep: &LocalDependency) -> Option<String> {
1499        match dep.dep_type {
1500            DependencyType::Path => dep.path.as_ref().map(|p| format!("path:{}", p)),
1501            DependencyType::Workspace => Some("workspace:*".to_string()),
1502            DependencyType::ProjectReference => dep.path.as_ref().map(|p| format!("project:{}", p)),
1503            DependencyType::Replace => dep.path.as_ref().map(|p| format!("replace:{}", p)),
1504            DependencyType::Subdirectory => dep.path.as_ref().map(|p| format!("subdir:{}", p)),
1505        }
1506    }
1507
1508    /// Get the path index (for testing or inspection).
1509    pub fn path_index(&self) -> &HashMap<PathBuf, String> {
1510        &self.path_index
1511    }
1512}
1513
1514/// Build a glob set from exclude patterns.
1515fn build_exclude_glob_set(patterns: &[String]) -> globset::GlobSet {
1516    let mut builder = globset::GlobSetBuilder::new();
1517    for pattern in patterns {
1518        if let Ok(glob) = globset::Glob::new(pattern) {
1519            builder.add(glob);
1520        }
1521    }
1522    // Add default excludes
1523    for pattern in &[
1524        "**/.git/**",
1525        "**/node_modules/**",
1526        "**/target/**",
1527        "**/__pycache__/**",
1528        "**/.venv/**",
1529        "**/venv/**",
1530        "**/.tox/**",
1531        "**/dist/**",
1532        "**/build/**",
1533    ] {
1534        if let Ok(glob) = globset::Glob::new(pattern) {
1535            builder.add(glob);
1536        }
1537    }
1538    builder
1539        .build()
1540        .unwrap_or_else(|_| globset::GlobSet::empty())
1541}
1542
1543// ============================================================================
1544// Helper Functions
1545// ============================================================================
1546
/// Bring a tag string into the form expected by the tag parser.
///
/// At most one leading `@` is removed, then at most one leading `name.`
/// (the tree-sitter capture name convention):
/// - `@definition.callable.function` -> `definition.callable.function`
/// - `name.definition.callable.function` -> `definition.callable.function`
///
/// Anything else is returned unchanged.
fn normalize_tag_string(tag: &str) -> String {
    let without_at = tag.strip_prefix('@').unwrap_or(tag);
    without_at
        .strip_prefix("name.")
        .unwrap_or(without_at)
        .to_string()
}
1563
/// Collect git metadata for a repository directory.
///
/// Returns `(remote_url, branch, commit_sha)`. Every element is optional:
/// all three are `None` when `repo_path` has no `.git` entry, and each one is
/// individually `None` when its `git` invocation cannot be run, exits
/// unsuccessfully, or prints nothing.
fn extract_git_metadata(repo_path: &Path) -> (Option<String>, Option<String>, Option<String>) {
    /// Run `git <args>` in `dir` and return its trimmed, non-empty stdout.
    fn git_stdout(dir: &Path, args: &[&str]) -> Option<String> {
        let output = std::process::Command::new("git")
            .args(args)
            .current_dir(dir)
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let text = String::from_utf8_lossy(&output.stdout).trim().to_string();
        if text.is_empty() {
            None
        } else {
            Some(text)
        }
    }

    // Without a `.git` entry no git command is attempted.
    if !repo_path.join(".git").exists() {
        return (None, None, None);
    }

    let remote = git_stdout(repo_path, &["remote", "get-url", "origin"]);
    let branch = git_stdout(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]);
    let commit = git_stdout(repo_path, &["rev-parse", "HEAD"]);

    (remote, branch, commit)
}
1605
/// Derive the repository name from the final component of `directory`.
///
/// Falls back to `"repository"` when the path has no final component
/// (for example `/` or a path ending in `..`).
fn get_repo_name(directory: &Path) -> String {
    match directory.file_name() {
        Some(name) => name.to_string_lossy().into_owned(),
        None => String::from("repository"),
    }
}
1613
1614// ============================================================================
1615// Tests
1616// ============================================================================
1617
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the `DiscoveredComponent` fixture for a package located at
    /// `packages/<name>` whose node ID is prefixed with `component:repo:`.
    fn package_component(name: &str, info: ManifestInfo) -> DiscoveredComponent {
        let directory = format!("packages/{name}");
        DiscoveredComponent {
            node_id: format!("component:repo:{directory}"),
            name: name.to_string(),
            manifest_path: format!("{directory}/package.json"),
            directory,
            info,
        }
    }

    #[test]
    fn test_normalize_tag_string() {
        // Each spelling below is expected to normalize to the same tag.
        let expected = "definition.callable.function";
        let spellings = [
            "@definition.callable.function",
            "definition.callable.function",
            "name.definition.callable.function",
        ];

        for raw in spellings {
            assert_eq!(normalize_tag_string(raw), expected, "input: {raw:?}");
        }
    }

    #[test]
    fn test_builder_config_default() {
        let defaults = BuilderConfig::default();

        // Nothing is skipped or capped by default...
        assert!(!defaults.skip_data_nodes);
        assert!(defaults.max_containment_depth.is_none());
        assert!(defaults.max_files.is_none());
        // ...but some exclude patterns are always present.
        assert!(!defaults.exclude_patterns.is_empty());
    }

    #[test]
    fn test_builder_new_missing_queries_dir() {
        let missing_dir = Path::new("/nonexistent/queries");
        let outcome = GraphBuilder::new(missing_dir);

        let reported_missing_dir = matches!(outcome, Err(BuilderError::QueryDirNotFound(_)));
        assert!(reported_missing_dir);
    }

    // ========================================================================
    // ComponentBuilder Tests
    // ========================================================================

    #[test]
    fn test_component_builder_new() {
        assert!(ComponentBuilder::new().is_ok());
    }

    #[test]
    fn test_resolve_dependency_path_relative() {
        let resolver = ComponentBuilder::new().unwrap();

        // (component directory, dependency path, expected resolved path)
        let cases = [
            // `..` steps out of the component directory.
            ("packages/core", "../utils", "packages/utils"),
            // `./` stays inside the component directory.
            ("packages/core", "./lib", "packages/core/lib"),
            // An empty component directory means the repository root.
            ("", "packages/shared", "packages/shared"),
            // Multiple `..` traversals.
            ("deep/nested/path", "../../sibling", "deep/sibling"),
        ];

        for (component_dir, dep_path, expected) in cases {
            let resolved = resolver.resolve_dependency_path(component_dir, dep_path);
            assert_eq!(
                resolved,
                PathBuf::from(expected),
                "component_dir={component_dir:?}, dep_path={dep_path:?}"
            );
        }
    }

    #[test]
    fn test_resolve_dependency_path_windows() {
        let resolver = ComponentBuilder::new().unwrap();

        // Backslash-separated inputs resolve to a forward-slash path.
        let resolved = resolver.resolve_dependency_path("packages\\core", "..\\utils");
        let expected = PathBuf::from("packages/utils");
        assert_eq!(resolved, expected);
    }

    #[test]
    fn test_matches_workspace_pattern() {
        let matcher = ComponentBuilder::new().unwrap();

        // (component path, member pattern, workspace directory, should match)
        let cases = [
            // Simple glob pattern: packages/*
            ("packages/core", "packages/*", "", true),
            ("packages/utils", "packages/*", "", true),
            // The glob's parent directory itself must not match.
            ("packages", "packages/*", "", false),
            ("other/core", "packages/*", "", false),
            // Pattern evaluated relative to a workspace directory.
            ("apps/web/core", "core", "apps/web", true),
            // Same glob shape with a different prefix: crates/*
            ("crates/codeprysm-core", "crates/*", "", true),
            ("crates/codeprysm-search", "crates/*", "", true),
        ];

        for (path, pattern, workspace_dir, should_match) in cases {
            assert_eq!(
                matcher.matches_workspace_pattern(path, pattern, workspace_dir),
                should_match,
                "path={path:?}, pattern={pattern:?}, workspace_dir={workspace_dir:?}"
            );
        }
    }

    #[test]
    fn test_format_version_spec() {
        let formatter = ComponentBuilder::new().unwrap();

        // Each dependency kind paired with the version spec it should produce.
        let cases = [
            (
                LocalDependency::with_path(
                    String::from("my-dep"),
                    String::from("../shared"),
                    DependencyType::Path,
                ),
                "path:../shared",
            ),
            (
                LocalDependency::new(String::from("my-dep"), DependencyType::Workspace),
                "workspace:*",
            ),
            (
                LocalDependency::with_path(
                    String::from("Shared"),
                    String::from("../Shared/Shared.csproj"),
                    DependencyType::ProjectReference,
                ),
                "project:../Shared/Shared.csproj",
            ),
        ];

        for (dependency, expected_spec) in cases {
            assert_eq!(
                formatter.format_version_spec(&dependency).as_deref(),
                Some(expected_spec)
            );
        }
    }

    #[test]
    fn test_build_exclude_glob_set() {
        // With no custom patterns, only the built-in excludes apply.
        let defaults_only = build_exclude_glob_set(&[]);
        let commonly_excluded = [
            "node_modules/foo",
            "target/debug",
            ".git/objects",
            "__pycache__/module",
        ];
        for path in commonly_excluded {
            assert!(defaults_only.is_match(path), "path: {path:?}");
        }

        // Caller-supplied patterns are honoured as well.
        let extra_patterns = [String::from("vendor/**")];
        let with_vendor = build_exclude_glob_set(&extra_patterns);
        assert!(with_vendor.is_match("vendor/github.com"));
    }

    #[test]
    fn test_discovered_component_creation() {
        let component = DiscoveredComponent {
            node_id: String::from("component:my-repo:packages/core"),
            name: String::from("my-package"),
            manifest_path: String::from("packages/core/package.json"),
            directory: String::from("packages/core"),
            info: ManifestInfo {
                component_name: Some(String::from("my-package")),
                version: Some(String::from("1.0.0")),
                is_workspace_root: false,
                workspace_members: Vec::new(),
                local_dependencies: Vec::new(),
                ecosystem: Some(String::from("npm")),
            },
        };

        assert_eq!(component.node_id, "component:my-repo:packages/core");
        assert_eq!(component.name, "my-package");
        assert!(!component.info.is_workspace_root);
        assert!(component.info.is_publishable());
    }

    #[test]
    fn test_add_component_node() {
        let component_builder = ComponentBuilder::new().unwrap();
        let mut graph = PetCodeGraph::new();

        // A workspace-root component located at the repository root.
        let root_component = DiscoveredComponent {
            node_id: String::from("component:test-repo"),
            name: String::from("test-component"),
            manifest_path: String::from("Cargo.toml"),
            directory: String::new(),
            info: ManifestInfo {
                component_name: Some(String::from("test-component")),
                version: Some(String::from("0.1.0")),
                is_workspace_root: true,
                workspace_members: vec![String::from("packages/*")],
                local_dependencies: Vec::new(),
                ecosystem: Some(String::from("cargo")),
            },
        };

        component_builder.add_component_node(&mut graph, &root_component);

        // The node must exist under the component's ID.
        let node_id = "component:test-repo";
        assert!(graph.contains_node(node_id));

        let node = graph.get_node(node_id).unwrap();
        assert_eq!(node.name, "test-component");
        assert_eq!(node.node_type, NodeType::Container);
        assert_eq!(node.kind.as_deref(), Some("component"));

        // Manifest-derived metadata is copied onto the node.
        let metadata = &node.metadata;
        assert_eq!(metadata.is_workspace_root, Some(true));
        assert_eq!(metadata.is_publishable, Some(true));
        assert_eq!(metadata.manifest_path.as_deref(), Some("Cargo.toml"));
    }

    #[test]
    fn test_build_path_index() {
        let mut builder = ComponentBuilder::new().unwrap();

        let components = vec![
            package_component("core", ManifestInfo::new()),
            package_component("utils", ManifestInfo::new()),
        ];

        builder.build_path_index(&components);

        // One entry per component, keyed by the component directory.
        let index = builder.path_index();
        assert_eq!(index.len(), 2);

        for directory in ["packages/core", "packages/utils"] {
            let expected_id = format!("component:repo:{directory}");
            assert_eq!(index.get(&PathBuf::from(directory)), Some(&expected_id));
        }
    }

    #[test]
    fn test_resolve_dependency() {
        let mut builder = ComponentBuilder::new().unwrap();

        let components = vec![
            package_component("core", ManifestInfo::new()),
            package_component("utils", ManifestInfo::new()),
        ];

        builder.build_path_index(&components);

        // Path-based resolution: packages/core depends on ../utils.
        let dependent = &components[0];
        let utils_dep = LocalDependency::with_path(
            String::from("utils"),
            String::from("../utils"),
            DependencyType::Path,
        );

        let resolved = builder.resolve_dependency(dependent, &utils_dep);
        assert_eq!(resolved.as_deref(), Some("component:repo:packages/utils"));
    }

    #[test]
    fn test_dependency_edges_created() {
        let mut builder = ComponentBuilder::new().unwrap();
        let mut graph = PetCodeGraph::new();

        // `core` declares a path dependency on its sibling `utils`.
        let mut core_info = ManifestInfo {
            component_name: Some(String::from("core")),
            ..ManifestInfo::new()
        };
        core_info.local_dependencies.push(LocalDependency::with_path(
            String::from("utils"),
            String::from("../utils"),
            DependencyType::Path,
        ));

        let utils_info = ManifestInfo {
            component_name: Some(String::from("utils")),
            ..ManifestInfo::new()
        };

        let components = vec![
            package_component("core", core_info),
            package_component("utils", utils_info),
        ];

        builder
            .add_to_graph(&mut graph, "repo", &components)
            .unwrap();

        // Exactly one DependsOn edge is expected: core -> utils.
        let depends_on: Vec<_> = graph.edges_by_type(EdgeType::DependsOn).collect();
        assert_eq!(depends_on.len(), 1);

        let (source, target, data) = &depends_on[0];
        assert_eq!(source.id, "component:repo:packages/core");
        assert_eq!(target.id, "component:repo:packages/utils");
        assert_eq!(data.ident.as_deref(), Some("utils"));
        assert_eq!(data.version_spec.as_deref(), Some("path:../utils"));
    }
}