Skip to main content

codemem_engine/index/scip/
graph_builder.rs

//! SCIP graph builder: create nodes + edges from parsed SCIP data.
//!
//! Takes the intermediate structs from the reader and produces `GraphNode`s,
//! `Edge`s, and `MemoryNode`s (for hover documentation).
5
6use std::collections::{HashMap, HashSet};
7
8use chrono::Utc;
9use codemem_core::{Edge, GraphNode, MemoryNode, MemoryType, NodeKind, RelationshipType};
10
11use codemem_core::ScipConfig;
12
13use super::{
14    is_import_ref, is_read_ref, is_write_ref, ScipDefinition, ScipReadResult, ROLE_IMPORT,
15    ROLE_READ_ACCESS, ROLE_WRITE_ACCESS,
16};
17
18/// Result of building graph structures from SCIP data.
19#[derive(Debug, Clone, Default)]
20pub struct ScipBuildResult {
21    pub nodes: Vec<GraphNode>,
22    pub edges: Vec<Edge>,
23    pub memories: Vec<(MemoryNode, String)>, // (memory, related_node_id) for RELATES_TO edges
24    pub ext_nodes_created: usize,
25    pub files_covered: HashSet<String>,
26    pub doc_memories_created: usize,
27}
28
29/// Build graph nodes, edges, and doc memories from a parsed SCIP result.
30///
31/// Respects `config.max_references_per_symbol`, `config.create_external_nodes`,
32/// and `config.store_docs_as_memories` settings.
33pub fn build_graph(
34    scip: &ScipReadResult,
35    namespace: Option<&str>,
36    config: &ScipConfig,
37) -> ScipBuildResult {
38    let now = Utc::now();
39    let ns = namespace.map(|s| s.to_string());
40
41    let mut nodes = Vec::new();
42    let mut edges = Vec::new();
43    let mut memories: Vec<(MemoryNode, String)> = Vec::new();
44    let mut ext_nodes_created = 0;
45    let mut doc_memories_created = 0;
46
47    // Filter out definitions from files outside the project root. SCIP indexers
48    // may include build cache, vendored deps, or virtualenv paths that ast-grep
49    // never walks. A source file path must be relative and stay within the project.
50    // Also skip wildcard ambient module declarations (e.g., `declare module '*.css'`)
51    // which act as catch-all type stubs — every matching import resolves to them,
52    // creating thousands of useless edges with massive fan-in.
53    let source_defs: Vec<&ScipDefinition> = scip
54        .definitions
55        .iter()
56        .filter(|d| is_source_path(&d.file_path) && !is_wildcard_module(&d.qualified_name))
57        .collect();
58
59    // Build a set of defined symbol strings -> qualified names for edge resolution.
60    let mut symbol_to_qname: HashMap<&str, &str> = HashMap::new();
61    for def in &source_defs {
62        symbol_to_qname.insert(&def.scip_symbol, &def.qualified_name);
63    }
64
65    // Phase 1: Create sym: nodes from definitions.
66    // Track created node IDs to avoid duplicates for synthetic parents.
67    let mut created_node_ids: HashSet<String> = HashSet::new();
68    let mut created_edge_ids: HashSet<String> = HashSet::new();
69    // Tier 3 folding: map folded symbol qname → parent node ID for edge redirection.
70    let mut folded_to_parent: HashMap<String, String> = HashMap::new();
71    // Collect folded children to batch-add to parent payloads after all nodes are created.
72    // Key: parent qname, Value: vec of (child label, tier3 category like "fields"/"type_params")
73    let mut folded_children: HashMap<String, Vec<(String, &'static str)>> = HashMap::new();
74
75    // Pre-compute containment chains for all definitions (needed for tier 3 parent lookup).
76    let def_chains: Vec<Vec<(String, NodeKind)>> = source_defs
77        .iter()
78        .map(|d| extract_containment_chain(&d.scip_symbol))
79        .collect();
80
81    for (def_idx, def) in source_defs.iter().enumerate() {
82        let kind = if def.is_test {
83            NodeKind::Test
84        } else {
85            def.kind
86        };
87
88        // Node tiering: Tier 3 kinds get folded into parent metadata.
89        let tier3_category = match kind {
90            NodeKind::Field | NodeKind::Property => Some("fields"),
91            NodeKind::TypeParameter => Some("type_params"),
92            NodeKind::EnumVariant => Some("variants"),
93            _ => None,
94        };
95
96        if let Some(category) = tier3_category {
97            // Find parent from containment chain.
98            let chain = &def_chains[def_idx];
99            if chain.len() >= 2 {
100                let parent_qname = &chain[chain.len() - 2].0;
101                let leaf_name = def
102                    .qualified_name
103                    .rsplit([':', '.'])
104                    .next()
105                    .unwrap_or(&def.qualified_name);
106                folded_children
107                    .entry(parent_qname.clone())
108                    .or_default()
109                    .push((leaf_name.to_string(), category));
110                folded_to_parent.insert(def.qualified_name.clone(), format!("sym:{parent_qname}"));
111                // Also map the scip_symbol for reference resolution.
112                symbol_to_qname.insert(&def.scip_symbol, &def.qualified_name);
113                continue; // Don't create a node for this definition.
114            }
115        }
116
117        let node_id = format!("sym:{}", def.qualified_name);
118
119        let mut payload = HashMap::new();
120        payload.insert(
121            "scip_symbol".to_string(),
122            serde_json::Value::String(def.scip_symbol.clone()),
123        );
124        payload.insert("line_start".to_string(), serde_json::json!(def.line_start));
125        payload.insert("line_end".to_string(), serde_json::json!(def.line_end));
126        payload.insert(
127            "file_path".to_string(),
128            serde_json::Value::String(def.file_path.clone()),
129        );
130        if def.is_test {
131            payload.insert("is_test".to_string(), serde_json::json!(true));
132        }
133        if def.is_generated {
134            payload.insert("is_generated".to_string(), serde_json::json!(true));
135        }
136        // Store type signature from first documentation line if available.
137        if let Some(type_sig) = def.documentation.first() {
138            payload.insert(
139                "type_signature".to_string(),
140                serde_json::Value::String(type_sig.clone()),
141            );
142        }
143        payload.insert(
144            "source".to_string(),
145            serde_json::Value::String("scip".to_string()),
146        );
147
148        created_node_ids.insert(node_id.clone());
149        nodes.push(GraphNode {
150            id: node_id.clone(),
151            kind,
152            label: def.qualified_name.clone(),
153            payload,
154            centrality: 0.0,
155            memory_id: None,
156            namespace: ns.clone(),
157            valid_from: None,
158            valid_to: None,
159        });
160
161        // Create containment edges: either hierarchical (nested chain) or flat (file→sym).
162        if config.hierarchical_containment {
163            let chain = &def_chains[def_idx];
164            let file_node_id = format!("file:{}", def.file_path);
165
166            if chain.len() <= 1 {
167                // No intermediate parents — just file→sym.
168                let edge_id = format!("contains:{file_node_id}->{node_id}");
169                if created_edge_ids.insert(edge_id.clone()) {
170                    edges.push(Edge {
171                        id: edge_id,
172                        src: file_node_id,
173                        dst: node_id.clone(),
174                        relationship: RelationshipType::Contains,
175                        weight: 0.1,
176                        properties: scip_edge_properties(),
177                        created_at: now,
178                        valid_from: Some(now),
179                        valid_to: None,
180                    });
181                }
182            } else {
183                // Build chain: file→top_parent→...→parent→leaf
184                for (i, (seg_qname, seg_kind)) in chain.iter().enumerate() {
185                    let seg_node_id = format!("sym:{seg_qname}");
186
187                    // Create synthetic intermediate node if needed (not the leaf itself).
188                    if seg_qname != &def.qualified_name
189                        && created_node_ids.insert(seg_node_id.clone())
190                    {
191                        let mut syn_payload = HashMap::new();
192                        syn_payload.insert(
193                            "source".to_string(),
194                            serde_json::Value::String("scip-synthetic".to_string()),
195                        );
196                        syn_payload.insert(
197                            "file_path".to_string(),
198                            serde_json::Value::String(def.file_path.clone()),
199                        );
200                        nodes.push(GraphNode {
201                            id: seg_node_id.clone(),
202                            kind: *seg_kind,
203                            label: seg_qname.clone(),
204                            payload: syn_payload,
205                            centrality: 0.0,
206                            memory_id: None,
207                            namespace: ns.clone(),
208                            valid_from: None,
209                            valid_to: None,
210                        });
211                    }
212
213                    // Create CONTAINS edge from parent to this segment.
214                    let parent_id = if i == 0 {
215                        file_node_id.clone()
216                    } else {
217                        format!("sym:{}", chain[i - 1].0)
218                    };
219
220                    let edge_id = format!("contains:{parent_id}->{seg_node_id}");
221                    if created_edge_ids.insert(edge_id.clone()) {
222                        edges.push(Edge {
223                            id: edge_id,
224                            src: parent_id,
225                            dst: seg_node_id,
226                            relationship: RelationshipType::Contains,
227                            weight: 0.1,
228                            properties: scip_edge_properties(),
229                            created_at: now,
230                            valid_from: Some(now),
231                            valid_to: None,
232                        });
233                    }
234                }
235            }
236        } else {
237            // Flat containment: file → symbol (original behavior).
238            let file_node_id = format!("file:{}", def.file_path);
239            edges.push(Edge {
240                id: format!("contains:{file_node_id}->{node_id}"),
241                src: file_node_id,
242                dst: node_id.clone(),
243                relationship: RelationshipType::Contains,
244                weight: 0.1,
245                properties: scip_edge_properties(),
246                created_at: now,
247                valid_from: Some(now),
248                valid_to: None,
249            });
250        }
251
252        // Create hover doc memories (if enabled in config).
253        if config.store_docs_as_memories && !def.documentation.is_empty() {
254            let doc_text = def.documentation.join("\n");
255            let mem_id = format!("scip-doc:{}", def.qualified_name);
256            let memory = MemoryNode {
257                id: mem_id,
258                content: doc_text,
259                memory_type: MemoryType::Context,
260                importance: 0.4,
261                confidence: 1.0,
262                access_count: 0,
263                content_hash: String::new(), // Will be computed by engine on persist.
264                tags: vec!["scip-doc".to_string(), "auto-generated".to_string()],
265                metadata: HashMap::new(),
266                namespace: ns.clone(),
267                session_id: None,
268                repo: None,
269                git_ref: None,
270                expires_at: None,
271                created_at: now,
272                updated_at: now,
273                last_accessed_at: now,
274            };
275            memories.push((memory, node_id.clone()));
276            doc_memories_created += 1;
277        }
278
279        // Create edges from SCIP relationships.
280        for rel in &def.relationships {
281            if rel.target_symbol.is_empty() {
282                continue;
283            }
284            // Resolve target to qualified name if it's a known symbol.
285            let target_node_id =
286                if let Some(qname) = symbol_to_qname.get(rel.target_symbol.as_str()) {
287                    format!("sym:{qname}")
288                } else {
289                    // Target might be external — try to parse as external node ID.
290                    match parse_external_node_id(&rel.target_symbol) {
291                        Some(ext_id) => ext_id,
292                        None => continue,
293                    }
294                };
295
296            if rel.is_implementation {
297                edges.push(Edge {
298                    id: format!("implements:{node_id}->{target_node_id}"),
299                    src: node_id.clone(),
300                    dst: target_node_id.clone(),
301                    relationship: RelationshipType::Implements,
302                    weight: 0.8,
303                    properties: scip_edge_properties(),
304                    created_at: now,
305                    valid_from: Some(now),
306                    valid_to: None,
307                });
308                // If the source is a method, also create OVERRIDES edge.
309                if def.kind == NodeKind::Method {
310                    edges.push(Edge {
311                        id: format!("overrides:{node_id}->{target_node_id}"),
312                        src: node_id.clone(),
313                        dst: target_node_id.clone(),
314                        relationship: RelationshipType::Overrides,
315                        weight: 0.8,
316                        properties: scip_edge_properties(),
317                        created_at: now,
318                        valid_from: Some(now),
319                        valid_to: None,
320                    });
321                }
322            }
323            if rel.is_type_definition {
324                edges.push(Edge {
325                    id: format!("typedef:{node_id}->{target_node_id}"),
326                    src: node_id.clone(),
327                    dst: target_node_id.clone(),
328                    relationship: RelationshipType::TypeDefinition,
329                    weight: 0.6,
330                    properties: scip_edge_properties(),
331                    created_at: now,
332                    valid_from: Some(now),
333                    valid_to: None,
334                });
335            }
336            // `is_reference` on a relationship indicates a superclass/supertype
337            // reference (e.g., class Dog extends Animal — Dog's SymbolInformation
338            // has a relationship to Animal with is_reference=true). Map to Inherits.
339            if rel.is_reference && !rel.is_implementation {
340                edges.push(Edge {
341                    id: format!("inherits:{node_id}->{target_node_id}"),
342                    src: node_id.clone(),
343                    dst: target_node_id,
344                    relationship: RelationshipType::Inherits,
345                    weight: 0.8,
346                    properties: scip_edge_properties(),
347                    created_at: now,
348                    valid_from: Some(now),
349                    valid_to: None,
350                });
351            }
352        }
353    }
354
355    // Apply folded Tier 3 children to parent node payloads.
356    for node in &mut nodes {
357        let qname = node.label.as_str();
358        if let Some(children) = folded_children.get(qname) {
359            let mut fields = Vec::new();
360            let mut type_params = Vec::new();
361            let mut variants = Vec::new();
362            for (name, category) in children {
363                match *category {
364                    "fields" => fields.push(serde_json::Value::String(name.clone())),
365                    "type_params" => type_params.push(serde_json::Value::String(name.clone())),
366                    "variants" => variants.push(serde_json::Value::String(name.clone())),
367                    _ => {}
368                }
369            }
370            if !fields.is_empty() {
371                node.payload
372                    .insert("fields".to_string(), serde_json::Value::Array(fields));
373            }
374            if !type_params.is_empty() {
375                node.payload.insert(
376                    "type_params".to_string(),
377                    serde_json::Value::Array(type_params),
378                );
379            }
380            if !variants.is_empty() {
381                node.payload
382                    .insert("variants".to_string(), serde_json::Value::Array(variants));
383            }
384        }
385    }
386
387    // Phase 2: Create pkg: nodes from external symbols (if enabled in config).
388    // Instead of one node per external symbol (thousands), we aggregate to one node
389    // per external *package* — this gives the API surface graph ("which modules depend
390    // on which packages") without polluting the graph with individual library symbols.
391    if config.create_external_nodes {
392        let mut pkg_nodes_created: HashSet<String> = HashSet::new();
393        for ext in &scip.externals {
394            if ext.package_manager.is_empty() || ext.package_name.is_empty() {
395                continue;
396            }
397            let node_id = format!("pkg:{}:{}", ext.package_manager, ext.package_name);
398            if !pkg_nodes_created.insert(node_id.clone()) {
399                continue; // Already created this package node
400            }
401
402            let mut payload = HashMap::new();
403            payload.insert(
404                "package_manager".to_string(),
405                serde_json::Value::String(ext.package_manager.clone()),
406            );
407            payload.insert(
408                "package_name".to_string(),
409                serde_json::Value::String(ext.package_name.clone()),
410            );
411            payload.insert(
412                "package_version".to_string(),
413                serde_json::Value::String(ext.package_version.clone()),
414            );
415            payload.insert(
416                "source".to_string(),
417                serde_json::Value::String("scip".to_string()),
418            );
419
420            nodes.push(GraphNode {
421                id: node_id,
422                kind: NodeKind::External,
423                label: ext.package_name.clone(),
424                payload,
425                centrality: 0.0,
426                memory_id: None,
427                namespace: ns.clone(),
428                valid_from: None,
429                valid_to: None,
430            });
431            ext_nodes_created += 1;
432        }
433    } // end if create_external_nodes
434
435    // Phase 3: Create edges from references.
436    // Pre-index source definitions by file path for O(1) lookup in find_enclosing_def.
437    // Exclude Tier 3 folded definitions — they have no graph nodes and would only
438    // inflate the linear scan in find_enclosing_def_indexed.
439    let mut defs_by_file: HashMap<&str, Vec<&ScipDefinition>> = HashMap::new();
440    for def in &source_defs {
441        if folded_to_parent.contains_key(&def.qualified_name) {
442            continue;
443        }
444        defs_by_file
445            .entry(def.file_path.as_str())
446            .or_default()
447            .push(def);
448    }
449
450    // Filter references to source files only.
451    let source_refs: Vec<&super::ScipReference> = scip
452        .references
453        .iter()
454        .filter(|r| is_source_path(&r.file_path))
455        .collect();
456
457    // Count references per (symbol, file) to enforce per-kind fan-out limits.
458    // Intentionally per-file, not global: a utility function referenced 30 times in
459    // file A and 30 times in file B stays under the limit in each file independently.
460    // Global counting would require a second pass; per-file is cheaper and still
461    // prevents the worst offenders (e.g., `log()` called 200 times in one file).
462    let mut ref_counts: HashMap<(&str, &str), usize> = HashMap::new();
463    for r in &source_refs {
464        *ref_counts
465            .entry((&r.scip_symbol, &r.file_path))
466            .or_insert(0) += 1;
467    }
468
469    // Build scip_symbol → NodeKind map for per-kind fan-out limits.
470    let symbol_to_kind: HashMap<&str, NodeKind> = source_defs
471        .iter()
472        .map(|d| (d.scip_symbol.as_str(), d.kind))
473        .collect();
474
475    for r in &source_refs {
476        // Skip high fan-out symbols using per-kind limits.
477        let count = ref_counts
478            .get(&(r.scip_symbol.as_str(), r.file_path.as_str()))
479            .copied()
480            .unwrap_or(0);
481        let target_kind = symbol_to_kind.get(r.scip_symbol.as_str()).copied();
482        let limit = match target_kind {
483            Some(NodeKind::Module) => config.fan_out_limits.module,
484            Some(NodeKind::Function) => config.fan_out_limits.function,
485            Some(NodeKind::Method) => config.fan_out_limits.method,
486            Some(NodeKind::Class | NodeKind::Trait | NodeKind::Interface) => {
487                config.fan_out_limits.class
488            }
489            _ => config.max_references_per_symbol,
490        };
491        if count > limit {
492            continue;
493        }
494
495        // R5: Filter noise calls using the blocklist.
496        if crate::index::blocklist::is_blocked_call_scip(&r.scip_symbol) {
497            continue;
498        }
499
500        // Resolve the referenced symbol to a node ID.
501        let mut target_node_id = if let Some(qname) = symbol_to_qname.get(r.scip_symbol.as_str()) {
502            format!("sym:{qname}")
503        } else {
504            // Might reference an external symbol.
505            match parse_external_node_id(&r.scip_symbol) {
506                Some(ext_id) => ext_id,
507                None => continue,
508            }
509        };
510
511        // Redirect folded Tier 3 symbols to their parent node.
512        if let Some(qname) = symbol_to_qname.get(r.scip_symbol.as_str()) {
513            if let Some(parent_id) = folded_to_parent.get(*qname) {
514                target_node_id = parent_id.clone();
515            }
516        }
517
518        // Find the enclosing definition in the same file to use as source.
519        // If we can't find one, use the file node.
520        let mut source_node_id = find_enclosing_def_indexed(&defs_by_file, &r.file_path, r.line)
521            .map(|def| format!("sym:{}", def.qualified_name))
522            .unwrap_or_else(|| format!("file:{}", r.file_path));
523
524        // Redirect if the enclosing def was itself folded.
525        if let Some(parent_id) = source_node_id
526            .strip_prefix("sym:")
527            .and_then(|qn| folded_to_parent.get(qn))
528        {
529            source_node_id = parent_id.clone();
530        }
531
532        // Don't create self-edges.
533        if source_node_id == target_node_id {
534            continue;
535        }
536
537        // Pick the most specific role for each reference. Priority:
538        //   IMPORT > WRITE > READ > generic CALLS
539        // A reference can have multiple role flags (e.g., IMPORT + READ_ACCESS),
540        // but we emit one edge per reference to avoid double-counting in
541        // PageRank — the more specific role subsumes the less specific one.
542        //
543        // scip-go workaround: scip-go sets READ_ACCESS on ALL references
544        // without semantic differentiation. When ONLY READ_ACCESS is set
545        // (no IMPORT, WRITE), fall through to CALLS.
546        let semantic_mask = ROLE_IMPORT | ROLE_WRITE_ACCESS | ROLE_READ_ACCESS;
547        let is_scip_go_generic = r.role_bitmask & semantic_mask == ROLE_READ_ACCESS;
548
549        let (rel, weight) = if is_import_ref(r.role_bitmask) {
550            (RelationshipType::Imports, 0.5)
551        } else if is_write_ref(r.role_bitmask) {
552            (RelationshipType::Writes, 0.4)
553        } else if is_read_ref(r.role_bitmask) && !is_scip_go_generic {
554            (RelationshipType::Reads, 0.3)
555        } else {
556            (RelationshipType::Calls, 1.0)
557        };
558
559        let edge_prefix = rel.to_string().to_lowercase();
560        edges.push(Edge {
561            id: format!(
562                "{edge_prefix}:{source_node_id}->{target_node_id}:{}:{}",
563                r.file_path, r.line
564            ),
565            src: source_node_id.clone(),
566            dst: target_node_id.clone(),
567            relationship: rel,
568            weight,
569            properties: scip_edge_properties(),
570            created_at: now,
571            valid_from: Some(now),
572            valid_to: None,
573        });
574
575        // For non-import references to type-like symbols, also create a
576        // DependsOn edge. This captures "function X uses type Y" which is
577        // critical for blast-radius analysis. This is the ONE case where
578        // we emit two edges per reference — the structural relationship
579        // (Calls/Reads/Writes) AND the type dependency are distinct signals.
580        if !is_import_ref(r.role_bitmask) {
581            let is_type_target = matches!(
582                target_kind,
583                Some(
584                    NodeKind::Class
585                        | NodeKind::Trait
586                        | NodeKind::Interface
587                        | NodeKind::Type
588                        | NodeKind::Enum
589                )
590            );
591            if is_type_target {
592                edges.push(Edge {
593                    id: format!(
594                        "depends:{source_node_id}->{target_node_id}:{}:{}",
595                        r.file_path, r.line
596                    ),
597                    src: source_node_id,
598                    dst: target_node_id,
599                    relationship: RelationshipType::DependsOn,
600                    weight: 0.7,
601                    properties: scip_edge_properties(),
602                    created_at: now,
603                    valid_from: Some(now),
604                    valid_to: None,
605                });
606            }
607        }
608    }
609
610    // Deduplicate edges by ID (keep first occurrence).
611    let mut seen_edge_ids = HashSet::new();
612    edges.retain(|e| seen_edge_ids.insert(e.id.clone()));
613
614    // Collapse intra-class edges: methods of the same class calling each other
615    // are replaced by metadata on the parent class node.
616    if config.collapse_intra_class_edges && config.hierarchical_containment {
617        // Build sym:child → sym:parent map from containment edges.
618        let mut child_to_parent: HashMap<&str, &str> = HashMap::new();
619        for edge in &edges {
620            if edge.relationship == RelationshipType::Contains
621                && edge.src.starts_with("sym:")
622                && edge.dst.starts_with("sym:")
623            {
624                child_to_parent.insert(&edge.dst, &edge.src);
625            }
626        }
627
628        // Build node ID → kind map so we only collapse within class-like parents.
629        let node_kind_map: HashMap<&str, NodeKind> =
630            nodes.iter().map(|n| (n.id.as_str(), n.kind)).collect();
631
632        // Find edges where src and dst share the same class/struct/trait parent.
633        let mut intra_class_counts: HashMap<String, Vec<(String, String)>> = HashMap::new();
634        let mut intra_edge_ids: HashSet<String> = HashSet::new();
635        for edge in &edges {
636            if !matches!(
637                edge.relationship,
638                RelationshipType::Calls | RelationshipType::Reads | RelationshipType::Writes
639            ) {
640                continue;
641            }
642            let src_parent = child_to_parent.get(edge.src.as_str());
643            let dst_parent = child_to_parent.get(edge.dst.as_str());
644            if let (Some(sp), Some(dp)) = (src_parent, dst_parent) {
645                // Only collapse within class-like parents, not modules.
646                let parent_kind = node_kind_map.get(sp).copied();
647                let is_class_like = matches!(
648                    parent_kind,
649                    Some(NodeKind::Class | NodeKind::Trait | NodeKind::Interface | NodeKind::Enum)
650                );
651                if sp == dp && is_class_like {
652                    // Same parent — mark for collapsing.
653                    let src_leaf = edge.src.rsplit([':', '.']).next().unwrap_or(&edge.src);
654                    let dst_leaf = edge.dst.rsplit([':', '.']).next().unwrap_or(&edge.dst);
655                    intra_class_counts
656                        .entry(sp.to_string())
657                        .or_default()
658                        .push((src_leaf.to_string(), dst_leaf.to_string()));
659                    intra_edge_ids.insert(edge.id.clone());
660                }
661            }
662        }
663
664        // Remove intra-class edges and add metadata to parent nodes.
665        if !intra_edge_ids.is_empty() {
666            edges.retain(|e| !intra_edge_ids.contains(&e.id));
667            for node in &mut nodes {
668                if let Some(calls) = intra_class_counts.get(&node.id) {
669                    let call_entries: Vec<serde_json::Value> = calls
670                        .iter()
671                        .map(|(from, to)| serde_json::json!({"from": from, "to": to}))
672                        .collect();
673                    node.payload.insert(
674                        "intra_class_calls".to_string(),
675                        serde_json::Value::Array(call_entries),
676                    );
677                }
678            }
679        }
680    }
681
682    let files_covered: HashSet<String> = scip.covered_files.iter().cloned().collect();
683
684    // Ensure every node referenced by an edge exists. SCIP edges reference:
685    // - file: nodes (from CONTAINS edges) — may not exist if ast-grep didn't walk the file
686    // - ext: nodes (from reference/relationship edges) — may not be in scip.externals
687    // Without these, FK constraints cause entire edge batches to fail silently.
688    let existing_node_ids: HashSet<&str> = nodes.iter().map(|n| n.id.as_str()).collect();
689    let mut missing_ids: HashSet<String> = HashSet::new();
690    for edge in &edges {
691        if !existing_node_ids.contains(edge.src.as_str()) {
692            missing_ids.insert(edge.src.clone());
693        }
694        if !existing_node_ids.contains(edge.dst.as_str()) {
695            missing_ids.insert(edge.dst.clone());
696        }
697    }
698    for missing_id in &missing_ids {
699        let (kind, label) = if let Some(file_path) = missing_id.strip_prefix("file:") {
700            (NodeKind::File, file_path.to_string())
701        } else if let Some(pkg_rest) = missing_id.strip_prefix("pkg:") {
702            // pkg:{manager}:{name} — use package name as label
703            let label = pkg_rest.rsplit(':').next().unwrap_or(pkg_rest).to_string();
704            ext_nodes_created += 1;
705            (NodeKind::External, label)
706        } else if missing_id.starts_with("ext:") {
707            // Legacy ext: IDs from relationships — still create stub if needed
708            let label = missing_id
709                .rsplit(':')
710                .next()
711                .unwrap_or(missing_id)
712                .to_string();
713            ext_nodes_created += 1;
714            (NodeKind::External, label)
715        } else if let Some(qname) = missing_id.strip_prefix("sym:") {
716            // Interface methods, abstract methods, or symbols defined in external
717            // code that SCIP references but didn't emit a definition for.
718            let label = qname.rsplit([':', '.']).next().unwrap_or(qname).to_string();
719            (NodeKind::Method, label)
720        } else {
721            continue; // Don't create stubs for unknown prefixes
722        };
723        let mut payload = HashMap::new();
724        payload.insert(
725            "source".to_string(),
726            serde_json::Value::String("scip".to_string()),
727        );
728        nodes.push(GraphNode {
729            id: missing_id.clone(),
730            kind,
731            label,
732            payload,
733            centrality: 0.0,
734            memory_id: None,
735            namespace: ns.clone(),
736            valid_from: None,
737            valid_to: None,
738        });
739    }
740
741    ScipBuildResult {
742        nodes,
743        edges,
744        memories,
745        ext_nodes_created,
746        files_covered,
747        doc_memories_created,
748    }
749}
750
751/// Find the innermost definition that encloses a given line in a file,
752/// using a pre-indexed HashMap for O(defs_in_file) instead of O(all_defs).
753fn find_enclosing_def_indexed<'a>(
754    defs_by_file: &HashMap<&str, Vec<&'a ScipDefinition>>,
755    file_path: &str,
756    line: u32,
757) -> Option<&'a ScipDefinition> {
758    defs_by_file
759        .get(file_path)?
760        .iter()
761        .filter(|d| d.line_start <= line && d.line_end >= line)
762        .min_by_key(|d| d.line_end - d.line_start)
763        .copied()
764}
765
/// Check if a file path looks like a project source file (relative, no escape).
///
/// Returns `false` for:
/// - absolute paths (leading `/`);
/// - paths whose `..` segments climb above the project root at any point
///   (`../x`, but also `a/../../x`);
/// - paths containing a directory component that names a common non-source
///   tree (build caches, vendor dirs, virtualenvs, `node_modules`).
///
/// Directory names are compared per path component rather than as raw
/// substrings, so a real source directory such as `myvendor/` or
/// `old_node_modules/` is not mistaken for `vendor/` / `node_modules/`.
/// Only directory components are checked; a final segment (the file name)
/// that happens to be called `vendor` is accepted.
fn is_source_path(path: &str) -> bool {
    // Directory names that mark a non-source tree wherever they appear.
    const NON_SOURCE_DIRS: [&str; 7] = [
        "node_modules",
        ".venv",
        "site-packages",
        "__pycache__",
        ".gradle",
        ".m2",
        "vendor", // Go vendored deps
    ];
    // Rejected only when nested below another component: the historical
    // pattern was "/go-build/", which never matched a leading `go-build/`.
    const NESTED_ONLY_DIR: &str = "go-build";

    // Must be relative.
    if path.starts_with('/') {
        return false;
    }

    // Number of real components we are currently below the project root.
    let mut depth: usize = 0;
    let mut segments = path.split('/').enumerate().peekable();
    while let Some((index, segment)) = segments.next() {
        // Every segment followed by another one is a directory component.
        let is_dir = segments.peek().is_some();
        match segment {
            // Empty segments (`a//b`, trailing slash) and `.` do not move.
            "" | "." => {}
            // Stepping up from the root itself escapes the project.
            ".." => match depth.checked_sub(1) {
                Some(parent_depth) => depth = parent_depth,
                None => return false,
            },
            name => {
                let non_source =
                    NON_SOURCE_DIRS.contains(&name) || (index > 0 && name == NESTED_ONLY_DIR);
                if is_dir && non_source {
                    return false;
                }
                depth += 1;
            }
        }
    }
    true
}
788
789/// Try to parse a SCIP symbol string into a package-level external node ID.
790///
791/// Returns `pkg:{manager}:{name}` — collapsing all symbols from the same package
792/// into a single node. This gives the API surface graph without per-symbol noise.
793fn parse_external_node_id(scip_symbol: &str) -> Option<String> {
794    let parsed = scip::symbol::parse_symbol(scip_symbol).ok()?;
795    let package = parsed.package.as_ref()?;
796    if package.manager.is_empty() || package.name.is_empty() {
797        return None;
798    }
799    Some(format!("pkg:{}:{}", package.manager, package.name))
800}
801
802/// Extract the containment chain from a SCIP symbol's descriptor chain.
803///
804/// For a symbol like `rust-analyzer cargo foo 1.0 auth/middleware/validate_token().`,
805/// returns: `[("auth", Module), ("auth::middleware", Module), ("auth::middleware::validate_token", Function)]`.
806///
807/// The chain represents the hierarchical nesting: file→auth→auth::middleware→validate_token.
808fn extract_containment_chain(scip_symbol: &str) -> Vec<(String, NodeKind)> {
809    let parsed = match scip::symbol::parse_symbol(scip_symbol) {
810        Ok(p) => p,
811        Err(_) => return vec![],
812    };
813
814    // Detect separator from scheme
815    let scheme = &parsed.scheme;
816    let sep = if scheme == "rust-analyzer" || scheme == "lsif-clang" {
817        "::"
818    } else {
819        "."
820    };
821
822    let mut chain = Vec::new();
823    let mut cumulative_parts: Vec<&str> = Vec::new();
824    let leaf_kind = super::infer_kind_from_parsed(&parsed);
825
826    for desc in &parsed.descriptors {
827        if desc.name.is_empty() {
828            continue;
829        }
830        cumulative_parts.push(&desc.name);
831        let qname = cumulative_parts.join(sep);
832        // For intermediate segments, use descriptor suffix to determine kind.
833        let seg_kind = if cumulative_parts.len() < parsed.descriptors.len() {
834            use scip::types::descriptor::Suffix;
835            match desc.suffix.enum_value() {
836                Ok(Suffix::Package | Suffix::Namespace) => NodeKind::Module,
837                Ok(Suffix::Type) => NodeKind::Class,
838                Ok(Suffix::Method) => NodeKind::Method,
839                Ok(Suffix::Macro) => NodeKind::Macro,
840                _ => NodeKind::Module,
841            }
842        } else {
843            leaf_kind
844        };
845        chain.push((qname, seg_kind));
846    }
847
848    chain
849}
850
/// Report whether a qualified name belongs to a wildcard ambient module
/// declaration (e.g., TypeScript `declare module '*.css'`).
///
/// Such declarations are type-system catch-alls: every matching import
/// resolves to them, which produces thousands of fan-in edges that carry no
/// semantic value.
fn is_wildcard_module(qualified_name: &str) -> bool {
    // The glob pattern is part of the module name in the TS declaration, so
    // the qualified name contains an opening quote directly followed by `*`.
    // The marker is pure ASCII, so scanning bytes is equivalent to a
    // substring search on the UTF-8 text.
    const GLOB_MARKER: &[u8] = b"'*";
    qualified_name
        .as_bytes()
        .windows(GLOB_MARKER.len())
        .any(|pair| pair == GLOB_MARKER)
}
861
862/// Standard edge properties for SCIP-derived edges (allocated once, cloned per edge).
863/// SCIP base confidence for multi-layer fusion.
864/// ast-grep = 0.10, SCIP = 0.15, LSP = 0.20 (per north-star).
865const SCIP_BASE_CONFIDENCE: f64 = 0.15;
866
867fn scip_edge_properties() -> HashMap<String, serde_json::Value> {
868    use std::sync::LazyLock;
869    static PROPS: LazyLock<HashMap<String, serde_json::Value>> = LazyLock::new(|| {
870        let mut props = HashMap::new();
871        props.insert(
872            "source".to_string(),
873            serde_json::Value::String("scip".to_string()),
874        );
875        props.insert(
876            "confidence".to_string(),
877            serde_json::json!(SCIP_BASE_CONFIDENCE),
878        );
879        props.insert("source_layers".to_string(), serde_json::json!(["scip"]));
880        props
881    });
882    PROPS.clone()
883}
884
885#[cfg(test)]
886#[path = "../tests/scip_graph_builder_tests.rs"]
887mod tests;