Skip to main content

codemem_engine/index/scip/
graph_builder.rs

1//! SCIP graph builder: create nodes + edges from parsed SCIP data.
2//!
3//! Takes the intermediate structs from the reader and produces `GraphNode`s,
4//! `Edge`s, and `MemoryNode`s (for hover documentation).
5
6use std::collections::{HashMap, HashSet};
7
8use chrono::Utc;
9use codemem_core::{Edge, GraphNode, MemoryNode, MemoryType, NodeKind, RelationshipType};
10
11use codemem_core::ScipConfig;
12
13use super::{
14    is_import_ref, is_read_ref, is_write_ref, ScipDefinition, ScipReadResult, ROLE_IMPORT,
15    ROLE_READ_ACCESS, ROLE_WRITE_ACCESS,
16};
17
18/// Result of building graph structures from SCIP data.
///
/// Produced by `build_graph`. The two `*_created` counters describe what this
/// build added; they are not totals for the whole graph.
19#[derive(Debug, Clone, Default)]
20pub struct ScipBuildResult {
    /// All nodes emitted by the build: `sym:` definition nodes, synthetic
    /// containment parents, `pkg:` package nodes, and stub nodes created for
    /// edge endpoints that had no node.
21    pub nodes: Vec<GraphNode>,
    /// All edges emitted by the build, after de-duplication by edge ID and
    /// (when enabled) intra-class edge collapsing.
22    pub edges: Vec<Edge>,
    /// Hover-documentation memories, each paired with the ID of the node it documents.
23    pub memories: Vec<(MemoryNode, String)>, // (memory, related_node_id) for RELATES_TO edges
    /// Number of external nodes created: one per `pkg:` package node, plus one
    /// per stub node created for a missing `pkg:` or `ext:` edge endpoint.
24    pub ext_nodes_created: usize,
    /// Copy of the `covered_files` list from the SCIP read result.
25    pub files_covered: HashSet<String>,
    /// Number of entries pushed into `memories`.
26    pub doc_memories_created: usize,
27}
28
29/// Build graph nodes, edges, and doc memories from a parsed SCIP result.
30///
/// The build runs in stages:
///
/// 1. Definitions are filtered with `is_source_path` and `is_wildcard_module`.
/// 2. Phase 1 emits one `sym:` node per remaining definition, together with its
///    containment edges, an optional doc memory, and edges derived from its SCIP
///    relationships. `Field`, `Property`, `TypeParameter` and `EnumVariant`
///    definitions that have a parent in their containment chain are *not*
///    emitted as nodes; their leaf names are folded into the parent's payload
///    under `fields` / `type_params` / `variants`.
/// 3. Phase 2 emits one `pkg:` node per external package.
/// 4. Phase 3 emits one `Imports` / `Writes` / `Reads` / `Calls` edge per
///    reference, plus a `DependsOn` edge for non-import references whose target
///    is a class, trait, interface, type or enum.
/// 5. Edges are de-duplicated by ID, intra-class edges are optionally collapsed
///    into `intra_class_calls` metadata on the parent node, and stub nodes are
///    added for edge endpoints (`file:`, `pkg:`, `ext:`, `sym:`) that have no node.
///
/// `namespace` is copied onto every node and memory created here.
///
31/// Respects `config.max_references_per_symbol`, `config.create_external_nodes`,
32/// and `config.store_docs_as_memories` settings.
/// It also reads `config.fan_out_limits`, `config.hierarchical_containment`
/// and `config.collapse_intra_class_edges`.
33pub fn build_graph(
34    scip: &ScipReadResult,
35    namespace: Option<&str>,
36    config: &ScipConfig,
37) -> ScipBuildResult {
    // A single timestamp is shared by every edge and memory created in this call.
38    let now = Utc::now();
39    let ns = namespace.map(|s| s.to_string());
40
41    let mut nodes = Vec::new();
42    let mut edges = Vec::new();
43    let mut memories: Vec<(MemoryNode, String)> = Vec::new();
44    let mut ext_nodes_created = 0;
45    let mut doc_memories_created = 0;
46
47    // Filter out definitions from files outside the project root. SCIP indexers
48    // may include build cache, vendored deps, or virtualenv paths that ast-grep
49    // never walks. A source file path must be relative and stay within the project.
50    // Also skip wildcard ambient module declarations (e.g., `declare module '*.css'`)
51    // which act as catch-all type stubs — every matching import resolves to them,
52    // creating thousands of useless edges with massive fan-in.
53    let source_defs: Vec<&ScipDefinition> = scip
54        .definitions
55        .iter()
56        .filter(|d| is_source_path(&d.file_path) && !is_wildcard_module(&d.qualified_name))
57        .collect();
58
59    // Build a set of defined symbol strings -> qualified names for edge resolution.
60    let mut symbol_to_qname: HashMap<&str, &str> = HashMap::new();
61    for def in &source_defs {
62        symbol_to_qname.insert(&def.scip_symbol, &def.qualified_name);
63    }
64
65    // Phase 1: Create sym: nodes from definitions.
66    // Track created node IDs to avoid duplicates for synthetic parents.
67    let mut created_node_ids: HashSet<String> = HashSet::new();
68    let mut created_edge_ids: HashSet<String> = HashSet::new();
69    // Tier 3 folding: map folded symbol qname → parent node ID for edge redirection.
70    let mut folded_to_parent: HashMap<String, String> = HashMap::new();
71    // Collect folded children to batch-add to parent payloads after all nodes are created.
72    // Key: parent qname, Value: vec of (child label, tier3 category like "fields"/"type_params")
73    let mut folded_children: HashMap<String, Vec<(String, &'static str)>> = HashMap::new();
74
75    // Pre-compute containment chains for all definitions (needed for tier 3 parent lookup).
    // `def_chains[i]` belongs to `source_defs[i]`; the two vectors are indexed in lockstep.
76    let def_chains: Vec<Vec<(String, NodeKind)>> = source_defs
77        .iter()
78        .map(|d| extract_containment_chain(&d.scip_symbol))
79        .collect();
80
81    for (def_idx, def) in source_defs.iter().enumerate() {
        // Test definitions are emitted with kind `Test` regardless of their SCIP kind.
82        let kind = if def.is_test {
83            NodeKind::Test
84        } else {
85            def.kind
86        };
87
88        // Node tiering: Tier 3 kinds get folded into parent metadata.
89        let tier3_category = match kind {
90            NodeKind::Field | NodeKind::Property => Some("fields"),
91            NodeKind::TypeParameter => Some("type_params"),
92            NodeKind::EnumVariant => Some("variants"),
93            _ => None,
94        };
95
96        if let Some(category) = tier3_category {
97            // Find parent from containment chain.
            // A chain shorter than 2 has no parent segment; such a definition falls
            // through and gets a regular node below.
98            let chain = &def_chains[def_idx];
99            if chain.len() >= 2 {
100                let parent_qname = &chain[chain.len() - 2].0;
101                let leaf_name = def
102                    .qualified_name
103                    .rsplit([':', '.'])
104                    .next()
105                    .unwrap_or(&def.qualified_name);
106                folded_children
107                    .entry(parent_qname.clone())
108                    .or_default()
109                    .push((leaf_name.to_string(), category));
110                folded_to_parent.insert(def.qualified_name.clone(), format!("sym:{parent_qname}"));
111                // Also map the scip_symbol for reference resolution.
                // NOTE(review): the loop that fills `symbol_to_qname` above already inserted
                // this exact key/value for every source definition, so this looks redundant.
112                symbol_to_qname.insert(&def.scip_symbol, &def.qualified_name);
113                continue; // Don't create a node for this definition.
114            }
115        }
116
117        let node_id = format!("sym:{}", def.qualified_name);
118
119        let mut payload = HashMap::new();
120        payload.insert(
121            "scip_symbol".to_string(),
122            serde_json::Value::String(def.scip_symbol.clone()),
123        );
124        payload.insert("line_start".to_string(), serde_json::json!(def.line_start));
125        payload.insert("line_end".to_string(), serde_json::json!(def.line_end));
126        payload.insert(
127            "file_path".to_string(),
128            serde_json::Value::String(def.file_path.clone()),
129        );
130        if def.is_test {
131            payload.insert("is_test".to_string(), serde_json::json!(true));
132        }
133        if def.is_generated {
134            payload.insert("is_generated".to_string(), serde_json::json!(true));
135        }
136        // Store type signature from first documentation line if available.
137        if let Some(type_sig) = def.documentation.first() {
138            payload.insert(
139                "type_signature".to_string(),
140                serde_json::Value::String(type_sig.clone()),
141            );
142        }
143        payload.insert(
144            "source".to_string(),
145            serde_json::Value::String("scip".to_string()),
146        );
147
        // NOTE(review): the result of this insert is ignored. If an earlier child already
        // pushed a synthetic parent node with this ID (hierarchical containment branch
        // below), the push that follows adds a second node with the same ID — confirm
        // that downstream persistence tolerates duplicate node IDs.
148        created_node_ids.insert(node_id.clone());
149        nodes.push(GraphNode {
150            id: node_id.clone(),
151            kind,
152            label: def.qualified_name.clone(),
153            payload,
154            centrality: 0.0,
155            memory_id: None,
156            namespace: ns.clone(),
157        });
158
159        // Create containment edges: either hierarchical (nested chain) or flat (file→sym).
160        if config.hierarchical_containment {
161            let chain = &def_chains[def_idx];
162            let file_node_id = format!("file:{}", def.file_path);
163
164            if chain.len() <= 1 {
165                // No intermediate parents — just file→sym.
166                let edge_id = format!("contains:{file_node_id}->{node_id}");
167                if created_edge_ids.insert(edge_id.clone()) {
168                    edges.push(Edge {
169                        id: edge_id,
170                        src: file_node_id,
171                        dst: node_id.clone(),
172                        relationship: RelationshipType::Contains,
173                        weight: 0.1,
174                        properties: scip_edge_properties(),
175                        created_at: now,
176                        valid_from: Some(now),
177                        valid_to: None,
178                    });
179                }
180            } else {
181                // Build chain: file→top_parent→...→parent→leaf
182                for (i, (seg_qname, seg_kind)) in chain.iter().enumerate() {
183                    let seg_node_id = format!("sym:{seg_qname}");
184
185                    // Create synthetic intermediate node if needed (not the leaf itself).
186                    if seg_qname != &def.qualified_name
187                        && created_node_ids.insert(seg_node_id.clone())
188                    {
189                        let mut syn_payload = HashMap::new();
190                        syn_payload.insert(
191                            "source".to_string(),
192                            serde_json::Value::String("scip-synthetic".to_string()),
193                        );
194                        syn_payload.insert(
195                            "file_path".to_string(),
196                            serde_json::Value::String(def.file_path.clone()),
197                        );
198                        nodes.push(GraphNode {
199                            id: seg_node_id.clone(),
200                            kind: *seg_kind,
201                            label: seg_qname.clone(),
202                            payload: syn_payload,
203                            centrality: 0.0,
204                            memory_id: None,
205                            namespace: ns.clone(),
206                        });
207                    }
208
209                    // Create CONTAINS edge from parent to this segment.
                    // The first segment hangs off the file node; every later segment
                    // hangs off the segment before it.
210                    let parent_id = if i == 0 {
211                        file_node_id.clone()
212                    } else {
213                        format!("sym:{}", chain[i - 1].0)
214                    };
215
216                    let edge_id = format!("contains:{parent_id}->{seg_node_id}");
217                    if created_edge_ids.insert(edge_id.clone()) {
218                        edges.push(Edge {
219                            id: edge_id,
220                            src: parent_id,
221                            dst: seg_node_id,
222                            relationship: RelationshipType::Contains,
223                            weight: 0.1,
224                            properties: scip_edge_properties(),
225                            created_at: now,
226                            valid_from: Some(now),
227                            valid_to: None,
228                        });
229                    }
230                }
231            }
232        } else {
233            // Flat containment: file → symbol (original behavior).
            // No `created_edge_ids` check here; any duplicate IDs are removed by the
            // edge de-duplication pass after Phase 3.
234            let file_node_id = format!("file:{}", def.file_path);
235            edges.push(Edge {
236                id: format!("contains:{file_node_id}->{node_id}"),
237                src: file_node_id,
238                dst: node_id.clone(),
239                relationship: RelationshipType::Contains,
240                weight: 0.1,
241                properties: scip_edge_properties(),
242                created_at: now,
243                valid_from: Some(now),
244                valid_to: None,
245            });
246        }
247
248        // Create hover doc memories (if enabled in config).
249        if config.store_docs_as_memories && !def.documentation.is_empty() {
250            let doc_text = def.documentation.join("\n");
251            let mem_id = format!("scip-doc:{}", def.qualified_name);
252            let memory = MemoryNode {
253                id: mem_id,
254                content: doc_text,
255                memory_type: MemoryType::Context,
256                importance: 0.4,
257                confidence: 1.0,
258                access_count: 0,
259                content_hash: String::new(), // Will be computed by engine on persist.
260                tags: vec!["scip-doc".to_string(), "auto-generated".to_string()],
261                metadata: HashMap::new(),
262                namespace: ns.clone(),
263                session_id: None,
264                repo: None,
265                git_ref: None,
266                expires_at: None,
267                created_at: now,
268                updated_at: now,
269                last_accessed_at: now,
270            };
271            memories.push((memory, node_id.clone()));
272            doc_memories_created += 1;
273        }
274
275        // Create edges from SCIP relationships.
        // One relationship can set several flags, so it may yield more than one edge.
276        for rel in &def.relationships {
277            if rel.target_symbol.is_empty() {
278                continue;
279            }
280            // Resolve target to qualified name if it's a known symbol.
281            let target_node_id =
282                if let Some(qname) = symbol_to_qname.get(rel.target_symbol.as_str()) {
283                    format!("sym:{qname}")
284                } else {
285                    // Target might be external — try to parse as external node ID.
286                    match parse_external_node_id(&rel.target_symbol) {
287                        Some(ext_id) => ext_id,
288                        None => continue,
289                    }
290                };
291
292            if rel.is_implementation {
293                edges.push(Edge {
294                    id: format!("implements:{node_id}->{target_node_id}"),
295                    src: node_id.clone(),
296                    dst: target_node_id.clone(),
297                    relationship: RelationshipType::Implements,
298                    weight: 0.8,
299                    properties: scip_edge_properties(),
300                    created_at: now,
301                    valid_from: Some(now),
302                    valid_to: None,
303                });
304                // If the source is a method, also create OVERRIDES edge.
                // This checks `def.kind`, not the `Test`-overridden `kind`, so a test
                // definition whose SCIP kind is `Method` also gets the edge.
305                if def.kind == NodeKind::Method {
306                    edges.push(Edge {
307                        id: format!("overrides:{node_id}->{target_node_id}"),
308                        src: node_id.clone(),
309                        dst: target_node_id.clone(),
310                        relationship: RelationshipType::Overrides,
311                        weight: 0.8,
312                        properties: scip_edge_properties(),
313                        created_at: now,
314                        valid_from: Some(now),
315                        valid_to: None,
316                    });
317                }
318            }
319            if rel.is_type_definition {
320                edges.push(Edge {
321                    id: format!("typedef:{node_id}->{target_node_id}"),
322                    src: node_id.clone(),
323                    dst: target_node_id.clone(),
324                    relationship: RelationshipType::TypeDefinition,
325                    weight: 0.6,
326                    properties: scip_edge_properties(),
327                    created_at: now,
328                    valid_from: Some(now),
329                    valid_to: None,
330                });
331            }
332            // `is_reference` on a relationship indicates a superclass/supertype
333            // reference (e.g., class Dog extends Animal — Dog's SymbolInformation
334            // has a relationship to Animal with is_reference=true). Map to Inherits.
335            if rel.is_reference && !rel.is_implementation {
336                edges.push(Edge {
337                    id: format!("inherits:{node_id}->{target_node_id}"),
338                    src: node_id.clone(),
339                    dst: target_node_id,
340                    relationship: RelationshipType::Inherits,
341                    weight: 0.8,
342                    properties: scip_edge_properties(),
343                    created_at: now,
344                    valid_from: Some(now),
345                    valid_to: None,
346                });
347            }
348        }
349    }
350
351    // Apply folded Tier 3 children to parent node payloads.
    // Parents are matched by node label, which holds the qualified name for both
    // definition nodes and synthetic parent nodes.
352    for node in &mut nodes {
353        let qname = node.label.as_str();
354        if let Some(children) = folded_children.get(qname) {
355            let mut fields = Vec::new();
356            let mut type_params = Vec::new();
357            let mut variants = Vec::new();
358            for (name, category) in children {
359                match *category {
360                    "fields" => fields.push(serde_json::Value::String(name.clone())),
361                    "type_params" => type_params.push(serde_json::Value::String(name.clone())),
362                    "variants" => variants.push(serde_json::Value::String(name.clone())),
363                    _ => {}
364                }
365            }
366            if !fields.is_empty() {
367                node.payload
368                    .insert("fields".to_string(), serde_json::Value::Array(fields));
369            }
370            if !type_params.is_empty() {
371                node.payload.insert(
372                    "type_params".to_string(),
373                    serde_json::Value::Array(type_params),
374                );
375            }
376            if !variants.is_empty() {
377                node.payload
378                    .insert("variants".to_string(), serde_json::Value::Array(variants));
379            }
380        }
381    }
382
383    // Phase 2: Create pkg: nodes from external symbols (if enabled in config).
384    // Instead of one node per external symbol (thousands), we aggregate to one node
385    // per external *package* — this gives the API surface graph ("which modules depend
386    // on which packages") without polluting the graph with individual library symbols.
387    if config.create_external_nodes {
388        let mut pkg_nodes_created: HashSet<String> = HashSet::new();
389        for ext in &scip.externals {
            // Externals without a package manager or package name cannot form a `pkg:` ID.
390            if ext.package_manager.is_empty() || ext.package_name.is_empty() {
391                continue;
392            }
393            let node_id = format!("pkg:{}:{}", ext.package_manager, ext.package_name);
394            if !pkg_nodes_created.insert(node_id.clone()) {
395                continue; // Already created this package node
396            }
397
398            let mut payload = HashMap::new();
399            payload.insert(
400                "package_manager".to_string(),
401                serde_json::Value::String(ext.package_manager.clone()),
402            );
403            payload.insert(
404                "package_name".to_string(),
405                serde_json::Value::String(ext.package_name.clone()),
406            );
407            payload.insert(
408                "package_version".to_string(),
409                serde_json::Value::String(ext.package_version.clone()),
410            );
411            payload.insert(
412                "source".to_string(),
413                serde_json::Value::String("scip".to_string()),
414            );
415
416            nodes.push(GraphNode {
417                id: node_id,
418                kind: NodeKind::External,
419                label: ext.package_name.clone(),
420                payload,
421                centrality: 0.0,
422                memory_id: None,
423                namespace: ns.clone(),
424            });
425            ext_nodes_created += 1;
426        }
427    } // end if create_external_nodes
428
429    // Phase 3: Create edges from references.
430    // Pre-index source definitions by file path for O(1) lookup in find_enclosing_def.
431    // Exclude Tier 3 folded definitions — they have no graph nodes and would only
432    // inflate the linear scan in find_enclosing_def_indexed.
433    let mut defs_by_file: HashMap<&str, Vec<&ScipDefinition>> = HashMap::new();
434    for def in &source_defs {
435        if folded_to_parent.contains_key(&def.qualified_name) {
436            continue;
437        }
438        defs_by_file
439            .entry(def.file_path.as_str())
440            .or_default()
441            .push(def);
442    }
443
444    // Filter references to source files only.
445    let source_refs: Vec<&super::ScipReference> = scip
446        .references
447        .iter()
448        .filter(|r| is_source_path(&r.file_path))
449        .collect();
450
451    // Count references per (symbol, file) to enforce per-kind fan-out limits.
452    // Intentionally per-file, not global: a utility function referenced 30 times in
453    // file A and 30 times in file B stays under the limit in each file independently.
454    // Global counting would require a second pass; per-file is cheaper and still
455    // prevents the worst offenders (e.g., `log()` called 200 times in one file).
456    let mut ref_counts: HashMap<(&str, &str), usize> = HashMap::new();
457    for r in &source_refs {
458        *ref_counts
459            .entry((&r.scip_symbol, &r.file_path))
460            .or_insert(0) += 1;
461    }
462
463    // Build scip_symbol → NodeKind map for per-kind fan-out limits.
    // This uses the raw `d.kind`, without the `Test` override applied to node kinds in Phase 1.
464    let symbol_to_kind: HashMap<&str, NodeKind> = source_defs
465        .iter()
466        .map(|d| (d.scip_symbol.as_str(), d.kind))
467        .collect();
468
469    for r in &source_refs {
470        // Skip high fan-out symbols using per-kind limits.
471        let count = ref_counts
472            .get(&(r.scip_symbol.as_str(), r.file_path.as_str()))
473            .copied()
474            .unwrap_or(0);
475        let target_kind = symbol_to_kind.get(r.scip_symbol.as_str()).copied();
476        let limit = match target_kind {
477            Some(NodeKind::Module) => config.fan_out_limits.module,
478            Some(NodeKind::Function) => config.fan_out_limits.function,
479            Some(NodeKind::Method) => config.fan_out_limits.method,
480            Some(NodeKind::Class | NodeKind::Trait | NodeKind::Interface) => {
481                config.fan_out_limits.class
482            }
483            _ => config.max_references_per_symbol,
484        };
        // `count` is the total for this (symbol, file) pair, so once it exceeds the
        // limit *every* reference of the pair is skipped — not just the excess ones.
485        if count > limit {
486            continue;
487        }
488
489        // Resolve the referenced symbol to a node ID.
490        let mut target_node_id = if let Some(qname) = symbol_to_qname.get(r.scip_symbol.as_str()) {
491            format!("sym:{qname}")
492        } else {
493            // Might reference an external symbol.
494            match parse_external_node_id(&r.scip_symbol) {
495                Some(ext_id) => ext_id,
496                None => continue,
497            }
498        };
499
500        // Redirect folded Tier 3 symbols to their parent node.
501        if let Some(qname) = symbol_to_qname.get(r.scip_symbol.as_str()) {
502            if let Some(parent_id) = folded_to_parent.get(*qname) {
503                target_node_id = parent_id.clone();
504            }
505        }
506
507        // Find the enclosing definition in the same file to use as source.
508        // If we can't find one, use the file node.
509        let mut source_node_id = find_enclosing_def_indexed(&defs_by_file, &r.file_path, r.line)
510            .map(|def| format!("sym:{}", def.qualified_name))
511            .unwrap_or_else(|| format!("file:{}", r.file_path));
512
513        // Redirect if the enclosing def was itself folded.
514        if let Some(parent_id) = source_node_id
515            .strip_prefix("sym:")
516            .and_then(|qn| folded_to_parent.get(qn))
517        {
518            source_node_id = parent_id.clone();
519        }
520
521        // Don't create self-edges.
522        if source_node_id == target_node_id {
523            continue;
524        }
525
526        // Pick the most specific role for each reference. Priority:
527        //   IMPORT > WRITE > READ > generic CALLS
528        // A reference can have multiple role flags (e.g., IMPORT + READ_ACCESS),
529        // but we emit one edge per reference to avoid double-counting in
530        // PageRank — the more specific role subsumes the less specific one.
531        //
532        // scip-go workaround: scip-go sets READ_ACCESS on ALL references
533        // without semantic differentiation. When ONLY READ_ACCESS is set
534        // (no IMPORT, WRITE), fall through to CALLS.
535        let semantic_mask = ROLE_IMPORT | ROLE_WRITE_ACCESS | ROLE_READ_ACCESS;
        // In Rust `&` binds tighter than `==`, so this is `(bitmask & mask) == READ`.
536        let is_scip_go_generic = r.role_bitmask & semantic_mask == ROLE_READ_ACCESS;
537
538        let (rel, weight) = if is_import_ref(r.role_bitmask) {
539            (RelationshipType::Imports, 0.5)
540        } else if is_write_ref(r.role_bitmask) {
541            (RelationshipType::Writes, 0.4)
542        } else if is_read_ref(r.role_bitmask) && !is_scip_go_generic {
543            (RelationshipType::Reads, 0.3)
544        } else {
545            (RelationshipType::Calls, 1.0)
546        };
547
        // The edge ID embeds file and line, so repeated references from the same source
        // to the same target on one line share an ID and are merged by the dedup pass.
548        let edge_prefix = rel.to_string().to_lowercase();
549        edges.push(Edge {
550            id: format!(
551                "{edge_prefix}:{source_node_id}->{target_node_id}:{}:{}",
552                r.file_path, r.line
553            ),
554            src: source_node_id.clone(),
555            dst: target_node_id.clone(),
556            relationship: rel,
557            weight,
558            properties: scip_edge_properties(),
559            created_at: now,
560            valid_from: Some(now),
561            valid_to: None,
562        });
563
564        // For non-import references to type-like symbols, also create a
565        // DependsOn edge. This captures "function X uses type Y" which is
566        // critical for blast-radius analysis. This is the ONE case where
567        // we emit two edges per reference — the structural relationship
568        // (Calls/Reads/Writes) AND the type dependency are distinct signals.
569        if !is_import_ref(r.role_bitmask) {
570            let is_type_target = matches!(
571                target_kind,
572                Some(
573                    NodeKind::Class
574                        | NodeKind::Trait
575                        | NodeKind::Interface
576                        | NodeKind::Type
577                        | NodeKind::Enum
578                )
579            );
580            if is_type_target {
581                edges.push(Edge {
582                    id: format!(
583                        "depends:{source_node_id}->{target_node_id}:{}:{}",
584                        r.file_path, r.line
585                    ),
586                    src: source_node_id,
587                    dst: target_node_id,
588                    relationship: RelationshipType::DependsOn,
589                    weight: 0.7,
590                    properties: scip_edge_properties(),
591                    created_at: now,
592                    valid_from: Some(now),
593                    valid_to: None,
594                });
595            }
596        }
597    }
598
599    // Deduplicate edges by ID (keep first occurrence).
600    let mut seen_edge_ids = HashSet::new();
601    edges.retain(|e| seen_edge_ids.insert(e.id.clone()));
602
603    // Collapse intra-class edges: methods of the same class calling each other
604    // are replaced by metadata on the parent class node.
    // This needs hierarchical containment, because parents are discovered from
    // `sym:` → `sym:` CONTAINS edges, which the flat mode never creates.
605    if config.collapse_intra_class_edges && config.hierarchical_containment {
606        // Build sym:child → sym:parent map from containment edges.
607        let mut child_to_parent: HashMap<&str, &str> = HashMap::new();
608        for edge in &edges {
609            if edge.relationship == RelationshipType::Contains
610                && edge.src.starts_with("sym:")
611                && edge.dst.starts_with("sym:")
612            {
613                child_to_parent.insert(&edge.dst, &edge.src);
614            }
615        }
616
617        // Build node ID → kind map so we only collapse within class-like parents.
618        let node_kind_map: HashMap<&str, NodeKind> =
619            nodes.iter().map(|n| (n.id.as_str(), n.kind)).collect();
620
621        // Find edges where src and dst share the same class/struct/trait parent.
        // Despite its name, `intra_class_counts` stores the (from, to) leaf-name pairs
        // per parent node ID rather than a count.
622        let mut intra_class_counts: HashMap<String, Vec<(String, String)>> = HashMap::new();
623        let mut intra_edge_ids: HashSet<String> = HashSet::new();
624        for edge in &edges {
625            if !matches!(
626                edge.relationship,
627                RelationshipType::Calls | RelationshipType::Reads | RelationshipType::Writes
628            ) {
629                continue;
630            }
631            let src_parent = child_to_parent.get(edge.src.as_str());
632            let dst_parent = child_to_parent.get(edge.dst.as_str());
633            if let (Some(sp), Some(dp)) = (src_parent, dst_parent) {
634                // Only collapse within class-like parents, not modules.
635                let parent_kind = node_kind_map.get(sp).copied();
636                let is_class_like = matches!(
637                    parent_kind,
638                    Some(NodeKind::Class | NodeKind::Trait | NodeKind::Interface | NodeKind::Enum)
639                );
640                if sp == dp && is_class_like {
641                    // Same parent — mark for collapsing.
642                    let src_leaf = edge.src.rsplit([':', '.']).next().unwrap_or(&edge.src);
643                    let dst_leaf = edge.dst.rsplit([':', '.']).next().unwrap_or(&edge.dst);
644                    intra_class_counts
645                        .entry(sp.to_string())
646                        .or_default()
647                        .push((src_leaf.to_string(), dst_leaf.to_string()));
648                    intra_edge_ids.insert(edge.id.clone());
649                }
650            }
651        }
652
653        // Remove intra-class edges and add metadata to parent nodes.
654        if !intra_edge_ids.is_empty() {
655            edges.retain(|e| !intra_edge_ids.contains(&e.id));
656            for node in &mut nodes {
657                if let Some(calls) = intra_class_counts.get(&node.id) {
658                    let call_entries: Vec<serde_json::Value> = calls
659                        .iter()
660                        .map(|(from, to)| serde_json::json!({"from": from, "to": to}))
661                        .collect();
662                    node.payload.insert(
663                        "intra_class_calls".to_string(),
664                        serde_json::Value::Array(call_entries),
665                    );
666                }
667            }
668        }
669    }
670
671    let files_covered: HashSet<String> = scip.covered_files.iter().cloned().collect();
672
673    // Ensure every node referenced by an edge exists. SCIP edges reference:
674    // - file: nodes (from CONTAINS edges) — may not exist if ast-grep didn't walk the file
675    // - ext: nodes (from reference/relationship edges) — may not be in scip.externals
676    // Without these, FK constraints cause entire edge batches to fail silently.
677    let existing_node_ids: HashSet<&str> = nodes.iter().map(|n| n.id.as_str()).collect();
678    let mut missing_ids: HashSet<String> = HashSet::new();
679    for edge in &edges {
680        if !existing_node_ids.contains(edge.src.as_str()) {
681            missing_ids.insert(edge.src.clone());
682        }
683        if !existing_node_ids.contains(edge.dst.as_str()) {
684            missing_ids.insert(edge.dst.clone());
685        }
686    }
    // `missing_ids` is a HashSet, so the order in which stub nodes are appended is unspecified.
687    for missing_id in &missing_ids {
688        let (kind, label) = if let Some(file_path) = missing_id.strip_prefix("file:") {
689            (NodeKind::File, file_path.to_string())
690        } else if let Some(pkg_rest) = missing_id.strip_prefix("pkg:") {
691            // pkg:{manager}:{name} — use package name as label
692            let label = pkg_rest.rsplit(':').next().unwrap_or(pkg_rest).to_string();
693            ext_nodes_created += 1;
694            (NodeKind::External, label)
695        } else if missing_id.starts_with("ext:") {
696            // Legacy ext: IDs from relationships — still create stub if needed
697            let label = missing_id
698                .rsplit(':')
699                .next()
700                .unwrap_or(missing_id)
701                .to_string();
702            ext_nodes_created += 1;
703            (NodeKind::External, label)
704        } else if let Some(qname) = missing_id.strip_prefix("sym:") {
705            // Interface methods, abstract methods, or symbols defined in external
706            // code that SCIP references but didn't emit a definition for.
            // The real kind is not known at this point; every `sym:` stub is created
            // as `Method`, labelled with the last `:`/`.`-separated segment of the name.
707            let label = qname.rsplit([':', '.']).next().unwrap_or(qname).to_string();
708            (NodeKind::Method, label)
709        } else {
710            continue; // Don't create stubs for unknown prefixes
711        };
712        let mut payload = HashMap::new();
713        payload.insert(
714            "source".to_string(),
715            serde_json::Value::String("scip".to_string()),
716        );
717        nodes.push(GraphNode {
718            id: missing_id.clone(),
719            kind,
720            label,
721            payload,
722            centrality: 0.0,
723            memory_id: None,
724            namespace: ns.clone(),
725        });
726    }
727
728    ScipBuildResult {
729        nodes,
730        edges,
731        memories,
732        ext_nodes_created,
733        files_covered,
734        doc_memories_created,
735    }
736}
737
738/// Find the innermost definition that encloses a given line in a file,
739/// using a pre-indexed HashMap for O(defs_in_file) instead of O(all_defs).
740fn find_enclosing_def_indexed<'a>(
741    defs_by_file: &HashMap<&str, Vec<&'a ScipDefinition>>,
742    file_path: &str,
743    line: u32,
744) -> Option<&'a ScipDefinition> {
745    defs_by_file
746        .get(file_path)?
747        .iter()
748        .filter(|d| d.line_start <= line && d.line_end >= line)
749        .min_by_key(|d| d.line_end - d.line_start)
750        .copied()
751}
752
/// Check if a file path looks like a project source file (relative, no escape).
///
/// Returns `false` for:
/// - absolute paths (leading `/`);
/// - paths whose `..` segments climb above the project root, wherever they
///   occur (`../x`, `./../x`, `src/../../x`);
/// - paths that pass through a common non-source directory (build caches,
///   vendor dirs, virtualenvs, `node_modules`).
///
/// Directory names are compared per path segment rather than as substrings, so
/// a directory such as `cloudvendor/` or a name like `..env` is not rejected
/// merely because it contains a rejected name. Only `/` is treated as a
/// separator.
fn is_source_path(path: &str) -> bool {
    // Directory names rejected at any depth.
    const REJECTED_DIRS: [&str; 7] = [
        "node_modules",
        ".venv",
        "site-packages",
        "__pycache__",
        ".gradle",
        ".m2",
        "vendor", // Go vendored deps
    ];
    // Rejected only when something precedes them in the path; a top-level
    // directory with this name is still accepted.
    const REJECTED_NESTED_DIRS: [&str; 1] = ["go-build"];

    // Must be relative.
    if path.starts_with('/') {
        return false;
    }

    // Number of real directories we are currently below the project root.
    let mut depth = 0usize;
    let mut segments = path.split('/').enumerate().peekable();
    while let Some((index, segment)) = segments.next() {
        match segment {
            // Empty segments (`a//b`, trailing `/`) and `.` do not move us.
            "" | "." => continue,
            ".." => {
                // Stepping above the root means the path escapes the project.
                let Some(parent_depth) = depth.checked_sub(1) else {
                    return false;
                };
                depth = parent_depth;
                continue;
            }
            _ => depth += 1,
        }

        // Only segments followed by a `/` are directories; the final segment
        // is the file name and is never matched against the reject lists.
        let is_dir = segments.peek().is_some();
        let rejected = REJECTED_DIRS.contains(&segment)
            || (index > 0 && REJECTED_NESTED_DIRS.contains(&segment));
        if is_dir && rejected {
            return false;
        }
    }

    true
}
775
776/// Try to parse a SCIP symbol string into a package-level external node ID.
777///
778/// Returns `pkg:{manager}:{name}` — collapsing all symbols from the same package
779/// into a single node. This gives the API surface graph without per-symbol noise.
780fn parse_external_node_id(scip_symbol: &str) -> Option<String> {
781    let parsed = scip::symbol::parse_symbol(scip_symbol).ok()?;
782    let package = parsed.package.as_ref()?;
783    if package.manager.is_empty() || package.name.is_empty() {
784        return None;
785    }
786    Some(format!("pkg:{}:{}", package.manager, package.name))
787}
788
789/// Extract the containment chain from a SCIP symbol's descriptor chain.
790///
791/// For a symbol like `rust-analyzer cargo foo 1.0 auth/middleware/validate_token().`,
792/// returns: `[("auth", Module), ("auth::middleware", Module), ("auth::middleware::validate_token", Function)]`.
793///
794/// The chain represents the hierarchical nesting: file→auth→auth::middleware→validate_token.
795fn extract_containment_chain(scip_symbol: &str) -> Vec<(String, NodeKind)> {
796    let parsed = match scip::symbol::parse_symbol(scip_symbol) {
797        Ok(p) => p,
798        Err(_) => return vec![],
799    };
800
801    // Detect separator from scheme
802    let scheme = &parsed.scheme;
803    let sep = if scheme == "rust-analyzer" || scheme == "lsif-clang" {
804        "::"
805    } else {
806        "."
807    };
808
809    let mut chain = Vec::new();
810    let mut cumulative_parts: Vec<&str> = Vec::new();
811    let leaf_kind = super::infer_kind_from_parsed(&parsed);
812
813    for desc in &parsed.descriptors {
814        if desc.name.is_empty() {
815            continue;
816        }
817        cumulative_parts.push(&desc.name);
818        let qname = cumulative_parts.join(sep);
819        // For intermediate segments, use descriptor suffix to determine kind.
820        let seg_kind = if cumulative_parts.len() < parsed.descriptors.len() {
821            use scip::types::descriptor::Suffix;
822            match desc.suffix.enum_value() {
823                Ok(Suffix::Package | Suffix::Namespace) => NodeKind::Module,
824                Ok(Suffix::Type) => NodeKind::Class,
825                Ok(Suffix::Method) => NodeKind::Method,
826                Ok(Suffix::Macro) => NodeKind::Macro,
827                _ => NodeKind::Module,
828            }
829        } else {
830            leaf_kind
831        };
832        chain.push((qname, seg_kind));
833    }
834
835    chain
836}
837
/// Report whether a qualified name belongs to a wildcard ambient module
/// declaration, such as TypeScript's `declare module '*.css'`.
///
/// Such declarations are type-system catch-alls: every matching import
/// resolves to them, which produces thousands of fan-in edges that carry no
/// semantic value.
fn is_wildcard_module(qualified_name: &str) -> bool {
    // The glob pattern is part of the module name in the TS declaration, so
    // SCIP qualified names for these modules contain an opening quote that is
    // immediately followed by `*`.
    const WILDCARD_MARKER: &str = "'*";

    qualified_name.contains(WILDCARD_MARKER)
}
848
849/// Standard edge properties for SCIP-derived edges (allocated once, cloned per edge).
850/// SCIP base confidence for multi-layer fusion.
851/// ast-grep = 0.10, SCIP = 0.15, LSP = 0.20 (per north-star).
852const SCIP_BASE_CONFIDENCE: f64 = 0.15;
853
854fn scip_edge_properties() -> HashMap<String, serde_json::Value> {
855    use std::sync::LazyLock;
856    static PROPS: LazyLock<HashMap<String, serde_json::Value>> = LazyLock::new(|| {
857        let mut props = HashMap::new();
858        props.insert(
859            "source".to_string(),
860            serde_json::Value::String("scip".to_string()),
861        );
862        props.insert(
863            "confidence".to_string(),
864            serde_json::json!(SCIP_BASE_CONFIDENCE),
865        );
866        props.insert("source_layers".to_string(), serde_json::json!(["scip"]));
867        props
868    });
869    PROPS.clone()
870}
871
872#[cfg(test)]
873#[path = "../tests/scip_graph_builder_tests.rs"]
874mod tests;