Skip to main content

codemem_engine/index/scip/
graph_builder.rs

1//! SCIP graph builder: create nodes + edges from parsed SCIP data.
2//!
3//! Takes the intermediate structs from the reader and produces `GraphNode`s,
4//! `Edge`s, and `MemoryNode`s (for hover documentation).
5
6use std::collections::{HashMap, HashSet};
7
8use chrono::Utc;
9use codemem_core::{Edge, GraphNode, MemoryNode, MemoryType, NodeKind, RelationshipType};
10
11use codemem_core::ScipConfig;
12
13use super::{
14    is_import_ref, is_read_ref, is_write_ref, ScipDefinition, ScipReadResult, ROLE_IMPORT,
15    ROLE_READ_ACCESS, ROLE_WRITE_ACCESS,
16};
17
18/// Result of building graph structures from SCIP data.
  ///
  /// This is the return type of `build_graph`. Every field is public and the
  /// type derives `Default`, so `ScipBuildResult::default()` yields an empty
  /// result (empty collections, zero counters).
19#[derive(Debug, Clone, Default)]
20pub struct ScipBuildResult {
      /// Graph nodes produced by the build.
21    pub nodes: Vec<GraphNode>,
      /// Graph edges produced by the build.
22    pub edges: Vec<Edge>,
      /// Memories paired with the ID of the graph node each one relates to.
23    pub memories: Vec<(MemoryNode, String)>, // (memory, related_node_id) for RELATES_TO edges
      /// Number of external nodes created.
      ///
      /// NOTE(review): `build_graph` bumps a same-named counter once per unique
      /// `pkg:` package node and again for each stub it synthesizes for a
      /// missing `pkg:` ID — confirm that counter is what is stored here.
24    pub ext_nodes_created: usize,
      /// Set of file paths.
      ///
      /// NOTE(review): `build_graph` collects `scip.covered_files` into a
      /// same-named local; presumably that set is what is stored here — confirm.
25    pub files_covered: HashSet<String>,
      /// Number of documentation memories created.
      ///
      /// NOTE(review): `build_graph` increments a same-named counter each time
      /// it pushes onto its `memories` list, so this presumably equals
      /// `memories.len()` — confirm against the end of `build_graph`.
26    pub doc_memories_created: usize,
27}
28
29/// Build graph nodes, edges, and doc memories from a parsed SCIP result.
30///
31/// Respects `config.max_references_per_symbol`, `config.create_external_nodes`,
32/// and `config.store_docs_as_memories` settings.
33pub fn build_graph(
34    scip: &ScipReadResult,
35    namespace: Option<&str>,
36    config: &ScipConfig,
37) -> ScipBuildResult {
38    let now = Utc::now();
39    let ns = namespace.map(|s| s.to_string());
40
41    let mut nodes = Vec::new();
42    let mut edges = Vec::new();
43    let mut memories: Vec<(MemoryNode, String)> = Vec::new();
44    let mut ext_nodes_created = 0;
45    let mut doc_memories_created = 0;
46
47    // Filter out definitions from files outside the project root. SCIP indexers
48    // may include build cache, vendored deps, or virtualenv paths that ast-grep
49    // never walks. A source file path must be relative and stay within the project.
50    // Also skip wildcard ambient module declarations (e.g., `declare module '*.css'`)
51    // which act as catch-all type stubs — every matching import resolves to them,
52    // creating thousands of useless edges with massive fan-in.
53    // Stage 1: Filter by file path and wildcard module (cheap string checks).
54    let path_filtered: Vec<&ScipDefinition> = scip
55        .definitions
56        .iter()
57        .filter(|d| is_source_path(&d.file_path) && !is_wildcard_module(&d.qualified_name))
58        .collect();
59
60    // Stage 2: Parse SCIP symbols once, use for both noise filtering and containment chains.
61    // This avoids double-parsing (is_noise_definition + extract_containment_chain both need it).
62    let mut source_defs: Vec<&ScipDefinition> = Vec::with_capacity(path_filtered.len());
63    let mut parsed_symbols: Vec<scip::types::Symbol> = Vec::with_capacity(path_filtered.len());
64    for def in &path_filtered {
65        let parsed = match scip::symbol::parse_symbol(&def.scip_symbol) {
66            Ok(p) => p,
67            Err(_) => {
68                // Can't parse — keep it to be safe
69                source_defs.push(def);
70                parsed_symbols.push(scip::types::Symbol::default());
71                continue;
72            }
73        };
74        if is_noise_symbol(def, &parsed) {
75            continue;
76        }
77        source_defs.push(def);
78        parsed_symbols.push(parsed);
79    }
80
81    // Build a set of defined symbol strings -> qualified names for edge resolution.
82    let mut symbol_to_qname: HashMap<&str, &str> = HashMap::new();
83    for def in &source_defs {
84        symbol_to_qname.insert(&def.scip_symbol, &def.qualified_name);
85    }
86
87    // Phase 1: Create sym: nodes from definitions.
88    // Track created node IDs to avoid duplicates for synthetic parents.
89    let mut created_node_ids: HashSet<String> = HashSet::new();
90    let mut created_edge_ids: HashSet<String> = HashSet::new();
91    // Tier 3 folding: map folded symbol qname → parent node ID for edge redirection.
92    let mut folded_to_parent: HashMap<String, String> = HashMap::new();
93    // Collect folded children to batch-add to parent payloads after all nodes are created.
94    // Key: parent qname, Value: vec of (child label, tier3 category like "fields"/"type_params")
95    let mut folded_children: HashMap<String, Vec<(String, &'static str)>> = HashMap::new();
96
97    // Build containment chains from pre-parsed symbols (no re-parsing needed).
98    let def_chains: Vec<Vec<(String, NodeKind)>> = parsed_symbols
99        .iter()
100        .map(extract_containment_chain_from_parsed)
101        .collect();
102
103    for (def_idx, def) in source_defs.iter().enumerate() {
104        let kind = if def.is_test {
105            NodeKind::Test
106        } else {
107            def.kind
108        };
109
110        // Node tiering: Tier 3 kinds get folded into parent metadata.
111        let tier3_category = match kind {
112            NodeKind::Field | NodeKind::Property => Some("fields"),
113            NodeKind::TypeParameter => Some("type_params"),
114            NodeKind::EnumVariant => Some("variants"),
115            _ => None,
116        };
117
118        if let Some(category) = tier3_category {
119            // Find parent from containment chain.
120            let chain = &def_chains[def_idx];
121            if chain.len() >= 2 {
122                let parent_qname = &chain[chain.len() - 2].0;
123                let leaf_name = def
124                    .qualified_name
125                    .rsplit([':', '.'])
126                    .next()
127                    .unwrap_or(&def.qualified_name);
128                folded_children
129                    .entry(parent_qname.clone())
130                    .or_default()
131                    .push((leaf_name.to_string(), category));
132                folded_to_parent.insert(def.qualified_name.clone(), format!("sym:{parent_qname}"));
133                // Also map the scip_symbol for reference resolution.
134                symbol_to_qname.insert(&def.scip_symbol, &def.qualified_name);
135                continue; // Don't create a node for this definition.
136            }
137        }
138
139        let node_id = format!("sym:{}", def.qualified_name);
140
141        let mut payload = HashMap::new();
142        payload.insert(
143            "scip_symbol".to_string(),
144            serde_json::Value::String(def.scip_symbol.clone()),
145        );
146        payload.insert("line_start".to_string(), serde_json::json!(def.line_start));
147        payload.insert("line_end".to_string(), serde_json::json!(def.line_end));
148        payload.insert(
149            "file_path".to_string(),
150            serde_json::Value::String(def.file_path.clone()),
151        );
152        if def.is_test {
153            payload.insert("is_test".to_string(), serde_json::json!(true));
154        }
155        if def.is_generated {
156            payload.insert("is_generated".to_string(), serde_json::json!(true));
157        }
158        // Store type signature from first documentation line if available.
159        if let Some(type_sig) = def.documentation.first() {
160            payload.insert(
161                "type_signature".to_string(),
162                serde_json::Value::String(type_sig.clone()),
163            );
164        }
165        payload.insert(
166            "source".to_string(),
167            serde_json::Value::String("scip".to_string()),
168        );
169
170        created_node_ids.insert(node_id.clone());
171        nodes.push(GraphNode {
172            id: node_id.clone(),
173            kind,
174            label: def.qualified_name.clone(),
175            payload,
176            centrality: 0.0,
177            memory_id: None,
178            namespace: ns.clone(),
179            valid_from: None,
180            valid_to: None,
181        });
182
183        // Create containment edges: either hierarchical (nested chain) or flat (file→sym).
184        if config.hierarchical_containment {
185            let chain = &def_chains[def_idx];
186            let file_node_id = format!("file:{}", def.file_path);
187
188            if chain.len() <= 1 {
189                // No intermediate parents — just file→sym.
190                let edge_id = format!("contains:{file_node_id}->{node_id}");
191                if created_edge_ids.insert(edge_id.clone()) {
192                    edges.push(Edge {
193                        id: edge_id,
194                        src: file_node_id,
195                        dst: node_id.clone(),
196                        relationship: RelationshipType::Contains,
197                        weight: 0.1,
198                        properties: scip_edge_properties(),
199                        created_at: now,
200                        valid_from: Some(now),
201                        valid_to: None,
202                    });
203                }
204            } else {
205                // Build chain: file→top_parent→...→parent→leaf
206                for (i, (seg_qname, seg_kind)) in chain.iter().enumerate() {
207                    let seg_node_id = format!("sym:{seg_qname}");
208
209                    // Create synthetic intermediate node if needed (not the leaf itself).
210                    if seg_qname != &def.qualified_name
211                        && created_node_ids.insert(seg_node_id.clone())
212                    {
213                        let mut syn_payload = HashMap::new();
214                        syn_payload.insert(
215                            "source".to_string(),
216                            serde_json::Value::String("scip-synthetic".to_string()),
217                        );
218                        syn_payload.insert(
219                            "file_path".to_string(),
220                            serde_json::Value::String(def.file_path.clone()),
221                        );
222                        nodes.push(GraphNode {
223                            id: seg_node_id.clone(),
224                            kind: *seg_kind,
225                            label: seg_qname.clone(),
226                            payload: syn_payload,
227                            centrality: 0.0,
228                            memory_id: None,
229                            namespace: ns.clone(),
230                            valid_from: None,
231                            valid_to: None,
232                        });
233                    }
234
235                    // Create CONTAINS edge from parent to this segment.
236                    let parent_id = if i == 0 {
237                        file_node_id.clone()
238                    } else {
239                        format!("sym:{}", chain[i - 1].0)
240                    };
241
242                    let edge_id = format!("contains:{parent_id}->{seg_node_id}");
243                    if created_edge_ids.insert(edge_id.clone()) {
244                        edges.push(Edge {
245                            id: edge_id,
246                            src: parent_id,
247                            dst: seg_node_id,
248                            relationship: RelationshipType::Contains,
249                            weight: 0.1,
250                            properties: scip_edge_properties(),
251                            created_at: now,
252                            valid_from: Some(now),
253                            valid_to: None,
254                        });
255                    }
256                }
257            }
258        } else {
259            // Flat containment: file → symbol (original behavior).
260            let file_node_id = format!("file:{}", def.file_path);
261            edges.push(Edge {
262                id: format!("contains:{file_node_id}->{node_id}"),
263                src: file_node_id,
264                dst: node_id.clone(),
265                relationship: RelationshipType::Contains,
266                weight: 0.1,
267                properties: scip_edge_properties(),
268                created_at: now,
269                valid_from: Some(now),
270                valid_to: None,
271            });
272        }
273
274        // Create hover doc memories (if enabled in config).
275        if config.store_docs_as_memories && !def.documentation.is_empty() {
276            let doc_text = def.documentation.join("\n");
277            let mem_id = format!("scip-doc:{}", def.qualified_name);
278            let memory = MemoryNode {
279                id: mem_id,
280                content: doc_text,
281                memory_type: MemoryType::Context,
282                importance: 0.4,
283                confidence: 1.0,
284                access_count: 0,
285                content_hash: String::new(), // Will be computed by engine on persist.
286                tags: vec!["scip-doc".to_string(), "auto-generated".to_string()],
287                metadata: HashMap::new(),
288                namespace: ns.clone(),
289                session_id: None,
290                repo: None,
291                git_ref: None,
292                expires_at: None,
293                created_at: now,
294                updated_at: now,
295                last_accessed_at: now,
296            };
297            memories.push((memory, node_id.clone()));
298            doc_memories_created += 1;
299        }
300
301        // Create edges from SCIP relationships.
302        for rel in &def.relationships {
303            if rel.target_symbol.is_empty() {
304                continue;
305            }
306            // Resolve target to qualified name if it's a known symbol.
307            let target_node_id =
308                if let Some(qname) = symbol_to_qname.get(rel.target_symbol.as_str()) {
309                    format!("sym:{qname}")
310                } else {
311                    // Target might be external — try to parse as external node ID.
312                    match parse_external_node_id(&rel.target_symbol) {
313                        Some(ext_id) => ext_id,
314                        None => continue,
315                    }
316                };
317
318            if rel.is_implementation {
319                edges.push(Edge {
320                    id: format!("implements:{node_id}->{target_node_id}"),
321                    src: node_id.clone(),
322                    dst: target_node_id.clone(),
323                    relationship: RelationshipType::Implements,
324                    weight: 0.8,
325                    properties: scip_edge_properties(),
326                    created_at: now,
327                    valid_from: Some(now),
328                    valid_to: None,
329                });
330                // If the source is a method, also create OVERRIDES edge.
331                if def.kind == NodeKind::Method {
332                    edges.push(Edge {
333                        id: format!("overrides:{node_id}->{target_node_id}"),
334                        src: node_id.clone(),
335                        dst: target_node_id.clone(),
336                        relationship: RelationshipType::Overrides,
337                        weight: 0.8,
338                        properties: scip_edge_properties(),
339                        created_at: now,
340                        valid_from: Some(now),
341                        valid_to: None,
342                    });
343                }
344            }
345            if rel.is_type_definition {
346                edges.push(Edge {
347                    id: format!("typedef:{node_id}->{target_node_id}"),
348                    src: node_id.clone(),
349                    dst: target_node_id.clone(),
350                    relationship: RelationshipType::TypeDefinition,
351                    weight: 0.6,
352                    properties: scip_edge_properties(),
353                    created_at: now,
354                    valid_from: Some(now),
355                    valid_to: None,
356                });
357            }
358            // `is_reference` on a relationship indicates a superclass/supertype
359            // reference (e.g., class Dog extends Animal — Dog's SymbolInformation
360            // has a relationship to Animal with is_reference=true). Map to Inherits.
361            if rel.is_reference && !rel.is_implementation {
362                edges.push(Edge {
363                    id: format!("inherits:{node_id}->{target_node_id}"),
364                    src: node_id.clone(),
365                    dst: target_node_id,
366                    relationship: RelationshipType::Inherits,
367                    weight: 0.8,
368                    properties: scip_edge_properties(),
369                    created_at: now,
370                    valid_from: Some(now),
371                    valid_to: None,
372                });
373            }
374        }
375    }
376
377    // Apply folded Tier 3 children to parent node payloads.
378    for node in &mut nodes {
379        let qname = node.label.as_str();
380        if let Some(children) = folded_children.get(qname) {
381            let mut fields = Vec::new();
382            let mut type_params = Vec::new();
383            let mut variants = Vec::new();
384            for (name, category) in children {
385                match *category {
386                    "fields" => fields.push(serde_json::Value::String(name.clone())),
387                    "type_params" => type_params.push(serde_json::Value::String(name.clone())),
388                    "variants" => variants.push(serde_json::Value::String(name.clone())),
389                    _ => {}
390                }
391            }
392            if !fields.is_empty() {
393                node.payload
394                    .insert("fields".to_string(), serde_json::Value::Array(fields));
395            }
396            if !type_params.is_empty() {
397                node.payload.insert(
398                    "type_params".to_string(),
399                    serde_json::Value::Array(type_params),
400                );
401            }
402            if !variants.is_empty() {
403                node.payload
404                    .insert("variants".to_string(), serde_json::Value::Array(variants));
405            }
406        }
407    }
408
409    // Phase 2: Create pkg: nodes from external symbols (if enabled in config).
410    // Instead of one node per external symbol (thousands), we aggregate to one node
411    // per external *package* — this gives the API surface graph ("which modules depend
412    // on which packages") without polluting the graph with individual library symbols.
413    if config.create_external_nodes {
414        let mut pkg_nodes_created: HashSet<String> = HashSet::new();
415        for ext in &scip.externals {
416            if ext.package_manager.is_empty() || ext.package_name.is_empty() {
417                continue;
418            }
419            let node_id = format!("pkg:{}:{}", ext.package_manager, ext.package_name);
420            if !pkg_nodes_created.insert(node_id.clone()) {
421                continue; // Already created this package node
422            }
423
424            let mut payload = HashMap::new();
425            payload.insert(
426                "package_manager".to_string(),
427                serde_json::Value::String(ext.package_manager.clone()),
428            );
429            payload.insert(
430                "package_name".to_string(),
431                serde_json::Value::String(ext.package_name.clone()),
432            );
433            payload.insert(
434                "package_version".to_string(),
435                serde_json::Value::String(ext.package_version.clone()),
436            );
437            payload.insert(
438                "source".to_string(),
439                serde_json::Value::String("scip".to_string()),
440            );
441
442            nodes.push(GraphNode {
443                id: node_id,
444                kind: NodeKind::External,
445                label: ext.package_name.clone(),
446                payload,
447                centrality: 0.0,
448                memory_id: None,
449                namespace: ns.clone(),
450                valid_from: None,
451                valid_to: None,
452            });
453            ext_nodes_created += 1;
454        }
455    } // end if create_external_nodes
456
457    // Phase 3: Create edges from references.
458    // Pre-index source definitions by file path for O(1) lookup in find_enclosing_def.
459    // Exclude Tier 3 folded definitions — they have no graph nodes and would only
460    // inflate the linear scan in find_enclosing_def_indexed.
461    let mut defs_by_file: HashMap<&str, Vec<&ScipDefinition>> = HashMap::new();
462    for def in &source_defs {
463        if folded_to_parent.contains_key(&def.qualified_name) {
464            continue;
465        }
466        defs_by_file
467            .entry(def.file_path.as_str())
468            .or_default()
469            .push(def);
470    }
471
472    // Filter references to source files only.
473    let source_refs: Vec<&super::ScipReference> = scip
474        .references
475        .iter()
476        .filter(|r| is_source_path(&r.file_path))
477        .collect();
478
479    // Count references per (symbol, file) to enforce per-kind fan-out limits.
480    // Intentionally per-file, not global: a utility function referenced 30 times in
481    // file A and 30 times in file B stays under the limit in each file independently.
482    // Global counting would require a second pass; per-file is cheaper and still
483    // prevents the worst offenders (e.g., `log()` called 200 times in one file).
484    let mut ref_counts: HashMap<(&str, &str), usize> = HashMap::new();
485    for r in &source_refs {
486        *ref_counts
487            .entry((&r.scip_symbol, &r.file_path))
488            .or_insert(0) += 1;
489    }
490
491    // Build scip_symbol → NodeKind map for per-kind fan-out limits.
492    let symbol_to_kind: HashMap<&str, NodeKind> = source_defs
493        .iter()
494        .map(|d| (d.scip_symbol.as_str(), d.kind))
495        .collect();
496
497    for r in &source_refs {
498        // Skip high fan-out symbols using per-kind limits.
499        let count = ref_counts
500            .get(&(r.scip_symbol.as_str(), r.file_path.as_str()))
501            .copied()
502            .unwrap_or(0);
503        let target_kind = symbol_to_kind.get(r.scip_symbol.as_str()).copied();
504        let limit = match target_kind {
505            Some(NodeKind::Module) => config.fan_out_limits.module,
506            Some(NodeKind::Function) => config.fan_out_limits.function,
507            Some(NodeKind::Method) => config.fan_out_limits.method,
508            Some(NodeKind::Class | NodeKind::Trait | NodeKind::Interface) => {
509                config.fan_out_limits.class
510            }
511            _ => config.max_references_per_symbol,
512        };
513        if count > limit {
514            continue;
515        }
516
517        // R5: Filter noise calls using the blocklist.
518        if crate::index::blocklist::is_blocked_call_scip(&r.scip_symbol) {
519            continue;
520        }
521
522        // Resolve the referenced symbol to a node ID.
523        let mut target_node_id = if let Some(qname) = symbol_to_qname.get(r.scip_symbol.as_str()) {
524            format!("sym:{qname}")
525        } else {
526            // Might reference an external symbol.
527            match parse_external_node_id(&r.scip_symbol) {
528                Some(ext_id) => ext_id,
529                None => continue,
530            }
531        };
532
533        // Redirect folded Tier 3 symbols to their parent node.
534        if let Some(qname) = symbol_to_qname.get(r.scip_symbol.as_str()) {
535            if let Some(parent_id) = folded_to_parent.get(*qname) {
536                target_node_id = parent_id.clone();
537            }
538        }
539
540        // Find the enclosing definition in the same file to use as source.
541        // If we can't find one, use the file node.
542        let mut source_node_id = find_enclosing_def_indexed(&defs_by_file, &r.file_path, r.line)
543            .map(|def| format!("sym:{}", def.qualified_name))
544            .unwrap_or_else(|| format!("file:{}", r.file_path));
545
546        // Redirect if the enclosing def was itself folded.
547        if let Some(parent_id) = source_node_id
548            .strip_prefix("sym:")
549            .and_then(|qn| folded_to_parent.get(qn))
550        {
551            source_node_id = parent_id.clone();
552        }
553
554        // Don't create self-edges.
555        if source_node_id == target_node_id {
556            continue;
557        }
558
559        // Pick the most specific role for each reference. Priority:
560        //   IMPORT > WRITE > READ > kind-aware fallback
561        // A reference can have multiple role flags (e.g., IMPORT + READ_ACCESS),
562        // but we emit one edge per reference to avoid double-counting in
563        // PageRank — the more specific role subsumes the less specific one.
564        //
565        // scip-go workaround: scip-go sets READ_ACCESS on ALL references
566        // without semantic differentiation. When ONLY READ_ACCESS is set
567        // (no IMPORT, WRITE), fall through to the kind-aware default.
568        let semantic_mask = ROLE_IMPORT | ROLE_WRITE_ACCESS | ROLE_READ_ACCESS;
569        let is_scip_go_generic = r.role_bitmask & semantic_mask == ROLE_READ_ACCESS;
570
571        let (rel, weight) = if is_import_ref(r.role_bitmask) {
572            (RelationshipType::Imports, 0.5)
573        } else if is_write_ref(r.role_bitmask) {
574            (RelationshipType::Writes, 0.4)
575        } else if is_read_ref(r.role_bitmask) && !is_scip_go_generic {
576            (RelationshipType::Reads, 0.3)
577        } else {
578            // When the role bitmask is generic (no semantic flags), use the
579            // target node's kind to pick the correct relationship. A reference
580            // to a class is a type dependency, not a function call.
581            match target_kind {
582                Some(NodeKind::Class | NodeKind::Interface | NodeKind::Trait | NodeKind::Type) => {
583                    (RelationshipType::DependsOn, 0.3)
584                }
585                Some(NodeKind::Module | NodeKind::Package) => (RelationshipType::Imports, 0.5),
586                Some(NodeKind::Constant) => (RelationshipType::Reads, 0.3),
587                _ => (RelationshipType::Calls, 1.0),
588            }
589        };
590
591        let edge_prefix = rel.to_string().to_lowercase();
592        edges.push(Edge {
593            id: format!(
594                "{edge_prefix}:{source_node_id}->{target_node_id}:{}:{}",
595                r.file_path, r.line
596            ),
597            src: source_node_id.clone(),
598            dst: target_node_id.clone(),
599            relationship: rel,
600            weight,
601            properties: scip_edge_properties(),
602            created_at: now,
603            valid_from: Some(now),
604            valid_to: None,
605        });
606
607        // For non-import references to type-like symbols, also create a
608        // DependsOn edge. This captures "function X uses type Y" which is
609        // critical for blast-radius analysis. This is the ONE case where
610        // we emit two edges per reference — the structural relationship
611        // (Calls/Reads/Writes) AND the type dependency are distinct signals.
612        if !is_import_ref(r.role_bitmask) {
613            let is_type_target = matches!(
614                target_kind,
615                Some(
616                    NodeKind::Class
617                        | NodeKind::Trait
618                        | NodeKind::Interface
619                        | NodeKind::Type
620                        | NodeKind::Enum
621                )
622            );
623            if is_type_target {
624                edges.push(Edge {
625                    id: format!(
626                        "depends:{source_node_id}->{target_node_id}:{}:{}",
627                        r.file_path, r.line
628                    ),
629                    src: source_node_id,
630                    dst: target_node_id,
631                    relationship: RelationshipType::DependsOn,
632                    weight: 0.7,
633                    properties: scip_edge_properties(),
634                    created_at: now,
635                    valid_from: Some(now),
636                    valid_to: None,
637                });
638            }
639        }
640    }
641
642    // Deduplicate edges by ID (keep first occurrence).
643    let mut seen_edge_ids = HashSet::new();
644    edges.retain(|e| seen_edge_ids.insert(e.id.clone()));
645
646    // Collapse intra-class edges: methods of the same class calling each other
647    // are replaced by metadata on the parent class node.
648    if config.collapse_intra_class_edges && config.hierarchical_containment {
649        // Build sym:child → sym:parent map from containment edges.
650        let mut child_to_parent: HashMap<&str, &str> = HashMap::new();
651        for edge in &edges {
652            if edge.relationship == RelationshipType::Contains
653                && edge.src.starts_with("sym:")
654                && edge.dst.starts_with("sym:")
655            {
656                child_to_parent.insert(&edge.dst, &edge.src);
657            }
658        }
659
660        // Build node ID → kind map so we only collapse within class-like parents.
661        let node_kind_map: HashMap<&str, NodeKind> =
662            nodes.iter().map(|n| (n.id.as_str(), n.kind)).collect();
663
664        // Find edges where src and dst share the same class/struct/trait parent.
665        let mut intra_class_counts: HashMap<String, Vec<(String, String)>> = HashMap::new();
666        let mut intra_edge_ids: HashSet<String> = HashSet::new();
667        for edge in &edges {
668            if !matches!(
669                edge.relationship,
670                RelationshipType::Calls | RelationshipType::Reads | RelationshipType::Writes
671            ) {
672                continue;
673            }
674            let src_parent = child_to_parent.get(edge.src.as_str());
675            let dst_parent = child_to_parent.get(edge.dst.as_str());
676            if let (Some(sp), Some(dp)) = (src_parent, dst_parent) {
677                // Only collapse within class-like parents, not modules.
678                let parent_kind = node_kind_map.get(sp).copied();
679                let is_class_like = matches!(
680                    parent_kind,
681                    Some(NodeKind::Class | NodeKind::Trait | NodeKind::Interface | NodeKind::Enum)
682                );
683                if sp == dp && is_class_like {
684                    // Same parent — mark for collapsing.
685                    let src_leaf = edge.src.rsplit([':', '.']).next().unwrap_or(&edge.src);
686                    let dst_leaf = edge.dst.rsplit([':', '.']).next().unwrap_or(&edge.dst);
687                    intra_class_counts
688                        .entry(sp.to_string())
689                        .or_default()
690                        .push((src_leaf.to_string(), dst_leaf.to_string()));
691                    intra_edge_ids.insert(edge.id.clone());
692                }
693            }
694        }
695
696        // Remove intra-class edges and add metadata to parent nodes.
697        if !intra_edge_ids.is_empty() {
698            edges.retain(|e| !intra_edge_ids.contains(&e.id));
699            for node in &mut nodes {
700                if let Some(calls) = intra_class_counts.get(&node.id) {
701                    let call_entries: Vec<serde_json::Value> = calls
702                        .iter()
703                        .map(|(from, to)| serde_json::json!({"from": from, "to": to}))
704                        .collect();
705                    node.payload.insert(
706                        "intra_class_calls".to_string(),
707                        serde_json::Value::Array(call_entries),
708                    );
709                }
710            }
711        }
712    }
713
714    let files_covered: HashSet<String> = scip.covered_files.iter().cloned().collect();
715
716    // Ensure every node referenced by an edge exists. SCIP edges reference:
717    // - file: nodes (from CONTAINS edges) — may not exist if ast-grep didn't walk the file
718    // - ext: nodes (from reference/relationship edges) — may not be in scip.externals
719    // Without these, FK constraints cause entire edge batches to fail silently.
720    let existing_node_ids: HashSet<&str> = nodes.iter().map(|n| n.id.as_str()).collect();
721    let mut missing_ids: HashSet<String> = HashSet::new();
722    for edge in &edges {
723        if !existing_node_ids.contains(edge.src.as_str()) {
724            missing_ids.insert(edge.src.clone());
725        }
726        if !existing_node_ids.contains(edge.dst.as_str()) {
727            missing_ids.insert(edge.dst.clone());
728        }
729    }
730    for missing_id in &missing_ids {
731        let (kind, label) = if let Some(file_path) = missing_id.strip_prefix("file:") {
732            (NodeKind::File, file_path.to_string())
733        } else if let Some(pkg_rest) = missing_id.strip_prefix("pkg:") {
734            // pkg:{manager}:{name} — use package name as label
735            let label = pkg_rest.rsplit(':').next().unwrap_or(pkg_rest).to_string();
736            ext_nodes_created += 1;
737            (NodeKind::External, label)
738        } else if missing_id.starts_with("ext:") {
739            // Legacy ext: IDs from relationships — still create stub if needed
740            let label = missing_id
741                .rsplit(':')
742                .next()
743                .unwrap_or(missing_id)
744                .to_string();
745            ext_nodes_created += 1;
746            (NodeKind::External, label)
747        } else if let Some(qname) = missing_id.strip_prefix("sym:") {
748            // Interface methods, abstract methods, or symbols defined in external
749            // code that SCIP references but didn't emit a definition for.
750            let label = qname.rsplit([':', '.']).next().unwrap_or(qname).to_string();
751            (NodeKind::Method, label)
752        } else {
753            continue; // Don't create stubs for unknown prefixes
754        };
755        let mut payload = HashMap::new();
756        payload.insert(
757            "source".to_string(),
758            serde_json::Value::String("scip".to_string()),
759        );
760        nodes.push(GraphNode {
761            id: missing_id.clone(),
762            kind,
763            label,
764            payload,
765            centrality: 0.0,
766            memory_id: None,
767            namespace: ns.clone(),
768            valid_from: None,
769            valid_to: None,
770        });
771    }
772
773    // Filter edges whose endpoints were removed by noise filtering.
774    // Noise definitions (parameters, locals, typeLiterals) are filtered from nodes
775    // but SCIP references may still point to them, causing FK constraint failures.
776    // We also allow edges to file: and pkg: nodes which are created by the persistence
777    // layer (not in this build result's nodes vec).
778    //
779    // Additionally filter edges targeting language stdlib packages — these are the
780    // SCIP equivalent of the ast-grep call blocklist. Calls to TS builtins (Array,
781    // Promise, string) and Python stdlib (os, json, re) add no structural value.
782    let valid_node_ids: HashSet<&str> = nodes.iter().map(|n| n.id.as_str()).collect();
783    let edge_count_before = edges.len();
784    edges.retain(|e| {
785        // Drop edges to language stdlib packages
786        if is_stdlib_package(&e.dst) {
787            return false;
788        }
789        let src_ok = valid_node_ids.contains(e.src.as_str())
790            || e.src.starts_with("file:")
791            || e.src.starts_with("pkg:");
792        let dst_ok = valid_node_ids.contains(e.dst.as_str())
793            || e.dst.starts_with("file:")
794            || e.dst.starts_with("pkg:");
795        src_ok && dst_ok
796    });
797    let edges_dropped = edge_count_before - edges.len();
798    if edges_dropped > 0 {
799        tracing::debug!("Dropped {edges_dropped} SCIP edges referencing filtered/stdlib nodes");
800    }
801
802    ScipBuildResult {
803        nodes,
804        edges,
805        memories,
806        ext_nodes_created,
807        files_covered,
808        doc_memories_created,
809    }
810}
811
812/// Find the innermost definition that encloses a given line in a file,
813/// using a pre-indexed HashMap for O(defs_in_file) instead of O(all_defs).
814fn find_enclosing_def_indexed<'a>(
815    defs_by_file: &HashMap<&str, Vec<&'a ScipDefinition>>,
816    file_path: &str,
817    line: u32,
818) -> Option<&'a ScipDefinition> {
819    defs_by_file
820        .get(file_path)?
821        .iter()
822        .filter(|d| d.line_start <= line && d.line_end >= line)
823        .min_by_key(|d| d.line_end - d.line_start)
824        .copied()
825}
826
/// Check if a file path looks like a project source file (relative, no escape).
///
/// Returns `false` for:
/// - absolute paths (leading `/`);
/// - paths whose `..` components climb above the project root at any point
///   (`../x`, `a/../../x`); a balanced `a/../b` stays inside and is accepted;
/// - paths that pass through a well-known non-source directory (build caches,
///   vendored deps, virtualenvs, `node_modules`, build output);
/// - generated code and bundled/minified web assets.
///
/// Directory names are compared as whole path components, so a source
/// directory that merely contains one of the names (`src/rebuild/`,
/// `pkg/myvendor/`) is not rejected. Paths are expected to use `/` separators.
fn is_source_path(path: &str) -> bool {
    // Directory names that never hold first-party source.
    const REJECT_DIRS: [&str; 10] = [
        "node_modules",
        ".venv",
        "site-packages",
        "__pycache__",
        ".gradle",
        ".m2",
        "go-build",
        "vendor", // Go vendored deps
        "dist",
        "build",
    ];

    // Must be relative.
    if path.starts_with('/') {
        return false;
    }

    // Walk the components, tracking how deep below the project root we are so
    // that any `..` which climbs above the root is detected.
    let mut depth: usize = 0;
    let mut components = path.split('/').peekable();
    while let Some(component) = components.next() {
        // A component followed by another one is a directory; the final
        // component is the file name and is never matched against REJECT_DIRS.
        let is_dir = components.peek().is_some();
        match component {
            "" | "." => {}
            ".." => match depth.checked_sub(1) {
                Some(shallower) => depth = shallower,
                None => return false,
            },
            dir if is_dir && REJECT_DIRS.iter().any(|&rejected| rejected == dir) => {
                return false;
            }
            _ => depth += 1,
        }
    }

    // Reject generated code directories and output files.
    if path.contains("__generated__") || path.contains(".generated.") {
        return false;
    }

    // Reject bundled/minified JS and CSS output.
    let is_bundled_output = path.ends_with(".bundle.js")
        || path.ends_with(".min.js")
        || path.ends_with(".min.css")
        || path.contains("/webpack_bundles/");
    !is_bundled_output
}
866
867/// Fallback noise filter using SCIP descriptor suffixes.
868///
869/// This runs AFTER the primary `is_noise_kind()` filter in the reader. It catches
870/// noise that slips through when `SymbolInformation.Kind` is `UnspecifiedKind`
871/// (common with scip-go and older indexers that don't populate Kind).
872///
873/// Uses descriptor suffix metadata: parameters, type parameters, locals inside
874/// functions, positional disambiguators (trailing digits), and typeLiteral members.
875fn is_noise_symbol(def: &ScipDefinition, parsed: &scip::types::Symbol) -> bool {
876    // Skip generated code (SCIP provides this flag)
877    if def.is_generated {
878        return true;
879    }
880
881    // typeLiteral in any descriptor name (TS anonymous inline type members)
882    if parsed
883        .descriptors
884        .iter()
885        .any(|d| d.name.contains("typeLiteral"))
886    {
887        return true;
888    }
889
890    let leaf = match parsed.descriptors.last() {
891        Some(d) => d,
892        None => return false,
893    };
894
895    use scip::types::descriptor::Suffix;
896    match leaf.suffix.enum_value() {
897        // Parameters, type parameters, and locals are never graph-worthy.
898        // Local (suffix 8) is used by scip-typescript for local variables,
899        // destructured params, and function-scoped bindings.
900        Ok(Suffix::Parameter | Suffix::TypeParameter | Suffix::Local) => return true,
901        // Meta descriptors are compiler/framework metadata, not user code.
902        Ok(Suffix::Meta) => return true,
903        // Term descriptor: check context to distinguish fields from locals
904        Ok(Suffix::Term) => {
905            // Term inside a Method = local variable / destructured param.
906            let parent_suffix = parsed
907                .descriptors
908                .iter()
909                .rev()
910                .nth(1)
911                .and_then(|d| d.suffix.enum_value().ok());
912            if matches!(parent_suffix, Some(Suffix::Method)) {
913                return true;
914            }
915            // Positional disambiguator: any Term whose name ends in digits.
916            // SCIP appends digits to disambiguate anonymous/positional symbols.
917            if has_trailing_digits(&leaf.name) {
918                return true;
919            }
920        }
921        _ => {}
922    }
923
924    false
925}
926
/// Report whether `name` is longer than one byte and its final character is an
/// ASCII digit (the shape of a SCIP positional disambiguator).
fn has_trailing_digits(name: &str) -> bool {
    let bytes = name.as_bytes();
    if bytes.len() <= 1 {
        return false;
    }
    // In UTF-8 an ASCII digit byte can only be a complete character, so the
    // last byte alone decides whether the last character is a digit.
    bytes.last().is_some_and(u8::is_ascii_digit)
}
931
/// Report whether `node_id` names a language standard-library package.
///
/// Edges into these packages are dropped: calls to TS builtins, the Python
/// stdlib and similar add no structural value to the knowledge graph (this is
/// the SCIP counterpart of the ast-grep call blocklist). The comparison is an
/// exact match against the whole node ID.
fn is_stdlib_package(node_id: &str) -> bool {
    const STDLIB_PACKAGE_IDS: [&str; 13] = [
        "pkg:npm:typescript",
        "pkg:npm:@types/node",
        "pkg:python:python-stdlib",
        "pkg:python:typing_extensions",
        "pkg:python:builtins",
        "pkg:maven:java.lang",
        "pkg:maven:java.util",
        "pkg:maven:java.io",
        "pkg:go:builtin",
        "pkg:go:fmt",
        "pkg:cargo:std",
        "pkg:cargo:core",
        "pkg:cargo:alloc",
    ];
    STDLIB_PACKAGE_IDS.iter().any(|&known| known == node_id)
}
955
956/// Try to parse a SCIP symbol string into a package-level external node ID.
957///
958/// Returns `pkg:{manager}:{name}` — collapsing all symbols from the same package
959/// into a single node. This gives the API surface graph without per-symbol noise.
960fn parse_external_node_id(scip_symbol: &str) -> Option<String> {
961    let parsed = scip::symbol::parse_symbol(scip_symbol).ok()?;
962    let package = parsed.package.as_ref()?;
963    if package.manager.is_empty() || package.name.is_empty() {
964        return None;
965    }
966    Some(format!("pkg:{}:{}", package.manager, package.name))
967}
968
969/// Extract the containment chain from a pre-parsed SCIP symbol.
970///
971/// For a symbol like `rust-analyzer cargo foo 1.0 auth/middleware/validate_token().`,
972/// returns: `[("auth", Module), ("auth::middleware", Module), ("auth::middleware::validate_token", Function)]`.
973///
974/// The chain represents the hierarchical nesting: file→auth→auth::middleware→validate_token.
975fn extract_containment_chain_from_parsed(parsed: &scip::types::Symbol) -> Vec<(String, NodeKind)> {
976    // Detect separator from scheme
977    let scheme = &parsed.scheme;
978    let sep = if scheme == "rust-analyzer" || scheme == "lsif-clang" {
979        "::"
980    } else {
981        "."
982    };
983
984    let mut chain = Vec::new();
985    let mut cumulative_parts: Vec<&str> = Vec::new();
986    let leaf_kind = super::infer_kind_from_parsed(parsed);
987
988    for desc in &parsed.descriptors {
989        if desc.name.is_empty() {
990            continue;
991        }
992        cumulative_parts.push(&desc.name);
993        let qname = cumulative_parts.join(sep);
994        // For intermediate segments, use descriptor suffix to determine kind.
995        let seg_kind = if cumulative_parts.len() < parsed.descriptors.len() {
996            use scip::types::descriptor::Suffix;
997            match desc.suffix.enum_value() {
998                Ok(Suffix::Package | Suffix::Namespace) => NodeKind::Module,
999                Ok(Suffix::Type) => NodeKind::Class,
1000                Ok(Suffix::Method) => NodeKind::Method,
1001                Ok(Suffix::Macro) => NodeKind::Macro,
1002                _ => NodeKind::Module,
1003            }
1004        } else {
1005            leaf_kind
1006        };
1007        chain.push((qname, seg_kind));
1008    }
1009
1010    chain
1011}
1012
/// Report whether a qualified name belongs to a wildcard ambient module
/// declaration such as TypeScript's `declare module '*.css'`.
///
/// Such declarations are type-system catch-alls: every matching import
/// resolves to them, producing large numbers of fan-in edges that carry no
/// semantic value.
fn is_wildcard_module(qualified_name: &str) -> bool {
    // The glob pattern is part of the module name in the TS declaration, so
    // the qualified name contains a quote immediately followed by `*`.
    qualified_name
        .as_bytes()
        .windows(2)
        .any(|pair| pair == b"'*")
}
1023
1024/// Standard edge properties for SCIP-derived edges (allocated once, cloned per edge).
1025/// SCIP base confidence for multi-layer fusion.
1026/// ast-grep = 0.10, SCIP = 0.15, LSP = 0.20 (per north-star).
1027const SCIP_BASE_CONFIDENCE: f64 = 0.15;
1028
1029fn scip_edge_properties() -> HashMap<String, serde_json::Value> {
1030    use std::sync::LazyLock;
1031    static PROPS: LazyLock<HashMap<String, serde_json::Value>> = LazyLock::new(|| {
1032        let mut props = HashMap::new();
1033        props.insert(
1034            "source".to_string(),
1035            serde_json::Value::String("scip".to_string()),
1036        );
1037        props.insert(
1038            "confidence".to_string(),
1039            serde_json::json!(SCIP_BASE_CONFIDENCE),
1040        );
1041        props.insert("source_layers".to_string(), serde_json::json!(["scip"]));
1042        props
1043    });
1044    PROPS.clone()
1045}
1046
1047#[cfg(test)]
1048#[path = "../tests/scip_graph_builder_tests.rs"]
1049mod tests;