Skip to main content

codemem_engine/index/scip/
mod.rs

1//! SCIP integration: reader, orchestrator, and graph builder.
2//!
3//! - **Reader** (`mod.rs`): Parse `.scip` protobuf files into intermediate structs.
4//! - **Orchestrator** (`orchestrate.rs`): Auto-detect languages and indexers, run them.
5
6pub mod graph_builder;
7pub mod orchestrate;
8
9use codemem_core::NodeKind;
10use protobuf::Message;
11use scip::types::Index;
12
/// A symbol definition extracted from a SCIP index.
///
/// `parse_scip_bytes` emits one value per non-local occurrence that carries the
/// definition role, unless the symbol's `SymbolInformation.Kind` is filtered as
/// noise or no qualified name can be derived from the symbol string.
#[derive(Debug, Clone)]
pub struct ScipDefinition {
    /// Full SCIP symbol string (globally unique).
    pub scip_symbol: String,
    /// Language-appropriate qualified name (e.g., `auth::jwt::validate` for Rust).
    pub qualified_name: String,
    /// Relative file path from project root.
    pub file_path: String,
    /// Start line (0-indexed).
    pub line_start: u32,
    /// End line (0-indexed).
    ///
    /// Taken from the occurrence range when that range spans several lines.
    /// For identifier-only ranges `parse_scip_bytes` replaces it with an inferred
    /// body extent, which is `u32::MAX` when no later definition in the same file
    /// sits at the same or a shallower nesting depth.
    pub line_end: u32,
    /// Start column (0-indexed).
    pub col_start: u32,
    /// End column (0-indexed). Taken from the occurrence range, not the inferred extent.
    pub col_end: u32,
    /// Mapped node kind: from `SymbolInformation.Kind` when it maps to a `NodeKind`,
    /// otherwise inferred from the symbol's descriptor suffixes.
    pub kind: NodeKind,
    /// Hover documentation lines from `SymbolInformation.documentation`.
    /// Empty when the document has no `SymbolInformation` for this symbol.
    pub documentation: Vec<String>,
    /// Relationships declared by this symbol.
    /// Empty when the document has no `SymbolInformation` for this symbol.
    pub relationships: Vec<ScipRelationship>,
    /// Whether this definition is in test code (test bit of the occurrence roles).
    pub is_test: bool,
    /// Whether this definition is in generated code (generated bit of the occurrence roles).
    pub is_generated: bool,
}
41
/// A symbol reference (non-definition occurrence) extracted from a SCIP index.
///
/// Every non-local occurrence without the definition role becomes one of these.
#[derive(Debug, Clone)]
pub struct ScipReference {
    /// Full SCIP symbol string being referenced.
    pub scip_symbol: String,
    /// Relative file path where the reference occurs.
    pub file_path: String,
    /// Line number (0-indexed). This is the start line of the occurrence range.
    pub line: u32,
    /// Column (0-indexed).
    pub col_start: u32,
    /// End column (0-indexed). For a multi-line occurrence this column belongs to
    /// the range's end line, which is not stored here.
    pub col_end: u32,
    /// Raw role bitmask from SCIP; query it with `is_import_ref`, `is_read_ref`
    /// and `is_write_ref`.
    pub role_bitmask: i32,
}
58
/// An external (dependency) symbol from the SCIP index.
///
/// Only external symbols that are non-empty, non-local, parse successfully and
/// carry package information are kept by `parse_scip_bytes`.
#[derive(Debug, Clone)]
pub struct ScipExternal {
    /// Full SCIP symbol string.
    pub scip_symbol: String,
    /// Package manager (e.g., "cargo", "npm", "pip").
    pub package_manager: String,
    /// Package name (e.g., "serde", "@types/node").
    pub package_name: String,
    /// Package version (e.g., "1.0.0").
    pub package_version: String,
    /// Mapped node kind: from the symbol's `Kind` when it maps to a `NodeKind`,
    /// otherwise inferred from the symbol's descriptor suffixes.
    pub kind: NodeKind,
    /// Hover documentation.
    pub documentation: Vec<String>,
}
75
/// A relationship declared on a `SymbolInformation`.
///
/// Each flag is copied unchanged from the same-named field of the SCIP
/// relationship record; no interpretation happens in this module.
#[derive(Debug, Clone)]
pub struct ScipRelationship {
    /// Target symbol string.
    pub target_symbol: String,
    /// Copy of the relationship's `is_implementation` flag.
    pub is_implementation: bool,
    /// Copy of the relationship's `is_type_definition` flag.
    pub is_type_definition: bool,
    /// Copy of the relationship's `is_reference` flag.
    pub is_reference: bool,
    /// Copy of the relationship's `is_definition` flag.
    pub is_definition: bool,
}
86
/// Parsed result from reading a `.scip` file.
#[derive(Debug, Clone)]
pub struct ScipReadResult {
    /// Project root from SCIP metadata. Empty when the index carries no metadata.
    pub project_root: String,
    /// All symbol definitions found.
    pub definitions: Vec<ScipDefinition>,
    /// All symbol references found.
    pub references: Vec<ScipReference>,
    /// All external (dependency) symbols.
    pub externals: Vec<ScipExternal>,
    /// Relative file paths covered by this SCIP index: one entry per document, in
    /// document order. Stored as a `Vec`, so entries are not de-duplicated.
    pub covered_files: Vec<String>,
}
101
// SCIP occurrence role bits, listed in ascending bit order. An occurrence's
// `symbol_roles` value is a bitwise OR of these flags.

/// The occurrence defines the symbol; routes it to `definitions` instead of `references`.
const ROLE_DEFINITION: i32 = 1 << 0;
/// The occurrence is an import of the symbol.
pub(crate) const ROLE_IMPORT: i32 = 1 << 1;
/// The occurrence writes to the symbol.
pub(crate) const ROLE_WRITE_ACCESS: i32 = 1 << 2;
/// The occurrence reads the symbol.
pub(crate) const ROLE_READ_ACCESS: i32 = 1 << 3;
/// The occurrence is in generated code.
const ROLE_GENERATED: i32 = 1 << 4;
/// The occurrence is in test code.
const ROLE_TEST: i32 = 1 << 5;
109
110/// Parse SCIP protobuf bytes into intermediate structs.
111pub fn parse_scip_bytes(bytes: &[u8]) -> Result<ScipReadResult, String> {
112    let index = Index::parse_from_bytes(bytes)
113        .map_err(|e| format!("Failed to parse SCIP protobuf: {e}"))?;
114
115    let project_root = index
116        .metadata
117        .as_ref()
118        .map(|m| m.project_root.clone())
119        .unwrap_or_default();
120
121    let mut definitions = Vec::new();
122    let mut references = Vec::new();
123    let mut covered_files = Vec::new();
124
125    for doc in &index.documents {
126        let file_path = &doc.relative_path;
127        let language = &doc.language;
128        covered_files.push(file_path.clone());
129
130        let lang_sep = detect_language_separator(language);
131
132        // Build a map of symbol string -> SymbolInformation for this document.
133        let mut sym_info_map = std::collections::HashMap::new();
134        for sym_info in &doc.symbols {
135            if !sym_info.symbol.is_empty() {
136                sym_info_map.insert(sym_info.symbol.as_str(), sym_info);
137            }
138        }
139
140        for occ in &doc.occurrences {
141            if occ.symbol.is_empty() || scip::symbol::is_local_symbol(&occ.symbol) {
142                continue;
143            }
144
145            let (start_line, start_col, end_line, end_col) = match parse_range(&occ.range) {
146                Some(r) => r,
147                None => continue,
148            };
149
150            let roles = occ.symbol_roles;
151            let is_def = (roles & ROLE_DEFINITION) != 0;
152            let is_test = (roles & ROLE_TEST) != 0;
153            let is_generated = (roles & ROLE_GENERATED) != 0;
154
155            if is_def {
156                // Early noise filter: if SymbolInformation.Kind identifies this as a
157                // variable, parameter, or literal type, skip it entirely. This avoids
158                // building qualified names and containment chains for noise symbols.
159                if let Some(info) = sym_info_map.get(occ.symbol.as_str()) {
160                    if is_noise_kind(info.kind.value()) {
161                        continue;
162                    }
163                }
164
165                let qualified_name = match scip_symbol_to_qualified_name(&occ.symbol, lang_sep) {
166                    Some(q) => q,
167                    None => continue,
168                };
169
170                // Look up SymbolInformation for kind and documentation.
171                let (kind, documentation, relationships) =
172                    if let Some(info) = sym_info_map.get(occ.symbol.as_str()) {
173                        let kind = resolve_node_kind(info.kind.value(), &occ.symbol);
174                        let docs: Vec<String> =
175                            info.documentation.iter().map(|s| s.to_string()).collect();
176                        let rels: Vec<ScipRelationship> = info
177                            .relationships
178                            .iter()
179                            .map(|r| ScipRelationship {
180                                target_symbol: r.symbol.clone(),
181                                is_implementation: r.is_implementation,
182                                is_type_definition: r.is_type_definition,
183                                is_reference: r.is_reference,
184                                is_definition: r.is_definition,
185                            })
186                            .collect();
187                        (kind, docs, rels)
188                    } else {
189                        (infer_kind_from_symbol(&occ.symbol), Vec::new(), Vec::new())
190                    };
191
192                definitions.push(ScipDefinition {
193                    scip_symbol: occ.symbol.clone(),
194                    qualified_name,
195                    file_path: file_path.clone(),
196                    line_start: start_line,
197                    line_end: end_line,
198                    col_start: start_col,
199                    col_end: end_col,
200                    kind,
201                    documentation,
202                    relationships,
203                    is_test,
204                    is_generated,
205                });
206            } else {
207                references.push(ScipReference {
208                    scip_symbol: occ.symbol.clone(),
209                    file_path: file_path.clone(),
210                    line: start_line,
211                    col_start: start_col,
212                    col_end: end_col,
213                    role_bitmask: roles,
214                });
215            }
216        }
217    }
218
219    // Process external symbols.
220    let externals = index
221        .external_symbols
222        .iter()
223        .filter(|ext| !ext.symbol.is_empty() && !scip::symbol::is_local_symbol(&ext.symbol))
224        .filter_map(|ext| {
225            let parsed = scip::symbol::parse_symbol(&ext.symbol).ok()?;
226            let package = parsed.package.as_ref()?;
227            let kind = resolve_node_kind(ext.kind.value(), &ext.symbol);
228            let documentation: Vec<String> =
229                ext.documentation.iter().map(|s| s.to_string()).collect();
230
231            Some(ScipExternal {
232                scip_symbol: ext.symbol.clone(),
233                package_manager: package.manager.clone(),
234                package_name: package.name.clone(),
235                package_version: package.version.clone(),
236                kind,
237                documentation,
238            })
239        })
240        .collect();
241
242    // SCIP occurrences only mark the identifier token, not the full body extent.
243    // Infer body ranges: each definition extends to the next sibling at the same
244    // nesting depth (or end-of-file). This lets find_enclosing_def_indexed() match
245    // references inside function bodies to their enclosing function.
246    infer_definition_extents(&mut definitions);
247
248    Ok(ScipReadResult {
249        project_root,
250        definitions,
251        references,
252        externals,
253        covered_files,
254    })
255}
256
257/// Infer body extents for definitions whose `line_start == line_end` (identifier-only ranges).
258///
259/// Groups definitions by file, sorts by line, then counts nesting depth (based on
260/// SCIP descriptor chain length). Each definition's `line_end` is set to just before
261/// the next sibling at the same or shallower depth, or `u32::MAX` for the last in a file.
262fn infer_definition_extents(definitions: &mut [ScipDefinition]) {
263    use std::collections::HashMap;
264
265    // Group definition indices by file (clone file_path to avoid borrowing definitions).
266    let mut by_file: HashMap<String, Vec<usize>> = HashMap::new();
267    for (i, def) in definitions.iter().enumerate() {
268        by_file.entry(def.file_path.clone()).or_default().push(i);
269    }
270
271    for indices in by_file.values() {
272        // Sort by line_start.
273        let mut sorted: Vec<usize> = indices.clone();
274        sorted.sort_by_key(|&i| definitions[i].line_start);
275
276        // Pre-compute depths to avoid re-borrowing during mutation.
277        let depths: Vec<usize> = sorted
278            .iter()
279            .map(|&i| descriptor_depth(&definitions[i].scip_symbol))
280            .collect();
281
282        for pos in 0..sorted.len() {
283            let idx = sorted[pos];
284            // Skip if already has a meaningful range (multi-line identifier).
285            if definitions[idx].line_end > definitions[idx].line_start {
286                continue;
287            }
288
289            let my_depth = depths[pos];
290
291            // Find the next definition at the same or shallower depth.
292            // Default to u32::MAX so the last definition in a file "owns" all
293            // remaining lines. This is a known trade-off: references in trailing
294            // comments or whitespace will be attributed to the last function.
295            // Without CST data we can't know where the body actually ends.
296            let mut end_line = u32::MAX;
297            for next_pos in pos + 1..sorted.len() {
298                if depths[next_pos] <= my_depth {
299                    end_line = definitions[sorted[next_pos]].line_start.saturating_sub(1);
300                    break;
301                }
302            }
303
304            definitions[idx].line_end = end_line;
305        }
306    }
307}
308
309/// Count the number of descriptors in a SCIP symbol to determine nesting depth.
310fn descriptor_depth(scip_symbol: &str) -> usize {
311    scip::symbol::parse_symbol(scip_symbol)
312        .map(|p| p.descriptors.len())
313        .unwrap_or(0)
314}
315
/// Decode a SCIP occurrence range into `(start_line, start_col, end_line, end_col)`.
///
/// Accepted encodings:
/// - three elements, `[line, startCol, endCol]`: a single-line range, so the end
///   line equals the start line;
/// - four elements, `[startLine, startCol, endLine, endCol]`: a multi-line range.
///
/// Returns `None` for any other length, or when a component is negative and
/// therefore cannot be represented as `u32`.
fn parse_range(range: &[i32]) -> Option<(u32, u32, u32, u32)> {
    let coord = |raw: i32| u32::try_from(raw).ok();

    match *range {
        [line, start_col, end_col] => {
            let line = coord(line)?;
            Some((line, coord(start_col)?, line, coord(end_col)?))
        }
        [start_line, start_col, end_line, end_col] => Some((
            coord(start_line)?,
            coord(start_col)?,
            coord(end_line)?,
            coord(end_col)?,
        )),
        _ => None,
    }
}
337
338/// Extract a qualified name from a SCIP symbol string using the appropriate language separator.
339///
340/// Strips scheme, package manager, package name, and version — joins descriptors with the
341/// language-appropriate separator (`::` for Rust, `.` for Python/TS/Java/Go).
342pub fn scip_symbol_to_qualified_name(scip_symbol: &str, lang_separator: &str) -> Option<String> {
343    let parsed = scip::symbol::parse_symbol(scip_symbol).ok()?;
344    let parts: Vec<&str> = parsed
345        .descriptors
346        .iter()
347        .map(|d| d.name.as_str())
348        .filter(|s| !s.is_empty())
349        .collect();
350    if parts.is_empty() {
351        return None;
352    }
353    Some(parts.join(lang_separator))
354}
355
/// Pick the qualified-name separator for a language identifier.
///
/// The comparison is case-insensitive: `rust`, `cpp` and `c++` yield `"::"`;
/// every other value (including the empty string) yields `"."`.
pub fn detect_language_separator(language: &str) -> &'static str {
    let normalized = language.to_lowercase();
    if matches!(normalized.as_str(), "rust" | "cpp" | "c++") {
        "::"
    } else {
        "."
    }
}
365
366/// Map a SCIP `SymbolInformation.Kind` integer value to a codemem `NodeKind`.
367///
368/// Uses the `scip::types::symbol_information::Kind` enum for compile-time safety.
369/// Returns `None` for `UnspecifiedKind` (0) so callers can fall back to descriptor inference.
370fn scip_kind_to_node_kind(kind: i32) -> Option<NodeKind> {
371    use scip::types::symbol_information::Kind;
372    match kind {
373        x if x == Kind::Class as i32 || x == Kind::Struct as i32 => Some(NodeKind::Class),
374        x if x == Kind::Interface as i32 || x == Kind::Protocol as i32 => Some(NodeKind::Interface),
375        x if x == Kind::Trait as i32 => Some(NodeKind::Trait),
376        x if x == Kind::Enum as i32 => Some(NodeKind::Enum),
377        x if x == Kind::EnumMember as i32 => Some(NodeKind::EnumVariant),
378        x if x == Kind::Field as i32
379            || x == Kind::StaticField as i32
380            || x == Kind::StaticDataMember as i32 =>
381        {
382            Some(NodeKind::Field)
383        }
384        x if x == Kind::Property as i32 || x == Kind::StaticProperty as i32 => {
385            Some(NodeKind::Property)
386        }
387        x if x == Kind::TypeParameter as i32 => Some(NodeKind::TypeParameter),
388        x if x == Kind::Macro as i32 => Some(NodeKind::Macro),
389        x if x == Kind::Function as i32 || x == Kind::Constructor as i32 => {
390            Some(NodeKind::Function)
391        }
392        x if x == Kind::Method as i32
393            || x == Kind::StaticMethod as i32
394            || x == Kind::AbstractMethod as i32
395            || x == Kind::TraitMethod as i32
396            || x == Kind::ProtocolMethod as i32
397            || x == Kind::PureVirtualMethod as i32
398            || x == Kind::MethodSpecification as i32
399            || x == Kind::Getter as i32
400            || x == Kind::Setter as i32
401            || x == Kind::Accessor as i32 =>
402        {
403            Some(NodeKind::Method)
404        }
405        x if x == Kind::Namespace as i32
406            || x == Kind::Module as i32
407            || x == Kind::PackageObject as i32 =>
408        {
409            Some(NodeKind::Module)
410        }
411        x if x == Kind::Package as i32 || x == Kind::Library as i32 => Some(NodeKind::Package),
412        x if x == Kind::TypeAlias as i32
413            || x == Kind::Type as i32
414            || x == Kind::AssociatedType as i32 =>
415        {
416            Some(NodeKind::Type)
417        }
418        x if x == Kind::Constant as i32 || x == Kind::StaticVariable as i32 => {
419            Some(NodeKind::Constant)
420        }
421        _ => None,
422    }
423}
424
425/// Check if a `SymbolInformation.Kind` value represents a symbol that should
426/// never become a graph node in a knowledge graph.
427///
428/// This is the primary noise filter — when the indexer provides a Kind, we trust
429/// it over descriptor Suffix heuristics. Variables, parameters, literal types,
430/// and other non-structural symbols are filtered here.
431pub fn is_noise_kind(kind: i32) -> bool {
432    use scip::types::symbol_information::Kind;
433    matches!(kind,
434        x if x == Kind::Variable as i32
435            || x == Kind::Parameter as i32
436            || x == Kind::SelfParameter as i32
437            || x == Kind::ThisParameter as i32
438            || x == Kind::ParameterLabel as i32
439            || x == Kind::TypeParameter as i32
440            // Literal/value types — not structural
441            || x == Kind::Boolean as i32
442            || x == Kind::Number as i32
443            || x == Kind::String as i32
444            || x == Kind::Null as i32
445            || x == Kind::Array as i32
446            || x == Kind::Object as i32
447            || x == Kind::Key as i32
448            || x == Kind::Pattern as i32
449            // Receiver/error types
450            || x == Kind::MethodReceiver as i32
451            || x == Kind::Error as i32
452    )
453}
454
455/// Infer `NodeKind` from the SCIP symbol's descriptor suffixes when
456/// `SymbolInformation.Kind` is `UnspecifiedKind` (e.g., scip-go).
457///
458/// Descriptor suffix conventions (from the SCIP spec):
459/// - `/` → Package/Namespace
460/// - `#` → Type (struct, class, interface)
461/// - `().` → Method
462/// - `.` → Term (function, variable, field)
463/// - `!` → Macro
464pub fn infer_kind_from_symbol(scip_symbol: &str) -> NodeKind {
465    let parsed = match scip::symbol::parse_symbol(scip_symbol) {
466        Ok(p) => p,
467        Err(_) => return NodeKind::Function,
468    };
469    infer_kind_from_parsed(&parsed)
470}
471
472/// Infer node kind from an already-parsed SCIP symbol, avoiding a redundant parse.
473pub fn infer_kind_from_parsed(parsed: &scip::types::Symbol) -> NodeKind {
474    let last = match parsed.descriptors.last() {
475        Some(d) => d,
476        None => return NodeKind::Function,
477    };
478    use scip::types::descriptor::Suffix;
479    match last.suffix.enum_value() {
480        Ok(Suffix::Package | Suffix::Namespace) => NodeKind::Module,
481        Ok(Suffix::Type) => NodeKind::Class,
482        Ok(Suffix::Method) => NodeKind::Method,
483        Ok(Suffix::Macro) => NodeKind::Macro,
484        Ok(Suffix::TypeParameter) => NodeKind::TypeParameter,
485        Ok(Suffix::Parameter) => NodeKind::Field,
486        Ok(Suffix::Term) => {
487            // A Term inside a Type (class/interface/struct) is a field/property,
488            // not a function. Check the parent descriptor.
489            let parent = parsed.descriptors.iter().rev().nth(1);
490            match parent.and_then(|d| d.suffix.enum_value().ok()) {
491                Some(Suffix::Type) => NodeKind::Field,
492                _ => {
493                    // Module-level Term with UPPER_CASE name → Constant, not Function.
494                    // SCIP classifies `const ACCOUNT_ROUTE = "/account"` as Term,
495                    // but these are constants/config values, not callable functions.
496                    if is_constant_name(&last.name) {
497                        NodeKind::Constant
498                    } else {
499                        NodeKind::Function
500                    }
501                }
502            }
503        }
504        _ => NodeKind::Function, // Meta, Local, UnspecifiedSuffix
505    }
506}
507
/// Decide whether `name` is written in constant style (UPPER_CASE_WITH_UNDERSCORES
/// or ALL_CAPS).
///
/// The name may contain only ASCII uppercase letters, ASCII digits and `_`, and
/// must contain at least one uppercase letter.
/// Matches: `ACCOUNT_ROUTE`, `API_URL`, `MAX_RETRIES`, `DEBUG`.
/// Rejects: `useState`, `handleClick`, `_build_filters`, and the empty string.
fn is_constant_name(name: &str) -> bool {
    let mut saw_upper = false;
    for ch in name.chars() {
        match ch {
            'A'..='Z' => saw_upper = true,
            '0'..='9' | '_' => {}
            _ => return false,
        }
    }
    saw_upper
}
518
519/// Resolve a node kind: use the SCIP `Kind` if specified, otherwise infer from the symbol.
520fn resolve_node_kind(kind: i32, scip_symbol: &str) -> NodeKind {
521    scip_kind_to_node_kind(kind).unwrap_or_else(|| infer_kind_from_symbol(scip_symbol))
522}
523
524/// Check if a reference has the import role.
525pub fn is_import_ref(role_bitmask: i32) -> bool {
526    (role_bitmask & ROLE_IMPORT) != 0
527}
528
529/// Check if a reference has read access.
530pub fn is_read_ref(role_bitmask: i32) -> bool {
531    (role_bitmask & ROLE_READ_ACCESS) != 0
532}
533
534/// Check if a reference has write access.
535pub fn is_write_ref(role_bitmask: i32) -> bool {
536    (role_bitmask & ROLE_WRITE_ACCESS) != 0
537}
538
539#[cfg(test)]
540#[path = "../tests/scip_reader_tests.rs"]
541mod tests;