Skip to main content

codemem_engine/index/scip/
mod.rs

//! SCIP integration: reader, orchestrator, and graph builder.
//!
//! - **Reader** (`mod.rs`): Parse `.scip` protobuf files into intermediate structs.
//! - **Orchestrator** (`orchestrate.rs`): Auto-detect languages and indexers, run them.
//! - **Graph builder** (`graph_builder` module): See that module's own documentation.
5
6pub mod graph_builder;
7pub mod orchestrate;
8
9use codemem_core::NodeKind;
10use protobuf::Message;
11use scip::types::Index;
12
/// A symbol definition extracted from a SCIP index.
///
/// `parse_scip_bytes` emits one of these for every non-local occurrence whose
/// role bitmask has the definition bit set and whose symbol yields a
/// qualified name.
#[derive(Debug, Clone)]
pub struct ScipDefinition {
    /// Full SCIP symbol string (globally unique), copied verbatim from the occurrence.
    pub scip_symbol: String,
    /// Language-appropriate qualified name (e.g., `auth::jwt::validate` for Rust):
    /// the symbol's non-empty descriptor names joined with the language separator.
    pub qualified_name: String,
    /// Relative file path from project root (the document's `relative_path`).
    pub file_path: String,
    /// Start line (0-indexed).
    pub line_start: u32,
    /// End line (0-indexed).
    ///
    /// The raw occurrence range usually covers only the identifier, so a range
    /// with `line_end == line_start` is rewritten by `infer_definition_extents`
    /// to an inferred body extent. That extent is `u32::MAX` when no later
    /// definition in the same file closes it.
    pub line_end: u32,
    /// Start column (0-indexed).
    pub col_start: u32,
    /// End column (0-indexed).
    pub col_end: u32,
    /// Mapped node kind: taken from `SymbolInformation.kind` when that maps to a
    /// `NodeKind`, otherwise inferred from the symbol's descriptor suffixes.
    pub kind: NodeKind,
    /// Hover documentation lines from `SymbolInformation.documentation`.
    /// Empty when the document carries no `SymbolInformation` for this symbol.
    pub documentation: Vec<String>,
    /// Relationships declared by this symbol.
    /// Empty when the document carries no `SymbolInformation` for this symbol.
    pub relationships: Vec<ScipRelationship>,
    /// Whether this definition is in test code (test bit of the occurrence's role bitmask).
    pub is_test: bool,
    /// Whether this definition is in generated code (generated bit of the occurrence's role bitmask).
    pub is_generated: bool,
}
41
/// A symbol reference (non-definition occurrence) extracted from a SCIP index.
///
/// `parse_scip_bytes` emits one of these for every non-local occurrence whose
/// role bitmask does not have the definition bit set.
#[derive(Debug, Clone)]
pub struct ScipReference {
    /// Full SCIP symbol string being referenced.
    pub scip_symbol: String,
    /// Relative file path where the reference occurs.
    pub file_path: String,
    /// Line number (0-indexed). This is the occurrence's start line; the end
    /// line of a multi-line occurrence is not recorded.
    pub line: u32,
    /// Column (0-indexed).
    pub col_start: u32,
    /// End column (0-indexed). For a multi-line occurrence this column belongs
    /// to the (unrecorded) end line, not to `line`.
    pub col_end: u32,
    /// Raw role bitmask from SCIP. Decode with `is_import_ref`, `is_read_ref`
    /// and `is_write_ref`.
    pub role_bitmask: i32,
}
58
/// An external (dependency) symbol from the SCIP index.
///
/// Built from the index's `external_symbols`. Entries that are empty, local,
/// unparseable, or that carry no package information are dropped by
/// `parse_scip_bytes`.
#[derive(Debug, Clone)]
pub struct ScipExternal {
    /// Full SCIP symbol string.
    pub scip_symbol: String,
    /// Package manager (e.g., "cargo", "npm", "pip"), from the parsed symbol's package.
    pub package_manager: String,
    /// Package name (e.g., "serde", "@types/node"), from the parsed symbol's package.
    pub package_name: String,
    /// Package version (e.g., "1.0.0"), from the parsed symbol's package.
    pub package_version: String,
    /// Mapped node kind: the declared SCIP kind when it maps to a `NodeKind`,
    /// otherwise inferred from the symbol's descriptor suffixes.
    pub kind: NodeKind,
    /// Hover documentation.
    pub documentation: Vec<String>,
}
75
/// A relationship declared on a `SymbolInformation`.
///
/// Every field is copied unchanged from the corresponding field of the SCIP
/// relationship message; no interpretation is applied here.
#[derive(Debug, Clone)]
pub struct ScipRelationship {
    /// Target symbol string.
    pub target_symbol: String,
    /// Mirrors the relationship's `is_implementation` flag.
    pub is_implementation: bool,
    /// Mirrors the relationship's `is_type_definition` flag.
    pub is_type_definition: bool,
    /// Mirrors the relationship's `is_reference` flag.
    pub is_reference: bool,
    /// Mirrors the relationship's `is_definition` flag.
    pub is_definition: bool,
}
86
/// Parsed result from reading a `.scip` file.
#[derive(Debug, Clone)]
pub struct ScipReadResult {
    /// Project root from SCIP metadata. Empty when the index has no metadata.
    pub project_root: String,
    /// All symbol definitions found, with body extents already inferred.
    pub definitions: Vec<ScipDefinition>,
    /// All symbol references found.
    pub references: Vec<ScipReference>,
    /// All external (dependency) symbols.
    pub externals: Vec<ScipExternal>,
    /// Relative file paths covered by this SCIP index: one entry per document,
    /// in document order. This is a plain `Vec`, so it is not deduplicated.
    pub covered_files: Vec<String>,
}
101
// SCIP occurrence role bitmask constants.
//
// Each constant is a single bit tested against `Occurrence.symbol_roles`.
// NOTE(review): values are expected to match the `SymbolRole` enum in the SCIP
// protocol definition — confirm against scip.proto when upgrading the crate.

// Occurrence is a definition; decides definition vs. reference in `parse_scip_bytes`.
const ROLE_DEFINITION: i32 = 0x1;
// Occurrence is an import; tested by `is_import_ref`.
pub(crate) const ROLE_IMPORT: i32 = 0x2;
// Occurrence writes the symbol; tested by `is_write_ref`.
pub(crate) const ROLE_WRITE_ACCESS: i32 = 0x4;
// Occurrence reads the symbol; tested by `is_read_ref`.
pub(crate) const ROLE_READ_ACCESS: i32 = 0x8;
// Occurrence is in test code; feeds `ScipDefinition::is_test`.
// (Declared before ROLE_GENERATED although its bit value is higher.)
const ROLE_TEST: i32 = 0x20;
// Occurrence is in generated code; feeds `ScipDefinition::is_generated`.
const ROLE_GENERATED: i32 = 0x10;
109
110/// Parse SCIP protobuf bytes into intermediate structs.
111pub fn parse_scip_bytes(bytes: &[u8]) -> Result<ScipReadResult, String> {
112    let index = Index::parse_from_bytes(bytes)
113        .map_err(|e| format!("Failed to parse SCIP protobuf: {e}"))?;
114
115    let project_root = index
116        .metadata
117        .as_ref()
118        .map(|m| m.project_root.clone())
119        .unwrap_or_default();
120
121    let mut definitions = Vec::new();
122    let mut references = Vec::new();
123    let mut covered_files = Vec::new();
124
125    for doc in &index.documents {
126        let file_path = &doc.relative_path;
127        let language = &doc.language;
128        covered_files.push(file_path.clone());
129
130        let lang_sep = detect_language_separator(language);
131
132        // Build a map of symbol string -> SymbolInformation for this document.
133        let mut sym_info_map = std::collections::HashMap::new();
134        for sym_info in &doc.symbols {
135            if !sym_info.symbol.is_empty() {
136                sym_info_map.insert(sym_info.symbol.as_str(), sym_info);
137            }
138        }
139
140        for occ in &doc.occurrences {
141            if occ.symbol.is_empty() || scip::symbol::is_local_symbol(&occ.symbol) {
142                continue;
143            }
144
145            let (start_line, start_col, end_line, end_col) = match parse_range(&occ.range) {
146                Some(r) => r,
147                None => continue,
148            };
149
150            let roles = occ.symbol_roles;
151            let is_def = (roles & ROLE_DEFINITION) != 0;
152            let is_test = (roles & ROLE_TEST) != 0;
153            let is_generated = (roles & ROLE_GENERATED) != 0;
154
155            if is_def {
156                let qualified_name = match scip_symbol_to_qualified_name(&occ.symbol, lang_sep) {
157                    Some(q) => q,
158                    None => continue,
159                };
160
161                // Look up SymbolInformation for kind and documentation.
162                let (kind, documentation, relationships) =
163                    if let Some(info) = sym_info_map.get(occ.symbol.as_str()) {
164                        let kind = resolve_node_kind(info.kind.value(), &occ.symbol);
165                        let docs: Vec<String> =
166                            info.documentation.iter().map(|s| s.to_string()).collect();
167                        let rels: Vec<ScipRelationship> = info
168                            .relationships
169                            .iter()
170                            .map(|r| ScipRelationship {
171                                target_symbol: r.symbol.clone(),
172                                is_implementation: r.is_implementation,
173                                is_type_definition: r.is_type_definition,
174                                is_reference: r.is_reference,
175                                is_definition: r.is_definition,
176                            })
177                            .collect();
178                        (kind, docs, rels)
179                    } else {
180                        (infer_kind_from_symbol(&occ.symbol), Vec::new(), Vec::new())
181                    };
182
183                definitions.push(ScipDefinition {
184                    scip_symbol: occ.symbol.clone(),
185                    qualified_name,
186                    file_path: file_path.clone(),
187                    line_start: start_line,
188                    line_end: end_line,
189                    col_start: start_col,
190                    col_end: end_col,
191                    kind,
192                    documentation,
193                    relationships,
194                    is_test,
195                    is_generated,
196                });
197            } else {
198                references.push(ScipReference {
199                    scip_symbol: occ.symbol.clone(),
200                    file_path: file_path.clone(),
201                    line: start_line,
202                    col_start: start_col,
203                    col_end: end_col,
204                    role_bitmask: roles,
205                });
206            }
207        }
208    }
209
210    // Process external symbols.
211    let externals = index
212        .external_symbols
213        .iter()
214        .filter(|ext| !ext.symbol.is_empty() && !scip::symbol::is_local_symbol(&ext.symbol))
215        .filter_map(|ext| {
216            let parsed = scip::symbol::parse_symbol(&ext.symbol).ok()?;
217            let package = parsed.package.as_ref()?;
218            let kind = resolve_node_kind(ext.kind.value(), &ext.symbol);
219            let documentation: Vec<String> =
220                ext.documentation.iter().map(|s| s.to_string()).collect();
221
222            Some(ScipExternal {
223                scip_symbol: ext.symbol.clone(),
224                package_manager: package.manager.clone(),
225                package_name: package.name.clone(),
226                package_version: package.version.clone(),
227                kind,
228                documentation,
229            })
230        })
231        .collect();
232
233    // SCIP occurrences only mark the identifier token, not the full body extent.
234    // Infer body ranges: each definition extends to the next sibling at the same
235    // nesting depth (or end-of-file). This lets find_enclosing_def_indexed() match
236    // references inside function bodies to their enclosing function.
237    infer_definition_extents(&mut definitions);
238
239    Ok(ScipReadResult {
240        project_root,
241        definitions,
242        references,
243        externals,
244        covered_files,
245    })
246}
247
248/// Infer body extents for definitions whose `line_start == line_end` (identifier-only ranges).
249///
250/// Groups definitions by file, sorts by line, then counts nesting depth (based on
251/// SCIP descriptor chain length). Each definition's `line_end` is set to just before
252/// the next sibling at the same or shallower depth, or `u32::MAX` for the last in a file.
253fn infer_definition_extents(definitions: &mut [ScipDefinition]) {
254    use std::collections::HashMap;
255
256    // Group definition indices by file (clone file_path to avoid borrowing definitions).
257    let mut by_file: HashMap<String, Vec<usize>> = HashMap::new();
258    for (i, def) in definitions.iter().enumerate() {
259        by_file.entry(def.file_path.clone()).or_default().push(i);
260    }
261
262    for indices in by_file.values() {
263        // Sort by line_start.
264        let mut sorted: Vec<usize> = indices.clone();
265        sorted.sort_by_key(|&i| definitions[i].line_start);
266
267        // Pre-compute depths to avoid re-borrowing during mutation.
268        let depths: Vec<usize> = sorted
269            .iter()
270            .map(|&i| descriptor_depth(&definitions[i].scip_symbol))
271            .collect();
272
273        for pos in 0..sorted.len() {
274            let idx = sorted[pos];
275            // Skip if already has a meaningful range (multi-line identifier).
276            if definitions[idx].line_end > definitions[idx].line_start {
277                continue;
278            }
279
280            let my_depth = depths[pos];
281
282            // Find the next definition at the same or shallower depth.
283            // Default to u32::MAX so the last definition in a file "owns" all
284            // remaining lines. This is a known trade-off: references in trailing
285            // comments or whitespace will be attributed to the last function.
286            // Without CST data we can't know where the body actually ends.
287            let mut end_line = u32::MAX;
288            for next_pos in pos + 1..sorted.len() {
289                if depths[next_pos] <= my_depth {
290                    end_line = definitions[sorted[next_pos]].line_start.saturating_sub(1);
291                    break;
292                }
293            }
294
295            definitions[idx].line_end = end_line;
296        }
297    }
298}
299
300/// Count the number of descriptors in a SCIP symbol to determine nesting depth.
301fn descriptor_depth(scip_symbol: &str) -> usize {
302    scip::symbol::parse_symbol(scip_symbol)
303        .map(|p| p.descriptors.len())
304        .unwrap_or(0)
305}
306
/// Decode a SCIP occurrence range into `(start_line, start_col, end_line, end_col)`.
///
/// Two encodings are accepted:
/// - three elements, `[line, startCol, endCol]`: a single-line range, so the
///   returned start and end line are equal;
/// - four elements, `[startLine, startCol, endLine, endCol]`: a multi-line range.
///
/// Returns `None` for any other length, or when any component is negative
/// and therefore does not fit in a `u32`.
fn parse_range(range: &[i32]) -> Option<(u32, u32, u32, u32)> {
    let to_u32 = |value: i32| u32::try_from(value).ok();

    match *range {
        [line, col_start, col_end] => {
            let line = to_u32(line)?;
            Some((line, to_u32(col_start)?, line, to_u32(col_end)?))
        }
        [line_start, col_start, line_end, col_end] => Some((
            to_u32(line_start)?,
            to_u32(col_start)?,
            to_u32(line_end)?,
            to_u32(col_end)?,
        )),
        _ => None,
    }
}
328
329/// Extract a qualified name from a SCIP symbol string using the appropriate language separator.
330///
331/// Strips scheme, package manager, package name, and version — joins descriptors with the
332/// language-appropriate separator (`::` for Rust, `.` for Python/TS/Java/Go).
333pub fn scip_symbol_to_qualified_name(scip_symbol: &str, lang_separator: &str) -> Option<String> {
334    let parsed = scip::symbol::parse_symbol(scip_symbol).ok()?;
335    let parts: Vec<&str> = parsed
336        .descriptors
337        .iter()
338        .map(|d| d.name.as_str())
339        .filter(|s| !s.is_empty())
340        .collect();
341    if parts.is_empty() {
342        return None;
343    }
344    Some(parts.join(lang_separator))
345}
346
/// Pick the qualified-name separator for a language name.
///
/// The comparison is case-insensitive: `"rust"`, `"cpp"` and `"c++"` yield
/// `"::"`; every other input (including the empty string) yields `"."`.
pub fn detect_language_separator(language: &str) -> &'static str {
    const DOUBLE_COLON_LANGUAGES: [&str; 3] = ["rust", "cpp", "c++"];

    let normalized = language.to_lowercase();
    let uses_double_colon = DOUBLE_COLON_LANGUAGES
        .iter()
        .any(|candidate| *candidate == normalized);

    if uses_double_colon {
        "::"
    } else {
        "."
    }
}
356
357/// Map a SCIP `SymbolInformation.Kind` integer value to a codemem `NodeKind`.
358///
359/// Uses the `scip::types::symbol_information::Kind` enum for compile-time safety.
360/// Returns `None` for `UnspecifiedKind` (0) so callers can fall back to descriptor inference.
361fn scip_kind_to_node_kind(kind: i32) -> Option<NodeKind> {
362    use scip::types::symbol_information::Kind;
363    match kind {
364        x if x == Kind::Class as i32 => Some(NodeKind::Class),
365        x if x == Kind::Interface as i32 => Some(NodeKind::Interface),
366        x if x == Kind::Trait as i32 => Some(NodeKind::Trait),
367        x if x == Kind::Enum as i32 => Some(NodeKind::Enum),
368        x if x == Kind::EnumMember as i32 => Some(NodeKind::EnumVariant),
369        x if x == Kind::Field as i32 => Some(NodeKind::Field),
370        x if x == Kind::TypeParameter as i32 => Some(NodeKind::TypeParameter),
371        x if x == Kind::Macro as i32 => Some(NodeKind::Macro),
372        x if x == Kind::Property as i32 => Some(NodeKind::Property),
373        x if x == Kind::Function as i32 || x == Kind::Constructor as i32 => {
374            Some(NodeKind::Function)
375        }
376        x if x == Kind::Method as i32 => Some(NodeKind::Method),
377        x if x == Kind::Namespace as i32 || x == Kind::Module as i32 => Some(NodeKind::Module),
378        x if x == Kind::Package as i32 => Some(NodeKind::Package),
379        x if x == Kind::TypeAlias as i32 || x == Kind::Type as i32 => Some(NodeKind::Type),
380        x if x == Kind::Constant as i32 => Some(NodeKind::Constant),
381        _ => None,
382    }
383}
384
385/// Infer `NodeKind` from the SCIP symbol's descriptor suffixes when
386/// `SymbolInformation.Kind` is `UnspecifiedKind` (e.g., scip-go).
387///
388/// Descriptor suffix conventions (from the SCIP spec):
389/// - `/` → Package/Namespace
390/// - `#` → Type (struct, class, interface)
391/// - `().` → Method
392/// - `.` → Term (function, variable, field)
393/// - `!` → Macro
394pub fn infer_kind_from_symbol(scip_symbol: &str) -> NodeKind {
395    let parsed = match scip::symbol::parse_symbol(scip_symbol) {
396        Ok(p) => p,
397        Err(_) => return NodeKind::Function,
398    };
399    infer_kind_from_parsed(&parsed)
400}
401
402/// Infer node kind from an already-parsed SCIP symbol, avoiding a redundant parse.
403pub fn infer_kind_from_parsed(parsed: &scip::types::Symbol) -> NodeKind {
404    let last = match parsed.descriptors.last() {
405        Some(d) => d,
406        None => return NodeKind::Function,
407    };
408    use scip::types::descriptor::Suffix;
409    match last.suffix.enum_value() {
410        Ok(Suffix::Package | Suffix::Namespace) => NodeKind::Module,
411        Ok(Suffix::Type) => NodeKind::Class,
412        Ok(Suffix::Method) => NodeKind::Method,
413        Ok(Suffix::Macro) => NodeKind::Macro,
414        Ok(Suffix::TypeParameter) => NodeKind::TypeParameter,
415        Ok(Suffix::Parameter) => NodeKind::Field,
416        Ok(Suffix::Term) => {
417            // A Term inside a Type (class/interface/struct) is a field/property,
418            // not a function. Check the parent descriptor.
419            let parent = parsed.descriptors.iter().rev().nth(1);
420            match parent.and_then(|d| d.suffix.enum_value().ok()) {
421                Some(Suffix::Type) => NodeKind::Field,
422                _ => NodeKind::Function,
423            }
424        }
425        _ => NodeKind::Function, // Meta, Local, UnspecifiedSuffix
426    }
427}
428
429/// Resolve a node kind: use the SCIP `Kind` if specified, otherwise infer from the symbol.
430fn resolve_node_kind(kind: i32, scip_symbol: &str) -> NodeKind {
431    scip_kind_to_node_kind(kind).unwrap_or_else(|| infer_kind_from_symbol(scip_symbol))
432}
433
434/// Check if a reference has the import role.
435pub fn is_import_ref(role_bitmask: i32) -> bool {
436    (role_bitmask & ROLE_IMPORT) != 0
437}
438
439/// Check if a reference has read access.
440pub fn is_read_ref(role_bitmask: i32) -> bool {
441    (role_bitmask & ROLE_READ_ACCESS) != 0
442}
443
444/// Check if a reference has write access.
445pub fn is_write_ref(role_bitmask: i32) -> bool {
446    (role_bitmask & ROLE_WRITE_ACCESS) != 0
447}
448
// Unit tests for this module live in a separate file, pulled in through
// `#[path]`, and are compiled only in test builds.
#[cfg(test)]
#[path = "../tests/scip_reader_tests.rs"]
mod tests;