Skip to main content

codemem_engine/index/
resolver.rs

1//! Reference resolution into graph edges.
2//!
3//! Resolves unresolved references (by simple name) to their target symbols
4//! and produces typed edges for the knowledge graph.
5
6use crate::index::symbol::{Reference, ReferenceKind, Symbol};
7use codemem_core::RelationshipType;
8use std::collections::{HashMap, HashSet};
9
10/// A resolved edge connecting two symbols by qualified name.
11#[derive(Debug, Clone)]
12pub struct ResolvedEdge {
13    /// Qualified name of the source symbol.
14    pub source_qualified_name: String,
15    /// Qualified name of the resolved target symbol.
16    pub target_qualified_name: String,
17    /// The relationship type for this edge.
18    pub relationship: RelationshipType,
19    /// File path where the reference occurs.
20    pub file_path: String,
21    /// Line number of the reference.
22    pub line: usize,
23    /// R2: Confidence of the resolution (0.0 = guessed, 1.0 = exact match).
24    pub resolution_confidence: f64,
25}
26
27/// A reference that could not be resolved to a known symbol.
28/// Preserved for deferred cross-repo linking.
29#[derive(Debug, Clone)]
30pub struct UnresolvedRef {
31    /// Qualified name of the source symbol containing this reference.
32    pub source_node: String,
33    /// The unresolved target name.
34    pub target_name: String,
35    /// Package hint extracted from import context (language-specific).
36    pub package_hint: Option<String>,
37    /// Kind of reference: "call", "import", "inherits", "implements", "type_usage".
38    pub ref_kind: String,
39    /// File where the reference occurs.
40    pub file_path: String,
41    /// Line number.
42    pub line: usize,
43}
44
45/// Combined result of reference resolution: resolved edges + unresolved refs.
46#[derive(Debug)]
47pub struct ResolveResult {
48    pub edges: Vec<ResolvedEdge>,
49    pub unresolved: Vec<UnresolvedRef>,
50}
51
52/// Resolves references to target symbols and produces graph edges.
53pub struct ReferenceResolver {
54    /// Map of qualified_name -> Symbol for exact resolution.
55    symbol_index: HashMap<String, Symbol>,
56    /// Map of simple name -> Vec<qualified_name> for ambiguous resolution.
57    name_index: HashMap<String, Vec<String>>,
58    /// R2: Set of imported qualified names per file for scoring.
59    file_imports: HashMap<String, HashSet<String>>,
60}
61
62impl ReferenceResolver {
63    /// Create a new empty resolver.
64    pub fn new() -> Self {
65        Self {
66            symbol_index: HashMap::new(),
67            name_index: HashMap::new(),
68            file_imports: HashMap::new(),
69        }
70    }
71
72    /// Add symbols to the resolver's index.
73    pub fn add_symbols(&mut self, symbols: &[Symbol]) {
74        for sym in symbols {
75            self.symbol_index
76                .insert(sym.qualified_name.clone(), sym.clone());
77
78            self.name_index
79                .entry(sym.name.clone())
80                .or_default()
81                .push(sym.qualified_name.clone());
82        }
83    }
84
85    /// R2: Register import references so the resolver can prefer imported symbols.
86    pub fn add_imports(&mut self, references: &[Reference]) {
87        for r in references {
88            if r.kind == ReferenceKind::Import {
89                self.file_imports
90                    .entry(r.file_path.clone())
91                    .or_default()
92                    .insert(r.target_name.clone());
93            }
94        }
95    }
96
97    /// Resolve a single reference to a target symbol with confidence.
98    ///
99    /// Resolution strategy:
100    /// 1. Exact match on qualified name (confidence 1.0)
101    /// 2. R4: Cross-module path resolution — strip `crate::` prefix, try partial matches
102    /// 3. Simple name match with R2 scoring heuristics (confidence varies)
103    /// 4. Unresolved (returns None)
104    pub fn resolve_with_confidence(&self, reference: &Reference) -> Option<(&Symbol, f64)> {
105        // 1. Exact qualified name match
106        if let Some(sym) = self.symbol_index.get(&reference.target_name) {
107            return Some((sym, 1.0));
108        }
109
110        // R4: Try stripping `crate::` prefix for cross-module resolution
111        if reference.target_name.starts_with("crate::") {
112            let stripped = &reference.target_name["crate::".len()..];
113            if let Some(sym) = self.symbol_index.get(stripped) {
114                return Some((sym, 0.95));
115            }
116            // Try matching against all qualified names ending with this suffix
117            for (qn, sym) in &self.symbol_index {
118                if qn.ends_with(stripped) {
119                    let prefix_len = qn.len() - stripped.len();
120                    if prefix_len == 0 || qn[..prefix_len].ends_with("::") {
121                        return Some((sym, 0.85));
122                    }
123                }
124            }
125        }
126
127        // R4: Try matching `module::function` against `crate::module::function`
128        if reference.target_name.contains("::") {
129            let with_crate = format!("crate::{}", reference.target_name);
130            if let Some(sym) = self.symbol_index.get(&with_crate) {
131                return Some((sym, 0.9));
132            }
133            // Try suffix matching for partial paths
134            for (qn, sym) in &self.symbol_index {
135                if qn.ends_with(&reference.target_name) {
136                    let prefix_len = qn.len() - reference.target_name.len();
137                    if prefix_len == 0 || qn[..prefix_len].ends_with("::") {
138                        return Some((sym, 0.8));
139                    }
140                }
141            }
142        }
143
144        // 2. Simple name match with scoring heuristics
145        let simple_name = reference
146            .target_name
147            .rsplit("::")
148            .next()
149            .unwrap_or(&reference.target_name);
150
151        if let Some(candidates) = self.name_index.get(simple_name) {
152            if candidates.len() == 1 {
153                // Unambiguous
154                let confidence = if simple_name == reference.target_name {
155                    0.9 // Exact simple name match
156                } else {
157                    0.7 // Matched via last segment only
158                };
159                return self
160                    .symbol_index
161                    .get(&candidates[0])
162                    .map(|s| (s, confidence));
163            }
164
165            // R2: Score candidates with heuristics
166            let file_imports = self.file_imports.get(&reference.file_path);
167            let mut best: Option<(&Symbol, f64)> = None;
168
169            for qn in candidates {
170                if let Some(sym) = self.symbol_index.get(qn) {
171                    let mut score: f64 = 0.0;
172
173                    // Prefer symbols imported in the same file
174                    if let Some(imports) = file_imports {
175                        if imports.contains(&sym.qualified_name)
176                            || imports.iter().any(|imp| imp.ends_with(&sym.name))
177                        {
178                            score += 0.4;
179                        }
180                    }
181
182                    // Prefer symbols in the same file
183                    if sym.file_path == reference.file_path {
184                        score += 0.3;
185                    }
186
187                    // Prefer exact name match (not just substring/last-segment)
188                    if sym.name == reference.target_name {
189                        score += 0.2;
190                    }
191
192                    // Prefer symbols in the same package/module (share path prefix)
193                    let ref_module = extract_module_path(&reference.file_path);
194                    let sym_module = extract_module_path(&sym.file_path);
195                    if ref_module == sym_module {
196                        score += 0.1;
197                    }
198
199                    if best.is_none() || score > best.unwrap().1 {
200                        best = Some((sym, score));
201                    }
202                }
203            }
204
205            if let Some((sym, score)) = best {
206                // Normalize score to a confidence value in [0.3, 0.8]
207                let confidence = 0.3 + (score.min(1.0) * 0.5);
208                return Some((sym, confidence));
209            }
210        }
211
212        None
213    }
214
215    /// Map a reference kind to a relationship type and apply kind-specific
216    /// confidence adjustments (e.g., Callback caps at 0.6).
217    fn resolve_edge(&self, r: &Reference) -> Option<ResolvedEdge> {
218        let (target, confidence) = self.resolve_with_confidence(r)?;
219        let relationship = match r.kind {
220            ReferenceKind::Call | ReferenceKind::Callback => RelationshipType::Calls,
221            ReferenceKind::Import => RelationshipType::Imports,
222            ReferenceKind::Inherits => RelationshipType::Inherits,
223            ReferenceKind::Implements => RelationshipType::Implements,
224            ReferenceKind::TypeUsage => RelationshipType::DependsOn,
225        };
226        // Callback references are speculative — cap confidence.
227        let confidence = if r.kind == ReferenceKind::Callback {
228            confidence.min(0.6)
229        } else {
230            confidence
231        };
232        Some(ResolvedEdge {
233            source_qualified_name: r.source_qualified_name.clone(),
234            target_qualified_name: target.qualified_name.clone(),
235            relationship,
236            file_path: r.file_path.clone(),
237            line: r.line,
238            resolution_confidence: confidence,
239        })
240    }
241
242    /// Resolve all references into edges.
243    ///
244    /// Only produces edges for successfully resolved references.
245    pub fn resolve_all(&self, references: &[Reference]) -> Vec<ResolvedEdge> {
246        references
247            .iter()
248            .filter_map(|r| self.resolve_edge(r))
249            .collect()
250    }
251
252    /// Resolve all references, collecting both resolved edges and unresolved refs.
253    ///
254    /// Like `resolve_all` but also preserves unresolved references with
255    /// package hints for deferred cross-repo linking.
256    pub fn resolve_all_with_unresolved(&self, references: &[Reference]) -> ResolveResult {
257        let mut edges = Vec::new();
258        let mut unresolved = Vec::new();
259
260        for r in references {
261            if let Some(edge) = self.resolve_edge(r) {
262                edges.push(edge);
263            } else {
264                let package_hint = extract_package_hint(&r.target_name, r.kind);
265                unresolved.push(UnresolvedRef {
266                    source_node: r.source_qualified_name.clone(),
267                    target_name: r.target_name.clone(),
268                    package_hint,
269                    ref_kind: r.kind.to_string(),
270                    file_path: r.file_path.clone(),
271                    line: r.line,
272                });
273            }
274        }
275
276        ResolveResult { edges, unresolved }
277    }
278}
279
280/// Extract a package hint from an import target name.
281///
282/// Language-specific rules:
283/// - Python: "requests.api.get" -> "requests" (first dot-segment)
284/// - TS/JS: "@acme/shared" -> "@acme/shared" (scoped), "lodash" -> "lodash" (first segment)
285/// - Go: "github.com/acme/utils" -> "github.com/acme/utils" (full module path)
286/// - Java: "com.acme.shared.Validator" -> "com.acme.shared" (drop last segment = class name)
287/// - Rust: "crate::module::item" -> None (local), "serde::Serialize" -> "serde" (first segment)
288///
289/// For Import references, the target_name is typically the full import path.
290/// For Call references, package_hint is usually None (calls are to local symbols).
291pub(crate) fn extract_package_hint(target_name: &str, kind: ReferenceKind) -> Option<String> {
292    // Only extract hints from Import references — calls/inherits are usually local
293    if kind != ReferenceKind::Import {
294        return None;
295    }
296
297    // Skip local/relative imports
298    if target_name.starts_with('.')
299        || target_name.starts_with("crate::")
300        || target_name.starts_with("super::")
301        || target_name.starts_with("self::")
302    {
303        return None;
304    }
305
306    // Scoped npm packages: @scope/package
307    if target_name.starts_with('@') {
308        // @acme/shared-lib/utils -> @acme/shared-lib
309        let parts: Vec<&str> = target_name.splitn(3, '/').collect();
310        if parts.len() >= 2 {
311            return Some(format!("{}/{}", parts[0], parts[1]));
312        }
313        return Some(target_name.to_string());
314    }
315
316    // Go module paths: github.com/user/repo, gopkg.in/yaml.v3, etc.
317    // Detect by presence of '/' and a domain-like first segment with a known
318    // hosting domain. A simple "contains dot" heuristic would misclassify
319    // npm packages like socket.io/client or lodash.get/deep.
320    if target_name.contains('/') {
321        let first_segment = target_name.split('/').next().unwrap_or("");
322        if is_go_module_domain(first_segment) {
323            // Domain-based Go module path — use full path as package hint
324            return Some(target_name.to_string());
325        }
326        // For non-domain slash paths (e.g., "lodash/merge"), extract first segment
327        if !first_segment.is_empty() {
328            return Some(first_segment.to_string());
329        }
330    }
331
332    // Rust crate imports: tokio::sync -> "tokio"
333    if target_name.contains("::") {
334        let first = target_name.split("::").next()?;
335        return Some(first.to_string());
336    }
337
338    // Python/TS/JS dot-separated: requests.api.get -> "requests"
339    if target_name.contains('.') {
340        let first = target_name.split('.').next()?;
341        return Some(first.to_string());
342    }
343
344    // Single word import: "requests", "lodash", "flask"
345    // Filter out Python stdlib modules that would pollute the package registry
346    // with false matches. These will never correspond to an indexed namespace.
347    if is_python_stdlib(target_name) {
348        return None;
349    }
350    Some(target_name.to_string())
351}
352
353/// Check if a string looks like a Go module hosting domain.
354/// Matches common Go module hosts and any domain with a dot + known code TLD.
355fn is_go_module_domain(segment: &str) -> bool {
356    matches!(
357        segment,
358        "github.com"
359            | "gitlab.com"
360            | "bitbucket.org"
361            | "golang.org"
362            | "google.golang.org"
363            | "gopkg.in"
364            | "go.uber.org"
365            | "go.etcd.io"
366            | "k8s.io"
367            | "sigs.k8s.io"
368            | "honnef.co"
369            | "mvdan.cc"
370    ) || (segment.contains('.')
371        && segment.rsplit('.').next().is_some_and(|tld| {
372            matches!(
373                tld,
374                "com" | "org" | "io" | "net" | "dev" | "in" | "cc" | "co"
375            )
376        }))
377}
378
379/// Common Python stdlib modules that should not produce package hints.
380/// These single-word imports would create false registry matches.
381fn is_python_stdlib(name: &str) -> bool {
382    matches!(
383        name,
384        "os" | "sys"
385            | "re"
386            | "io"
387            | "json"
388            | "math"
389            | "time"
390            | "datetime"
391            | "collections"
392            | "itertools"
393            | "functools"
394            | "typing"
395            | "logging"
396            | "pathlib"
397            | "subprocess"
398            | "threading"
399            | "multiprocessing"
400            | "unittest"
401            | "copy"
402            | "abc"
403            | "enum"
404            | "dataclasses"
405            | "contextlib"
406            | "argparse"
407            | "hashlib"
408            | "hmac"
409            | "secrets"
410            | "socket"
411            | "http"
412            | "email"
413            | "html"
414            | "xml"
415            | "csv"
416            | "sqlite3"
417            | "pickle"
418            | "shelve"
419            | "marshal"
420            | "struct"
421            | "codecs"
422            | "string"
423            | "textwrap"
424            | "difflib"
425            | "pprint"
426            | "warnings"
427            | "traceback"
428            | "inspect"
429            | "dis"
430            | "ast"
431            | "token"
432            | "keyword"
433            | "linecache"
434            | "shutil"
435            | "tempfile"
436            | "glob"
437            | "fnmatch"
438            | "stat"
439            | "fileinput"
440            | "configparser"
441            | "signal"
442            | "errno"
443            | "ctypes"
444            | "types"
445            | "weakref"
446            | "array"
447            | "bisect"
448            | "heapq"
449            | "queue"
450            | "random"
451            | "statistics"
452            | "decimal"
453            | "fractions"
454            | "operator"
455            | "uuid"
456            | "base64"
457            | "binascii"
458            | "zlib"
459            | "gzip"
460            | "zipfile"
461            | "tarfile"
462            | "pdb"
463            | "profile"
464            | "cProfile"
465            | "timeit"
466            | "platform"
467            | "sysconfig"
468            | "builtins"
469            | "asyncio"
470            | "concurrent"
471    )
472}
473
474/// Extract a module path from a file path for same-package heuristic.
475/// e.g., "src/index/parser.rs" -> "src/index"
476fn extract_module_path(file_path: &str) -> &str {
477    file_path.rsplit_once('/').map(|(dir, _)| dir).unwrap_or("")
478}
479
480impl Default for ReferenceResolver {
481    fn default() -> Self {
482        Self::new()
483    }
484}
485
486#[cfg(test)]
487#[path = "tests/resolver_tests.rs"]
488mod tests;