Skip to main content

codemem_engine/index/
resolver.rs

1//! Reference resolution into graph edges.
2//!
3//! Resolves unresolved references (by simple name) to their target symbols
4//! and produces typed edges for the knowledge graph.
5
6use crate::index::symbol::{Reference, ReferenceKind, Symbol};
7use codemem_core::RelationshipType;
8use std::collections::{HashMap, HashSet};
9
10/// A resolved edge connecting two symbols by qualified name.
11#[derive(Debug, Clone)]
12pub struct ResolvedEdge {
13    /// Qualified name of the source symbol.
14    pub source_qualified_name: String,
15    /// Qualified name of the resolved target symbol.
16    pub target_qualified_name: String,
17    /// The relationship type for this edge.
18    pub relationship: RelationshipType,
19    /// File path where the reference occurs.
20    pub file_path: String,
21    /// Line number of the reference.
22    pub line: usize,
23    /// R2: Confidence of the resolution (0.0 = guessed, 1.0 = exact match).
24    pub resolution_confidence: f64,
25}
26
27/// A reference that could not be resolved to a known symbol.
28/// Preserved for deferred cross-repo linking.
29#[derive(Debug, Clone)]
30pub struct UnresolvedRef {
31    /// Qualified name of the source symbol containing this reference.
32    pub source_node: String,
33    /// The unresolved target name.
34    pub target_name: String,
35    /// Package hint extracted from import context (language-specific).
36    pub package_hint: Option<String>,
37    /// Kind of reference: "call", "import", "inherits", "implements", "type_usage".
38    pub ref_kind: String,
39    /// File where the reference occurs.
40    pub file_path: String,
41    /// Line number.
42    pub line: usize,
43}
44
45/// Combined result of reference resolution: resolved edges + unresolved refs.
46#[derive(Debug)]
47pub struct ResolveResult {
48    pub edges: Vec<ResolvedEdge>,
49    pub unresolved: Vec<UnresolvedRef>,
50}
51
52/// Resolves references to target symbols and produces graph edges.
53pub struct ReferenceResolver {
54    /// Map of qualified_name -> Symbol for exact resolution.
55    symbol_index: HashMap<String, Symbol>,
56    /// Map of simple name -> Vec<qualified_name> for ambiguous resolution.
57    name_index: HashMap<String, Vec<String>>,
58    /// R2: Set of imported qualified names per file for scoring.
59    file_imports: HashMap<String, HashSet<String>>,
60}
61
62impl ReferenceResolver {
63    /// Create a new empty resolver.
64    pub fn new() -> Self {
65        Self {
66            symbol_index: HashMap::new(),
67            name_index: HashMap::new(),
68            file_imports: HashMap::new(),
69        }
70    }
71
72    /// Add symbols to the resolver's index.
73    pub fn add_symbols(&mut self, symbols: &[Symbol]) {
74        for sym in symbols {
75            self.symbol_index
76                .insert(sym.qualified_name.clone(), sym.clone());
77
78            self.name_index
79                .entry(sym.name.clone())
80                .or_default()
81                .push(sym.qualified_name.clone());
82        }
83    }
84
85    /// R2: Register import references so the resolver can prefer imported symbols.
86    pub fn add_imports(&mut self, references: &[Reference]) {
87        for r in references {
88            if r.kind == ReferenceKind::Import {
89                self.file_imports
90                    .entry(r.file_path.clone())
91                    .or_default()
92                    .insert(r.target_name.clone());
93            }
94        }
95    }
96
97    /// Resolve a single reference to a target symbol with confidence.
98    ///
99    /// Resolution strategy:
100    /// 1. Exact match on qualified name (confidence 1.0)
101    /// 2. R4: Cross-module path resolution — strip `crate::` prefix, try partial matches
102    /// 3. Simple name match with R2 scoring heuristics (confidence varies)
103    /// 4. Unresolved (returns None)
104    pub fn resolve_with_confidence(&self, reference: &Reference) -> Option<(&Symbol, f64)> {
105        // 1. Exact qualified name match
106        if let Some(sym) = self.symbol_index.get(&reference.target_name) {
107            return Some((sym, 1.0));
108        }
109
110        // R4: Try stripping `crate::` prefix for cross-module resolution
111        if reference.target_name.starts_with("crate::") {
112            let stripped = &reference.target_name["crate::".len()..];
113            if let Some(sym) = self.symbol_index.get(stripped) {
114                return Some((sym, 0.95));
115            }
116            // Try matching against all qualified names ending with this suffix
117            for (qn, sym) in &self.symbol_index {
118                if qn.ends_with(stripped) {
119                    let prefix_len = qn.len() - stripped.len();
120                    if prefix_len == 0 || qn[..prefix_len].ends_with("::") {
121                        return Some((sym, 0.85));
122                    }
123                }
124            }
125        }
126
127        // R4: Try matching `module::function` against `crate::module::function`
128        if reference.target_name.contains("::") {
129            let with_crate = format!("crate::{}", reference.target_name);
130            if let Some(sym) = self.symbol_index.get(&with_crate) {
131                return Some((sym, 0.9));
132            }
133            // Try suffix matching for partial paths
134            for (qn, sym) in &self.symbol_index {
135                if qn.ends_with(&reference.target_name) {
136                    let prefix_len = qn.len() - reference.target_name.len();
137                    if prefix_len == 0 || qn[..prefix_len].ends_with("::") {
138                        return Some((sym, 0.8));
139                    }
140                }
141            }
142        }
143
144        // 2. Simple name match with scoring heuristics
145        let simple_name = reference
146            .target_name
147            .rsplit("::")
148            .next()
149            .unwrap_or(&reference.target_name);
150
151        if let Some(candidates) = self.name_index.get(simple_name) {
152            if candidates.len() == 1 {
153                // Unambiguous
154                let confidence = if simple_name == reference.target_name {
155                    0.9 // Exact simple name match
156                } else {
157                    0.7 // Matched via last segment only
158                };
159                return self
160                    .symbol_index
161                    .get(&candidates[0])
162                    .map(|s| (s, confidence));
163            }
164
165            // R2: Score candidates with heuristics
166            let file_imports = self.file_imports.get(&reference.file_path);
167            let mut best: Option<(&Symbol, f64)> = None;
168
169            for qn in candidates {
170                if let Some(sym) = self.symbol_index.get(qn) {
171                    let mut score: f64 = 0.0;
172
173                    // Prefer symbols imported in the same file
174                    if let Some(imports) = file_imports {
175                        if imports.contains(&sym.qualified_name)
176                            || imports.iter().any(|imp| imp.ends_with(&sym.name))
177                        {
178                            score += 0.4;
179                        }
180                    }
181
182                    // Prefer symbols in the same file
183                    if sym.file_path == reference.file_path {
184                        score += 0.3;
185                    }
186
187                    // Prefer exact name match (not just substring/last-segment)
188                    if sym.name == reference.target_name {
189                        score += 0.2;
190                    }
191
192                    // Prefer symbols in the same package/module (share path prefix)
193                    let ref_module = extract_module_path(&reference.file_path);
194                    let sym_module = extract_module_path(&sym.file_path);
195                    if ref_module == sym_module {
196                        score += 0.1;
197                    }
198
199                    if best.is_none() || score > best.unwrap().1 {
200                        best = Some((sym, score));
201                    }
202                }
203            }
204
205            if let Some((sym, score)) = best {
206                // Normalize score to a confidence value in [0.3, 0.8]
207                let confidence = 0.3 + (score.min(1.0) * 0.5);
208                return Some((sym, confidence));
209            }
210        }
211
212        None
213    }
214
215    /// Resolve all references into edges.
216    ///
217    /// Only produces edges for successfully resolved references.
218    pub fn resolve_all(&self, references: &[Reference]) -> Vec<ResolvedEdge> {
219        references
220            .iter()
221            .filter_map(|r| {
222                let (target, confidence) = self.resolve_with_confidence(r)?;
223                let relationship = match r.kind {
224                    ReferenceKind::Call => RelationshipType::Calls,
225                    ReferenceKind::Import => RelationshipType::Imports,
226                    ReferenceKind::Inherits => RelationshipType::Inherits,
227                    ReferenceKind::Implements => RelationshipType::Implements,
228                    ReferenceKind::TypeUsage => RelationshipType::DependsOn,
229                };
230
231                Some(ResolvedEdge {
232                    source_qualified_name: r.source_qualified_name.clone(),
233                    target_qualified_name: target.qualified_name.clone(),
234                    relationship,
235                    file_path: r.file_path.clone(),
236                    line: r.line,
237                    resolution_confidence: confidence,
238                })
239            })
240            .collect()
241    }
242
243    /// Resolve all references, collecting both resolved edges and unresolved refs.
244    ///
245    /// Like `resolve_all` but also preserves unresolved references with
246    /// package hints for deferred cross-repo linking.
247    pub fn resolve_all_with_unresolved(&self, references: &[Reference]) -> ResolveResult {
248        let mut edges = Vec::new();
249        let mut unresolved = Vec::new();
250
251        for r in references {
252            match self.resolve_with_confidence(r) {
253                Some((target, confidence)) => {
254                    let relationship = match r.kind {
255                        ReferenceKind::Call => RelationshipType::Calls,
256                        ReferenceKind::Import => RelationshipType::Imports,
257                        ReferenceKind::Inherits => RelationshipType::Inherits,
258                        ReferenceKind::Implements => RelationshipType::Implements,
259                        ReferenceKind::TypeUsage => RelationshipType::DependsOn,
260                    };
261                    edges.push(ResolvedEdge {
262                        source_qualified_name: r.source_qualified_name.clone(),
263                        target_qualified_name: target.qualified_name.clone(),
264                        relationship,
265                        file_path: r.file_path.clone(),
266                        line: r.line,
267                        resolution_confidence: confidence,
268                    });
269                }
270                None => {
271                    let package_hint = extract_package_hint(&r.target_name, r.kind);
272                    unresolved.push(UnresolvedRef {
273                        source_node: r.source_qualified_name.clone(),
274                        target_name: r.target_name.clone(),
275                        package_hint,
276                        ref_kind: r.kind.to_string(),
277                        file_path: r.file_path.clone(),
278                        line: r.line,
279                    });
280                }
281            }
282        }
283
284        ResolveResult { edges, unresolved }
285    }
286}
287
288/// Extract a package hint from an import target name.
289///
290/// Language-specific rules:
291/// - Python: "requests.api.get" -> "requests" (first dot-segment)
292/// - TS/JS: "@acme/shared" -> "@acme/shared" (scoped), "lodash" -> "lodash" (first segment)
293/// - Go: "github.com/acme/utils" -> "github.com/acme/utils" (full module path)
294/// - Java: "com.acme.shared.Validator" -> "com.acme.shared" (drop last segment = class name)
295/// - Rust: "crate::module::item" -> None (local), "serde::Serialize" -> "serde" (first segment)
296///
297/// For Import references, the target_name is typically the full import path.
298/// For Call references, package_hint is usually None (calls are to local symbols).
299pub(crate) fn extract_package_hint(target_name: &str, kind: ReferenceKind) -> Option<String> {
300    // Only extract hints from Import references — calls/inherits are usually local
301    if kind != ReferenceKind::Import {
302        return None;
303    }
304
305    // Skip local/relative imports
306    if target_name.starts_with('.')
307        || target_name.starts_with("crate::")
308        || target_name.starts_with("super::")
309        || target_name.starts_with("self::")
310    {
311        return None;
312    }
313
314    // Scoped npm packages: @scope/package
315    if target_name.starts_with('@') {
316        // @acme/shared-lib/utils -> @acme/shared-lib
317        let parts: Vec<&str> = target_name.splitn(3, '/').collect();
318        if parts.len() >= 2 {
319            return Some(format!("{}/{}", parts[0], parts[1]));
320        }
321        return Some(target_name.to_string());
322    }
323
324    // Go module paths: github.com/user/repo, gopkg.in/yaml.v3, etc.
325    // Detect by presence of '/' and a domain-like first segment with a known
326    // hosting domain. A simple "contains dot" heuristic would misclassify
327    // npm packages like socket.io/client or lodash.get/deep.
328    if target_name.contains('/') {
329        let first_segment = target_name.split('/').next().unwrap_or("");
330        if is_go_module_domain(first_segment) {
331            // Domain-based Go module path — use full path as package hint
332            return Some(target_name.to_string());
333        }
334        // For non-domain slash paths (e.g., "lodash/merge"), extract first segment
335        if !first_segment.is_empty() {
336            return Some(first_segment.to_string());
337        }
338    }
339
340    // Rust crate imports: tokio::sync -> "tokio"
341    if target_name.contains("::") {
342        let first = target_name.split("::").next()?;
343        return Some(first.to_string());
344    }
345
346    // Python/TS/JS dot-separated: requests.api.get -> "requests"
347    if target_name.contains('.') {
348        let first = target_name.split('.').next()?;
349        return Some(first.to_string());
350    }
351
352    // Single word import: "requests", "lodash", "flask"
353    // Filter out Python stdlib modules that would pollute the package registry
354    // with false matches. These will never correspond to an indexed namespace.
355    if is_python_stdlib(target_name) {
356        return None;
357    }
358    Some(target_name.to_string())
359}
360
361/// Check if a string looks like a Go module hosting domain.
362/// Matches common Go module hosts and any domain with a dot + known code TLD.
363fn is_go_module_domain(segment: &str) -> bool {
364    matches!(
365        segment,
366        "github.com"
367            | "gitlab.com"
368            | "bitbucket.org"
369            | "golang.org"
370            | "google.golang.org"
371            | "gopkg.in"
372            | "go.uber.org"
373            | "go.etcd.io"
374            | "k8s.io"
375            | "sigs.k8s.io"
376            | "honnef.co"
377            | "mvdan.cc"
378    ) || (segment.contains('.')
379        && segment.rsplit('.').next().is_some_and(|tld| {
380            matches!(
381                tld,
382                "com" | "org" | "io" | "net" | "dev" | "in" | "cc" | "co"
383            )
384        }))
385}
386
387/// Common Python stdlib modules that should not produce package hints.
388/// These single-word imports would create false registry matches.
389fn is_python_stdlib(name: &str) -> bool {
390    matches!(
391        name,
392        "os" | "sys"
393            | "re"
394            | "io"
395            | "json"
396            | "math"
397            | "time"
398            | "datetime"
399            | "collections"
400            | "itertools"
401            | "functools"
402            | "typing"
403            | "logging"
404            | "pathlib"
405            | "subprocess"
406            | "threading"
407            | "multiprocessing"
408            | "unittest"
409            | "copy"
410            | "abc"
411            | "enum"
412            | "dataclasses"
413            | "contextlib"
414            | "argparse"
415            | "hashlib"
416            | "hmac"
417            | "secrets"
418            | "socket"
419            | "http"
420            | "email"
421            | "html"
422            | "xml"
423            | "csv"
424            | "sqlite3"
425            | "pickle"
426            | "shelve"
427            | "marshal"
428            | "struct"
429            | "codecs"
430            | "string"
431            | "textwrap"
432            | "difflib"
433            | "pprint"
434            | "warnings"
435            | "traceback"
436            | "inspect"
437            | "dis"
438            | "ast"
439            | "token"
440            | "keyword"
441            | "linecache"
442            | "shutil"
443            | "tempfile"
444            | "glob"
445            | "fnmatch"
446            | "stat"
447            | "fileinput"
448            | "configparser"
449            | "signal"
450            | "errno"
451            | "ctypes"
452            | "types"
453            | "weakref"
454            | "array"
455            | "bisect"
456            | "heapq"
457            | "queue"
458            | "random"
459            | "statistics"
460            | "decimal"
461            | "fractions"
462            | "operator"
463            | "uuid"
464            | "base64"
465            | "binascii"
466            | "zlib"
467            | "gzip"
468            | "zipfile"
469            | "tarfile"
470            | "pdb"
471            | "profile"
472            | "cProfile"
473            | "timeit"
474            | "platform"
475            | "sysconfig"
476            | "builtins"
477            | "asyncio"
478            | "concurrent"
479    )
480}
481
482/// Extract a module path from a file path for same-package heuristic.
483/// e.g., "src/index/parser.rs" -> "src/index"
484fn extract_module_path(file_path: &str) -> &str {
485    file_path.rsplit_once('/').map(|(dir, _)| dir).unwrap_or("")
486}
487
488impl Default for ReferenceResolver {
489    fn default() -> Self {
490        Self::new()
491    }
492}
493
494#[cfg(test)]
495#[path = "tests/resolver_tests.rs"]
496mod tests;