sem_core/parser/
graph.rs

1//! Entity dependency graph — cross-file reference extraction.
2//!
3//! Implements a two-pass approach inspired by arXiv:2601.08773 (Reliable Graph-RAG):
4//! Pass 1: Extract all entities, build a symbol table (name → entity ID).
5//! Pass 2: For each entity, extract identifier references from its AST subtree,
6//!         resolve them against the symbol table to create edges.
7//!
8//! This enables impact analysis: "if I change entity X, what else is affected?"
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12use std::sync::LazyLock;
13
14use rayon::prelude::*;
15use serde::{Deserialize, Serialize};
16
17use crate::git::types::{FileChange, FileStatus};
18use crate::model::entity::SemanticEntity;
19use crate::parser::registry::ParserRegistry;
20
21/// A reference from one entity to another.
22#[derive(Debug, Clone, Serialize, Deserialize)]
23#[serde(rename_all = "camelCase")]
24pub struct EntityRef {
25    pub from_entity: String,
26    pub to_entity: String,
27    pub ref_type: RefType,
28}
29
30/// Type of reference between entities.
31#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
32#[serde(rename_all = "lowercase")]
33pub enum RefType {
34    /// Function/method call
35    Calls,
36    /// Type reference (extends, implements, field type)
37    TypeRef,
38    /// Import/use statement reference
39    Imports,
40}
41
42/// A complete entity dependency graph for a set of files.
43#[derive(Debug)]
44pub struct EntityGraph {
45    /// All entities indexed by ID
46    pub entities: HashMap<String, EntityInfo>,
47    /// Edges: from_entity → [(to_entity, ref_type)]
48    pub edges: Vec<EntityRef>,
49    /// Reverse index: entity_id → entities that reference it
50    pub dependents: HashMap<String, Vec<String>>,
51    /// Forward index: entity_id → entities it references
52    pub dependencies: HashMap<String, Vec<String>>,
53}
54
55/// Minimal entity info stored in the graph.
56#[derive(Debug, Clone, Serialize, Deserialize)]
57#[serde(rename_all = "camelCase")]
58pub struct EntityInfo {
59    pub id: String,
60    pub name: String,
61    pub entity_type: String,
62    pub file_path: String,
63    #[serde(skip_serializing_if = "Option::is_none")]
64    pub parent_id: Option<String>,
65    pub start_line: usize,
66    pub end_line: usize,
67}
68
69impl EntityGraph {
70    /// Reconstruct an EntityGraph from pre-loaded parts (e.g. from a cache).
71    pub fn from_parts(entities: HashMap<String, EntityInfo>, edges: Vec<EntityRef>) -> Self {
72        let mut dependents: HashMap<String, Vec<String>> = HashMap::new();
73        let mut dependencies: HashMap<String, Vec<String>> = HashMap::new();
74        for edge in &edges {
75            dependents
76                .entry(edge.to_entity.clone())
77                .or_default()
78                .push(edge.from_entity.clone());
79            dependencies
80                .entry(edge.from_entity.clone())
81                .or_default()
82                .push(edge.to_entity.clone());
83        }
84        EntityGraph {
85            entities,
86            edges,
87            dependents,
88            dependencies,
89        }
90    }
91
92    /// Build an entity graph from a set of files.
93    ///
94    /// Pass 1: Extract all entities from all files using the parser registry.
95    /// Pass 2: For each entity, find identifier tokens and resolve them against
96    ///         the symbol table to create reference edges.
97    pub fn build(
98        root: &Path,
99        file_paths: &[String],
100        registry: &ParserRegistry,
101    ) -> Self {
102        // Pass 1: Extract all entities in parallel (file I/O + tree-sitter parsing)
103        let all_entities: Vec<SemanticEntity> = file_paths
104            .par_iter()
105            .filter_map(|file_path| {
106                let full_path = root.join(file_path);
107                let content = std::fs::read_to_string(&full_path).ok()?;
108                let plugin = registry.get_plugin(file_path)?;
109                Some(plugin.extract_entities(&content, file_path))
110            })
111            .flatten()
112            .collect();
113
114        // Build symbol table: name → entity IDs (can be multiple with same name)
115        let mut symbol_table: HashMap<String, Vec<String>> = HashMap::with_capacity(all_entities.len());
116        let mut entity_map: HashMap<String, EntityInfo> = HashMap::with_capacity(all_entities.len());
117
118        for entity in &all_entities {
119            symbol_table
120                .entry(entity.name.clone())
121                .or_default()
122                .push(entity.id.clone());
123
124            entity_map.insert(
125                entity.id.clone(),
126                EntityInfo {
127                    id: entity.id.clone(),
128                    name: entity.name.clone(),
129                    entity_type: entity.entity_type.clone(),
130                    file_path: entity.file_path.clone(),
131                    parent_id: entity.parent_id.clone(),
132                    start_line: entity.start_line,
133                    end_line: entity.end_line,
134                },
135            );
136        }
137
138        // Build parent-child set for skipping class→method self-edges
139        let parent_child_pairs: HashSet<(&str, &str)> = all_entities
140            .iter()
141            .filter_map(|e| {
142                e.parent_id.as_ref().map(|pid| (pid.as_str(), e.id.as_str()))
143            })
144            .collect();
145
146        // Build set of (class_id, child_method_name) so classes skip refs to their own methods
147        let class_child_names: HashSet<(&str, &str)> = all_entities
148            .iter()
149            .filter_map(|e| {
150                e.parent_id.as_ref().map(|pid| (pid.as_str(), e.name.as_str()))
151            })
152            .collect();
153
154        // Build import table: (file_path, imported_name) → target entity ID
155        // e.g. ("io_handler.py", "validate") → "core.py::function::validate"
156        let import_table = build_import_table(root, file_paths, &symbol_table, &entity_map);
157
158        // Pass 2: Extract references in parallel, then resolve against symbol table
159        // Step 2a: Parallel reference extraction per entity
160        let resolved_refs: Vec<(String, String, RefType)> = all_entities
161            .par_iter()
162            .flat_map(|entity| {
163                let refs = extract_references_from_content(&entity.content, &entity.name);
164                let mut entity_edges = Vec::new();
165                for ref_name in refs {
166                    // Skip references to names that are this class's own methods
167                    if class_child_names.contains(&(entity.id.as_str(), ref_name)) {
168                        continue;
169                    }
170
171                    // Check import table first: if this file imports this name,
172                    // resolve to the import target instead of global symbol table
173                    let import_key = (entity.file_path.clone(), ref_name.to_string());
174                    if let Some(import_target_id) = import_table.get(&import_key) {
175                        if import_target_id != &entity.id
176                            && !parent_child_pairs.contains(&(entity.id.as_str(), import_target_id.as_str()))
177                            && !parent_child_pairs.contains(&(import_target_id.as_str(), entity.id.as_str()))
178                        {
179                            let ref_type = infer_ref_type(&entity.content, &ref_name);
180                            entity_edges.push((
181                                entity.id.clone(),
182                                import_target_id.clone(),
183                                ref_type,
184                            ));
185                        }
186                        continue;
187                    }
188
189                    if let Some(target_ids) = symbol_table.get(ref_name) {
190                        // Without an import, only resolve to entities in the same file.
191                        // Cross-file resolution is handled by the import table above.
192                        let target = target_ids
193                            .iter()
194                            .find(|id| {
195                                *id != &entity.id
196                                    && entity_map
197                                        .get(*id)
198                                        .map_or(false, |e| e.file_path == entity.file_path)
199                            });
200
201                        if let Some(target_id) = target {
202                            // Skip parent-child edges (class → own method)
203                            if parent_child_pairs.contains(&(entity.id.as_str(), target_id.as_str()))
204                                || parent_child_pairs.contains(&(target_id.as_str(), entity.id.as_str()))
205                            {
206                                continue;
207                            }
208                            let ref_type = infer_ref_type(&entity.content, &ref_name);
209                            entity_edges.push((
210                                entity.id.clone(),
211                                target_id.clone(),
212                                ref_type,
213                            ));
214                        }
215                    }
216                }
217                entity_edges
218            })
219            .collect();
220
221        // Step 2b: Build edge indexes from resolved references
222        let mut edges: Vec<EntityRef> = Vec::with_capacity(resolved_refs.len());
223        let mut dependents: HashMap<String, Vec<String>> = HashMap::new();
224        let mut dependencies: HashMap<String, Vec<String>> = HashMap::new();
225
226        for (from_entity, to_entity, ref_type) in resolved_refs {
227            dependents
228                .entry(to_entity.clone())
229                .or_default()
230                .push(from_entity.clone());
231            dependencies
232                .entry(from_entity.clone())
233                .or_default()
234                .push(to_entity.clone());
235            edges.push(EntityRef {
236                from_entity,
237                to_entity,
238                ref_type,
239            });
240        }
241
242        EntityGraph {
243            entities: entity_map,
244            edges,
245            dependents,
246            dependencies,
247        }
248    }
249
250    /// Get entities that depend on the given entity (reverse deps).
251    pub fn get_dependents(&self, entity_id: &str) -> Vec<&EntityInfo> {
252        self.dependents
253            .get(entity_id)
254            .map(|ids| {
255                ids.iter()
256                    .filter_map(|id| self.entities.get(id))
257                    .collect()
258            })
259            .unwrap_or_default()
260    }
261
262    /// Get entities that the given entity depends on (forward deps).
263    pub fn get_dependencies(&self, entity_id: &str) -> Vec<&EntityInfo> {
264        self.dependencies
265            .get(entity_id)
266            .map(|ids| {
267                ids.iter()
268                    .filter_map(|id| self.entities.get(id))
269                    .collect()
270            })
271            .unwrap_or_default()
272    }
273
274    /// Impact analysis: if the given entity changes, what else might be affected?
275    /// Returns all transitive dependents (breadth-first), capped at 10k.
276    pub fn impact_analysis(&self, entity_id: &str) -> Vec<&EntityInfo> {
277        self.impact_analysis_capped(entity_id, 10_000)
278    }
279
280    /// Impact analysis with a cap on maximum nodes visited.
281    /// Returns transitive dependents up to the cap. Uses borrowed strings.
282    pub fn impact_analysis_capped(&self, entity_id: &str, max_visited: usize) -> Vec<&EntityInfo> {
283        let mut visited: HashSet<&str> = HashSet::new();
284        let mut queue: std::collections::VecDeque<&str> = std::collections::VecDeque::new();
285        let mut result = Vec::new();
286
287        let start_key = match self.entities.get_key_value(entity_id) {
288            Some((k, _)) => k.as_str(),
289            None => return result,
290        };
291
292        queue.push_back(start_key);
293        visited.insert(start_key);
294
295        while let Some(current) = queue.pop_front() {
296            if result.len() >= max_visited {
297                break;
298            }
299            if let Some(deps) = self.dependents.get(current) {
300                for dep in deps {
301                    if visited.insert(dep.as_str()) {
302                        if let Some(info) = self.entities.get(dep.as_str()) {
303                            result.push(info);
304                        }
305                        queue.push_back(dep.as_str());
306                        if result.len() >= max_visited {
307                            break;
308                        }
309                    }
310                }
311            }
312        }
313
314        result
315    }
316
317    /// Count transitive dependents without collecting them (faster for large graphs).
318    /// Uses borrowed strings to avoid allocation overhead.
319    pub fn impact_count(&self, entity_id: &str, max_count: usize) -> usize {
320        let mut visited: HashSet<&str> = HashSet::new();
321        let mut queue: std::collections::VecDeque<&str> = std::collections::VecDeque::new();
322        let mut count = 0;
323
324        // We need entity_id to live long enough; look it up in our entities map
325        let start_key = match self.entities.get_key_value(entity_id) {
326            Some((k, _)) => k.as_str(),
327            None => return 0,
328        };
329
330        queue.push_back(start_key);
331        visited.insert(start_key);
332
333        while let Some(current) = queue.pop_front() {
334            if count >= max_count {
335                break;
336            }
337            if let Some(deps) = self.dependents.get(current) {
338                for dep in deps {
339                    if visited.insert(dep.as_str()) {
340                        count += 1;
341                        queue.push_back(dep.as_str());
342                        if count >= max_count {
343                            break;
344                        }
345                    }
346                }
347            }
348        }
349
350        count
351    }
352
353    /// Filter entities to those that look like tests.
354    /// Uses name heuristics, file path patterns, and content patterns.
355    pub fn filter_test_entities(&self, entities: &[crate::model::entity::SemanticEntity]) -> HashSet<String> {
356        let mut test_ids = HashSet::new();
357        for entity in entities {
358            if is_test_entity(entity) {
359                test_ids.insert(entity.id.clone());
360            }
361        }
362        test_ids
363    }
364
365    /// Impact analysis filtered to test entities only.
366    /// Returns transitive dependents that are test functions/methods.
367    pub fn test_impact(
368        &self,
369        entity_id: &str,
370        all_entities: &[crate::model::entity::SemanticEntity],
371    ) -> Vec<&EntityInfo> {
372        let test_ids = self.filter_test_entities(all_entities);
373        let impact = self.impact_analysis(entity_id);
374        impact
375            .into_iter()
376            .filter(|info| test_ids.contains(&info.id))
377            .collect()
378    }
379
380    /// Incrementally update the graph from a set of changed files.
381    ///
382    /// Instead of rebuilding the entire graph, this only re-extracts entities
383    /// from changed files and re-resolves their references. This is faster
384    /// than a full rebuild when only a few files changed.
385    ///
386    /// For each changed file:
387    /// - Deleted: remove all entities from that file, prune edges
388    /// - Added/Modified: remove old entities, extract new ones, rebuild references
389    /// - Renamed: update file paths in entity info
390    pub fn update_from_changes(
391        &mut self,
392        changed_files: &[FileChange],
393        root: &Path,
394        registry: &ParserRegistry,
395    ) {
396        let mut affected_files: HashSet<String> = HashSet::new();
397        let mut new_entities: Vec<SemanticEntity> = Vec::new();
398
399        for change in changed_files {
400            affected_files.insert(change.file_path.clone());
401            if let Some(ref old_path) = change.old_file_path {
402                affected_files.insert(old_path.clone());
403            }
404
405            match change.status {
406                FileStatus::Deleted => {
407                    self.remove_entities_for_file(&change.file_path);
408                }
409                FileStatus::Renamed => {
410                    // Update file paths for renamed files
411                    if let Some(ref old_path) = change.old_file_path {
412                        self.remove_entities_for_file(old_path);
413                    }
414                    // Extract entities from the new file
415                    if let Some(entities) = self.extract_file_entities(
416                        &change.file_path,
417                        change.after_content.as_deref(),
418                        root,
419                        registry,
420                    ) {
421                        new_entities.extend(entities);
422                    }
423                }
424                FileStatus::Added | FileStatus::Modified => {
425                    // Remove old entities for this file
426                    self.remove_entities_for_file(&change.file_path);
427                    // Extract new entities
428                    if let Some(entities) = self.extract_file_entities(
429                        &change.file_path,
430                        change.after_content.as_deref(),
431                        root,
432                        registry,
433                    ) {
434                        new_entities.extend(entities);
435                    }
436                }
437            }
438        }
439
440        // Add new entities to the entity map
441        for entity in &new_entities {
442            self.entities.insert(
443                entity.id.clone(),
444                EntityInfo {
445                    id: entity.id.clone(),
446                    name: entity.name.clone(),
447                    entity_type: entity.entity_type.clone(),
448                    file_path: entity.file_path.clone(),
449                    parent_id: entity.parent_id.clone(),
450                    start_line: entity.start_line,
451                    end_line: entity.end_line,
452                },
453            );
454        }
455
456        // Rebuild the global symbol table from all current entities
457        let symbol_table = self.build_symbol_table();
458
459        // Re-resolve references for new entities
460        for entity in &new_entities {
461            self.resolve_entity_references(entity, &symbol_table);
462        }
463
464        // Also re-resolve references for entities in OTHER files that might
465        // reference entities in changed files (their targets may have changed)
466        let changed_entity_names: HashSet<String> = new_entities
467            .iter()
468            .map(|e| e.name.clone())
469            .collect();
470
471        // Find entities in unchanged files that reference any changed entity name
472        let entities_to_recheck: Vec<String> = self
473            .entities
474            .values()
475            .filter(|e| !affected_files.contains(&e.file_path))
476            .filter(|e| {
477                self.dependencies
478                    .get(&e.id)
479                    .map_or(false, |deps| {
480                        deps.iter().any(|dep_id| {
481                            self.entities
482                                .get(dep_id)
483                                .map_or(false, |dep| changed_entity_names.contains(&dep.name))
484                        })
485                    })
486            })
487            .map(|e| e.id.clone())
488            .collect();
489
490        // We don't have the full SemanticEntity for unchanged files, so we skip
491        // deep re-resolution here. The forward/reverse indexes are already updated
492        // by remove_entities_for_file and resolve_entity_references.
493        // For entities that had dangling references (their target was deleted),
494        // the edges were already pruned.
495        let _ = entities_to_recheck; // acknowledge but don't act on for now
496    }
497
498    /// Extract entities from a file, using provided content or reading from disk.
499    fn extract_file_entities(
500        &self,
501        file_path: &str,
502        content: Option<&str>,
503        root: &Path,
504        registry: &ParserRegistry,
505    ) -> Option<Vec<SemanticEntity>> {
506        let plugin = registry.get_plugin(file_path)?;
507
508        let content = if let Some(c) = content {
509            c.to_string()
510        } else {
511            let full_path = root.join(file_path);
512            std::fs::read_to_string(&full_path).ok()?
513        };
514
515        Some(plugin.extract_entities(&content, file_path))
516    }
517
518    /// Remove all entities belonging to a specific file and prune their edges.
519    fn remove_entities_for_file(&mut self, file_path: &str) {
520        // Collect entity IDs to remove
521        let ids_to_remove: Vec<String> = self
522            .entities
523            .values()
524            .filter(|e| e.file_path == file_path)
525            .map(|e| e.id.clone())
526            .collect();
527
528        let id_set: HashSet<&str> = ids_to_remove.iter().map(|s| s.as_str()).collect();
529
530        // Remove from entity map
531        for id in &ids_to_remove {
532            self.entities.remove(id);
533        }
534
535        // Remove edges involving these entities
536        self.edges
537            .retain(|e| !id_set.contains(e.from_entity.as_str()) && !id_set.contains(e.to_entity.as_str()));
538
539        // Clean up dependency/dependent indexes
540        for id in &ids_to_remove {
541            // Remove forward deps
542            if let Some(deps) = self.dependencies.remove(id) {
543                // Also remove from reverse index
544                for dep in &deps {
545                    if let Some(dependents) = self.dependents.get_mut(dep) {
546                        dependents.retain(|d| d != id);
547                    }
548                }
549            }
550            // Remove reverse deps
551            if let Some(deps) = self.dependents.remove(id) {
552                // Also remove from forward index
553                for dep in &deps {
554                    if let Some(dependencies) = self.dependencies.get_mut(dep) {
555                        dependencies.retain(|d| d != id);
556                    }
557                }
558            }
559        }
560    }
561
562    /// Build a symbol table from all current entities.
563    fn build_symbol_table(&self) -> HashMap<String, Vec<String>> {
564        let mut symbol_table: HashMap<String, Vec<String>> = HashMap::new();
565        for entity in self.entities.values() {
566            symbol_table
567                .entry(entity.name.clone())
568                .or_default()
569                .push(entity.id.clone());
570        }
571        symbol_table
572    }
573
574    /// Resolve references for a single entity against the symbol table.
575    fn resolve_entity_references(
576        &mut self,
577        entity: &SemanticEntity,
578        symbol_table: &HashMap<String, Vec<String>>,
579    ) {
580        let refs = extract_references_from_content(&entity.content, &entity.name);
581
582        for ref_name in refs {
583            if let Some(target_ids) = symbol_table.get(ref_name) {
584                let target = target_ids
585                    .iter()
586                    .find(|id| {
587                        *id != &entity.id
588                            && self
589                                .entities
590                                .get(*id)
591                                .map_or(false, |e| e.file_path == entity.file_path)
592                    })
593                    .or_else(|| target_ids.iter().find(|id| *id != &entity.id));
594
595                if let Some(target_id) = target {
596                    let ref_type = infer_ref_type(&entity.content, &ref_name);
597                    self.edges.push(EntityRef {
598                        from_entity: entity.id.clone(),
599                        to_entity: target_id.clone(),
600                        ref_type,
601                    });
602                    self.dependents
603                        .entry(target_id.clone())
604                        .or_default()
605                        .push(entity.id.clone());
606                    self.dependencies
607                        .entry(entity.id.clone())
608                        .or_default()
609                        .push(target_id.clone());
610                }
611            }
612        }
613    }
614}
615
616/// Check if an entity looks like a test based on name, file path, and content patterns.
617fn is_test_entity(entity: &crate::model::entity::SemanticEntity) -> bool {
618    let name = &entity.name;
619    let path = &entity.file_path;
620    let content = &entity.content;
621
622    // Name patterns
623    if name.starts_with("test_") || name.starts_with("Test") || name.ends_with("_test") || name.ends_with("Test") {
624        return true;
625    }
626    if name.starts_with("it_") || name.starts_with("describe_") || name.starts_with("spec_") {
627        return true;
628    }
629
630    // File path patterns
631    let path_lower = path.to_lowercase();
632    let in_test_file = path_lower.contains("/test/")
633        || path_lower.contains("/tests/")
634        || path_lower.contains("/spec/")
635        || path_lower.contains("_test.")
636        || path_lower.contains(".test.")
637        || path_lower.contains("_spec.")
638        || path_lower.contains(".spec.");
639
640    // Content patterns (test annotations/decorators)
641    let has_test_marker = content.contains("#[test]")
642        || content.contains("#[cfg(test)]")
643        || content.contains("@Test")
644        || content.contains("@pytest")
645        || content.contains("@test")
646        || content.contains("describe(")
647        || content.contains("it(")
648        || content.contains("test(");
649
650    in_test_file && has_test_marker
651}
652
653/// Build import table: maps (file_path, imported_name) → target entity ID.
654///
655/// Parses `from X import Y` / `import X` / `use X` style statements from entity content
656/// and resolves Y to the entity it refers to in the symbol table.
657fn build_import_table(
658    root: &Path,
659    file_paths: &[String],
660    symbol_table: &HashMap<String, Vec<String>>,
661    entity_map: &HashMap<String, EntityInfo>,
662) -> HashMap<(String, String), String> {
663    let mut import_table: HashMap<(String, String), String> = HashMap::new();
664
665    for file_path in file_paths {
666        let full_path = root.join(file_path);
667        let content = match std::fs::read_to_string(&full_path) {
668            Ok(c) => c,
669            Err(_) => continue,
670        };
671
672        // Join multi-line imports into single logical lines
673        // e.g. "from .cookies import (\n    foo,\n    bar,\n)" -> "from .cookies import foo, bar"
674        let mut logical_lines: Vec<String> = Vec::new();
675        let mut current_line = String::new();
676        let mut in_parens = false;
677
678        for line in content.lines() {
679            let trimmed = line.trim();
680            if in_parens {
681                // Strip parentheses and comments
682                let clean = trimmed.trim_end_matches(|c: char| c == ')' || c == ',');
683                let clean = clean.split('#').next().unwrap_or(clean).trim();
684                if !clean.is_empty() && clean != "(" {
685                    current_line.push_str(", ");
686                    current_line.push_str(clean);
687                }
688                if trimmed.contains(')') {
689                    in_parens = false;
690                    logical_lines.push(std::mem::take(&mut current_line));
691                }
692            } else if trimmed.starts_with("from ") && trimmed.contains(" import ") {
693                if trimmed.contains('(') && !trimmed.contains(')') {
694                    // Multi-line import starts
695                    in_parens = true;
696                    // Take everything before the paren
697                    let before_paren = trimmed.split('(').next().unwrap_or(trimmed);
698                    current_line = before_paren.trim().to_string();
699                    // Also grab anything after the paren on this line
700                    if let Some(after) = trimmed.split('(').nth(1) {
701                        let after = after.trim().trim_end_matches(')').trim();
702                        if !after.is_empty() {
703                            current_line.push(' ');
704                            current_line.push_str(after);
705                        }
706                    }
707                } else {
708                    logical_lines.push(trimmed.to_string());
709                }
710            }
711        }
712
713        for logical_line in &logical_lines {
714            if let Some(rest) = logical_line.strip_prefix("from ") {
715                // Find " import " or " import," (multi-line imports join with comma)
716                let import_match = rest.find(" import ")
717                    .map(|pos| (pos, 8))
718                    .or_else(|| rest.find(" import,").map(|pos| (pos, 8)));
719                if let Some((import_pos, skip)) = import_match {
720                    let module_path = &rest[..import_pos];
721                    let names_str = &rest[import_pos + skip..];
722
723                    let source_module = module_path
724                        .trim_start_matches('.')
725                        .rsplit('.')
726                        .next()
727                        .unwrap_or(module_path.trim_start_matches('.'));
728
729                    for name_part in names_str.split(',') {
730                        let name_part = name_part.trim();
731                        let imported_name = name_part.split_whitespace().next().unwrap_or(name_part);
732                        // Strip trailing parens/punctuation
733                        let imported_name = imported_name.trim_matches(|c: char| c == '(' || c == ')' || c == ',');
734                        if imported_name.is_empty() {
735                            continue;
736                        }
737
738                        if let Some(target_ids) = symbol_table.get(imported_name) {
739                            let target = target_ids.iter().find(|id| {
740                                entity_map.get(*id).map_or(false, |e| {
741                                    let stem = e.file_path.rsplit('/').next().unwrap_or(&e.file_path);
742                                    let stem = stem.strip_suffix(".py")
743                                        .or_else(|| stem.strip_suffix(".ts"))
744                                        .or_else(|| stem.strip_suffix(".js"))
745                                        .or_else(|| stem.strip_suffix(".rs"))
746                                        .unwrap_or(stem);
747                                    stem == source_module
748                                })
749                            });
750                            if let Some(target_id) = target {
751                                import_table.insert(
752                                    (file_path.clone(), imported_name.to_string()),
753                                    target_id.clone(),
754                                );
755                            }
756                        }
757                    }
758                }
759            }
760        }
761    }
762
763    import_table
764}
765
766/// Strip comments and string literals from content to avoid false references.
767/// Returns a new string with comments/docstrings replaced by spaces.
768fn strip_comments_and_strings(content: &str) -> String {
769    let bytes = content.as_bytes();
770    let len = bytes.len();
771    let mut result = vec![b' '; len];
772    let mut i = 0;
773
774    while i < len {
775        // Triple-quoted strings (Python docstrings)
776        if i + 2 < len && bytes[i] == b'"' && bytes[i + 1] == b'"' && bytes[i + 2] == b'"' {
777            i += 3;
778            while i + 2 < len {
779                if bytes[i] == b'"' && bytes[i + 1] == b'"' && bytes[i + 2] == b'"' {
780                    i += 3;
781                    break;
782                }
783                i += 1;
784            }
785            continue;
786        }
787        if i + 2 < len && bytes[i] == b'\'' && bytes[i + 1] == b'\'' && bytes[i + 2] == b'\'' {
788            i += 3;
789            while i + 2 < len {
790                if bytes[i] == b'\'' && bytes[i + 1] == b'\'' && bytes[i + 2] == b'\'' {
791                    i += 3;
792                    break;
793                }
794                i += 1;
795            }
796            continue;
797        }
798        // Double-quoted strings
799        if bytes[i] == b'"' {
800            i += 1;
801            while i < len {
802                if bytes[i] == b'\\' { i += 2; continue; }
803                if bytes[i] == b'"' { i += 1; break; }
804                i += 1;
805            }
806            continue;
807        }
808        // Single-quoted strings
809        if bytes[i] == b'\'' {
810            i += 1;
811            while i < len {
812                if bytes[i] == b'\\' { i += 2; continue; }
813                if bytes[i] == b'\'' { i += 1; break; }
814                i += 1;
815            }
816            continue;
817        }
818        // Python/Ruby single-line comments
819        if bytes[i] == b'#' {
820            while i < len && bytes[i] != b'\n' { i += 1; }
821            continue;
822        }
823        // C-style single-line comments
824        if i + 1 < len && bytes[i] == b'/' && bytes[i + 1] == b'/' {
825            while i < len && bytes[i] != b'\n' { i += 1; }
826            continue;
827        }
828        // C-style block comments
829        if i + 1 < len && bytes[i] == b'/' && bytes[i + 1] == b'*' {
830            i += 2;
831            while i + 1 < len {
832                if bytes[i] == b'*' && bytes[i + 1] == b'/' { i += 2; break; }
833                i += 1;
834            }
835            continue;
836        }
837        // Regular code: copy through
838        result[i] = bytes[i];
839        i += 1;
840    }
841
842    String::from_utf8_lossy(&result).into_owned()
843}
844
845/// Extract identifier references from entity content using simple token analysis.
846/// Strips comments and strings first to avoid false positives from docstrings.
847/// Returns borrowed slices from the stripped content.
848fn extract_references_from_content<'a>(content: &'a str, own_name: &str) -> Vec<&'a str> {
849    // We need to figure out which words appear only in comments/strings vs real code.
850    // Strategy: strip comments/strings, then only accept words that appear in the stripped version.
851    let stripped = strip_comments_and_strings(content);
852    let stripped_words: HashSet<&str> = stripped
853        .split(|c: char| !c.is_alphanumeric() && c != '_')
854        .filter(|w| !w.is_empty())
855        .collect();
856
857    let mut refs = Vec::new();
858    let mut seen: HashSet<&str> = HashSet::new();
859
860    for word in content.split(|c: char| !c.is_alphanumeric() && c != '_') {
861        if word.is_empty() || word == own_name {
862            continue;
863        }
864        if is_keyword(word) || word.len() < 2 {
865            continue;
866        }
867        // Skip very short lowercase identifiers (likely local vars: i, x, a, ok, id, etc.)
868        if word.starts_with(|c: char| c.is_lowercase()) && word.len() < 3 {
869            continue;
870        }
871        if !word.starts_with(|c: char| c.is_alphabetic() || c == '_') {
872            continue;
873        }
874        // Skip common local variable names that create false graph edges
875        if is_common_local_name(word) {
876            continue;
877        }
878        // Skip words that only appear in comments/strings
879        if !stripped_words.contains(word) {
880            continue;
881        }
882        if seen.insert(word) {
883            refs.push(word);
884        }
885    }
886
887    refs
888}
889
890static COMMON_LOCAL_NAMES: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
891    [
892        "result", "results", "data", "config", "value", "values",
893        "item", "items", "input", "output", "args", "opts",
894        "name", "path", "file", "line", "count", "index",
895        "temp", "prev", "next", "curr", "current", "node",
896        "left", "right", "root", "head", "tail", "body",
897        "text", "content", "source", "target", "entry",
898        "error", "errors", "message", "response", "request",
899        "context", "state", "props", "event", "handler",
900        "callback", "options", "params", "query", "list",
901        "base", "info", "meta", "kind", "mode", "flag",
902        "size", "length", "width", "height", "start", "stop",
903        "begin", "done", "found", "status", "code", "test",
904    ].into_iter().collect()
905});
906
907/// Names that are overwhelmingly local variables, not entity references.
908/// These create massive false-positive edges in the dependency graph.
909fn is_common_local_name(word: &str) -> bool {
910    COMMON_LOCAL_NAMES.contains(word)
911}
912
913/// Infer reference type from context using word-boundary-aware matching.
914fn infer_ref_type(content: &str, ref_name: &str) -> RefType {
915    // Check if it's a function call: ref_name followed by ( with word boundary before.
916    // Avoids format! allocation by finding ref_name and checking the next char.
917    let bytes = content.as_bytes();
918    let name_bytes = ref_name.as_bytes();
919    let mut search_start = 0;
920    while let Some(rel_pos) = content[search_start..].find(ref_name) {
921        let pos = search_start + rel_pos;
922        let after = pos + name_bytes.len();
923        // Check next char is '('
924        if after < bytes.len() && bytes[after] == b'(' {
925            // Verify word boundary before
926            let is_boundary = pos == 0 || {
927                let prev = bytes[pos - 1];
928                !prev.is_ascii_alphanumeric() && prev != b'_'
929            };
930            if is_boundary {
931                return RefType::Calls;
932            }
933        }
934        // Advance past pos to the next char boundary to avoid slicing inside a multi-byte UTF-8 char.
935        search_start = pos + 1;
936        while search_start < content.len() && !content.is_char_boundary(search_start) {
937            search_start += 1;
938        }
939    }
940
941    // Check if it's in an import/use statement (line-level, not substring)
942    for line in content.lines() {
943        let trimmed = line.trim();
944        if (trimmed.starts_with("import ") || trimmed.starts_with("use ")
945            || trimmed.starts_with("from ") || trimmed.starts_with("require("))
946            && trimmed.contains(ref_name)
947        {
948            return RefType::Imports;
949        }
950    }
951
952    // Default to type reference
953    RefType::TypeRef
954}
955
956static KEYWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
957    [
958        // Common across languages
959        "if", "else", "for", "while", "do", "switch", "case", "break",
960        "continue", "return", "try", "catch", "finally", "throw",
961        "new", "delete", "typeof", "instanceof", "in", "of",
962        "true", "false", "null", "undefined", "void", "this",
963        "super", "class", "extends", "implements", "interface",
964        "enum", "const", "let", "var", "function", "async",
965        "await", "yield", "import", "export", "default", "from",
966        "as", "static", "public", "private", "protected",
967        "abstract", "final", "override",
968        // Rust
969        "fn", "pub", "mod", "use", "struct", "impl", "trait",
970        "where", "type", "self", "Self", "mut", "ref", "match",
971        "loop", "move", "unsafe", "extern", "crate", "dyn",
972        // Python
973        "def", "elif", "except", "raise", "with",
974        "pass", "lambda", "nonlocal", "global", "assert",
975        "True", "False", "and", "or", "not", "is",
976        // Go
977        "func", "package", "range", "select", "chan", "go",
978        "defer", "map", "make", "append", "len", "cap",
979        // C/C++
980        "auto", "register", "volatile", "sizeof", "typedef",
981        "template", "typename", "namespace", "virtual", "inline",
982        "constexpr", "nullptr", "noexcept", "explicit", "friend",
983        "operator", "using", "cout", "endl", "cerr", "cin",
984        "printf", "scanf", "malloc", "free", "NULL", "include",
985        "ifdef", "ifndef", "endif", "define", "pragma",
986        // Ruby
987        "end", "then", "elsif", "unless", "until",
988        "begin", "rescue", "ensure", "when", "require",
989        "attr_accessor", "attr_reader", "attr_writer",
990        "puts", "nil", "module", "defined",
991        // C#
992        "internal", "sealed", "readonly",
993        "partial", "delegate", "event", "params", "out",
994        "object", "decimal", "sbyte", "ushort", "uint",
995        "ulong", "nint", "nuint", "dynamic",
996        "get", "set", "value", "init", "record",
997        // Types (primitives)
998        "string", "number", "boolean", "int", "float", "double",
999        "bool", "char", "byte", "i8", "i16", "i32", "i64",
1000        "u8", "u16", "u32", "u64", "f32", "f64", "usize",
1001        "isize", "str", "String", "Vec", "Option", "Result",
1002        "Box", "Arc", "Rc", "HashMap", "HashSet", "Some",
1003        "Ok", "Err",
1004    ].into_iter().collect()
1005});
1006
1007fn is_keyword(word: &str) -> bool {
1008    KEYWORDS.contains(word)
1009}
1010
1011#[cfg(test)]
1012mod tests {
1013    use super::*;
1014    use crate::git::types::{FileChange, FileStatus};
1015    use std::io::Write;
1016    use tempfile::TempDir;
1017
1018    fn create_test_repo() -> (TempDir, ParserRegistry) {
1019        let dir = TempDir::new().unwrap();
1020        let registry = crate::parser::plugins::create_default_registry();
1021        (dir, registry)
1022    }
1023
1024    fn write_file(dir: &Path, name: &str, content: &str) {
1025        let path = dir.join(name);
1026        if let Some(parent) = path.parent() {
1027            std::fs::create_dir_all(parent).unwrap();
1028        }
1029        let mut f = std::fs::File::create(path).unwrap();
1030        f.write_all(content.as_bytes()).unwrap();
1031    }
1032
1033    #[test]
1034    fn test_incremental_add_file() {
1035        let (dir, registry) = create_test_repo();
1036        let root = dir.path();
1037
1038        // Start with one file
1039        write_file(root, "a.ts", "export function foo() { return bar(); }\n");
1040        write_file(root, "b.ts", "export function bar() { return 1; }\n");
1041
1042        let mut graph = EntityGraph::build(root, &["a.ts".into(), "b.ts".into()], &registry);
1043        assert_eq!(graph.entities.len(), 2);
1044
1045        // Add a new file
1046        write_file(root, "c.ts", "export function baz() { return foo(); }\n");
1047        graph.update_from_changes(
1048            &[FileChange {
1049                file_path: "c.ts".into(),
1050                status: FileStatus::Added,
1051                old_file_path: None,
1052                before_content: None,
1053                after_content: None, // will read from disk
1054            }],
1055            root,
1056            &registry,
1057        );
1058
1059        assert_eq!(graph.entities.len(), 3);
1060        assert!(graph.entities.contains_key("c.ts::function::baz"));
1061        // baz references foo
1062        let baz_deps = graph.get_dependencies("c.ts::function::baz");
1063        assert!(
1064            baz_deps.iter().any(|d| d.name == "foo"),
1065            "baz should depend on foo. Deps: {:?}",
1066            baz_deps.iter().map(|d| &d.name).collect::<Vec<_>>()
1067        );
1068    }
1069
1070    #[test]
1071    fn test_incremental_delete_file() {
1072        let (dir, registry) = create_test_repo();
1073        let root = dir.path();
1074
1075        write_file(root, "a.ts", "export function foo() { return bar(); }\n");
1076        write_file(root, "b.ts", "export function bar() { return 1; }\n");
1077
1078        let mut graph = EntityGraph::build(root, &["a.ts".into(), "b.ts".into()], &registry);
1079        assert_eq!(graph.entities.len(), 2);
1080
1081        // Delete b.ts
1082        graph.update_from_changes(
1083            &[FileChange {
1084                file_path: "b.ts".into(),
1085                status: FileStatus::Deleted,
1086                old_file_path: None,
1087                before_content: None,
1088                after_content: None,
1089            }],
1090            root,
1091            &registry,
1092        );
1093
1094        assert_eq!(graph.entities.len(), 1);
1095        assert!(!graph.entities.contains_key("b.ts::function::bar"));
1096        // foo's dependency on bar should be pruned
1097        let foo_deps = graph.get_dependencies("a.ts::function::foo");
1098        assert!(
1099            foo_deps.is_empty(),
1100            "foo's deps should be empty after bar deleted. Deps: {:?}",
1101            foo_deps.iter().map(|d| &d.name).collect::<Vec<_>>()
1102        );
1103    }
1104
1105    #[test]
1106    fn test_incremental_modify_file() {
1107        let (dir, registry) = create_test_repo();
1108        let root = dir.path();
1109
1110        write_file(root, "a.ts", "export function foo() { return bar(); }\n");
1111        write_file(root, "b.ts", "export function bar() { return 1; }\nexport function baz() { return 2; }\n");
1112
1113        let mut graph = EntityGraph::build(root, &["a.ts".into(), "b.ts".into()], &registry);
1114        assert_eq!(graph.entities.len(), 3);
1115
1116        // Modify a.ts to call baz instead of bar
1117        write_file(root, "a.ts", "export function foo() { return baz(); }\n");
1118        graph.update_from_changes(
1119            &[FileChange {
1120                file_path: "a.ts".into(),
1121                status: FileStatus::Modified,
1122                old_file_path: None,
1123                before_content: None,
1124                after_content: None,
1125            }],
1126            root,
1127            &registry,
1128        );
1129
1130        assert_eq!(graph.entities.len(), 3);
1131        // foo should now depend on baz, not bar
1132        let foo_deps = graph.get_dependencies("a.ts::function::foo");
1133        let dep_names: Vec<&str> = foo_deps.iter().map(|d| d.name.as_str()).collect();
1134        assert!(dep_names.contains(&"baz"), "foo should depend on baz after modification. Deps: {:?}", dep_names);
1135        assert!(!dep_names.contains(&"bar"), "foo should no longer depend on bar. Deps: {:?}", dep_names);
1136    }
1137
1138    #[test]
1139    fn test_incremental_with_content() {
1140        let (dir, registry) = create_test_repo();
1141        let root = dir.path();
1142
1143        write_file(root, "a.ts", "export function foo() { return 1; }\n");
1144        let mut graph = EntityGraph::build(root, &["a.ts".into()], &registry);
1145        assert_eq!(graph.entities.len(), 1);
1146
1147        // Add file with content provided directly (no disk read needed)
1148        graph.update_from_changes(
1149            &[FileChange {
1150                file_path: "b.ts".into(),
1151                status: FileStatus::Added,
1152                old_file_path: None,
1153                before_content: None,
1154                after_content: Some("export function bar() { return foo(); }\n".into()),
1155            }],
1156            root,
1157            &registry,
1158        );
1159
1160        assert_eq!(graph.entities.len(), 2);
1161        let bar_deps = graph.get_dependencies("b.ts::function::bar");
1162        assert!(bar_deps.iter().any(|d| d.name == "foo"));
1163    }
1164
1165    #[test]
1166    fn test_extract_references() {
1167        let content = "function processData(input) {\n  const result = validateInput(input);\n  return transform(result);\n}";
1168        let refs = extract_references_from_content(content, "processData");
1169        assert!(refs.contains(&"validateInput"));
1170        assert!(refs.contains(&"transform"));
1171        assert!(!refs.contains(&"processData")); // self excluded
1172    }
1173
1174    #[test]
1175    fn test_extract_references_skips_keywords() {
1176        let content = "function foo() { if (true) { return false; } }";
1177        let refs = extract_references_from_content(content, "foo");
1178        assert!(!refs.contains(&"if"));
1179        assert!(!refs.contains(&"true"));
1180        assert!(!refs.contains(&"return"));
1181        assert!(!refs.contains(&"false"));
1182    }
1183
1184    #[test]
1185    fn test_infer_ref_type_call() {
1186        assert_eq!(
1187            infer_ref_type("validateInput(data)", "validateInput"),
1188            RefType::Calls,
1189        );
1190    }
1191
1192    #[test]
1193    fn test_infer_ref_type_type() {
1194        assert_eq!(
1195            infer_ref_type("let x: MyType = something", "MyType"),
1196            RefType::TypeRef,
1197        );
1198    }
1199
1200    #[test]
1201    fn test_infer_ref_type_multibyte_utf8() {
1202        // Ensure no panic when content contains multi-byte UTF-8 characters
1203        assert_eq!(
1204            infer_ref_type("let café = foo(x)", "foo"),
1205            RefType::Calls,
1206        );
1207        assert_eq!(
1208            infer_ref_type("class HandicapfrPublicationFieldsEnum:\n    É = 1\n    bar()", "bar"),
1209            RefType::Calls,
1210        );
1211        // No match should not panic either
1212        assert_eq!(
1213            infer_ref_type("// 日本語コメント\nlet x = 1", "missing"),
1214            RefType::TypeRef,
1215        );
1216    }
1217}
sem_core/parser/graph.rs

sem_core/parser/
graph.rs