Skip to main content

gid_core/code_graph/
extract.rs

1use std::collections::{HashMap, HashSet};
2use std::path::Path;
3use std::time::{Instant, UNIX_EPOCH};
4
5use regex::Regex;
6use tree_sitter::Parser;
7use walkdir::WalkDir;
8use xxhash_rust::xxh64::xxh64;
9
10use super::lang::{python::*, rust_lang::*, typescript::*};
11use super::types::*;
12use crate::graph::Graph;
13use crate::unify::graph_to_codegraph;
14
// ═══ Current metadata version. Bump on struct changes → triggers full rebuild. ═══
/// Version number of the extraction metadata format.
///
/// Increase it whenever the metadata structs change shape; as noted above, a
/// bump is what triggers a full rebuild.
const EXTRACT_META_VERSION: u32 = 2;
17
18// ═══ Shared Helper Types ═══
19
/// Intermediate state collected during per-file parsing.
/// Holds all the maps needed for cross-file reference resolution.
#[derive(Default)]
struct ExtractState {
    /// Every node gathered so far: one file node per integrated file plus the
    /// entities extracted from it.
    nodes: Vec<CodeNode>,
    /// Every edge gathered so far (may still contain unresolved placeholders).
    edges: Vec<CodeEdge>,
    /// Class name → class node id. A later class with the same name overwrites
    /// the earlier entry.
    class_map: HashMap<String, String>,
    /// Function name → ids of all function nodes carrying that name.
    func_map: HashMap<String, Vec<String>>,
    /// Presumably module path → file id, as built by `collect_source_files`
    /// — TODO confirm against the caller that fills it.
    module_map: HashMap<String, String>,
    /// Method node id → id of the class it is defined in.
    method_to_class: HashMap<String, String>,
    /// Class node id → ids of the methods defined in it.
    class_methods: HashMap<String, Vec<String>>,
    /// Class node id → ids of the parent classes that could be resolved.
    class_parents: HashMap<String, Vec<String>>,
    /// Relative file path → names imported by that file.
    file_imported_names: HashMap<String, HashSet<String>>,
    /// Struct name → per-struct field map (presumably field name → type name;
    /// verify against the Rust extractor).
    all_struct_field_types: HashMap<String, HashMap<String, String>>,
}
35
/// Result of parsing a single file.
struct FileParseResult {
    /// Nodes extracted from the file.
    nodes: Vec<CodeNode>,
    /// Edges extracted from the file.
    edges: Vec<CodeEdge>,
    /// Names imported by the file, as reported by the language extractor.
    imports: HashSet<String>,
    /// Struct name → field map. Only the Rust extractor fills this; the other
    /// languages leave it empty.
    struct_field_types: HashMap<String, HashMap<String, String>>,
}
43
44// ═══ Shared Helper Functions ═══
45
46/// Walk a directory and collect source file entries (rel_path, content, language).
47/// Also builds the module_map from file paths.
48fn collect_source_files(
49    dir: &Path,
50    module_map: &mut HashMap<String, String>,
51) -> Vec<(String, String, Language)> {
52    let mut file_entries: Vec<(String, String, Language)> = Vec::new();
53    // Collect partial path candidates: partial → Vec<file_id>
54    // We defer insertion so we can detect ambiguous partials (same basename in different dirs).
55    let mut partial_candidates: HashMap<String, Vec<String>> = HashMap::new();
56
57    for entry in WalkDir::new(dir)
58        .follow_links(false)
59        .max_depth(20)
60        .into_iter()
61        .filter_entry(|e| {
62            // Always enter the root directory (depth 0) — the user explicitly
63            // chose this path, so we should respect it even if the directory
64            // name starts with '.' (e.g. temp dirs like .tmpXXXX).
65            if e.depth() == 0 {
66                return true;
67            }
68            let name = e.file_name().to_str().unwrap_or("");
69            !name.starts_with('.')
70                && name != "node_modules"
71                && name != "__pycache__"
72                && name != "target"
73                && name != "build"
74                && name != "dist"
75                && name != ".git"
76                && name != ".eggs"
77                && name != ".tox"
78        })
79    {
80        let entry = match entry {
81            Ok(e) => e,
82            Err(_) => continue,
83        };
84
85        if !entry.file_type().is_file() {
86            continue;
87        }
88
89        let path = entry.path();
90        let lang = Language::from_path(path);
91        if lang == Language::Unknown {
92            continue;
93        }
94
95        let rel_path = path
96            .strip_prefix(dir)
97            .unwrap_or(path)
98            .to_string_lossy()
99            .to_string();
100
101        // Skip certain files
102        if rel_path == "setup.py" || rel_path == "conftest.py" || rel_path.contains("__pycache__") {
103            continue;
104        }
105
106        let content = match std::fs::read_to_string(path) {
107            Ok(c) => c,
108            Err(_) => continue,
109        };
110
111        // Build module path
112        let module_path = rel_path
113            .replace('/', ".")
114            .trim_end_matches(".py")
115            .trim_end_matches(".rs")
116            .trim_end_matches(".ts")
117            .trim_end_matches(".tsx")
118            .trim_end_matches(".js")
119            .trim_end_matches(".jsx")
120            .to_string();
121
122        let file_id = format!("file:{}", rel_path);
123        // Register full module path (always unique since it includes full relative path)
124        module_map.insert(module_path.clone(), file_id.clone());
125
126        // Collect partial path candidates (defer insertion to detect ambiguity)
127        let parts: Vec<&str> = module_path.split('.').collect();
128        for start in 1..parts.len() {
129            let partial = parts[start..].join(".");
130            partial_candidates.entry(partial).or_default().push(file_id.clone());
131        }
132
133        file_entries.push((rel_path, content, lang));
134    }
135
136    // Only register unambiguous partials — if two files share the same partial,
137    // register neither to avoid ghost nodes (ISS-007).
138    for (partial, candidates) in partial_candidates {
139        if candidates.len() == 1 {
140            module_map.entry(partial).or_insert_with(|| candidates.into_iter().next().unwrap());
141        }
142        // If len > 1, skip — ambiguous partial, don't register
143    }
144
145    file_entries
146}
147
148/// Parse a single file and return its nodes, edges, imports, and struct field types.
149fn parse_single_file(
150    rel_path: &str,
151    content: &str,
152    lang: &Language,
153    parser: &mut Parser,
154    class_map: &mut HashMap<String, String>,
155) -> Option<FileParseResult> {
156    let (file_nodes, file_edges, imports, struct_field_types) = match lang {
157        Language::Python => {
158            let (nodes, edges, imports) = extract_python_tree_sitter(
159                rel_path, content, parser, class_map,
160            );
161            (nodes, edges, imports, HashMap::new())
162        }
163        Language::Rust => {
164            let (nodes, edges, imports, field_types) = extract_rust_tree_sitter(
165                rel_path, content, parser, class_map,
166            );
167            (nodes, edges, imports, field_types)
168        }
169        Language::TypeScript => {
170            let ext = rel_path.rsplit('.').next().unwrap_or("ts");
171            let (nodes, edges, imports) = extract_typescript_tree_sitter(
172                rel_path, content, parser, class_map, ext,
173            );
174            (nodes, edges, imports, HashMap::new())
175        }
176        Language::Unknown => return None,
177    };
178
179    Some(FileParseResult {
180        nodes: file_nodes,
181        edges: file_edges,
182        imports,
183        struct_field_types,
184    })
185}
186
187/// Integrate a parsed file's results into the ExtractState.
188fn integrate_file_results(
189    state: &mut ExtractState,
190    rel_path: &str,
191    result: FileParseResult,
192) {
193    // Update maps
194    for node in &result.nodes {
195        if node.kind == NodeKind::Class {
196            state.class_map.insert(node.name.clone(), node.id.clone());
197        } else if node.kind == NodeKind::Function {
198            state.func_map
199                .entry(node.name.clone())
200                .or_default()
201                .push(node.id.clone());
202        }
203    }
204
205    // Track method→class and class→methods relationships
206    for edge in &result.edges {
207        if edge.relation == EdgeRelation::DefinedIn {
208            if edge.from.starts_with("method:") && edge.to.starts_with("class:") {
209                state.method_to_class.insert(edge.from.clone(), edge.to.clone());
210                state.class_methods
211                    .entry(edge.to.clone())
212                    .or_default()
213                    .push(edge.from.clone());
214            }
215        }
216        if edge.relation == EdgeRelation::Inherits {
217            if let Some(parent_id) = state.class_map.get(
218                edge.to.strip_prefix("class_ref:").unwrap_or(&edge.to),
219            ) {
220                state.class_parents
221                    .entry(edge.from.clone())
222                    .or_default()
223                    .push(parent_id.clone());
224            }
225        }
226    }
227
228    // Store imported names
229    if !result.imports.is_empty() {
230        state.file_imported_names.insert(rel_path.to_string(), result.imports);
231    }
232
233    // Store struct field types
234    for (struct_name, fields) in result.struct_field_types {
235        state.all_struct_field_types.insert(struct_name, fields);
236    }
237
238    // Always create a file node for every source file — even files that contain
239    // only re-exports (pub mod / mod / use) produce no extractable entities,
240    // but they're still part of the code structure and are referenced by
241    // BelongsTo edges (file → module).
242    state.nodes.push(CodeNode::new_file(rel_path));
243
244    state.nodes.extend(result.nodes);
245    state.edges.extend(result.edges);
246}
247
248/// Build helper maps needed for call edge extraction (class_init_map, node_pkg_map).
249fn build_call_extraction_maps(state: &ExtractState) -> (
250    HashMap<String, Vec<(String, String)>>,
251    HashMap<String, String>,
252) {
253    // class_init_map for constructor resolution
254    let class_init_map: HashMap<String, Vec<(String, String)>> = {
255        let mut map: HashMap<String, Vec<(String, String)>> = HashMap::new();
256        for node in &state.nodes {
257            if node.kind == NodeKind::Function && node.name == "__init__" && !node.is_test {
258                if let Some(class_id) = state.method_to_class.get(&node.id) {
259                    if let Some(class_name) = class_id.rsplit(':').next() {
260                        map.entry(class_name.to_string())
261                            .or_default()
262                            .push((node.file_path.clone(), node.id.clone()));
263                    }
264                }
265            }
266        }
267        map
268    };
269
270    // node_pkg_map for package-scoped resolution
271    let node_pkg_map: HashMap<String, String> = state.nodes
272        .iter()
273        .map(|n| {
274            let pkg = n.file_path.rsplitn(2, '/').nth(1).unwrap_or("").to_string();
275            (n.id.clone(), pkg)
276        })
277        .collect();
278
279    (class_init_map, node_pkg_map)
280}
281
282/// Extract call edges for a specific file (third pass in the pipeline).
283fn extract_calls_for_file(
284    rel_path: &str,
285    content: &str,
286    lang: &Language,
287    parser: &mut Parser,
288    state: &ExtractState,
289    class_init_map: &HashMap<String, Vec<(String, String)>>,
290    node_pkg_map: &HashMap<String, String>,
291    module_map: &HashMap<String, String>,
292    edges: &mut Vec<CodeEdge>,
293) {
294    let file_func_ids: HashSet<String> = state.nodes
295        .iter()
296        .filter(|n| n.file_path == *rel_path && n.kind == NodeKind::Function)
297        .map(|n| n.id.clone())
298        .collect();
299
300    let package_dir = rel_path.rsplitn(2, '/').nth(1).unwrap_or("");
301
302    match lang {
303        Language::Python => {
304            if parser.set_language(&tree_sitter_python::LANGUAGE.into()).is_err() {
305                return;
306            }
307
308            if let Some(tree) = parser.parse(content, None) {
309                let source = content.as_bytes();
310                let root = tree.root_node();
311
312                extract_calls_from_tree(
313                    root,
314                    source,
315                    rel_path,
316                    &state.func_map,
317                    &state.method_to_class,
318                    &state.class_parents,
319                    &file_func_ids,
320                    &state.file_imported_names,
321                    package_dir,
322                    class_init_map,
323                    node_pkg_map,
324                    edges,
325                );
326            }
327
328            // Test-to-source mapping for Python
329            let is_test_file = rel_path.contains("/tests/") || rel_path.contains("/test_");
330            if is_test_file {
331                let file_id = format!("file:{}", rel_path);
332                let re_from_import = Regex::new(r"^from\s+([\w.]+)\s+import").unwrap();
333
334                for line in content.lines() {
335                    if let Some(cap) = re_from_import.captures(line) {
336                        let module = cap[1].to_string();
337                        if let Some(source_file_id) = module_map.get(&module) {
338                            edges.push(CodeEdge {
339                                from: file_id.clone(),
340                                to: source_file_id.clone(),
341                                relation: EdgeRelation::TestsFor,
342                                weight: 0.5,
343                                call_count: 1,
344                                in_error_path: false,
345                                confidence: 1.0,
346                                call_site_line: None,
347                                call_site_column: None,
348                            });
349                        }
350                    }
351                }
352            }
353        }
354        Language::Rust => {
355            if parser.set_language(&tree_sitter_rust::LANGUAGE.into()).is_err() {
356                return;
357            }
358
359            if let Some(tree) = parser.parse(content, None) {
360                let source = content.as_bytes();
361                let root = tree.root_node();
362
363                extract_calls_rust(
364                    root,
365                    source,
366                    rel_path,
367                    &state.func_map,
368                    &state.method_to_class,
369                    &file_func_ids,
370                    node_pkg_map,
371                    &state.file_imported_names,
372                    &state.all_struct_field_types,
373                    edges,
374                );
375            }
376        }
377        Language::TypeScript => {
378            let extension = rel_path.rsplit('.').next().unwrap_or("");
379            let lang_result = match extension {
380                "tsx" => parser.set_language(&tree_sitter_typescript::LANGUAGE_TSX.into()),
381                "ts" => parser.set_language(&tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
382                "jsx" => parser.set_language(&tree_sitter_javascript::LANGUAGE.into()),
383                _ => parser.set_language(&tree_sitter_javascript::LANGUAGE.into()),
384            };
385
386            if lang_result.is_err() {
387                return;
388            }
389
390            if let Some(tree) = parser.parse(content, None) {
391                let source = content.as_bytes();
392                let root = tree.root_node();
393
394                extract_calls_typescript(
395                    root,
396                    source,
397                    rel_path,
398                    &state.func_map,
399                    &state.method_to_class,
400                    &file_func_ids,
401                    &state.file_imported_names,
402                    node_pkg_map,
403                    edges,
404                );
405            }
406        }
407        Language::Unknown => {}
408    }
409}
410
411/// Resolve placeholder references in edges (class_ref:, module_ref:, func_ref:).
412fn resolve_references(
413    edges: Vec<CodeEdge>,
414    class_map: &HashMap<String, String>,
415    func_map: &HashMap<String, Vec<String>>,
416    module_map: &HashMap<String, String>,
417) -> Vec<CodeEdge> {
418    let mut resolved_edges = Vec::new();
419    for edge in edges {
420        if edge.to.starts_with("class_ref:") {
421            let class_name = &edge.to["class_ref:".len()..];
422            if let Some(class_id) = class_map.get(class_name) {
423                resolved_edges.push(CodeEdge {
424                    from: edge.from,
425                    to: class_id.clone(),
426                    relation: edge.relation,
427                    weight: edge.weight,
428                    call_count: edge.call_count,
429                    in_error_path: edge.in_error_path,
430                    confidence: edge.confidence,
431                    call_site_line: edge.call_site_line,
432                    call_site_column: edge.call_site_column,
433                });
434            }
435        } else if edge.to.starts_with("module_ref:") {
436            let module = &edge.to["module_ref:".len()..];
437            let resolved_file_id = module_map.get(module).cloned()
438                .or_else(|| {
439                    let importing_file = edge.from.strip_prefix("file:").unwrap_or(&edge.from);
440                    resolve_ts_import(importing_file, module, module_map)
441                });
442
443            if let Some(file_id) = resolved_file_id {
444                resolved_edges.push(CodeEdge {
445                    from: edge.from,
446                    to: file_id,
447                    relation: edge.relation,
448                    weight: edge.weight,
449                    call_count: edge.call_count,
450                    in_error_path: edge.in_error_path,
451                    confidence: edge.confidence,
452                    call_site_line: edge.call_site_line,
453                    call_site_column: edge.call_site_column,
454                });
455            }
456        } else if edge.to.starts_with("func_ref:") {
457            let func_name = &edge.to["func_ref:".len()..];
458            if let Some(func_ids) = func_map.get(func_name) {
459                if let Some(func_id) = func_ids.first() {
460                    resolved_edges.push(CodeEdge {
461                        from: edge.from,
462                        to: func_id.clone(),
463                        relation: edge.relation,
464                        weight: edge.weight,
465                        call_count: edge.call_count,
466                        in_error_path: edge.in_error_path,
467                        confidence: edge.confidence,
468                        call_site_line: edge.call_site_line,
469                        call_site_column: edge.call_site_column,
470                    });
471                }
472            }
473        } else {
474            resolved_edges.push(edge);
475        }
476    }
477    resolved_edges
478}
479
480/// Remove phantom file nodes — nodes with `kind == File` whose `file_path`
481/// doesn't exist in the set of actual files we walked. Also removes edges
482/// referencing removed nodes. (ISS-007 fix)
483fn remove_phantom_nodes(
484    nodes: &mut Vec<CodeNode>,
485    edges: &mut Vec<CodeEdge>,
486    valid_file_paths: &HashSet<&str>,
487) {
488    let before_nodes = nodes.len();
489    nodes.retain(|n| {
490        if n.kind == NodeKind::File {
491            valid_file_paths.contains(n.file_path.as_str())
492        } else {
493            true
494        }
495    });
496    let removed = before_nodes - nodes.len();
497    if removed > 0 {
498        tracing::debug!("Removed {} phantom file node(s)", removed);
499        let valid_node_ids: HashSet<&str> = nodes.iter().map(|n| n.id.as_str()).collect();
500        edges.retain(|e| {
501            valid_node_ids.contains(e.from.as_str()) && valid_node_ids.contains(e.to.as_str())
502        });
503    }
504}
505
506/// Deduplicate call edges, compute call_count, and compute weights.
507/// Remap DefinedIn edges from cross-file impl blocks to their actual class nodes.
508///
509/// When `impl FooBar { fn method() {} }` appears in file X but `struct FooBar` is
510/// defined in file Y, the method gets a DefinedIn edge like:
511///   method:X:FooBar.method → class:X:FooBar
512/// But the actual class node is `class:Y:FooBar`. This function remaps these
513/// dangling edges to point to the correct node.
514pub(crate) fn remap_cross_file_impl_edges(edges: &mut Vec<CodeEdge>, nodes: &[CodeNode]) {
515    // Build set of valid node IDs and a map from type_name → actual class node ID
516    let valid_ids: HashSet<&str> = nodes.iter().map(|n| n.id.as_str()).collect();
517    let mut type_to_class_id: HashMap<&str, &str> = HashMap::new();
518
519    for node in nodes {
520        if node.kind == NodeKind::Class {
521            // class:path/to/file.rs:TypeName → extract "TypeName"
522            if let Some(type_name) = node.id.rsplit(':').next() {
523                // If multiple classes have the same name, first one wins
524                // (could be improved with module-aware resolution)
525                type_to_class_id.entry(type_name).or_insert(&node.id);
526            }
527        }
528    }
529
530    for edge in edges.iter_mut() {
531        if edge.relation == EdgeRelation::DefinedIn
532            && edge.to.starts_with("class:")
533            && !valid_ids.contains(edge.to.as_str())
534        {
535            // Extract type name from the dangling class ref
536            if let Some(type_name) = edge.to.rsplit(':').next() {
537                if let Some(&actual_id) = type_to_class_id.get(type_name) {
538                    tracing::trace!(
539                        "Remapped cross-file impl edge: {} → {} (was {})",
540                        edge.from, actual_id, edge.to
541                    );
542                    edge.to = actual_id.to_string();
543                }
544            }
545        }
546    }
547}
548
549fn dedup_and_finalize_edges(edges: Vec<CodeEdge>, nodes: &[CodeNode]) -> Vec<CodeEdge> {
550    // Remap cross-file impl DefinedIn edges before deduplication
551    let mut edges = edges;
552    remap_cross_file_impl_edges(&mut edges, nodes);
553
554    let mut edge_map: HashMap<(String, String), CodeEdge> = HashMap::new();
555    let mut other_edges: Vec<CodeEdge> = Vec::new();
556
557    for edge in edges {
558        if edge.relation == EdgeRelation::Calls {
559            let key = (edge.from.clone(), edge.to.clone());
560            let entry = edge_map.entry(key).or_insert_with(|| {
561                let mut e = edge.clone();
562                e.call_count = 0;
563                e
564            });
565            entry.call_count += 1;
566            if edge.confidence > entry.confidence {
567                entry.confidence = edge.confidence;
568            }
569            if edge.in_error_path {
570                entry.in_error_path = true;
571            }
572        } else {
573            other_edges.push(edge);
574        }
575    }
576
577    let mut final_edges: Vec<CodeEdge> = edge_map.into_values().collect();
578    final_edges.extend(other_edges);
579
580    // Compute weights for all edges
581    for edge in &mut final_edges {
582        edge.compute_weight();
583    }
584
585    // Add override edges
586    add_override_edges(nodes, &mut final_edges);
587
588    final_edges
589}
590
591/// Compute the FileDelta between current filesystem and stored metadata.
592/// (Hash-only variant, useful for testing without filesystem mtime)
593#[allow(dead_code)]
594pub fn compute_file_delta(
595    current_files: &[(String, String, Language)],
596    metadata: &ExtractMetadata,
597) -> FileDelta {
598    let mut delta = FileDelta::default();
599
600    let current_paths: HashSet<&str> = current_files.iter().map(|(p, _, _)| p.as_str()).collect();
601    let stored_paths: HashSet<&str> = metadata.files.keys().map(|p| p.as_str()).collect();
602
603    for (rel_path, content, _lang) in current_files {
604        if let Some(stored) = metadata.files.get(rel_path.as_str()) {
605            // File exists in both — check if changed
606            let content_hash = xxh64(content.as_bytes(), 0);
607            if content_hash == stored.content_hash {
608                delta.unchanged.push(rel_path.clone());
609            } else {
610                delta.modified.push(rel_path.clone());
611            }
612        } else {
613            // New file
614            delta.added.push(rel_path.clone());
615        }
616    }
617
618    // Find deleted files
619    for stored_path in &stored_paths {
620        if !current_paths.contains(*stored_path) {
621            delta.deleted.push(stored_path.to_string());
622        }
623    }
624
625    delta
626}
627
628/// Build FileState for a file from its parsed results and content.
629#[allow(dead_code)]
630fn build_file_state(
631    content: &str,
632    node_ids: &[String],
633    edge_count: usize,
634) -> FileState {
635    let mtime = 0u64; // Will be set by caller from filesystem metadata
636    let content_hash = xxh64(content.as_bytes(), 0);
637    FileState {
638        mtime,
639        content_hash,
640        node_ids: node_ids.to_vec(),
641        edge_count,
642    }
643}
644
/// Modification time of `dir/rel_path` in whole seconds since the Unix epoch.
///
/// Returns 0 when the file's metadata or modification time cannot be read, or
/// when the timestamp lies before the epoch.
fn get_file_mtime(dir: &Path, rel_path: &str) -> u64 {
    let modified = std::fs::metadata(dir.join(rel_path)).and_then(|meta| meta.modified());
    match modified {
        Ok(timestamp) => timestamp
            .duration_since(UNIX_EPOCH)
            .map(|since_epoch| since_epoch.as_secs())
            .unwrap_or(0),
        Err(_) => 0,
    }
}
653
654// ═══ Module Node & Cross-Layer Edge Generation ═══
655
656/// Generate module nodes from directory structure.
657/// Each directory containing at least one source file becomes a Module node.
658/// Returns (module_nodes, edges) where edges are module→parent belongs_to.
659fn generate_module_nodes(file_entries: &[(String, String, Language)]) -> (Vec<CodeNode>, Vec<CodeEdge>) {
660    let mut dir_set: HashSet<String> = HashSet::new();
661
662    // Collect all directories that contain source files
663    for (rel_path, _, _) in file_entries {
664        if let Some(dir) = rel_path.rsplitn(2, '/').nth(1) {
665            if !dir.is_empty() {
666                // Add this directory and all ancestors
667                let mut current = dir.to_string();
668                loop {
669                    dir_set.insert(current.clone());
670                    match current.rsplitn(2, '/').nth(1) {
671                        Some(parent) if !parent.is_empty() => current = parent.to_string(),
672                        _ => break,
673                    }
674                }
675            }
676        }
677    }
678
679    let mut nodes = Vec::new();
680    let mut edges = Vec::new();
681
682    for dir in &dir_set {
683        nodes.push(CodeNode::new_module(dir));
684
685        // Module → parent module (belongs_to)
686        if let Some(parent) = dir.rsplitn(2, '/').nth(1) {
687            if !parent.is_empty() && dir_set.contains(parent) {
688                edges.push(CodeEdge::new(
689                    &format!("module:{}", dir),
690                    &format!("module:{}", parent),
691                    EdgeRelation::BelongsTo,
692                ));
693            }
694        }
695    }
696
697    (nodes, edges)
698}
699
700/// Generate file → module belongs_to edges.
701fn generate_file_to_module_edges(file_entries: &[(String, String, Language)]) -> Vec<CodeEdge> {
702    let mut edges = Vec::new();
703    for (rel_path, _, _) in file_entries {
704        if let Some(dir) = rel_path.rsplitn(2, '/').nth(1) {
705            if !dir.is_empty() {
706                edges.push(CodeEdge::new(
707                    &format!("file:{}", rel_path),
708                    &format!("module:{}", dir),
709                    EdgeRelation::BelongsTo,
710                ));
711            }
712        }
713    }
714    edges
715}
716
717/// Generate TestsFor edges for Rust test files using naming conventions.
718/// Matches: tests/auth.rs → src/auth.rs, tests/test_auth.rs → src/auth.rs, tests/auth.rs → src/auth/mod.rs
719fn generate_rust_tests_for_edges(file_entries: &[(String, String, Language)]) -> Vec<CodeEdge> {
720    let mut edges = Vec::new();
721
722    // Collect source files (non-test) with their stems
723    let mut source_stems: HashMap<String, String> = HashMap::new();
724    for (path, _, lang) in file_entries {
725        if *lang != Language::Rust {
726            continue;
727        }
728        if path.starts_with("tests/") || path.contains("/tests/") {
729            continue;
730        }
731        // stem: "src/auth/middleware.rs" → "auth/middleware"
732        // also: "src/auth/mod.rs" → "auth"
733        let without_prefix = path.strip_prefix("src/").unwrap_or(path);
734        let stem = without_prefix.trim_end_matches(".rs");
735        let stem = if stem.ends_with("/mod") {
736            &stem[..stem.len() - 4]
737        } else {
738            stem
739        };
740        source_stems.insert(stem.to_string(), format!("file:{}", path));
741    }
742
743    // Find test files and match to source
744    for (path, _, lang) in file_entries {
745        if *lang != Language::Rust {
746            continue;
747        }
748        if !path.starts_with("tests/") && !path.contains("/tests/") {
749            continue;
750        }
751
752        let test_file_id = format!("file:{}", path);
753
754        // Extract test stem: "tests/test_auth.rs" → "auth", "tests/auth.rs" → "auth"
755        let raw = path.strip_prefix("tests/")
756            .or_else(|| {
757                // Handle nested: "crates/foo/tests/bar.rs" → "bar"
758                path.rsplit_once("/tests/").map(|(_, rest)| rest)
759            })
760            .unwrap_or(path)
761            .trim_end_matches(".rs");
762        let test_stem = raw.strip_prefix("test_").unwrap_or(raw);
763
764        // Try matching: exact stem, or module name
765        if let Some(source_id) = source_stems.get(test_stem) {
766            edges.push(CodeEdge::new_heuristic(
767                &test_file_id,
768                source_id,
769                EdgeRelation::TestsFor,
770                0.8, // naming convention match, not import analysis
771            ));
772        }
773    }
774
775    edges
776}
777
778/// Generate TestsFor edges for TypeScript/JavaScript test files using naming conventions.
779/// Matches: auth.test.ts → auth.ts, auth.spec.ts → auth.ts, __tests__/auth.test.ts → auth.ts
780fn generate_ts_tests_for_edges(file_entries: &[(String, String, Language)]) -> Vec<CodeEdge> {
781    let mut edges = Vec::new();
782
783    // Collect source files (non-test) — both by full stem and by basename
784    let mut source_stems: HashMap<String, String> = HashMap::new();
785    let mut source_basenames: HashMap<String, Vec<String>> = HashMap::new();
786    for (path, _, lang) in file_entries {
787        if *lang != Language::TypeScript {
788            continue;
789        }
790        if path.contains(".test.") || path.contains(".spec.") || path.contains("__tests__/") {
791            continue;
792        }
793        let stem = path
794            .trim_end_matches(".ts")
795            .trim_end_matches(".tsx")
796            .trim_end_matches(".js")
797            .trim_end_matches(".jsx");
798        let file_id = format!("file:{}", path);
799        source_stems.insert(stem.to_string(), file_id.clone());
800        // Also index by basename for fallback matching
801        let basename = stem.rsplit('/').next().unwrap_or(stem);
802        source_basenames
803            .entry(basename.to_string())
804            .or_default()
805            .push(file_id);
806    }
807
808    for (path, _, lang) in file_entries {
809        if *lang != Language::TypeScript {
810            continue;
811        }
812        let is_test = path.contains(".test.") || path.contains(".spec.") || path.contains("__tests__/");
813        if !is_test {
814            continue;
815        }
816
817        let test_file_id = format!("file:{}", path);
818
819        // "src/auth.test.ts" → "src/auth", "src/auth.spec.ts" → "src/auth"
820        // "__tests__/auth.test.ts" → "auth"
821        let source_stem = path
822            .replace(".test.", ".")
823            .replace(".spec.", ".")
824            .replace("__tests__/", "")
825            .trim_end_matches(".ts")
826            .trim_end_matches(".tsx")
827            .trim_end_matches(".js")
828            .trim_end_matches(".jsx")
829            .to_string();
830
831        if let Some(source_id) = source_stems.get(&source_stem) {
832            edges.push(CodeEdge::new_heuristic(
833                &test_file_id,
834                source_id,
835                EdgeRelation::TestsFor,
836                0.8, // naming convention match
837            ));
838        } else {
839            // Fallback: strip common test directory prefixes and match by basename
840            let stripped = source_stem
841                .strip_prefix("tests/")
842                .or_else(|| source_stem.strip_prefix("test/"))
843                .or_else(|| source_stem.strip_prefix("__tests__/"))
844                .unwrap_or(&source_stem);
845            let test_basename = stripped.rsplit('/').next().unwrap_or(stripped);
846
847            if let Some(source_ids) = source_basenames.get(test_basename) {
848                // If unique match, use it. If ambiguous, take first (alphabetically).
849                if let Some(source_id) = source_ids.first() {
850                    edges.push(CodeEdge::new_heuristic(
851                        &test_file_id,
852                        source_id,
853                        EdgeRelation::TestsFor,
854                        0.6, // lower confidence for basename-only match
855                    ));
856                }
857            }
858        }
859    }
860
861    edges
862}
863
864/// Generate TestsFor edges for Python test files using naming conventions.
865/// Matches: test_auth.py → auth.py, tests/test_auth.py → auth.py
866fn generate_python_tests_for_edges(file_entries: &[(String, String, Language)]) -> Vec<CodeEdge> {
867    let mut edges = Vec::new();
868
869    // Collect source files (non-test)
870    let mut source_stems: HashMap<String, String> = HashMap::new();
871    for (path, _, lang) in file_entries {
872        if *lang != Language::Python {
873            continue;
874        }
875        if path.starts_with("tests/") || path.contains("/tests/") {
876            continue;
877        }
878        let name = path.rsplit('/').next().unwrap_or(path);
879        if name.starts_with("test_") || name.starts_with("conftest") {
880            continue;
881        }
882        // stem: "src/auth.py" → "auth", "auth/middleware.py" → "auth/middleware"
883        let stem = path.trim_end_matches(".py");
884        // Remove leading src/ if present
885        let stem = stem.strip_prefix("src/").unwrap_or(stem);
886        source_stems.insert(stem.to_string(), format!("file:{}", path));
887        // Also register just the filename stem for simple matching
888        if let Some(basename) = stem.rsplit('/').next() {
889            source_stems.entry(basename.to_string()).or_insert_with(|| format!("file:{}", path));
890        }
891    }
892
893    // Find test files and match to source
894    for (path, _, lang) in file_entries {
895        if *lang != Language::Python {
896            continue;
897        }
898        let name = path.rsplit('/').next().unwrap_or(path);
899        if !name.starts_with("test_") && !path.starts_with("tests/") && !path.contains("/tests/") {
900            continue;
901        }
902
903        let test_file_id = format!("file:{}", path);
904
905        // Extract test stem: "tests/test_auth.py" → "auth", "test_auth.py" → "auth"
906        let raw = path
907            .strip_prefix("tests/")
908            .or_else(|| path.rsplit_once("/tests/").map(|(_, rest)| rest))
909            .unwrap_or(path);
910        let basename = raw.rsplit('/').next().unwrap_or(raw);
911        let test_stem = basename
912            .trim_end_matches(".py")
913            .strip_prefix("test_")
914            .unwrap_or(basename.trim_end_matches(".py"));
915
916        if let Some(source_id) = source_stems.get(test_stem) {
917            edges.push(CodeEdge::new_heuristic(
918                &test_file_id,
919                source_id,
920                EdgeRelation::TestsFor,
921                0.8, // naming convention match
922            ));
923        }
924    }
925
926    edges
927}
928
929// ═══ Public test accessors for ISS-009 helpers ═══
930
/// Test-only entry point that forwards to the private `generate_module_nodes`
/// and returns its `(nodes, edges)` pair unchanged.
#[cfg(test)]
pub fn generate_module_nodes_pub(
    file_entries: &[(String, String, Language)],
) -> (Vec<CodeNode>, Vec<CodeEdge>) {
    generate_module_nodes(file_entries)
}
936
/// Test-only entry point that forwards to the private
/// `generate_file_to_module_edges` and returns its edges unchanged.
#[cfg(test)]
pub fn generate_file_to_module_edges_pub(
    file_entries: &[(String, String, Language)],
) -> Vec<CodeEdge> {
    generate_file_to_module_edges(file_entries)
}
942
/// Test-only entry point that forwards to the private
/// `generate_rust_tests_for_edges` and returns its edges unchanged.
#[cfg(test)]
pub fn generate_rust_tests_for_edges_pub(
    file_entries: &[(String, String, Language)],
) -> Vec<CodeEdge> {
    generate_rust_tests_for_edges(file_entries)
}
948
/// Test-only entry point that forwards to the private
/// `generate_ts_tests_for_edges` and returns its edges unchanged.
#[cfg(test)]
pub fn generate_ts_tests_for_edges_pub(
    file_entries: &[(String, String, Language)],
) -> Vec<CodeEdge> {
    generate_ts_tests_for_edges(file_entries)
}
954
/// Test-only entry point that forwards to the private
/// `generate_python_tests_for_edges` and returns its edges unchanged.
#[cfg(test)]
pub fn generate_python_tests_for_edges_pub(
    file_entries: &[(String, String, Language)],
) -> Vec<CodeEdge> {
    generate_python_tests_for_edges(file_entries)
}
960
961impl CodeGraph {
962    /// Extract with per-repo cache. Cache key = repo_name + base_commit.
963    /// If a cached graph exists on disk, returns it instantly.
964    /// Otherwise extracts fresh and saves to cache.
965    pub fn extract_cached(repo_dir: &Path, repo_name: &str, base_commit: &str) -> Self {
966        let cache_dir = repo_dir.parent().unwrap_or(repo_dir).join(".graph-cache");
967        let _ = std::fs::create_dir_all(&cache_dir);
968
969        // Cache key: sanitized repo name + first 8 chars of commit
970        let safe_repo = repo_name.replace('/', "__");
971        let short_commit = &base_commit[..base_commit.len().min(8)];
972        let cache_file = cache_dir.join(format!("{}__{}.json", safe_repo, short_commit));
973
974        // Try to load from cache
975        if cache_file.exists() {
976            if let Ok(data) = std::fs::read_to_string(&cache_file) {
977                if let Ok(mut graph) = serde_json::from_str::<CodeGraph>(&data) {
978                    graph.build_indexes();
979                    tracing::info!(
980                        "Loaded code graph from cache: {} ({} nodes, {} edges)",
981                        cache_file.display(),
982                        graph.nodes.len(),
983                        graph.edges.len()
984                    );
985                    return graph;
986                }
987            }
988            // Cache corrupt, delete and re-extract
989            let _ = std::fs::remove_file(&cache_file);
990        }
991
992        // Extract fresh
993        let graph = Self::extract_from_dir(repo_dir);
994
995        // Save to cache (best-effort, don't fail if write fails)
996        if let Ok(json) = serde_json::to_string(&graph) {
997            let _ = std::fs::write(&cache_file, json);
998            tracing::info!(
999                "Saved code graph to cache: {} ({} nodes, {} edges)",
1000                cache_file.display(),
1001                graph.nodes.len(),
1002                graph.edges.len()
1003            );
1004        }
1005
1006        graph
1007    }
1008
1009    /// Extract code graph from a directory.
1010    pub fn extract_from_dir(dir: &Path) -> Self {
1011        let mut state = ExtractState::default();
1012
1013        // First pass: collect files and build module map
1014        let file_entries = collect_source_files(dir, &mut state.module_map);
1015
1016        // Generate module nodes from directory structure (ISS-009)
1017        let (module_nodes, module_edges) = generate_module_nodes(&file_entries);
1018        state.nodes.extend(module_nodes);
1019        state.edges.extend(module_edges);
1020
1021        // Generate file → module belongs_to edges (ISS-009)
1022        // These reference file:X nodes which are created in integrate_file_results below,
1023        // but edges can reference forward — they're resolved at index build time.
1024        let file_module_edges = generate_file_to_module_edges(&file_entries);
1025
1026        // Generate TestsFor edges from naming conventions (ISS-009)
1027        let rust_test_edges = generate_rust_tests_for_edges(&file_entries);
1028        let ts_test_edges = generate_ts_tests_for_edges(&file_entries);
1029        let python_test_edges = generate_python_tests_for_edges(&file_entries);
1030
1031        // Second pass: parse each file
1032        let mut parser = Parser::new();
1033        let python_language = tree_sitter_python::LANGUAGE;
1034        parser.set_language(&python_language.into()).ok();
1035
1036        for (rel_path, content, lang) in &file_entries {
1037            if let Some(result) = parse_single_file(rel_path, content, lang, &mut parser, &mut state.class_map) {
1038                integrate_file_results(&mut state, rel_path, result);
1039            }
1040        }
1041
1042        // Build helper maps for call extraction
1043        let (class_init_map, node_pkg_map) = build_call_extraction_maps(&state);
1044
1045        // Third pass: extract call edges
1046        // Take edges out to avoid simultaneous immutable borrow of `state` + mutable borrow of `state.edges`
1047        let mut edges = std::mem::take(&mut state.edges);
1048        for (rel_path, content, lang) in &file_entries {
1049            extract_calls_for_file(
1050                rel_path, content, lang, &mut parser, &state,
1051                &class_init_map, &node_pkg_map, &state.module_map, &mut edges,
1052            );
1053        }
1054        // Add cross-layer edges (ISS-009)
1055        edges.extend(file_module_edges);
1056        edges.extend(rust_test_edges);
1057        edges.extend(ts_test_edges);
1058        edges.extend(python_test_edges);
1059        state.edges = edges;
1060
1061        // Resolve placeholder references
1062        let resolved = resolve_references(
1063            state.edges,
1064            &state.class_map,
1065            &state.func_map,
1066            &state.module_map,
1067        );
1068
1069        // Deduplicate and finalize
1070        let mut final_edges = dedup_and_finalize_edges(resolved, &state.nodes);
1071
1072        // Remove phantom file nodes — files that don't exist on disk (ISS-007)
1073        let valid_file_paths: HashSet<&str> = file_entries.iter().map(|(p, _, _)| p.as_str()).collect();
1074        remove_phantom_nodes(&mut state.nodes, &mut final_edges, &valid_file_paths);
1075
1076        let mut graph = CodeGraph {
1077            nodes: state.nodes,
1078            edges: final_edges,
1079            outgoing: HashMap::new(),
1080            incoming: HashMap::new(),
1081            node_index: HashMap::new(),
1082        };
1083        graph.build_indexes();
1084        graph
1085    }
1086
1087    /// Incremental extraction: only re-parse changed files.
1088    /// Falls back to full extraction if no prior metadata exists or if force=true.
1089    ///
1090    /// Returns the updated CodeGraph and an ExtractReport describing what changed.
1091    pub fn extract_incremental(
1092        dir: &Path,
1093        gid_dir: &Path,
1094        meta_path: &Path,
1095        force: bool,
1096    ) -> anyhow::Result<(Self, ExtractReport)> {
1097        let start = Instant::now();
1098
1099        // If force, do a full rebuild
1100        if force {
1101            tracing::info!("Force flag set, performing full rebuild");
1102            return Self::do_full_rebuild(dir, gid_dir, meta_path, start);
1103        }
1104
1105        // Try to load existing metadata
1106        let metadata = match Self::load_metadata(meta_path) {
1107            Some(meta) => {
1108                if meta.version != EXTRACT_META_VERSION {
1109                    tracing::info!(
1110                        "Metadata version mismatch (got {}, expected {}), performing full rebuild",
1111                        meta.version, EXTRACT_META_VERSION
1112                    );
1113                    return Self::do_full_rebuild(dir, gid_dir, meta_path, start);
1114                }
1115                meta
1116            }
1117            None => {
1118                tracing::info!("No prior metadata found, performing full rebuild");
1119                return Self::do_full_rebuild(dir, gid_dir, meta_path, start);
1120            }
1121        };
1122
1123        // Try to load existing graph: first graph.yml, then code-graph.json (migration)
1124        let graph_yml_path = gid_dir.join("graph.yml");
1125        let json_path = gid_dir.join("code-graph.json");
1126        let existing_graph = match Self::load_from_graph_yml(&graph_yml_path) {
1127            Some(g) => g,
1128            None => match Self::load_graph_json(&json_path) {
1129                Some(g) => {
1130                    tracing::info!("Loaded graph from code-graph.json (migration fallback)");
1131                    g
1132                }
1133                None => {
1134                    tracing::info!("No prior graph found, performing full rebuild");
1135                    return Self::do_full_rebuild(dir, gid_dir, meta_path, start);
1136                }
1137            },
1138        };
1139
1140        // Collect current files
1141        let mut module_map: HashMap<String, String> = HashMap::new();
1142        let file_entries = collect_source_files(dir, &mut module_map);
1143
1144        // Compute delta using content hash (mtime is checked first for speed)
1145        let delta = compute_file_delta_with_mtime(dir, &file_entries, &metadata);
1146
1147        tracing::info!(
1148            "File delta: {} added, {} modified, {} deleted, {} unchanged",
1149            delta.added.len(), delta.modified.len(), delta.deleted.len(), delta.unchanged.len()
1150        );
1151
1152        // If no changes, return existing graph
1153        if delta.is_empty() {
1154            let report = ExtractReport {
1155                added: 0,
1156                modified: 0,
1157                deleted: 0,
1158                unchanged: delta.unchanged.len(),
1159                full_rebuild: false,
1160                duration_ms: start.elapsed().as_millis() as u64,
1161            };
1162            return Ok((existing_graph, report));
1163        }
1164
1165        // Phase 1: Remove stale data from deleted/modified files
1166        let changed_files: HashSet<&str> = delta.modified.iter()
1167            .chain(delta.deleted.iter())
1168            .map(|s| s.as_str())
1169            .collect();
1170
1171        let mut graph = existing_graph;
1172
1173        // Collect stale node IDs from deleted/modified files
1174        let mut stale_node_ids: HashSet<String> = HashSet::new();
1175        for file_path in &changed_files {
1176            if let Some(file_state) = metadata.files.get(*file_path) {
1177                for node_id in &file_state.node_ids {
1178                    stale_node_ids.insert(node_id.clone());
1179                }
1180            }
1181            // Also remove the file node itself
1182            stale_node_ids.insert(format!("file:{}", file_path));
1183        }
1184
1185        // Remove stale nodes and their edges
1186        graph.nodes.retain(|n| !stale_node_ids.contains(&n.id));
1187        graph.edges.retain(|e| {
1188            !stale_node_ids.contains(&e.from) && !stale_node_ids.contains(&e.to)
1189        });
1190
1191        // Dangling edge cleanup: remove edges pointing to non-existent nodes
1192        let valid_node_ids: HashSet<&str> = graph.nodes.iter().map(|n| n.id.as_str()).collect();
1193        graph.edges.retain(|e| {
1194            valid_node_ids.contains(e.from.as_str()) && valid_node_ids.contains(e.to.as_str())
1195        });
1196
1197        tracing::debug!(
1198            "After stale removal: {} nodes, {} edges",
1199            graph.nodes.len(), graph.edges.len()
1200        );
1201
1202        // Phase 2: Parse only added/modified files
1203        let files_to_parse: HashSet<&str> = delta.added.iter()
1204            .chain(delta.modified.iter())
1205            .map(|s| s.as_str())
1206            .collect();
1207
1208        // Build state from existing graph nodes for reference resolution
1209        let mut state = ExtractState::default();
1210        state.module_map = module_map;
1211
1212        // Populate maps from existing (unchanged) nodes
1213        for node in &graph.nodes {
1214            if node.kind == NodeKind::Class {
1215                state.class_map.insert(node.name.clone(), node.id.clone());
1216            } else if node.kind == NodeKind::Function {
1217                state.func_map
1218                    .entry(node.name.clone())
1219                    .or_default()
1220                    .push(node.id.clone());
1221            }
1222        }
1223
1224        // Populate method_to_class and class_methods from existing edges
1225        for edge in &graph.edges {
1226            if edge.relation == EdgeRelation::DefinedIn {
1227                if edge.from.starts_with("method:") && edge.to.starts_with("class:") {
1228                    state.method_to_class.insert(edge.from.clone(), edge.to.clone());
1229                    state.class_methods
1230                        .entry(edge.to.clone())
1231                        .or_default()
1232                        .push(edge.from.clone());
1233                }
1234            }
1235            if edge.relation == EdgeRelation::Inherits {
1236                if let Some(parent_id) = state.class_map.get(
1237                    edge.to.strip_prefix("class_ref:").unwrap_or(&edge.to),
1238                ) {
1239                    state.class_parents
1240                        .entry(edge.from.clone())
1241                        .or_default()
1242                        .push(parent_id.clone());
1243                }
1244            }
1245        }
1246
1247        // Parse changed files
1248        let mut parser = Parser::new();
1249        parser.set_language(&tree_sitter_python::LANGUAGE.into()).ok();
1250
1251        // Track per-file node IDs for metadata
1252        let mut new_file_states: HashMap<String, FileState> = HashMap::new();
1253
1254        for (rel_path, content, lang) in &file_entries {
1255            if !files_to_parse.contains(rel_path.as_str()) {
1256                continue;
1257            }
1258
1259            if let Some(result) = parse_single_file(rel_path, content, lang, &mut parser, &mut state.class_map) {
1260                let node_ids: Vec<String> = result.nodes.iter().map(|n| n.id.clone()).collect();
1261                let node_ids_with_file = {
1262                    let mut ids = vec![format!("file:{}", rel_path)];
1263                    ids.extend(node_ids);
1264                    ids
1265                };
1266
1267                integrate_file_results(&mut state, rel_path, result);
1268
1269                // We'll compute edge_count after call extraction
1270                let mtime = get_file_mtime(dir, rel_path);
1271                let content_hash = xxh64(content.as_bytes(), 0);
1272                new_file_states.insert(rel_path.clone(), FileState {
1273                    mtime,
1274                    content_hash,
1275                    node_ids: node_ids_with_file,
1276                    edge_count: 0,
1277                });
1278            }
1279        }
1280
1281        // Merge new nodes into graph
1282        graph.nodes.extend(state.nodes.drain(..));
1283
1284        // Re-populate maps from ALL nodes (existing + new) for reference resolution
1285        state.class_map.clear();
1286        state.func_map.clear();
1287        state.method_to_class.clear();
1288        state.class_methods.clear();
1289        state.class_parents.clear();
1290
1291        for node in &graph.nodes {
1292            if node.kind == NodeKind::Class {
1293                state.class_map.insert(node.name.clone(), node.id.clone());
1294            } else if node.kind == NodeKind::Function {
1295                state.func_map
1296                    .entry(node.name.clone())
1297                    .or_default()
1298                    .push(node.id.clone());
1299            }
1300        }
1301
1302        // Rebuild method_to_class etc from all edges (existing + newly added file edges)
1303        let all_edges_for_maps: Vec<&CodeEdge> = graph.edges.iter()
1304            .chain(state.edges.iter())
1305            .collect();
1306
1307        for edge in &all_edges_for_maps {
1308            if edge.relation == EdgeRelation::DefinedIn {
1309                if edge.from.starts_with("method:") && edge.to.starts_with("class:") {
1310                    state.method_to_class.insert(edge.from.clone(), edge.to.clone());
1311                    state.class_methods
1312                        .entry(edge.to.clone())
1313                        .or_default()
1314                        .push(edge.from.clone());
1315                }
1316            }
1317            if edge.relation == EdgeRelation::Inherits {
1318                if let Some(parent_id) = state.class_map.get(
1319                    edge.to.strip_prefix("class_ref:").unwrap_or(&edge.to),
1320                ) {
1321                    state.class_parents
1322                        .entry(edge.from.clone())
1323                        .or_default()
1324                        .push(parent_id.clone());
1325                }
1326            }
1327        }
1328
1329        // Populate file_imported_names from both existing unchanged files and newly parsed
1330        // For unchanged files, we need to re-read their imports (they're not stored in metadata)
1331        // Actually, for the call extraction pass, we only extract calls for CHANGED files,
1332        // and those files' imports are already in state.file_imported_names
1333        // Unchanged files' existing call edges are already in the graph.
1334
1335        // Build helper maps for call extraction
1336        // Note: We need nodes from BOTH the existing graph and new state
1337        // Temporarily set state.nodes to all graph nodes for building maps
1338        let saved_nodes = std::mem::take(&mut state.nodes);
1339        state.nodes = graph.nodes.clone();
1340        let (class_init_map, node_pkg_map) = build_call_extraction_maps(&state);
1341        state.nodes = saved_nodes;
1342
1343        // Phase 2b: Extract call edges for changed files only
1344        let mut new_call_edges: Vec<CodeEdge> = Vec::new();
1345        for (rel_path, content, lang) in &file_entries {
1346            if !files_to_parse.contains(rel_path.as_str()) {
1347                continue;
1348            }
1349            extract_calls_for_file(
1350                rel_path, content, lang, &mut parser, &state,
1351                &class_init_map, &node_pkg_map, &state.module_map,
1352                &mut new_call_edges,
1353            );
1354        }
1355
1356        // Count edges per file for metadata
1357        for edge in &new_call_edges {
1358            // Determine which file this edge belongs to by looking at the source node's file
1359            let source_file = graph.nodes.iter()
1360                .find(|n| n.id == edge.from)
1361                .map(|n| n.file_path.clone());
1362            if let Some(fp) = source_file {
1363                if let Some(fs) = new_file_states.get_mut(&fp) {
1364                    fs.edge_count += 1;
1365                }
1366            }
1367        }
1368
1369        // Phase 3: Merge new edges and resolve references
1370        let mut all_new_edges = state.edges;
1371        all_new_edges.extend(new_call_edges);
1372
1373        let resolved_new = resolve_references(
1374            all_new_edges,
1375            &state.class_map,
1376            &state.func_map,
1377            &state.module_map,
1378        );
1379
1380        // Add resolved new edges to existing graph edges
1381        graph.edges.extend(resolved_new);
1382
1383        // Deduplicate and finalize ALL edges
1384        let final_edges = dedup_and_finalize_edges(graph.edges, &graph.nodes);
1385        graph.edges = final_edges;
1386
1387        // Remove phantom file nodes — files that don't exist on disk (ISS-007)
1388        let valid_file_paths: HashSet<&str> = file_entries.iter().map(|(p, _, _)| p.as_str()).collect();
1389        remove_phantom_nodes(&mut graph.nodes, &mut graph.edges, &valid_file_paths);
1390
1391        // Rebuild indexes
1392        graph.outgoing.clear();
1393        graph.incoming.clear();
1394        graph.node_index.clear();
1395        graph.build_indexes();
1396
1397        // Phase 5: Update metadata (caller writes graph.yml)
1398
1399        // Build updated metadata
1400        let mut new_metadata = ExtractMetadata {
1401            version: EXTRACT_META_VERSION,
1402            updated_at: chrono::Utc::now().to_rfc3339(),
1403            files: HashMap::new(),
1404        };
1405
1406        // Copy unchanged file states from old metadata
1407        for path in &delta.unchanged {
1408            if let Some(old_state) = metadata.files.get(path) {
1409                new_metadata.files.insert(path.clone(), old_state.clone());
1410            }
1411        }
1412
1413        // Add new/modified file states
1414        for (path, file_state) in new_file_states {
1415            new_metadata.files.insert(path, file_state);
1416        }
1417
1418        // Save metadata
1419        Self::save_metadata(meta_path, &new_metadata);
1420
1421        let report = ExtractReport {
1422            added: delta.added.len(),
1423            modified: delta.modified.len(),
1424            deleted: delta.deleted.len(),
1425            unchanged: delta.unchanged.len(),
1426            full_rebuild: false,
1427            duration_ms: start.elapsed().as_millis() as u64,
1428        };
1429
1430        tracing::info!("{}", report);
1431
1432        Ok((graph, report))
1433    }
1434
1435    /// Full rebuild with metadata generation.
1436    fn do_full_rebuild(
1437        dir: &Path,
1438        _gid_dir: &Path,
1439        meta_path: &Path,
1440        start: Instant,
1441    ) -> anyhow::Result<(Self, ExtractReport)> {
1442        let mut state = ExtractState::default();
1443
1444        // First pass: collect files and build module map
1445        let file_entries = collect_source_files(dir, &mut state.module_map);
1446        let total_files = file_entries.len();
1447
1448        // Generate module nodes from directory structure (ISS-009)
1449        let (module_nodes, module_edges) = generate_module_nodes(&file_entries);
1450        state.nodes.extend(module_nodes);
1451        state.edges.extend(module_edges);
1452
1453        // Generate file → module belongs_to edges (ISS-009)
1454        let file_module_edges = generate_file_to_module_edges(&file_entries);
1455
1456        // Generate TestsFor edges from naming conventions (ISS-009)
1457        let rust_test_edges = generate_rust_tests_for_edges(&file_entries);
1458        let ts_test_edges = generate_ts_tests_for_edges(&file_entries);
1459        let python_test_edges = generate_python_tests_for_edges(&file_entries);
1460
1461        // Second pass: parse each file
1462        let mut parser = Parser::new();
1463        parser.set_language(&tree_sitter_python::LANGUAGE.into()).ok();
1464
1465        let mut per_file_node_ids: HashMap<String, Vec<String>> = HashMap::new();
1466
1467        for (rel_path, content, lang) in &file_entries {
1468            if let Some(result) = parse_single_file(rel_path, content, lang, &mut parser, &mut state.class_map) {
1469                let mut node_ids: Vec<String> = result.nodes.iter().map(|n| n.id.clone()).collect();
1470                // Always include the file node — integrate_file_results creates it unconditionally
1471                node_ids.insert(0, format!("file:{}", rel_path));
1472                per_file_node_ids.insert(rel_path.clone(), node_ids);
1473                integrate_file_results(&mut state, rel_path, result);
1474            }
1475        }
1476
1477        // Build helper maps for call extraction
1478        let (class_init_map, node_pkg_map) = build_call_extraction_maps(&state);
1479
1480        // Third pass: extract call edges
1481        // Take edges out to avoid simultaneous immutable borrow of `state` + mutable borrow of `state.edges`
1482        let mut edges = std::mem::take(&mut state.edges);
1483        for (rel_path, content, lang) in &file_entries {
1484            extract_calls_for_file(
1485                rel_path, content, lang, &mut parser, &state,
1486                &class_init_map, &node_pkg_map, &state.module_map, &mut edges,
1487            );
1488        }
1489        // Add cross-layer edges (ISS-009)
1490        edges.extend(file_module_edges);
1491        edges.extend(rust_test_edges);
1492        edges.extend(ts_test_edges);
1493        edges.extend(python_test_edges);
1494        state.edges = edges;
1495
1496        // Resolve, dedup, finalize
1497        let resolved = resolve_references(
1498            state.edges,
1499            &state.class_map,
1500            &state.func_map,
1501            &state.module_map,
1502        );
1503        let mut final_edges = dedup_and_finalize_edges(resolved, &state.nodes);
1504
1505        // Remove phantom file nodes — files that don't exist on disk (ISS-007)
1506        let valid_file_paths: HashSet<&str> = file_entries.iter().map(|(p, _, _)| p.as_str()).collect();
1507        remove_phantom_nodes(&mut state.nodes, &mut final_edges, &valid_file_paths);
1508
1509        let mut graph = CodeGraph {
1510            nodes: state.nodes,
1511            edges: final_edges,
1512            outgoing: HashMap::new(),
1513            incoming: HashMap::new(),
1514            node_index: HashMap::new(),
1515        };
1516        graph.build_indexes();
1517
1518        // Build and save metadata (caller writes graph.yml)
1519        let mut metadata = ExtractMetadata {
1520            version: EXTRACT_META_VERSION,
1521            updated_at: chrono::Utc::now().to_rfc3339(),
1522            files: HashMap::new(),
1523        };
1524
1525        for (rel_path, content, _lang) in &file_entries {
1526            let mtime = get_file_mtime(dir, rel_path);
1527            let content_hash = xxh64(content.as_bytes(), 0);
1528            let node_ids = per_file_node_ids.get(rel_path).cloned().unwrap_or_default();
1529
1530            // Count edges originating from nodes in this file
1531            let file_node_ids: HashSet<&str> = node_ids.iter().map(|s| s.as_str()).collect();
1532            let edge_count = graph.edges.iter()
1533                .filter(|e| file_node_ids.contains(e.from.as_str()))
1534                .count();
1535
1536            metadata.files.insert(rel_path.clone(), FileState {
1537                mtime,
1538                content_hash,
1539                node_ids,
1540                edge_count,
1541            });
1542        }
1543
1544        Self::save_metadata(meta_path, &metadata);
1545
1546        let report = ExtractReport {
1547            added: total_files,
1548            modified: 0,
1549            deleted: 0,
1550            unchanged: 0,
1551            full_rebuild: true,
1552            duration_ms: start.elapsed().as_millis() as u64,
1553        };
1554
1555        tracing::info!("{}", report);
1556
1557        Ok((graph, report))
1558    }
1559
1560    /// Load extract metadata from disk.
1561    fn load_metadata(meta_path: &Path) -> Option<ExtractMetadata> {
1562        let data = std::fs::read_to_string(meta_path).ok()?;
1563        serde_json::from_str(&data).ok()
1564    }
1565
1566    /// Save extract metadata to disk.
1567    fn save_metadata(meta_path: &Path, metadata: &ExtractMetadata) {
1568        if let Some(parent) = meta_path.parent() {
1569            let _ = std::fs::create_dir_all(parent);
1570        }
1571        if let Ok(json) = serde_json::to_string_pretty(metadata) {
1572            if let Err(e) = std::fs::write(meta_path, json) {
1573                tracing::warn!("Failed to save extract metadata: {}", e);
1574            }
1575        }
1576    }
1577
1578    /// Load a CodeGraph from graph.yml by converting code-layer nodes.
1579    fn load_from_graph_yml(graph_yml_path: &Path) -> Option<Self> {
1580        let data = std::fs::read_to_string(graph_yml_path).ok()?;
1581        let graph: Graph = serde_yaml::from_str(&data).ok()?;
1582        let cg = graph_to_codegraph(&graph);
1583        if cg.nodes.is_empty() {
1584            return None;
1585        }
1586        Some(cg)
1587    }
1588
1589    /// Load a graph from JSON format (migration fallback for old projects).
1590    fn load_graph_json(graph_path: &Path) -> Option<Self> {
1591        let data = std::fs::read_to_string(graph_path).ok()?;
1592        let mut graph: Self = serde_json::from_str(&data).ok()?;
1593        graph.build_indexes();
1594        Some(graph)
1595    }
1596
1597    /// Save graph as JSON.
1598    fn save_graph_json(graph_path: &Path, graph: &Self) {
1599        if let Some(parent) = graph_path.parent() {
1600            let _ = std::fs::create_dir_all(parent);
1601        }
1602        if let Ok(json) = serde_json::to_string(graph) {
1603            if let Err(e) = std::fs::write(graph_path, json) {
1604                tracing::warn!("Failed to save graph: {}", e);
1605            }
1606        }
1607    }
1608}
1609
1610/// Compute file delta with mtime-first, hash-second strategy.
1611fn compute_file_delta_with_mtime(
1612    dir: &Path,
1613    current_files: &[(String, String, Language)],
1614    metadata: &ExtractMetadata,
1615) -> FileDelta {
1616    let mut delta = FileDelta::default();
1617
1618    let current_paths: HashSet<&str> = current_files.iter().map(|(p, _, _)| p.as_str()).collect();
1619
1620    for (rel_path, content, _lang) in current_files {
1621        if let Some(stored) = metadata.files.get(rel_path.as_str()) {
1622            // File exists in both — check if changed
1623            let content_hash = xxh64(content.as_bytes(), 0);
1624            let mtime = get_file_mtime(dir, rel_path);
1625            if mtime == stored.mtime && content_hash == stored.content_hash {
1626                // Both mtime and content match — definitely unchanged
1627                delta.unchanged.push(rel_path.clone());
1628            } else if content_hash == stored.content_hash {
1629                // Content same despite mtime change (e.g. touch)
1630                delta.unchanged.push(rel_path.clone());
1631            } else {
1632                delta.modified.push(rel_path.clone());
1633            }
1634        } else {
1635            // New file
1636            delta.added.push(rel_path.clone());
1637        }
1638    }
1639
1640    // Find deleted files
1641    for stored_path in metadata.files.keys() {
1642        if !current_paths.contains(stored_path.as_str()) {
1643            delta.deleted.push(stored_path.clone());
1644        }
1645    }
1646
1647    delta
1648}