1pub mod ast_extract;
11pub mod dedup;
12pub mod lang_config;
13pub mod parser;
14pub mod semantic;
15pub mod treesitter;
16
17use std::collections::{HashMap, HashSet};
18use std::path::{Path, PathBuf};
19
20use graphify_core::confidence::Confidence;
21use graphify_core::model::{ExtractionResult, GraphEdge, NodeType};
22use rayon::prelude::*;
23use tracing::{debug, info, warn};
24
/// Extension-to-language dispatch table.
///
/// Each entry pairs a file extension (including its leading dot) with the
/// language identifier handed to the extractors. Several extensions may map
/// to the same language identifier.
pub const DISPATCH: &[(&str, &str)] = &[
    (".py", "python"),
    (".js", "javascript"),
    (".jsx", "javascript"),
    (".ts", "typescript"),
    (".tsx", "typescript"),
    (".go", "go"),
    (".rs", "rust"),
    (".java", "java"),
    (".c", "c"),
    (".h", "c"),
    (".cpp", "cpp"),
    (".cc", "cpp"),
    (".cxx", "cpp"),
    (".hpp", "cpp"),
    (".rb", "ruby"),
    (".cs", "csharp"),
    (".kt", "kotlin"),
    (".kts", "kotlin"),
    (".scala", "scala"),
    (".php", "php"),
    (".swift", "swift"),
    (".lua", "lua"),
    (".toc", "lua"),
    (".zig", "zig"),
    (".ps1", "powershell"),
    (".ex", "elixir"),
    (".exs", "elixir"),
    (".m", "objc"),
    (".mm", "objc"),
    (".jl", "julia"),
    (".dart", "dart"),
];
59
60fn dispatch_map() -> &'static HashMap<&'static str, &'static str> {
62 static MAP: std::sync::LazyLock<HashMap<&str, &str>> =
63 std::sync::LazyLock::new(|| DISPATCH.iter().copied().collect());
64 &MAP
65}
66
67pub fn language_for_path(path: &Path) -> Option<&'static str> {
69 let ext = path.extension()?.to_str()?;
70 dispatch_map().get(&*format!(".{ext}")).copied()
71}
72
73pub fn collect_files(target: &Path) -> Vec<PathBuf> {
75 let map = dispatch_map();
76 let mut files = Vec::new();
77 collect_files_inner(target, map, &mut files);
78 files.sort();
79 files
80}
81
82fn collect_files_inner(dir: &Path, map: &HashMap<&str, &str>, out: &mut Vec<PathBuf>) {
83 let entries = match std::fs::read_dir(dir) {
84 Ok(e) => e,
85 Err(e) => {
86 warn!("cannot read directory {}: {e}", dir.display());
87 return;
88 }
89 };
90 for entry in entries.flatten() {
91 let path = entry.path();
92 if path.is_dir() {
93 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
94 if name.starts_with('.')
95 || name == "node_modules"
96 || name == "__pycache__"
97 || name == "target"
98 || name == "vendor"
99 || name == "venv"
100 || name == ".git"
101 {
102 continue;
103 }
104 collect_files_inner(&path, map, out);
105 } else if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
106 let dotted = format!(".{ext}");
107 if map.contains_key(dotted.as_str()) {
108 out.push(path);
109 }
110 }
111 }
112}
113
114pub fn extract(paths: &[PathBuf]) -> ExtractionResult {
122 let results: Vec<ExtractionResult> = paths
123 .par_iter()
124 .filter_map(|path| {
125 let lang = if let Some(l) = language_for_path(path) {
126 l
127 } else {
128 debug!("skipping unsupported file: {}", path.display());
129 return None;
130 };
131
132 let source = match std::fs::read(path) {
133 Ok(s) => s,
134 Err(e) => {
135 warn!("cannot read {}: {e}", path.display());
136 return None;
137 }
138 };
139
140 debug!("extracting {} ({})", path.display(), lang);
141
142 let mut result = if let Some(ts_result) = treesitter::try_extract(path, &source, lang) {
143 debug!("used tree-sitter for {} ({})", path.display(), lang);
144 ts_result
145 } else {
146 let source_str = String::from_utf8_lossy(&source);
147 ast_extract::extract_file(path, source_str.as_ref(), lang)
148 };
149 dedup::dedup_file(&mut result);
150
151 Some(result)
152 })
153 .collect();
154
155 let mut combined = ExtractionResult::default();
156 for r in results {
157 combined.nodes.extend(r.nodes);
158 combined.edges.extend(r.edges);
159 combined.hyperedges.extend(r.hyperedges);
160 }
161
162 resolve_python_imports(&mut combined);
163
164 resolve_cross_file_imports(&mut combined);
165
166 resolve_cross_file_calls(&mut combined);
167
168 info!(
169 "extraction complete: {} nodes, {} edges",
170 combined.nodes.len(),
171 combined.edges.len()
172 );
173
174 combined
175}
176
177fn resolve_python_imports(result: &mut ExtractionResult) {
182 let label_to_ids: HashMap<String, Vec<(String, String)>> = {
183 let mut map: HashMap<String, Vec<(String, String)>> = HashMap::new();
184 for n in &result.nodes {
185 map.entry(n.label.clone())
186 .or_default()
187 .push((n.id.clone(), n.source_file.clone()));
188 }
189 map
190 };
191
192 let mut stem_to_entity_ids: HashMap<String, Vec<String>> = HashMap::new();
193 let defined_targets: HashSet<String> = result
194 .edges
195 .iter()
196 .filter(|e| e.relation == "defines")
197 .map(|e| e.target.clone())
198 .collect();
199 for node in &result.nodes {
200 if !defined_targets.contains(&node.id) {
201 continue;
202 }
203 let stem = std::path::Path::new(&node.source_file)
204 .file_stem()
205 .and_then(|s| s.to_str())
206 .unwrap_or("")
207 .to_string();
208 stem_to_entity_ids
209 .entry(stem)
210 .or_default()
211 .push(node.id.clone());
212 }
213
214 let mut star_expansions: Vec<GraphEdge> = Vec::new();
215
216 for edge in &mut result.edges {
217 if edge.relation == "imports" {
218 let import_label = result
219 .nodes
220 .iter()
221 .find(|n| n.id == edge.target)
222 .map_or("", |n| n.label.as_str());
223
224 if import_label.contains('*') {
225 let module_name = import_label.trim_end_matches(".*").trim_end_matches(" *");
227 if let Some(entity_ids) = stem_to_entity_ids.get(module_name) {
228 for target_id in entity_ids {
229 star_expansions.push(GraphEdge {
230 source: edge.source.clone(),
231 target: target_id.clone(),
232 relation: "uses".to_string(),
233 confidence: Confidence::Inferred,
234 confidence_score: 0.7,
235 source_file: edge.source_file.clone(),
236 source_location: None,
237 weight: 0.7,
238 provenance: Some("cross-file:python-star-import".to_string()),
239 extra: Default::default(),
240 });
241 }
242 }
243 } else if let Some(candidates) = label_to_ids.get(&edge.target) {
244 let resolved = candidates
245 .iter()
246 .find(|(_, sf)| sf == &edge.source_file)
247 .or_else(|| candidates.first())
248 .map(|(id, _)| id.clone());
249 if let Some(resolved_id) = resolved {
250 edge.target = resolved_id;
251 edge.confidence = graphify_core::confidence::Confidence::Extracted;
252 }
253 }
254 }
255 }
256
257 if !star_expansions.is_empty() {
258 debug!(
259 "python star import expansion: created {} uses edges",
260 star_expansions.len()
261 );
262 result.edges.extend(star_expansions);
263 }
264}
265
266fn resolve_cross_file_imports(result: &mut ExtractionResult) {
273 let mut id_to_label: HashMap<String, String> = HashMap::new();
274 let mut stem_to_entities: HashMap<String, Vec<(String, String, NodeType)>> = HashMap::new();
275 let mut go_pkg_to_entities: HashMap<String, Vec<(String, String, NodeType)>> = HashMap::new();
276 let mut source_file_to_stem: HashMap<String, String> = HashMap::new();
277 let mut file_id_to_source: HashMap<String, String> = HashMap::new();
278
279 let defined_entity_ids: HashSet<String> = result
280 .edges
281 .iter()
282 .filter(|e| e.relation == "defines")
283 .map(|e| e.target.clone())
284 .collect();
285
286 let mut source_file_entities: HashMap<String, Vec<String>> = HashMap::new();
287 for edge in &result.edges {
288 if edge.relation == "defines" {
289 source_file_entities
290 .entry(edge.source_file.clone())
291 .or_default()
292 .push(edge.target.clone());
293 }
294 }
295
296 for node in &result.nodes {
297 id_to_label.insert(node.id.clone(), node.label.clone());
298
299 if node.node_type == NodeType::File {
300 let stem = Path::new(&node.source_file)
301 .file_stem()
302 .and_then(|s| s.to_str())
303 .unwrap_or("")
304 .to_string();
305 source_file_to_stem.insert(node.source_file.clone(), stem);
306 file_id_to_source.insert(node.id.clone(), node.source_file.clone());
307 continue;
308 }
309
310 if !defined_entity_ids.contains(&node.id) {
311 continue;
312 }
313
314 let path = Path::new(&node.source_file);
315 let stem = path
316 .file_stem()
317 .and_then(|s| s.to_str())
318 .unwrap_or("")
319 .to_string();
320
321 stem_to_entities.entry(stem).or_default().push((
322 node.label.clone(),
323 node.id.clone(),
324 node.node_type.clone(),
325 ));
326
327 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
328 if ext == "go"
329 && let Some(dir) = path
330 .parent()
331 .and_then(|d| d.file_name())
332 .and_then(|d| d.to_str())
333 {
334 go_pkg_to_entities
335 .entry(dir.to_string())
336 .or_default()
337 .push((node.label.clone(), node.id.clone(), node.node_type.clone()));
338 }
339 }
340
341 let mut new_edges: Vec<GraphEdge> = Vec::new();
342 let mut seen = HashSet::new();
343
344 for edge in &result.edges {
345 if edge.relation != "imports" {
346 continue;
347 }
348
349 let source_file = &edge.source_file;
350 let ext = Path::new(source_file)
351 .extension()
352 .and_then(|e| e.to_str())
353 .unwrap_or("");
354
355 let import_label = match id_to_label.get(&edge.target) {
356 Some(label) => label.as_str(),
357 None => continue,
358 };
359
360 if import_label.is_empty() {
361 continue;
362 }
363
364 let target_entities = match ext {
365 "js" | "jsx" | "ts" | "tsx" => resolve_jsts_import(import_label, &stem_to_entities),
366 "go" => resolve_go_import(import_label, &stem_to_entities, &go_pkg_to_entities),
367 "rs" => resolve_rust_import(import_label, &stem_to_entities),
368 "java" => resolve_dot_import(import_label, &stem_to_entities),
369 "cs" => resolve_dot_import(import_label, &stem_to_entities),
370 "c" | "h" | "cpp" | "cc" | "cxx" | "hpp" => {
371 resolve_c_include(import_label, &stem_to_entities)
372 }
373 "kt" | "kts" => {
374 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
375 resolve_dot_import(cleaned.trim(), &stem_to_entities)
376 }
377 "php" => {
378 let cleaned = import_label.strip_prefix("use ").unwrap_or(import_label);
379 resolve_backslash_import(cleaned.trim(), &stem_to_entities)
380 }
381 "dart" => resolve_dart_import(import_label, &stem_to_entities),
382 "scala" => {
383 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
384 resolve_dot_import(cleaned.trim(), &stem_to_entities)
385 }
386 "swift" => {
387 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
388 resolve_dot_import(cleaned.trim(), &stem_to_entities)
389 }
390 _ => continue,
391 };
392
393 if target_entities.is_empty() {
394 continue;
395 }
396
397 let local_entities = match source_file_entities.get(source_file) {
398 Some(ids) => ids,
399 None => continue,
400 };
401
402 let target_by_label: HashMap<&str, &String> = target_entities
403 .iter()
404 .filter_map(|(lbl, id, _)| {
405 if !lbl.is_empty() {
406 Some((lbl.as_str(), id))
407 } else {
408 None
409 }
410 })
411 .collect();
412
413 for local_id in local_entities {
414 let local_label = match id_to_label.get(local_id) {
415 Some(l) => l,
416 None => continue,
417 };
418
419 if let Some(&target_id) = target_by_label.get(local_label.as_str()) {
420 if local_id == target_id {
421 continue;
422 }
423 let key = (local_id.clone(), target_id.clone());
424 if seen.contains(&key) {
425 continue;
426 }
427 seen.insert(key);
428 new_edges.push(GraphEdge {
429 source: local_id.clone(),
430 target: target_id.clone(),
431 relation: "uses".to_string(),
432 confidence: Confidence::Inferred,
433 confidence_score: 0.8,
434 source_file: source_file.clone(),
435 source_location: None,
436 weight: 0.8,
437 provenance: Some("cross-file:import-resolve".to_string()),
438 extra: Default::default(),
439 });
440 continue;
441 }
442
443 const MAX_FALLBACK_EDGES: usize = 50;
444 let mut fallback_count = 0;
445 for (_, target_id, _) in &target_entities {
446 if local_id == target_id {
447 continue;
448 }
449 let key = (local_id.clone(), target_id.clone());
450 if seen.contains(&key) {
451 continue;
452 }
453 seen.insert(key);
454 new_edges.push(GraphEdge {
455 source: local_id.clone(),
456 target: target_id.clone(),
457 relation: "uses".to_string(),
458 confidence: Confidence::Inferred,
459 confidence_score: 0.8,
460 source_file: source_file.clone(),
461 source_location: None,
462 weight: 0.8,
463 provenance: Some("cross-file:import-resolve:fallback".to_string()),
464 extra: Default::default(),
465 });
466 fallback_count += 1;
467 if fallback_count >= MAX_FALLBACK_EDGES {
468 break;
469 }
470 }
471 }
472 }
473
474 if !new_edges.is_empty() {
475 debug!(
476 "cross-file import resolution: created {} inferred uses edges",
477 new_edges.len()
478 );
479 }
480
481 result.edges.extend(new_edges);
482}
483
484fn resolve_cross_file_calls(result: &mut ExtractionResult) {
490 let callable_ids: HashSet<String> = result
491 .nodes
492 .iter()
493 .filter(|n| matches!(n.node_type, NodeType::Function | NodeType::Method))
494 .map(|n| n.id.clone())
495 .collect();
496
497 let existing_calls: HashSet<(String, String)> = result
498 .edges
499 .iter()
500 .filter(|e| e.relation == "calls")
501 .map(|e| (e.source.clone(), e.target.clone()))
502 .collect();
503
504 let uses_edges: Vec<(String, String, String)> = result
505 .edges
506 .iter()
507 .filter(|e| e.relation == "uses" && callable_ids.contains(&e.source))
508 .map(|e| (e.source.clone(), e.target.clone(), e.source_file.clone()))
509 .collect();
510
511 let mut new_edges: Vec<GraphEdge> = Vec::new();
512
513 for (source, target, source_file) in uses_edges {
514 let key = (source.clone(), target.clone());
515 if existing_calls.contains(&key) {
516 continue;
517 }
518 new_edges.push(GraphEdge {
519 source,
520 target,
521 relation: "calls".to_string(),
522 confidence: Confidence::Inferred,
523 confidence_score: 0.5,
524 source_file,
525 source_location: None,
526 weight: 0.5,
527 provenance: Some("cross-file:call-resolve".to_string()),
528 extra: HashMap::new(),
529 });
530 }
531
532 if !new_edges.is_empty() {
533 debug!(
534 "cross-file call resolution: created {} inferred calls edges",
535 new_edges.len()
536 );
537 }
538
539 result.edges.extend(new_edges);
540}
541
542fn resolve_jsts_import<'a>(
552 import_label: &str,
553 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
554) -> Vec<&'a (String, String, NodeType)> {
555 let label = import_label.split(" as ").next().unwrap_or(import_label);
556
557 let parts: Vec<&str> = label.split('/').collect();
558
559 if parts.len() >= 2 {
560 let module_stem = parts[0].trim_start_matches('.');
561 if let Some(entities) = stem_to_entities.get(module_stem) {
562 return entities.iter().collect();
563 }
564 }
565
566 if let Some(last) = parts.last() {
567 let stem = last.trim_start_matches('.');
568 if let Some(entities) = stem_to_entities.get(stem) {
569 return entities.iter().collect();
570 }
571 }
572
573 let simple = label.trim_start_matches("./").trim_start_matches("../");
574 if let Some(entities) = stem_to_entities.get(simple) {
575 return entities.iter().collect();
576 }
577
578 if let Some(entities) = stem_to_entities.get("index")
579 && (label.contains('/') || label.starts_with('.'))
580 {
581 return entities.iter().collect();
582 }
583
584 Vec::new()
585}
586
587fn resolve_go_import<'a>(
593 import_label: &str,
594 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
595 go_pkg_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
596) -> Vec<&'a (String, String, NodeType)> {
597 let label = import_label
598 .trim_start_matches(". ")
599 .trim_start_matches("_ ");
600 let label = if label.contains('"') {
601 label.split('"').nth(1).unwrap_or(label)
602 } else {
603 label
604 };
605
606 let pkg_name = label.rsplit('/').next().unwrap_or(label);
607
608 if let Some(entities) = go_pkg_to_entities.get(pkg_name) {
609 return entities.iter().collect();
610 }
611
612 if let Some(entities) = stem_to_entities.get(pkg_name) {
613 return entities.iter().collect();
614 }
615
616 Vec::new()
617}
618
619fn resolve_rust_import<'a>(
624 import_label: &str,
625 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
626) -> Vec<&'a (String, String, NodeType)> {
627 let label = import_label
628 .strip_prefix("pub use ")
629 .unwrap_or(import_label);
630 let segments: Vec<&str> = label.split("::").collect();
631
632 if segments.last() == Some(&"*") && segments.len() >= 2 {
633 let module = segments[segments.len() - 2];
634 if let Some(entities) = stem_to_entities.get(module) {
635 return entities.iter().collect();
636 }
637 }
638
639 if let Some(last) = segments.last()
640 && *last != "*"
641 && let Some(entities) = stem_to_entities.get(*last)
642 {
643 return entities.iter().collect();
644 }
645
646 if segments.len() >= 2 {
647 let module = segments[segments.len() - 2];
648 if let Some(entities) = stem_to_entities.get(module) {
649 let last = segments.last().unwrap();
650 let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
651 if !filtered.is_empty() {
652 return filtered;
653 }
654 return entities.iter().collect();
655 }
656 }
657
658 Vec::new()
659}
660
661fn resolve_dot_import<'a>(
666 import_label: &str,
667 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
668) -> Vec<&'a (String, String, NodeType)> {
669 let label = import_label.strip_prefix("static ").unwrap_or(import_label);
670 let label = if let Some(idx) = label.find(" = ") {
671 label[idx + 3..].trim()
672 } else {
673 label
674 };
675
676 let segments: Vec<&str> = label.split('.').collect();
677
678 if let Some(last) = segments.last()
679 && let Some(entities) = stem_to_entities.get(*last)
680 {
681 return entities.iter().collect();
682 }
683
684 if segments.len() >= 2 {
685 let module = segments[segments.len() - 2];
686 if let Some(entities) = stem_to_entities.get(module) {
687 let last = segments.last().unwrap();
688 let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
689 if !filtered.is_empty() {
690 return filtered;
691 }
692 return entities.iter().collect();
693 }
694 }
695
696 Vec::new()
697}
698
699fn resolve_c_include<'a>(
704 import_label: &str,
705 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
706) -> Vec<&'a (String, String, NodeType)> {
707 let label = import_label
708 .trim_start_matches('<')
709 .trim_end_matches('>')
710 .trim_start_matches('"')
711 .trim_end_matches('"');
712
713 let stem = std::path::Path::new(label)
714 .file_stem()
715 .and_then(|s| s.to_str())
716 .unwrap_or(label);
717
718 if let Some(entities) = stem_to_entities.get(stem) {
719 return entities.iter().collect();
720 }
721
722 Vec::new()
723}
724
725fn resolve_backslash_import<'a>(
729 import_label: &str,
730 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
731) -> Vec<&'a (String, String, NodeType)> {
732 let segments: Vec<&str> = import_label.split('\\').collect();
733
734 if let Some(last) = segments.last()
735 && let Some(entities) = stem_to_entities.get(*last)
736 {
737 return entities.iter().collect();
738 }
739
740 if segments.len() >= 2 {
741 let module = segments[segments.len() - 2];
742 if let Some(entities) = stem_to_entities.get(module) {
743 return entities.iter().collect();
744 }
745 }
746
747 Vec::new()
748}
749
750fn resolve_dart_import<'a>(
755 import_label: &str,
756 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
757) -> Vec<&'a (String, String, NodeType)> {
758 let mut label = import_label;
759
760 if let Some(stripped) = label.strip_prefix("import ") {
761 label = stripped;
762 } else if let Some(stripped) = label.strip_prefix("export ") {
763 label = stripped;
764 } else if let Some(stripped) = label.strip_prefix("part ") {
765 label = stripped;
766 }
767
768 let path_and_alias = label;
769 let path_part = if let Some(idx) = path_and_alias.find(" as ") {
770 &path_and_alias[..idx]
771 } else {
772 path_and_alias
773 };
774
775 let path_deferred = path_part;
776 let path_no_deferred = if let Some(idx) = path_deferred.find(" deferred") {
777 &path_deferred[..idx]
778 } else {
779 path_deferred
780 };
781
782 let quoted = path_no_deferred.trim();
783 let unquoted = quoted
784 .trim_matches('\'') .trim_matches('"');
786
787 let normalized = if unquoted.contains("../") {
788 let last_segment = unquoted.rsplit('/').next().unwrap_or(unquoted);
789 last_segment.strip_suffix(".dart").unwrap_or(last_segment)
790 } else {
791 let path_part = unquoted.strip_prefix("package:").unwrap_or(unquoted);
792
793 let last_segment = path_part.rsplit('/').next().unwrap_or(path_part);
794
795 last_segment.strip_suffix(".dart").unwrap_or(last_segment)
796 };
797
798 if let Some(entities) = stem_to_entities.get(normalized) {
799 return entities.iter().collect();
800 }
801
802 Vec::new()
803}
804
805#[cfg(test)]
806mod tests;