1pub mod ast_extract;
11pub mod dedup;
12pub mod lang_config;
13pub mod parser;
14pub mod semantic;
15pub mod treesitter;
16
17use std::collections::{HashMap, HashSet};
18use std::path::{Path, PathBuf};
19
20use graphify_core::confidence::Confidence;
21use graphify_core::model::{ExtractionResult, GraphEdge, NodeType};
22use rayon::prelude::*;
23use tracing::{debug, info, warn};
24
/// Extension-to-language dispatch table.
///
/// Every entry pairs a file extension (leading dot included) with the language
/// identifier used for files carrying that extension. Several extensions may
/// map to the same language.
pub const DISPATCH: &[(&str, &str)] = &[
    // Python and the JavaScript / TypeScript family.
    (".py", "python"),
    (".js", "javascript"),
    (".jsx", "javascript"),
    (".ts", "typescript"),
    (".tsx", "typescript"),
    // Go, Rust, Java.
    (".go", "go"),
    (".rs", "rust"),
    (".java", "java"),
    // C and C++ (sources and headers).
    (".c", "c"),
    (".h", "c"),
    (".cpp", "cpp"),
    (".cc", "cpp"),
    (".cxx", "cpp"),
    (".hpp", "cpp"),
    // Ruby, C#, and JVM languages besides Java.
    (".rb", "ruby"),
    (".cs", "csharp"),
    (".kt", "kotlin"),
    (".kts", "kotlin"),
    (".scala", "scala"),
    // PHP, Swift, Lua (`.toc` files are dispatched as Lua too).
    (".php", "php"),
    (".swift", "swift"),
    (".lua", "lua"),
    (".toc", "lua"),
    // Zig, PowerShell, Elixir.
    (".zig", "zig"),
    (".ps1", "powershell"),
    (".ex", "elixir"),
    (".exs", "elixir"),
    // Objective-C, Julia, Dart.
    (".m", "objc"),
    (".mm", "objc"),
    (".jl", "julia"),
    (".dart", "dart"),
];
59
60fn dispatch_map() -> &'static HashMap<&'static str, &'static str> {
62 static MAP: std::sync::LazyLock<HashMap<&str, &str>> =
63 std::sync::LazyLock::new(|| DISPATCH.iter().copied().collect());
64 &MAP
65}
66
67pub fn language_for_path(path: &Path) -> Option<&'static str> {
69 let ext = path.extension()?.to_str()?;
70 dispatch_map().get(&*format!(".{ext}")).copied()
71}
72
73pub fn collect_files(target: &Path) -> Vec<PathBuf> {
75 let map = dispatch_map();
76 let mut files = Vec::new();
77 collect_files_inner(target, map, &mut files);
78 files.sort();
79 files
80}
81
82fn collect_files_inner(dir: &Path, map: &HashMap<&str, &str>, out: &mut Vec<PathBuf>) {
83 let entries = match std::fs::read_dir(dir) {
84 Ok(e) => e,
85 Err(e) => {
86 warn!("cannot read directory {}: {e}", dir.display());
87 return;
88 }
89 };
90 for entry in entries.flatten() {
91 let path = entry.path();
92 if path.is_dir() {
93 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
94 if name.starts_with('.')
95 || name == "node_modules"
96 || name == "__pycache__"
97 || name == "target"
98 || name == "vendor"
99 || name == "venv"
100 || name == ".git"
101 {
102 continue;
103 }
104 collect_files_inner(&path, map, out);
105 } else if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
106 let dotted = format!(".{ext}");
107 if map.contains_key(dotted.as_str()) {
108 out.push(path);
109 }
110 }
111 }
112}
113
114pub fn extract(paths: &[PathBuf]) -> ExtractionResult {
122 let results: Vec<ExtractionResult> = paths
123 .par_iter()
124 .filter_map(|path| {
125 let lang = if let Some(l) = language_for_path(path) {
126 l
127 } else {
128 debug!("skipping unsupported file: {}", path.display());
129 return None;
130 };
131
132 let source = match std::fs::read(path) {
133 Ok(s) => s,
134 Err(e) => {
135 warn!("cannot read {}: {e}", path.display());
136 return None;
137 }
138 };
139
140 debug!("extracting {} ({})", path.display(), lang);
141
142 let mut result = if let Some(ts_result) = treesitter::try_extract(path, &source, lang) {
143 debug!("used tree-sitter for {} ({})", path.display(), lang);
144 ts_result
145 } else {
146 let source_str = String::from_utf8_lossy(&source);
147 ast_extract::extract_file(path, source_str.as_ref(), lang)
148 };
149 dedup::dedup_file(&mut result);
150
151 Some(result)
152 })
153 .collect();
154
155 let mut combined = ExtractionResult::default();
156 for r in results {
157 combined.nodes.extend(r.nodes);
158 combined.edges.extend(r.edges);
159 combined.hyperedges.extend(r.hyperedges);
160 }
161
162 resolve_python_imports(&mut combined);
163
164 resolve_cross_file_imports(&mut combined);
165
166 resolve_cross_file_calls(&mut combined);
167
168 info!(
169 "extraction complete: {} nodes, {} edges",
170 combined.nodes.len(),
171 combined.edges.len()
172 );
173
174 combined
175}
176
177fn resolve_python_imports(result: &mut ExtractionResult) {
182 let label_to_ids: HashMap<String, Vec<(String, String)>> = {
183 let mut map: HashMap<String, Vec<(String, String)>> = HashMap::new();
184 for n in &result.nodes {
185 map.entry(n.label.clone())
186 .or_default()
187 .push((n.id.clone(), n.source_file.clone()));
188 }
189 map
190 };
191
192 let mut stem_to_entity_ids: HashMap<String, Vec<String>> = HashMap::new();
193 let defined_targets: HashSet<String> = result
194 .edges
195 .iter()
196 .filter(|e| e.relation == "defines")
197 .map(|e| e.target.clone())
198 .collect();
199 for node in &result.nodes {
200 if !defined_targets.contains(&node.id) {
201 continue;
202 }
203 let stem = std::path::Path::new(&node.source_file)
204 .file_stem()
205 .and_then(|s| s.to_str())
206 .unwrap_or("")
207 .to_string();
208 stem_to_entity_ids
209 .entry(stem)
210 .or_default()
211 .push(node.id.clone());
212 }
213
214 let mut star_expansions: Vec<GraphEdge> = Vec::new();
215
216 for edge in &mut result.edges {
217 if edge.relation == "imports" {
218 let import_label = result
219 .nodes
220 .iter()
221 .find(|n| n.id == edge.target)
222 .map_or("", |n| n.label.as_str());
223
224 if import_label.contains('*') {
225 let module_name = import_label.trim_end_matches(".*").trim_end_matches(" *");
227 if let Some(entity_ids) = stem_to_entity_ids.get(module_name) {
228 for target_id in entity_ids {
229 star_expansions.push(GraphEdge {
230 source: edge.source.clone(),
231 target: target_id.clone(),
232 relation: "uses".to_string(),
233 confidence: Confidence::Inferred,
234 confidence_score: 0.7,
235 source_file: edge.source_file.clone(),
236 source_location: None,
237 weight: 0.7,
238 provenance: Some("cross-file:python-star-import".to_string()),
239 extra: Default::default(),
240 });
241 }
242 }
243 } else if let Some(candidates) = label_to_ids.get(&edge.target) {
244 let resolved = candidates
245 .iter()
246 .find(|(_, sf)| sf == &edge.source_file)
247 .or_else(|| candidates.first())
248 .map(|(id, _)| id.clone());
249 if let Some(resolved_id) = resolved {
250 edge.target = resolved_id;
251 edge.confidence = graphify_core::confidence::Confidence::Extracted;
252 }
253 }
254 }
255 }
256
257 if !star_expansions.is_empty() {
258 debug!(
259 "python star import expansion: created {} uses edges",
260 star_expansions.len()
261 );
262 result.edges.extend(star_expansions);
263 }
264}
265
266fn resolve_cross_file_imports(result: &mut ExtractionResult) {
273 let mut id_to_label: HashMap<String, String> = HashMap::new();
274 let mut stem_to_entities: HashMap<String, Vec<(String, String, NodeType)>> = HashMap::new();
275 let mut go_pkg_to_entities: HashMap<String, Vec<(String, String, NodeType)>> = HashMap::new();
276 let mut source_file_to_stem: HashMap<String, String> = HashMap::new();
277 let mut file_id_to_source: HashMap<String, String> = HashMap::new();
278
279 let defined_entity_ids: HashSet<String> = result
280 .edges
281 .iter()
282 .filter(|e| e.relation == "defines")
283 .map(|e| e.target.clone())
284 .collect();
285
286 let mut source_file_entities: HashMap<String, Vec<String>> = HashMap::new();
287 for edge in &result.edges {
288 if edge.relation == "defines" {
289 source_file_entities
290 .entry(edge.source_file.clone())
291 .or_default()
292 .push(edge.target.clone());
293 }
294 }
295
296 for node in &result.nodes {
297 id_to_label.insert(node.id.clone(), node.label.clone());
298
299 if node.node_type == NodeType::File {
300 let stem = Path::new(&node.source_file)
301 .file_stem()
302 .and_then(|s| s.to_str())
303 .unwrap_or("")
304 .to_string();
305 source_file_to_stem.insert(node.source_file.clone(), stem);
306 file_id_to_source.insert(node.id.clone(), node.source_file.clone());
307 continue;
308 }
309
310 if !defined_entity_ids.contains(&node.id) {
311 continue;
312 }
313
314 let path = Path::new(&node.source_file);
315 let stem = path
316 .file_stem()
317 .and_then(|s| s.to_str())
318 .unwrap_or("")
319 .to_string();
320
321 stem_to_entities.entry(stem).or_default().push((
322 node.label.clone(),
323 node.id.clone(),
324 node.node_type.clone(),
325 ));
326
327 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
328 if ext == "go"
329 && let Some(dir) = path
330 .parent()
331 .and_then(|d| d.file_name())
332 .and_then(|d| d.to_str())
333 {
334 go_pkg_to_entities
335 .entry(dir.to_string())
336 .or_default()
337 .push((node.label.clone(), node.id.clone(), node.node_type.clone()));
338 }
339 }
340
341 let mut new_edges: Vec<GraphEdge> = Vec::new();
342 let mut seen = HashSet::new();
343
344 for edge in &result.edges {
345 if edge.relation != "imports" {
346 continue;
347 }
348
349 let source_file = &edge.source_file;
350 let ext = Path::new(source_file)
351 .extension()
352 .and_then(|e| e.to_str())
353 .unwrap_or("");
354
355 let import_label = match id_to_label.get(&edge.target) {
356 Some(label) => label.as_str(),
357 None => continue,
358 };
359
360 if import_label.is_empty() {
361 continue;
362 }
363
364 let target_entities = match ext {
365 "js" | "jsx" | "ts" | "tsx" => resolve_jsts_import(import_label, &stem_to_entities),
366 "go" => resolve_go_import(import_label, &stem_to_entities, &go_pkg_to_entities),
367 "rs" => resolve_rust_import(import_label, &stem_to_entities),
368 "java" => resolve_dot_import(import_label, &stem_to_entities),
369 "cs" => resolve_dot_import(import_label, &stem_to_entities),
370 "c" | "h" | "cpp" | "cc" | "cxx" | "hpp" => {
371 resolve_c_include(import_label, &stem_to_entities)
372 }
373 "kt" | "kts" => {
374 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
375 resolve_dot_import(cleaned.trim(), &stem_to_entities)
376 }
377 "php" => {
378 let cleaned = import_label.strip_prefix("use ").unwrap_or(import_label);
379 resolve_backslash_import(cleaned.trim(), &stem_to_entities)
380 }
381 "dart" => resolve_dart_import(import_label, &stem_to_entities),
382 "scala" => {
383 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
384 resolve_dot_import(cleaned.trim(), &stem_to_entities)
385 }
386 "swift" => {
387 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
388 resolve_dot_import(cleaned.trim(), &stem_to_entities)
389 }
390 _ => continue,
391 };
392
393 if target_entities.is_empty() {
394 continue;
395 }
396
397 let local_entities = match source_file_entities.get(source_file) {
398 Some(ids) => ids,
399 None => continue,
400 };
401
402 let target_by_label: HashMap<&str, &String> = target_entities
403 .iter()
404 .filter_map(|(lbl, id, _)| {
405 if !lbl.is_empty() {
406 Some((lbl.as_str(), id))
407 } else {
408 None
409 }
410 })
411 .collect();
412
413 for local_id in local_entities {
414 let local_label = match id_to_label.get(local_id) {
415 Some(l) => l,
416 None => continue,
417 };
418
419 if let Some(&target_id) = target_by_label.get(local_label.as_str()) {
420 if local_id == target_id {
421 continue;
422 }
423 let key = (local_id.clone(), target_id.clone());
424 if seen.contains(&key) {
425 continue;
426 }
427 seen.insert(key);
428 new_edges.push(GraphEdge {
429 source: local_id.clone(),
430 target: target_id.clone(),
431 relation: "uses".to_string(),
432 confidence: Confidence::Inferred,
433 confidence_score: 0.8,
434 source_file: source_file.clone(),
435 source_location: None,
436 weight: 0.8,
437 provenance: Some("cross-file:import-resolve".to_string()),
438 extra: Default::default(),
439 });
440 continue;
441 }
442
443 if let Some(symbols) = edge
446 .extra
447 .get("imported_symbols")
448 .and_then(|v| v.as_array())
449 {
450 let edges_before = new_edges.len();
451 for sym in symbols {
452 let sym_str = sym.as_str().unwrap_or("");
453 for (lbl, target_id, _nt) in &target_entities {
454 if lbl == sym_str {
455 if local_id == target_id {
456 continue;
457 }
458 let key = (local_id.clone(), target_id.clone());
459 if seen.contains(&key) {
460 continue;
461 }
462 seen.insert(key);
463 new_edges.push(GraphEdge {
464 source: local_id.clone(),
465 target: target_id.clone(),
466 relation: "uses".to_string(),
467 confidence: Confidence::Inferred,
468 confidence_score: 0.85,
469 source_file: source_file.clone(),
470 source_location: None,
471 weight: 0.85,
472 provenance: Some("cross-file:import-resolve".to_string()),
473 extra: HashMap::new(),
474 });
475 }
476 }
477 }
478 if new_edges.len() > edges_before {
479 continue; }
481 }
482
483 const MAX_FALLBACK_EDGES: usize = 50;
484 let mut fallback_count = 0;
485 for (_, target_id, _) in &target_entities {
486 if local_id == target_id {
487 continue;
488 }
489 let key = (local_id.clone(), target_id.clone());
490 if seen.contains(&key) {
491 continue;
492 }
493 seen.insert(key);
494 new_edges.push(GraphEdge {
495 source: local_id.clone(),
496 target: target_id.clone(),
497 relation: "uses".to_string(),
498 confidence: Confidence::Inferred,
499 confidence_score: 0.8,
500 source_file: source_file.clone(),
501 source_location: None,
502 weight: 0.8,
503 provenance: Some("cross-file:import-resolve:fallback".to_string()),
504 extra: Default::default(),
505 });
506 fallback_count += 1;
507 if fallback_count >= MAX_FALLBACK_EDGES {
508 break;
509 }
510 }
511 }
512 }
513
514 if !new_edges.is_empty() {
515 debug!(
516 "cross-file import resolution: created {} inferred uses edges",
517 new_edges.len()
518 );
519 }
520
521 result.edges.extend(new_edges);
522}
523
524fn resolve_cross_file_calls(result: &mut ExtractionResult) {
530 let callable_ids: HashSet<String> = result
531 .nodes
532 .iter()
533 .filter(|n| matches!(n.node_type, NodeType::Function | NodeType::Method))
534 .map(|n| n.id.clone())
535 .collect();
536
537 let existing_calls: HashSet<(String, String)> = result
538 .edges
539 .iter()
540 .filter(|e| e.relation == "calls")
541 .map(|e| (e.source.clone(), e.target.clone()))
542 .collect();
543
544 let uses_edges: Vec<(String, String, String)> = result
545 .edges
546 .iter()
547 .filter(|e| e.relation == "uses" && callable_ids.contains(&e.source))
548 .map(|e| (e.source.clone(), e.target.clone(), e.source_file.clone()))
549 .collect();
550
551 let mut new_edges: Vec<GraphEdge> = Vec::new();
552
553 for (source, target, source_file) in uses_edges {
554 let key = (source.clone(), target.clone());
555 if existing_calls.contains(&key) {
556 continue;
557 }
558 new_edges.push(GraphEdge {
559 source,
560 target,
561 relation: "calls".to_string(),
562 confidence: Confidence::Inferred,
563 confidence_score: 0.5,
564 source_file,
565 source_location: None,
566 weight: 0.5,
567 provenance: Some("cross-file:call-resolve".to_string()),
568 extra: HashMap::new(),
569 });
570 }
571
572 if !new_edges.is_empty() {
573 debug!(
574 "cross-file call resolution: created {} inferred calls edges",
575 new_edges.len()
576 );
577 }
578
579 result.edges.extend(new_edges);
580}
581
582fn resolve_jsts_import<'a>(
592 import_label: &str,
593 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
594) -> Vec<&'a (String, String, NodeType)> {
595 let label = import_label.split(" as ").next().unwrap_or(import_label);
596
597 let parts: Vec<&str> = label.split('/').collect();
598
599 if parts.len() >= 2 {
600 let module_stem = parts[0].trim_start_matches('.');
601 if let Some(entities) = stem_to_entities.get(module_stem) {
602 return entities.iter().collect();
603 }
604 }
605
606 if let Some(last) = parts.last() {
607 let stem = last.trim_start_matches('.');
608 if let Some(entities) = stem_to_entities.get(stem) {
609 return entities.iter().collect();
610 }
611 }
612
613 let simple = label.trim_start_matches("./").trim_start_matches("../");
614 if let Some(entities) = stem_to_entities.get(simple) {
615 return entities.iter().collect();
616 }
617
618 if let Some(entities) = stem_to_entities.get("index")
619 && (label.contains('/') || label.starts_with('.'))
620 {
621 return entities.iter().collect();
622 }
623
624 Vec::new()
625}
626
627fn resolve_go_import<'a>(
633 import_label: &str,
634 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
635 go_pkg_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
636) -> Vec<&'a (String, String, NodeType)> {
637 let label = import_label
638 .trim_start_matches(". ")
639 .trim_start_matches("_ ");
640 let label = if label.contains('"') {
641 label.split('"').nth(1).unwrap_or(label)
642 } else {
643 label
644 };
645
646 let pkg_name = label.rsplit('/').next().unwrap_or(label);
647
648 if let Some(entities) = go_pkg_to_entities.get(pkg_name) {
649 return entities.iter().collect();
650 }
651
652 if let Some(entities) = stem_to_entities.get(pkg_name) {
653 return entities.iter().collect();
654 }
655
656 Vec::new()
657}
658
659fn resolve_rust_import<'a>(
664 import_label: &str,
665 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
666) -> Vec<&'a (String, String, NodeType)> {
667 let label = import_label
668 .strip_prefix("pub use ")
669 .unwrap_or(import_label);
670 let segments: Vec<&str> = label.split("::").collect();
671
672 if segments.last() == Some(&"*") && segments.len() >= 2 {
673 let module = segments[segments.len() - 2];
674 if let Some(entities) = stem_to_entities.get(module) {
675 return entities.iter().collect();
676 }
677 }
678
679 if let Some(last) = segments.last()
680 && *last != "*"
681 && let Some(entities) = stem_to_entities.get(*last)
682 {
683 return entities.iter().collect();
684 }
685
686 if segments.len() >= 2 {
687 let module = segments[segments.len() - 2];
688 if let Some(entities) = stem_to_entities.get(module) {
689 let last = segments.last().unwrap();
690 let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
691 if !filtered.is_empty() {
692 return filtered;
693 }
694 return entities.iter().collect();
695 }
696 }
697
698 Vec::new()
699}
700
701fn resolve_dot_import<'a>(
706 import_label: &str,
707 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
708) -> Vec<&'a (String, String, NodeType)> {
709 let label = import_label.strip_prefix("static ").unwrap_or(import_label);
710 let label = if let Some(idx) = label.find(" = ") {
711 label[idx + 3..].trim()
712 } else {
713 label
714 };
715
716 let segments: Vec<&str> = label.split('.').collect();
717
718 if let Some(last) = segments.last()
719 && let Some(entities) = stem_to_entities.get(*last)
720 {
721 return entities.iter().collect();
722 }
723
724 if segments.len() >= 2 {
725 let module = segments[segments.len() - 2];
726 if let Some(entities) = stem_to_entities.get(module) {
727 let last = segments.last().unwrap();
728 let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
729 if !filtered.is_empty() {
730 return filtered;
731 }
732 return entities.iter().collect();
733 }
734 }
735
736 Vec::new()
737}
738
739fn resolve_c_include<'a>(
744 import_label: &str,
745 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
746) -> Vec<&'a (String, String, NodeType)> {
747 let label = import_label
748 .trim_start_matches('<')
749 .trim_end_matches('>')
750 .trim_start_matches('"')
751 .trim_end_matches('"');
752
753 let stem = std::path::Path::new(label)
754 .file_stem()
755 .and_then(|s| s.to_str())
756 .unwrap_or(label);
757
758 if let Some(entities) = stem_to_entities.get(stem) {
759 return entities.iter().collect();
760 }
761
762 Vec::new()
763}
764
765fn resolve_backslash_import<'a>(
769 import_label: &str,
770 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
771) -> Vec<&'a (String, String, NodeType)> {
772 let segments: Vec<&str> = import_label.split('\\').collect();
773
774 if let Some(last) = segments.last()
775 && let Some(entities) = stem_to_entities.get(*last)
776 {
777 return entities.iter().collect();
778 }
779
780 if segments.len() >= 2 {
781 let module = segments[segments.len() - 2];
782 if let Some(entities) = stem_to_entities.get(module) {
783 return entities.iter().collect();
784 }
785 }
786
787 Vec::new()
788}
789
790fn resolve_dart_import<'a>(
795 import_label: &str,
796 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
797) -> Vec<&'a (String, String, NodeType)> {
798 let mut label = import_label;
799
800 if let Some(stripped) = label.strip_prefix("import ") {
801 label = stripped;
802 } else if let Some(stripped) = label.strip_prefix("export ") {
803 label = stripped;
804 } else if let Some(stripped) = label.strip_prefix("part ") {
805 label = stripped;
806 }
807
808 let path_and_alias = label;
809 let path_part = if let Some(idx) = path_and_alias.find(" as ") {
810 &path_and_alias[..idx]
811 } else {
812 path_and_alias
813 };
814
815 let path_deferred = path_part;
816 let path_no_deferred = if let Some(idx) = path_deferred.find(" deferred") {
817 &path_deferred[..idx]
818 } else {
819 path_deferred
820 };
821
822 let quoted = path_no_deferred.trim();
823 let unquoted = quoted
824 .trim_matches('\'') .trim_matches('"');
826
827 let normalized = if unquoted.contains("../") {
828 let last_segment = unquoted.rsplit('/').next().unwrap_or(unquoted);
829 last_segment.strip_suffix(".dart").unwrap_or(last_segment)
830 } else {
831 let path_part = unquoted.strip_prefix("package:").unwrap_or(unquoted);
832
833 let last_segment = path_part.rsplit('/').next().unwrap_or(path_part);
834
835 last_segment.strip_suffix(".dart").unwrap_or(last_segment)
836 };
837
838 if let Some(entities) = stem_to_entities.get(normalized) {
839 return entities.iter().collect();
840 }
841
842 Vec::new()
843}
844
845#[cfg(test)]
846mod tests;