1pub mod ast_extract;
11pub mod dedup;
12pub mod lang_config;
13pub mod parser;
14pub mod semantic;
15pub mod treesitter;
16
17use std::collections::{HashMap, HashSet};
18use std::path::{Path, PathBuf};
19
20use graphify_core::confidence::Confidence;
21use graphify_core::model::{ExtractionResult, GraphEdge, NodeType};
22use rayon::prelude::*;
23use tracing::{debug, info, warn};
24
/// Extension-to-language dispatch table.
///
/// Every entry pairs a file extension (leading dot included) with the
/// language name that is handed to the extractors. Several extensions may map
/// to the same language.
pub const DISPATCH: &[(&str, &str)] = &[
    // Python
    (".py", "python"),
    // JavaScript / TypeScript
    (".js", "javascript"),
    (".jsx", "javascript"),
    (".ts", "typescript"),
    (".tsx", "typescript"),
    // Go, Rust, Java
    (".go", "go"),
    (".rs", "rust"),
    (".java", "java"),
    // C and C++
    (".c", "c"),
    (".h", "c"),
    (".cpp", "cpp"),
    (".cc", "cpp"),
    (".cxx", "cpp"),
    (".hpp", "cpp"),
    // Ruby, C#
    (".rb", "ruby"),
    (".cs", "csharp"),
    // JVM languages
    (".kt", "kotlin"),
    (".kts", "kotlin"),
    (".scala", "scala"),
    // PHP, Swift
    (".php", "php"),
    (".swift", "swift"),
    // Lua (including `.toc` files)
    (".lua", "lua"),
    (".toc", "lua"),
    // Zig, PowerShell
    (".zig", "zig"),
    (".ps1", "powershell"),
    // Elixir
    (".ex", "elixir"),
    (".exs", "elixir"),
    // Objective-C
    (".m", "objc"),
    (".mm", "objc"),
    // Julia, Dart
    (".jl", "julia"),
    (".dart", "dart"),
];
59
60fn dispatch_map() -> &'static HashMap<&'static str, &'static str> {
62 static MAP: std::sync::LazyLock<HashMap<&str, &str>> =
63 std::sync::LazyLock::new(|| DISPATCH.iter().copied().collect());
64 &MAP
65}
66
67pub fn language_for_path(path: &Path) -> Option<&'static str> {
69 let ext = path.extension()?.to_str()?;
70 dispatch_map().get(&*format!(".{ext}")).copied()
71}
72
73pub fn collect_files(target: &Path) -> Vec<PathBuf> {
75 let map = dispatch_map();
76 let mut files = Vec::new();
77 collect_files_inner(target, map, &mut files);
78 files.sort();
79 files
80}
81
82fn collect_files_inner(dir: &Path, map: &HashMap<&str, &str>, out: &mut Vec<PathBuf>) {
83 let entries = match std::fs::read_dir(dir) {
84 Ok(e) => e,
85 Err(e) => {
86 warn!("cannot read directory {}: {e}", dir.display());
87 return;
88 }
89 };
90 for entry in entries.flatten() {
91 let path = entry.path();
92 if path.is_dir() {
93 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
94 if name.starts_with('.')
95 || name == "node_modules"
96 || name == "__pycache__"
97 || name == "target"
98 || name == "vendor"
99 || name == "venv"
100 || name == ".git"
101 {
102 continue;
103 }
104 collect_files_inner(&path, map, out);
105 } else if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
106 let dotted = format!(".{ext}");
107 if map.contains_key(dotted.as_str()) {
108 out.push(path);
109 }
110 }
111 }
112}
113
114pub fn extract(paths: &[PathBuf]) -> ExtractionResult {
122 let results: Vec<ExtractionResult> = paths
123 .par_iter()
124 .filter_map(|path| {
125 let lang = if let Some(l) = language_for_path(path) {
126 l
127 } else {
128 debug!("skipping unsupported file: {}", path.display());
129 return None;
130 };
131
132 let source = match std::fs::read(path) {
133 Ok(s) => s,
134 Err(e) => {
135 warn!("cannot read {}: {e}", path.display());
136 return None;
137 }
138 };
139
140 debug!("extracting {} ({})", path.display(), lang);
141
142 let mut result = if let Some(ts_result) = treesitter::try_extract(path, &source, lang) {
143 debug!("used tree-sitter for {} ({})", path.display(), lang);
144 ts_result
145 } else {
146 let source_str = String::from_utf8_lossy(&source);
147 ast_extract::extract_file(path, source_str.as_ref(), lang)
148 };
149 dedup::dedup_file(&mut result);
150
151 Some(result)
152 })
153 .collect();
154
155 let mut combined = ExtractionResult::default();
156 for r in results {
157 combined.nodes.extend(r.nodes);
158 combined.edges.extend(r.edges);
159 combined.hyperedges.extend(r.hyperedges);
160 }
161
162 resolve_python_imports(&mut combined);
163
164 resolve_cross_file_imports(&mut combined);
165
166 resolve_cross_file_calls(&mut combined);
167
168 info!(
169 "extraction complete: {} nodes, {} edges",
170 combined.nodes.len(),
171 combined.edges.len()
172 );
173
174 combined
175}
176
177fn resolve_python_imports(result: &mut ExtractionResult) {
182 let label_to_ids: HashMap<String, Vec<(String, String)>> = {
183 let mut map: HashMap<String, Vec<(String, String)>> = HashMap::new();
184 for n in &result.nodes {
185 map.entry(n.label.clone())
186 .or_default()
187 .push((n.id.clone(), n.source_file.clone()));
188 }
189 map
190 };
191
192 let mut stem_to_entity_ids: HashMap<String, Vec<String>> = HashMap::new();
193 let defined_targets: HashSet<String> = result
194 .edges
195 .iter()
196 .filter(|e| e.relation == "defines")
197 .map(|e| e.target.clone())
198 .collect();
199 for node in &result.nodes {
200 if !defined_targets.contains(&node.id) {
201 continue;
202 }
203 let stem = std::path::Path::new(&node.source_file)
204 .file_stem()
205 .and_then(|s| s.to_str())
206 .unwrap_or("")
207 .to_string();
208 stem_to_entity_ids
209 .entry(stem)
210 .or_default()
211 .push(node.id.clone());
212 }
213
214 let mut star_expansions: Vec<GraphEdge> = Vec::new();
215
216 for edge in &mut result.edges {
217 if edge.relation == "imports" {
218 let import_label = result
219 .nodes
220 .iter()
221 .find(|n| n.id == edge.target)
222 .map_or("", |n| n.label.as_str());
223
224 if import_label.contains('*') {
225 let module_name = import_label.trim_end_matches(".*").trim_end_matches(" *");
227 if let Some(entity_ids) = stem_to_entity_ids.get(module_name) {
228 for target_id in entity_ids {
229 star_expansions.push(GraphEdge {
230 source: edge.source.clone(),
231 target: target_id.clone(),
232 relation: "uses".to_string(),
233 confidence: Confidence::Inferred,
234 confidence_score: 0.7,
235 source_file: edge.source_file.clone(),
236 source_location: None,
237 weight: 0.7,
238 extra: Default::default(),
239 });
240 }
241 }
242 } else if let Some(candidates) = label_to_ids.get(&edge.target) {
243 let resolved = candidates
244 .iter()
245 .find(|(_, sf)| sf == &edge.source_file)
246 .or_else(|| candidates.first())
247 .map(|(id, _)| id.clone());
248 if let Some(resolved_id) = resolved {
249 edge.target = resolved_id;
250 edge.confidence = graphify_core::confidence::Confidence::Extracted;
251 }
252 }
253 }
254 }
255
256 if !star_expansions.is_empty() {
257 debug!(
258 "python star import expansion: created {} uses edges",
259 star_expansions.len()
260 );
261 result.edges.extend(star_expansions);
262 }
263}
264
265fn resolve_cross_file_imports(result: &mut ExtractionResult) {
272 let mut id_to_label: HashMap<String, String> = HashMap::new();
273 let mut stem_to_entities: HashMap<String, Vec<(String, String, NodeType)>> = HashMap::new();
274 let mut go_pkg_to_entities: HashMap<String, Vec<(String, String, NodeType)>> = HashMap::new();
275 let mut source_file_to_stem: HashMap<String, String> = HashMap::new();
276 let mut file_id_to_source: HashMap<String, String> = HashMap::new();
277
278 let defined_entity_ids: HashSet<String> = result
279 .edges
280 .iter()
281 .filter(|e| e.relation == "defines")
282 .map(|e| e.target.clone())
283 .collect();
284
285 let mut source_file_entities: HashMap<String, Vec<String>> = HashMap::new();
286 for edge in &result.edges {
287 if edge.relation == "defines" {
288 source_file_entities
289 .entry(edge.source_file.clone())
290 .or_default()
291 .push(edge.target.clone());
292 }
293 }
294
295 for node in &result.nodes {
296 id_to_label.insert(node.id.clone(), node.label.clone());
297
298 if node.node_type == NodeType::File {
299 let stem = Path::new(&node.source_file)
300 .file_stem()
301 .and_then(|s| s.to_str())
302 .unwrap_or("")
303 .to_string();
304 source_file_to_stem.insert(node.source_file.clone(), stem);
305 file_id_to_source.insert(node.id.clone(), node.source_file.clone());
306 continue;
307 }
308
309 if !defined_entity_ids.contains(&node.id) {
310 continue;
311 }
312
313 let path = Path::new(&node.source_file);
314 let stem = path
315 .file_stem()
316 .and_then(|s| s.to_str())
317 .unwrap_or("")
318 .to_string();
319
320 stem_to_entities.entry(stem).or_default().push((
321 node.label.clone(),
322 node.id.clone(),
323 node.node_type.clone(),
324 ));
325
326 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
327 if ext == "go"
328 && let Some(dir) = path
329 .parent()
330 .and_then(|d| d.file_name())
331 .and_then(|d| d.to_str())
332 {
333 go_pkg_to_entities
334 .entry(dir.to_string())
335 .or_default()
336 .push((node.label.clone(), node.id.clone(), node.node_type.clone()));
337 }
338 }
339
340 let mut new_edges: Vec<GraphEdge> = Vec::new();
341 let mut seen = HashSet::new();
342
343 for edge in &result.edges {
344 if edge.relation != "imports" {
345 continue;
346 }
347
348 let source_file = &edge.source_file;
349 let ext = Path::new(source_file)
350 .extension()
351 .and_then(|e| e.to_str())
352 .unwrap_or("");
353
354 let import_label = match id_to_label.get(&edge.target) {
355 Some(label) => label.as_str(),
356 None => continue,
357 };
358
359 if import_label.is_empty() {
360 continue;
361 }
362
363 let target_entities = match ext {
364 "js" | "jsx" | "ts" | "tsx" => resolve_jsts_import(import_label, &stem_to_entities),
365 "go" => resolve_go_import(import_label, &stem_to_entities, &go_pkg_to_entities),
366 "rs" => resolve_rust_import(import_label, &stem_to_entities),
367 "java" => resolve_dot_import(import_label, &stem_to_entities),
368 "cs" => resolve_dot_import(import_label, &stem_to_entities),
369 "c" | "h" | "cpp" | "cc" | "cxx" | "hpp" => {
370 resolve_c_include(import_label, &stem_to_entities)
371 }
372 "kt" | "kts" => {
373 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
374 resolve_dot_import(cleaned.trim(), &stem_to_entities)
375 }
376 "php" => {
377 let cleaned = import_label.strip_prefix("use ").unwrap_or(import_label);
378 resolve_backslash_import(cleaned.trim(), &stem_to_entities)
379 }
380 "dart" => resolve_dart_import(import_label, &stem_to_entities),
381 "scala" => {
382 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
383 resolve_dot_import(cleaned.trim(), &stem_to_entities)
384 }
385 "swift" => {
386 let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
387 resolve_dot_import(cleaned.trim(), &stem_to_entities)
388 }
389 _ => continue,
390 };
391
392 if target_entities.is_empty() {
393 continue;
394 }
395
396 let local_entities = match source_file_entities.get(source_file) {
397 Some(ids) => ids,
398 None => continue,
399 };
400
401 let target_by_label: HashMap<&str, &String> = target_entities
402 .iter()
403 .filter_map(|(lbl, id, _)| {
404 if !lbl.is_empty() {
405 Some((lbl.as_str(), id))
406 } else {
407 None
408 }
409 })
410 .collect();
411
412 for local_id in local_entities {
413 let local_label = match id_to_label.get(local_id) {
414 Some(l) => l,
415 None => continue,
416 };
417
418 if let Some(&target_id) = target_by_label.get(local_label.as_str()) {
419 if local_id == target_id {
420 continue;
421 }
422 let key = (local_id.clone(), target_id.clone());
423 if seen.contains(&key) {
424 continue;
425 }
426 seen.insert(key);
427 new_edges.push(GraphEdge {
428 source: local_id.clone(),
429 target: target_id.clone(),
430 relation: "uses".to_string(),
431 confidence: Confidence::Inferred,
432 confidence_score: 0.8,
433 source_file: source_file.clone(),
434 source_location: None,
435 weight: 0.8,
436 extra: Default::default(),
437 });
438 continue;
439 }
440
441 const MAX_FALLBACK_EDGES: usize = 50;
442 let mut fallback_count = 0;
443 for (_, target_id, _) in &target_entities {
444 if local_id == target_id {
445 continue;
446 }
447 let key = (local_id.clone(), target_id.clone());
448 if seen.contains(&key) {
449 continue;
450 }
451 seen.insert(key);
452 new_edges.push(GraphEdge {
453 source: local_id.clone(),
454 target: target_id.clone(),
455 relation: "uses".to_string(),
456 confidence: Confidence::Inferred,
457 confidence_score: 0.8,
458 source_file: source_file.clone(),
459 source_location: None,
460 weight: 0.8,
461 extra: Default::default(),
462 });
463 fallback_count += 1;
464 if fallback_count >= MAX_FALLBACK_EDGES {
465 break;
466 }
467 }
468 }
469 }
470
471 if !new_edges.is_empty() {
472 debug!(
473 "cross-file import resolution: created {} inferred uses edges",
474 new_edges.len()
475 );
476 }
477
478 result.edges.extend(new_edges);
479}
480
481fn resolve_cross_file_calls(result: &mut ExtractionResult) {
487 let callable_ids: HashSet<String> = result
488 .nodes
489 .iter()
490 .filter(|n| matches!(n.node_type, NodeType::Function | NodeType::Method))
491 .map(|n| n.id.clone())
492 .collect();
493
494 let existing_calls: HashSet<(String, String)> = result
495 .edges
496 .iter()
497 .filter(|e| e.relation == "calls")
498 .map(|e| (e.source.clone(), e.target.clone()))
499 .collect();
500
501 let uses_edges: Vec<(String, String, String)> = result
502 .edges
503 .iter()
504 .filter(|e| e.relation == "uses" && callable_ids.contains(&e.source))
505 .map(|e| (e.source.clone(), e.target.clone(), e.source_file.clone()))
506 .collect();
507
508 let mut new_edges: Vec<GraphEdge> = Vec::new();
509
510 for (source, target, source_file) in uses_edges {
511 let key = (source.clone(), target.clone());
512 if existing_calls.contains(&key) {
513 continue;
514 }
515 new_edges.push(GraphEdge {
516 source,
517 target,
518 relation: "calls".to_string(),
519 confidence: Confidence::Inferred,
520 confidence_score: 0.5,
521 source_file,
522 source_location: None,
523 weight: 0.5,
524 extra: HashMap::new(),
525 });
526 }
527
528 if !new_edges.is_empty() {
529 debug!(
530 "cross-file call resolution: created {} inferred calls edges",
531 new_edges.len()
532 );
533 }
534
535 result.edges.extend(new_edges);
536}
537
538fn resolve_jsts_import<'a>(
548 import_label: &str,
549 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
550) -> Vec<&'a (String, String, NodeType)> {
551 let label = import_label.split(" as ").next().unwrap_or(import_label);
552
553 let parts: Vec<&str> = label.split('/').collect();
554
555 if parts.len() >= 2 {
556 let module_stem = parts[0].trim_start_matches('.');
557 if let Some(entities) = stem_to_entities.get(module_stem) {
558 return entities.iter().collect();
559 }
560 }
561
562 if let Some(last) = parts.last() {
563 let stem = last.trim_start_matches('.');
564 if let Some(entities) = stem_to_entities.get(stem) {
565 return entities.iter().collect();
566 }
567 }
568
569 let simple = label.trim_start_matches("./").trim_start_matches("../");
570 if let Some(entities) = stem_to_entities.get(simple) {
571 return entities.iter().collect();
572 }
573
574 if let Some(entities) = stem_to_entities.get("index")
575 && (label.contains('/') || label.starts_with('.'))
576 {
577 return entities.iter().collect();
578 }
579
580 Vec::new()
581}
582
583fn resolve_go_import<'a>(
589 import_label: &str,
590 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
591 go_pkg_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
592) -> Vec<&'a (String, String, NodeType)> {
593 let label = import_label
594 .trim_start_matches(". ")
595 .trim_start_matches("_ ");
596 let label = if label.contains('"') {
597 label.split('"').nth(1).unwrap_or(label)
598 } else {
599 label
600 };
601
602 let pkg_name = label.rsplit('/').next().unwrap_or(label);
603
604 if let Some(entities) = go_pkg_to_entities.get(pkg_name) {
605 return entities.iter().collect();
606 }
607
608 if let Some(entities) = stem_to_entities.get(pkg_name) {
609 return entities.iter().collect();
610 }
611
612 Vec::new()
613}
614
615fn resolve_rust_import<'a>(
620 import_label: &str,
621 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
622) -> Vec<&'a (String, String, NodeType)> {
623 let label = import_label
624 .strip_prefix("pub use ")
625 .unwrap_or(import_label);
626 let segments: Vec<&str> = label.split("::").collect();
627
628 if segments.last() == Some(&"*") && segments.len() >= 2 {
629 let module = segments[segments.len() - 2];
630 if let Some(entities) = stem_to_entities.get(module) {
631 return entities.iter().collect();
632 }
633 }
634
635 if let Some(last) = segments.last()
636 && *last != "*"
637 && let Some(entities) = stem_to_entities.get(*last)
638 {
639 return entities.iter().collect();
640 }
641
642 if segments.len() >= 2 {
643 let module = segments[segments.len() - 2];
644 if let Some(entities) = stem_to_entities.get(module) {
645 let last = segments.last().unwrap();
646 let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
647 if !filtered.is_empty() {
648 return filtered;
649 }
650 return entities.iter().collect();
651 }
652 }
653
654 Vec::new()
655}
656
657fn resolve_dot_import<'a>(
662 import_label: &str,
663 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
664) -> Vec<&'a (String, String, NodeType)> {
665 let label = import_label.strip_prefix("static ").unwrap_or(import_label);
666 let label = if let Some(idx) = label.find(" = ") {
667 label[idx + 3..].trim()
668 } else {
669 label
670 };
671
672 let segments: Vec<&str> = label.split('.').collect();
673
674 if let Some(last) = segments.last()
675 && let Some(entities) = stem_to_entities.get(*last)
676 {
677 return entities.iter().collect();
678 }
679
680 if segments.len() >= 2 {
681 let module = segments[segments.len() - 2];
682 if let Some(entities) = stem_to_entities.get(module) {
683 let last = segments.last().unwrap();
684 let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
685 if !filtered.is_empty() {
686 return filtered;
687 }
688 return entities.iter().collect();
689 }
690 }
691
692 Vec::new()
693}
694
695fn resolve_c_include<'a>(
700 import_label: &str,
701 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
702) -> Vec<&'a (String, String, NodeType)> {
703 let label = import_label
704 .trim_start_matches('<')
705 .trim_end_matches('>')
706 .trim_start_matches('"')
707 .trim_end_matches('"');
708
709 let stem = std::path::Path::new(label)
710 .file_stem()
711 .and_then(|s| s.to_str())
712 .unwrap_or(label);
713
714 if let Some(entities) = stem_to_entities.get(stem) {
715 return entities.iter().collect();
716 }
717
718 Vec::new()
719}
720
721fn resolve_backslash_import<'a>(
725 import_label: &str,
726 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
727) -> Vec<&'a (String, String, NodeType)> {
728 let segments: Vec<&str> = import_label.split('\\').collect();
729
730 if let Some(last) = segments.last()
731 && let Some(entities) = stem_to_entities.get(*last)
732 {
733 return entities.iter().collect();
734 }
735
736 if segments.len() >= 2 {
737 let module = segments[segments.len() - 2];
738 if let Some(entities) = stem_to_entities.get(module) {
739 return entities.iter().collect();
740 }
741 }
742
743 Vec::new()
744}
745
746fn resolve_dart_import<'a>(
751 import_label: &str,
752 stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
753) -> Vec<&'a (String, String, NodeType)> {
754 let mut label = import_label;
755
756 if let Some(stripped) = label.strip_prefix("import ") {
757 label = stripped;
758 } else if let Some(stripped) = label.strip_prefix("export ") {
759 label = stripped;
760 } else if let Some(stripped) = label.strip_prefix("part ") {
761 label = stripped;
762 }
763
764 let path_and_alias = label;
765 let path_part = if let Some(idx) = path_and_alias.find(" as ") {
766 &path_and_alias[..idx]
767 } else {
768 path_and_alias
769 };
770
771 let path_deferred = path_part;
772 let path_no_deferred = if let Some(idx) = path_deferred.find(" deferred") {
773 &path_deferred[..idx]
774 } else {
775 path_deferred
776 };
777
778 let quoted = path_no_deferred.trim();
779 let unquoted = quoted
780 .trim_matches('\'') .trim_matches('"');
782
783 let normalized = if unquoted.contains("../") {
784 let last_segment = unquoted.rsplit('/').next().unwrap_or(unquoted);
785 last_segment.strip_suffix(".dart").unwrap_or(last_segment)
786 } else {
787 let path_part = unquoted.strip_prefix("package:").unwrap_or(unquoted);
788
789 let last_segment = path_part.rsplit('/').next().unwrap_or(path_part);
790
791 last_segment.strip_suffix(".dart").unwrap_or(last_segment)
792 };
793
794 if let Some(entities) = stem_to_entities.get(normalized) {
795 return entities.iter().collect();
796 }
797
798 Vec::new()
799}
800
801#[cfg(test)]
802mod tests;