pub mod ast_extract;
pub mod dedup;
pub mod lang_config;
pub mod parser;
pub mod semantic;
pub mod treesitter;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use graphify_core::confidence::Confidence;
use graphify_core::model::{ExtractionResult, GraphEdge, NodeType};
use rayon::prelude::*;
use tracing::{debug, info, warn};
/// Extension → language dispatch table.
///
/// Each entry pairs a file extension (including the leading dot) with the
/// language identifier that is handed to the extractors. Several extensions
/// may share one language identifier.
pub const DISPATCH: &[(&str, &str)] = &[
(".py", "python"),
// JavaScript / TypeScript, including their JSX flavours.
(".js", "javascript"),
(".jsx", "javascript"),
(".ts", "typescript"),
(".tsx", "typescript"),
(".go", "go"),
(".rs", "rust"),
(".java", "java"),
// `.h` headers are dispatched as C, `.hpp` headers as C++.
(".c", "c"),
(".h", "c"),
(".cpp", "cpp"),
(".cc", "cpp"),
(".cxx", "cpp"),
(".hpp", "cpp"),
(".rb", "ruby"),
(".cs", "csharp"),
(".kt", "kotlin"),
(".kts", "kotlin"),
(".scala", "scala"),
(".php", "php"),
(".swift", "swift"),
// `.toc` files are dispatched to the Lua extractor as well.
(".lua", "lua"),
(".toc", "lua"),
(".zig", "zig"),
(".ps1", "powershell"),
(".ex", "elixir"),
(".exs", "elixir"),
// Both `.m` and `.mm` are treated as Objective-C.
(".m", "objc"),
(".mm", "objc"),
(".jl", "julia"),
(".dart", "dart"),
];
fn dispatch_map() -> &'static HashMap<&'static str, &'static str> {
static MAP: std::sync::LazyLock<HashMap<&str, &str>> =
std::sync::LazyLock::new(|| DISPATCH.iter().copied().collect());
&MAP
}
/// Looks up the language identifier for `path` based on its extension.
///
/// Returns `None` when the path has no extension, the extension is not valid
/// UTF-8, or the extension is not present in the dispatch table. Matching is
/// exact (case-sensitive).
pub fn language_for_path(path: &Path) -> Option<&'static str> {
    let ext = path.extension().and_then(|raw| raw.to_str())?;
    // Dispatch keys carry the leading dot.
    let key = format!(".{ext}");
    dispatch_map().get(key.as_str()).copied()
}
/// Recursively gathers every supported source file below `target`.
///
/// The walk is delegated to `collect_files_inner`; the resulting paths are
/// returned in sorted order. If `target` cannot be read as a directory the
/// inner walk logs a warning and the result is empty.
pub fn collect_files(target: &Path) -> Vec<PathBuf> {
    let mut found: Vec<PathBuf> = Vec::new();
    collect_files_inner(target, dispatch_map(), &mut found);
    // Sort so the output does not depend on directory iteration order.
    found.sort();
    found
}
/// Depth-first walk of `dir`, appending every file whose dotted extension is a
/// key of `map` to `out`.
///
/// Directories whose name starts with a dot, or that appear in the skip list
/// below, are not descended into. Unreadable directories are reported with a
/// warning and skipped; unreadable individual entries are silently ignored.
fn collect_files_inner(dir: &Path, map: &HashMap<&str, &str>, out: &mut Vec<PathBuf>) {
    // Dependency, cache and build directories that are never scanned.
    const SKIPPED_DIRS: [&str; 6] = [
        "node_modules",
        "__pycache__",
        "target",
        "vendor",
        "venv",
        ".git",
    ];

    let listing = match std::fs::read_dir(dir) {
        Ok(listing) => listing,
        Err(e) => {
            warn!("cannot read directory {}: {e}", dir.display());
            return;
        }
    };

    for path in listing.flatten().map(|entry| entry.path()) {
        if path.is_dir() {
            // A non-UTF-8 directory name is treated as "" and therefore walked.
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            let skipped = name.starts_with('.') || SKIPPED_DIRS.contains(&name);
            if !skipped {
                collect_files_inner(&path, map, out);
            }
            continue;
        }

        let supported = path
            .extension()
            .and_then(|e| e.to_str())
            .is_some_and(|ext| map.contains_key(format!(".{ext}").as_str()));
        if supported {
            out.push(path);
        }
    }
}
/// Extracts nodes, edges and hyperedges from every file in `paths` and merges
/// them into a single [`ExtractionResult`].
///
/// Files are processed in parallel. For each file the tree-sitter extractor is
/// tried first; when it yields nothing, the source is decoded as lossy UTF-8
/// and passed to `ast_extract::extract_file`. Each per-file result is
/// de-duplicated before merging. Files with an unsupported extension or that
/// cannot be read are skipped (the latter with a warning).
///
/// After merging, the cross-file passes run in this order: Python import
/// resolution, generic import resolution, then call inference.
pub fn extract(paths: &[PathBuf]) -> ExtractionResult {
    let per_file: Vec<ExtractionResult> = paths
        .par_iter()
        .filter_map(|path| {
            let Some(lang) = language_for_path(path) else {
                debug!("skipping unsupported file: {}", path.display());
                return None;
            };
            let source = std::fs::read(path)
                .map_err(|e| warn!("cannot read {}: {e}", path.display()))
                .ok()?;
            debug!("extracting {} ({})", path.display(), lang);

            let mut result = match treesitter::try_extract(path, &source, lang) {
                Some(ts_result) => {
                    debug!("used tree-sitter for {} ({})", path.display(), lang);
                    ts_result
                }
                None => {
                    // Fallback extractor works on text, so decode leniently.
                    let text = String::from_utf8_lossy(&source);
                    ast_extract::extract_file(path, text.as_ref(), lang)
                }
            };
            dedup::dedup_file(&mut result);
            Some(result)
        })
        .collect();

    let mut combined =
        per_file
            .into_iter()
            .fold(ExtractionResult::default(), |mut merged, file_result| {
                merged.nodes.extend(file_result.nodes);
                merged.edges.extend(file_result.edges);
                merged.hyperedges.extend(file_result.hyperedges);
                merged
            });

    // The passes build on each other's edges, so the order is significant.
    resolve_python_imports(&mut combined);
    resolve_cross_file_imports(&mut combined);
    resolve_cross_file_calls(&mut combined);

    info!(
        "extraction complete: {} nodes, {} edges",
        combined.nodes.len(),
        combined.edges.len()
    );
    combined
}
fn resolve_python_imports(result: &mut ExtractionResult) {
let label_to_ids: HashMap<String, Vec<(String, String)>> = {
let mut map: HashMap<String, Vec<(String, String)>> = HashMap::new();
for n in &result.nodes {
map.entry(n.label.clone())
.or_default()
.push((n.id.clone(), n.source_file.clone()));
}
map
};
let mut stem_to_entity_ids: HashMap<String, Vec<String>> = HashMap::new();
let defined_targets: HashSet<String> = result
.edges
.iter()
.filter(|e| e.relation == "defines")
.map(|e| e.target.clone())
.collect();
for node in &result.nodes {
if !defined_targets.contains(&node.id) {
continue;
}
let stem = std::path::Path::new(&node.source_file)
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("")
.to_string();
stem_to_entity_ids
.entry(stem)
.or_default()
.push(node.id.clone());
}
let mut star_expansions: Vec<GraphEdge> = Vec::new();
for edge in &mut result.edges {
if edge.relation == "imports" {
let import_label = result
.nodes
.iter()
.find(|n| n.id == edge.target)
.map_or("", |n| n.label.as_str());
if import_label.contains('*') {
let module_name = import_label.trim_end_matches(".*").trim_end_matches(" *");
if let Some(entity_ids) = stem_to_entity_ids.get(module_name) {
for target_id in entity_ids {
star_expansions.push(GraphEdge {
source: edge.source.clone(),
target: target_id.clone(),
relation: "uses".to_string(),
confidence: Confidence::Inferred,
confidence_score: 0.7,
source_file: edge.source_file.clone(),
source_location: None,
weight: 0.7,
provenance: Some("cross-file:python-star-import".to_string()),
extra: Default::default(),
});
}
}
} else if let Some(candidates) = label_to_ids.get(&edge.target) {
let resolved = candidates
.iter()
.find(|(_, sf)| sf == &edge.source_file)
.or_else(|| candidates.first())
.map(|(id, _)| id.clone());
if let Some(resolved_id) = resolved {
edge.target = resolved_id;
edge.confidence = graphify_core::confidence::Confidence::Extracted;
}
}
}
}
if !star_expansions.is_empty() {
debug!(
"python star import expansion: created {} uses edges",
star_expansions.len()
);
result.edges.extend(star_expansions);
}
}
/// Turns `imports` edges into inferred entity-level `"uses"` edges.
///
/// For every `"imports"` edge the label of the imported node is resolved, using
/// a resolver chosen by the importing file's extension, to the set of entities
/// defined in the imported module (entities are targets of `"defines"` edges;
/// file nodes are excluded). Then, for each entity defined in the importing
/// file:
///
/// * if a resolved entity has the same non-empty label, a single edge with
///   provenance `cross-file:import-resolve` is added to it;
/// * otherwise edges with provenance `cross-file:import-resolve:fallback` are
///   added to the resolved entities, at most `MAX_FALLBACK_EDGES` per local
///   entity and import edge.
///
/// A given `(source, target)` pair is emitted at most once per call, and
/// self-edges are never created. Files with an extension that has no resolver
/// are ignored.
fn resolve_cross_file_imports(result: &mut ExtractionResult) {
    // Cap on fallback edges per local entity and import edge.
    const MAX_FALLBACK_EDGES: usize = 50;

    // (label, node id, node type) of a defined entity.
    type Entity = (String, String, NodeType);

    // Builds one inferred "uses" edge; only the provenance differs between
    // the exact-label and the fallback case.
    fn inferred_uses_edge(
        source: &str,
        target: &str,
        source_file: &str,
        provenance: &str,
    ) -> GraphEdge {
        GraphEdge {
            source: source.to_string(),
            target: target.to_string(),
            relation: "uses".to_string(),
            confidence: Confidence::Inferred,
            confidence_score: 0.8,
            source_file: source_file.to_string(),
            source_location: None,
            weight: 0.8,
            provenance: Some(provenance.to_string()),
            extra: Default::default(),
        }
    }

    // One pass over the "defines" edges yields both the set of defined
    // entities and the per-file list of entities (in edge order).
    let mut defined_entity_ids: HashSet<String> = HashSet::new();
    let mut source_file_entities: HashMap<String, Vec<String>> = HashMap::new();
    for edge in result.edges.iter().filter(|e| e.relation == "defines") {
        defined_entity_ids.insert(edge.target.clone());
        source_file_entities
            .entry(edge.source_file.clone())
            .or_default()
            .push(edge.target.clone());
    }

    let mut id_to_label: HashMap<String, String> = HashMap::new();
    let mut stem_to_entities: HashMap<String, Vec<Entity>> = HashMap::new();
    let mut go_pkg_to_entities: HashMap<String, Vec<Entity>> = HashMap::new();
    for node in &result.nodes {
        id_to_label.insert(node.id.clone(), node.label.clone());
        // File nodes and nodes nobody defines are not import targets.
        if node.node_type == NodeType::File || !defined_entity_ids.contains(&node.id) {
            continue;
        }

        let path = Path::new(&node.source_file);
        let stem = path
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("")
            .to_string();
        let entity: Entity = (
            node.label.clone(),
            node.id.clone(),
            node.node_type.clone(),
        );

        // Go entities are additionally indexed by their directory name.
        let go_package = if path.extension().and_then(|e| e.to_str()) == Some("go") {
            path.parent()
                .and_then(|d| d.file_name())
                .and_then(|d| d.to_str())
        } else {
            None
        };
        if let Some(dir) = go_package {
            go_pkg_to_entities
                .entry(dir.to_string())
                .or_default()
                .push(entity.clone());
        }
        stem_to_entities.entry(stem).or_default().push(entity);
    }

    let mut new_edges: Vec<GraphEdge> = Vec::new();
    let mut seen: HashSet<(String, String)> = HashSet::new();
    for edge in result.edges.iter().filter(|e| e.relation == "imports") {
        let source_file = &edge.source_file;
        let ext = Path::new(source_file)
            .extension()
            .and_then(|e| e.to_str())
            .unwrap_or("");
        let import_label = match id_to_label.get(&edge.target) {
            Some(label) if !label.is_empty() => label.as_str(),
            _ => continue,
        };

        let target_entities = match ext {
            "js" | "jsx" | "ts" | "tsx" => resolve_jsts_import(import_label, &stem_to_entities),
            "go" => resolve_go_import(import_label, &stem_to_entities, &go_pkg_to_entities),
            "rs" => resolve_rust_import(import_label, &stem_to_entities),
            "java" | "cs" => resolve_dot_import(import_label, &stem_to_entities),
            "c" | "h" | "cpp" | "cc" | "cxx" | "hpp" => {
                resolve_c_include(import_label, &stem_to_entities)
            }
            "kt" | "kts" | "scala" | "swift" => {
                let cleaned = import_label.strip_prefix("import ").unwrap_or(import_label);
                resolve_dot_import(cleaned.trim(), &stem_to_entities)
            }
            "php" => {
                let cleaned = import_label.strip_prefix("use ").unwrap_or(import_label);
                resolve_backslash_import(cleaned.trim(), &stem_to_entities)
            }
            "dart" => resolve_dart_import(import_label, &stem_to_entities),
            _ => continue,
        };
        if target_entities.is_empty() {
            continue;
        }
        let Some(local_entities) = source_file_entities.get(source_file) else {
            continue;
        };

        // label -> id of the resolved entities (the last duplicate label wins).
        let target_by_label: HashMap<&str, &String> = target_entities
            .iter()
            .filter_map(|(lbl, id, _)| (!lbl.is_empty()).then_some((lbl.as_str(), id)))
            .collect();

        for local_id in local_entities {
            let Some(local_label) = id_to_label.get(local_id) else {
                continue;
            };

            // Exact label match: link only to that entity and skip the fallback,
            // even when the edge itself is suppressed as a self-edge or duplicate.
            if let Some(&target_id) = target_by_label.get(local_label.as_str()) {
                if local_id != target_id && seen.insert((local_id.clone(), target_id.clone())) {
                    new_edges.push(inferred_uses_edge(
                        local_id,
                        target_id,
                        source_file,
                        "cross-file:import-resolve",
                    ));
                }
                continue;
            }

            // No label match: link to the resolved entities, bounded per entity.
            let mut fallback_count = 0;
            for (_, target_id, _) in &target_entities {
                if local_id == target_id
                    || !seen.insert((local_id.clone(), target_id.clone()))
                {
                    continue;
                }
                new_edges.push(inferred_uses_edge(
                    local_id,
                    target_id,
                    source_file,
                    "cross-file:import-resolve:fallback",
                ));
                fallback_count += 1;
                if fallback_count >= MAX_FALLBACK_EDGES {
                    break;
                }
            }
        }
    }

    if !new_edges.is_empty() {
        debug!(
            "cross-file import resolution: created {} inferred uses edges",
            new_edges.len()
        );
    }
    result.edges.extend(new_edges);
}
/// Derives low-confidence `"calls"` edges from existing `"uses"` edges.
///
/// Every `"uses"` edge whose source node is a function or method is mirrored
/// as an inferred `"calls"` edge (score and weight 0.5) between the same two
/// nodes, unless a `"calls"` edge for that pair was already present before
/// this pass started.
fn resolve_cross_file_calls(result: &mut ExtractionResult) {
    let mut callable_ids: HashSet<&str> = HashSet::new();
    for node in &result.nodes {
        if matches!(node.node_type, NodeType::Function | NodeType::Method) {
            callable_ids.insert(node.id.as_str());
        }
    }

    // Single pass: remember the known call pairs and the promotable uses edges.
    let mut existing_calls: HashSet<(&str, &str)> = HashSet::new();
    let mut promotable: Vec<&GraphEdge> = Vec::new();
    for edge in &result.edges {
        if edge.relation == "calls" {
            existing_calls.insert((edge.source.as_str(), edge.target.as_str()));
        } else if edge.relation == "uses" && callable_ids.contains(edge.source.as_str()) {
            promotable.push(edge);
        }
    }

    let inferred: Vec<GraphEdge> = promotable
        .into_iter()
        .filter(|uses| !existing_calls.contains(&(uses.source.as_str(), uses.target.as_str())))
        .map(|uses| GraphEdge {
            source: uses.source.clone(),
            target: uses.target.clone(),
            relation: "calls".to_string(),
            confidence: Confidence::Inferred,
            confidence_score: 0.5,
            source_file: uses.source_file.clone(),
            source_location: None,
            weight: 0.5,
            provenance: Some("cross-file:call-resolve".to_string()),
            extra: HashMap::new(),
        })
        .collect();

    if !inferred.is_empty() {
        debug!(
            "cross-file call resolution: created {} inferred calls edges",
            inferred.len()
        );
    }
    result.edges.extend(inferred);
}
fn resolve_jsts_import<'a>(
import_label: &str,
stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
let label = import_label.split(" as ").next().unwrap_or(import_label);
let parts: Vec<&str> = label.split('/').collect();
if parts.len() >= 2 {
let module_stem = parts[0].trim_start_matches('.');
if let Some(entities) = stem_to_entities.get(module_stem) {
return entities.iter().collect();
}
}
if let Some(last) = parts.last() {
let stem = last.trim_start_matches('.');
if let Some(entities) = stem_to_entities.get(stem) {
return entities.iter().collect();
}
}
let simple = label.trim_start_matches("./").trim_start_matches("../");
if let Some(entities) = stem_to_entities.get(simple) {
return entities.iter().collect();
}
if let Some(entities) = stem_to_entities.get("index")
&& (label.contains('/') || label.starts_with('.'))
{
return entities.iter().collect();
}
Vec::new()
}
/// Resolves a Go import label to the entities of the imported package.
///
/// Leading `". "` (dot import) and `"_ "` (blank import) markers are removed.
/// If the label contains a double quote, the text between the first pair of
/// quotes is used as the import path. The last `/`-separated segment of the
/// path is looked up as a package directory name first and as a file stem
/// second. Returns an empty vector when neither lookup succeeds.
fn resolve_go_import<'a>(
    import_label: &str,
    stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
    go_pkg_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
    let unmarked = import_label
        .trim_start_matches(". ")
        .trim_start_matches("_ ");
    // Splitting on `"` only produces a second piece when a quote is present,
    // so unquoted labels fall through unchanged.
    let import_path = unmarked.split('"').nth(1).unwrap_or(unmarked);
    let package = import_path
        .rsplit_once('/')
        .map_or(import_path, |(_, tail)| tail);

    go_pkg_to_entities
        .get(package)
        .or_else(|| stem_to_entities.get(package))
        .map(|entities| entities.iter().collect::<Vec<_>>())
        .unwrap_or_default()
}
fn resolve_rust_import<'a>(
import_label: &str,
stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
let label = import_label
.strip_prefix("pub use ")
.unwrap_or(import_label);
let segments: Vec<&str> = label.split("::").collect();
if segments.last() == Some(&"*") && segments.len() >= 2 {
let module = segments[segments.len() - 2];
if let Some(entities) = stem_to_entities.get(module) {
return entities.iter().collect();
}
}
if let Some(last) = segments.last()
&& *last != "*"
&& let Some(entities) = stem_to_entities.get(*last)
{
return entities.iter().collect();
}
if segments.len() >= 2 {
let module = segments[segments.len() - 2];
if let Some(entities) = stem_to_entities.get(module) {
let last = segments.last().unwrap();
let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
if !filtered.is_empty() {
return filtered;
}
return entities.iter().collect();
}
}
Vec::new()
}
fn resolve_dot_import<'a>(
import_label: &str,
stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
let label = import_label.strip_prefix("static ").unwrap_or(import_label);
let label = if let Some(idx) = label.find(" = ") {
label[idx + 3..].trim()
} else {
label
};
let segments: Vec<&str> = label.split('.').collect();
if let Some(last) = segments.last()
&& let Some(entities) = stem_to_entities.get(*last)
{
return entities.iter().collect();
}
if segments.len() >= 2 {
let module = segments[segments.len() - 2];
if let Some(entities) = stem_to_entities.get(module) {
let last = segments.last().unwrap();
let filtered: Vec<_> = entities.iter().filter(|(lbl, _, _)| lbl == last).collect();
if !filtered.is_empty() {
return filtered;
}
return entities.iter().collect();
}
}
Vec::new()
}
/// Resolves a C/C++ `#include` label to the entities of the included header.
///
/// Surrounding `<...>` and `"..."` delimiters are removed, then the file stem
/// of the remaining path (e.g. `types` for `sys/types.h`) is looked up.
/// Returns an empty vector when no file with that stem defines entities.
fn resolve_c_include<'a>(
    import_label: &str,
    stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
    // Angle brackets are stripped before quotes, in that order.
    let header = import_label
        .trim_start_matches('<')
        .trim_end_matches('>')
        .trim_start_matches('"')
        .trim_end_matches('"');
    let key = std::path::Path::new(header)
        .file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or(header);

    stem_to_entities
        .get(key)
        .map(|entities| entities.iter().collect::<Vec<_>>())
        .unwrap_or_default()
}
fn resolve_backslash_import<'a>(
import_label: &str,
stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
let segments: Vec<&str> = import_label.split('\\').collect();
if let Some(last) = segments.last()
&& let Some(entities) = stem_to_entities.get(*last)
{
return entities.iter().collect();
}
if segments.len() >= 2 {
let module = segments[segments.len() - 2];
if let Some(entities) = stem_to_entities.get(module) {
return entities.iter().collect();
}
}
Vec::new()
}
/// Resolves a Dart `import` / `export` / `part` label to the entities of the
/// referenced library file.
///
/// The label may be a bare URI (`package:app/models/user.dart`) or the text of
/// the directive (`import '../user.dart' as u show User;`). The URI is
/// located as follows:
///
/// * a single leading `import `, `export ` or `part ` keyword is removed;
/// * if the remainder contains a quoted string, the text between the first
///   quote and its matching closing quote is the URI, which makes trailing
///   `as`, `deferred as`, `show`, `hide` clauses and `;` irrelevant;
/// * otherwise the remainder is cut at the first ` as `, ` deferred`,
///   ` show ` or ` hide `, and surrounding whitespace, a trailing `;` and
///   stray quotes are trimmed.
///
/// A `package:` prefix is dropped, the last `/`-separated segment is taken and
/// its `.dart` suffix removed; the result is looked up as a file stem.
/// Returns an empty vector when no file with that stem defines entities.
fn resolve_dart_import<'a>(
    import_label: &str,
    stem_to_entities: &'a HashMap<String, Vec<(String, String, NodeType)>>,
) -> Vec<&'a (String, String, NodeType)> {
    // Text between the first quote character and its matching closing quote.
    fn quoted_uri(text: &str) -> Option<&str> {
        let open = text.find(['\'', '"'])?;
        let quote = text[open..].chars().next()?;
        let rest = &text[open + quote.len_utf8()..];
        rest.find(quote).map(|close| &rest[..close])
    }

    // URI of an unquoted (or unterminated) label: everything before the first
    // directive clause, with delimiters trimmed.
    fn bare_uri(text: &str) -> &str {
        let mut uri = text;
        for clause in [" as ", " deferred", " show ", " hide "] {
            if let Some(cut) = uri.find(clause) {
                uri = &uri[..cut];
            }
        }
        uri.trim()
            .trim_end_matches(';')
            .trim_end()
            .trim_matches('\'')
            .trim_matches('"')
    }

    let directive = import_label.trim();
    let directive = ["import ", "export ", "part "]
        .iter()
        .find_map(|keyword| directive.strip_prefix(*keyword))
        .unwrap_or(directive);

    let uri = quoted_uri(directive).unwrap_or_else(|| bare_uri(directive));
    // `package:` never contains a `/`, so dropping it cannot change which
    // segment is last; it only matters for URIs without any `/`.
    let uri = uri.strip_prefix("package:").unwrap_or(uri);
    let file_name = uri.rsplit('/').next().unwrap_or(uri);
    let stem = file_name.strip_suffix(".dart").unwrap_or(file_name);

    stem_to_entities
        .get(stem)
        .map(|entities| entities.iter().collect::<Vec<_>>())
        .unwrap_or_default()
}
#[cfg(test)]
mod tests;