#![warn(missing_docs)]
use crate::graph::pdg::{Node, NodeType, ProgramDependenceGraph};
use crate::parse::prelude::SignatureInfo;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
pub fn extract_pdg_from_signatures(
signatures: Vec<SignatureInfo>,
source_code: &[u8],
file_path: &str,
language: &str,
) -> ProgramDependenceGraph {
let mut pdg = ProgramDependenceGraph::new();
let mut node_ids: HashMap<String, crate::graph::pdg::NodeId> = HashMap::new();
for sig in &signatures {
let node = signature_to_node(sig, file_path, language);
let nid = pdg.add_node(node);
node_ids.insert(sig.qualified_name.clone(), nid);
}
let containment = infer_class_nodes_and_containment(
&signatures,
&mut pdg,
&mut node_ids,
file_path,
language,
);
pdg.add_containment_edges(containment);
let data_edges = extract_data_flow_edges(&signatures, &node_ids);
pdg.add_data_flow_edges(data_edges);
let inheritance = extract_inheritance_edges(&signatures, &node_ids);
pdg.add_inheritance_edges(inheritance);
let call_edges = extract_call_edges(&signatures, &node_ids);
pdg.add_call_edges(call_edges);
let import_edges = extract_import_edges(
&signatures,
&node_ids,
&mut pdg,
file_path,
language,
source_code,
);
pdg.add_import_edges(import_edges);
pdg
}
fn infer_class_nodes_and_containment(
signatures: &[SignatureInfo],
pdg: &mut ProgramDependenceGraph,
node_ids: &mut HashMap<String, crate::graph::pdg::NodeId>,
file_path: &str,
language: &str,
) -> Vec<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId)> {
let mut class_methods: HashMap<String, Vec<crate::graph::pdg::NodeId>> = HashMap::new();
for sig in signatures {
if !sig.is_method {
continue;
}
let normalized = normalize_symbol(&sig.qualified_name);
if let Some(dot_pos) = normalized.rfind('.') {
let class_prefix = normalized[..dot_pos].to_string();
if let Some(&mnid) = node_ids.get(&sig.qualified_name) {
class_methods.entry(class_prefix).or_default().push(mnid);
}
}
}
let mut containment = Vec::new();
for (class_name, method_nids) in &class_methods {
let already_exists = node_ids.contains_key(class_name)
|| node_ids.keys().any(|k| normalize_symbol(k) == *class_name);
if already_exists {
if let Some(&class_nid) = node_ids.get(class_name).or_else(|| {
node_ids
.iter()
.find(|(k, _)| normalize_symbol(k) == *class_name)
.map(|(_, v)| v)
}) {
for &mnid in method_nids {
containment.push((class_nid, mnid));
}
}
continue;
}
let (min_start, max_end) = method_nids.iter().fold((usize::MAX, 0), |(mn, mx), &mnid| {
pdg.get_node(mnid)
.map(|n| (mn.min(n.byte_range.0), mx.max(n.byte_range.1)))
.unwrap_or((mn, mx))
});
let short_name = class_name
.rsplit('.')
.next()
.unwrap_or(class_name)
.to_string();
let class_complexity: u32 = method_nids
.iter()
.filter_map(|&mnid| pdg.get_node(mnid))
.map(|node| node.complexity)
.sum();
let class_node = Node {
id: format!("{}:{}", file_path, class_name),
node_type: NodeType::Class,
name: short_name,
file_path: Arc::from(file_path),
byte_range: (
if min_start == usize::MAX {
0
} else {
min_start
},
max_end,
),
complexity: if class_complexity > 0 {
class_complexity
} else {
method_nids.len() as u32
},
language: language.to_string(),
};
let class_nid = pdg.add_node(class_node);
node_ids.insert(class_name.clone(), class_nid);
for &mnid in method_nids {
containment.push((class_nid, mnid));
}
}
containment
}
const EXCLUDED_TYPES: &[&str] = &[
"str",
"string",
"String",
"&str",
"int",
"i32",
"i64",
"u32",
"u64",
"usize",
"f32",
"f64",
"bool",
"void",
"None",
"null",
"undefined",
"any",
"Any",
"object",
"Object",
"self",
"Self",
"cls",
"this",
"bytes",
"Bytes",
"Vec",
"List",
"list",
"dict",
"Dict",
"HashMap",
"Option",
"Result",
"Error",
"Exception",
"T",
"U",
"K",
"V",
];
fn is_excluded_type(t: &str) -> bool {
let base = t.split('<').next().unwrap_or(t).trim();
EXCLUDED_TYPES.contains(&base)
}
pub fn extract_data_flow_edges(
signatures: &[SignatureInfo],
node_ids: &HashMap<String, crate::graph::pdg::NodeId>,
) -> Vec<(
crate::graph::pdg::NodeId,
crate::graph::pdg::NodeId,
String,
f32,
)> {
let mut edges = Vec::new();
let mut seen: HashSet<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId)> = HashSet::new();
let mut producers: HashMap<String, Vec<&SignatureInfo>> = HashMap::new();
let mut consumers: HashMap<String, Vec<&SignatureInfo>> = HashMap::new();
let mut call_set: HashMap<String, HashSet<String>> = HashMap::new();
for sig in signatures {
if let Some(ret) = &sig.return_type {
let norm = normalize_type_name(ret);
if !norm.is_empty() && !is_excluded_type(&norm) {
producers.entry(norm).or_default().push(sig);
}
}
for param in &sig.parameters {
if let Some(t) = ¶m.type_annotation {
let norm = normalize_type_name(t);
if !norm.is_empty() && !is_excluded_type(&norm) {
consumers.entry(norm).or_default().push(sig);
}
}
}
let calls: HashSet<String> = sig.calls.iter().map(|c| normalize_symbol(c)).collect();
call_set.insert(normalize_symbol(&sig.qualified_name), calls);
}
for (type_name, producer_sigs) in &producers {
if let Some(consumer_sigs) = consumers.get(type_name) {
for prod in producer_sigs {
for cons in consumer_sigs {
if prod.qualified_name == cons.qualified_name {
continue;
}
let (Some(&from), Some(&to)) = (
node_ids.get(&prod.qualified_name),
node_ids.get(&cons.qualified_name),
) else {
continue;
};
if seen.insert((from, to)) {
edges.push((from, to, type_name.clone(), 0.85));
}
}
}
}
}
for (type_name, ret_sigs) in &producers {
if ret_sigs.len() < 2 {
continue;
}
for i in 0..ret_sigs.len() {
for j in 0..ret_sigs.len() {
if i == j {
continue;
}
let a = ret_sigs[i];
let b = ret_sigs[j];
let a_norm = normalize_symbol(&a.qualified_name);
let b_norm = normalize_symbol(&b.qualified_name);
let a_calls_b = call_set
.get(&a_norm)
.map(|s| s.contains(&b_norm))
.unwrap_or(false);
if a_calls_b {
let (Some(&from), Some(&to)) = (
node_ids.get(&a.qualified_name),
node_ids.get(&b.qualified_name),
) else {
continue;
};
if seen.insert((from, to)) {
edges.push((from, to, format!("ret:{}", type_name), 0.65));
}
}
}
}
}
let mut by_normalized_name: HashMap<String, Vec<&SignatureInfo>> = HashMap::new();
for sig in signatures {
by_normalized_name
.entry(normalize_symbol(&sig.qualified_name))
.or_default()
.push(sig);
}
for sig_a in signatures {
let a_norm = normalize_symbol(&sig_a.qualified_name);
let Some(a_calls) = call_set.get(&a_norm) else {
continue;
};
for called_norm in a_calls {
let Some(callee_sigs) = by_normalized_name.get(called_norm) else {
continue;
};
for sig_b in callee_sigs {
let a_types: HashSet<String> = sig_a
.parameters
.iter()
.filter_map(|p| p.type_annotation.as_ref())
.map(|t| normalize_type_name(t))
.filter(|t| !t.is_empty() && !is_excluded_type(t))
.collect();
let b_types: HashSet<String> = sig_b
.parameters
.iter()
.filter_map(|p| p.type_annotation.as_ref())
.map(|t| normalize_type_name(t))
.filter(|t| !t.is_empty() && !is_excluded_type(t))
.collect();
let shared: Vec<&String> = a_types.intersection(&b_types).collect();
if shared.is_empty() {
continue;
}
let (Some(&from), Some(&to)) = (
node_ids.get(&sig_a.qualified_name),
node_ids.get(&sig_b.qualified_name),
) else {
continue;
};
if seen.insert((from, to)) {
edges.push((from, to, format!("param:{}", shared[0]), 0.45));
}
}
}
}
edges
}
fn normalize_type_name(raw: &str) -> String {
let stripped = raw
.trim()
.trim_start_matches('&')
.trim_start_matches("mut ")
.trim();
if let Some(inner_start) = stripped.find('<') {
let inner = &stripped[inner_start + 1..];
let inner_end = inner.rfind('>').unwrap_or(inner.len());
let inner_type = inner[..inner_end].split(',').next().unwrap_or("").trim();
if !inner_type.is_empty() && !is_excluded_type(inner_type) {
return inner_type.to_string();
}
}
stripped.to_string()
}
const MIN_INHERITANCE_CONFIDENCE: f32 = 0.45;
const COMMON_METHOD_NAMES: &[&str] = &[
"new",
"init",
"__init__",
"constructor",
"create",
"build",
"toString",
"to_string",
"__str__",
"__repr__",
"equals",
"__eq__",
"hashCode",
"__hash__",
"clone",
"__clone__",
"copy",
"dispose",
"close",
"__del__",
"finalize",
"update",
"get",
"set",
"run",
"start",
"stop",
"execute",
];
fn is_common_method(name: &str) -> bool {
COMMON_METHOD_NAMES.contains(&name)
}
const ABSTRACT_BASE_PREFIXES: &[&str] = &["Abstract", "Base", "I"];
const ABSTRACT_BASE_SUFFIXES: &[&str] = &[
"Base",
"Mixin",
"Interface",
"Protocol",
"Trait",
"ABC",
"Abstract",
];
fn looks_like_abstract_base(class_name: &str) -> bool {
ABSTRACT_BASE_PREFIXES
.iter()
.any(|p| class_name.starts_with(p) && class_name.len() > p.len())
|| ABSTRACT_BASE_SUFFIXES
.iter()
.any(|s| class_name.ends_with(s) && class_name.len() > s.len())
}
#[derive(Debug, Default)]
struct InheritanceEvidence {
super_call_confidence: f32,
override_confidence: f32,
naming_confidence: f32,
nesting_confidence: f32,
}
impl InheritanceEvidence {
fn max_confidence(&self) -> f32 {
self.super_call_confidence
.max(self.override_confidence)
.max(self.naming_confidence)
.max(self.nesting_confidence)
}
}
pub fn extract_inheritance_edges(
signatures: &[SignatureInfo],
node_ids: &HashMap<String, crate::graph::pdg::NodeId>,
) -> Vec<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId, f32)> {
let mut edges = Vec::new();
let mut class_methods: HashMap<String, Vec<&SignatureInfo>> = HashMap::new();
for sig in signatures {
if sig.is_method {
let normalized = normalize_symbol(&sig.qualified_name);
if let Some(dot_pos) = normalized.rfind('.') {
let class_name = normalized[..dot_pos].to_string();
class_methods.entry(class_name).or_default().push(sig);
}
}
}
let class_names: Vec<&String> = class_methods.keys().collect();
if class_names.len() < 2 {
return edges;
}
let mut class_method_names: HashMap<&str, HashSet<&str>> = HashMap::new();
for (cls, methods) in &class_methods {
let names: HashSet<&str> = methods.iter().map(|sig| sig.name.as_str()).collect();
class_method_names.insert(cls.as_str(), names);
}
let mut seen_pairs: HashSet<(usize, usize)> = HashSet::new();
for i in 0..class_names.len() {
for j in 0..class_names.len() {
if i == j {
continue;
}
let pair = if i < j { (i, j) } else { (j, i) };
if !seen_pairs.insert(pair) {
continue;
}
let cls_a = class_names[i].as_str();
let cls_b = class_names[j].as_str();
let methods_a = &class_methods[cls_a];
let methods_b = &class_methods[cls_b];
let mut evidence = InheritanceEvidence::default();
for method_a in methods_a {
let method_name = &method_a.name;
let super_patterns = [
format!("super.{}", method_name),
format!("super::{}", method_name),
format!("parent.{}", method_name),
format!("Base.{}", method_name),
format!("{}.{}", cls_b, method_name),
format!("{}::{}", cls_b, method_name),
];
let calls_super = method_a.calls.iter().any(|call| {
let norm_call = normalize_symbol(call);
super_patterns
.iter()
.any(|pat| norm_call.ends_with(&normalize_symbol(pat)))
|| norm_call.starts_with("super.")
|| norm_call.starts_with("super::")
|| norm_call.starts_with("parent.")
});
if calls_super {
evidence.super_call_confidence = 0.90_f32.max(evidence.super_call_confidence);
}
}
for method_b in methods_b {
let method_name = &method_b.name;
let super_patterns = [
format!("super.{}", method_name),
format!("super::{}", method_name),
format!("parent.{}", method_name),
format!("Base.{}", method_name),
format!("{}.{}", cls_a, method_name),
format!("{}::{}", cls_a, method_name),
];
let calls_super = method_b.calls.iter().any(|call| {
let norm_call = normalize_symbol(call);
super_patterns
.iter()
.any(|pat| norm_call.ends_with(&normalize_symbol(pat)))
|| norm_call.starts_with("super.")
|| norm_call.starts_with("super::")
|| norm_call.starts_with("parent.")
});
if calls_super {
evidence.super_call_confidence = 0.90_f32.max(evidence.super_call_confidence);
}
}
let names_a = class_method_names.get(cls_a).cloned().unwrap_or_default();
let names_b = class_method_names.get(cls_b).cloned().unwrap_or_default();
let shared_count = names_a
.intersection(&names_b)
.filter(|&&name| !is_common_method(name))
.count();
evidence.override_confidence = match shared_count {
0 | 1 => 0.0,
2 => 0.45,
3 => 0.60,
_ => 0.75,
};
let short_a = cls_a.rsplit('.').next().unwrap_or(cls_a);
let short_b = cls_b.rsplit('.').next().unwrap_or(cls_b);
if (looks_like_abstract_base(short_a) || looks_like_abstract_base(short_b))
&& shared_count >= 1
{
evidence.naming_confidence = 0.50;
}
let a_is_prefix_of_b = cls_b.starts_with(cls_a)
&& cls_b
.chars()
.nth(cls_a.len())
.map(|c| c == '.')
.unwrap_or(false);
let b_is_prefix_of_a = cls_a.starts_with(cls_b)
&& cls_a
.chars()
.nth(cls_b.len())
.map(|c| c == '.')
.unwrap_or(false);
if a_is_prefix_of_b || b_is_prefix_of_a {
evidence.nesting_confidence = 0.70;
}
let confidence = evidence.max_confidence();
if confidence < MIN_INHERITANCE_CONFIDENCE {
continue;
}
let (child_cls, parent_cls) = determine_inheritance_direction(
cls_a, cls_b, methods_a, methods_b, &evidence, short_a, short_b,
);
let child_nid = node_ids
.get(child_cls)
.or_else(|| {
class_methods
.get(child_cls)
.and_then(|m| m.first())
.and_then(|sig| node_ids.get(&sig.qualified_name))
})
.copied();
let parent_nid = node_ids
.get(parent_cls)
.or_else(|| {
class_methods
.get(parent_cls)
.and_then(|m| m.first())
.and_then(|sig| node_ids.get(&sig.qualified_name))
})
.copied();
if let (Some(child_id), Some(parent_id)) = (child_nid, parent_nid) {
edges.push((child_id, parent_id, confidence));
}
}
}
edges
}
fn determine_inheritance_direction<'a>(
cls_a: &'a str,
cls_b: &'a str,
methods_a: &[&SignatureInfo],
_methods_b: &[&SignatureInfo],
evidence: &InheritanceEvidence,
short_a: &str,
short_b: &str,
) -> (&'a str, &'a str) {
if evidence.super_call_confidence > 0.0 {
let a_calls_super = methods_a.iter().any(|sig| {
sig.calls.iter().any(|c| {
let norm = normalize_symbol(c);
norm.starts_with("super.")
|| norm.starts_with("super::")
|| norm.starts_with("parent.")
|| norm.contains(cls_b)
})
});
if a_calls_super {
return (cls_a, cls_b);
}
return (cls_b, cls_a);
}
if looks_like_abstract_base(short_a) {
return (cls_b, cls_a);
}
if looks_like_abstract_base(short_b) {
return (cls_a, cls_b);
}
if cls_b.starts_with(cls_a) {
return (cls_b, cls_a);
}
if cls_a.starts_with(cls_b) {
return (cls_a, cls_b);
}
if cls_a.len() <= cls_b.len() {
(cls_b, cls_a)
} else {
(cls_a, cls_b)
}
}
pub fn extract_call_edges(
signatures: &[SignatureInfo],
node_ids: &HashMap<String, crate::graph::pdg::NodeId>,
) -> Vec<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId)> {
let mut edges = Vec::new();
let mut seen: HashSet<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId)> = HashSet::new();
let mut exact_map: HashMap<String, Vec<crate::graph::pdg::NodeId>> = HashMap::new();
let mut last_map: HashMap<String, Vec<crate::graph::pdg::NodeId>> = HashMap::new();
let mut suffix_map: HashMap<String, Vec<crate::graph::pdg::NodeId>> = HashMap::new();
let mut namespace_map: HashMap<String, Vec<crate::graph::pdg::NodeId>> = HashMap::new();
for sig in signatures {
if let Some(&id) = node_ids.get(&sig.qualified_name) {
let normalized = normalize_symbol(&sig.qualified_name);
let segments: Vec<&str> = normalized.split('.').filter(|s| !s.is_empty()).collect();
exact_map.entry(normalized.clone()).or_default().push(id);
if let Some(last) = segments.last() {
last_map.entry(last.to_string()).or_default().push(id);
}
if segments.len() > 1 {
let ns = segments[..segments.len() - 1].join(".");
namespace_map.entry(ns).or_default().push(id);
}
for len in 2..=3_usize.min(segments.len()) {
let start = segments.len() - len;
let key = segments[start..].join(".");
suffix_map.entry(key).or_default().push(id);
}
}
}
let mut alias_map: HashMap<String, String> = HashMap::new();
for sig in signatures {
for import in &sig.imports {
let alias = import.alias.clone().or_else(|| {
import
.path
.split(['.', ':', '/', '\\'])
.next_back()
.map(|s| s.to_string())
});
if let Some(alias) = alias {
alias_map
.entry(alias)
.or_insert_with(|| import.path.clone());
}
}
}
for sig in signatures {
let Some(&caller_id) = node_ids.get(&sig.qualified_name) else {
continue;
};
let caller_ns = {
let norm = normalize_symbol(&sig.qualified_name);
let segs: Vec<&str> = norm.split('.').collect();
if segs.len() > 1 {
Some(segs[..segs.len() - 1].join("."))
} else {
None
}
};
for call_target in &sig.calls {
let mut candidates = vec![call_target.clone()];
let call_segs: Vec<String> = normalize_symbol(call_target)
.split('.')
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.collect();
if let Some(first) = call_segs.first() {
if let Some(import_path) = alias_map.get(first) {
if call_segs.len() == 1 {
candidates.push(import_path.clone());
} else {
candidates.push(format!("{}.{}", import_path, call_segs[1..].join(".")));
}
}
}
if let Some(ns) = &caller_ns {
if let Some(first) = call_segs.first() {
if matches!(
first.as_str(),
"self" | "this" | "super" | "Self" | "crate" | "base"
) {
let rest = call_segs[1..].join(".");
if !rest.is_empty() {
candidates.push(format!("{}.{}", ns, rest));
}
} else if call_segs.len() == 1 {
candidates.push(format!("{}.{}", ns, first));
}
}
}
let mut targets: Vec<crate::graph::pdg::NodeId> = Vec::new();
for candidate in candidates {
let norm = normalize_symbol(&candidate);
let segs: Vec<&str> = norm.split('.').filter(|s| !s.is_empty()).collect();
if let Some(ids) = exact_map.get(&norm) {
targets.extend(ids);
}
if let Some(last) = segs.last() {
if let Some(ids) = last_map.get(*last) {
targets.extend(ids);
}
}
for len in 2..=3_usize.min(segs.len()) {
let start = segs.len() - len;
let key = segs[start..].join(".");
if let Some(ids) = suffix_map.get(&key) {
targets.extend(ids);
}
}
}
for target_id in targets {
if caller_id != target_id && seen.insert((caller_id, target_id)) {
edges.push((caller_id, target_id));
}
}
let callee_name = normalize_symbol(call_target);
if let Some((scoped_prefix, _member)) = callee_name.rsplit_once('.') {
let bare_type = scoped_prefix.rsplit('.').next().unwrap_or(scoped_prefix);
let looks_like_type = bare_type.chars().next().is_some_and(|c| c.is_uppercase());
if looks_like_type {
let struct_nid = node_ids
.get(scoped_prefix)
.or_else(|| node_ids.get(bare_type))
.or_else(|| last_map.get(bare_type).and_then(|v| v.first()));
if let Some(&snid) = struct_nid {
let pair = (caller_id, snid);
if !seen.contains(&pair) {
seen.insert(pair);
edges.push(pair);
}
}
}
}
}
}
edges
}
pub fn extract_import_paths_from_source(source_code: &[u8], language: &str) -> HashSet<String> {
let Ok(source) = std::str::from_utf8(source_code) else {
return HashSet::new();
};
let lang = language.to_ascii_lowercase();
let source = strip_block_comments(&lang, source);
match lang.as_str() {
"python" | "py" => extract_python_imports(&source),
"javascript" | "js" | "typescript" | "ts" | "jsx" | "tsx" => extract_js_ts_imports(&source),
"rust" | "rs" => extract_rust_imports(&source),
"go" | "golang" => extract_go_imports(&source),
"java" => extract_java_imports(&source),
"csharp" | "cs" | "c#" => extract_csharp_imports(&source),
"ruby" | "rb" => extract_ruby_imports(&source),
"php" => extract_php_imports(&source),
"lua" => extract_lua_imports(&source),
"scala" => extract_scala_imports(&source),
"c" | "cpp" | "c++" | "cxx" | "cc" | "h" | "hpp" => extract_c_imports(&source),
_ => HashSet::new(),
}
}
fn strip_block_comments(lang: &str, source: &str) -> String {
match lang {
"python" | "py" | "ruby" | "rb" => source.to_string(), _ => {
let mut result = String::with_capacity(source.len());
let mut chars = source.chars().peekable();
while let Some(c) = chars.next() {
if c == '/' && chars.peek() == Some(&'*') {
chars.next(); loop {
match chars.next() {
Some('*') if chars.peek() == Some(&'/') => {
chars.next();
break;
}
None => break,
_ => {}
}
}
result.push(' '); } else {
result.push(c);
}
}
result
}
}
}
fn extract_python_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re_import = Regex::new(r"(?m)^import\s+([\w,\s.]+)").unwrap();
for cap in re_import.captures_iter(source) {
for name in cap[1].split(',') {
let trimmed = name.split_whitespace().next().unwrap_or("").trim();
if !trimmed.is_empty() {
imports.insert(trimmed.to_string());
}
}
}
let re_from =
Regex::new(r"(?s)from\s+([\w.]+)\s+import\s+(?:\(([^)]+)\)|(\w[\w\s,*]*))").unwrap();
for cap in re_from.captures_iter(source) {
let module = cap[1].trim();
imports.insert(module.to_string());
let names_str = cap.get(2).or(cap.get(3)).map(|m| m.as_str()).unwrap_or("");
for name in names_str.split(',') {
let sym = name
.split_whitespace()
.next()
.unwrap_or("")
.trim()
.trim_matches('*');
if !sym.is_empty() && sym != "*" {
imports.insert(format!("{}.{}", module, sym));
}
}
}
imports
}
fn extract_js_ts_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re_named =
Regex::new(r#"(?s)import\s+(?:type\s+)?\{[^}]*\}\s+from\s+['"]([^'"]+)['"]"#).unwrap();
for cap in re_named.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
let re_default =
Regex::new(r#"import\s+(?:type\s+)?(?:\*\s+as\s+\w+|\w+)\s+from\s+['"]([^'"]+)['"]"#)
.unwrap();
for cap in re_default.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
let re_require = Regex::new(r#"require\s*\(\s*['"]([^'"]+)['"]\s*\)"#).unwrap();
for cap in re_require.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
let re_export = Regex::new(r#"export\s+(?:\*|\{[^}]*\})\s+from\s+['"]([^'"]+)['"]"#).unwrap();
for cap in re_export.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
imports
}
fn extract_rust_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let collapsed = collapse_multiline(source, "use ", ';');
for stmt in &collapsed {
let use_stmt = stmt.trim_start_matches("use ").trim_end_matches(';').trim();
expand_rust_use(use_stmt, &mut imports);
}
imports
}
fn expand_rust_use(stmt: &str, out: &mut HashSet<String>) {
if let Some(brace_start) = stmt.find('{') {
let base = stmt[..brace_start]
.trim()
.trim_end_matches("::")
.replace("::", ".");
let inner = stmt[brace_start + 1..]
.trim_end_matches('}')
.trim_end_matches(';');
for item in split_respecting_braces(inner) {
let item = item.trim();
if item == "self" {
out.insert(base.clone());
continue;
}
if item.contains('{') {
expand_rust_use(&format!("{}::{}", base.replace('.', "::"), item), out);
} else {
let full = format!("{}.{}", base, item.replace("::", "."));
out.insert(full);
}
}
} else {
out.insert(stmt.replace("::", ".").trim_matches('.').to_string());
}
}
fn split_respecting_braces(s: &str) -> Vec<&str> {
let mut result = Vec::new();
let mut depth = 0i32;
let mut last = 0;
for (i, c) in s.char_indices() {
match c {
'{' => depth += 1,
'}' => depth -= 1,
',' if depth == 0 => {
result.push(s[last..i].trim());
last = i + 1;
}
_ => {}
}
}
let tail = s[last..].trim();
if !tail.is_empty() {
result.push(tail);
}
result
}
fn collapse_multiline(source: &str, prefix: &str, terminator: char) -> Vec<String> {
let mut results = Vec::new();
let mut in_stmt = false;
let mut current = String::new();
for line in source.lines() {
let trimmed = line.trim();
if trimmed.starts_with("//") {
continue;
}
if !in_stmt && trimmed.starts_with(prefix) {
in_stmt = true;
current = trimmed.to_string();
} else if in_stmt {
current.push(' ');
current.push_str(trimmed);
}
if in_stmt {
if let Some(end) = current.find(terminator) {
results.push(current[..=end].to_string());
in_stmt = false;
current = String::new();
}
}
}
results
}
fn extract_go_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re_single = Regex::new(r#"import\s+["']([^"']+)["']"#).unwrap();
for cap in re_single.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
let re_block = Regex::new(r#"(?s)import\s*\(([^)]+)\)"#).unwrap();
let re_path = Regex::new(r#"["']([^"']+)["']"#).unwrap();
for cap in re_block.captures_iter(source) {
let inner = &cap[1];
for p in re_path.captures_iter(inner) {
imports.insert(p[1].trim().to_string());
}
}
imports
}
fn extract_java_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r"(?m)^import(?:\s+static)?\s+([\w.*]+)\s*;").unwrap();
for cap in re.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
imports
}
fn extract_csharp_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r"(?m)^using(?:\s+static)?\s+([\w.]+)\s*;").unwrap();
for cap in re.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
imports
}
fn extract_ruby_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r#"(?:require|require_relative|load)\s*['"]([^'"]+)['"]"#).unwrap();
for cap in re.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
imports
}
fn extract_php_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r"(?m)^use\s+([\w\\]+)(?:\s+as\s+\w+)?\s*;").unwrap();
for cap in re.captures_iter(source) {
let path = cap[1].trim().replace('\\', ".");
imports.insert(path);
}
let re_require =
Regex::new(r#"(?:require|include)(?:_once)?\s*\(?['"]([^'"]+)['"]\)?"#).unwrap();
for cap in re_require.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
imports
}
fn extract_lua_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r#"require\s*\(?['"]([^'"]+)['"]\)?"#).unwrap();
for cap in re.captures_iter(source) {
imports.insert(cap[1].replace('.', "/").trim().to_string());
}
imports
}
fn extract_scala_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r"(?m)^\s*import\s+([^\n]+)$").unwrap();
let selector_re = Regex::new(r"^([\w.]+)(?:\.\{([^}]+)\}|\.(\w+|\*))?$").unwrap();
for cap in re.captures_iter(source) {
let stmt = cap[1].trim();
if let Some(sel) = selector_re.captures(stmt) {
let base = &sel[1];
if let Some(names) = sel.get(2) {
for name in names.as_str().split(',') {
let n = name.trim();
if n != "_" && !n.is_empty() {
imports.insert(format!("{}.{}", base, n));
}
}
} else if let Some(single) = sel.get(3) {
imports.insert(format!("{}.{}", base, single.as_str()));
} else {
imports.insert(base.to_string());
}
}
}
imports
}
fn extract_c_imports(source: &str) -> HashSet<String> {
let mut imports = HashSet::new();
let re = Regex::new(r#"#include\s*[<"']([^>"']+)[>"']"#).unwrap();
for cap in re.captures_iter(source) {
imports.insert(cap[1].trim().to_string());
}
imports
}
fn extract_import_edges(
signatures: &[SignatureInfo],
node_ids: &HashMap<String, crate::graph::pdg::NodeId>,
pdg: &mut ProgramDependenceGraph,
file_path: &str,
language: &str,
source_code: &[u8],
) -> Vec<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId)> {
let mut edges = Vec::new();
let mut seen: HashSet<(crate::graph::pdg::NodeId, crate::graph::pdg::NodeId)> = HashSet::new();
let mut unique_paths: HashSet<String> = signatures
.iter()
.flat_map(|sig| sig.imports.iter().map(|imp| imp.path.clone()))
.collect();
unique_paths.extend(extract_import_paths_from_source(source_code, language));
if unique_paths.is_empty() {
return edges;
}
let module_sym = format!("{}:__module__", file_path);
let importer_nid = pdg.find_by_symbol(&module_sym).unwrap_or_else(|| {
pdg.add_node(Node {
id: module_sym,
node_type: NodeType::Module,
name: "__module__".to_string(),
file_path: Arc::from(file_path),
byte_range: (0, 0),
complexity: 1,
language: language.to_string(),
})
});
let mut symbol_map: HashMap<String, Vec<crate::graph::pdg::NodeId>> = HashMap::new();
for sig in signatures {
if let Some(&nid) = node_ids.get(&sig.qualified_name) {
let norm = normalize_symbol(&sig.qualified_name);
symbol_map.entry(norm.clone()).or_default().push(nid);
if let Some(last) = norm.split('.').next_back() {
symbol_map.entry(last.to_string()).or_default().push(nid);
}
}
}
let mut external_nodes: HashMap<String, crate::graph::pdg::NodeId> = HashMap::new();
for path in unique_paths {
let targets = resolve_import_targets(&path, &symbol_map);
let targets = if targets.is_empty() {
let eid = *external_nodes.entry(path.clone()).or_insert_with(|| {
pdg.add_node(Node {
id: format!("{}:__external__:{}", file_path, path),
node_type: NodeType::External,
name: path.clone(),
file_path: Arc::from(file_path),
byte_range: (0, 0),
complexity: 1,
language: "external".to_string(),
})
});
vec![eid]
} else {
targets
};
for target in targets {
if target == importer_nid {
continue;
}
if seen.insert((importer_nid, target)) {
edges.push((importer_nid, target));
}
}
}
edges
}
fn resolve_import_targets(
import_path: &str,
symbol_map: &HashMap<String, Vec<crate::graph::pdg::NodeId>>,
) -> Vec<crate::graph::pdg::NodeId> {
let normalized = normalize_symbol(import_path);
let mut targets: Vec<crate::graph::pdg::NodeId> = Vec::new();
if let Some(ids) = symbol_map.get(&normalized) {
targets.extend(ids);
}
let parts: Vec<&str> = normalized.split('.').collect();
for len in 2..=3_usize.min(parts.len()) {
let start = parts.len() - len;
let key = parts[start..].join(".");
if let Some(ids) = symbol_map.get(&key) {
targets.extend(ids);
}
}
if targets.is_empty() {
if let Some(last) = normalized.split('.').next_back() {
if let Some(ids) = symbol_map.get(last) {
targets.extend(ids);
}
}
}
targets.sort_by_key(|id| id.index());
targets.dedup();
targets
}
pub fn normalize_symbol(raw: &str) -> String {
let trimmed = raw.split('(').next().unwrap_or(raw).trim();
trimmed
.replace("?.", ".")
.replace("::", ".")
.replace("->", ".")
.replace(['\\', '/', ':'], ".")
.replace("..", ".")
.trim_matches('.')
.to_string()
}
fn signature_to_node(sig: &SignatureInfo, file_path: &str, language: &str) -> Node {
let node_type = if sig.is_method {
NodeType::Method
} else {
NodeType::Function
};
let complexity = if sig.cyclomatic_complexity > 0 {
sig.cyclomatic_complexity
} else {
1u32 + sig.parameters.len() as u32
};
Node {
id: format!("{}:{}", file_path, sig.qualified_name),
node_type,
name: sig.name.clone(),
file_path: Arc::from(file_path),
byte_range: sig.byte_range,
complexity,
language: language.to_string(),
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parse::prelude::{Parameter, SignatureInfo, Visibility};
fn sig(name: &str, qualified: &str, is_method: bool) -> SignatureInfo {
SignatureInfo {
name: name.to_string(),
qualified_name: qualified.to_string(),
parameters: vec![],
return_type: None,
visibility: Visibility::Public,
is_async: false,
is_method,
docstring: None,
calls: vec![],
imports: vec![],
byte_range: (0, 100),
cyclomatic_complexity: 0,
}
}
fn sig_with_types(
name: &str,
qualified: &str,
params: Vec<(&str, &str)>,
ret: Option<&str>,
) -> SignatureInfo {
let parameters = params
.into_iter()
.map(|(pname, ptype)| Parameter {
name: pname.to_string(),
type_annotation: Some(ptype.to_string()),
default_value: None,
})
.collect();
SignatureInfo {
name: name.to_string(),
qualified_name: qualified.to_string(),
parameters,
return_type: ret.map(|s| s.to_string()),
visibility: Visibility::Public,
is_async: false,
is_method: false,
docstring: None,
calls: vec![],
imports: vec![],
byte_range: (0, 100),
cyclomatic_complexity: 0,
}
}
#[test]
fn containment_edges_are_not_call_edges() {
let sigs = vec![sig("speak", "Animal::speak", true)];
let pdg = extract_pdg_from_signatures(sigs, b"", "f.py", "python");
let call_count = pdg
.edge_indices()
.filter_map(|e| pdg.get_edge(e))
.filter(|e| e.edge_type == crate::graph::pdg::EdgeType::Call)
.count();
let containment_count = pdg
.edge_indices()
.filter_map(|e| pdg.get_edge(e))
.filter(|e| e.edge_type == crate::graph::pdg::EdgeType::Containment)
.count();
assert_eq!(call_count, 0, "Containment should not produce Call edges");
assert_eq!(
containment_count, 1,
"Should have one Class→Method containment edge"
);
}
#[test]
fn data_flow_signal_a_produces_directed_edge() {
let producer = sig_with_types("make_user", "make_user", vec![], Some("User"));
let consumer = sig_with_types("save_user", "save_user", vec![("u", "User")], None);
let mut nids = HashMap::new();
let mut pdg = ProgramDependenceGraph::new();
let p = pdg.add_node(signature_to_node(&producer, "f.rs", "rust"));
let c = pdg.add_node(signature_to_node(&consumer, "f.rs", "rust"));
nids.insert("make_user".to_string(), p);
nids.insert("save_user".to_string(), c);
let edges = extract_data_flow_edges(&[producer, consumer], &nids);
assert!(
!edges.is_empty(),
"Signal A should produce a data flow edge"
);
let (from, to, _, conf) = &edges[0];
assert_eq!((*from, *to), (p, c), "Edge should be producer → consumer");
assert!(*conf >= 0.8, "Signal A confidence should be >= 0.8");
}
#[test]
fn data_flow_clique_not_generated() {
let sigs: Vec<SignatureInfo> = (0..10)
.map(|i| {
sig_with_types(
&format!("f{i}"),
&format!("f{i}"),
vec![("s", "String")],
None,
)
})
.collect();
let mut nids = HashMap::new();
let mut pdg = ProgramDependenceGraph::new();
for s in &sigs {
let nid = pdg.add_node(signature_to_node(s, "f.rs", "rust"));
nids.insert(s.qualified_name.clone(), nid);
}
let edges = extract_data_flow_edges(&sigs, &nids);
assert_eq!(
edges.len(),
0,
"Shared param type without call or return relationship should not produce edges"
);
}
#[test]
fn inheritance_super_call_signal() {
let parent_speak = sig("speak", "Animal::speak", true);
let mut child_speak = sig("speak", "Dog::speak", true);
child_speak.calls.push("super.speak".to_string());
let sigs = vec![parent_speak, child_speak];
let pdg = extract_pdg_from_signatures(sigs, b"", "f.py", "python");
let inheritance_edges: Vec<_> = pdg
.edge_indices()
.filter_map(|e| {
let edge = pdg.get_edge(e)?;
if edge.edge_type == crate::graph::pdg::EdgeType::Inheritance {
Some(edge.metadata.confidence.unwrap_or(0.0))
} else {
None
}
})
.collect();
assert!(
!inheritance_edges.is_empty(),
"Super call should produce inheritance edge"
);
assert!(
inheritance_edges[0] >= 0.85,
"Super call confidence should be high"
);
}
#[test]
fn python_multiline_import_parsed() {
let source = b"from os.path import (\n join,\n exists,\n dirname\n)\n";
let imports = extract_import_paths_from_source(source, "python");
assert!(imports.contains("os.path"), "Module should be captured");
assert!(imports.contains("os.path.join") || imports.iter().any(|s| s.contains("join")));
}
#[test]
fn rust_brace_import_expanded() {
let source = b"use std::{\n collections::HashMap,\n sync::Arc,\n};\n";
let imports = extract_import_paths_from_source(source, "rust");
assert!(imports.iter().any(|s| s.contains("HashMap")));
assert!(imports.iter().any(|s| s.contains("Arc")));
}
#[test]
fn typescript_multiline_import_parsed() {
let source = b"import {\n useState,\n useEffect,\n useCallback\n} from 'react';\n";
let imports = extract_import_paths_from_source(source, "typescript");
assert!(imports.contains("react"));
}
#[test]
fn cyclomatic_complexity_wiring_from_signature_to_node() {
use crate::parse::traits::{Parameter, SignatureInfo, Visibility};
let sig_simple = SignatureInfo {
name: "simple".to_string(),
qualified_name: "simple".to_string(),
parameters: vec![],
return_type: None,
visibility: Visibility::Public,
is_async: false,
is_method: false,
docstring: None,
calls: vec![],
imports: vec![],
byte_range: (0, 10),
cyclomatic_complexity: 0,
};
let node = signature_to_node(&sig_simple, "test.rs", "rust");
assert_eq!(node.complexity, 1, "Simple: no params → complexity 1");
let sig_complex = SignatureInfo {
cyclomatic_complexity: 5,
..sig_simple.clone()
};
let node = signature_to_node(&sig_complex, "test.rs", "rust");
assert_eq!(node.complexity, 5, "Complex: cyclomatic=5 → complexity 5");
let sig_params = SignatureInfo {
name: "with_params".to_string(),
qualified_name: "with_params".to_string(),
parameters: vec![
Parameter {
name: "a".into(),
type_annotation: None,
default_value: None,
},
Parameter {
name: "b".into(),
type_annotation: None,
default_value: None,
},
],
cyclomatic_complexity: 0,
..sig_simple
};
let node = signature_to_node(&sig_params, "test.rs", "rust");
assert_eq!(node.complexity, 3, "Params: 2 params → complexity 3");
let sig_both = SignatureInfo {
cyclomatic_complexity: 10,
..sig_params
};
let node = signature_to_node(&sig_both, "test.rs", "rust");
assert_eq!(
node.complexity, 10,
"Both: cyclomatic=10 overrides param count"
);
}
}